bcachefs: Run btree updates after write out of write_point
author: Kent Overstreet <kent.overstreet@linux.dev>
Mon, 31 Oct 2022 20:13:05 +0000 (16:13 -0400)
committer: Kent Overstreet <kent.overstreet@linux.dev>
Sun, 22 Oct 2023 21:09:29 +0000 (17:09 -0400)
In the write path, after the write to the block device(s) complete we
have to punt to process context to do the btree update.

Instead of using the work item embedded in op->cl, this patch switches
to a per write-point work item. This helps with two different issues:

 - lock contention: btree updates to the same writepoint will (usually)
   be updating the same alloc keys
 - context switch overhead: when we're bottlenecked on btree updates,
   having a thread (running out of a work item) checking the write point
   for completed ops is cheaper than queueing up a new work item and
   waking up a kworker.

In an arbitrary benchmark, 4k random writes with fio running inside a
VM, this patch resulted in a 10% improvement in total iops.

Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
fs/bcachefs/alloc_foreground.c
fs/bcachefs/alloc_foreground.h
fs/bcachefs/alloc_types.h
fs/bcachefs/btree_update_interior.c
fs/bcachefs/io.c
fs/bcachefs/io.h
fs/bcachefs/io_types.h
fs/bcachefs/super.c
fs/bcachefs/sysfs.c
fs/bcachefs/util.c
fs/bcachefs/util.h

index 76a4b8029bdfeae00c0e536bf6892f00d117dfb2..c4b4689fdd0f128346416a874c5857e7b335fbad 100644 (file)
@@ -762,16 +762,17 @@ out:
 /*
  * Get us an open_bucket we can allocate from, return with it locked:
  */
-struct write_point *bch2_alloc_sectors_start(struct bch_fs *c,
-                               unsigned target,
-                               unsigned erasure_code,
-                               struct write_point_specifier write_point,
-                               struct bch_devs_list *devs_have,
-                               unsigned nr_replicas,
-                               unsigned nr_replicas_required,
-                               enum alloc_reserve reserve,
-                               unsigned flags,
-                               struct closure *cl)
+int bch2_alloc_sectors_start(struct bch_fs *c,
+                            unsigned target,
+                            unsigned erasure_code,
+                            struct write_point_specifier write_point,
+                            struct bch_devs_list *devs_have,
+                            unsigned nr_replicas,
+                            unsigned nr_replicas_required,
+                            enum alloc_reserve reserve,
+                            unsigned flags,
+                            struct closure *cl,
+                            struct write_point **wp_ret)
 {
        struct write_point *wp;
        struct open_bucket *ob;
@@ -792,7 +793,7 @@ retry:
        write_points_nr = c->write_points_nr;
        have_cache      = false;
 
-       wp = writepoint_find(c, write_point.v);
+       *wp_ret = wp = writepoint_find(c, write_point.v);
 
        if (wp->data_type == BCH_DATA_user)
                ob_flags |= BUCKET_MAY_ALLOC_PARTIAL;
@@ -848,7 +849,7 @@ alloc_done:
 
        BUG_ON(!wp->sectors_free || wp->sectors_free == UINT_MAX);
 
-       return wp;
+       return 0;
 err:
        open_bucket_for_each(c, &wp->ptrs, ob, i)
                if (ptrs.nr < ARRAY_SIZE(ptrs.v))
@@ -866,9 +867,9 @@ err:
        switch (ret) {
        case -OPEN_BUCKETS_EMPTY:
        case -FREELIST_EMPTY:
-               return cl ? ERR_PTR(-EAGAIN) : ERR_PTR(-ENOSPC);
+               return cl ? -EAGAIN : -ENOSPC;
        case -INSUFFICIENT_DEVICES:
-               return ERR_PTR(-EROFS);
+               return -EROFS;
        default:
                BUG();
        }
@@ -895,13 +896,13 @@ struct bch_extent_ptr bch2_ob_ptr(struct bch_fs *c, struct open_bucket *ob)
 void bch2_alloc_sectors_append_ptrs(struct bch_fs *c, struct write_point *wp,
                                    struct bkey_i *k, unsigned sectors,
                                    bool cached)
-
 {
        struct open_bucket *ob;
        unsigned i;
 
        BUG_ON(sectors > wp->sectors_free);
-       wp->sectors_free -= sectors;
+       wp->sectors_free        -= sectors;
+       wp->sectors_allocated   += sectors;
 
        open_bucket_for_each(c, &wp->ptrs, ob, i) {
                struct bch_dev *ca = bch_dev_bkey_exists(c, ob->dev);
@@ -942,6 +943,10 @@ static inline void writepoint_init(struct write_point *wp,
 {
        mutex_init(&wp->lock);
        wp->data_type = type;
+
+       INIT_WORK(&wp->index_update_work, bch2_write_point_do_index_updates);
+       INIT_LIST_HEAD(&wp->writes);
+       spin_lock_init(&wp->writes_lock);
 }
 
 void bch2_fs_allocator_foreground_init(struct bch_fs *c)
@@ -997,3 +1002,33 @@ void bch2_open_buckets_to_text(struct printbuf *out, struct bch_fs *c)
        }
 
 }
+
+static const char * const bch2_write_point_states[] = {
+#define x(n)   #n,
+       WRITE_POINT_STATES()
+#undef x
+       NULL
+};
+
+void bch2_write_points_to_text(struct printbuf *out, struct bch_fs *c)
+{
+       struct write_point *wp;
+       unsigned i;
+
+       for (wp = c->write_points;
+            wp < c->write_points + ARRAY_SIZE(c->write_points);
+            wp++) {
+               pr_buf(out, "%lu: ", wp->write_point);
+               bch2_hprint(out, wp->sectors_allocated);
+
+               pr_buf(out, " last wrote: ");
+               bch2_pr_time_units(out, sched_clock() - wp->last_used);
+
+               for (i = 0; i < WRITE_POINT_STATE_NR; i++) {
+                       pr_buf(out, " %s: ", bch2_write_point_states[i]);
+                       bch2_pr_time_units(out, wp->time[i]);
+               }
+
+               pr_newline(out);
+       }
+}
index 3598c70b93b4f43faf9748bfa8769a85bcd8a066..9b4389b09cbb19d627cc8f845dc78bc52786d938 100644 (file)
@@ -122,14 +122,15 @@ int bch2_bucket_alloc_set(struct bch_fs *, struct open_buckets *,
                      unsigned, unsigned *, bool *, enum alloc_reserve,
                      unsigned, struct closure *);
 
-struct write_point *bch2_alloc_sectors_start(struct bch_fs *,
-                                            unsigned, unsigned,
-                                            struct write_point_specifier,
-                                            struct bch_devs_list *,
-                                            unsigned, unsigned,
-                                            enum alloc_reserve,
-                                            unsigned,
-                                            struct closure *);
+int bch2_alloc_sectors_start(struct bch_fs *,
+                            unsigned, unsigned,
+                            struct write_point_specifier,
+                            struct bch_devs_list *,
+                            unsigned, unsigned,
+                            enum alloc_reserve,
+                            unsigned,
+                            struct closure *,
+                            struct write_point **);
 
 struct bch_extent_ptr bch2_ob_ptr(struct bch_fs *, struct open_bucket *);
 void bch2_alloc_sectors_append_ptrs(struct bch_fs *, struct write_point *,
@@ -156,4 +157,6 @@ void bch2_fs_allocator_foreground_init(struct bch_fs *);
 
 void bch2_open_buckets_to_text(struct printbuf *, struct bch_fs *);
 
+void bch2_write_points_to_text(struct printbuf *, struct bch_fs *);
+
 #endif /* _BCACHEFS_ALLOC_FOREGROUND_H */
index e3a3eb2711588bca88062c9cc7d3d3dd4be8062d..9e00afb175592294c2ff88d5d3f5d01e4cce3f7a 100644 (file)
@@ -81,18 +81,46 @@ struct dev_stripe_state {
        u64                     next_alloc[BCH_SB_MEMBERS_MAX];
 };
 
+#define WRITE_POINT_STATES()           \
+       x(stopped)                      \
+       x(waiting_io)                   \
+       x(waiting_work)                 \
+       x(running)
+
+enum write_point_state {
+#define x(n)   WRITE_POINT_##n,
+       WRITE_POINT_STATES()
+#undef x
+       WRITE_POINT_STATE_NR
+};
+
 struct write_point {
-       struct hlist_node       node;
-       struct mutex            lock;
-       u64                     last_used;
-       unsigned long           write_point;
-       enum bch_data_type      data_type;
+       struct {
+               struct hlist_node       node;
+               struct mutex            lock;
+               u64                     last_used;
+               unsigned long           write_point;
+               enum bch_data_type      data_type;
 
-       /* calculated based on how many pointers we're actually going to use: */
-       unsigned                sectors_free;
+               /* calculated based on how many pointers we're actually going to use: */
+               unsigned                sectors_free;
+
+               struct open_buckets     ptrs;
+               struct dev_stripe_state stripe;
+
+               u64                     sectors_allocated;
+       } __attribute__((__aligned__(SMP_CACHE_BYTES)));
+
+       struct {
+               struct work_struct      index_update_work;
+
+               struct list_head        writes;
+               spinlock_t              writes_lock;
 
-       struct open_buckets     ptrs;
-       struct dev_stripe_state stripe;
+               enum write_point_state  state;
+               u64                     last_state_change;
+               u64                     time[WRITE_POINT_STATE_NR];
+       } __attribute__((__aligned__(SMP_CACHE_BYTES)));
 };
 
 struct write_point_specifier {
index ff56c374ff2e6c02999d23223dbb0894276df397..d1e3e2c76e304fef457c98ede095f68d5023990e 100644 (file)
@@ -191,6 +191,7 @@ static struct btree *__bch2_btree_node_alloc(struct bch_fs *c,
        struct bch_devs_list devs_have = (struct bch_devs_list) { 0 };
        unsigned nr_reserve;
        enum alloc_reserve alloc_reserve;
+       int ret;
 
        if (flags & BTREE_INSERT_USE_RESERVE) {
                nr_reserve      = 0;
@@ -213,7 +214,7 @@ static struct btree *__bch2_btree_node_alloc(struct bch_fs *c,
        mutex_unlock(&c->btree_reserve_cache_lock);
 
 retry:
-       wp = bch2_alloc_sectors_start(c,
+       ret = bch2_alloc_sectors_start(c,
                                      c->opts.metadata_target ?:
                                      c->opts.foreground_target,
                                      0,
@@ -221,9 +222,9 @@ retry:
                                      &devs_have,
                                      res->nr_replicas,
                                      c->opts.metadata_replicas_required,
-                                     alloc_reserve, 0, cl);
-       if (IS_ERR(wp))
-               return ERR_CAST(wp);
+                                     alloc_reserve, 0, cl, &wp);
+       if (unlikely(ret))
+               return ERR_PTR(ret);
 
        if (wp->sectors_free < btree_sectors(c)) {
                struct open_bucket *ob;
index 2cc56979fcb353648799a9fc6d2c54fd00d28e3a..6bebbd44ccc81b9d7548735634cdf8e78ca243a1 100644 (file)
@@ -589,7 +589,7 @@ void bch2_submit_wbio_replicas(struct bch_write_bio *wbio, struct bch_fs *c,
        }
 }
 
-static void __bch2_write(struct closure *);
+static void __bch2_write(struct bch_write_op *);
 
 static void bch2_write_done(struct closure *cl)
 {
@@ -686,22 +686,86 @@ err:
        goto out;
 }
 
+static inline void __wp_update_state(struct write_point *wp, enum write_point_state state)
+{
+       if (state != wp->state) {
+               u64 now = ktime_get_ns();
+
+               if (wp->last_state_change &&
+                   time_after64(now, wp->last_state_change))
+                       wp->time[wp->state] += now - wp->last_state_change;
+               wp->state = state;
+               wp->last_state_change = now;
+       }
+}
+
+static inline void wp_update_state(struct write_point *wp, bool running)
+{
+       enum write_point_state state;
+
+       state = running                  ? WRITE_POINT_running :
+               !list_empty(&wp->writes) ? WRITE_POINT_waiting_io
+                                        : WRITE_POINT_stopped;
+
+       __wp_update_state(wp, state);
+}
+
 static void bch2_write_index(struct closure *cl)
 {
        struct bch_write_op *op = container_of(cl, struct bch_write_op, cl);
-       struct bch_fs *c = op->c;
+       struct write_point *wp = op->wp;
+       struct workqueue_struct *wq = index_update_wq(op);
 
-       __bch2_write_index(op);
+       barrier();
 
-       if (!(op->flags & BCH_WRITE_DONE)) {
-               continue_at(cl, __bch2_write, index_update_wq(op));
-       } else if (!op->error && (op->flags & BCH_WRITE_FLUSH)) {
-               bch2_journal_flush_seq_async(&c->journal,
-                                            *op_journal_seq(op),
-                                            cl);
-               continue_at(cl, bch2_write_done, index_update_wq(op));
-       } else {
-               continue_at_nobarrier(cl, bch2_write_done, NULL);
+       /*
+        * We're not using wp->writes_lock here, so this is racey: that's ok,
+        * because this is just for diagnostic purposes, and we're running out
+        * of interrupt context here so if we were to take the lock we'd have to
+        * switch to spin_lock_irq()/irqsave(), which is not free:
+        */
+       if (wp->state == WRITE_POINT_waiting_io)
+               __wp_update_state(wp, WRITE_POINT_waiting_work);
+
+       op->btree_update_ready = true;
+       queue_work(wq, &wp->index_update_work);
+}
+
+void bch2_write_point_do_index_updates(struct work_struct *work)
+{
+       struct write_point *wp =
+               container_of(work, struct write_point, index_update_work);
+       struct bch_write_op *op;
+
+       while (1) {
+               spin_lock(&wp->writes_lock);
+               list_for_each_entry(op, &wp->writes, wp_list)
+                       if (op->btree_update_ready) {
+                               list_del(&op->wp_list);
+                               goto unlock;
+                       }
+               op = NULL;
+unlock:
+               wp_update_state(wp, op != NULL);
+               spin_unlock(&wp->writes_lock);
+
+               if (!op)
+                       break;
+
+               op->flags |= BCH_WRITE_IN_WORKER;
+
+               __bch2_write_index(op);
+
+               if (!(op->flags & BCH_WRITE_DONE)) {
+                       __bch2_write(op);
+               } else if (!op->error && (op->flags & BCH_WRITE_FLUSH)) {
+                       bch2_journal_flush_seq_async(&op->c->journal,
+                                                    *op_journal_seq(op),
+                                                    &op->cl);
+                       continue_at(&op->cl, bch2_write_done, index_update_wq(op));
+               } else {
+                       bch2_write_done(&op->cl);
+               }
        }
 }
 
@@ -734,10 +798,8 @@ static void bch2_write_endio(struct bio *bio)
 
        if (parent)
                bio_endio(&parent->bio);
-       else if (!(op->flags & BCH_WRITE_SKIP_CLOSURE_PUT))
-               closure_put(cl);
        else
-               continue_at_nobarrier(cl, bch2_write_index, index_update_wq(op));
+               closure_put(cl);
 }
 
 static void init_append_extent(struct bch_write_op *op,
@@ -1136,19 +1198,18 @@ err:
        return ret;
 }
 
-static void __bch2_write(struct closure *cl)
+static void __bch2_write(struct bch_write_op *op)
 {
-       struct bch_write_op *op = container_of(cl, struct bch_write_op, cl);
        struct bch_fs *c = op->c;
-       struct write_point *wp;
+       struct write_point *wp = NULL;
        struct bio *bio = NULL;
-       bool skip_put = true;
        unsigned nofs_flags;
        int ret;
 
        nofs_flags = memalloc_nofs_save();
 again:
        memset(&op->failed, 0, sizeof(op->failed));
+       op->btree_update_ready = false;
 
        do {
                struct bkey_i *key_to_write;
@@ -1158,13 +1219,13 @@ again:
                /* +1 for possible cache device: */
                if (op->open_buckets.nr + op->nr_replicas + 1 >
                    ARRAY_SIZE(op->open_buckets.v))
-                       goto flush_io;
+                       break;
 
                if (bch2_keylist_realloc(&op->insert_keys,
                                        op->inline_keys,
                                        ARRAY_SIZE(op->inline_keys),
                                        BKEY_EXTENT_U64s_MAX))
-                       goto flush_io;
+                       break;
 
                if ((op->flags & BCH_WRITE_FROM_INTERNAL) &&
                    percpu_ref_is_dying(&c->writes)) {
@@ -1177,7 +1238,7 @@ again:
                 * freeing up space on specific disks, which means that
                 * allocations for specific disks may hang arbitrarily long:
                 */
-               wp = bch2_alloc_sectors_start(c,
+               ret = bch2_alloc_sectors_start(c,
                        op->target,
                        op->opts.erasure_code && !(op->flags & BCH_WRITE_CACHED),
                        op->write_point,
@@ -1187,53 +1248,34 @@ again:
                        op->alloc_reserve,
                        op->flags,
                        (op->flags & (BCH_WRITE_ALLOC_NOWAIT|
-                                     BCH_WRITE_ONLY_SPECIFIED_DEVS)) ? NULL : cl);
-               EBUG_ON(!wp);
-
-               if (unlikely(IS_ERR(wp))) {
-                       if (unlikely(PTR_ERR(wp) != -EAGAIN)) {
-                               ret = PTR_ERR(wp);
+                                     BCH_WRITE_ONLY_SPECIFIED_DEVS))
+                       ? NULL : &op->cl,
+                       &wp);
+               if (unlikely(ret)) {
+                       if (unlikely(ret != -EAGAIN))
                                goto err;
-                       }
 
-                       goto flush_io;
+                       break;
                }
 
-               /*
-                * It's possible for the allocator to fail, put us on the
-                * freelist waitlist, and then succeed in one of various retry
-                * paths: if that happens, we need to disable the skip_put
-                * optimization because otherwise there won't necessarily be a
-                * barrier before we free the bch_write_op:
-                */
-               if (atomic_read(&cl->remaining) & CLOSURE_WAITING)
-                       skip_put = false;
+               EBUG_ON(!wp);
 
                bch2_open_bucket_get(c, wp, &op->open_buckets);
                ret = bch2_write_extent(op, wp, &bio);
+
                bch2_alloc_sectors_done(c, wp);
 
                if (ret < 0)
                        goto err;
 
-               if (ret) {
-                       skip_put = false;
-               } else {
-                       /*
-                        * for the skip_put optimization this has to be set
-                        * before we submit the bio:
-                        */
+               if (!ret)
                        op->flags |= BCH_WRITE_DONE;
-               }
 
                bio->bi_end_io  = bch2_write_endio;
                bio->bi_private = &op->cl;
                bio->bi_opf |= REQ_OP_WRITE;
 
-               if (!skip_put)
-                       closure_get(bio->bi_private);
-               else
-                       op->flags |= BCH_WRITE_SKIP_CLOSURE_PUT;
+               closure_get(bio->bi_private);
 
                key_to_write = (void *) (op->insert_keys.keys_p +
                                         key_to_write_offset);
@@ -1241,55 +1283,49 @@ again:
                bch2_submit_wbio_replicas(to_wbio(bio), c, BCH_DATA_user,
                                          key_to_write);
        } while (ret);
-
-       if (!skip_put)
-               continue_at(cl, bch2_write_index, index_update_wq(op));
 out:
-       memalloc_nofs_restore(nofs_flags);
-       return;
-err:
-       op->error = ret;
-       op->flags |= BCH_WRITE_DONE;
-
-       continue_at(cl, bch2_write_index, index_update_wq(op));
-       goto out;
-flush_io:
        /*
         * If the write can't all be submitted at once, we generally want to
         * block synchronously as that signals backpressure to the caller.
-        *
-        * However, if we're running out of a workqueue, we can't block here
-        * because we'll be blocking other work items from completing:
         */
-       if (current->flags & PF_WQ_WORKER) {
-               continue_at(cl, bch2_write_index, index_update_wq(op));
-               goto out;
-       }
-
-       closure_sync(cl);
-
-       if (!bch2_keylist_empty(&op->insert_keys)) {
+       if (!(op->flags & BCH_WRITE_DONE) &&
+           !(op->flags & BCH_WRITE_IN_WORKER)) {
+               closure_sync(&op->cl);
                __bch2_write_index(op);
 
-               if (op->error) {
-                       op->flags |= BCH_WRITE_DONE;
-                       continue_at_nobarrier(cl, bch2_write_done, NULL);
-                       goto out;
-               }
+               if (!(op->flags & BCH_WRITE_DONE))
+                       goto again;
+               bch2_write_done(&op->cl);
+       } else {
+               spin_lock(&wp->writes_lock);
+               op->wp = wp;
+               list_add_tail(&op->wp_list, &wp->writes);
+               if (wp->state == WRITE_POINT_stopped)
+                       __wp_update_state(wp, WRITE_POINT_waiting_io);
+               spin_unlock(&wp->writes_lock);
+
+               continue_at(&op->cl, bch2_write_index, NULL);
        }
 
-       goto again;
+       memalloc_nofs_restore(nofs_flags);
+       return;
+err:
+       op->error = ret;
+       op->flags |= BCH_WRITE_DONE;
+       goto out;
 }
 
 static void bch2_write_data_inline(struct bch_write_op *op, unsigned data_len)
 {
-       struct closure *cl = &op->cl;
        struct bio *bio = &op->wbio.bio;
        struct bvec_iter iter;
        struct bkey_i_inline_data *id;
        unsigned sectors;
        int ret;
 
+       op->flags |= BCH_WRITE_WROTE_DATA_INLINE;
+       op->flags |= BCH_WRITE_DONE;
+
        bch2_check_set_feature(op->c, BCH_FEATURE_inline_data);
 
        ret = bch2_keylist_realloc(&op->insert_keys, op->inline_keys,
@@ -1317,11 +1353,7 @@ static void bch2_write_data_inline(struct bch_write_op *op, unsigned data_len)
        set_bkey_val_bytes(&id->k, data_len);
        bch2_keylist_push(&op->insert_keys);
 
-       op->flags |= BCH_WRITE_WROTE_DATA_INLINE;
-       op->flags |= BCH_WRITE_DONE;
-
-       continue_at_nobarrier(cl, bch2_write_index, NULL);
-       return;
+       __bch2_write_index(op);
 err:
        bch2_write_done(&op->cl);
 }
@@ -1349,6 +1381,7 @@ void bch2_write(struct closure *cl)
        struct bch_fs *c = op->c;
        unsigned data_len;
 
+       EBUG_ON(op->cl.parent);
        BUG_ON(!op->nr_replicas);
        BUG_ON(!op->write_point.v);
        BUG_ON(!bkey_cmp(op->pos, POS_MAX));
@@ -1381,18 +1414,14 @@ void bch2_write(struct closure *cl)
                return;
        }
 
-       continue_at_nobarrier(cl, __bch2_write, NULL);
+       __bch2_write(op);
        return;
 err:
        bch2_disk_reservation_put(c, &op->res);
 
-       if (op->end_io) {
-               EBUG_ON(cl->parent);
-               closure_debug_destroy(cl);
+       closure_debug_destroy(&op->cl);
+       if (op->end_io)
                op->end_io(op);
-       } else {
-               closure_return(cl);
-       }
 }
 
 /* Cache promotion on read */
index f8ce9543c9e3e471ba2a4e86b016ab97980ec763..b484d33879681a8d5d77065139b5cb6dd28205a2 100644 (file)
@@ -41,7 +41,7 @@ enum bch_write_flags {
        __BCH_WRITE_CHECK_ENOSPC,
        __BCH_WRITE_MOVE,
        __BCH_WRITE_JOURNAL_SEQ_PTR,
-       __BCH_WRITE_SKIP_CLOSURE_PUT,
+       __BCH_WRITE_IN_WORKER,
        __BCH_WRITE_DONE,
 };
 
@@ -59,7 +59,7 @@ enum bch_write_flags {
 
 /* Internal: */
 #define BCH_WRITE_JOURNAL_SEQ_PTR      (1U << __BCH_WRITE_JOURNAL_SEQ_PTR)
-#define BCH_WRITE_SKIP_CLOSURE_PUT     (1U << __BCH_WRITE_SKIP_CLOSURE_PUT)
+#define BCH_WRITE_IN_WORKER            (1U << __BCH_WRITE_IN_WORKER)
 #define BCH_WRITE_DONE                 (1U << __BCH_WRITE_DONE)
 
 static inline u64 *op_journal_seq(struct bch_write_op *op)
@@ -115,6 +115,8 @@ static inline void bch2_write_op_init(struct bch_write_op *op, struct bch_fs *c,
 
 void bch2_write(struct closure *);
 
+void bch2_write_point_do_index_updates(struct work_struct *);
+
 static inline struct bch_write_bio *wbio_init(struct bio *bio)
 {
        struct bch_write_bio *wbio = to_wbio(bio);
index 53270f0a08a360c8efa7926cb41e8b073da2accb..c316a39d381a1f4111b4e53b56ce3feed108f17b 100644 (file)
@@ -119,6 +119,7 @@ struct bch_write_op {
        unsigned                nr_replicas_required:4;
        unsigned                alloc_reserve:3;
        unsigned                incompressible:1;
+       unsigned                btree_update_ready:1;
 
        struct bch_devs_list    devs_have;
        u16                     target;
@@ -134,6 +135,9 @@ struct bch_write_op {
 
        struct write_point_specifier write_point;
 
+       struct write_point      *wp;
+       struct list_head        wp_list;
+
        struct disk_reservation res;
 
        struct open_buckets     open_buckets;
index e4201aecdba1bd37f0e20b19cc4a68c321d43e5a..208482db3683490ead51a580fe4c63396faabc0d 100644 (file)
@@ -791,7 +791,7 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
        c->inode_shard_bits = ilog2(roundup_pow_of_two(num_possible_cpus()));
 
        if (!(c->btree_update_wq = alloc_workqueue("bcachefs",
-                               WQ_FREEZABLE|WQ_MEM_RECLAIM, 1)) ||
+                               WQ_FREEZABLE|WQ_UNBOUND|WQ_MEM_RECLAIM, 512)) ||
            !(c->btree_io_complete_wq = alloc_workqueue("bcachefs_btree_io",
                                WQ_FREEZABLE|WQ_MEM_RECLAIM, 1)) ||
            !(c->copygc_wq = alloc_workqueue("bcachefs_copygc",
index 7e10adba5c75ce6bc843dd0b09660ccf1bb9e1ba..ec672134cb18d9cc3179330f3a3d634dd5eb429e 100644 (file)
@@ -180,6 +180,7 @@ read_attribute(btree_key_cache);
 read_attribute(btree_transactions);
 read_attribute(stripes_heap);
 read_attribute(open_buckets);
+read_attribute(write_points);
 
 read_attribute(internal_uuid);
 
@@ -418,6 +419,9 @@ SHOW(bch2_fs)
        if (attr == &sysfs_open_buckets)
                bch2_open_buckets_to_text(out, c);
 
+       if (attr == &sysfs_write_points)
+               bch2_write_points_to_text(out, c);
+
        if (attr == &sysfs_compression_stats)
                bch2_compression_stats_to_text(out, c);
 
@@ -563,6 +567,7 @@ struct attribute *bch2_fs_internal_files[] = {
        &sysfs_new_stripes,
        &sysfs_stripes_heap,
        &sysfs_open_buckets,
+       &sysfs_write_points,
        &sysfs_io_timers_read,
        &sysfs_io_timers_write,
 
index f290c069c6830a3c3f2c0d486d9907cbc6f72b89..e1d36d9b092c7a16560ede344c88db68140e6072 100644 (file)
@@ -390,7 +390,7 @@ static const struct time_unit *pick_time_units(u64 ns)
        return u;
 }
 
-static void pr_time_units(struct printbuf *out, u64 ns)
+void bch2_pr_time_units(struct printbuf *out, u64 ns)
 {
        const struct time_unit *u = pick_time_units(ns);
 
@@ -410,13 +410,13 @@ void bch2_time_stats_to_text(struct printbuf *out, struct bch2_time_stats *stats
               freq ?  div64_u64(NSEC_PER_SEC, freq) : 0);
 
        pr_buf(out, "frequency:\t");
-       pr_time_units(out, freq);
+       bch2_pr_time_units(out, freq);
 
        pr_buf(out, "\navg duration:\t");
-       pr_time_units(out, stats->average_duration);
+       bch2_pr_time_units(out, stats->average_duration);
 
        pr_buf(out, "\nmax duration:\t");
-       pr_time_units(out, stats->max_duration);
+       bch2_pr_time_units(out, stats->max_duration);
 
        i = eytzinger0_first(NR_QUANTILES);
        u = pick_time_units(stats->quantiles.entries[i].m);
index fd776fb281b74209edc401be8e788412d0c5ef01..085f1c35738367ae498081eb368850966531c98e 100644 (file)
@@ -352,6 +352,8 @@ static inline void pr_sectors(struct printbuf *out, u64 v)
        bch2_pr_units(out, v, v << 9);
 }
 
+void bch2_pr_time_units(struct printbuf *, u64);
+
 #ifdef __KERNEL__
 static inline void pr_time(struct printbuf *out, u64 time)
 {