bcachefs: Do updates in order they were queued up in
authorKent Overstreet <kent.overstreet@gmail.com>
Sat, 7 Sep 2019 18:16:00 +0000 (14:16 -0400)
committerKent Overstreet <kent.overstreet@linux.dev>
Sun, 22 Oct 2023 21:08:26 +0000 (17:08 -0400)
Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
fs/bcachefs/btree_iter.c
fs/bcachefs/btree_iter.h
fs/bcachefs/btree_types.h
fs/bcachefs/btree_update.h
fs/bcachefs/btree_update_leaf.c
fs/bcachefs/buckets.c
fs/bcachefs/super.c

index a278921d3e6f3468ca07d53f3b65d478acedf426..17596aee23ccf8b9d5114a7b1fffd75cd6d5733c 100644 (file)
@@ -1671,7 +1671,10 @@ int bch2_trans_iter_free_on_commit(struct btree_trans *trans,
 static int bch2_trans_realloc_iters(struct btree_trans *trans,
                                    unsigned new_size)
 {
-       void *new_iters, *new_updates;
+       void *new_iters, *new_updates, *new_sorted;
+       size_t iters_bytes;
+       size_t updates_bytes;
+       size_t sorted_bytes;
 
        new_size = roundup_pow_of_two(new_size);
 
@@ -1684,9 +1687,13 @@ static int bch2_trans_realloc_iters(struct btree_trans *trans,
 
        bch2_trans_unlock(trans);
 
-       new_iters = kmalloc(sizeof(struct btree_iter) * new_size +
-                           sizeof(struct btree_insert_entry) * (new_size + 4),
-                           GFP_NOFS);
+       iters_bytes     = sizeof(struct btree_iter) * new_size;
+       updates_bytes   = sizeof(struct btree_insert_entry) * (new_size + 4);
+       sorted_bytes    = sizeof(u8) * (new_size + 4);
+
+       new_iters = kmalloc(iters_bytes +
+                           updates_bytes +
+                           sorted_bytes, GFP_NOFS);
        if (new_iters)
                goto success;
 
@@ -1695,7 +1702,8 @@ static int bch2_trans_realloc_iters(struct btree_trans *trans,
 
        trans->used_mempool = true;
 success:
-       new_updates = new_iters + sizeof(struct btree_iter) * new_size;
+       new_updates     = new_iters + iters_bytes;
+       new_sorted      = new_updates + updates_bytes;
 
        memcpy(new_iters, trans->iters,
               sizeof(struct btree_iter) * trans->nr_iters);
@@ -1710,9 +1718,10 @@ success:
        if (trans->iters != trans->iters_onstack)
                kfree(trans->iters);
 
-       trans->iters    = new_iters;
-       trans->updates  = new_updates;
-       trans->size     = new_size;
+       trans->iters            = new_iters;
+       trans->updates          = new_updates;
+       trans->updates_sorted   = new_sorted;
+       trans->size             = new_size;
 
        if (trans->iters_live) {
                trace_trans_restart_iters_realloced(trans->ip, trans->size);
@@ -1958,6 +1967,7 @@ void bch2_trans_init(struct btree_trans *trans, struct bch_fs *c,
        trans->size             = ARRAY_SIZE(trans->iters_onstack);
        trans->iters            = trans->iters_onstack;
        trans->updates          = trans->updates_onstack;
+       trans->updates_sorted   = trans->updates_sorted_onstack;
        trans->fs_usage_deltas  = NULL;
 
        if (expected_nr_iters > trans->size)
@@ -1982,3 +1992,18 @@ int bch2_trans_exit(struct btree_trans *trans)
 
        return trans->error ? -EIO : 0;
 }
+
+void bch2_fs_btree_iter_exit(struct bch_fs *c)
+{
+       mempool_exit(&c->btree_iters_pool);
+}
+
+int bch2_fs_btree_iter_init(struct bch_fs *c)
+{
+       unsigned nr = BTREE_ITER_MAX;
+
+       return mempool_init_kmalloc_pool(&c->btree_iters_pool, 1,
+                       sizeof(struct btree_iter) * nr +
+                       sizeof(struct btree_insert_entry) * (nr + 4) +
+                       sizeof(u8) * (nr + 4));
+}
index b54351073231c430aa7e6822593919eaef930084..b52d8bff011552582e767043b4dbce3c6220c75b 100644 (file)
@@ -303,4 +303,7 @@ void *bch2_trans_kmalloc(struct btree_trans *, size_t);
 void bch2_trans_init(struct btree_trans *, struct bch_fs *, unsigned, size_t);
 int bch2_trans_exit(struct btree_trans *);
 
+void bch2_fs_btree_iter_exit(struct bch_fs *);
+int bch2_fs_btree_iter_init(struct bch_fs *);
+
 #endif /* _BCACHEFS_BTREE_ITER_H */
index 621cbfa22fc9e3ab839cc9108c449656d4d0005a..88e048fa0fbabd6d9448e204dd5497fac3b3a307 100644 (file)
@@ -264,8 +264,6 @@ struct btree_insert_entry {
        };
 
        bool                    deferred;
-       bool                    triggered;
-       bool                    marked;
 };
 
 #define BTREE_ITER_MAX         64
@@ -294,6 +292,7 @@ struct btree_trans {
 
        struct btree_iter       *iters;
        struct btree_insert_entry *updates;
+       u8                      *updates_sorted;
 
        /* update path: */
        struct journal_res      journal_res;
@@ -305,6 +304,7 @@ struct btree_trans {
 
        struct btree_iter       iters_onstack[2];
        struct btree_insert_entry updates_onstack[6];
+       u8                      updates_sorted_onstack[6];
 
        struct replicas_delta_list *fs_usage_deltas;
 };
index 08c17477e76cbe57ae7ab2a0e5914f7faace77bd..36e34b3d9213298fba90e02823f67bb4d0d33221 100644 (file)
@@ -119,8 +119,13 @@ int bch2_trans_commit(struct btree_trans *,
                      struct disk_reservation *,
                      u64 *, unsigned);
 
-struct btree_insert_entry *bch2_trans_update(struct btree_trans *,
-                                            struct btree_insert_entry);
+static inline void bch2_trans_update(struct btree_trans *trans,
+                                    struct btree_insert_entry entry)
+{
+       EBUG_ON(trans->nr_updates >= trans->nr_iters + 4);
+
+       trans->updates[trans->nr_updates++] = entry;
+}
 
 #define bch2_trans_do(_c, _journal_seq, _flags, _do)                   \
 ({                                                                     \
@@ -140,18 +145,6 @@ struct btree_insert_entry *bch2_trans_update(struct btree_trans *,
        _ret;                                                           \
 })
 
-/*
- * We sort transaction entries so that if multiple iterators point to the same
- * leaf node they'll be adjacent:
- */
-static inline bool same_leaf_as_prev(struct btree_trans *trans,
-                                    struct btree_insert_entry *i)
-{
-       return i != trans->updates &&
-               !i->deferred &&
-               i[0].iter->l[0].b == i[-1].iter->l[0].b;
-}
-
 #define __trans_next_update(_trans, _i, _filter)                       \
 ({                                                                     \
        while ((_i) < (_trans)->updates + (_trans->nr_updates) && !(_filter))\
@@ -171,8 +164,4 @@ static inline bool same_leaf_as_prev(struct btree_trans *trans,
 #define trans_for_each_update_iter(trans, i)                           \
        __trans_for_each_update(trans, i, !(i)->deferred)
 
-#define trans_for_each_update_leaf(trans, i)                           \
-       __trans_for_each_update(trans, i, !(i)->deferred &&             \
-                              !same_leaf_as_prev(trans, i))
-
 #endif /* _BCACHEFS_BTREE_UPDATE_H */
index 66b12e55d9465fede8b799146aa97083c1920f3b..657359059a087835516db1dd9ab4f61d12fbf196 100644 (file)
 
 #include <linux/sort.h>
 
+static inline bool same_leaf_as_prev(struct btree_trans *trans,
+                                    unsigned sorted_idx)
+{
+       struct btree_insert_entry *i = trans->updates +
+               trans->updates_sorted[sorted_idx];
+       struct btree_insert_entry *prev = sorted_idx
+               ? trans->updates + trans->updates_sorted[sorted_idx - 1]
+               : NULL;
+
+       return !i->deferred &&
+               prev &&
+               i->iter->l[0].b == prev->iter->l[0].b;
+}
+
+#define trans_for_each_update_sorted(_trans, _i, _iter)                        \
+       for (iter = 0;                                                  \
+            _iter < _trans->nr_updates &&                              \
+            (_i = _trans->updates + _trans->updates_sorted[_iter], 1); \
+            _iter++)
+
 inline void bch2_btree_node_lock_for_insert(struct bch_fs *c, struct btree *b,
                                            struct btree_iter *iter)
 {
@@ -36,20 +56,21 @@ inline void bch2_btree_node_lock_for_insert(struct bch_fs *c, struct btree *b,
                bch2_btree_init_next(c, b, iter);
 }
 
-static void btree_trans_lock_write(struct bch_fs *c, struct btree_trans *trans)
+static void btree_trans_lock_write(struct btree_trans *trans, bool lock)
 {
+       struct bch_fs *c = trans->c;
        struct btree_insert_entry *i;
+       unsigned iter;
 
-       trans_for_each_update_leaf(trans, i)
-               bch2_btree_node_lock_for_insert(c, i->iter->l[0].b, i->iter);
-}
-
-static void btree_trans_unlock_write(struct btree_trans *trans)
-{
-       struct btree_insert_entry *i;
+       trans_for_each_update_sorted(trans, i, iter) {
+               if (same_leaf_as_prev(trans, iter))
+                       continue;
 
-       trans_for_each_update_leaf(trans, i)
-               bch2_btree_node_unlock_write(i->iter->l[0].b, i->iter);
+               if (lock)
+                       bch2_btree_node_lock_for_insert(c, i->iter->l[0].b, i->iter);
+               else
+                       bch2_btree_node_unlock_write(i->iter->l[0].b, i->iter);
+       }
 }
 
 static inline int btree_trans_cmp(struct btree_insert_entry l,
@@ -59,6 +80,30 @@ static inline int btree_trans_cmp(struct btree_insert_entry l,
                btree_iter_cmp(l.iter, r.iter);
 }
 
+static inline void btree_trans_sort_updates(struct btree_trans *trans)
+{
+       struct btree_insert_entry *l, *r;
+       unsigned nr = 0, pos;
+
+       trans_for_each_update(trans, l) {
+               for (pos = 0; pos < nr; pos++) {
+                       r = trans->updates + trans->updates_sorted[pos];
+
+                       if (btree_trans_cmp(*l, *r) <= 0)
+                               break;
+               }
+
+               memmove(&trans->updates_sorted[pos + 1],
+                       &trans->updates_sorted[pos],
+                       (nr - pos) * sizeof(trans->updates_sorted[0]));
+
+               trans->updates_sorted[pos] = l - trans->updates;
+               nr++;
+       }
+
+       BUG_ON(nr != trans->nr_updates);
+}
+
 /* Inserting into a given leaf node (last stage of insert): */
 
 /* Handle overwrites and do insert, for non extents: */
@@ -488,12 +533,12 @@ static int btree_trans_check_can_insert(struct btree_trans *trans,
                                        struct btree_insert_entry **stopped_at)
 {
        struct btree_insert_entry *i;
-       unsigned u64s = 0;
+       unsigned iter, u64s = 0;
        int ret;
 
-       trans_for_each_update_iter(trans, i) {
+       trans_for_each_update_sorted(trans, i, iter) {
                /* Multiple inserts might go to same leaf: */
-               if (!same_leaf_as_prev(trans, i))
+               if (!same_leaf_as_prev(trans, iter))
                        u64s = 0;
 
                u64s += i->k->k.u64s;
@@ -542,7 +587,6 @@ static inline int do_btree_insert_at(struct btree_trans *trans,
        struct bch_fs *c = trans->c;
        struct bch_fs_usage_online *fs_usage = NULL;
        struct btree_insert_entry *i;
-       bool saw_non_marked;
        unsigned mark_flags = trans->flags & BTREE_INSERT_BUCKET_INVALIDATE
                ? BCH_BUCKET_MARK_BUCKET_INVALIDATE
                : 0;
@@ -551,35 +595,31 @@ static inline int do_btree_insert_at(struct btree_trans *trans,
        trans_for_each_update_iter(trans, i)
                BUG_ON(i->iter->uptodate >= BTREE_ITER_NEED_RELOCK);
 
+       /*
+        * note: running triggers will append more updates to the list of
+        * updates as we're walking it:
+        */
        trans_for_each_update_iter(trans, i)
-               i->marked = false;
-
-       do {
-               saw_non_marked = false;
-
-               trans_for_each_update_iter(trans, i) {
-                       if (i->marked)
-                               continue;
-
-                       saw_non_marked = true;
-                       i->marked = true;
-
-                       if (update_has_triggers(trans, i) &&
-                           update_triggers_transactional(trans, i)) {
-                               ret = bch2_trans_mark_update(trans, i->iter, i->k);
-                               if (ret == -EINTR)
-                                       trace_trans_restart_mark(trans->ip);
-                               if (ret)
-                                       goto out_clear_replicas;
-                       }
+               if (update_has_triggers(trans, i) &&
+                   update_triggers_transactional(trans, i)) {
+                       ret = bch2_trans_mark_update(trans, i->iter, i->k);
+                       if (ret == -EINTR)
+                               trace_trans_restart_mark(trans->ip);
+                       if (ret)
+                               goto out_clear_replicas;
                }
-       } while (saw_non_marked);
 
        trans_for_each_update(trans, i)
                btree_insert_entry_checks(trans, i);
        bch2_btree_trans_verify_locks(trans);
 
-       btree_trans_lock_write(c, trans);
+       /*
+        * No more updates can be added - sort updates so we can take write
+        * locks in the correct order:
+        */
+       btree_trans_sort_updates(trans);
+
+       btree_trans_lock_write(trans, true);
 
        if (race_fault()) {
                ret = -EINTR;
@@ -597,8 +637,7 @@ static inline int do_btree_insert_at(struct btree_trans *trans,
                goto out;
 
        trans_for_each_update_iter(trans, i) {
-               if (i->deferred ||
-                   !btree_node_type_needs_gc(i->iter->btree_id))
+               if (!btree_node_type_needs_gc(i->iter->btree_id))
                        continue;
 
                if (!fs_usage) {
@@ -664,7 +703,7 @@ out:
               (trans->flags & BTREE_INSERT_JOURNAL_RESERVED) &&
               trans->journal_res.ref);
 
-       btree_trans_unlock_write(trans);
+       btree_trans_lock_write(trans, false);
 
        if (fs_usage) {
                bch2_fs_usage_scratch_put(c, fs_usage);
@@ -689,19 +728,6 @@ int bch2_trans_commit_error(struct btree_trans *trans,
 {
        struct bch_fs *c = trans->c;
        unsigned flags = trans->flags;
-       struct btree_insert_entry *src, *dst;
-
-       src = dst = trans->updates;
-
-       while (src < trans->updates + trans->nr_updates) {
-               if (!src->triggered) {
-                       *dst = *src;
-                       dst++;
-               }
-               src++;
-       }
-
-       trans->nr_updates = dst - trans->updates;
 
        /*
         * BTREE_INSERT_NOUNLOCK means don't unlock _after_ successful btree
@@ -816,6 +842,7 @@ static int __bch2_trans_commit(struct btree_trans *trans,
 {
        struct bch_fs *c = trans->c;
        struct btree_insert_entry *i;
+       unsigned iter;
        int ret;
 
        trans_for_each_update_iter(trans, i) {
@@ -837,8 +864,10 @@ static int __bch2_trans_commit(struct btree_trans *trans,
        if (trans->flags & BTREE_INSERT_NOUNLOCK)
                trans->nounlock = true;
 
-       trans_for_each_update_leaf(trans, i)
-               bch2_foreground_maybe_merge(c, i->iter, 0, trans->flags);
+       trans_for_each_update_sorted(trans, i, iter)
+               if (!same_leaf_as_prev(trans, iter))
+                       bch2_foreground_maybe_merge(c, i->iter,
+                                                   0, trans->flags);
 
        trans->nounlock = false;
 
@@ -858,7 +887,8 @@ int bch2_trans_commit(struct btree_trans *trans,
 {
        struct bch_fs *c = trans->c;
        struct btree_insert_entry *i = NULL;
-       unsigned orig_mem_top = trans->mem_top;
+       unsigned orig_nr_updates        = trans->nr_updates;
+       unsigned orig_mem_top           = trans->mem_top;
        int ret = 0;
 
        if (!trans->nr_updates)
@@ -931,39 +961,20 @@ out_noupdates:
 err:
        ret = bch2_trans_commit_error(trans, i, ret);
 
+       /* free updates and memory used by triggers, they'll be reexecuted: */
+       trans->nr_updates       = orig_nr_updates;
+       trans->mem_top          = orig_mem_top;
+
        /* can't loop if it was passed in and we changed it: */
        if (unlikely(trans->flags & BTREE_INSERT_NO_CLEAR_REPLICAS) && !ret)
                ret = -EINTR;
 
-       if (!ret) {
-               /* free memory used by triggers, they'll be reexecuted: */
-               trans->mem_top = orig_mem_top;
+       if (!ret)
                goto retry;
-       }
 
        goto out;
 }
 
-struct btree_insert_entry *bch2_trans_update(struct btree_trans *trans,
-                                            struct btree_insert_entry entry)
-{
-       struct btree_insert_entry *i;
-
-       BUG_ON(trans->nr_updates >= trans->nr_iters + 4);
-
-       for (i = trans->updates;
-            i < trans->updates + trans->nr_updates;
-            i++)
-               if (btree_trans_cmp(entry, *i) < 0)
-                       break;
-
-       memmove(&i[1], &i[0],
-               (void *) &trans->updates[trans->nr_updates] - (void *) i);
-       trans->nr_updates++;
-       *i = entry;
-       return i;
-}
-
 /**
  * bch2_btree_insert - insert keys into the extent btree
  * @c:                 pointer to struct bch_fs
index e432a450708125adc63aad49cff0102bbf8a2c92..b6340a2f6debad7c5c1093ef76d6c9f7ddc9c889 100644 (file)
@@ -1358,11 +1358,8 @@ static int trans_get_key(struct btree_trans *trans,
        struct btree_insert_entry *i;
        int ret;
 
-       for (i = trans->updates;
-            i < trans->updates + trans->nr_updates;
-            i++)
-               if (!i->deferred &&
-                   i->iter->btree_id == btree_id &&
+       trans_for_each_update_iter(trans, i)
+               if (i->iter->btree_id == btree_id &&
                    (btree_node_type_is_extents(btree_id)
                     ? bkey_cmp(pos, bkey_start_pos(&i->k->k)) >= 0 &&
                       bkey_cmp(pos, i->k->k.p) < 0
@@ -1390,8 +1387,8 @@ static void *trans_update_key(struct btree_trans *trans,
                              struct btree_iter *iter,
                              unsigned u64s)
 {
+       struct btree_insert_entry *i;
        struct bkey_i *new_k;
-       unsigned i;
 
        new_k = bch2_trans_kmalloc(trans, u64s * sizeof(u64));
        if (IS_ERR(new_k))
@@ -1400,19 +1397,13 @@ static void *trans_update_key(struct btree_trans *trans,
        bkey_init(&new_k->k);
        new_k->k.p = iter->pos;
 
-       for (i = 0; i < trans->nr_updates; i++)
-               if (!trans->updates[i].deferred &&
-                   trans->updates[i].iter == iter) {
-                       trans->updates[i].k = new_k;
+       trans_for_each_update_iter(trans, i)
+               if (i->iter == iter) {
+                       i->k = new_k;
                        return new_k;
                }
 
-       bch2_trans_update(trans, ((struct btree_insert_entry) {
-               .iter = iter,
-               .k = new_k,
-               .triggered = true,
-       }));
-
+       bch2_trans_update(trans, BTREE_INSERT_ENTRY(iter, new_k));
        return new_k;
 }
 
index 202c0b443ef48bd821d79ab11b29ae89ff1609e6..14e2f6828cc6f6b8a1899845fd59b935f68d78cc 100644 (file)
@@ -462,6 +462,7 @@ static void bch2_fs_free(struct bch_fs *c)
        bch2_fs_ec_exit(c);
        bch2_fs_encryption_exit(c);
        bch2_fs_io_exit(c);
+       bch2_fs_btree_iter_exit(c);
        bch2_fs_btree_cache_exit(c);
        bch2_fs_journal_exit(&c->journal);
        bch2_io_clock_exit(&c->io_clock[WRITE]);
@@ -474,7 +475,6 @@ static void bch2_fs_free(struct bch_fs *c)
        free_percpu(c->usage[0]);
        kfree(c->usage_base);
        free_percpu(c->pcpu);
-       mempool_exit(&c->btree_iters_pool);
        mempool_exit(&c->btree_bounce_pool);
        bioset_exit(&c->btree_bio);
        mempool_exit(&c->btree_interior_update_pool);
@@ -729,15 +729,12 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
            !(c->online_reserved = alloc_percpu(u64)) ||
            mempool_init_kvpmalloc_pool(&c->btree_bounce_pool, 1,
                                        btree_bytes(c)) ||
-           mempool_init_kmalloc_pool(&c->btree_iters_pool, 1,
-                       sizeof(struct btree_iter) * BTREE_ITER_MAX +
-                       sizeof(struct btree_insert_entry) *
-                       (BTREE_ITER_MAX + 4)) ||
            bch2_io_clock_init(&c->io_clock[READ]) ||
            bch2_io_clock_init(&c->io_clock[WRITE]) ||
            bch2_fs_journal_init(&c->journal) ||
            bch2_fs_replicas_init(c) ||
            bch2_fs_btree_cache_init(c) ||
+           bch2_fs_btree_iter_init(c) ||
            bch2_fs_io_init(c) ||
            bch2_fs_encryption_init(c) ||
            bch2_fs_compress_init(c) ||