bcachefs: Keep a sorted list of btree iterators
authorKent Overstreet <kent.overstreet@gmail.com>
Sat, 12 Jun 2021 19:45:45 +0000 (15:45 -0400)
committerKent Overstreet <kent.overstreet@linux.dev>
Sun, 22 Oct 2023 21:09:10 +0000 (17:09 -0400)
This will be used to make other operations on btree iterators within a
transaction more efficient, and enable some other improvements to how we
manage btree iterators.

Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
fs/bcachefs/btree_iter.c
fs/bcachefs/btree_iter.h
fs/bcachefs/btree_types.h
fs/bcachefs/util.h

index d1a03fdba9ce13241718728c93a48fea3f1b0f3d..c14be80931166c87536bba85c02c2ed512710011 100644 (file)
 #include <linux/prefetch.h>
 
 static void btree_iter_set_search_pos(struct btree_iter *, struct bpos);
+static inline void btree_trans_sort_iters(struct btree_trans *);
 static struct btree_iter *btree_iter_child_alloc(struct btree_iter *, unsigned long);
-static struct btree_iter *btree_trans_iter_alloc(struct btree_trans *);
+static struct btree_iter *btree_trans_iter_alloc(struct btree_trans *,
+                                                struct btree_iter *);
 static void btree_iter_copy(struct btree_iter *, struct btree_iter *);
 
+static inline int btree_iter_cmp(const struct btree_iter *l,
+                                const struct btree_iter *r)
+{
+       return   cmp_int(l->btree_id, r->btree_id) ?:
+               -cmp_int(btree_iter_is_cached(l), btree_iter_is_cached(r)) ?:
+                bkey_cmp(l->real_pos, r->real_pos);
+}
+
 static inline struct bpos bkey_successor(struct btree_iter *iter, struct bpos p)
 {
        EBUG_ON(btree_iter_type(iter) == BTREE_ITER_NODES);
@@ -925,6 +935,7 @@ static inline struct bkey_s_c btree_iter_level_peek(struct btree_iter *iter,
                        bch2_btree_node_iter_peek(&l->iter, l->b));
 
        iter->real_pos = k.k ? k.k->p : l->b->key.k.p;
+       iter->trans->iters_sorted = false;
        return k;
 }
 
@@ -935,6 +946,7 @@ static inline struct bkey_s_c btree_iter_level_prev(struct btree_iter *iter,
                        bch2_btree_node_iter_prev(&l->iter, l->b));
 
        iter->real_pos = k.k ? k.k->p : l->b->data->min_key;
+       iter->trans->iters_sorted = false;
        return k;
 }
 
@@ -1264,9 +1276,8 @@ static int __btree_iter_traverse_all(struct btree_trans *trans, int ret,
                                     unsigned long trace_ip)
 {
        struct bch_fs *c = trans->c;
-       struct btree_iter *iter;
-       u8 sorted[BTREE_ITER_MAX];
-       int i, nr_sorted = 0;
+       struct btree_iter *iter, *prev = NULL;
+       int i;
 
        if (trans->in_traverse_all)
                return -EINTR;
@@ -1275,28 +1286,21 @@ static int __btree_iter_traverse_all(struct btree_trans *trans, int ret,
 retry_all:
        trans->restarted = false;
 
-       nr_sorted = 0;
-
-       trans_for_each_iter(trans, iter) {
-               sorted[nr_sorted++] = iter->idx;
+       trans_for_each_iter(trans, iter)
                iter->should_be_locked = false;
-       }
-
-#define btree_iter_cmp_by_idx(_l, _r)                          \
-               btree_iter_lock_cmp(&trans->iters[_l], &trans->iters[_r])
 
-       bubble_sort(sorted, nr_sorted, btree_iter_cmp_by_idx);
-#undef btree_iter_cmp_by_idx
+       btree_trans_sort_iters(trans);
 
-       for (i = nr_sorted - 2; i >= 0; --i) {
-               struct btree_iter *iter1 = trans->iters + sorted[i];
-               struct btree_iter *iter2 = trans->iters + sorted[i + 1];
+       trans_for_each_iter_inorder_reverse(trans, iter, i) {
+               if (prev) {
+                       if (iter->btree_id == prev->btree_id &&
+                           iter->locks_want < prev->locks_want)
+                               __bch2_btree_iter_upgrade(iter, prev->locks_want);
+                       else if (!iter->locks_want && prev->locks_want)
+                               __bch2_btree_iter_upgrade(iter, 1);
+               }
 
-               if (iter1->btree_id == iter2->btree_id &&
-                   iter1->locks_want < iter2->locks_want)
-                       __bch2_btree_iter_upgrade(iter1, iter2->locks_want);
-               else if (!iter1->locks_want && iter2->locks_want)
-                       __bch2_btree_iter_upgrade(iter1, 1);
+               prev = iter;
        }
 
        bch2_trans_unlock(trans);
@@ -1321,20 +1325,29 @@ retry_all:
        BUG_ON(ret && ret != -EINTR);
 
        /* Now, redo traversals in correct order: */
-       for (i = 0; i < nr_sorted; i++) {
-               unsigned idx = sorted[i];
+       i = 0;
+       while (i < trans->nr_sorted) {
+               iter = trans->iters + trans->sorted[i];
 
-               /*
-                * sucessfully traversing one iterator can cause another to be
-                * unlinked, in btree_key_cache_fill()
-                */
-               if (!(trans->iters_linked & (1ULL << idx)))
-                       continue;
+               EBUG_ON(!(trans->iters_linked & (1ULL << iter->idx)));
 
-               ret = btree_iter_traverse_one(&trans->iters[idx], _THIS_IP_);
+               ret = btree_iter_traverse_one(iter, _THIS_IP_);
                if (ret)
                        goto retry_all;
+
+               EBUG_ON(!(trans->iters_linked & (1ULL << iter->idx)));
+
+               if (iter->nodes_locked)
+                       i++;
        }
+
+       /*
+        * BTREE_ITER_NEED_RELOCK is ok here - if we called bch2_trans_unlock()
+        * and relock(), relock() won't relock since iter->should_be_locked
+        * isn't set yet, which is all fine
+        */
+       trans_for_each_iter(trans, iter)
+               BUG_ON(iter->uptodate >= BTREE_ITER_NEED_TRAVERSE);
 out:
        bch2_btree_cache_cannibalize_unlock(c);
 
@@ -1536,6 +1549,7 @@ struct btree *bch2_btree_iter_peek_node(struct btree_iter *iter)
        BUG_ON(bpos_cmp(b->key.k.p, iter->pos) < 0);
 
        iter->pos = iter->real_pos = b->key.k.p;
+       iter->trans->iters_sorted = false;
 
        bch2_btree_iter_verify(iter);
        iter->should_be_locked = true;
@@ -1593,6 +1607,7 @@ struct btree *bch2_btree_iter_next_node(struct btree_iter *iter)
        }
 
        iter->pos = iter->real_pos = b->key.k.p;
+       iter->trans->iters_sorted = false;
 
        bch2_btree_iter_verify(iter);
        iter->should_be_locked = true;
@@ -1617,6 +1632,7 @@ static void btree_iter_set_search_pos(struct btree_iter *iter, struct bpos new_p
 
        iter->real_pos = new_pos;
        iter->should_be_locked = false;
+       iter->trans->iters_sorted = false;
 
        if (unlikely(btree_iter_type(iter) == BTREE_ITER_CACHED)) {
                btree_node_unlock(iter, 0);
@@ -2032,6 +2048,135 @@ static inline void bch2_btree_iter_init(struct btree_trans *trans,
 
 /* new transactional stuff: */
 
+#ifdef CONFIG_BCACHEFS_DEBUG
+static void btree_trans_verify_sorted_refs(struct btree_trans *trans)
+{
+       struct btree_iter *iter;
+       unsigned i;
+
+       BUG_ON(trans->nr_sorted != hweight64(trans->iters_linked));
+
+       trans_for_each_iter(trans, iter) {
+               BUG_ON(iter->sorted_idx >= trans->nr_sorted);
+               BUG_ON(trans->sorted[iter->sorted_idx] != iter->idx);
+       }
+
+       for (i = 0; i < trans->nr_sorted; i++) {
+               unsigned idx = trans->sorted[i];
+
+               EBUG_ON(!(trans->iters_linked & (1ULL << idx)));
+               BUG_ON(trans->iters[idx].sorted_idx != i);
+       }
+}
+#else
+static inline void btree_trans_verify_sorted_refs(struct btree_trans *trans) {}
+#endif
+
+static void btree_trans_verify_sorted(struct btree_trans *trans)
+{
+#ifdef CONFIG_BCACHEFS_DEBUG
+       struct btree_iter *iter, *prev = NULL;
+       unsigned i;
+
+       trans_for_each_iter_inorder(trans, iter, i) {
+               BUG_ON(prev && btree_iter_cmp(prev, iter) > 0);
+               prev = iter;
+       }
+#endif
+}
+
+static noinline void __btree_trans_sort_iters(struct btree_trans *trans)
+{
+       int i, l = 0, r = trans->nr_sorted, inc = 1;
+       bool swapped;
+
+       /*
+        * Cocktail shaker sort: this is efficient because iterators will be
+        * mostly sorted.
+        */
+       do {
+               swapped = false;
+
+               for (i = inc > 0 ? l : r - 2;
+                    i + 1 < r && i >= l;
+                    i += inc) {
+                       if (btree_iter_cmp(trans->iters + trans->sorted[i],
+                                          trans->iters + trans->sorted[i + 1]) > 0) {
+                               swap(trans->sorted[i], trans->sorted[i + 1]);
+                               trans->iters[trans->sorted[i]].sorted_idx = i;
+                               trans->iters[trans->sorted[i + 1]].sorted_idx = i + 1;
+                               swapped = true;
+                       }
+               }
+
+               if (inc > 0)
+                       --r;
+               else
+                       l++;
+               inc = -inc;
+       } while (swapped);
+
+       trans->iters_sorted = true;
+
+       btree_trans_verify_sorted(trans);
+}
+
+static inline void btree_trans_sort_iters(struct btree_trans *trans)
+{
+       btree_trans_verify_sorted_refs(trans);
+
+       if (trans->iters_sorted) {
+               if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG))
+                       btree_trans_verify_sorted(trans);
+               return;
+       }
+       __btree_trans_sort_iters(trans);
+}
+
+static inline void btree_iter_list_remove(struct btree_trans *trans,
+                                         struct btree_iter *iter)
+{
+       unsigned i;
+
+       EBUG_ON(iter->sorted_idx >= trans->nr_sorted);
+#ifdef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS
+       trans->nr_sorted--;
+       memmove_u64s_down_small(trans->sorted + iter->sorted_idx,
+                               trans->sorted + iter->sorted_idx + 1,
+                               DIV_ROUND_UP(trans->nr_sorted - iter->sorted_idx, 8));
+#else
+       array_remove_item(trans->sorted, trans->nr_sorted, iter->sorted_idx);
+#endif
+       for (i = iter->sorted_idx; i < trans->nr_sorted; i++)
+               trans->iters[trans->sorted[i]].sorted_idx = i;
+
+       iter->sorted_idx = U8_MAX;
+}
+
+static inline void btree_iter_list_add(struct btree_trans *trans,
+                                      struct btree_iter *pos,
+                                      struct btree_iter *iter)
+{
+       unsigned i;
+
+       iter->sorted_idx = pos ? pos->sorted_idx + 1 : 0;
+
+#ifdef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS
+       memmove_u64s_up_small(trans->sorted + iter->sorted_idx + 1,
+                             trans->sorted + iter->sorted_idx,
+                             DIV_ROUND_UP(trans->nr_sorted - iter->sorted_idx, 8));
+       trans->nr_sorted++;
+       trans->sorted[iter->sorted_idx] = iter->idx;
+#else
+       array_insert_item(trans->sorted, trans->nr_sorted, iter->sorted_idx, iter->idx);
+#endif
+
+       for (i = iter->sorted_idx; i < trans->nr_sorted; i++)
+               trans->iters[trans->sorted[i]].sorted_idx = i;
+
+       btree_trans_verify_sorted_refs(trans);
+}
+
 static void btree_iter_child_free(struct btree_iter *iter)
 {
        struct btree_iter *child = btree_iter_child(iter);
@@ -2049,7 +2194,7 @@ static struct btree_iter *btree_iter_child_alloc(struct btree_iter *iter,
        struct btree_iter *child = btree_iter_child(iter);
 
        if (!child) {
-               child = btree_trans_iter_alloc(trans);
+               child = btree_trans_iter_alloc(trans, iter);
                child->ip_allocated     = ip;
                iter->child_idx         = child->idx;
 
@@ -2065,10 +2210,14 @@ static inline void __bch2_trans_iter_free(struct btree_trans *trans,
 {
        btree_iter_child_free(&trans->iters[idx]);
 
+       btree_iter_list_remove(trans, &trans->iters[idx]);
+
        __bch2_btree_iter_unlock(&trans->iters[idx]);
        trans->iters_linked             &= ~(1ULL << idx);
        trans->iters_live               &= ~(1ULL << idx);
        trans->iters_touched            &= ~(1ULL << idx);
+
+       btree_trans_verify_sorted_refs(trans);
 }
 
 int bch2_trans_iter_put(struct btree_trans *trans,
@@ -2109,12 +2258,15 @@ static void btree_trans_iter_alloc_fail(struct btree_trans *trans)
 
        struct btree_iter *iter;
        struct btree_insert_entry *i;
+       unsigned idx;
        char buf[100];
 
-       trans_for_each_iter(trans, iter)
+       btree_trans_sort_iters(trans);
+
+       trans_for_each_iter_inorder(trans, iter, idx)
                printk(KERN_ERR "iter: btree %s pos %s%s%s%s %pS\n",
                       bch2_btree_ids[iter->btree_id],
-                      (bch2_bpos_to_text(&PBUF(buf), iter->pos), buf),
+                      (bch2_bpos_to_text(&PBUF(buf), iter->real_pos), buf),
                       btree_iter_live(trans, iter) ? " live" : "",
                       (trans->iters_touched & (1ULL << iter->idx)) ? " touched" : "",
                       iter->flags & BTREE_ITER_KEEP_UNTIL_COMMIT ? " keep" : "",
@@ -2130,11 +2282,14 @@ static void btree_trans_iter_alloc_fail(struct btree_trans *trans)
        panic("trans iter oveflow\n");
 }
 
-static struct btree_iter *btree_trans_iter_alloc(struct btree_trans *trans)
+static struct btree_iter *btree_trans_iter_alloc(struct btree_trans *trans,
+                                                struct btree_iter *pos)
 {
        struct btree_iter *iter;
        unsigned idx;
 
+       btree_trans_verify_sorted_refs(trans);
+
        if (unlikely(trans->iters_linked ==
                     ~((~0ULL << 1) << (BTREE_ITER_MAX - 1))))
                btree_trans_iter_alloc_fail(trans);
@@ -2145,10 +2300,13 @@ static struct btree_iter *btree_trans_iter_alloc(struct btree_trans *trans)
        iter->trans             = trans;
        iter->idx               = idx;
        iter->child_idx         = U8_MAX;
+       iter->sorted_idx        = U8_MAX;
        iter->flags             = 0;
        iter->nodes_locked      = 0;
        iter->nodes_intent_locked = 0;
        trans->iters_linked     |= 1ULL << idx;
+
+       btree_iter_list_add(trans, pos, iter);
        return iter;
 }
 
@@ -2170,6 +2328,7 @@ static void btree_iter_copy(struct btree_iter *dst, struct btree_iter *src)
 
        dst->flags &= ~BTREE_ITER_KEEP_UNTIL_COMMIT;
        dst->flags &= ~BTREE_ITER_SET_POS_AFTER_COMMIT;
+       dst->trans->iters_sorted = false;
 }
 
 struct btree_iter *__bch2_trans_get_iter(struct btree_trans *trans,
@@ -2223,10 +2382,10 @@ struct btree_iter *__bch2_trans_get_iter(struct btree_trans *trans,
        }
 
        if (!best) {
-               iter = btree_trans_iter_alloc(trans);
+               iter = btree_trans_iter_alloc(trans, best);
                bch2_btree_iter_init(trans, iter, btree_id);
        } else if (btree_iter_keep(trans, best)) {
-               iter = btree_trans_iter_alloc(trans);
+               iter = btree_trans_iter_alloc(trans, best);
                btree_iter_copy(iter, best);
        } else {
                iter = best;
@@ -2307,7 +2466,7 @@ struct btree_iter *__bch2_trans_copy_iter(struct btree_trans *trans,
 {
        struct btree_iter *iter;
 
-       iter = btree_trans_iter_alloc(trans);
+       iter = btree_trans_iter_alloc(trans, src);
        btree_iter_copy(iter, src);
 
        trans->iters_live |= 1ULL << iter->idx;
index aeabc07d2c9ca5b2f7e29352511f8906b25b6959..6fb0cb8252eb67d191c92a8d441807f057f03fc3 100644 (file)
@@ -71,6 +71,36 @@ __trans_next_iter(struct btree_trans *trans, unsigned idx)
             (_iter);                                                   \
             _iter = __trans_next_iter((_trans), (_iter)->idx + 1))
 
+static inline struct btree_iter *next_btree_iter(struct btree_trans *trans, struct btree_iter *iter)
+{
+       unsigned idx = iter ? iter->sorted_idx + 1 : 0;
+
+       EBUG_ON(idx > trans->nr_sorted);
+
+       return idx < trans->nr_sorted
+               ? trans->iters + trans->sorted[idx]
+               : NULL;
+}
+
+static inline struct btree_iter *prev_btree_iter(struct btree_trans *trans, struct btree_iter *iter)
+{
+       unsigned idx = iter ? iter->sorted_idx : trans->nr_sorted;
+
+       return idx
+               ? trans->iters + trans->sorted[idx - 1]
+               : NULL;
+}
+
+#define trans_for_each_iter_inorder(_trans, _iter, _i)                 \
+       for (_i = 0;                                                    \
+            ((_iter) = (_trans)->iters + trans->sorted[_i]), (_i) < (_trans)->nr_sorted;\
+            _i++)
+
+#define trans_for_each_iter_inorder_reverse(_trans, _iter, _i)         \
+       for (_i = trans->nr_sorted - 1;                                 \
+            ((_iter) = (_trans)->iters + trans->sorted[_i]), (_i) >= 0;\
+            --_i)
+
 static inline bool __iter_has_node(const struct btree_iter *iter,
                                   const struct btree *b)
 {
@@ -191,19 +221,14 @@ static inline void bch2_btree_iter_set_pos_to_extent_start(struct btree_iter *it
        iter->pos = bkey_start_pos(&iter->k);
 }
 
-static inline struct btree_iter *btree_iter_child(struct btree_iter *iter)
+static inline struct btree_iter *idx_to_btree_iter(struct btree_trans *trans, unsigned idx)
 {
-       return iter->child_idx == U8_MAX ? NULL
-               : iter->trans->iters + iter->child_idx;
+       return idx != U8_MAX ? trans->iters + idx : NULL;
 }
 
-/* Sort order for locking btree iterators: */
-static inline int btree_iter_lock_cmp(const struct btree_iter *l,
-                                     const struct btree_iter *r)
+static inline struct btree_iter *btree_iter_child(struct btree_iter *iter)
 {
-       return   cmp_int(l->btree_id, r->btree_id) ?:
-               -cmp_int(btree_iter_is_cached(l), btree_iter_is_cached(r)) ?:
-                bkey_cmp(l->real_pos, r->real_pos);
+       return idx_to_btree_iter(iter->trans, iter->child_idx);
 }
 
 /*
index 4fa37fbc41fad6a17e7543f29ac156b26c2b92ff..7a9aece2eb87566b142a3e2fcb46f47fe3cd699a 100644 (file)
@@ -246,6 +246,7 @@ struct btree_iter {
 
        u8                      idx;
        u8                      child_idx;
+       u8                      sorted_idx;
 
        /* btree_iter_copy starts here: */
        u16                     flags;
@@ -379,11 +380,13 @@ struct btree_trans {
        unsigned long           ip;
        int                     srcu_idx;
 
+       u8                      nr_sorted;
        u8                      nr_updates;
        bool                    used_mempool:1;
        bool                    error:1;
        bool                    in_traverse_all:1;
        bool                    restarted:1;
+       bool                    iters_sorted:1;
        /*
         * For when bch2_trans_update notices we'll be splitting a compressed
         * extent:
@@ -398,6 +401,7 @@ struct btree_trans {
        unsigned                mem_bytes;
        void                    *mem;
 
+       u8                      sorted[BTREE_ITER_MAX + 8];
        struct btree_iter       *iters;
        struct btree_insert_entry *updates;
 
index a0cbebf190b48a35f621c1634a37328e9425e232..41dfc5867c9ed3b15649ef4312af6fee563d2f37 100644 (file)
@@ -593,6 +593,20 @@ static inline void memmove_u64s_down(void *dst, const void *src,
        __memmove_u64s_down(dst, src, u64s);
 }
 
+static inline void __memmove_u64s_down_small(void *dst, const void *src,
+                                      unsigned u64s)
+{
+       memcpy_u64s_small(dst, src, u64s);
+}
+
+static inline void memmove_u64s_down_small(void *dst, const void *src,
+                                    unsigned u64s)
+{
+       EBUG_ON(dst > src);
+
+       __memmove_u64s_down_small(dst, src, u64s);
+}
+
 static inline void __memmove_u64s_up_small(void *_dst, const void *_src,
                                           unsigned u64s)
 {