bcachefs: Kill bch2_btree_node_get_sibling()
authorKent Overstreet <kent.overstreet@gmail.com>
Mon, 29 Mar 2021 05:13:31 +0000 (01:13 -0400)
committerKent Overstreet <kent.overstreet@linux.dev>
Sun, 22 Oct 2023 21:08:58 +0000 (17:08 -0400)
This patch reworks the btree node merge path to use a second btree
iterator to get the sibling node - which means
bch2_btree_iter_get_sibling() can be deleted. Also, it uses
bch2_btree_iter_traverse_all() if necessary - which means it should be
more reliable. We don't currently even try to make it work when
trans->nounlock is set - after a BTREE_INSERT_NOUNLOCK transaction
commit, hopefully this will be a worthwhile tradeoff.

Signed-off-by: Kent Overstreet <kent.overstreet@gmail.com>
Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
fs/bcachefs/btree_cache.c
fs/bcachefs/btree_cache.h
fs/bcachefs/btree_update_interior.c

index 85ac08b9270a767a855c8e8eb20f14d84ac09d03..2ec668c3427e950ebc42bf0e205dee294fcf29e8 100644 (file)
@@ -913,136 +913,6 @@ out:
        return b;
 }
 
-struct btree *bch2_btree_node_get_sibling(struct bch_fs *c,
-                                         struct btree_iter *iter,
-                                         struct btree *b,
-                                         enum btree_node_sibling sib)
-{
-       struct btree_trans *trans = iter->trans;
-       struct btree *parent;
-       struct btree_node_iter node_iter;
-       struct bkey_packed *k;
-       struct bkey_buf tmp;
-       struct btree *ret = NULL;
-       unsigned level = b->c.level;
-
-       bch2_bkey_buf_init(&tmp);
-
-       parent = btree_iter_node(iter, level + 1);
-       if (!parent)
-               return NULL;
-
-       /*
-        * There's a corner case where a btree_iter might have a node locked
-        * that is just outside its current pos - when
-        * bch2_btree_iter_set_pos_same_leaf() gets to the end of the node.
-        *
-        * But the lock ordering checks in __bch2_btree_node_lock() go off of
-        * iter->pos, not the node's key: so if the iterator is marked as
-        * needing to be traversed, we risk deadlock if we don't bail out here:
-        */
-       if (iter->uptodate >= BTREE_ITER_NEED_TRAVERSE)
-               return ERR_PTR(-EINTR);
-
-       if (!bch2_btree_node_relock(iter, level + 1)) {
-               ret = ERR_PTR(-EINTR);
-               goto out;
-       }
-
-       node_iter = iter->l[parent->c.level].iter;
-
-       k = bch2_btree_node_iter_peek_all(&node_iter, parent);
-       BUG_ON(bkey_cmp_left_packed(parent, k, &b->key.k.p));
-
-       k = sib == btree_prev_sib
-               ? bch2_btree_node_iter_prev(&node_iter, parent)
-               : (bch2_btree_node_iter_advance(&node_iter, parent),
-                  bch2_btree_node_iter_peek(&node_iter, parent));
-       if (!k)
-               goto out;
-
-       bch2_bkey_buf_unpack(&tmp, c, parent, k);
-
-       ret = bch2_btree_node_get(c, iter, tmp.k, level,
-                                 SIX_LOCK_intent, _THIS_IP_);
-
-       if (PTR_ERR_OR_ZERO(ret) == -EINTR && !trans->nounlock) {
-               struct btree_iter *linked;
-
-               if (!bch2_btree_node_relock(iter, level + 1))
-                       goto out;
-
-               /*
-                * We might have got -EINTR because trylock failed, and we're
-                * holding other locks that would cause us to deadlock:
-                */
-               trans_for_each_iter(trans, linked)
-                       if (btree_iter_lock_cmp(iter, linked) < 0)
-                               __bch2_btree_iter_unlock(linked);
-
-               if (sib == btree_prev_sib)
-                       btree_node_unlock(iter, level);
-
-               ret = bch2_btree_node_get(c, iter, tmp.k, level,
-                                         SIX_LOCK_intent, _THIS_IP_);
-
-               /*
-                * before btree_iter_relock() calls btree_iter_verify_locks():
-                */
-               if (btree_lock_want(iter, level + 1) == BTREE_NODE_UNLOCKED)
-                       btree_node_unlock(iter, level + 1);
-
-               if (!bch2_btree_node_relock(iter, level)) {
-                       btree_iter_set_dirty(iter, BTREE_ITER_NEED_RELOCK);
-
-                       if (!IS_ERR(ret)) {
-                               six_unlock_intent(&ret->c.lock);
-                               ret = ERR_PTR(-EINTR);
-                       }
-               }
-
-               bch2_trans_relock(trans);
-       }
-out:
-       if (btree_lock_want(iter, level + 1) == BTREE_NODE_UNLOCKED)
-               btree_node_unlock(iter, level + 1);
-
-       if (PTR_ERR_OR_ZERO(ret) == -EINTR)
-               bch2_btree_iter_upgrade(iter, level + 2);
-
-       BUG_ON(!IS_ERR(ret) && !btree_node_locked(iter, level));
-
-       if (!IS_ERR_OR_NULL(ret)) {
-               struct btree *n1 = ret, *n2 = b;
-
-               if (sib != btree_prev_sib)
-                       swap(n1, n2);
-
-               if (bpos_cmp(bpos_successor(n1->key.k.p),
-                            n2->data->min_key)) {
-                       char buf1[200], buf2[200];
-
-                       bch2_bkey_val_to_text(&PBUF(buf1), c, bkey_i_to_s_c(&n1->key));
-                       bch2_bkey_val_to_text(&PBUF(buf2), c, bkey_i_to_s_c(&n2->key));
-
-                       bch2_fs_inconsistent(c, "btree topology error at btree %s level %u:\n"
-                                            "prev: %s\n"
-                                            "next: %s\n",
-                                            bch2_btree_ids[iter->btree_id], level,
-                                            buf1, buf2);
-
-                       six_unlock_intent(&ret->c.lock);
-                       ret = NULL;
-               }
-       }
-
-       bch2_btree_trans_verify_locks(trans);
-
-       bch2_bkey_buf_exit(&tmp, c);
-
-       return ret;
-}
-
 void bch2_btree_node_prefetch(struct bch_fs *c, struct btree_iter *iter,
                              const struct bkey_i *k,
                              enum btree_id btree_id, unsigned level)
@@ -1082,7 +952,7 @@ void bch2_btree_node_to_text(struct printbuf *out, struct bch_fs *c,
               "    format: u64s %u fields %u %u %u %u %u\n"
               "    unpack fn len: %u\n"
               "    bytes used %zu/%zu (%zu%% full)\n"
-              "    sib u64s: %u, %u (merge threshold %zu)\n"
+              "    sib u64s: %u, %u (merge threshold %u)\n"
               "    nr packed keys %u\n"
               "    nr unpacked keys %u\n"
               "    floats %zu\n"
@@ -1099,7 +969,7 @@ void bch2_btree_node_to_text(struct printbuf *out, struct bch_fs *c,
               b->nr.live_u64s * 100 / btree_max_u64s(c),
               b->sib_u64s[0],
               b->sib_u64s[1],
-              BTREE_FOREGROUND_MERGE_THRESHOLD(c),
+              c->btree_foreground_merge_threshold,
               b->nr.packed_keys,
               b->nr.unpacked_keys,
               stats.floats,
index 217988696a77473d3226d525ca4327eee93c1bc4..aa8fe4a1b04b9d06d017db61be239631054384c0 100644 (file)
@@ -26,9 +26,6 @@ struct btree *bch2_btree_node_get(struct bch_fs *, struct btree_iter *,
 struct btree *bch2_btree_node_get_noiter(struct bch_fs *, const struct bkey_i *,
                                         enum btree_id, unsigned, bool);
 
-struct btree *bch2_btree_node_get_sibling(struct bch_fs *, struct btree_iter *,
-                               struct btree *, enum btree_node_sibling);
-
 void bch2_btree_node_prefetch(struct bch_fs *, struct btree_iter *,
                              const struct bkey_i *, enum btree_id, unsigned);
 
index fddb0c3e716709176d1ae0d04fbdfce9aefcf099..af7c2df56692c2648af15a664773b4d52721e986 100644 (file)
@@ -1539,36 +1539,50 @@ void __bch2_foreground_maybe_merge(struct bch_fs *c,
                                   enum btree_node_sibling sib)
 {
        struct btree_trans *trans = iter->trans;
+       struct btree_iter *sib_iter = NULL;
        struct btree_update *as;
        struct bkey_format_state new_s;
        struct bkey_format new_f;
        struct bkey_i delete;
        struct btree *b, *m, *n, *prev, *next, *parent;
+       struct bpos sib_pos;
        size_t sib_u64s;
        int ret = 0;
 
+       if (trans->nounlock)
+               return;
+
        BUG_ON(!btree_node_locked(iter, level));
 retry:
+       ret = bch2_btree_iter_traverse(iter);
+       if (ret)
+               goto err;
+
        BUG_ON(!btree_node_locked(iter, level));
 
        b = iter->l[level].b;
 
-       parent = btree_node_parent(iter, b);
-       if (!parent)
+       if ((sib == btree_prev_sib && !bpos_cmp(b->data->min_key, POS_MIN)) ||
+           (sib == btree_next_sib && !bpos_cmp(b->data->max_key, POS_MAX))) {
+               b->sib_u64s[sib] = U16_MAX;
                goto out;
+       }
 
-       if (b->sib_u64s[sib] > BTREE_FOREGROUND_MERGE_THRESHOLD(c))
-               goto out;
+       sib_pos = sib == btree_prev_sib
+               ? bpos_predecessor(b->data->min_key)
+               : bpos_successor(b->data->max_key);
 
-       /* XXX: can't be holding read locks */
-       m = bch2_btree_node_get_sibling(c, iter, b, sib);
-       if (IS_ERR(m)) {
-               ret = PTR_ERR(m);
+       sib_iter = bch2_trans_get_node_iter(trans, iter->btree_id,
+                                           sib_pos, U8_MAX, level,
+                                           BTREE_ITER_INTENT);
+       ret = bch2_btree_iter_traverse(sib_iter);
+       if (ret)
                goto err;
-       }
 
-       /* NULL means no sibling: */
-       if (!m) {
+       m = sib_iter->l[level].b;
+
+       if (btree_node_parent(iter, b) !=
+           btree_node_parent(sib_iter, m)) {
                b->sib_u64s[sib] = U16_MAX;
                goto out;
        }
@@ -1581,6 +1595,8 @@ retry:
                next = m;
        }
 
+       BUG_ON(bkey_cmp(bpos_successor(prev->data->max_key), next->data->min_key));
+
        bch2_bkey_format_init(&new_s);
        bch2_bkey_format_add_pos(&new_s, prev->data->min_key);
        __bch2_btree_calc_format(&new_s, prev);
@@ -1598,23 +1614,21 @@ retry:
        }
 
        sib_u64s = min(sib_u64s, btree_max_u64s(c));
+       sib_u64s = min(sib_u64s, (size_t) U16_MAX - 1);
        b->sib_u64s[sib] = sib_u64s;
 
-       if (b->sib_u64s[sib] > BTREE_FOREGROUND_MERGE_THRESHOLD(c)) {
-               six_unlock_intent(&m->c.lock);
+       if (b->sib_u64s[sib] > c->btree_foreground_merge_threshold)
                goto out;
-       }
 
+       parent = btree_node_parent(iter, b);
        as = bch2_btree_update_start(iter, level,
                         btree_update_reserve_required(c, parent) + 1,
                         flags|
                         BTREE_INSERT_NOFAIL|
                         BTREE_INSERT_USE_RESERVE);
        ret = PTR_ERR_OR_ZERO(as);
-       if (ret) {
-               six_unlock_intent(&m->c.lock);
+       if (ret)
                goto err;
-       }
 
        trace_btree_merge(c, b);
 
@@ -1648,6 +1662,7 @@ retry:
        bch2_btree_update_get_open_buckets(as, n);
 
        six_lock_increment(&b->c.lock, SIX_LOCK_intent);
+       six_lock_increment(&m->c.lock, SIX_LOCK_intent);
        bch2_btree_iter_node_drop(iter, b);
        bch2_btree_iter_node_drop(iter, m);
 
@@ -1663,6 +1678,7 @@ retry:
        bch2_btree_update_done(as);
 out:
        bch2_btree_trans_verify_locks(trans);
+       bch2_trans_iter_free(trans, sib_iter);
 
        /*
         * Don't downgrade locks here: we're called after successful insert,
@@ -1675,11 +1691,16 @@ out:
         */
        return;
 err:
-       BUG_ON(ret == -EAGAIN && (flags & BTREE_INSERT_NOUNLOCK));
+       bch2_trans_iter_put(trans, sib_iter);
+       sib_iter = NULL;
+
+       if (ret == -EINTR && bch2_trans_relock(trans)) {
+               ret = 0;
+               goto retry;
+       }
 
        if (ret == -EINTR && !(flags & BTREE_INSERT_NOUNLOCK)) {
-               bch2_trans_unlock(trans);
-               ret = bch2_btree_iter_traverse(iter);
+               ret = bch2_btree_iter_traverse_all(trans);
                if (!ret)
                        goto retry;
        }