bcachefs: Fix a deadlock
authorKent Overstreet <kent.overstreet@gmail.com>
Fri, 12 Jun 2020 18:58:07 +0000 (14:58 -0400)
committerKent Overstreet <kent.overstreet@linux.dev>
Sun, 22 Oct 2023 21:08:41 +0000 (17:08 -0400)
__bch2_btree_node_lock() was incorrectly using iter->pos as a proxy for
btree node lock ordering, this caused an off by one error that was
triggered by bch2_btree_node_get_sibling() getting the previous node.

This refactors the code to compare against btree node keys directly.

Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
fs/bcachefs/btree_iter.c
fs/bcachefs/btree_locking.h
fs/bcachefs/btree_types.h
fs/bcachefs/btree_update_leaf.c

index bed0bb67a85dfb597056ce638b4fdd3b52e80c7d..35fe7db50fb509747e32849f136e924d0280b708 100644 (file)
@@ -101,7 +101,7 @@ bool __bch2_btree_node_relock(struct btree_iter *iter, unsigned level)
 
        if (six_relock_type(&b->c.lock, want, iter->l[level].lock_seq) ||
            (btree_node_lock_seq_matches(iter, b, level) &&
-            btree_node_lock_increment(iter, b, level, want))) {
+            btree_node_lock_increment(iter->trans, b, level, want))) {
                mark_btree_node_locked(iter, level, want);
                return true;
        } else {
@@ -130,7 +130,7 @@ static bool bch2_btree_node_upgrade(struct btree_iter *iter, unsigned level)
                goto success;
 
        if (btree_node_lock_seq_matches(iter, b, level) &&
-           btree_node_lock_increment(iter, b, level, BTREE_NODE_INTENT_LOCKED)) {
+           btree_node_lock_increment(iter->trans, b, level, BTREE_NODE_INTENT_LOCKED)) {
                btree_node_unlock(iter, level);
                goto success;
        }
@@ -193,23 +193,18 @@ static inline bool btree_iter_get_locks(struct btree_iter *iter,
 
 /* Slowpath: */
 bool __bch2_btree_node_lock(struct btree *b, struct bpos pos,
-                          unsigned level,
-                          struct btree_iter *iter,
-                          enum six_lock_type type)
+                           unsigned level, struct btree_iter *iter,
+                           enum six_lock_type type)
 {
+       struct btree_trans *trans = iter->trans;
        struct btree_iter *linked;
        bool ret = true;
 
        /* Check if it's safe to block: */
-       trans_for_each_iter(iter->trans, linked) {
+       trans_for_each_iter(trans, linked) {
                if (!linked->nodes_locked)
                        continue;
 
-               /* Must lock btree nodes in key order: */
-               if ((cmp_int(iter->btree_id, linked->btree_id) ?:
-                    bkey_cmp(pos, linked->pos)) < 0)
-                       ret = false;
-
                /*
                 * Can't block taking an intent lock if we have _any_ nodes read
                 * locked:
@@ -224,13 +219,15 @@ bool __bch2_btree_node_lock(struct btree *b, struct bpos pos,
                 */
                if (type == SIX_LOCK_intent &&
                    linked->nodes_locked != linked->nodes_intent_locked) {
-                       if (!(iter->trans->nounlock)) {
+                       if (!(trans->nounlock)) {
                                linked->locks_want = max_t(unsigned,
                                                linked->locks_want,
                                                __fls(linked->nodes_locked) + 1);
-                               btree_iter_get_locks(linked, true, false);
+                               if (!btree_iter_get_locks(linked, true, false))
+                                       ret = false;
+                       } else {
+                               ret = false;
                        }
-                       ret = false;
                }
 
                /*
@@ -240,14 +237,36 @@ bool __bch2_btree_node_lock(struct btree *b, struct bpos pos,
                 */
                if (linked->btree_id == iter->btree_id &&
                    level > __fls(linked->nodes_locked)) {
-                       if (!(iter->trans->nounlock)) {
+                       if (!(trans->nounlock)) {
                                linked->locks_want =
                                        max(level + 1, max_t(unsigned,
                                            linked->locks_want,
                                            iter->locks_want));
-                               btree_iter_get_locks(linked, true, false);
+                               if (!btree_iter_get_locks(linked, true, false))
+                                       ret = false;
+                       } else {
+                               ret = false;
                        }
+               }
+
+               /* Must lock btree nodes in key order: */
+               if (iter->btree_id < linked->btree_id)
+                       ret = false;
+
+               if (iter->btree_id == linked->btree_id &&
+                   btree_node_locked(linked, level) &&
+                   bkey_cmp(pos, linked->l[level].b->key.k.p) <= 0)
                        ret = false;
+
+               /*
+                * Recheck if this is a node we already have locked - since one
+                * of the get_locks() calls might've successfully
+                * upgraded/relocked it:
+                */
+               if (linked->l[level].b == b &&
+                   btree_node_locked_type(linked, level) >= type) {
+                       six_lock_increment(&b->c.lock, type);
+                       return true;
                }
        }
 
@@ -2242,13 +2261,15 @@ void bch2_btree_trans_to_text(struct printbuf *out, struct bch_fs *c)
 
        mutex_lock(&c->btree_trans_lock);
        list_for_each_entry(trans, &c->btree_trans_list, list) {
-               pr_buf(out, "%i %ps\n", trans->pid, (void *) trans->ip);
+               pr_buf(out, "%i %px %ps\n", trans->pid, trans, (void *) trans->ip);
 
                trans_for_each_iter(trans, iter) {
                        if (!iter->nodes_locked)
                                continue;
 
-                       pr_buf(out, "  iter %s:", bch2_btree_ids[iter->btree_id]);
+                       pr_buf(out, "  iter %u %s:",
+                              iter->idx,
+                              bch2_btree_ids[iter->btree_id]);
                        bch2_bpos_to_text(out, iter->pos);
                        pr_buf(out, "\n");
 
@@ -2256,8 +2277,8 @@ void bch2_btree_trans_to_text(struct printbuf *out, struct bch_fs *c)
                                if (btree_node_locked(iter, l)) {
                                        b = iter->l[l].b;
 
-                                       pr_buf(out, "    %p l=%u %s ",
-                                              b, l, btree_node_intent_locked(iter, l) ? "i" : "r");
+                                       pr_buf(out, "    %px %s l=%u ",
+                                              b, btree_node_intent_locked(iter, l) ? "i" : "r", l);
                                        bch2_bpos_to_text(out, b->key.k.p);
                                        pr_buf(out, "\n");
                                }
@@ -2266,7 +2287,13 @@ void bch2_btree_trans_to_text(struct printbuf *out, struct bch_fs *c)
 
                b = READ_ONCE(trans->locking);
                if (b) {
-                       pr_buf(out, "  locking %px l=%u %s:",
+                       pr_buf(out, "  locking iter %u l=%u %s:",
+                              trans->locking_iter_idx,
+                              trans->locking_level,
+                              bch2_btree_ids[trans->locking_btree_id]);
+                       bch2_bpos_to_text(out, trans->locking_pos);
+
+                       pr_buf(out, " node %px l=%u %s:",
                               b, b->c.level,
                               bch2_btree_ids[b->c.btree_id]);
                        bch2_bpos_to_text(out, b->key.k.p);
index cf1801ee14a2a23daad638671bf12151cc7fb658..4c80ab368e6995a9e1c6bc30c3a8a1cf4bf53970 100644 (file)
@@ -157,15 +157,15 @@ static inline void btree_node_lock_type(struct bch_fs *c, struct btree *b,
  * Lock a btree node if we already have it locked on one of our linked
  * iterators:
  */
-static inline bool btree_node_lock_increment(struct btree_iter *iter,
+static inline bool btree_node_lock_increment(struct btree_trans *trans,
                                             struct btree *b, unsigned level,
                                             enum btree_node_locked_type want)
 {
-       struct btree_iter *linked;
+       struct btree_iter *iter;
 
-       trans_for_each_iter(iter->trans, linked)
-               if (linked->l[level].b == b &&
-                   btree_node_locked_type(linked, level) >= want) {
+       trans_for_each_iter(trans, iter)
+               if (iter->l[level].b == b &&
+                   btree_node_locked_type(iter, level) >= want) {
                        six_lock_increment(&b->c.lock, want);
                        return true;
                }
@@ -181,19 +181,23 @@ static inline bool btree_node_lock(struct btree *b, struct bpos pos,
                                   struct btree_iter *iter,
                                   enum six_lock_type type)
 {
+       struct btree_trans *trans = iter->trans;
        bool ret;
 
        EBUG_ON(level >= BTREE_MAX_DEPTH);
 #ifdef CONFIG_BCACHEFS_DEBUG
-       iter->trans->locking = b;
+       trans->locking          = b;
+       trans->locking_iter_idx = iter->idx;
+       trans->locking_pos      = pos;
+       trans->locking_btree_id = iter->btree_id;
+       trans->locking_level    = level;
 #endif
-
-       ret = likely(six_trylock_type(&b->c.lock, type)) ||
-               btree_node_lock_increment(iter, b, level, type) ||
+       ret   = likely(six_trylock_type(&b->c.lock, type)) ||
+               btree_node_lock_increment(trans, b, level, type) ||
                __bch2_btree_node_lock(b, pos, level, iter, type);
 
 #ifdef CONFIG_BCACHEFS_DEBUG
-       iter->trans->locking = NULL;
+       trans->locking = NULL;
 #endif
        return ret;
 }
index 78fbf922341eb2a02d46cf9b6e9e0d87e8aec904..58d54a4ac218984a7ad33769ad448986129d5a68 100644 (file)
@@ -287,6 +287,10 @@ struct btree_trans {
 #ifdef CONFIG_BCACHEFS_DEBUG
        struct list_head        list;
        struct btree            *locking;
+       unsigned                locking_iter_idx;
+       struct bpos             locking_pos;
+       u8                      locking_btree_id;
+       u8                      locking_level;
        pid_t                   pid;
 #endif
        unsigned long           ip;
index 028aa9bbeced9e22705ba0f55e3adced618580ff..9fbbd2a72e14b67368b1f76f83a8dbfa8e76d0ba 100644 (file)
@@ -481,7 +481,7 @@ static inline int do_bch2_trans_commit(struct btree_trans *trans,
         * or anything else that might call bch2_trans_relock(), since that
         * would just retake the read locks:
         */
-       trans_for_each_iter_all(trans, iter) {
+       trans_for_each_iter(trans, iter) {
                if (iter->nodes_locked != iter->nodes_intent_locked) {
                        EBUG_ON(iter->flags & BTREE_ITER_KEEP_UNTIL_COMMIT);
                        EBUG_ON(trans->iters_live & (1ULL << iter->idx));