bcachefs: Don't deadlock when btree node reuse changes lock ordering
author: Kent Overstreet <kent.overstreet@gmail.com>
Sat, 13 Jun 2020 02:29:48 +0000 (22:29 -0400)
committer: Kent Overstreet <kent.overstreet@linux.dev>
Sun, 22 Oct 2023 21:08:41 +0000 (17:08 -0400)
Btree node lock ordering is based on the logical key. However, 'struct
btree' may be reused for a different btree node under memory pressure.
This patch uses the new six lock callback to check if a btree node is no
longer the node we wanted to lock before blocking.

Signed-off-by: Kent Overstreet <kent.overstreet@gmail.com>
Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
fs/bcachefs/btree_cache.c
fs/bcachefs/btree_iter.c
fs/bcachefs/btree_locking.h
fs/bcachefs/btree_update_interior.c

index d31017de53aad061abdbecaddbea505281848730..9423cff1539ff4c1bc21bcf0221cfc448e6f5ca7 100644 (file)
@@ -678,6 +678,14 @@ static noinline struct btree *bch2_btree_node_fill(struct bch_fs *c,
        return b;
 }
 
+static int lock_node_check_fn(struct six_lock *lock, void *p)
+{
+       struct btree *b = container_of(lock, struct btree, c.lock);
+       const struct bkey_i *k = p;
+
+       return b->hash_val == btree_ptr_hash_val(k) ? 0 : -1;
+}
+
 /**
  * bch_btree_node_get - find a btree node in the cache and lock it, reading it
  * in from disk if necessary.
@@ -750,8 +758,12 @@ lock_node:
                if (btree_node_read_locked(iter, level + 1))
                        btree_node_unlock(iter, level + 1);
 
-               if (!btree_node_lock(b, k->k.p, level, iter, lock_type))
+               if (!btree_node_lock(b, k->k.p, level, iter, lock_type,
+                                    lock_node_check_fn, (void *) k)) {
+                       if (b->hash_val != btree_ptr_hash_val(k))
+                               goto retry;
                        return ERR_PTR(-EINTR);
+               }
 
                if (unlikely(b->hash_val != btree_ptr_hash_val(k) ||
                             b->c.level != level ||
@@ -803,6 +815,7 @@ struct btree *bch2_btree_node_get_noiter(struct bch_fs *c,
        struct btree_cache *bc = &c->btree_cache;
        struct btree *b;
        struct bset_tree *t;
+       int ret;
 
        EBUG_ON(level >= BTREE_MAX_DEPTH);
 
@@ -823,7 +836,9 @@ retry:
                        return b;
        } else {
 lock_node:
-               six_lock_read(&b->c.lock, NULL, NULL);
+               ret = six_lock_read(&b->c.lock, lock_node_check_fn, (void *) k);
+               if (ret)
+                       goto retry;
 
                if (unlikely(b->hash_val != btree_ptr_hash_val(k) ||
                             b->c.btree_id != btree_id ||
index 35fe7db50fb509747e32849f136e924d0280b708..b11c8e2a8d6b0656b6b351c4943d72ccdd78c5ee 100644 (file)
@@ -194,10 +194,13 @@ static inline bool btree_iter_get_locks(struct btree_iter *iter,
 /* Slowpath: */
 bool __bch2_btree_node_lock(struct btree *b, struct bpos pos,
                            unsigned level, struct btree_iter *iter,
-                           enum six_lock_type type)
+                           enum six_lock_type type,
+                           six_lock_should_sleep_fn should_sleep_fn,
+                           void *p)
 {
        struct btree_trans *trans = iter->trans;
        struct btree_iter *linked;
+       u64 start_time = local_clock();
        bool ret = true;
 
        /* Check if it's safe to block: */
@@ -275,7 +278,14 @@ bool __bch2_btree_node_lock(struct btree *b, struct bpos pos,
                return false;
        }
 
-       __btree_node_lock_type(iter->trans->c, b, type);
+       if (six_trylock_type(&b->c.lock, type))
+               return true;
+
+       if (six_lock_type(&b->c.lock, type, should_sleep_fn, p))
+               return false;
+
+       bch2_time_stats_update(&trans->c->times[lock_to_time_stat(type)],
+                              start_time);
        return true;
 }
 
@@ -286,6 +296,11 @@ static void bch2_btree_iter_verify_locks(struct btree_iter *iter)
 {
        unsigned l;
 
+       if (!(iter->trans->iters_linked & (1ULL << iter->idx))) {
+               BUG_ON(iter->nodes_locked);
+               return;
+       }
+
        for (l = 0; btree_iter_node(iter, l); l++) {
                if (iter->uptodate >= BTREE_ITER_NEED_RELOCK &&
                    !btree_node_locked(iter, l))
@@ -300,7 +315,7 @@ void bch2_btree_trans_verify_locks(struct btree_trans *trans)
 {
        struct btree_iter *iter;
 
-       trans_for_each_iter(trans, iter)
+       trans_for_each_iter_all(trans, iter)
                bch2_btree_iter_verify_locks(iter);
 }
 #else
@@ -892,18 +907,26 @@ void bch2_btree_iter_reinit_node(struct btree_iter *iter, struct btree *b)
                __btree_iter_init(linked, b->c.level);
 }
 
+static int lock_root_check_fn(struct six_lock *lock, void *p)
+{
+       struct btree *b = container_of(lock, struct btree, c.lock);
+       struct btree **rootp = p;
+
+       return b == *rootp ? 0 : -1;
+}
+
 static inline int btree_iter_lock_root(struct btree_iter *iter,
                                       unsigned depth_want)
 {
        struct bch_fs *c = iter->trans->c;
-       struct btree *b;
+       struct btree *b, **rootp = &c->btree_roots[iter->btree_id].b;
        enum six_lock_type lock_type;
        unsigned i;
 
        EBUG_ON(iter->nodes_locked);
 
        while (1) {
-               b = READ_ONCE(c->btree_roots[iter->btree_id].b);
+               b = READ_ONCE(*rootp);
                iter->level = READ_ONCE(b->c.level);
 
                if (unlikely(iter->level < depth_want)) {
@@ -921,10 +944,11 @@ static inline int btree_iter_lock_root(struct btree_iter *iter,
 
                lock_type = __btree_lock_want(iter, iter->level);
                if (unlikely(!btree_node_lock(b, POS_MAX, iter->level,
-                                             iter, lock_type)))
+                                             iter, lock_type,
+                                             lock_root_check_fn, rootp)))
                        return -EINTR;
 
-               if (likely(b == c->btree_roots[iter->btree_id].b &&
+               if (likely(b == READ_ONCE(*rootp) &&
                           b->c.level == iter->level &&
                           !race_fault())) {
                        for (i = 0; i < iter->level; i++)
index 4c80ab368e6995a9e1c6bc30c3a8a1cf4bf53970..ffee6f2d7d4b6f116bd291a00f2bfe259d6260d2 100644 (file)
@@ -174,17 +174,21 @@ static inline bool btree_node_lock_increment(struct btree_trans *trans,
 }
 
 bool __bch2_btree_node_lock(struct btree *, struct bpos, unsigned,
-                           struct btree_iter *, enum six_lock_type);
-
-static inline bool btree_node_lock(struct btree *b, struct bpos pos,
-                                  unsigned level,
-                                  struct btree_iter *iter,
-                                  enum six_lock_type type)
+                           struct btree_iter *, enum six_lock_type,
+                           six_lock_should_sleep_fn, void *);
+
+static inline bool btree_node_lock(struct btree *b,
+                       struct bpos pos, unsigned level,
+                       struct btree_iter *iter,
+                       enum six_lock_type type,
+                       six_lock_should_sleep_fn should_sleep_fn, void *p)
 {
        struct btree_trans *trans = iter->trans;
        bool ret;
 
        EBUG_ON(level >= BTREE_MAX_DEPTH);
+       EBUG_ON(!(trans->iters_linked & (1ULL << iter->idx)));
+
 #ifdef CONFIG_BCACHEFS_DEBUG
        trans->locking          = b;
        trans->locking_iter_idx = iter->idx;
@@ -194,7 +198,8 @@ static inline bool btree_node_lock(struct btree *b, struct bpos pos,
 #endif
        ret   = likely(six_trylock_type(&b->c.lock, type)) ||
                btree_node_lock_increment(trans, b, level, type) ||
-               __bch2_btree_node_lock(b, pos, level, iter, type);
+               __bch2_btree_node_lock(b, pos, level, iter, type,
+                                      should_sleep_fn, p);
 
 #ifdef CONFIG_BCACHEFS_DEBUG
        trans->locking = NULL;
index bb921852a093213086d946bc1bc3a2dc641677df..2d68f4eaca3426907dadde09926b07a7abf4a03c 100644 (file)
@@ -135,6 +135,8 @@ static void __btree_node_free(struct bch_fs *c, struct btree *b)
 
        bch2_btree_node_hash_remove(&c->btree_cache, b);
 
+       six_lock_wakeup_all(&b->c.lock);
+
        mutex_lock(&c->btree_cache.lock);
        list_move(&b->list, &c->btree_cache.freeable);
        mutex_unlock(&c->btree_cache.lock);