bcachefs: Improve bch2_btree_iter_traverse_all()
author Kent Overstreet <kent.overstreet@gmail.com>
Wed, 14 Apr 2021 17:26:15 +0000 (13:26 -0400)
committer Kent Overstreet <kent.overstreet@linux.dev>
Sun, 22 Oct 2023 21:09:00 +0000 (17:09 -0400)
By changing it to upgrade iterators to intent locks up front, to avoid lock
restarts, we can simplify __bch2_btree_node_lock() quite a bit - this fixes
a probable bug where it could drop a lock on an unrelated error yet still
succeed, instead of causing a transaction restart.
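
As a rough illustration (a standalone userspace sketch, not the kernel code -
the struct and helper below are simplified stand-ins), the idea is that
traverse_all now walks the lock-ordered iterators back to front and raises
locks_want, so upgrades happen here rather than inside
__bch2_btree_node_lock():

  /*
   * Sketch of the locks_want propagation added to __btree_iter_traverse_all():
   * iterators are assumed already sorted into lock order; each iterator is
   * made to want at least as many intent locks as the next one on the same
   * btree.
   */
  #include <stdio.h>

  struct iter { int btree_id; unsigned locks_want; };

  static void propagate_locks_want(struct iter *it, int nr)
  {
          for (int i = nr - 2; i >= 0; --i) {
                  if (it[i].btree_id == it[i + 1].btree_id &&
                      it[i].locks_want < it[i + 1].locks_want)
                          /* stand-in for __bch2_btree_iter_upgrade() */
                          it[i].locks_want = it[i + 1].locks_want;
                  else if (!it[i].locks_want && it[i + 1].locks_want)
                          it[i].locks_want = 1;
          }
  }

  int main(void)
  {
          /* hypothetical iterators, already in lock order */
          struct iter it[] = { {0, 0}, {0, 2}, {1, 0}, {1, 1} };

          propagate_locks_want(it, 4);
          for (int i = 0; i < 4; i++)
                  printf("iter %d: btree %d wants %u\n",
                         i, it[i].btree_id, it[i].locks_want);
          return 0;
  }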

Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
fs/bcachefs/btree_iter.c
fs/bcachefs/btree_iter.h
fs/bcachefs/trace.h

fs/bcachefs/btree_iter.c
index 7d8b7d765cf7a64e32dbdbefe5aef4107dcdd98f..c7a0ffc2cad5877d57f9eba23db1bffdbc96e215 100644
@@ -268,13 +268,8 @@ bool __bch2_btree_node_lock(struct btree *b, struct bpos pos,
                 */
                if (type == SIX_LOCK_intent &&
                    linked->nodes_locked != linked->nodes_intent_locked) {
-                       linked->locks_want = max_t(unsigned,
-                                       linked->locks_want,
-                                       __fls(linked->nodes_locked) + 1);
-                       if (!btree_iter_get_locks(linked, true, false)) {
-                               deadlock_iter = linked;
-                               reason = 1;
-                       }
+                       deadlock_iter = linked;
+                       reason = 1;
                }
 
                if (linked->btree_id != iter->btree_id) {
@@ -303,14 +298,8 @@ bool __bch2_btree_node_lock(struct btree *b, struct bpos pos,
                 * we're about to lock, it must have the ancestors locked too:
                 */
                if (level > __fls(linked->nodes_locked)) {
-                       linked->locks_want =
-                               max(level + 1, max_t(unsigned,
-                                   linked->locks_want,
-                                   iter->locks_want));
-                       if (!btree_iter_get_locks(linked, true, false)) {
-                               deadlock_iter = linked;
-                               reason = 5;
-                       }
+                       deadlock_iter = linked;
+                       reason = 5;
                }
 
                /* Must lock btree nodes in key order: */
@@ -320,26 +309,17 @@ bool __bch2_btree_node_lock(struct btree *b, struct bpos pos,
                        deadlock_iter = linked;
                        reason = 7;
                }
-
-               /*
-                * Recheck if this is a node we already have locked - since one
-                * of the get_locks() calls might've successfully
-                * upgraded/relocked it:
-                */
-               if (linked->l[level].b == b &&
-                   btree_node_locked_type(linked, level) >= type) {
-                       six_lock_increment(&b->c.lock, type);
-                       return true;
-               }
        }
 
        if (unlikely(deadlock_iter)) {
                trace_trans_restart_would_deadlock(iter->trans->ip, ip,
-                               reason,
+                               trans->in_traverse_all, reason,
                                deadlock_iter->btree_id,
                                btree_iter_type(deadlock_iter),
+                               &deadlock_iter->real_pos,
                                iter->btree_id,
-                               btree_iter_type(iter));
+                               btree_iter_type(iter),
+                               &pos);
                return false;
        }
 
@@ -407,29 +387,11 @@ bool bch2_btree_iter_relock(struct btree_iter *iter, bool trace)
 bool __bch2_btree_iter_upgrade(struct btree_iter *iter,
                               unsigned new_locks_want)
 {
-       struct btree_iter *linked;
-
        EBUG_ON(iter->locks_want >= new_locks_want);
 
        iter->locks_want = new_locks_want;
 
-       if (btree_iter_get_locks(iter, true, true))
-               return true;
-
-       /*
-        * Ancestor nodes must be locked before child nodes, so set locks_want
-        * on iterators that might lock ancestors before us to avoid getting
-        * -EINTR later:
-        */
-       trans_for_each_iter(iter->trans, linked)
-               if (linked != iter &&
-                   linked->btree_id == iter->btree_id &&
-                   linked->locks_want < new_locks_want) {
-                       linked->locks_want = new_locks_want;
-                       btree_iter_get_locks(linked, true, false);
-               }
-
-       return false;
+       return btree_iter_get_locks(iter, true, true);
 }
 
 void __bch2_btree_iter_downgrade(struct btree_iter *iter,
@@ -1192,7 +1154,8 @@ static int __btree_iter_traverse_all(struct btree_trans *trans, int ret)
        struct bch_fs *c = trans->c;
        struct btree_iter *iter;
        u8 sorted[BTREE_ITER_MAX];
-       unsigned i, nr_sorted = 0;
+       int i, nr_sorted = 0;
+       bool relock_fail;
 
        if (trans->in_traverse_all)
                return -EINTR;
@@ -1200,15 +1163,36 @@ static int __btree_iter_traverse_all(struct btree_trans *trans, int ret)
        trans->in_traverse_all = true;
 retry_all:
        nr_sorted = 0;
+       relock_fail = false;
 
-       trans_for_each_iter(trans, iter)
+       trans_for_each_iter(trans, iter) {
+               if (!bch2_btree_iter_relock(iter, true))
+                       relock_fail = true;
                sorted[nr_sorted++] = iter->idx;
+       }
+
+       if (!relock_fail) {
+               trans->in_traverse_all = false;
+               return 0;
+       }
 
 #define btree_iter_cmp_by_idx(_l, _r)                          \
                btree_iter_lock_cmp(&trans->iters[_l], &trans->iters[_r])
 
        bubble_sort(sorted, nr_sorted, btree_iter_cmp_by_idx);
 #undef btree_iter_cmp_by_idx
+
+       for (i = nr_sorted - 2; i >= 0; --i) {
+               struct btree_iter *iter1 = trans->iters + sorted[i];
+               struct btree_iter *iter2 = trans->iters + sorted[i + 1];
+
+               if (iter1->btree_id == iter2->btree_id &&
+                   iter1->locks_want < iter2->locks_want)
+                       __bch2_btree_iter_upgrade(iter1, iter2->locks_want);
+               else if (!iter1->locks_want && iter2->locks_want)
+                       __bch2_btree_iter_upgrade(iter1, 1);
+       }
+
        bch2_trans_unlock(trans);
        cond_resched();
 
@@ -1258,6 +1242,8 @@ out:
        bch2_btree_cache_cannibalize_unlock(c);
 
        trans->in_traverse_all = false;
+
+       trace_trans_traverse_all(trans->ip);
        return ret;
 }
 
@@ -2210,7 +2196,8 @@ void bch2_trans_reset(struct btree_trans *trans, unsigned flags)
        if (!(flags & TRANS_RESET_NOUNLOCK))
                bch2_trans_cond_resched(trans);
 
-       if (!(flags & TRANS_RESET_NOTRAVERSE))
+       if (!(flags & TRANS_RESET_NOTRAVERSE) &&
+           trans->iters_linked)
                bch2_btree_iter_traverse_all(trans);
 }
 
fs/bcachefs/btree_iter.h
index 07d9b6d36e51fa5c1011648cfe0eefae8753fd7c..2f63adb9e4205b8ee6e2b8ade7194c7504c45569 100644
@@ -187,7 +187,7 @@ static inline int btree_iter_lock_cmp(const struct btree_iter *l,
 {
        return   cmp_int(l->btree_id, r->btree_id) ?:
                -cmp_int(btree_iter_is_cached(l), btree_iter_is_cached(r)) ?:
-                bkey_cmp(l->pos, r->pos);
+                bkey_cmp(l->real_pos, r->real_pos);
 }
 
 /*
fs/bcachefs/trace.h
index 493f9223c5bd73fb98cd0af82fbdec137f0fbb9c..02f2662e7bde6445d2e1778cb133e34fba9e7262 100644
@@ -561,43 +561,70 @@ DEFINE_EVENT(transaction_restart, trans_restart_btree_node_reused,
 TRACE_EVENT(trans_restart_would_deadlock,
        TP_PROTO(unsigned long  trans_ip,
                 unsigned long  caller_ip,
+                bool           in_traverse_all,
                 unsigned       reason,
                 enum btree_id  have_btree_id,
                 unsigned       have_iter_type,
+                struct bpos    *have_pos,
                 enum btree_id  want_btree_id,
-                unsigned       want_iter_type),
-       TP_ARGS(trans_ip, caller_ip, reason,
-               have_btree_id, have_iter_type,
-               want_btree_id, want_iter_type),
+                unsigned       want_iter_type,
+                struct bpos    *want_pos),
+       TP_ARGS(trans_ip, caller_ip, in_traverse_all, reason,
+               have_btree_id, have_iter_type, have_pos,
+               want_btree_id, want_iter_type, want_pos),
 
        TP_STRUCT__entry(
                __field(unsigned long,          trans_ip        )
                __field(unsigned long,          caller_ip       )
+               __field(u8,                     in_traverse_all )
                __field(u8,                     reason          )
                __field(u8,                     have_btree_id   )
                __field(u8,                     have_iter_type  )
                __field(u8,                     want_btree_id   )
                __field(u8,                     want_iter_type  )
+
+               __field(u64,                    have_pos_inode  )
+               __field(u64,                    have_pos_offset )
+               __field(u32,                    have_pos_snapshot)
+               __field(u32,                    want_pos_snapshot)
+               __field(u64,                    want_pos_inode  )
+               __field(u64,                    want_pos_offset )
        ),
 
        TP_fast_assign(
                __entry->trans_ip               = trans_ip;
                __entry->caller_ip              = caller_ip;
+               __entry->in_traverse_all        = in_traverse_all;
                __entry->reason                 = reason;
                __entry->have_btree_id          = have_btree_id;
                __entry->have_iter_type         = have_iter_type;
                __entry->want_btree_id          = want_btree_id;
                __entry->want_iter_type         = want_iter_type;
+
+               __entry->have_pos_inode         = have_pos->inode;
+               __entry->have_pos_offset        = have_pos->offset;
+               __entry->have_pos_snapshot      = have_pos->snapshot;
+
+               __entry->want_pos_inode         = want_pos->inode;
+               __entry->want_pos_offset        = want_pos->offset;
+               __entry->want_pos_snapshot      = want_pos->snapshot;
        ),
 
-       TP_printk("%pS %pS because %u have %u:%u want %u:%u",
+       TP_printk("%pS %pS traverse_all %u because %u have %u:%u %llu:%llu:%u want %u:%u %llu:%llu:%u",
                  (void *) __entry->trans_ip,
                  (void *) __entry->caller_ip,
+                 __entry->in_traverse_all,
                  __entry->reason,
                  __entry->have_btree_id,
                  __entry->have_iter_type,
+                 __entry->have_pos_inode,
+                 __entry->have_pos_offset,
+                 __entry->have_pos_snapshot,
                  __entry->want_btree_id,
-                 __entry->want_iter_type)
+                 __entry->want_iter_type,
+                 __entry->want_pos_inode,
+                 __entry->want_pos_offset,
+                 __entry->want_pos_snapshot)
 );
 
 TRACE_EVENT(trans_restart_iters_realloced,
@@ -689,6 +716,11 @@ DEFINE_EVENT(transaction_restart,  trans_restart_traverse,
        TP_ARGS(ip)
 );
 
+DEFINE_EVENT(transaction_restart,      trans_traverse_all,
+       TP_PROTO(unsigned long ip),
+       TP_ARGS(ip)
+);
+
 DECLARE_EVENT_CLASS(node_lock_fail,
        TP_PROTO(unsigned level, u32 iter_seq, unsigned node, u32 node_seq),
        TP_ARGS(level, iter_seq, node, node_seq),