bcachefs: Deadlock cycle detector
authorKent Overstreet <kent.overstreet@gmail.com>
Mon, 22 Aug 2022 17:23:47 +0000 (13:23 -0400)
committerKent Overstreet <kent.overstreet@linux.dev>
Sun, 22 Oct 2023 21:09:41 +0000 (17:09 -0400)
We've outgrown our own deadlock avoidance strategy.

The btree iterator API provides an interface where the user doesn't need
to concern themselves with lock ordering - different btree iterators can
be traversed in any order. Without special care, this will lead to
deadlocks.

Our previous strategy was to define a lock ordering internally, and
whenever we attempt to take a lock and trylock() fails, we'd check if
the current btree transaction is holding any locks that cause a lock
ordering violation. If so, we'd issue a transaction restart, and then
bch2_trans_begin() would re-traverse all previously used iterators, but
in the correct order.

That approach had some issues, though.
 - Sometimes we'd issue transaction restarts unnecessarily, when no
   deadlock would have actually occured. Lock ordering restarts have
   become our primary cause of transaction restarts, on some workloads
   totally 20% of actual transaction commits.

 - To avoid deadlock or livelock, we'd often have to take intent locks
   when we only wanted a read lock: with the lock ordering approach, it
   is actually illegal to hold _any_ read lock while blocking on an intent
   lock, and this has been causing us unnecessary lock contention.

 - It was getting fragile - the various lock ordering rules are not
   trivial, and we'd been seeing occasional livelock issues related to
   this machinery.

So, since bcachefs is already a relational database masquerading as a
filesystem, we're stealing the next traditional database technique and
switching to a cycle detector for avoiding deadlocks.

When we block taking a btree lock, after adding ourself to the waitlist
but before sleeping, we do a DFS of btree transactions waiting on other
btree transactions, starting with the current transaction and walking
our held locks, and transactions blocking on our held locks.

If we find a cycle, we emit a transaction restart. Occasionally (e.g.
the btree split path) we can not allow the lock() operation to fail, so
if necessary we'll tell another transaction that it has to fail.

Result: trans_restart_would_deadlock events are reduced by a factor of
10 to 100, and we'll be able to delete a whole bunch of grotty, fragile
code.

Signed-off-by: Kent Overstreet <kent.overstreet@gmail.com>
fs/bcachefs/bcachefs_format.h
fs/bcachefs/btree_iter.c
fs/bcachefs/btree_iter.h
fs/bcachefs/btree_locking.c
fs/bcachefs/btree_locking.h
fs/bcachefs/btree_types.h
fs/bcachefs/debug.c
fs/bcachefs/errcode.h
fs/bcachefs/trace.h

index 0e80fe2568f26741abdca19c4d0cfa10a49bc18c..5471b797be9377b583330ad49e9eb6c13aab99ce 100644 (file)
@@ -1400,7 +1400,8 @@ struct bch_sb_field_disk_groups {
        x(trans_restart_key_cache_upgrade,              70)     \
        x(trans_traverse_all,                           71)     \
        x(transaction_commit,                           72)     \
-       x(write_super,                                  73)
+       x(write_super,                                  73)     \
+       x(trans_restart_would_deadlock_recursion_limit, 74)
 
 enum bch_persistent_counters {
 #define x(t, n, ...) BCH_COUNTER_##t,
index df9949cef9079fe0e52cd19838b28a36e87dbd75..5773b00e69ac1c48fc16e97dee6ff0994fd46f18 100644 (file)
@@ -2848,9 +2848,10 @@ void __bch2_trans_init(struct btree_trans *trans, struct bch_fs *c, const char *
        trans->fn               = fn;
        trans->last_begin_time  = ktime_get_ns();
        trans->fn_idx           = bch2_trans_get_fn_idx(trans, c, fn);
-       trans->task             = current;
+       trans->locking_wait.task = current;
        trans->journal_replay_not_finished =
                !test_bit(JOURNAL_REPLAY_DONE, &c->journal.flags);
+       closure_init_stack(&trans->ref);
 
        bch2_trans_alloc_paths(trans, c);
 
@@ -2877,7 +2878,7 @@ void __bch2_trans_init(struct btree_trans *trans, struct bch_fs *c, const char *
 
                mutex_lock(&c->btree_trans_lock);
                list_for_each_entry(pos, &c->btree_trans_list, list) {
-                       if (trans->task->pid < pos->task->pid) {
+                       if (trans->locking_wait.task->pid < pos->locking_wait.task->pid) {
                                list_add_tail(&trans->list, &pos->list);
                                goto list_add_done;
                        }
@@ -2919,6 +2920,8 @@ void bch2_trans_exit(struct btree_trans *trans)
 
        bch2_trans_unlock(trans);
 
+       closure_sync(&trans->ref);
+
        if (s)
                s->max_mem = max(s->max_mem, trans->mem_max);
 
@@ -2997,7 +3000,7 @@ void bch2_btree_trans_to_text(struct printbuf *out, struct btree_trans *trans)
        static char lock_types[] = { 'r', 'i', 'w' };
        unsigned l;
 
-       prt_printf(out, "%i %s\n", trans->task->pid, trans->fn);
+       prt_printf(out, "%i %s\n", trans->locking_wait.task->pid, trans->fn);
 
        trans_for_each_path(trans, path) {
                if (!path->nodes_locked)
@@ -3029,7 +3032,7 @@ void bch2_btree_trans_to_text(struct printbuf *out, struct btree_trans *trans)
                       trans->locking_path_idx,
                       path->cached ? 'c' : 'b',
                       trans->locking_level,
-                      lock_types[trans->locking_lock_type],
+                      lock_types[trans->locking_wait.lock_want],
                       bch2_btree_ids[trans->locking_btree_id]);
                bch2_bpos_to_text(out, trans->locking_pos);
 
index 2f47889c688a2a52af6c0f73344e58da5c047a5e..04b6773d6e109b1322c94121d728956c39199991 100644 (file)
@@ -81,11 +81,14 @@ __trans_next_path(struct btree_trans *trans, unsigned idx)
        return &trans->paths[idx];
 }
 
-#define trans_for_each_path(_trans, _path)                             \
-       for (_path = __trans_next_path((_trans), 0);                    \
+#define trans_for_each_path_from(_trans, _path, _start)                        \
+       for (_path = __trans_next_path((_trans), _start);               \
             (_path);                                                   \
             _path = __trans_next_path((_trans), (_path)->idx + 1))
 
+#define trans_for_each_path(_trans, _path)                             \
+       trans_for_each_path_from(_trans, _path, 0)
+
 static inline struct btree_path *next_btree_path(struct btree_trans *trans, struct btree_path *path)
 {
        unsigned idx = path ? path->sorted_idx + 1 : 0;
index 5b6d8184ea4522ef8f19585f723d42b8496ca820..869f4163a3c6cce68a00051c776d3b7ea88c8318 100644 (file)
@@ -52,10 +52,248 @@ void bch2_btree_node_unlock_write(struct btree_trans *trans,
 
 /* lock */
 
-void __bch2_btree_node_lock_write(struct btree_trans *trans,
-                                 struct btree_bkey_cached_common *b)
+/*
+ * @trans wants to lock @b with type @type
+ */
+struct trans_waiting_for_lock {
+       struct btree_trans              *trans;
+       struct btree_bkey_cached_common *node_want;
+       enum six_lock_type              lock_want;
+
+       /* for iterating over held locks :*/
+       u8                              path_idx;
+       u8                              level;
+       u64                             lock_start_time;
+};
+
+struct lock_graph {
+       struct trans_waiting_for_lock   g[8];
+       unsigned                        nr;
+};
+
+static void lock_graph_pop(struct lock_graph *g)
+{
+       closure_put(&g->g[--g->nr].trans->ref);
+}
+
+static int abort_lock(struct lock_graph *g, struct trans_waiting_for_lock *i)
+{
+       int ret;
+
+       if (i == g->g) {
+               ret = btree_trans_restart(i->trans, BCH_ERR_transaction_restart_would_deadlock);
+       } else {
+               i->trans->lock_must_abort = true;
+               ret = 0;
+       }
+
+       for (i = g->g + 1; i < g->g + g->nr; i++)
+               wake_up_process(i->trans->locking_wait.task);
+       return ret;
+}
+
+static noinline int break_cycle(struct lock_graph *g)
+{
+       struct trans_waiting_for_lock *i;
+
+       for (i = g->g; i < g->g + g->nr; i++) {
+               if (i->trans->lock_may_not_fail ||
+                   i->trans->locking_wait.lock_want == SIX_LOCK_write)
+                       continue;
+
+               return abort_lock(g, i);
+       }
+
+       for (i = g->g; i < g->g + g->nr; i++) {
+               if (i->trans->lock_may_not_fail ||
+                   !i->trans->in_traverse_all)
+                       continue;
+
+               return abort_lock(g, i);
+       }
+
+       for (i = g->g; i < g->g + g->nr; i++) {
+               if (i->trans->lock_may_not_fail)
+                       continue;
+
+               return abort_lock(g, i);
+       }
+
+       BUG();
+}
+
+static int lock_graph_descend(struct lock_graph *g, struct btree_trans *trans)
+{
+       struct btree_trans *orig_trans = g->g->trans;
+       struct trans_waiting_for_lock *i;
+       int ret = 0;
+
+       for (i = g->g; i < g->g + g->nr; i++) {
+               if (i->trans->locking != i->node_want)
+                       while (g->g + g->nr >= i) {
+                               lock_graph_pop(g);
+                               return 0;
+                       }
+
+               if (i->trans == trans) {
+                       ret = break_cycle(g);
+                       if (ret)
+                               goto deadlock;
+                       /*
+                        * If we didn't abort (instead telling another
+                        * transaction to abort), keep checking:
+                        */
+               }
+       }
+
+       if (g->nr == ARRAY_SIZE(g->g)) {
+               if (orig_trans->lock_may_not_fail)
+                       return 0;
+
+               trace_and_count(trans->c, trans_restart_would_deadlock_recursion_limit, trans, _RET_IP_);
+               ret = btree_trans_restart(orig_trans, BCH_ERR_transaction_restart_deadlock_recursion_limit);
+               goto deadlock;
+       }
+
+       closure_get(&trans->ref);
+
+       g->g[g->nr++] = (struct trans_waiting_for_lock) {
+               .trans          = trans,
+               .node_want      = trans->locking,
+               .lock_want      = trans->locking_wait.lock_want,
+       };
+
+       return 0;
+deadlock:
+       while (g->nr)
+               lock_graph_pop(g);
+       return ret;
+}
+
+#if 0
+static void print_cycle(struct printbuf *out, struct lock_graph *g)
+{
+       struct trans_waiting_for_lock *i;
+
+       prt_str(out, "Found lock cycle:");
+       prt_newline(out);
+
+       for (i = g->g; i < g->g + g->nr; i++)
+               bch2_btree_trans_to_text(out, i->trans);
+}
+#endif
+
+static noinline void lock_graph_remove_non_waiters(struct lock_graph *g)
+{
+       struct trans_waiting_for_lock *i;
+
+       for (i = g->g + 1; i < g->g + g->nr; i++)
+               if (i->trans->locking != i->node_want ||
+                   i->trans->locking_wait.start_time != i[-1].lock_start_time) {
+                       while (g->g + g->nr >= i)
+                               lock_graph_pop(g);
+                       return;
+               }
+       BUG();
+}
+
+static bool lock_type_conflicts(enum six_lock_type t1, enum six_lock_type t2)
+{
+       return t1 + t2 > 1;
+}
+
+static int check_for_deadlock(struct btree_trans *trans)
+{
+       struct lock_graph g;
+       struct trans_waiting_for_lock *top;
+       struct btree_bkey_cached_common *b;
+       struct btree_path *path;
+       int ret;
+
+       if (trans->lock_must_abort)
+               return btree_trans_restart(trans, BCH_ERR_transaction_restart_would_deadlock);
+
+       g.nr = 0;
+       ret = lock_graph_descend(&g, trans);
+       BUG_ON(ret);
+next:
+       if (!g.nr)
+               return 0;
+
+       top = &g.g[g.nr - 1];
+
+       trans_for_each_path_from(top->trans, path, top->path_idx) {
+               if (!path->nodes_locked)
+                       continue;
+
+               if (top->path_idx != path->idx) {
+                       top->path_idx           = path->idx;
+                       top->level              = 0;
+                       top->lock_start_time    = 0;
+               }
+
+               for (;
+                    top->level < BTREE_MAX_DEPTH;
+                    top->level++, top->lock_start_time = 0) {
+                       int lock_held = btree_node_locked_type(path, top->level);
+
+                       if (lock_held == BTREE_NODE_UNLOCKED)
+                               continue;
+
+                       b = &READ_ONCE(path->l[top->level].b)->c;
+
+                       if (unlikely(IS_ERR_OR_NULL(b))) {
+                               lock_graph_remove_non_waiters(&g);
+                               goto next;
+                       }
+
+                       if (list_empty_careful(&b->lock.wait_list))
+                               continue;
+
+                       raw_spin_lock(&b->lock.wait_lock);
+                       list_for_each_entry(trans, &b->lock.wait_list, locking_wait.list) {
+                               BUG_ON(b != trans->locking);
+
+                               if (top->lock_start_time &&
+                                   time_after_eq64(top->lock_start_time, trans->locking_wait.start_time))
+                                       continue;
+
+                               top->lock_start_time = trans->locking_wait.start_time;
+
+                               /* Don't check for self deadlock: */
+                               if (trans == top->trans ||
+                                   !lock_type_conflicts(lock_held, trans->locking_wait.lock_want))
+                                       continue;
+
+                               ret = lock_graph_descend(&g, trans);
+                               raw_spin_unlock(&b->lock.wait_lock);
+
+                               if (ret)
+                                       return ret < 0 ? ret : 0;
+                               goto next;
+
+                       }
+                       raw_spin_unlock(&b->lock.wait_lock);
+               }
+       }
+
+       lock_graph_pop(&g);
+       goto next;
+}
+
+int bch2_six_check_for_deadlock(struct six_lock *lock, void *p)
+{
+       struct btree_trans *trans = p;
+
+       return check_for_deadlock(trans);
+}
+
+int __bch2_btree_node_lock_write(struct btree_trans *trans,
+                                struct btree_bkey_cached_common *b,
+                                bool lock_may_not_fail)
 {
        int readers = bch2_btree_node_lock_counts(trans, NULL, b, b->level).n[SIX_LOCK_read];
+       int ret;
 
        /*
         * Must drop our read locks before calling six_lock_write() -
@@ -64,8 +302,10 @@ void __bch2_btree_node_lock_write(struct btree_trans *trans,
         * locked:
         */
        six_lock_readers_add(&b->lock, -readers);
-       btree_node_lock_nopath_nofail(trans, b, SIX_LOCK_write);
+       ret = __btree_node_lock_nopath(trans, b, SIX_LOCK_write, lock_may_not_fail);
        six_lock_readers_add(&b->lock, readers);
+
+       return ret;
 }
 
 static inline bool path_has_read_locks(struct btree_path *path)
index aea2ebafffd8730b77356b8703195673a1cb0fa7..874dd4428b3af00b30bc95f233ea61f865f79e87 100644 (file)
@@ -183,22 +183,41 @@ bch2_btree_node_unlock_write_inlined(struct btree_trans *trans, struct btree_pat
 void bch2_btree_node_unlock_write(struct btree_trans *,
                        struct btree_path *, struct btree *);
 
+int bch2_six_check_for_deadlock(struct six_lock *lock, void *p);
+
 /* lock: */
 
+static inline int __btree_node_lock_nopath(struct btree_trans *trans,
+                                        struct btree_bkey_cached_common *b,
+                                        enum six_lock_type type,
+                                        bool lock_may_not_fail)
+{
+       int ret;
+
+       trans->lock_may_not_fail = lock_may_not_fail;
+       trans->locking          = b;
+       trans->lock_must_abort  = false;
+
+       ret = six_lock_type_waiter(&b->lock, type, &trans->locking_wait,
+                                  bch2_six_check_for_deadlock, trans);
+       WRITE_ONCE(trans->locking, NULL);
+       WRITE_ONCE(trans->locking_wait.start_time, 0);
+       return ret;
+}
+
 static inline int __must_check
 btree_node_lock_nopath(struct btree_trans *trans,
                       struct btree_bkey_cached_common *b,
                       enum six_lock_type type)
 {
-       six_lock_type(&b->lock, type, NULL, NULL);
-       return 0;
+       return __btree_node_lock_nopath(trans, b, type, false);
 }
 
 static inline void btree_node_lock_nopath_nofail(struct btree_trans *trans,
                                         struct btree_bkey_cached_common *b,
                                         enum six_lock_type type)
 {
-       int ret = btree_node_lock_nopath(trans, b, type);
+       int ret = __btree_node_lock_nopath(trans, b, type, true);
 
        BUG_ON(ret);
 }
@@ -210,8 +229,6 @@ static inline int btree_node_lock_type(struct btree_trans *trans,
                                       enum six_lock_type type,
                                       six_lock_should_sleep_fn should_sleep_fn, void *p)
 {
-       int ret;
-
        if (six_trylock_type(&b->lock, type))
                return 0;
 
@@ -219,11 +236,10 @@ static inline int btree_node_lock_type(struct btree_trans *trans,
        trans->locking_pos      = pos;
        trans->locking_btree_id = path->btree_id;
        trans->locking_level    = level;
-       trans->locking_lock_type = type;
+       trans->lock_may_not_fail = false;
        trans->locking          = b;
-       ret = six_lock_type(&b->lock, type, should_sleep_fn, p);
-       trans->locking = NULL;
-       return ret;
+       return six_lock_type_waiter(&b->lock, type, &trans->locking_wait,
+                                   bch2_six_check_for_deadlock, trans);
 }
 
 /*
@@ -279,12 +295,15 @@ static inline int btree_node_lock(struct btree_trans *trans,
        return ret;
 }
 
-void __bch2_btree_node_lock_write(struct btree_trans *, struct btree_bkey_cached_common *);
+int __bch2_btree_node_lock_write(struct btree_trans *, struct btree_bkey_cached_common *, bool);
 
-static inline void bch2_btree_node_lock_write_nofail(struct btree_trans *trans,
-                                             struct btree_path *path,
-                                             struct btree_bkey_cached_common *b)
+static inline int __btree_node_lock_write(struct btree_trans *trans,
+                                         struct btree_path *path,
+                                         struct btree_bkey_cached_common *b,
+                                         bool lock_may_not_fail)
 {
+       int ret;
+
        EBUG_ON(&path->l[b->level].b->c != b);
        EBUG_ON(path->l[b->level].lock_seq != b->lock.state.seq);
        EBUG_ON(!btree_node_intent_locked(path, b->level));
@@ -296,8 +315,21 @@ static inline void bch2_btree_node_lock_write_nofail(struct btree_trans *trans,
         */
        mark_btree_node_locked_noreset(path, b->level, SIX_LOCK_write);
 
-       if (unlikely(!six_trylock_write(&b->lock)))
-               __bch2_btree_node_lock_write(trans, b);
+       ret = likely(six_trylock_write(&b->lock))
+               ? 0
+               : __bch2_btree_node_lock_write(trans, b, lock_may_not_fail);
+       if (ret)
+               mark_btree_node_locked_noreset(path, b->level, SIX_LOCK_intent);
+
+       return ret;
+}
+
+static inline void bch2_btree_node_lock_write_nofail(struct btree_trans *trans,
+                                             struct btree_path *path,
+                                             struct btree_bkey_cached_common *b)
+{
+       int ret = __btree_node_lock_write(trans, path, b, true);
+       BUG_ON(ret);
 }
 
 static inline int __must_check
@@ -305,8 +337,7 @@ bch2_btree_node_lock_write(struct btree_trans *trans,
                           struct btree_path *path,
                           struct btree_bkey_cached_common *b)
 {
-       bch2_btree_node_lock_write_nofail(trans, path, b);
-       return 0;
+       return __btree_node_lock_write(trans, path, b, false);
 }
 
 /* relock: */
index 0a3854b614e042bbb736bbbd864800c8b8ccb639..578cf8fa3d2fc59d8206f8c0be39781b6cbfff08 100644 (file)
@@ -387,15 +387,19 @@ struct btree_trans_commit_hook {
 struct btree_trans {
        struct bch_fs           *c;
        const char              *fn;
+       struct closure          ref;
        struct list_head        list;
        u64                     last_begin_time;
-       struct btree_bkey_cached_common *locking;
+
        unsigned                locking_path_idx;
        struct bpos             locking_pos;
        u8                      locking_btree_id;
        u8                      locking_level;
-       u8                      locking_lock_type;
-       struct task_struct      *task;
+       u8                      lock_may_not_fail;
+       u8                      lock_must_abort;
+       struct btree_bkey_cached_common *locking;
+       struct six_lock_waiter  locking_wait;
+
        int                     srcu_idx;
 
        u8                      fn_idx;
index 4fe20d36212e4aa968810849a07a9af63f3b43c5..6944dfef5bcb154527b5a8f0dc9d0a8d968eb03c 100644 (file)
@@ -534,7 +534,7 @@ static ssize_t bch2_btree_transactions_read(struct file *file, char __user *buf,
 
        mutex_lock(&c->btree_trans_lock);
        list_for_each_entry(trans, &c->btree_trans_list, list) {
-               if (trans->task->pid <= i->iter)
+               if (trans->locking_wait.task->pid <= i->iter)
                        continue;
 
                ret = flush_buf(i);
@@ -546,11 +546,11 @@ static ssize_t bch2_btree_transactions_read(struct file *file, char __user *buf,
                prt_printf(&i->buf, "backtrace:");
                prt_newline(&i->buf);
                printbuf_indent_add(&i->buf, 2);
-               prt_backtrace(&i->buf, trans->task);
+               prt_backtrace(&i->buf, trans->locking_wait.task);
                printbuf_indent_sub(&i->buf, 2);
                prt_newline(&i->buf);
 
-               i->iter = trans->task->pid;
+               i->iter = trans->locking_wait.task->pid;
        }
        mutex_unlock(&c->btree_trans_lock);
 
index 3dc477eb3600c800564fd4fc4484d30711b07e65..1ea004f1adbb568db31c60f79fcd56113d460c50 100644 (file)
@@ -35,6 +35,7 @@
        x(BCH_ERR_transaction_restart,  transaction_restart_in_traverse_all)    \
        x(BCH_ERR_transaction_restart,  transaction_restart_would_deadlock)     \
        x(BCH_ERR_transaction_restart,  transaction_restart_would_deadlock_write)\
+       x(BCH_ERR_transaction_restart,  transaction_restart_deadlock_recursion_limit)\
        x(BCH_ERR_transaction_restart,  transaction_restart_upgrade)            \
        x(BCH_ERR_transaction_restart,  transaction_restart_key_cache_upgrade)  \
        x(BCH_ERR_transaction_restart,  transaction_restart_key_cache_fill)     \
index 62de89fcb74b0af28367dd37f12a7c7976a624bc..35c40678f4b5da03c31486d69db16abe02fc6aaa 100644 (file)
@@ -1065,6 +1065,12 @@ TRACE_EVENT(trans_restart_would_deadlock,
                  __entry->want_pos_snapshot)
 );
 
+DEFINE_EVENT(transaction_event,        trans_restart_would_deadlock_recursion_limit,
+       TP_PROTO(struct btree_trans *trans,
+                unsigned long caller_ip),
+       TP_ARGS(trans, caller_ip)
+);
+
 TRACE_EVENT(trans_restart_would_deadlock_write,
        TP_PROTO(struct btree_trans *trans),
        TP_ARGS(trans),