bcachefs: Better approach to write vs. read lock deadlocks
authorKent Overstreet <kent.overstreet@gmail.com>
Wed, 8 Sep 2021 01:25:32 +0000 (21:25 -0400)
committerKent Overstreet <kent.overstreet@linux.dev>
Sun, 22 Oct 2023 21:09:12 +0000 (17:09 -0400)
Instead of unconditionally upgrading read locks to intent locks in
do_bch2_trans_commit(), this patch changes the path that takes write
locks to first trylock, and then if trylock fails check if we have a
conflicting read lock, and restart the transaction if necessary.

Signed-off-by: Kent Overstreet <kent.overstreet@gmail.com>
fs/bcachefs/btree_update_leaf.c
fs/bcachefs/trace.h

index 576f0739fdbd4a697777bf468e38793dad43e500..ab5cca892e1a7f5dd0e82f330035e2a1de95c834 100644 (file)
@@ -545,6 +545,54 @@ static inline void normalize_read_intent_locks(struct btree_trans *trans)
        bch2_trans_verify_locks(trans);
 }
 
+static inline bool have_conflicting_read_lock(struct btree_trans *trans, struct btree_path *pos)
+{
+       struct btree_path *path;
+       unsigned i;
+
+       trans_for_each_path_inorder(trans, path, i) {
+               //if (path == pos)
+               //      break;
+
+               if (path->nodes_locked != path->nodes_intent_locked)
+                       return true;
+       }
+
+       return false;
+}
+
+static inline int trans_lock_write(struct btree_trans *trans)
+{
+       struct btree_insert_entry *i;
+
+       trans_for_each_update(trans, i) {
+               if (same_leaf_as_prev(trans, i))
+                       continue;
+
+               if (!six_trylock_write(&insert_l(i)->b->c.lock)) {
+                       if (have_conflicting_read_lock(trans, i->path))
+                               goto fail;
+
+                       __btree_node_lock_type(trans->c, insert_l(i)->b,
+                                              SIX_LOCK_write);
+               }
+
+               bch2_btree_node_prep_for_write(trans, i->path, insert_l(i)->b);
+       }
+
+       return 0;
+fail:
+       while (--i >= trans->updates) {
+               if (same_leaf_as_prev(trans, i))
+                       continue;
+
+               bch2_btree_node_unlock_write_inlined(trans, i->path, insert_l(i)->b);
+       }
+
+       trace_trans_restart_would_deadlock_write(trans->ip);
+       return btree_trans_restart(trans);
+}
+
 /*
  * Get journal reservation, take write locks, and attempt to do btree update(s):
  */
@@ -554,10 +602,25 @@ static inline int do_bch2_trans_commit(struct btree_trans *trans,
 {
        struct bch_fs *c = trans->c;
        struct btree_insert_entry *i;
-       struct btree_path *path;
        struct bkey_s_c old;
        int ret, u64s_delta = 0;
 
+       trans_for_each_update(trans, i) {
+               const char *invalid = bch2_bkey_invalid(c,
+                               bkey_i_to_s_c(i->k), i->bkey_type);
+               if (invalid) {
+                       char buf[200];
+
+                       bch2_bkey_val_to_text(&PBUF(buf), c, bkey_i_to_s_c(i->k));
+                       bch_err(c, "invalid bkey %s on insert from %ps -> %ps: %s\n",
+                               buf, (void *) trans->ip,
+                               (void *) i->ip_allocated, invalid);
+                       bch2_fatal_error(c);
+                       return -EINVAL;
+               }
+               btree_insert_entry_checks(trans, i);
+       }
+
        trans_for_each_update(trans, i) {
                struct bkey u;
 
@@ -599,48 +662,11 @@ static inline int do_bch2_trans_commit(struct btree_trans *trans,
        if (unlikely(ret))
                return ret;
 
-       /*
-        * Can't be holding any read locks when we go to take write locks:
-        * another thread could be holding an intent lock on the same node we
-        * have a read lock on, and it'll block trying to take a write lock
-        * (because we hold a read lock) and it could be blocking us by holding
-        * its own read lock (while we're trying to to take write locks).
-        *
-        * note - this must be done after bch2_trans_journal_preres_get_cold()
-        * or anything else that might call bch2_trans_relock(), since that
-        * would just retake the read locks:
-        */
-       trans_for_each_path(trans, path)
-               if (path->nodes_locked != path->nodes_intent_locked &&
-                   !bch2_btree_path_upgrade(trans, path, path->level + 1)) {
-                       trace_trans_restart_upgrade(trans->ip, trace_ip,
-                                                   path->btree_id, &path->pos);
-                       return btree_trans_restart(trans);
-               }
-
-       trans_for_each_update(trans, i) {
-               const char *invalid = bch2_bkey_invalid(c,
-                               bkey_i_to_s_c(i->k), i->bkey_type);
-               if (invalid) {
-                       char buf[200];
-
-                       bch2_bkey_val_to_text(&PBUF(buf), c, bkey_i_to_s_c(i->k));
-                       bch_err(c, "invalid bkey %s on insert from %ps -> %ps: %s\n",
-                               buf, (void *) trans->ip,
-                               (void *) i->ip_allocated, invalid);
-                       bch2_fatal_error(c);
-                       return -EINVAL;
-               }
-               btree_insert_entry_checks(trans, i);
-       }
-
        normalize_read_intent_locks(trans);
 
-       trans_for_each_update(trans, i)
-               if (!same_leaf_as_prev(trans, i)) {
-                       btree_node_lock_type(c, insert_l(i)->b, SIX_LOCK_write);
-                       bch2_btree_node_prep_for_write(trans, i->path, insert_l(i)->b);
-               }
+       ret = trans_lock_write(trans);
+       if (unlikely(ret))
+               return ret;
 
        ret = bch2_trans_commit_write_locked(trans, stopped_at, trace_ip);
 
index 960dcc8ce3e6be9f6a819f415ee3121372108dd6..21d026277540fd5c5c75cc86fd18a980500f08ff 100644 (file)
@@ -756,6 +756,21 @@ TRACE_EVENT(trans_restart_would_deadlock,
                  __entry->want_pos_snapshot)
 );
 
+TRACE_EVENT(trans_restart_would_deadlock_write,
+       TP_PROTO(unsigned long trans_ip),
+       TP_ARGS(trans_ip),
+
+       TP_STRUCT__entry(
+               __field(unsigned long,          trans_ip        )
+       ),
+
+       TP_fast_assign(
+               __entry->trans_ip       = trans_ip;
+       ),
+
+       TP_printk("%ps", (void *) __entry->trans_ip)
+);
+
 TRACE_EVENT(trans_restart_mem_realloced,
        TP_PROTO(unsigned long trans_ip, unsigned long caller_ip,
                 unsigned long bytes),