bcachefs: Fix a deadlock
authorKent Overstreet <kent.overstreet@linux.dev>
Mon, 11 Sep 2023 03:33:08 +0000 (23:33 -0400)
committerKent Overstreet <kent.overstreet@linux.dev>
Sun, 22 Oct 2023 21:09:05 +0000 (17:09 -0400)
Waiting on a btree node write with btree locks held can deadlock, if the
write errors: the write error path has to do do a btree update to drop
the pointer to the replica that errored.

The interior update path has to wait on in flight btree writes before
freeing nodes on disk. Previously, this was done in
bch2_btree_interior_update_will_free_node(), and could deadlock; now, we
just stash a pointer to the node and do it in
btree_update_nodes_written(), just prior to the transactional part of
the update.

Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
fs/bcachefs/btree_io.c
fs/bcachefs/btree_update_interior.c
fs/bcachefs/btree_update_interior.h

index 0f2a730e04b59d8c685cec8e8b7b38c4e63f7f6a..4ffdc11f4d9a940afa7836c778c77f5a68bf32da 100644 (file)
@@ -1727,6 +1727,10 @@ void __bch2_btree_node_write(struct bch_fs *c, struct btree *b)
                        return;
 
                if (old & (1 << BTREE_NODE_write_in_flight)) {
+                       /*
+                        * XXX waiting on btree writes with btree locks held -
+                        * this can deadlock, and we hit the write error path
+                        */
                        btree_node_wait_on_io(b);
                        continue;
                }
index 6eeb0ca58b6a24377f7315f02ddcccb62090760b..569db972f3bb6e673aef3608ffcc29459210c695 100644 (file)
@@ -550,6 +550,22 @@ static void btree_update_nodes_written(struct btree_update *as)
 
        BUG_ON(!journal_pin_active(&as->journal));
 
+       /*
+        * Wait for any in flight writes to finish before we free the old nodes
+        * on disk:
+        */
+       for (i = 0; i < as->nr_old_nodes; i++) {
+               struct btree *old = as->old_nodes[i];
+               __le64 seq;
+
+               six_lock_read(&old->c.lock, NULL, NULL);
+               seq = old->data ? old->data->keys.seq : 0;
+               six_unlock_read(&old->c.lock);
+
+               if (seq == as->old_nodes_seq[i])
+                       btree_node_wait_on_io(old);
+       }
+
        /*
         * We did an update to a parent node where the pointers we added pointed
         * to child nodes that weren't written yet: now, the child nodes have
@@ -889,13 +905,9 @@ void bch2_btree_interior_update_will_free_node(struct btree_update *as,
 
        btree_update_will_delete_key(as, &b->key);
 
-       /*
-        * XXX: Waiting on io with btree node locks held, we don't want to be
-        * doing this. We can't have btree writes happening after the space has
-        * been freed, but we really only need to block before
-        * btree_update_nodes_written_trans() happens.
-        */
-       btree_node_wait_on_io(b);
+       as->old_nodes[as->nr_old_nodes] = b;
+       as->old_nodes_seq[as->nr_old_nodes] = b->data->keys.seq;
+       as->nr_old_nodes++;
 }
 
 void bch2_btree_update_done(struct btree_update *as)
index 7eef3dbb6ef178ee0c9082b6699ffe36469fb35e..7ed67b47e1b9377bf960c15de653074e3a786c74 100644 (file)
@@ -92,6 +92,10 @@ struct btree_update {
        struct btree                    *new_nodes[BTREE_UPDATE_NODES_MAX];
        unsigned                        nr_new_nodes;
 
+       struct btree                    *old_nodes[BTREE_UPDATE_NODES_MAX];
+       __le64                          old_nodes_seq[BTREE_UPDATE_NODES_MAX];
+       unsigned                        nr_old_nodes;
+
        open_bucket_idx_t               open_buckets[BTREE_UPDATE_NODES_MAX *
                                                     BCH_REPLICAS_MAX];
        open_bucket_idx_t               nr_open_buckets;