From: Kent Overstreet Date: Mon, 11 Sep 2023 03:33:08 +0000 (-0400) Subject: bcachefs: Fix a deadlock X-Git-Url: http://git.maquefel.me/?a=commitdiff_plain;h=ee7570546e89ece9b56eaa22c94a6ec0941ec827;p=linux.git bcachefs: Fix a deadlock Waiting on a btree node write with btree locks held can deadlock, if the write errors: the write error path has to do do a btree update to drop the pointer to the replica that errored. The interior update path has to wait on in flight btree writes before freeing nodes on disk. Previously, this was done in bch2_btree_interior_update_will_free_node(), and could deadlock; now, we just stash a pointer to the node and do it in btree_update_nodes_written(), just prior to the transactional part of the update. Signed-off-by: Kent Overstreet --- diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c index 0f2a730e04b59..4ffdc11f4d9a9 100644 --- a/fs/bcachefs/btree_io.c +++ b/fs/bcachefs/btree_io.c @@ -1727,6 +1727,10 @@ void __bch2_btree_node_write(struct bch_fs *c, struct btree *b) return; if (old & (1 << BTREE_NODE_write_in_flight)) { + /* + * XXX waiting on btree writes with btree locks held - + * this can deadlock, and we hit the write error path + */ btree_node_wait_on_io(b); continue; } diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c index 6eeb0ca58b6a2..569db972f3bb6 100644 --- a/fs/bcachefs/btree_update_interior.c +++ b/fs/bcachefs/btree_update_interior.c @@ -550,6 +550,22 @@ static void btree_update_nodes_written(struct btree_update *as) BUG_ON(!journal_pin_active(&as->journal)); + /* + * Wait for any in flight writes to finish before we free the old nodes + * on disk: + */ + for (i = 0; i < as->nr_old_nodes; i++) { + struct btree *old = as->old_nodes[i]; + __le64 seq; + + six_lock_read(&old->c.lock, NULL, NULL); + seq = old->data ? old->data->keys.seq : 0; + six_unlock_read(&old->c.lock); + + if (seq == as->old_nodes_seq[i]) + btree_node_wait_on_io(old); + } + /* * We did an update to a parent node where the pointers we added pointed * to child nodes that weren't written yet: now, the child nodes have @@ -889,13 +905,9 @@ void bch2_btree_interior_update_will_free_node(struct btree_update *as, btree_update_will_delete_key(as, &b->key); - /* - * XXX: Waiting on io with btree node locks held, we don't want to be - * doing this. We can't have btree writes happening after the space has - * been freed, but we really only need to block before - * btree_update_nodes_written_trans() happens. - */ - btree_node_wait_on_io(b); + as->old_nodes[as->nr_old_nodes] = b; + as->old_nodes_seq[as->nr_old_nodes] = b->data->keys.seq; + as->nr_old_nodes++; } void bch2_btree_update_done(struct btree_update *as) diff --git a/fs/bcachefs/btree_update_interior.h b/fs/bcachefs/btree_update_interior.h index 7eef3dbb6ef17..7ed67b47e1b93 100644 --- a/fs/bcachefs/btree_update_interior.h +++ b/fs/bcachefs/btree_update_interior.h @@ -92,6 +92,10 @@ struct btree_update { struct btree *new_nodes[BTREE_UPDATE_NODES_MAX]; unsigned nr_new_nodes; + struct btree *old_nodes[BTREE_UPDATE_NODES_MAX]; + __le64 old_nodes_seq[BTREE_UPDATE_NODES_MAX]; + unsigned nr_old_nodes; + open_bucket_idx_t open_buckets[BTREE_UPDATE_NODES_MAX * BCH_REPLICAS_MAX]; open_bucket_idx_t nr_open_buckets;