bcachefs: Fix two more deadlocks
authorKent Overstreet <kent.overstreet@gmail.com>
Sat, 2 May 2020 20:21:35 +0000 (16:21 -0400)
committerKent Overstreet <kent.overstreet@linux.dev>
Sun, 22 Oct 2023 21:08:39 +0000 (17:08 -0400)
Deadlock on shutdown:

btree_update_nodes_written() unblocks btree nodes from being written;
after doing so, it has to check if they were marked as needing to be
written and if so kick off those writes - if that doesn't happen, we'll
never release journal pins and shutdown will get stuck when flushing the
journal.

There was an error path where this didn't happen, because in the error
path we don't actually want those btree nodes write to happen; however,
we still have to kick off the write path so the journal pins get
released. The btree write path checks if we're in a journal error state
and doesn't do the actual write if we are.

Also - there was another deadlock because btree_update_nodes_written()
was taking the btree update off of the unwritten_list too soon - before
getting a journal reservation, which could fail and have to be retried.

Signed-off-by: Kent Overstreet <kent.overstreet@gmail.com>
Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
fs/bcachefs/btree_io.c
fs/bcachefs/btree_update_interior.c

index 04537eb06e4aa459462f08ca04451c994a282fa1..32bd193a85c594c00d1e90a211a296716e1b1247 100644 (file)
@@ -1626,6 +1626,11 @@ void __bch2_btree_node_write(struct bch_fs *c, struct btree *b,
         * reflect that those writes were done and the data flushed from the
         * journal:
         *
+        * Also on journal error, the pending write may have updates that were
+        * never journalled (interior nodes, see btree_update_nodes_written()) -
+        * it's critical that we don't do the write in that case otherwise we
+        * will have updates visible that weren't in the journal:
+        *
         * Make sure to update b->written so bch2_btree_init_next() doesn't
         * break:
         */
index 1f0d955588581c05f4a68a7b9b043cbca0051f86..28568db5834ab1fc04dc50b5321c5de55ed9e8f8 100644 (file)
@@ -586,12 +586,12 @@ static void __bch2_btree_update_free(struct btree_update *as)
        bch2_journal_pin_drop(&c->journal, &as->journal);
        bch2_journal_pin_flush(&c->journal, &as->journal);
 
-       BUG_ON((as->nr_new_nodes || as->nr_pending) &&
-              !bch2_journal_error(&c->journal));;
+       BUG_ON(as->nr_new_nodes || as->nr_pending);
 
        if (as->reserve)
                bch2_btree_reserve_put(c, as->reserve);
 
+       list_del(&as->unwritten_list);
        list_del(&as->list);
 
        closure_debug_destroy(&as->cl);
@@ -625,12 +625,12 @@ static inline bool six_trylock_intentwrite(struct six_lock *lock)
 static void btree_update_nodes_written(struct closure *cl)
 {
        struct btree_update *as = container_of(cl, struct btree_update, cl);
-       struct btree *new_nodes[BTREE_MAX_DEPTH * 2 + GC_MERGE_NODES];
-       unsigned nr_new_nodes;
+       struct btree *nodes_need_write[BTREE_MAX_DEPTH * 2 + GC_MERGE_NODES + 1];
+       unsigned nr_nodes_need_write;
        struct journal_res res = { 0 };
        struct bch_fs *c = as->c;
+       struct btree_root *r;
        struct btree *b;
-       struct bset *i;
        int ret;
 
        /*
@@ -641,7 +641,7 @@ static void btree_update_nodes_written(struct closure *cl)
        mutex_lock(&c->btree_interior_update_lock);
        as->nodes_written = true;
 again:
-       nr_new_nodes = 0;
+       nr_nodes_need_write = 0;
        as = list_first_entry_or_null(&c->btree_interior_updates_unwritten,
                                      struct btree_update, unwritten_list);
        if (!as || !as->nodes_written) {
@@ -663,16 +663,16 @@ again:
                goto again;
        }
 
-       list_del(&as->unwritten_list);
-
        ret = bch2_journal_res_get(&c->journal, &res, as->journal_u64s,
                                   JOURNAL_RES_GET_NONBLOCK|
                                   JOURNAL_RES_GET_RESERVED);
        if (ret == -EAGAIN) {
                unsigned u64s = as->journal_u64s;
 
-               six_unlock_write(&b->c.lock);
-               six_unlock_intent(&b->c.lock);
+               if (b) {
+                       six_unlock_write(&b->c.lock);
+                       six_unlock_intent(&b->c.lock);
+               }
 
                mutex_unlock(&c->btree_interior_update_lock);
 
@@ -685,19 +685,22 @@ again:
                }
        }
 
-       if (ret) {
-               BUG_ON(!bch2_journal_error(&c->journal));
-               /* can't unblock btree writes */
-               goto free_update;
-       }
-
-       {
+       if (!ret) {
                struct journal_buf *buf = &c->journal.buf[res.idx];
                struct jset_entry *entry = vstruct_idx(buf->data, res.offset);
 
                res.offset      += as->journal_u64s;
                res.u64s        -= as->journal_u64s;
                memcpy_u64s(entry, as->journal_entries, as->journal_u64s);
+       } else {
+               /*
+                * On journal error we have to run most of the normal path so
+                * that shutdown works - unblocking btree node writes in
+                * particular and writing them if needed - except for
+                * journalling the update:
+                */
+
+               BUG_ON(!bch2_journal_error(&c->journal));
        }
 
        switch (as->mode) {
@@ -705,24 +708,41 @@ again:
                BUG();
        case BTREE_INTERIOR_UPDATING_NODE:
                /* @b is the node we did the final insert into: */
-               BUG_ON(!res.ref);
+
+               /*
+                * On failure to get a journal reservation, we still have to
+                * unblock the write and allow most of the write path to happen
+                * so that shutdown works, but the i->journal_seq mechanism
+                * won't work to prevent the btree write from being visible (we
+                * didn't get a journal sequence number) - instead
+                * __bch2_btree_node_write() doesn't do the actual write if
+                * we're in journal error state:
+                */
 
                list_del(&as->write_blocked_list);
 
-               i = btree_bset_last(b);
-               i->journal_seq = cpu_to_le64(
-                       max(res.seq,
-                           le64_to_cpu(i->journal_seq)));
+               if (!ret) {
+                       struct bset *i = btree_bset_last(b);
+
+                       i->journal_seq = cpu_to_le64(
+                               max(res.seq,
+                                   le64_to_cpu(i->journal_seq)));
+
+                       bch2_btree_add_journal_pin(c, b, res.seq);
+               }
+
+               nodes_need_write[nr_nodes_need_write++] = b;
 
-               bch2_btree_add_journal_pin(c, b, res.seq);
+               six_unlock_write(&b->c.lock);
+               six_unlock_intent(&b->c.lock);
                break;
 
        case BTREE_INTERIOR_UPDATING_AS:
                BUG_ON(b);
                break;
 
-       case BTREE_INTERIOR_UPDATING_ROOT: {
-               struct btree_root *r = &c->btree_roots[as->btree_id];
+       case BTREE_INTERIOR_UPDATING_ROOT:
+               r = &c->btree_roots[as->btree_id];
 
                BUG_ON(b);
 
@@ -734,42 +754,25 @@ again:
                mutex_unlock(&c->btree_root_lock);
                break;
        }
-       }
 
        bch2_journal_pin_drop(&c->journal, &as->journal);
 
        bch2_journal_res_put(&c->journal, &res);
        bch2_journal_preres_put(&c->journal, &as->journal_preres);
-free_update:
-       /* Do btree write after dropping journal res: */
-       if (b) {
-               six_unlock_write(&b->c.lock);
-               /*
-                * b->write_blocked prevented it from being written, so
-                * write it now if it needs to be written:
-                */
-               btree_node_write_if_need(c, b, SIX_LOCK_intent);
-               six_unlock_intent(&b->c.lock);
-       }
 
-       if (!ret) {
-               nr_new_nodes = as->nr_new_nodes;
-               memcpy(new_nodes,
-                      as->new_nodes,
-                      as->nr_new_nodes * sizeof(struct btree *));
+       while (as->nr_new_nodes) {
+               b = as->new_nodes[--as->nr_new_nodes];
 
-               while (as->nr_new_nodes) {
-                       struct btree *b = as->new_nodes[--as->nr_new_nodes];
+               BUG_ON(b->will_make_reachable != (unsigned long) as);
+               b->will_make_reachable = 0;
 
-                       BUG_ON(b->will_make_reachable != (unsigned long) as);
-                       b->will_make_reachable = 0;
-               }
-
-               while (as->nr_pending)
-                       bch2_btree_node_free_ondisk(c,
-                               &as->pending[--as->nr_pending], res.seq);
+               nodes_need_write[nr_nodes_need_write++] = b;
        }
 
+       while (as->nr_pending)
+               bch2_btree_node_free_ondisk(c,
+                       &as->pending[--as->nr_pending], res.seq);
+
        __bch2_btree_update_free(as);
        /*
         * for flush_held_btree_writes() waiting on updates to flush or
@@ -782,8 +785,10 @@ free_update:
         * */
        mutex_unlock(&c->btree_interior_update_lock);
 
-       while (nr_new_nodes) {
-               struct btree *b = new_nodes[--nr_new_nodes];
+       /* Do btree writes after dropping journal res/locks: */
+       while (nr_nodes_need_write) {
+               b = nodes_need_write[--nr_nodes_need_write];
+
                btree_node_lock_type(c, b, SIX_LOCK_read);
                bch2_btree_node_write_cond(c, b, btree_node_need_write(b));
                six_unlock_read(&b->c.lock);
@@ -1036,6 +1041,7 @@ bch2_btree_update_start(struct btree_trans *trans, enum btree_id id,
        as->btree_id    = id;
        as->reserve     = reserve;
        INIT_LIST_HEAD(&as->write_blocked_list);
+       INIT_LIST_HEAD(&as->unwritten_list);
        as->journal_preres = journal_preres;
 
        bch2_keylist_init(&as->parent_keys, as->inline_keys);