bcachefs: Interior btree updates are now fully transactional
author Kent Overstreet <kent.overstreet@gmail.com>
Mon, 25 May 2020 18:57:06 +0000 (14:57 -0400)
committer Kent Overstreet <kent.overstreet@linux.dev>
Sun, 22 Oct 2023 21:08:40 +0000 (17:08 -0400)
We now update the alloc info (bucket sector counts) atomically with
journalling the update to the interior btree nodes, and we also set new
btree roots atomically with the journalled part of the btree update.

Signed-off-by: Kent Overstreet <kent.overstreet@gmail.com>
Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
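
For orientation, here is the heart of the change, condensed from the btree_update_interior.c hunks below into one sketch. All identifiers are from this patch; locking, error handling, and the node writeout that follows are elided, so treat this as a sketch rather than the literal code:

	/*
	 * The transactional part of completing an interior btree update: the
	 * journal entries describing the update (new interior node keys, or
	 * the new btree root) and the alloc info updates for the old and new
	 * nodes all go through a single btree transaction:
	 */
	static int btree_update_nodes_written_trans(struct btree_trans *trans,
						    struct btree_update *as)
	{
		struct bkey_i *k;

		/* journalled part of the update, built up while it ran: */
		trans->extra_journal_entries = (void *) &as->journal_entries[0];
		trans->extra_journal_entry_u64s = as->journal_u64s;
		/* pin the journal until the interior update is persistent: */
		trans->journal_pin = &as->journal;

		for_each_keylist_key(&as->new_keys, k)	/* nodes being added */
			bch2_trans_mark_key(trans, bkey_i_to_s_c(k),
					    0, 0, BTREE_TRIGGER_INSERT);
		for_each_keylist_key(&as->old_keys, k)	/* nodes being freed */
			bch2_trans_mark_key(trans, bkey_i_to_s_c(k),
					    0, 0, BTREE_TRIGGER_OVERWRITE);
		return 0;
	}

	/* Run from a workqueue once all the new nodes have been written: */
	ret = bch2_trans_do(c, &as->disk_res, &journal_seq,
			    BTREE_INSERT_NOFAIL|
			    BTREE_INSERT_NOCHECK_RW|
			    BTREE_INSERT_JOURNAL_RESERVED,
			    btree_update_nodes_written_trans(&trans, as));

Because the bucket sector counts and the journalled interior-node update now commit (or fail) together, the separate mark-and-sweep bookkeeping for pending btree node frees can go away, which is most of what the diff below removes.
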
21 files changed:
fs/bcachefs/alloc_background.c
fs/bcachefs/bcachefs.h
fs/bcachefs/btree_gc.c
fs/bcachefs/btree_types.h
fs/bcachefs/btree_update_interior.c
fs/bcachefs/btree_update_interior.h
fs/bcachefs/btree_update_leaf.c
fs/bcachefs/buckets.c
fs/bcachefs/buckets.h
fs/bcachefs/journal.c
fs/bcachefs/journal.h
fs/bcachefs/journal_io.c
fs/bcachefs/journal_reclaim.c
fs/bcachefs/journal_reclaim.h
fs/bcachefs/keylist.c
fs/bcachefs/keylist.h
fs/bcachefs/migrate.c
fs/bcachefs/move.c
fs/bcachefs/recovery.c
fs/bcachefs/super-io.c
fs/bcachefs/super.c

diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c
index a08ae42cc073ca4ad0fb272b1a72fd37451e266a..b3c5d82c15de0168ef301b4f6c2d49a586b3895c 100644
--- a/fs/bcachefs/alloc_background.c
+++ b/fs/bcachefs/alloc_background.c
@@ -1461,11 +1461,6 @@ again:
                }
        rcu_read_unlock();
 
-       if (c->btree_roots_dirty) {
-               bch2_journal_meta(&c->journal);
-               goto again;
-       }
-
        return !nodes_unwritten &&
                !bch2_btree_interior_updates_nr_pending(c);
 }
diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h
index 069a3c416bc5e4fc8e21b3ee3fd5b43aed3decea..e12946d686ddecf63d34ddb249c9796a55628b63 100644
--- a/fs/bcachefs/bcachefs.h
+++ b/fs/bcachefs/bcachefs.h
@@ -603,13 +603,10 @@ struct bch_fs {
        struct bio_set          btree_bio;
 
        struct btree_root       btree_roots[BTREE_ID_NR];
-       bool                    btree_roots_dirty;
        struct mutex            btree_root_lock;
 
        struct btree_cache      btree_cache;
 
-       mempool_t               btree_reserve_pool;
-
        /*
         * Cache of allocated btree nodes - if we allocate a btree node and
         * don't use it, if we free it that space can't be reused until going
@@ -627,6 +624,9 @@ struct bch_fs {
        struct mutex            btree_interior_update_lock;
        struct closure_waitlist btree_interior_update_wait;
 
+       struct workqueue_struct *btree_interior_update_worker;
+       struct work_struct      btree_interior_update_work;
+
        mempool_t               btree_iters_pool;
 
        struct workqueue_struct *wq;
diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c
index 1a97a74b36c8922092b85c7041899cf140d1e771..6589fe0bad6c3d674b614c7b9ca4f50cb726c1a3 100644
--- a/fs/bcachefs/btree_gc.c
+++ b/fs/bcachefs/btree_gc.c
@@ -466,6 +466,7 @@ static void bch2_mark_superblocks(struct bch_fs *c)
        mutex_unlock(&c->sb_lock);
 }
 
+#if 0
 /* Also see bch2_pending_btree_node_free_insert_done() */
 static void bch2_mark_pending_btree_node_frees(struct bch_fs *c)
 {
@@ -483,6 +484,7 @@ static void bch2_mark_pending_btree_node_frees(struct bch_fs *c)
 
        mutex_unlock(&c->btree_interior_update_lock);
 }
+#endif
 
 static void bch2_mark_allocator_buckets(struct bch_fs *c)
 {
@@ -801,6 +803,10 @@ int bch2_gc(struct bch_fs *c, struct journal_keys *journal_keys,
        trace_gc_start(c);
 
        down_write(&c->gc_lock);
+
+       /* flush interior btree updates: */
+       closure_wait_event(&c->btree_interior_update_wait,
+                          !bch2_btree_interior_updates_nr_pending(c));
 again:
        ret = bch2_gc_start(c, metadata_only);
        if (ret)
@@ -812,7 +818,9 @@ again:
        if (ret)
                goto out;
 
+#if 0
        bch2_mark_pending_btree_node_frees(c);
+#endif
        bch2_mark_allocator_buckets(c);
 
        c->gc_count++;
@@ -1037,6 +1045,8 @@ static void bch2_coalesce_nodes(struct bch_fs *c, struct btree_iter *iter,
                btree_node_reset_sib_u64s(n);
 
                bch2_btree_build_aux_trees(n);
+
+               bch2_btree_update_add_new_node(as, n);
                six_unlock_write(&n->c.lock);
 
                bch2_btree_node_write(c, n, SIX_LOCK_intent);
@@ -1085,7 +1095,7 @@ next:
        bch2_btree_iter_node_replace(iter, new_nodes[0]);
 
        for (i = 0; i < nr_new_nodes; i++)
-               bch2_open_buckets_put(c, &new_nodes[i]->ob);
+               bch2_btree_update_get_open_buckets(as, new_nodes[i]);
 
        /* Free the old nodes and update our sliding window */
        for (i = 0; i < nr_old_nodes; i++) {
diff --git a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h
index 769c05c8d9380a63cc8d9809aa1350e1bee454b5..0ecd0047571285d5e048e29cccd19dd50de9d210 100644
--- a/fs/bcachefs/btree_types.h
+++ b/fs/bcachefs/btree_types.h
@@ -310,6 +310,7 @@ struct btree_trans {
        /* update path: */
        struct jset_entry       *extra_journal_entries;
        unsigned                extra_journal_entry_u64s;
+       struct journal_entry_pin *journal_pin;
 
        struct journal_res      journal_res;
        struct journal_preres   journal_preres;
diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c
index 1867d732afd4fc3b38948445be108861f7a2a620..7d63c457a3bf4a3a51a8ac30d604a4e9835a47ea 100644
--- a/fs/bcachefs/btree_update_interior.c
+++ b/fs/bcachefs/btree_update_interior.c
 
 #include <linux/random.h>
 
-static void btree_node_will_make_reachable(struct btree_update *,
-                                          struct btree *);
-static void btree_update_drop_new_node(struct bch_fs *, struct btree *);
-
 /* Debug code: */
 
 /*
@@ -124,74 +120,6 @@ bool bch2_btree_node_format_fits(struct bch_fs *c, struct btree *b,
 
 /* Btree node freeing/allocation: */
 
-static bool btree_key_matches(struct bch_fs *c,
-                             struct bkey_s_c l,
-                             struct bkey_s_c r)
-{
-       struct bkey_ptrs_c ptrs1 = bch2_bkey_ptrs_c(l);
-       struct bkey_ptrs_c ptrs2 = bch2_bkey_ptrs_c(r);
-       const struct bch_extent_ptr *ptr1, *ptr2;
-
-       bkey_for_each_ptr(ptrs1, ptr1)
-               bkey_for_each_ptr(ptrs2, ptr2)
-                       if (ptr1->dev == ptr2->dev &&
-                           ptr1->gen == ptr2->gen &&
-                           ptr1->offset == ptr2->offset)
-                               return true;
-
-       return false;
-}
-
-/*
- * We're doing the index update that makes @b unreachable, update stuff to
- * reflect that:
- *
- * Must be called _before_ btree_update_updated_root() or
- * btree_update_updated_node:
- */
-static void bch2_btree_node_free_index(struct btree_update *as, struct btree *b,
-                                      struct bkey_s_c k,
-                                      struct bch_fs_usage *stats)
-{
-       struct bch_fs *c = as->c;
-       struct pending_btree_node_free *d;
-
-       for (d = as->pending; d < as->pending + as->nr_pending; d++)
-               if (!bkey_cmp(k.k->p, d->key.k.p) &&
-                   btree_key_matches(c, k, bkey_i_to_s_c(&d->key)))
-                       goto found;
-       BUG();
-found:
-       BUG_ON(d->index_update_done);
-       d->index_update_done = true;
-
-       /*
-        * We're dropping @k from the btree, but it's still live until the
-        * index update is persistent so we need to keep a reference around for
-        * mark and sweep to find - that's primarily what the
-        * btree_node_pending_free list is for.
-        *
-        * So here (when we set index_update_done = true), we're moving an
-        * existing reference to a different part of the larger "gc keyspace" -
-        * and the new position comes after the old position, since GC marks
-        * the pending free list after it walks the btree.
-        *
-        * If we move the reference while mark and sweep is _between_ the old
-        * and the new position, mark and sweep will see the reference twice
-        * and it'll get double accounted - so check for that here and subtract
-        * to cancel out one of mark and sweep's markings if necessary:
-        */
-
-       if (gc_pos_cmp(c->gc_pos, b
-                      ? gc_pos_btree_node(b)
-                      : gc_pos_btree_root(as->btree_id)) >= 0 &&
-           gc_pos_cmp(c->gc_pos, gc_phase(GC_PHASE_PENDING_DELETE)) < 0)
-               bch2_mark_key_locked(c, bkey_i_to_s_c(&d->key),
-                             0, 0, NULL, 0,
-                             BTREE_TRIGGER_OVERWRITE|
-                             BTREE_TRIGGER_GC);
-}
-
 static void __btree_node_free(struct bch_fs *c, struct btree *b)
 {
        trace_btree_node_free(c, b);
@@ -216,8 +144,6 @@ void bch2_btree_node_free_never_inserted(struct bch_fs *c, struct btree *b)
 {
        struct open_buckets ob = b->ob;
 
-       btree_update_drop_new_node(c, b);
-
        b->ob.nr = 0;
 
        clear_btree_node_dirty(b);
@@ -237,39 +163,12 @@ void bch2_btree_node_free_inmem(struct bch_fs *c, struct btree *b,
        trans_for_each_iter(iter->trans, linked)
                BUG_ON(linked->l[b->c.level].b == b);
 
-       /*
-        * Is this a node that isn't reachable on disk yet?
-        *
-        * Nodes that aren't reachable yet have writes blocked until they're
-        * reachable - now that we've cancelled any pending writes and moved
-        * things waiting on that write to wait on this update, we can drop this
-        * node from the list of nodes that the other update is making
-        * reachable, prior to freeing it:
-        */
-       btree_update_drop_new_node(c, b);
-
        six_lock_write(&b->c.lock, NULL, NULL);
        __btree_node_free(c, b);
        six_unlock_write(&b->c.lock);
        six_unlock_intent(&b->c.lock);
 }
 
-static void bch2_btree_node_free_ondisk(struct bch_fs *c,
-                       struct pending_btree_node_free *pending,
-                       u64 journal_seq)
-{
-       BUG_ON(!pending->index_update_done);
-
-       bch2_mark_key(c, bkey_i_to_s_c(&pending->key),
-                     0, 0, NULL, journal_seq, BTREE_TRIGGER_OVERWRITE);
-
-       if (gc_visited(c, gc_phase(GC_PHASE_PENDING_DELETE)))
-               bch2_mark_key(c, bkey_i_to_s_c(&pending->key),
-                             0, 0, NULL, journal_seq,
-                             BTREE_TRIGGER_OVERWRITE|
-                             BTREE_TRIGGER_GC);
-}
-
 static struct btree *__bch2_btree_node_alloc(struct bch_fs *c,
                                             struct disk_reservation *res,
                                             struct closure *cl,
@@ -357,9 +256,9 @@ static struct btree *bch2_btree_node_alloc(struct btree_update *as, unsigned lev
        int ret;
 
        BUG_ON(level >= BTREE_MAX_DEPTH);
-       BUG_ON(!as->reserve->nr);
+       BUG_ON(!as->nr_prealloc_nodes);
 
-       b = as->reserve->b[--as->reserve->nr];
+       b = as->prealloc_nodes[--as->nr_prealloc_nodes];
 
        set_btree_node_accessed(b);
        set_btree_node_dirty(b);
@@ -394,8 +293,6 @@ static struct btree *bch2_btree_node_alloc(struct btree_update *as, unsigned lev
 
        bch2_btree_build_aux_trees(b);
 
-       btree_node_will_make_reachable(as, b);
-
        ret = bch2_btree_node_hash_insert(&c->btree_cache, b, level, as->btree_id);
        BUG_ON(ret);
 
@@ -466,19 +363,20 @@ static struct btree *__btree_root_alloc(struct btree_update *as, unsigned level)
        btree_node_set_format(b, b->data->format);
        bch2_btree_build_aux_trees(b);
 
+       bch2_btree_update_add_new_node(as, b);
        six_unlock_write(&b->c.lock);
 
        return b;
 }
 
-static void bch2_btree_reserve_put(struct bch_fs *c, struct btree_reserve *reserve)
+static void bch2_btree_reserve_put(struct btree_update *as)
 {
-       bch2_disk_reservation_put(c, &reserve->disk_res);
+       struct bch_fs *c = as->c;
 
        mutex_lock(&c->btree_reserve_cache_lock);
 
-       while (reserve->nr) {
-               struct btree *b = reserve->b[--reserve->nr];
+       while (as->nr_prealloc_nodes) {
+               struct btree *b = as->prealloc_nodes[--as->nr_prealloc_nodes];
 
                six_unlock_write(&b->c.lock);
 
@@ -502,36 +400,14 @@ static void bch2_btree_reserve_put(struct bch_fs *c, struct btree_reserve *reser
        }
 
        mutex_unlock(&c->btree_reserve_cache_lock);
-
-       mempool_free(reserve, &c->btree_reserve_pool);
 }
 
-static struct btree_reserve *bch2_btree_reserve_get(struct bch_fs *c,
-                                                   unsigned nr_nodes,
-                                                   unsigned flags,
-                                                   struct closure *cl)
+static int bch2_btree_reserve_get(struct btree_update *as, unsigned nr_nodes,
+                                 unsigned flags, struct closure *cl)
 {
-       struct btree_reserve *reserve;
+       struct bch_fs *c = as->c;
        struct btree *b;
-       struct disk_reservation disk_res = { 0, 0 };
-       unsigned sectors = nr_nodes * c->opts.btree_node_size;
-       int ret, disk_res_flags = 0;
-
-       if (flags & BTREE_INSERT_NOFAIL)
-               disk_res_flags |= BCH_DISK_RESERVATION_NOFAIL;
-
-       /*
-        * This check isn't necessary for correctness - it's just to potentially
-        * prevent us from doing a lot of work that'll end up being wasted:
-        */
-       ret = bch2_journal_error(&c->journal);
-       if (ret)
-               return ERR_PTR(ret);
-
-       if (bch2_disk_reservation_get(c, &disk_res, sectors,
-                                     c->opts.metadata_replicas,
-                                     disk_res_flags))
-               return ERR_PTR(-ENOSPC);
+       int ret;
 
        BUG_ON(nr_nodes > BTREE_RESERVE_MAX);
 
@@ -540,18 +416,11 @@ static struct btree_reserve *bch2_btree_reserve_get(struct bch_fs *c,
         * open bucket reserve:
         */
        ret = bch2_btree_cache_cannibalize_lock(c, cl);
-       if (ret) {
-               bch2_disk_reservation_put(c, &disk_res);
-               return ERR_PTR(ret);
-       }
-
-       reserve = mempool_alloc(&c->btree_reserve_pool, GFP_NOIO);
-
-       reserve->disk_res = disk_res;
-       reserve->nr = 0;
+       if (ret)
+               return ret;
 
-       while (reserve->nr < nr_nodes) {
-               b = __bch2_btree_node_alloc(c, &disk_res,
+       while (as->nr_prealloc_nodes < nr_nodes) {
+               b = __bch2_btree_node_alloc(c, &as->disk_res,
                                            flags & BTREE_INSERT_NOWAIT
                                            ? NULL : cl, flags);
                if (IS_ERR(b)) {
@@ -563,21 +432,20 @@ static struct btree_reserve *bch2_btree_reserve_get(struct bch_fs *c,
                if (ret)
                        goto err_free;
 
-               reserve->b[reserve->nr++] = b;
+               as->prealloc_nodes[as->nr_prealloc_nodes++] = b;
        }
 
        bch2_btree_cache_cannibalize_unlock(c);
-       return reserve;
+       return 0;
 err_free:
-       bch2_btree_reserve_put(c, reserve);
        bch2_btree_cache_cannibalize_unlock(c);
        trace_btree_reserve_get_fail(c, nr_nodes, cl);
-       return ERR_PTR(ret);
+       return ret;
 }
 
 /* Asynchronous interior node update machinery */
 
-static void __bch2_btree_update_free(struct btree_update *as)
+static void bch2_btree_update_free(struct btree_update *as)
 {
        struct bch_fs *c = as->c;
 
@@ -585,14 +453,13 @@ static void __bch2_btree_update_free(struct btree_update *as)
 
        bch2_journal_pin_drop(&c->journal, &as->journal);
        bch2_journal_pin_flush(&c->journal, &as->journal);
+       bch2_disk_reservation_put(c, &as->disk_res);
+       bch2_btree_reserve_put(as);
 
-       BUG_ON(as->nr_new_nodes || as->nr_pending);
-
-       if (as->reserve)
-               bch2_btree_reserve_put(c, as->reserve);
-
+       mutex_lock(&c->btree_interior_update_lock);
        list_del(&as->unwritten_list);
        list_del(&as->list);
+       mutex_unlock(&c->btree_interior_update_lock);
 
        closure_debug_destroy(&as->cl);
        mempool_free(as, &c->btree_interior_update_pool);
@@ -600,37 +467,59 @@ static void __bch2_btree_update_free(struct btree_update *as)
        closure_wake_up(&c->btree_interior_update_wait);
 }
 
-static void bch2_btree_update_free(struct btree_update *as)
+static void btree_update_will_delete_key(struct btree_update *as,
+                                        struct bkey_i *k)
 {
-       struct bch_fs *c = as->c;
+       BUG_ON(bch2_keylist_u64s(&as->old_keys) + k->k.u64s >
+              ARRAY_SIZE(as->_old_keys));
+       bch2_keylist_add(&as->old_keys, k);
+}
 
-       mutex_lock(&c->btree_interior_update_lock);
-       __bch2_btree_update_free(as);
-       mutex_unlock(&c->btree_interior_update_lock);
+static void btree_update_will_add_key(struct btree_update *as,
+                                     struct bkey_i *k)
+{
+       BUG_ON(bch2_keylist_u64s(&as->new_keys) + k->k.u64s >
+              ARRAY_SIZE(as->_new_keys));
+       bch2_keylist_add(&as->new_keys, k);
 }
 
-static inline bool six_trylock_intentwrite(struct six_lock *lock)
+/*
+ * The transactional part of an interior btree node update, where we journal the
+ * update we did to the interior node and update alloc info:
+ */
+static int btree_update_nodes_written_trans(struct btree_trans *trans,
+                                           struct btree_update *as)
 {
-       if (!six_trylock_intent(lock))
-               return false;
+       struct bkey_i *k;
+       int ret;
+
+       trans->extra_journal_entries = (void *) &as->journal_entries[0];
+       trans->extra_journal_entry_u64s = as->journal_u64s;
+       trans->journal_pin = &as->journal;
 
-       if (!six_trylock_write(lock)) {
-               six_unlock_intent(lock);
-               return false;
+       for_each_keylist_key(&as->new_keys, k) {
+               ret = bch2_trans_mark_key(trans, bkey_i_to_s_c(k),
+                                         0, 0, BTREE_TRIGGER_INSERT);
+               if (ret)
+                       return ret;
        }
 
-       return true;
+       for_each_keylist_key(&as->old_keys, k) {
+               ret = bch2_trans_mark_key(trans, bkey_i_to_s_c(k),
+                                         0, 0, BTREE_TRIGGER_OVERWRITE);
+               if (ret)
+                       return ret;
+       }
+
+       return 0;
 }
 
-static void btree_update_nodes_written(struct closure *cl)
+static void btree_update_nodes_written(struct btree_update *as)
 {
-       struct btree_update *as = container_of(cl, struct btree_update, cl);
-       struct btree *nodes_need_write[BTREE_MAX_DEPTH * 2 + GC_MERGE_NODES + 1];
-       unsigned nr_nodes_need_write;
-       struct journal_res res = { 0 };
        struct bch_fs *c = as->c;
-       struct btree_root *r;
-       struct btree *b;
+       struct btree *b = as->b;
+       u64 journal_seq = 0;
+       unsigned i;
        int ret;
 
        /*
@@ -638,78 +527,17 @@ static void btree_update_nodes_written(struct closure *cl)
         * to child nodes that weren't written yet: now, the child nodes have
         * been written so we can write out the update to the interior node.
         */
-       mutex_lock(&c->btree_interior_update_lock);
-       as->nodes_written = true;
-again:
-       nr_nodes_need_write = 0;
-       as = list_first_entry_or_null(&c->btree_interior_updates_unwritten,
-                                     struct btree_update, unwritten_list);
-       if (!as || !as->nodes_written) {
-               mutex_unlock(&c->btree_interior_update_lock);
-               return;
-       }
-
-       b = as->b;
-       if (b && !six_trylock_intentwrite(&b->c.lock)) {
-               mutex_unlock(&c->btree_interior_update_lock);
-
-               btree_node_lock_type(c, b, SIX_LOCK_intent);
-               six_lock_write(&b->c.lock, NULL, NULL);
-
-               six_unlock_write(&b->c.lock);
-               six_unlock_intent(&b->c.lock);
-
-               mutex_lock(&c->btree_interior_update_lock);
-               goto again;
-       }
-
-       ret = bch2_journal_res_get(&c->journal, &res, as->journal_u64s,
-                                  JOURNAL_RES_GET_NONBLOCK|
-                                  JOURNAL_RES_GET_RESERVED);
-       if (ret == -EAGAIN) {
-               unsigned u64s = as->journal_u64s;
-
-               if (b) {
-                       six_unlock_write(&b->c.lock);
-                       six_unlock_intent(&b->c.lock);
-               }
-
-               mutex_unlock(&c->btree_interior_update_lock);
-
-               ret = bch2_journal_res_get(&c->journal, &res, u64s,
-                                          JOURNAL_RES_GET_CHECK|
-                                          JOURNAL_RES_GET_RESERVED);
-               if (!ret) {
-                       mutex_lock(&c->btree_interior_update_lock);
-                       goto again;
-               }
-       }
-
-       if (!ret) {
-               struct journal_buf *buf = &c->journal.buf[res.idx];
-               struct jset_entry *entry = vstruct_idx(buf->data, res.offset);
-
-               res.offset      += as->journal_u64s;
-               res.u64s        -= as->journal_u64s;
-               memcpy_u64s(entry, as->journal_entries, as->journal_u64s);
-       } else {
-               /*
-                * On journal error we have to run most of the normal path so
-                * that shutdown works - unblocking btree node writes in
-                * particular and writing them if needed - except for
-                * journalling the update:
-                */
-
-               BUG_ON(!bch2_journal_error(&c->journal));
-       }
-
-       switch (as->mode) {
-       case BTREE_INTERIOR_NO_UPDATE:
-               BUG();
-       case BTREE_INTERIOR_UPDATING_NODE:
-               /* @b is the node we did the final insert into: */
-
+       ret = bch2_trans_do(c, &as->disk_res, &journal_seq,
+                           BTREE_INSERT_NOFAIL|
+                           BTREE_INSERT_NOCHECK_RW|
+                           BTREE_INSERT_JOURNAL_RESERVED,
+                           btree_update_nodes_written_trans(&trans, as));
+       BUG_ON(ret && !bch2_journal_error(&c->journal));
+
+       if (b) {
                /*
+                * @b is the node we did the final insert into:
+                *
                 * On failure to get a journal reservation, we still have to
                 * unblock the write and allow most of the write path to happen
                 * so that shutdown works, but the i->journal_seq mechanism
@@ -719,83 +547,90 @@ again:
                 * we're in journal error state:
                 */
 
+               btree_node_lock_type(c, b, SIX_LOCK_intent);
+               btree_node_lock_type(c, b, SIX_LOCK_write);
+               mutex_lock(&c->btree_interior_update_lock);
+
                list_del(&as->write_blocked_list);
 
-               if (!ret) {
+               if (!ret && as->b == b) {
                        struct bset *i = btree_bset_last(b);
 
+                       BUG_ON(!b->c.level);
+                       BUG_ON(!btree_node_dirty(b));
+
                        i->journal_seq = cpu_to_le64(
-                               max(res.seq,
+                               max(journal_seq,
                                    le64_to_cpu(i->journal_seq)));
 
-                       bch2_btree_add_journal_pin(c, b, res.seq);
+                       bch2_btree_add_journal_pin(c, b, journal_seq);
                }
 
-               nodes_need_write[nr_nodes_need_write++] = b;
-
+               mutex_unlock(&c->btree_interior_update_lock);
                six_unlock_write(&b->c.lock);
-               six_unlock_intent(&b->c.lock);
-               break;
-
-       case BTREE_INTERIOR_UPDATING_AS:
-               BUG_ON(b);
-               break;
-
-       case BTREE_INTERIOR_UPDATING_ROOT:
-               r = &c->btree_roots[as->btree_id];
 
-               BUG_ON(b);
-
-               mutex_lock(&c->btree_root_lock);
-               bkey_copy(&r->key, as->parent_keys.keys);
-               r->level = as->level;
-               r->alive = true;
-               c->btree_roots_dirty = true;
-               mutex_unlock(&c->btree_root_lock);
-               break;
+               btree_node_write_if_need(c, b, SIX_LOCK_intent);
+               six_unlock_intent(&b->c.lock);
        }
 
        bch2_journal_pin_drop(&c->journal, &as->journal);
 
-       bch2_journal_res_put(&c->journal, &res);
        bch2_journal_preres_put(&c->journal, &as->journal_preres);
 
-       while (as->nr_new_nodes) {
-               b = as->new_nodes[--as->nr_new_nodes];
+       mutex_lock(&c->btree_interior_update_lock);
+       for (i = 0; i < as->nr_new_nodes; i++) {
+               b = as->new_nodes[i];
 
                BUG_ON(b->will_make_reachable != (unsigned long) as);
                b->will_make_reachable = 0;
+       }
+       mutex_unlock(&c->btree_interior_update_lock);
+
+       for (i = 0; i < as->nr_new_nodes; i++) {
+               b = as->new_nodes[i];
 
-               nodes_need_write[nr_nodes_need_write++] = b;
+               btree_node_lock_type(c, b, SIX_LOCK_read);
+               btree_node_write_if_need(c, b, SIX_LOCK_read);
+               six_unlock_read(&b->c.lock);
        }
 
-       while (as->nr_pending)
-               bch2_btree_node_free_ondisk(c,
-                       &as->pending[--as->nr_pending], res.seq);
+       for (i = 0; i < as->nr_open_buckets; i++)
+               bch2_open_bucket_put(c, c->open_buckets + as->open_buckets[i]);
 
-       __bch2_btree_update_free(as);
-       /*
-        * for flush_held_btree_writes() waiting on updates to flush or
-        * nodes to be writeable:
-        */
-       closure_wake_up(&c->btree_interior_update_wait);
+       bch2_btree_update_free(as);
+}
 
-       /*
-        * Can't take btree node locks while holding btree_interior_update_lock:
-        * */
-       mutex_unlock(&c->btree_interior_update_lock);
+static void btree_interior_update_work(struct work_struct *work)
+{
+       struct bch_fs *c =
+               container_of(work, struct bch_fs, btree_interior_update_work);
+       struct btree_update *as;
 
-       /* Do btree writes after dropping journal res/locks: */
-       while (nr_nodes_need_write) {
-               b = nodes_need_write[--nr_nodes_need_write];
+       while (1) {
+               mutex_lock(&c->btree_interior_update_lock);
+               as = list_first_entry_or_null(&c->btree_interior_updates_unwritten,
+                                             struct btree_update, unwritten_list);
+               if (as && !as->nodes_written)
+                       as = NULL;
+               mutex_unlock(&c->btree_interior_update_lock);
 
-               btree_node_lock_type(c, b, SIX_LOCK_read);
-               bch2_btree_node_write_cond(c, b, btree_node_need_write(b));
-               six_unlock_read(&b->c.lock);
+               if (!as)
+                       break;
+
+               btree_update_nodes_written(as);
        }
+}
+
+static void btree_update_set_nodes_written(struct closure *cl)
+{
+       struct btree_update *as = container_of(cl, struct btree_update, cl);
+       struct bch_fs *c = as->c;
 
        mutex_lock(&c->btree_interior_update_lock);
-       goto again;
+       as->nodes_written = true;
+       mutex_unlock(&c->btree_interior_update_lock);
+
+       queue_work(c->btree_interior_update_worker, &c->btree_interior_update_work);
 }
 
 /*
@@ -814,7 +649,6 @@ static void btree_update_updated_node(struct btree_update *as, struct btree *b)
 
        as->mode        = BTREE_INTERIOR_UPDATING_NODE;
        as->b           = b;
-       as->level       = b->c.level;
        list_add(&as->write_blocked_list, &b->write_blocked);
 
        mutex_unlock(&c->btree_interior_update_lock);
@@ -845,25 +679,45 @@ static void btree_update_reparent(struct btree_update *as,
 
 static void btree_update_updated_root(struct btree_update *as, struct btree *b)
 {
+       struct bkey_i *insert = &b->key;
        struct bch_fs *c = as->c;
 
        BUG_ON(as->mode != BTREE_INTERIOR_NO_UPDATE);
-       BUG_ON(!bch2_keylist_empty(&as->parent_keys));
+
+       BUG_ON(as->journal_u64s + jset_u64s(insert->k.u64s) >
+              ARRAY_SIZE(as->journal_entries));
+
+       as->journal_u64s +=
+               journal_entry_set((void *) &as->journal_entries[as->journal_u64s],
+                                 BCH_JSET_ENTRY_btree_root,
+                                 b->c.btree_id, b->c.level,
+                                 insert, insert->k.u64s);
 
        mutex_lock(&c->btree_interior_update_lock);
        list_add_tail(&as->unwritten_list, &c->btree_interior_updates_unwritten);
 
        as->mode        = BTREE_INTERIOR_UPDATING_ROOT;
-       as->level       = b->c.level;
-       bch2_keylist_add(&as->parent_keys, &b->key);
        mutex_unlock(&c->btree_interior_update_lock);
 }
 
-static void btree_node_will_make_reachable(struct btree_update *as,
-                                          struct btree *b)
+/*
+ * bch2_btree_update_add_new_node:
+ *
+ * This causes @as to wait on @b to be written, before it gets to
+ * bch2_btree_update_nodes_written
+ *
+ * Additionally, it sets b->will_make_reachable to prevent any additional writes
+ * to @b from happening besides the first until @b is reachable on disk
+ *
+ * And it adds @b to the list of @as's new nodes, so that we can update sector
+ * counts in bch2_btree_update_nodes_written:
+ */
+void bch2_btree_update_add_new_node(struct btree_update *as, struct btree *b)
 {
        struct bch_fs *c = as->c;
 
+       closure_get(&as->cl);
+
        mutex_lock(&c->btree_interior_update_lock);
        BUG_ON(as->nr_new_nodes >= ARRAY_SIZE(as->new_nodes));
        BUG_ON(b->will_make_reachable);
@@ -871,10 +725,14 @@ static void btree_node_will_make_reachable(struct btree_update *as,
        as->new_nodes[as->nr_new_nodes++] = b;
        b->will_make_reachable = 1UL|(unsigned long) as;
 
-       closure_get(&as->cl);
        mutex_unlock(&c->btree_interior_update_lock);
+
+       btree_update_will_add_key(as, &b->key);
 }
 
+/*
+ * Drop @b from the list of nodes that its btree_update was going to make
+ * reachable:
+ */
 static void btree_update_drop_new_node(struct bch_fs *c, struct btree *b)
 {
        struct btree_update *as;
@@ -882,6 +740,11 @@ static void btree_update_drop_new_node(struct bch_fs *c, struct btree *b)
        unsigned i;
 
        mutex_lock(&c->btree_interior_update_lock);
+       /*
+        * When b->will_make_reachable != 0, it owns a ref on as->cl that's
+        * dropped when it gets written by bch2_btree_complete_write - the
+        * xchg() is for synchronization with bch2_btree_complete_write:
+        */
        v = xchg(&b->will_make_reachable, 0);
        as = (struct btree_update *) (v & ~1UL);
 
@@ -903,25 +766,11 @@ found:
                closure_put(&as->cl);
 }
 
-static void btree_interior_update_add_node_reference(struct btree_update *as,
-                                                    struct btree *b)
+void bch2_btree_update_get_open_buckets(struct btree_update *as, struct btree *b)
 {
-       struct bch_fs *c = as->c;
-       struct pending_btree_node_free *d;
-
-       mutex_lock(&c->btree_interior_update_lock);
-
-       /* Add this node to the list of nodes being freed: */
-       BUG_ON(as->nr_pending >= ARRAY_SIZE(as->pending));
-
-       d = &as->pending[as->nr_pending++];
-       d->index_update_done    = false;
-       d->seq                  = b->data->keys.seq;
-       d->btree_id             = b->c.btree_id;
-       d->level                = b->c.level;
-       bkey_copy(&d->key, &b->key);
-
-       mutex_unlock(&c->btree_interior_update_lock);
+       while (b->ob.nr)
+               as->open_buckets[as->nr_open_buckets++] =
+                       b->ob.v[--b->ob.nr];
 }
 
 /*
@@ -941,8 +790,6 @@ void bch2_btree_interior_update_will_free_node(struct btree_update *as,
        if (btree_node_fake(b))
                return;
 
-       btree_interior_update_add_node_reference(as, b);
-
        mutex_lock(&c->btree_interior_update_lock);
 
        /*
@@ -984,16 +831,28 @@ void bch2_btree_interior_update_will_free_node(struct btree_update *as,
        bch2_journal_pin_drop(&c->journal, &w->journal);
 
        mutex_unlock(&c->btree_interior_update_lock);
+
+       /*
+        * Is this a node that isn't reachable on disk yet?
+        *
+        * Nodes that aren't reachable yet have writes blocked until they're
+        * reachable - now that we've cancelled any pending writes and moved
+        * things waiting on that write to wait on this update, we can drop this
+        * node from the list of nodes that the other update is making
+        * reachable, prior to freeing it:
+        */
+       btree_update_drop_new_node(c, b);
+
+       btree_update_will_delete_key(as, &b->key);
 }
 
 void bch2_btree_update_done(struct btree_update *as)
 {
        BUG_ON(as->mode == BTREE_INTERIOR_NO_UPDATE);
 
-       bch2_btree_reserve_put(as->c, as->reserve);
-       as->reserve = NULL;
+       bch2_btree_reserve_put(as);
 
-       continue_at(&as->cl, btree_update_nodes_written, system_freezable_wq);
+       continue_at(&as->cl, btree_update_set_nodes_written, system_freezable_wq);
 }
 
 struct btree_update *
@@ -1002,12 +861,32 @@ bch2_btree_update_start(struct btree_trans *trans, enum btree_id id,
                        struct closure *cl)
 {
        struct bch_fs *c = trans->c;
-       struct journal_preres journal_preres = { 0 };
-       struct btree_reserve *reserve;
        struct btree_update *as;
-       int ret;
+       int ret, disk_res_flags = (flags & BTREE_INSERT_NOFAIL)
+               ? BCH_DISK_RESERVATION_NOFAIL : 0;
+
+       /*
+        * This check isn't necessary for correctness - it's just to potentially
+        * prevent us from doing a lot of work that'll end up being wasted:
+        */
+       ret = bch2_journal_error(&c->journal);
+       if (ret)
+               return ERR_PTR(ret);
+
+       as = mempool_alloc(&c->btree_interior_update_pool, GFP_NOIO);
+       memset(as, 0, sizeof(*as));
+       closure_init(&as->cl, NULL);
+       as->c           = c;
+       as->mode        = BTREE_INTERIOR_NO_UPDATE;
+       as->btree_id    = id;
+       INIT_LIST_HEAD(&as->list);
+       INIT_LIST_HEAD(&as->unwritten_list);
+       INIT_LIST_HEAD(&as->write_blocked_list);
+       bch2_keylist_init(&as->old_keys, as->_old_keys);
+       bch2_keylist_init(&as->new_keys, as->_new_keys);
+       bch2_keylist_init(&as->parent_keys, as->inline_keys);
 
-       ret = bch2_journal_preres_get(&c->journal, &journal_preres,
+       ret = bch2_journal_preres_get(&c->journal, &as->journal_preres,
                                      BTREE_UPDATE_JOURNAL_RES,
                                      JOURNAL_RES_GET_NONBLOCK);
        if (ret == -EAGAIN) {
@@ -1016,46 +895,41 @@ bch2_btree_update_start(struct btree_trans *trans, enum btree_id id,
 
                bch2_trans_unlock(trans);
 
-               ret = bch2_journal_preres_get(&c->journal, &journal_preres,
+               ret = bch2_journal_preres_get(&c->journal, &as->journal_preres,
                                              BTREE_UPDATE_JOURNAL_RES, 0);
                if (ret)
                        return ERR_PTR(ret);
 
                if (!bch2_trans_relock(trans)) {
-                       bch2_journal_preres_put(&c->journal, &journal_preres);
-                       return ERR_PTR(-EINTR);
+                       ret = -EINTR;
+                       goto err;
                }
        }
 
-       reserve = bch2_btree_reserve_get(c, nr_nodes, flags, cl);
-       if (IS_ERR(reserve)) {
-               bch2_journal_preres_put(&c->journal, &journal_preres);
-               return ERR_CAST(reserve);
-       }
-
-       as = mempool_alloc(&c->btree_interior_update_pool, GFP_NOIO);
-       memset(as, 0, sizeof(*as));
-       closure_init(&as->cl, NULL);
-       as->c           = c;
-       as->mode        = BTREE_INTERIOR_NO_UPDATE;
-       as->btree_id    = id;
-       as->reserve     = reserve;
-       INIT_LIST_HEAD(&as->write_blocked_list);
-       INIT_LIST_HEAD(&as->unwritten_list);
-       as->journal_preres = journal_preres;
+       ret = bch2_disk_reservation_get(c, &as->disk_res,
+                       nr_nodes * c->opts.btree_node_size,
+                       c->opts.metadata_replicas,
+                       disk_res_flags);
+       if (ret)
+               goto err;
 
-       bch2_keylist_init(&as->parent_keys, as->inline_keys);
+       ret = bch2_btree_reserve_get(as, nr_nodes, flags, cl);
+       if (ret)
+               goto err;
 
        mutex_lock(&c->btree_interior_update_lock);
        list_add_tail(&as->list, &c->btree_interior_update_list);
        mutex_unlock(&c->btree_interior_update_lock);
 
        return as;
+err:
+       bch2_btree_update_free(as);
+       return ERR_PTR(ret);
 }
 
 /* Btree root updates: */
 
-static void __bch2_btree_set_root_inmem(struct bch_fs *c, struct btree *b)
+static void bch2_btree_set_root_inmem(struct bch_fs *c, struct btree *b)
 {
        /* Root nodes cannot be reaped */
        mutex_lock(&c->btree_cache.lock);
@@ -1073,38 +947,6 @@ static void __bch2_btree_set_root_inmem(struct bch_fs *c, struct btree *b)
        bch2_recalc_btree_reserve(c);
 }
 
-static void bch2_btree_set_root_inmem(struct btree_update *as, struct btree *b)
-{
-       struct bch_fs *c = as->c;
-       struct btree *old = btree_node_root(c, b);
-       struct bch_fs_usage_online *fs_usage;
-
-       __bch2_btree_set_root_inmem(c, b);
-
-       mutex_lock(&c->btree_interior_update_lock);
-       percpu_down_read(&c->mark_lock);
-       fs_usage = bch2_fs_usage_scratch_get(c);
-
-       bch2_mark_key_locked(c, bkey_i_to_s_c(&b->key),
-                     0, 0, &fs_usage->u, 0,
-                     BTREE_TRIGGER_INSERT);
-       if (gc_visited(c, gc_pos_btree_root(b->c.btree_id)))
-               bch2_mark_key_locked(c, bkey_i_to_s_c(&b->key),
-                                    0, 0, NULL, 0,
-                                    BTREE_TRIGGER_INSERT|
-                                    BTREE_TRIGGER_GC);
-
-       if (old && !btree_node_fake(old))
-               bch2_btree_node_free_index(as, NULL,
-                                          bkey_i_to_s_c(&old->key),
-                                          &fs_usage->u);
-       bch2_fs_usage_apply(c, fs_usage, &as->reserve->disk_res, 0);
-
-       bch2_fs_usage_scratch_put(c, fs_usage);
-       percpu_up_read(&c->mark_lock);
-       mutex_unlock(&c->btree_interior_update_lock);
-}
-
 /**
  * bch_btree_set_root - update the root in memory and on disk
  *
@@ -1135,7 +977,7 @@ static void bch2_btree_set_root(struct btree_update *as, struct btree *b,
         */
        bch2_btree_node_lock_write(old, iter);
 
-       bch2_btree_set_root_inmem(as, b);
+       bch2_btree_set_root_inmem(c, b);
 
        btree_update_updated_root(as, b);
 
@@ -1156,57 +998,21 @@ static void bch2_insert_fixup_btree_ptr(struct btree_update *as, struct btree *b
                                        struct bkey_i *insert,
                                        struct btree_node_iter *node_iter)
 {
-       struct bch_fs *c = as->c;
-       struct bch_fs_usage_online *fs_usage;
-       struct jset_entry *entry;
        struct bkey_packed *k;
-       struct bkey tmp;
 
        BUG_ON(as->journal_u64s + jset_u64s(insert->k.u64s) >
               ARRAY_SIZE(as->journal_entries));
 
-       entry = (void *) &as->journal_entries[as->journal_u64s];
-       memset(entry, 0, sizeof(*entry));
-       entry->u64s     = cpu_to_le16(insert->k.u64s);
-       entry->type     = BCH_JSET_ENTRY_btree_keys;
-       entry->btree_id = b->c.btree_id;
-       entry->level    = b->c.level;
-       memcpy_u64s_small(entry->_data, insert, insert->k.u64s);
-       as->journal_u64s += jset_u64s(insert->k.u64s);
-
-       mutex_lock(&c->btree_interior_update_lock);
-       percpu_down_read(&c->mark_lock);
-       fs_usage = bch2_fs_usage_scratch_get(c);
-
-       bch2_mark_key_locked(c, bkey_i_to_s_c(insert),
-                            0, 0, &fs_usage->u, 0,
-                            BTREE_TRIGGER_INSERT);
-
-       if (gc_visited(c, gc_pos_btree_node(b)))
-               bch2_mark_key_locked(c, bkey_i_to_s_c(insert),
-                                    0, 0, NULL, 0,
-                                    BTREE_TRIGGER_INSERT|
-                                    BTREE_TRIGGER_GC);
+       as->journal_u64s +=
+               journal_entry_set((void *) &as->journal_entries[as->journal_u64s],
+                                 BCH_JSET_ENTRY_btree_keys,
+                                 b->c.btree_id, b->c.level,
+                                 insert, insert->k.u64s);
 
        while ((k = bch2_btree_node_iter_peek_all(node_iter, b)) &&
               bkey_iter_pos_cmp(b, k, &insert->k.p) < 0)
                bch2_btree_node_iter_advance(node_iter, b);
 
-       /*
-        * If we're overwriting, look up pending delete and mark so that gc
-        * marks it on the pending delete list:
-        */
-       if (k && !bkey_cmp_packed(b, k, &insert->k))
-               bch2_btree_node_free_index(as, b,
-                                          bkey_disassemble(b, k, &tmp),
-                                          &fs_usage->u);
-
-       bch2_fs_usage_apply(c, fs_usage, &as->reserve->disk_res, 0);
-
-       bch2_fs_usage_scratch_put(c, fs_usage);
-       percpu_up_read(&c->mark_lock);
-       mutex_unlock(&c->btree_interior_update_lock);
-
        bch2_btree_bset_insert_key(iter, b, node_iter, insert);
        set_btree_node_dirty(b);
        set_btree_node_need_write(b);
@@ -1226,6 +1032,7 @@ static struct btree *__btree_split_node(struct btree_update *as,
        struct bkey_packed *k, *prev = NULL;
 
        n2 = bch2_btree_node_alloc(as, n1->c.level);
+       bch2_btree_update_add_new_node(as, n2);
 
        n2->data->max_key       = n1->data->max_key;
        n2->data->format        = n1->format;
@@ -1321,14 +1128,6 @@ static void btree_split_insert_keys(struct btree_update *as, struct btree *b,
        struct bkey_packed *src, *dst, *n;
        struct bset *i;
 
-       /*
-        * XXX
-        *
-        * these updates must be journalled
-        *
-        * oops
-        */
-
        BUG_ON(btree_node_type(b) != BKEY_TYPE_BTREE);
 
        bch2_btree_node_iter_init(&node_iter, b, &k->k.p);
@@ -1380,6 +1179,7 @@ static void btree_split(struct btree_update *as, struct btree *b,
        bch2_btree_interior_update_will_free_node(as, b);
 
        n1 = bch2_btree_node_alloc_replacement(as, b);
+       bch2_btree_update_add_new_node(as, n1);
 
        if (keys)
                btree_split_insert_keys(as, n1, iter, keys);
@@ -1439,11 +1239,11 @@ static void btree_split(struct btree_update *as, struct btree *b,
                bch2_btree_set_root(as, n1, iter);
        }
 
-       bch2_open_buckets_put(c, &n1->ob);
+       bch2_btree_update_get_open_buckets(as, n1);
        if (n2)
-               bch2_open_buckets_put(c, &n2->ob);
+               bch2_btree_update_get_open_buckets(as, n2);
        if (n3)
-               bch2_open_buckets_put(c, &n3->ob);
+               bch2_btree_update_get_open_buckets(as, n3);
 
        /* Successful split, update the iterator to point to the new nodes: */
 
@@ -1538,7 +1338,7 @@ void bch2_btree_insert_node(struct btree_update *as, struct btree *b,
 
        bch2_btree_node_lock_for_insert(c, b, iter);
 
-       if (!bch2_btree_node_insert_fits(c, b, bch_keylist_u64s(keys))) {
+       if (!bch2_btree_node_insert_fits(c, b, bch2_keylist_u64s(keys))) {
                bch2_btree_node_unlock_write(b, iter);
                goto split;
        }
@@ -1749,6 +1549,7 @@ retry:
        bch2_btree_interior_update_will_free_node(as, m);
 
        n = bch2_btree_node_alloc(as, b->c.level);
+       bch2_btree_update_add_new_node(as, n);
 
        btree_set_min(n, prev->data->min_key);
        btree_set_max(n, next->data->max_key);
@@ -1771,7 +1572,7 @@ retry:
 
        bch2_btree_insert_node(as, parent, iter, &as->parent_keys, flags);
 
-       bch2_open_buckets_put(c, &n->ob);
+       bch2_btree_update_get_open_buckets(as, n);
 
        six_lock_increment(&b->c.lock, SIX_LOCK_intent);
        bch2_btree_iter_node_drop(iter, b);
@@ -1859,6 +1660,7 @@ static int __btree_node_rewrite(struct bch_fs *c, struct btree_iter *iter,
        bch2_btree_interior_update_will_free_node(as, b);
 
        n = bch2_btree_node_alloc_replacement(as, b);
+       bch2_btree_update_add_new_node(as, n);
 
        bch2_btree_build_aux_trees(n);
        six_unlock_write(&n->c.lock);
@@ -1874,7 +1676,7 @@ static int __btree_node_rewrite(struct bch_fs *c, struct btree_iter *iter,
                bch2_btree_set_root(as, n, iter);
        }
 
-       bch2_open_buckets_put(c, &n->ob);
+       bch2_btree_update_get_open_buckets(as, n);
 
        six_lock_increment(&b->c.lock, SIX_LOCK_intent);
        bch2_btree_iter_node_drop(iter, b);
@@ -1949,49 +1751,8 @@ static void __bch2_btree_node_update_key(struct bch_fs *c,
        struct btree *parent;
        int ret;
 
-       /*
-        * Two corner cases that need to be thought about here:
-        *
-        * @b may not be reachable yet - there might be another interior update
-        * operation waiting on @b to be written, and we're gonna deliver the
-        * write completion to that interior update operation _before_
-        * persisting the new_key update
-        *
-        * That ends up working without us having to do anything special here:
-        * the reason is, we do kick off (and do the in memory updates) for the
-        * update for @new_key before we return, creating a new interior_update
-        * operation here.
-        *
-        * The new interior update operation here will in effect override the
-        * previous one. The previous one was going to terminate - make @b
-        * reachable - in one of two ways:
-        * - updating the btree root pointer
-        *   In that case,
-        *   no, this doesn't work. argh.
-        */
-
-       if (b->will_make_reachable)
-               as->must_rewrite = true;
-
-       btree_interior_update_add_node_reference(as, b);
-
-       /*
-        * XXX: the rest of the update path treats this like we're actually
-        * inserting a new node and deleting the existing node, so the
-        * reservation needs to include enough space for @b
-        *
-        * that is actually sketch as fuck though and I am surprised the code
-        * seems to work like that, definitely need to go back and rework it
-        * into something saner.
-        *
-        * (I think @b is just getting double counted until the btree update
-        * finishes and "deletes" @b on disk)
-        */
-       ret = bch2_disk_reservation_add(c, &as->reserve->disk_res,
-                       c->opts.btree_node_size *
-                       bch2_bkey_nr_ptrs(bkey_i_to_s_c(new_key)),
-                       BCH_DISK_RESERVATION_NOFAIL);
-       BUG_ON(ret);
+       btree_update_will_delete_key(as, &b->key);
+       btree_update_will_add_key(as, new_key);
 
        parent = btree_node_parent(iter, b);
        if (parent) {
@@ -2019,44 +1780,18 @@ static void __bch2_btree_node_update_key(struct bch_fs *c,
                        bkey_copy(&b->key, new_key);
                }
        } else {
-               struct bch_fs_usage_online *fs_usage;
-
                BUG_ON(btree_node_root(c, b) != b);
 
                bch2_btree_node_lock_write(b, iter);
+               bkey_copy(&b->key, new_key);
 
-               mutex_lock(&c->btree_interior_update_lock);
-               percpu_down_read(&c->mark_lock);
-               fs_usage = bch2_fs_usage_scratch_get(c);
-
-               bch2_mark_key_locked(c, bkey_i_to_s_c(new_key),
-                             0, 0, &fs_usage->u, 0,
-                             BTREE_TRIGGER_INSERT);
-               if (gc_visited(c, gc_pos_btree_root(b->c.btree_id)))
-                       bch2_mark_key_locked(c, bkey_i_to_s_c(new_key),
-                                            0, 0, NULL, 0,
-                                            BTREE_TRIGGER_INSERT||
-                                            BTREE_TRIGGER_GC);
-
-               bch2_btree_node_free_index(as, NULL,
-                                          bkey_i_to_s_c(&b->key),
-                                          &fs_usage->u);
-               bch2_fs_usage_apply(c, fs_usage, &as->reserve->disk_res, 0);
-
-               bch2_fs_usage_scratch_put(c, fs_usage);
-               percpu_up_read(&c->mark_lock);
-               mutex_unlock(&c->btree_interior_update_lock);
-
-               if (btree_ptr_hash_val(new_key) != b->hash_val) {
+               if (btree_ptr_hash_val(&b->key) != b->hash_val) {
                        mutex_lock(&c->btree_cache.lock);
                        bch2_btree_node_hash_remove(&c->btree_cache, b);
 
-                       bkey_copy(&b->key, new_key);
                        ret = __bch2_btree_node_hash_insert(&c->btree_cache, b);
                        BUG_ON(ret);
                        mutex_unlock(&c->btree_cache.lock);
-               } else {
-                       bkey_copy(&b->key, new_key);
                }
 
                btree_update_updated_root(as, b);
@@ -2171,7 +1906,7 @@ void bch2_btree_set_root_for_read(struct bch_fs *c, struct btree *b)
 {
        BUG_ON(btree_node_root(c, b));
 
-       __bch2_btree_set_root_inmem(c, b);
+       bch2_btree_set_root_inmem(c, b);
 }
 
 void bch2_btree_root_alloc(struct bch_fs *c, enum btree_id id)
@@ -2211,7 +1946,7 @@ void bch2_btree_root_alloc(struct bch_fs *c, enum btree_id id)
                                          b->c.level, b->c.btree_id);
        BUG_ON(ret);
 
-       __bch2_btree_set_root_inmem(c, b);
+       bch2_btree_set_root_inmem(c, b);
 
        six_unlock_write(&b->c.lock);
        six_unlock_intent(&b->c.lock);
@@ -2248,10 +1983,59 @@ size_t bch2_btree_interior_updates_nr_pending(struct bch_fs *c)
        return ret;
 }
 
+void bch2_journal_entries_to_btree_roots(struct bch_fs *c, struct jset *jset)
+{
+       struct btree_root *r;
+       struct jset_entry *entry;
+
+       mutex_lock(&c->btree_root_lock);
+
+       vstruct_for_each(jset, entry)
+               if (entry->type == BCH_JSET_ENTRY_btree_root) {
+                       r = &c->btree_roots[entry->btree_id];
+                       r->level = entry->level;
+                       r->alive = true;
+                       bkey_copy(&r->key, &entry->start[0]);
+               }
+
+       mutex_unlock(&c->btree_root_lock);
+}
+
+struct jset_entry *
+bch2_btree_roots_to_journal_entries(struct bch_fs *c,
+                                   struct jset_entry *start,
+                                   struct jset_entry *end)
+{
+       struct jset_entry *entry;
+       unsigned long have = 0;
+       unsigned i;
+
+       for (entry = start; entry < end; entry = vstruct_next(entry))
+               if (entry->type == BCH_JSET_ENTRY_btree_root)
+                       __set_bit(entry->btree_id, &have);
+
+       mutex_lock(&c->btree_root_lock);
+
+       for (i = 0; i < BTREE_ID_NR; i++)
+               if (c->btree_roots[i].alive && !test_bit(i, &have)) {
+                       journal_entry_set(end,
+                                         BCH_JSET_ENTRY_btree_root,
+                                         i, c->btree_roots[i].level,
+                                         &c->btree_roots[i].key,
+                                         c->btree_roots[i].key.u64s);
+                       end = vstruct_next(end);
+               }
+
+       mutex_unlock(&c->btree_root_lock);
+
+       return end;
+}
+
 void bch2_fs_btree_interior_update_exit(struct bch_fs *c)
 {
+       if (c->btree_interior_update_worker)
+               destroy_workqueue(c->btree_interior_update_worker);
        mempool_exit(&c->btree_interior_update_pool);
-       mempool_exit(&c->btree_reserve_pool);
 }
 
 int bch2_fs_btree_interior_update_init(struct bch_fs *c)
@@ -2260,9 +2044,13 @@ int bch2_fs_btree_interior_update_init(struct bch_fs *c)
        INIT_LIST_HEAD(&c->btree_interior_update_list);
        INIT_LIST_HEAD(&c->btree_interior_updates_unwritten);
        mutex_init(&c->btree_interior_update_lock);
+       INIT_WORK(&c->btree_interior_update_work, btree_interior_update_work);
+
+       c->btree_interior_update_worker =
+               alloc_workqueue("btree_update", WQ_UNBOUND|WQ_MEM_RECLAIM, 1);
+       if (!c->btree_interior_update_worker)
+               return -ENOMEM;
 
-       return  mempool_init_kmalloc_pool(&c->btree_reserve_pool, 1,
-                                         sizeof(struct btree_reserve)) ?:
-               mempool_init_kmalloc_pool(&c->btree_interior_update_pool, 1,
-                                         sizeof(struct btree_update));
+       return mempool_init_kmalloc_pool(&c->btree_interior_update_pool, 1,
+                                        sizeof(struct btree_update));
 }
diff --git a/fs/bcachefs/btree_update_interior.h b/fs/bcachefs/btree_update_interior.h
index 5cec87951dc708b9f12485283afea8609fe399f6..17bd1ca1fb780f169afac77d6c9f885ce00964aa 100644
--- a/fs/bcachefs/btree_update_interior.h
+++ b/fs/bcachefs/btree_update_interior.h
@@ -6,34 +6,13 @@
 #include "btree_locking.h"
 #include "btree_update.h"
 
-struct btree_reserve {
-       struct disk_reservation disk_res;
-       unsigned                nr;
-       struct btree            *b[BTREE_RESERVE_MAX];
-};
-
 void __bch2_btree_calc_format(struct bkey_format_state *, struct btree *);
 bool bch2_btree_node_format_fits(struct bch_fs *c, struct btree *,
                                struct bkey_format *);
 
-/* Btree node freeing/allocation: */
-
-/*
- * Tracks a btree node that has been (or is about to be) freed in memory, but
- * has _not_ yet been freed on disk (because the write that makes the new
- * node(s) visible and frees the old hasn't completed yet)
- */
-struct pending_btree_node_free {
-       bool                    index_update_done;
-
-       __le64                  seq;
-       enum btree_id           btree_id;
-       unsigned                level;
-       __BKEY_PADDED(key, BKEY_BTREE_PTR_VAL_U64s_MAX);
-};
+#define BTREE_UPDATE_NODES_MAX         ((BTREE_MAX_DEPTH - 2) * 2 + GC_MERGE_NODES)
 
-#define BTREE_UPDATE_JOURNAL_RES               \
-       ((BKEY_BTREE_PTR_U64s_MAX + 1) * (BTREE_MAX_DEPTH - 1) * 2)
+#define BTREE_UPDATE_JOURNAL_RES       (BTREE_UPDATE_NODES_MAX * (BKEY_BTREE_PTR_U64s_MAX + 1))
 
 /*
  * Tracks an in progress split/rewrite of a btree node and the update to the
@@ -72,9 +51,8 @@ struct btree_update {
        unsigned                        nodes_written:1;
 
        enum btree_id                   btree_id;
-       u8                              level;
 
-       struct btree_reserve            *reserve;
+       struct disk_reservation         disk_res;
        struct journal_preres           journal_preres;
 
        /*
@@ -96,17 +74,28 @@ struct btree_update {
         */
        struct journal_entry_pin        journal;
 
-       /*
-        * Nodes being freed:
-        * Protected by c->btree_node_pending_free_lock
-        */
-       struct pending_btree_node_free  pending[BTREE_MAX_DEPTH + GC_MERGE_NODES];
-       unsigned                        nr_pending;
+       /* Preallocated nodes we reserve when we start the update: */
+       struct btree                    *prealloc_nodes[BTREE_UPDATE_NODES_MAX];
+       unsigned                        nr_prealloc_nodes;
+
+       /* Nodes being freed: */
+       struct keylist                  old_keys;
+       u64                             _old_keys[BTREE_UPDATE_NODES_MAX *
+                                                 BKEY_BTREE_PTR_VAL_U64s_MAX];
+
+       /* Nodes being added: */
+       struct keylist                  new_keys;
+       u64                             _new_keys[BTREE_UPDATE_NODES_MAX *
+                                                 BKEY_BTREE_PTR_VAL_U64s_MAX];
 
        /* New nodes, that will be made reachable by this update: */
-       struct btree                    *new_nodes[BTREE_MAX_DEPTH * 2 + GC_MERGE_NODES];
+       struct btree                    *new_nodes[BTREE_UPDATE_NODES_MAX];
        unsigned                        nr_new_nodes;
 
+       u8                              open_buckets[BTREE_UPDATE_NODES_MAX *
+                                                    BCH_REPLICAS_MAX];
+       u8                              nr_open_buckets;
+
        unsigned                        journal_u64s;
        u64                             journal_entries[BTREE_UPDATE_JOURNAL_RES];
 
@@ -120,14 +109,12 @@ struct btree_update {
        u64                             inline_keys[BKEY_BTREE_PTR_U64s_MAX * 3];
 };
 
-#define for_each_pending_btree_node_free(c, as, p)                     \
-       list_for_each_entry(as, &c->btree_interior_update_list, list)   \
-               for (p = as->pending; p < as->pending + as->nr_pending; p++)
-
 void bch2_btree_node_free_inmem(struct bch_fs *, struct btree *,
                                struct btree_iter *);
 void bch2_btree_node_free_never_inserted(struct bch_fs *, struct btree *);
 
+void bch2_btree_update_get_open_buckets(struct btree_update *, struct btree *);
+
 struct btree *__bch2_btree_node_alloc_replacement(struct btree_update *,
                                                  struct btree *,
                                                  struct bkey_format);
@@ -139,6 +126,7 @@ bch2_btree_update_start(struct btree_trans *, enum btree_id, unsigned,
 
 void bch2_btree_interior_update_will_free_node(struct btree_update *,
                                               struct btree *);
+void bch2_btree_update_add_new_node(struct btree_update *, struct btree *);
 
 void bch2_btree_insert_node(struct btree_update *, struct btree *,
                            struct btree_iter *, struct keylist *,
@@ -333,6 +321,10 @@ ssize_t bch2_btree_updates_print(struct bch_fs *, char *);
 
 size_t bch2_btree_interior_updates_nr_pending(struct bch_fs *);
 
+void bch2_journal_entries_to_btree_roots(struct bch_fs *, struct jset *);
+struct jset_entry *bch2_btree_roots_to_journal_entries(struct bch_fs *,
+                                       struct jset_entry *, struct jset_entry *);
+
 void bch2_fs_btree_interior_update_exit(struct bch_fs *);
 int bch2_fs_btree_interior_update_init(struct bch_fs *);
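
The old_keys/new_keys pair above replaces the pending[] array: rather than tracking frees out of band, an update queues the pointer keys of the nodes it drops and the nodes it adds, so the transactional commit can adjust bucket sector counts for both sides atomically. A hypothetical helper illustrating that bookkeeping (record_node_swap is not part of this patch):

	static void record_node_swap(struct btree_update *as,
				     struct btree *old, struct btree *new)
	{
		/* key of the node being freed; unmarked when the update commits */
		bch2_keylist_add(&as->old_keys, &old->key);

		/* key of the replacement node, made reachable by the same commit */
		bch2_keylist_add(&as->new_keys, &new->key);
	}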
 
diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c
index 98b60d230dce2abc4d5ae7b9251a85115c913e65..ffcaecc8a64fc2eaec3d32566b3f60b41c2be49f 100644 (file)
@@ -414,8 +414,7 @@ bch2_trans_commit_write_locked(struct btree_trans *trans,
        }
 
        if (unlikely(trans->extra_journal_entry_u64s)) {
-               memcpy_u64s_small(bch2_journal_reservation_entry(&c->journal,
-                                                                &trans->journal_res),
+               memcpy_u64s_small(journal_res_entry(&c->journal, &trans->journal_res),
                                  trans->extra_journal_entries,
                                  trans->extra_journal_entry_u64s);
 
@@ -521,6 +520,10 @@ static inline int do_bch2_trans_commit(struct btree_trans *trans,
                        bch2_btree_node_unlock_write_inlined(iter_l(i->iter)->b,
                                                             i->iter);
 
+       if (!ret && trans->journal_pin)
+               bch2_journal_pin_add(&trans->c->journal, trans->journal_res.seq,
+                                    trans->journal_pin, NULL);
+
        /*
         * Drop journal reservation after dropping write locks, since dropping
         * the journal reservation may kick off a journal write:
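
With this hook a transaction can carry a journal_entry_pin, and the commit path pins it at the committed entry's sequence number before the journal reservation is dropped. A sketch of the assumed caller pattern (the wrapper function is illustrative; the field and helpers are per this patch):

	static int commit_interior_keys(struct btree_trans *trans,
					struct btree_update *as)
	{
		/* pin the journal entry this commit lands in: */
		trans->journal_pin = &as->journal;

		return bch2_trans_commit(trans, &as->disk_res, NULL,
					 BTREE_INSERT_NOFAIL);
	}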
diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c
index 43095ae4731d1dd5e1eac96bb2ad795101d7953c..5b827698c3e50ba10da5b842f5b694b122f8299b 100644 (file)
@@ -1180,7 +1180,7 @@ static int bch2_mark_stripe(struct bch_fs *c, struct bkey_s_c k,
        return 0;
 }
 
-int bch2_mark_key_locked(struct bch_fs *c,
+static int bch2_mark_key_locked(struct bch_fs *c,
                   struct bkey_s_c k,
                   unsigned offset, s64 sectors,
                   struct bch_fs_usage *fs_usage,
diff --git a/fs/bcachefs/buckets.h b/fs/bcachefs/buckets.h
index 29ebc07a24977a91cd82031e9aef0edf5124ed54..cea66c76850d151b29726a00f7f009ca94ca9762 100644 (file)
@@ -259,8 +259,6 @@ void bch2_mark_metadata_bucket(struct bch_fs *, struct bch_dev *,
                               size_t, enum bch_data_type, unsigned,
                               struct gc_pos, unsigned);
 
-int bch2_mark_key_locked(struct bch_fs *, struct bkey_s_c, unsigned, s64,
-                        struct bch_fs_usage *, u64, unsigned);
 int bch2_mark_key(struct bch_fs *, struct bkey_s_c, unsigned, s64,
                  struct bch_fs_usage *, u64, unsigned);
 int bch2_fs_usage_apply(struct bch_fs *, struct bch_fs_usage_online *,
diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c
index 220daf88f7b9b68d3c345cb736f1323961b430bf..5c84569c3404022209be41e7be54d9d60c67c6cb 100644 (file)
@@ -958,15 +958,12 @@ void bch2_dev_journal_stop(struct journal *j, struct bch_dev *ca)
 
 void bch2_fs_journal_stop(struct journal *j)
 {
-       struct bch_fs *c = container_of(j, struct bch_fs, journal);
-
        bch2_journal_flush_all_pins(j);
 
        wait_event(j->wait, journal_entry_close(j));
 
        /* do we need to write another journal entry? */
-       if (test_bit(JOURNAL_NOT_EMPTY, &j->flags) ||
-           c->btree_roots_dirty)
+       if (test_bit(JOURNAL_NOT_EMPTY, &j->flags))
                bch2_journal_meta(j);
 
        journal_quiesce(j);
diff --git a/fs/bcachefs/journal.h b/fs/bcachefs/journal.h
index 6630db6ecc14eda5715a17b4a0513c53378f673a..2c55f74522e2d948ee5e1f9b19478dc5adcc41ad 100644 (file)
@@ -200,33 +200,40 @@ bch2_journal_add_entry_noreservation(struct journal_buf *buf, size_t u64s)
 }
 
 static inline struct jset_entry *
-bch2_journal_reservation_entry(struct journal *j, struct journal_res *res)
+journal_res_entry(struct journal *j, struct journal_res *res)
 {
        return vstruct_idx(j->buf[res->idx].data, res->offset);
 }
 
+static inline unsigned journal_entry_set(struct jset_entry *entry, unsigned type,
+                                         enum btree_id id, unsigned level,
+                                         const void *data, unsigned u64s)
+{
+       entry->u64s     = cpu_to_le16(u64s);
+       entry->btree_id = id;
+       entry->level    = level;
+       entry->type     = type;
+       entry->pad[0]   = 0;
+       entry->pad[1]   = 0;
+       entry->pad[2]   = 0;
+       memcpy_u64s_small(entry->_data, data, u64s);
+
+       return jset_u64s(u64s);
+}
+
 static inline void bch2_journal_add_entry(struct journal *j, struct journal_res *res,
                                          unsigned type, enum btree_id id,
                                          unsigned level,
                                          const void *data, unsigned u64s)
 {
-       struct jset_entry *entry = bch2_journal_reservation_entry(j, res);
-       unsigned actual = jset_u64s(u64s);
+       unsigned actual = journal_entry_set(journal_res_entry(j, res),
+                              type, id, level, data, u64s);
 
        EBUG_ON(!res->ref);
        EBUG_ON(actual > res->u64s);
 
        res->offset     += actual;
        res->u64s       -= actual;
-
-       entry->u64s     = cpu_to_le16(u64s);
-       entry->btree_id = id;
-       entry->level    = level;
-       entry->type     = type;
-       entry->pad[0]   = 0;
-       entry->pad[1]   = 0;
-       entry->pad[2]   = 0;
-       memcpy_u64s_small(entry->_data, data, u64s);
 }
 
 static inline void bch2_journal_add_keys(struct journal *j, struct journal_res *res,
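
Factoring journal_entry_set() out of bch2_journal_add_entry() makes it usable by paths that fill jset entries without holding a journal reservation; its return value, jset_u64s(u64s), is payload plus header, which a caller can use to advance through a buffer. A sketch of the intended pattern (append_root is illustrative, not from the patch):

	/* append one btree-root entry at @end, returning the next free slot */
	static struct jset_entry *append_root(struct jset_entry *end,
					      enum btree_id id, unsigned level,
					      const struct bkey_i *root)
	{
		journal_entry_set(end, BCH_JSET_ENTRY_btree_root,
				  id, level, root, root->k.u64s);
		return vstruct_next(end);
	}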
diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c
index 421fde39ac0ef31990860d8e95996ec145a9aa4c..1724c80b323c8046cc00e69de02096106f041d48 100644 (file)
@@ -2,6 +2,7 @@
 #include "bcachefs.h"
 #include "alloc_foreground.h"
 #include "btree_io.h"
+#include "btree_update_interior.h"
 #include "buckets.h"
 #include "checksum.h"
 #include "error.h"
@@ -992,8 +993,23 @@ void bch2_journal_write(struct closure *cl)
 
        j->write_start_time = local_clock();
 
-       start   = vstruct_last(jset);
-       end     = bch2_journal_super_entries_add_common(c, start,
+       /*
+        * New btree roots are set by journalling them; when the journal entry
+        * gets written we have to propagate them to c->btree_roots.
+        *
+        * But, every journal entry we write has to contain all the btree roots
+        * (at least for now); so after we copy btree roots to c->btree_roots we
+        * have to get any missing btree roots and add them to this journal
+        * entry:
+        */
+
+       bch2_journal_entries_to_btree_roots(c, jset);
+
+       start = end = vstruct_last(jset);
+
+       end     = bch2_btree_roots_to_journal_entries(c, jset->start, end);
+
+       end     = bch2_journal_super_entries_add_common(c, end,
                                                le64_to_cpu(jset->seq));
        u64s    = (u64 *) end - (u64 *) start;
        BUG_ON(u64s > j->entry_u64s_reserved);
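
The two calls bracket the new root handling: bch2_journal_entries_to_btree_roots() applies roots already journalled in this jset to c->btree_roots, then bch2_btree_roots_to_journal_entries() appends whichever live roots the entry is still missing. A condensed, assumption-laden sketch of the appending half (the real function also deduplicates against the start..end range it is given):

	static struct jset_entry *roots_to_entries(struct bch_fs *c,
						   struct jset_entry *end)
	{
		unsigned i;

		mutex_lock(&c->btree_root_lock);

		for (i = 0; i < BTREE_ID_NR; i++)
			if (c->btree_roots[i].alive) {
				journal_entry_set(end, BCH_JSET_ENTRY_btree_root,
						  i, c->btree_roots[i].level,
						  &c->btree_roots[i].key,
						  c->btree_roots[i].key.u64s);
				end = vstruct_next(end);
			}

		mutex_unlock(&c->btree_root_lock);
		return end;
	}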
diff --git a/fs/bcachefs/journal_reclaim.c b/fs/bcachefs/journal_reclaim.c
index 341106ab4a7741b2180331f4f0d7cbbcb9e19a38..6cb37045cf685fc2193ccffa3051a7464d316d13 100644 (file)
@@ -330,7 +330,7 @@ static void bch2_journal_pin_add_locked(struct journal *j, u64 seq,
 
        __journal_pin_drop(j, pin);
 
-       BUG_ON(!atomic_read(&pin_list->count));
+       BUG_ON(!atomic_read(&pin_list->count) && seq == journal_last_seq(j));
 
        atomic_inc(&pin_list->count);
        pin->seq        = seq;
diff --git a/fs/bcachefs/journal_reclaim.h b/fs/bcachefs/journal_reclaim.h
index 883a0a5680afc73b7c788b42370cb8992a8d661f..3ef641f7ce3030c2844ae59bcc987fe8e634e875 100644 (file)
@@ -38,7 +38,7 @@ static inline void bch2_journal_pin_add(struct journal *j, u64 seq,
                                        struct journal_entry_pin *pin,
                                        journal_pin_flush_fn flush_fn)
 {
-       if (unlikely(!journal_pin_active(pin)))
+       if (unlikely(!journal_pin_active(pin) || pin->seq > seq))
                __bch2_journal_pin_add(j, seq, pin, flush_fn);
 }
 
diff --git a/fs/bcachefs/keylist.c b/fs/bcachefs/keylist.c
index 5da54ced9cadb736a1edf22e77ddf81b9292f6fd..864dfaa67b7a4cb255724409297adae74743b849 100644 (file)
@@ -6,7 +6,7 @@
 int bch2_keylist_realloc(struct keylist *l, u64 *inline_u64s,
                        size_t nr_inline_u64s, size_t new_u64s)
 {
-       size_t oldsize = bch_keylist_u64s(l);
+       size_t oldsize = bch2_keylist_u64s(l);
        size_t newsize = oldsize + new_u64s;
        u64 *old_buf = l->keys_p == inline_u64s ? NULL : l->keys_p;
        u64 *new_keys;
@@ -52,7 +52,7 @@ void bch2_keylist_pop_front(struct keylist *l)
 
        memmove_u64s_down(l->keys,
                          bkey_next(l->keys),
-                         bch_keylist_u64s(l));
+                         bch2_keylist_u64s(l));
 }
 
 #ifdef CONFIG_BCACHEFS_DEBUG
diff --git a/fs/bcachefs/keylist.h b/fs/bcachefs/keylist.h
index a7ff86b08abcd008f96b5f5b976812bab3fe20e2..195799bb20bcbfc91f206f9e0f0c1fde9d615324 100644 (file)
@@ -36,14 +36,14 @@ static inline bool bch2_keylist_empty(struct keylist *l)
        return l->top == l->keys;
 }
 
-static inline size_t bch_keylist_u64s(struct keylist *l)
+static inline size_t bch2_keylist_u64s(struct keylist *l)
 {
        return l->top_p - l->keys_p;
 }
 
 static inline size_t bch2_keylist_bytes(struct keylist *l)
 {
-       return bch_keylist_u64s(l) * sizeof(u64);
+       return bch2_keylist_u64s(l) * sizeof(u64);
 }
 
 static inline struct bkey_i *bch2_keylist_front(struct keylist *l)
diff --git a/fs/bcachefs/migrate.c b/fs/bcachefs/migrate.c
index e26fa1608f39d49b671ebeb3255574c9bca928bd..96c8690adc5bf51cfaecc8e8628a45109a07d7a1 100644 (file)
@@ -151,15 +151,8 @@ retry:
        }
 
        /* flush relevant btree updates */
-       while (1) {
-               closure_wait_event(&c->btree_interior_update_wait,
-                                  !bch2_btree_interior_updates_nr_pending(c) ||
-                                  c->btree_roots_dirty);
-               if (c->btree_roots_dirty)
-                       bch2_journal_meta(&c->journal);
-               if (!bch2_btree_interior_updates_nr_pending(c))
-                       break;
-       }
+       closure_wait_event(&c->btree_interior_update_wait,
+                          !bch2_btree_interior_updates_nr_pending(c));
 
        ret = 0;
 err:
diff --git a/fs/bcachefs/move.c b/fs/bcachefs/move.c
index 882e86e70db7d4bd06c8678c60a4fc539af0f5c0..02cc5089a16363b5374de5b9d9c9c53ef51c7501 100644 (file)
@@ -774,14 +774,8 @@ int bch2_data_job(struct bch_fs *c,
 
                ret = bch2_move_btree(c, rereplicate_pred, c, stats) ?: ret;
 
-               while (1) {
-                       closure_wait_event(&c->btree_interior_update_wait,
-                                          !bch2_btree_interior_updates_nr_pending(c) ||
-                                          c->btree_roots_dirty);
-                       if (!bch2_btree_interior_updates_nr_pending(c))
-                               break;
-                       bch2_journal_meta(&c->journal);
-               }
+               closure_wait_event(&c->btree_interior_update_wait,
+                                  !bch2_btree_interior_updates_nr_pending(c));
 
                ret = bch2_replicas_gc2(c) ?: ret;
 
diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c
index 95265f1c2b217bb3104b702328ee6001479c085d..b386c7e15e97c6b582421f38b99813cbe54711e7 100644 (file)
@@ -763,6 +763,7 @@ static int verify_superblock_clean(struct bch_fs *c,
                        "superblock read clock doesn't match journal after clean shutdown");
 
        for (i = 0; i < BTREE_ID_NR; i++) {
+               char buf1[200], buf2[200];
                struct bkey_i *k1, *k2;
                unsigned l1 = 0, l2 = 0;
 
@@ -778,7 +779,11 @@ static int verify_superblock_clean(struct bch_fs *c,
                                    k1->k.u64s != k2->k.u64s ||
                                    memcmp(k1, k2, bkey_bytes(k1)) ||
                                    l1 != l2, c,
-                       "superblock btree root doesn't match journal after clean shutdown");
+                       "superblock btree root %u doesn't match journal after clean shutdown\n"
+                       "sb:      l=%u %s\n"
+                       "journal: l=%u %s\n", i,
+                       l1, (bch2_bkey_val_to_text(&PBUF(buf1), c, bkey_i_to_s_c(k1)), buf1),
+                       l2, (bch2_bkey_val_to_text(&PBUF(buf2), c, bkey_i_to_s_c(k2)), buf2));
        }
 fsck_err:
        return ret;
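
The expanded fsck message relies on a comma expression: bch2_bkey_val_to_text() formats the key into the on-stack printbuf, and the expression then evaluates to the buffer, so it can be passed directly as a %s argument. The same idiom in isolation (print_root is illustrative, not from the patch):

	static void print_root(struct bch_fs *c, struct bkey_i *k)
	{
		char buf[200];

		pr_info("root: %s\n",
			(bch2_bkey_val_to_text(&PBUF(buf), c,
					       bkey_i_to_s_c(k)), buf));
	}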
diff --git a/fs/bcachefs/super-io.c b/fs/bcachefs/super-io.c
index c9d2a01fec299dc7fa75f2f459c956532235fa9f..eb5a91d232e025bdfb6bc00af445b0ebdb97f85c 100644 (file)
@@ -1,6 +1,7 @@
 // SPDX-License-Identifier: GPL-2.0
 
 #include "bcachefs.h"
+#include "btree_update_interior.h"
 #include "buckets.h"
 #include "checksum.h"
 #include "disk_groups.h"
@@ -955,7 +956,6 @@ int bch2_fs_mark_dirty(struct bch_fs *c)
 
        mutex_lock(&c->sb_lock);
        SET_BCH_SB_CLEAN(c->disk_sb.sb, false);
-       c->disk_sb.sb->compat[0] &= ~(1ULL << BCH_COMPAT_FEAT_ALLOC_METADATA);
        c->disk_sb.sb->features[0] |= 1ULL << BCH_FEATURE_new_extent_overwrite;
        c->disk_sb.sb->features[0] |= 1ULL << BCH_FEATURE_extents_above_btree_updates;
        c->disk_sb.sb->features[0] |= 1ULL << BCH_FEATURE_btree_updates_journalled;
@@ -989,27 +989,8 @@ bch2_journal_super_entries_add_common(struct bch_fs *c,
                                      struct jset_entry *entry,
                                      u64 journal_seq)
 {
-       struct btree_root *r;
        unsigned i;
 
-       mutex_lock(&c->btree_root_lock);
-
-       for (r = c->btree_roots;
-            r < c->btree_roots + BTREE_ID_NR;
-            r++)
-               if (r->alive) {
-                       entry_init_u64s(entry, r->key.u64s + 1);
-                       entry->btree_id = r - c->btree_roots;
-                       entry->level    = r->level;
-                       entry->type     = BCH_JSET_ENTRY_btree_root;
-                       bkey_copy(&entry->start[0], &r->key);
-
-                       entry = vstruct_next(entry);
-               }
-       c->btree_roots_dirty = false;
-
-       mutex_unlock(&c->btree_root_lock);
-
        percpu_down_read(&c->mark_lock);
 
        if (!journal_seq) {
@@ -1111,6 +1092,7 @@ void bch2_fs_mark_clean(struct bch_fs *c)
 
        entry = sb_clean->start;
        entry = bch2_journal_super_entries_add_common(c, entry, 0);
+       entry = bch2_btree_roots_to_journal_entries(c, entry, entry);
        BUG_ON((void *) entry > vstruct_end(&sb_clean->field));
 
        memset(entry, 0,
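
Note the call above passes entry as both ends of the existing-entry range: with start == end there is nothing to deduplicate against, so every live root is appended to the clean superblock section. Side by side, the two call shapes this patch introduces:

	/* journal write: scan entries already in the jset, append missing roots */
	end   = bch2_btree_roots_to_journal_entries(c, jset->start, end);

	/* clean superblock: empty range, append all live roots */
	entry = bch2_btree_roots_to_journal_entries(c, entry, entry);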
diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c
index 4335e0a11c2e37457476d4783508d19f032f7d75..3cf75ac1b8047c63f694d28743324b57c9968bd6 100644 (file)
@@ -227,6 +227,7 @@ static void __bch2_fs_read_only(struct bch_fs *c)
                 */
                closure_wait_event(&c->btree_interior_update_wait,
                                   !bch2_btree_interior_updates_nr_pending(c));
+               flush_work(&c->btree_interior_update_work);
 
                clean_passes = wrote ? 0 : clean_passes + 1;
        } while (clean_passes < 2);
@@ -234,6 +235,10 @@ static void __bch2_fs_read_only(struct bch_fs *c)
        bch_verbose(c, "writing alloc info complete");
        set_bit(BCH_FS_ALLOC_CLEAN, &c->flags);
 nowrote_alloc:
+       closure_wait_event(&c->btree_interior_update_wait,
+                          !bch2_btree_interior_updates_nr_pending(c));
+       flush_work(&c->btree_interior_update_work);
+
        for_each_member_device(ca, c, i)
                bch2_dev_allocator_stop(ca);
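
The read-only transition now waits twice: once for the list of in-flight interior updates to drain, and once for the work item that completes written updates, since an update only leaves the list after btree_interior_update_work has run for it. For reference, the counter being waited on is, consistent with this tree, a plain walk of that list; a sketch under that assumption:

	size_t bch2_btree_interior_updates_nr_pending(struct bch_fs *c)
	{
		struct list_head *i;
		size_t ret = 0;

		mutex_lock(&c->btree_interior_update_lock);
		list_for_each(i, &c->btree_interior_update_list)
			ret++;
		mutex_unlock(&c->btree_interior_update_lock);

		return ret;
	}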