bcachefs: Shutdown path improvements
authorKent Overstreet <kent.overstreet@gmail.com>
Sun, 17 Apr 2022 21:30:49 +0000 (17:30 -0400)
committerKent Overstreet <kent.overstreet@linux.dev>
Sun, 22 Oct 2023 21:09:32 +0000 (17:09 -0400)
We're seeing occasional firings of the assertion in the key cache
shutdown code that nr_dirty == 0, which means we must sometimes be doing
transaction commits after we've gone read only.

Cleanups & changes:
 - BCH_FS_ALLOC_CLEAN renamed to BCH_FS_CLEAN_SHUTDOWN
 - new helper bch2_btree_interior_updates_flush(), which returns true if
   it had to wait
 - bch2_btree_flush_writes() now also returns true if there were btree
   writes in flight
 - __bch2_fs_read_only now checks if btree writes were in flight in the
   shutdown loop: btree write completion does a transaction update, to
   update the pointer in the parent node
 - assert that !BCH_FS_CLEAN_SHUTDOWN in __bch2_trans_commit

Signed-off-by: Kent Overstreet <kent.overstreet@gmail.com>
fs/bcachefs/bcachefs.h
fs/bcachefs/btree_gc.c
fs/bcachefs/btree_io.c
fs/bcachefs/btree_io.h
fs/bcachefs/btree_update_interior.c
fs/bcachefs/btree_update_interior.h
fs/bcachefs/btree_update_leaf.c
fs/bcachefs/migrate.c
fs/bcachefs/move.c
fs/bcachefs/super.c

index 04d297b1da94b31a97a8614f07eabc00a2bfe24c..e7300a9f427cc4abad0c57052f0f5bd217821dbb 100644 (file)
@@ -494,7 +494,7 @@ struct bch_dev {
 
 enum {
        /* startup: */
-       BCH_FS_ALLOC_CLEAN,
+       BCH_FS_CLEAN_SHUTDOWN,
        BCH_FS_INITIAL_GC_DONE,
        BCH_FS_INITIAL_GC_UNFIXED,
        BCH_FS_TOPOLOGY_REPAIR_DONE,
index 5199f0240fcdd2590aa700c5542f1f091fb610ab..21afc720057070ad0ee94bc66cd42bd920de60dc 100644 (file)
@@ -1751,9 +1751,7 @@ int bch2_gc(struct bch_fs *c, bool initial, bool metadata_only)
 
        down_write(&c->gc_lock);
 
-       /* flush interior btree updates: */
-       closure_wait_event(&c->btree_interior_update_wait,
-                          !bch2_btree_interior_updates_nr_pending(c));
+       bch2_btree_interior_updates_flush(c);
 
        ret   = bch2_gc_start(c, metadata_only) ?:
                bch2_gc_alloc_start(c, metadata_only) ?:
index f847928ab743b3d95851d597aa61ac031c8aa7fe..33c54803e0a2d99c287d0297a6aa5fe034ecc1a0 100644 (file)
@@ -2099,29 +2099,33 @@ void bch2_btree_node_write(struct bch_fs *c, struct btree *b,
        }
 }
 
-static void __bch2_btree_flush_all(struct bch_fs *c, unsigned flag)
+static bool __bch2_btree_flush_all(struct bch_fs *c, unsigned flag)
 {
        struct bucket_table *tbl;
        struct rhash_head *pos;
        struct btree *b;
        unsigned i;
+       bool ret = false;
 restart:
        rcu_read_lock();
        for_each_cached_btree(b, c, tbl, i, pos)
                if (test_bit(flag, &b->flags)) {
                        rcu_read_unlock();
                        wait_on_bit_io(&b->flags, flag, TASK_UNINTERRUPTIBLE);
+                       ret = true;
                        goto restart;
                }
        rcu_read_unlock();
+
+       return ret;
 }
 
-void bch2_btree_flush_all_reads(struct bch_fs *c)
+bool bch2_btree_flush_all_reads(struct bch_fs *c)
 {
-       __bch2_btree_flush_all(c, BTREE_NODE_read_in_flight);
+       return __bch2_btree_flush_all(c, BTREE_NODE_read_in_flight);
 }
 
-void bch2_btree_flush_all_writes(struct bch_fs *c)
+bool bch2_btree_flush_all_writes(struct bch_fs *c)
 {
-       __bch2_btree_flush_all(c, BTREE_NODE_write_in_flight);
+       return __bch2_btree_flush_all(c, BTREE_NODE_write_in_flight);
 }
index d818d87661e863a78b19b047276094e9a84696a1..8af853642123df33276aad4cf1bad547001e7e6a 100644 (file)
@@ -152,8 +152,8 @@ static inline void btree_node_write_if_need(struct bch_fs *c, struct btree *b,
        bch2_btree_node_write(c, b, lock_held, BTREE_WRITE_ONLY_IF_NEED);
 }
 
-void bch2_btree_flush_all_reads(struct bch_fs *);
-void bch2_btree_flush_all_writes(struct bch_fs *);
+bool bch2_btree_flush_all_reads(struct bch_fs *);
+bool bch2_btree_flush_all_writes(struct bch_fs *);
 
 static inline void compat_bformat(unsigned level, enum btree_id btree_id,
                                  unsigned version, unsigned big_endian,
index 7a092d852930bc45f4738b492a2ff2c748f1b64b..27ab1cde22170a8827487328facc41ad91fe8543 100644 (file)
@@ -2175,19 +2175,27 @@ void bch2_btree_updates_to_text(struct printbuf *out, struct bch_fs *c)
        mutex_unlock(&c->btree_interior_update_lock);
 }
 
-size_t bch2_btree_interior_updates_nr_pending(struct bch_fs *c)
+static bool bch2_btree_interior_updates_pending(struct bch_fs *c)
 {
-       size_t ret = 0;
-       struct list_head *i;
+       bool ret;
 
        mutex_lock(&c->btree_interior_update_lock);
-       list_for_each(i, &c->btree_interior_update_list)
-               ret++;
+       ret = !list_empty(&c->btree_interior_update_list);
        mutex_unlock(&c->btree_interior_update_lock);
 
        return ret;
 }
 
+bool bch2_btree_interior_updates_flush(struct bch_fs *c)
+{
+       bool ret = bch2_btree_interior_updates_pending(c);
+
+       if (ret)
+               closure_wait_event(&c->btree_interior_update_wait,
+                                  !bch2_btree_interior_updates_pending(c));
+       return ret;
+}
+
 void bch2_journal_entries_to_btree_roots(struct bch_fs *c, struct jset *jset)
 {
        struct btree_root *r;
index e72eb87956167d881bf3997d136ee0fef4d87d14..adfc6c24a7a402f3eeb88db33394909213d08fc2 100644 (file)
@@ -309,7 +309,7 @@ static inline bool bch2_btree_node_insert_fits(struct bch_fs *c,
 
 void bch2_btree_updates_to_text(struct printbuf *, struct bch_fs *);
 
-size_t bch2_btree_interior_updates_nr_pending(struct bch_fs *);
+bool bch2_btree_interior_updates_flush(struct bch_fs *);
 
 void bch2_journal_entries_to_btree_roots(struct bch_fs *, struct jset *);
 struct jset_entry *bch2_btree_roots_to_journal_entries(struct bch_fs *,
index ba6a9218610ab8d4e7fe09719a85afe495e935d7..9a2955f4ae6bea625ff2abe34fa50463d8c3a2fa 100644 (file)
@@ -1117,6 +1117,8 @@ int __bch2_trans_commit(struct btree_trans *trans)
                        goto out_reset;
        }
 
+       EBUG_ON(test_bit(BCH_FS_CLEAN_SHUTDOWN, &c->flags));
+
        memset(&trans->journal_preres, 0, sizeof(trans->journal_preres));
 
        trans->journal_u64s             = trans->extra_journal_entries.nr;
index 6defc33322b3b24bd5f9a076165accce6f204aa0..5345697f2712cebdf44a59022cb3d93be1edf2c5 100644 (file)
@@ -175,10 +175,7 @@ next:
                        goto err;
        }
 
-       /* flush relevant btree updates */
-       closure_wait_event(&c->btree_interior_update_wait,
-                          !bch2_btree_interior_updates_nr_pending(c));
-
+       bch2_btree_interior_updates_flush(c);
        ret = 0;
 err:
        bch2_trans_exit(&trans);
index a219c10a71357eb41b07c9dc697e58aa48299e22..f18d603624c09c9b4ed21b389a06b1d3c839d700 100644 (file)
@@ -942,9 +942,7 @@ next:
        if (ret)
                bch_err(c, "error %i in bch2_move_btree", ret);
 
-       /* flush relevant btree updates */
-       closure_wait_event(&c->btree_interior_update_wait,
-                          !bch2_btree_interior_updates_nr_pending(c));
+       bch2_btree_interior_updates_flush(c);
 
        progress_list_del(c, stats);
        return ret;
index e4ccdc966fdb7218ad7e6510f12dfde3ddd90485..77b7bd61bf4317109ed423370e48507cdfe44f0a 100644 (file)
@@ -195,57 +195,33 @@ static void __bch2_fs_read_only(struct bch_fs *c)
 {
        struct bch_dev *ca;
        unsigned i, clean_passes = 0;
+       u64 seq = 0;
 
        bch2_rebalance_stop(c);
        bch2_copygc_stop(c);
        bch2_gc_thread_stop(c);
 
-       /*
-        * Flush journal before stopping allocators, because flushing journal
-        * blacklist entries involves allocating new btree nodes:
-        */
-       bch2_journal_flush_all_pins(&c->journal);
-
        bch_verbose(c, "flushing journal and stopping allocators");
 
-       bch2_journal_flush_all_pins(&c->journal);
-
        do {
                clean_passes++;
 
-               if (bch2_journal_flush_all_pins(&c->journal))
-                       clean_passes = 0;
-
-               /*
-                * In flight interior btree updates will generate more journal
-                * updates and btree updates (alloc btree):
-                */
-               if (bch2_btree_interior_updates_nr_pending(c)) {
-                       closure_wait_event(&c->btree_interior_update_wait,
-                                          !bch2_btree_interior_updates_nr_pending(c));
+               if (bch2_btree_interior_updates_flush(c) ||
+                   bch2_journal_flush_all_pins(&c->journal) ||
+                   bch2_btree_flush_all_writes(c) ||
+                   seq != atomic64_read(&c->journal.seq)) {
+                       seq = atomic64_read(&c->journal.seq);
                        clean_passes = 0;
                }
-               flush_work(&c->btree_interior_update_work);
-
-               if (bch2_journal_flush_all_pins(&c->journal))
-                       clean_passes = 0;
        } while (clean_passes < 2);
-       bch_verbose(c, "flushing journal and stopping allocators complete");
 
-       set_bit(BCH_FS_ALLOC_CLEAN, &c->flags);
-
-       closure_wait_event(&c->btree_interior_update_wait,
-                          !bch2_btree_interior_updates_nr_pending(c));
-       flush_work(&c->btree_interior_update_work);
+       bch_verbose(c, "flushing journal and stopping allocators complete");
 
+       if (test_bit(JOURNAL_REPLAY_DONE, &c->journal.flags) &&
+           !test_bit(BCH_FS_EMERGENCY_RO, &c->flags))
+               set_bit(BCH_FS_CLEAN_SHUTDOWN, &c->flags);
        bch2_fs_journal_stop(&c->journal);
 
-       /*
-        * the journal kicks off btree writes via reclaim - wait for in flight
-        * writes after stopping journal:
-        */
-       bch2_btree_flush_all_writes(c);
-
        /*
         * After stopping journal:
         */
@@ -304,7 +280,7 @@ void bch2_fs_read_only(struct bch_fs *c)
            !test_bit(BCH_FS_ERROR, &c->flags) &&
            !test_bit(BCH_FS_EMERGENCY_RO, &c->flags) &&
            test_bit(BCH_FS_STARTED, &c->flags) &&
-           test_bit(BCH_FS_ALLOC_CLEAN, &c->flags) &&
+           test_bit(BCH_FS_CLEAN_SHUTDOWN, &c->flags) &&
            !c->opts.norecovery) {
                bch_verbose(c, "marking filesystem clean");
                bch2_fs_mark_clean(c);
@@ -395,7 +371,7 @@ static int __bch2_fs_read_write(struct bch_fs *c, bool early)
        if (ret)
                goto err;
 
-       clear_bit(BCH_FS_ALLOC_CLEAN, &c->flags);
+       clear_bit(BCH_FS_CLEAN_SHUTDOWN, &c->flags);
 
        for_each_rw_member(ca, c, i)
                bch2_dev_allocator_add(c, ca);