bcachefs: Improvements to writing alloc info
author Kent Overstreet <kent.overstreet@gmail.com>
Sat, 17 Oct 2020 01:36:26 +0000 (21:36 -0400)
committer Kent Overstreet <kent.overstreet@linux.dev>
Sun, 22 Oct 2023 21:08:44 +0000 (17:08 -0400)
Now that we've got transactional alloc info updates (and have for
a while), we don't need to write it out on shutdown, and we don't need to
write it out on startup except when GC found errors - this is a big
improvement to mount/unmount performance.

This patch also fixes a few bugs where we weren't writing out alloc
info (on new filesystems, and new devices) and should have been.

Signed-off-by: Kent Overstreet <kent.overstreet@gmail.com>
Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
fs/bcachefs/alloc_background.c
fs/bcachefs/alloc_background.h
fs/bcachefs/btree_gc.c
fs/bcachefs/ec.c
fs/bcachefs/ec.h
fs/bcachefs/recovery.c
fs/bcachefs/super.c

index 9fa7184188c27a3c618d722430214af1bf4792b4..459da00457efc9e83002d5dcfa8876508d8d29a9 100644 (file)
@@ -271,12 +271,6 @@ int bch2_alloc_read(struct bch_fs *c, struct journal_keys *journal_keys)
        return 0;
 }
 
-enum alloc_write_ret {
-       ALLOC_WROTE,
-       ALLOC_NOWROTE,
-       ALLOC_END,
-};
-
 static int bch2_alloc_write_key(struct btree_trans *trans,
                                struct btree_iter *iter,
                                unsigned flags)
@@ -306,26 +300,17 @@ retry:
 
        old_u = bch2_alloc_unpack(k);
 
-       if (iter->pos.inode >= c->sb.nr_devices ||
-           !c->devs[iter->pos.inode])
-               return ALLOC_END;
-
        percpu_down_read(&c->mark_lock);
        ca      = bch_dev_bkey_exists(c, iter->pos.inode);
        ba      = bucket_array(ca);
 
-       if (iter->pos.offset >= ba->nbuckets) {
-               percpu_up_read(&c->mark_lock);
-               return ALLOC_END;
-       }
-
        g       = &ba->b[iter->pos.offset];
        m       = READ_ONCE(g->mark);
        new_u   = alloc_mem_to_key(g, m);
        percpu_up_read(&c->mark_lock);
 
        if (!bkey_alloc_unpacked_cmp(old_u, new_u))
-               return ALLOC_NOWROTE;
+               return 0;
 
        a = bkey_alloc_init(&alloc_key.k);
        a->k.p = iter->pos;
@@ -343,50 +328,55 @@ err:
        return ret;
 }
 
-int bch2_alloc_write(struct bch_fs *c, unsigned flags, bool *wrote)
+int bch2_dev_alloc_write(struct bch_fs *c, struct bch_dev *ca, unsigned flags)
 {
        struct btree_trans trans;
        struct btree_iter *iter;
-       struct bch_dev *ca;
-       unsigned i;
+       u64 first_bucket, nbuckets;
        int ret = 0;
 
+       percpu_down_read(&c->mark_lock);
+       first_bucket    = bucket_array(ca)->first_bucket;
+       nbuckets        = bucket_array(ca)->nbuckets;
+       percpu_up_read(&c->mark_lock);
+
        BUG_ON(BKEY_ALLOC_VAL_U64s_MAX > 8);
 
        bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0);
 
-       iter = bch2_trans_get_iter(&trans, BTREE_ID_ALLOC, POS_MIN,
+       iter = bch2_trans_get_iter(&trans, BTREE_ID_ALLOC,
+                                  POS(ca->dev_idx, first_bucket),
                                   BTREE_ITER_SLOTS|BTREE_ITER_INTENT);
 
-       for_each_rw_member(ca, c, i) {
-               unsigned first_bucket;
+       while (iter->pos.offset < nbuckets) {
+               bch2_trans_cond_resched(&trans);
 
-               percpu_down_read(&c->mark_lock);
-               first_bucket = bucket_array(ca)->first_bucket;
-               percpu_up_read(&c->mark_lock);
+               ret = bch2_alloc_write_key(&trans, iter, flags);
+               if (ret)
+                       break;
+               bch2_btree_iter_next_slot(iter);
+       }
 
-               bch2_btree_iter_set_pos(iter, POS(i, first_bucket));
+       bch2_trans_exit(&trans);
 
-               while (1) {
-                       bch2_trans_cond_resched(&trans);
+       return ret;
+}
 
-                       ret = bch2_alloc_write_key(&trans, iter, flags);
-                       if (ret < 0 || ret == ALLOC_END)
-                               break;
-                       if (ret == ALLOC_WROTE)
-                               *wrote = true;
-                       bch2_btree_iter_next_slot(iter);
-               }
+int bch2_alloc_write(struct bch_fs *c, unsigned flags)
+{
+       struct bch_dev *ca;
+       unsigned i;
+       int ret = 0;
 
-               if (ret < 0) {
+       for_each_rw_member(ca, c, i) {
+               ret = bch2_dev_alloc_write(c, ca, flags);
+               if (ret) {
                        percpu_ref_put(&ca->io_ref);
                        break;
                }
        }
 
-       bch2_trans_exit(&trans);
-
-       return ret < 0 ? ret : 0;
+       return ret;
 }
 
 /* Bucket IO clocks: */
index 4f462696b747a88f9f80fd6dc31b74e291cee517..56a846fde8dd8d1855586f16e72a0a84f6beb454 100644 (file)
@@ -93,7 +93,8 @@ void bch2_dev_allocator_quiesce(struct bch_fs *, struct bch_dev *);
 void bch2_dev_allocator_stop(struct bch_dev *);
 int bch2_dev_allocator_start(struct bch_dev *);
 
-int bch2_alloc_write(struct bch_fs *, unsigned, bool *);
+int bch2_dev_alloc_write(struct bch_fs *, struct bch_dev *, unsigned);
+int bch2_alloc_write(struct bch_fs *, unsigned);
 void bch2_fs_allocator_background_init(struct bch_fs *);
 
 #endif /* _BCACHEFS_ALLOC_BACKGROUND_H */
index 2774f10054a928843a24f853092d24e81bf10aef..74012bea7126c196088d22f194ae9f20bb2ccf1d 100644 (file)
@@ -570,6 +570,7 @@ static int bch2_gc_done(struct bch_fs *c,
                        fsck_err(c, _msg ": got %llu, should be %llu"   \
                                , ##__VA_ARGS__, dst->_f, src->_f);     \
                dst->_f = src->_f;                                      \
+               ret = 1;                                                \
        }
 #define copy_stripe_field(_f, _msg, ...)                               \
        if (dst->_f != src->_f) {                                       \
@@ -580,6 +581,7 @@ static int bch2_gc_done(struct bch_fs *c,
                                dst->_f, src->_f);                      \
                dst->_f = src->_f;                                      \
                dst->dirty = true;                                      \
+               ret = 1;                                                \
        }
 #define copy_bucket_field(_f)                                          \
        if (dst->b[b].mark._f != src->b[b].mark._f) {                   \
@@ -590,6 +592,7 @@ static int bch2_gc_done(struct bch_fs *c,
                                bch2_data_types[dst->b[b].mark.data_type],\
                                dst->b[b].mark._f, src->b[b].mark._f);  \
                dst->b[b]._mark._f = src->b[b].mark._f;                 \
+               ret = 1;                                                \
        }
 #define copy_dev_field(_f, _msg, ...)                                  \
        copy_field(_f, "dev %u has wrong " _msg, i, ##__VA_ARGS__)
@@ -1396,7 +1399,7 @@ static int bch2_gc_thread(void *arg)
 #else
                ret = bch2_gc_gens(c);
 #endif
-               if (ret)
+               if (ret < 0)
                        bch_err(c, "btree gc failed: %i", ret);
 
                debug_check_no_locks_held();
index 0b1d0d2c323b106f8fa2cc6619cb088fa98b1177..c6d6f23d3f24b8c2675dd77dc210570d05de4772 100644 (file)
@@ -1448,7 +1448,7 @@ static int __bch2_stripe_write_key(struct btree_trans *trans,
        return 0;
 }
 
-int bch2_stripes_write(struct bch_fs *c, unsigned flags, bool *wrote)
+int bch2_stripes_write(struct bch_fs *c, unsigned flags)
 {
        struct btree_trans trans;
        struct btree_iter *iter;
@@ -1476,8 +1476,6 @@ int bch2_stripes_write(struct bch_fs *c, unsigned flags, bool *wrote)
 
                if (ret)
                        break;
-
-               *wrote = true;
        }
 
        bch2_trans_exit(&trans);
index f8fc3d616cd787d64d954dc2026cdd8e85c95d69..6db16cf768daa40c8c91b8e2523208c146bfeac7 100644 (file)
@@ -156,7 +156,7 @@ void bch2_ec_flush_new_stripes(struct bch_fs *);
 
 struct journal_keys;
 int bch2_stripes_read(struct bch_fs *, struct journal_keys *);
-int bch2_stripes_write(struct bch_fs *, unsigned, bool *);
+int bch2_stripes_write(struct bch_fs *, unsigned);
 
 int bch2_ec_mem_alloc(struct bch_fs *, bool);
 
index 6e829bf0a31f3f1d5254e023e05c53fcd43e94e4..d70fa968db50d95c5fa63ced0c8080e14434d903 100644 (file)
@@ -845,9 +845,11 @@ static int verify_superblock_clean(struct bch_fs *c,
        }
 
        mustfix_fsck_err_on(j->read_clock != clean->read_clock, c,
-                       "superblock read clock doesn't match journal after clean shutdown");
+                       "superblock read clock %u doesn't match journal %u after clean shutdown",
+                       clean->read_clock, j->read_clock);
        mustfix_fsck_err_on(j->write_clock != clean->write_clock, c,
-                       "superblock read clock doesn't match journal after clean shutdown");
+                       "superblock write clock %u doesn't match journal %u after clean shutdown",
+                       clean->write_clock, j->write_clock);
 
        for (i = 0; i < BTREE_ID_NR; i++) {
                char buf1[200], buf2[200];
@@ -961,7 +963,7 @@ int bch2_fs_recovery(struct bch_fs *c)
        const char *err = "cannot allocate memory";
        struct bch_sb_field_clean *clean = NULL;
        u64 journal_seq;
-       bool wrote = false, write_sb = false;
+       bool write_sb = false, need_write_alloc = false;
        int ret;
 
        if (c->sb.clean)
@@ -1090,8 +1092,10 @@ int bch2_fs_recovery(struct bch_fs *c)
                bch_info(c, "starting metadata mark and sweep");
                err = "error in mark and sweep";
                ret = bch2_gc(c, &c->journal_keys, true, true);
-               if (ret)
+               if (ret < 0)
                        goto err;
+               if (ret)
+                       need_write_alloc = true;
                bch_verbose(c, "mark and sweep done");
        }
 
@@ -1101,8 +1105,10 @@ int bch2_fs_recovery(struct bch_fs *c)
                bch_info(c, "starting mark and sweep");
                err = "error in mark and sweep";
                ret = bch2_gc(c, &c->journal_keys, true, false);
-               if (ret)
+               if (ret < 0)
                        goto err;
+               if (ret)
+                       need_write_alloc = true;
                bch_verbose(c, "mark and sweep done");
        }
 
@@ -1126,7 +1132,7 @@ int bch2_fs_recovery(struct bch_fs *c)
                goto err;
        bch_verbose(c, "journal replay done");
 
-       if (!c->opts.nochanges) {
+       if (need_write_alloc && !c->opts.nochanges) {
                /*
                 * note that even when filesystem was clean there might be work
                 * to do here, if we ran gc (because of fsck) which recalculated
@@ -1134,8 +1140,8 @@ int bch2_fs_recovery(struct bch_fs *c)
                 */
                bch_verbose(c, "writing allocation info");
                err = "error writing out alloc info";
-               ret = bch2_stripes_write(c, BTREE_INSERT_LAZY_RW, &wrote) ?:
-                       bch2_alloc_write(c, BTREE_INSERT_LAZY_RW, &wrote);
+               ret = bch2_stripes_write(c, BTREE_INSERT_LAZY_RW) ?:
+                       bch2_alloc_write(c, BTREE_INSERT_LAZY_RW);
                if (ret) {
                        bch_err(c, "error writing alloc info");
                        goto err;
@@ -1281,6 +1287,20 @@ int bch2_fs_initialize(struct bch_fs *c)
        bch2_fs_journal_start(&c->journal, 1, &journal);
        bch2_journal_set_replay_done(&c->journal);
 
+       err = "error going read-write";
+       ret = bch2_fs_read_write_early(c);
+       if (ret)
+               goto err;
+
+       /*
+        * Write out the superblock and journal buckets, now that we can do
+        * btree updates
+        */
+       err = "error writing alloc info";
+       ret = bch2_alloc_write(c, 0);
+       if (ret)
+               goto err;
+
        bch2_inode_init(c, &root_inode, 0, 0,
                        S_IFDIR|S_IRWXU|S_IRUGO|S_IXUGO, 0, NULL);
        root_inode.bi_inum = BCACHEFS_ROOT_INO;
@@ -1289,7 +1309,7 @@ int bch2_fs_initialize(struct bch_fs *c)
        err = "error creating root directory";
        ret = bch2_btree_insert(c, BTREE_ID_INODES,
                                &packed_inode.inode.k_i,
-                               NULL, NULL, BTREE_INSERT_LAZY_RW);
+                               NULL, NULL, 0);
        if (ret)
                goto err;
 
index 85ba96cb2292e39df1f6469984016823903497d7..7656bf632d798fb0135bcb12c34e59febe480ae5 100644 (file)
@@ -176,9 +176,7 @@ struct bch_fs *bch2_uuid_to_fs(__uuid_t uuid)
 static void __bch2_fs_read_only(struct bch_fs *c)
 {
        struct bch_dev *ca;
-       bool wrote = false;
        unsigned i, clean_passes = 0;
-       int ret;
 
        bch2_rebalance_stop(c);
        bch2_copygc_stop(c);
@@ -197,20 +195,6 @@ static void __bch2_fs_read_only(struct bch_fs *c)
        if (!test_bit(BCH_FS_ALLOCATOR_RUNNING, &c->flags))
                goto nowrote_alloc;
 
-       bch_verbose(c, "writing alloc info");
-       /*
-        * This should normally just be writing the bucket read/write clocks:
-        */
-       ret = bch2_stripes_write(c, BTREE_INSERT_NOCHECK_RW, &wrote) ?:
-               bch2_alloc_write(c, BTREE_INSERT_NOCHECK_RW, &wrote);
-       bch_verbose(c, "writing alloc info complete");
-
-       if (ret && !test_bit(BCH_FS_EMERGENCY_RO, &c->flags))
-               bch2_fs_inconsistent(c, "error writing out alloc info %i", ret);
-
-       if (ret)
-               goto nowrote_alloc;
-
        bch_verbose(c, "flushing journal and stopping allocators");
 
        bch2_journal_flush_all_pins(&c->journal);
@@ -1666,6 +1650,11 @@ have_slot:
        bch2_write_super(c);
        mutex_unlock(&c->sb_lock);
 
+       err = "alloc write failed";
+       ret = bch2_dev_alloc_write(c, ca, 0);
+       if (ret)
+               goto err;
+
        if (ca->mi.state == BCH_MEMBER_STATE_RW) {
                err = __bch2_dev_read_write(c, ca);
                if (err)