bcachefs: Various improvements to bch2_alloc_write()
Author:     Kent Overstreet <kent.overstreet@gmail.com>
AuthorDate: Wed, 17 Apr 2019 22:14:46 +0000 (18:14 -0400)
Commit:     Kent Overstreet <kent.overstreet@linux.dev>
CommitDate: Sun, 22 Oct 2023 21:08:21 +0000 (17:08 -0400)
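
Refactoring around how alloc info gets written out:

- bch2_alloc_unpack() now takes a bkey_s_c instead of a struct
  bch_alloc, and unpacks non-alloc keys to zeroed fields, so callers no
  longer have to check the key type themselves.

- bch2_alloc_read() and bch2_stripes_read() call bch2_mark_key()
  directly, dropping the single-use read-key wrappers.

- __bch2_alloc_write_key() is folded into bch2_alloc_write(); on error,
  the old and new versions of the key are now logged field by field,
  unless we already went emergency read-only.

- bch2_invalidate_one_bucket2() builds the new alloc key from the
  in-memory bucket via the new alloc_mem_to_key() helper - the
  allocator has to start before journal replay is finished, so the
  in-memory bucket is what we have to trust.

- Journal write errors no longer halt the journal; bch2_fatal_error()
  already takes the filesystem emergency read-only.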
Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
fs/bcachefs/alloc_background.c
fs/bcachefs/alloc_background.h
fs/bcachefs/buckets.c
fs/bcachefs/ec.c
fs/bcachefs/journal_io.c
fs/bcachefs/super.c

diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c
index 82a68fabdc5f6fd41a006b3acc4b6ca0b4f2b341..25c18b8cd3a694a71d5d736864279af9ebcf3ffa 100644
--- a/fs/bcachefs/alloc_background.c
+++ b/fs/bcachefs/alloc_background.c
@@ -129,15 +129,21 @@ static inline void put_alloc_field(struct bkey_i_alloc *a, void **p,
        *p += bytes;
 }
 
-struct bkey_alloc_unpacked bch2_alloc_unpack(const struct bch_alloc *a)
+struct bkey_alloc_unpacked bch2_alloc_unpack(struct bkey_s_c k)
 {
-       struct bkey_alloc_unpacked ret = { .gen = a->gen };
-       const void *d = a->data;
-       unsigned idx = 0;
+       struct bkey_alloc_unpacked ret = { .gen = 0 };
+
+       if (k.k->type == KEY_TYPE_alloc) {
+               const struct bch_alloc *a = bkey_s_c_to_alloc(k).v;
+               const void *d = a->data;
+               unsigned idx = 0;
+
+               ret.gen = a->gen;
 
 #define x(_name, _bits)        ret._name = get_alloc_field(a, &d, idx++);
-       BCH_ALLOC_FIELDS()
+               BCH_ALLOC_FIELDS()
 #undef  x
+       }
        return ret;
 }
 
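An aside on the BCH_ALLOC_FIELDS() idiom used above, for readers new to
x-macros: each use redefines x() and expands the field list once per
field. A minimal standalone sketch of the pattern, with a hypothetical
two-field list rather than the real bcachefs one:

#include <stdio.h>

/* Hypothetical field list; stands in for BCH_ALLOC_FIELDS(). */
#define EXAMPLE_FIELDS()                \
        x(read_time, 16)                \
        x(dirty_sectors, 16)

/* One expansion declares the struct members... */
struct example_unpacked {
#define x(_name, _bits) unsigned _name;
        EXAMPLE_FIELDS()
#undef  x
};

/* ...another walks the same list to print them. */
static void example_print(const struct example_unpacked *u)
{
#define x(_name, _bits) printf(#_name " = %u\n", u->_name);
        EXAMPLE_FIELDS()
#undef  x
}

int main(void)
{
        struct example_unpacked u = { .read_time = 1, .dirty_sectors = 2 };

        example_print(&u);
        return 0;
}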
@@ -199,66 +205,18 @@ void bch2_alloc_to_text(struct printbuf *out, struct bch_fs *c,
                               get_alloc_field(a.v, &d, i));
 }
 
-static void __alloc_read_key(struct bucket *g, const struct bch_alloc *a)
-{
-       const void *d = a->data;
-       unsigned idx = 0, data_type, dirty_sectors, cached_sectors;
-       struct bucket_mark m;
-
-       g->io_time[READ]        = get_alloc_field(a, &d, idx++);
-       g->io_time[WRITE]       = get_alloc_field(a, &d, idx++);
-       data_type               = get_alloc_field(a, &d, idx++);
-       dirty_sectors           = get_alloc_field(a, &d, idx++);
-       cached_sectors          = get_alloc_field(a, &d, idx++);
-       g->oldest_gen           = get_alloc_field(a, &d, idx++);
-
-       bucket_cmpxchg(g, m, ({
-               m.gen                   = a->gen;
-               m.data_type             = data_type;
-               m.dirty_sectors         = dirty_sectors;
-               m.cached_sectors        = cached_sectors;
-       }));
-
-       g->gen_valid            = 1;
-}
-
-static void __alloc_write_key(struct bkey_i_alloc *a, struct bucket *g,
-                             struct bucket_mark m)
+static inline struct bkey_alloc_unpacked
+alloc_mem_to_key(struct bucket *g, struct bucket_mark m)
 {
-       unsigned idx = 0;
-       void *d = a->v.data;
-
-       a->v.fields     = 0;
-       a->v.gen        = m.gen;
-
-       d = a->v.data;
-       put_alloc_field(a, &d, idx++, g->io_time[READ]);
-       put_alloc_field(a, &d, idx++, g->io_time[WRITE]);
-       put_alloc_field(a, &d, idx++, m.data_type);
-       put_alloc_field(a, &d, idx++, m.dirty_sectors);
-       put_alloc_field(a, &d, idx++, m.cached_sectors);
-       put_alloc_field(a, &d, idx++, g->oldest_gen);
-
-       set_bkey_val_bytes(&a->k, (void *) d - (void *) &a->v);
-}
-
-static void bch2_alloc_read_key(struct bch_fs *c, struct bkey_s_c k)
-{
-       struct bch_dev *ca;
-       struct bkey_s_c_alloc a;
-
-       if (k.k->type != KEY_TYPE_alloc)
-               return;
-
-       a = bkey_s_c_to_alloc(k);
-       ca = bch_dev_bkey_exists(c, a.k->p.inode);
-
-       if (a.k->p.offset >= ca->mi.nbuckets)
-               return;
-
-       percpu_down_read(&c->mark_lock);
-       __alloc_read_key(bucket(ca, a.k->p.offset), a.v);
-       percpu_up_read(&c->mark_lock);
+       return (struct bkey_alloc_unpacked) {
+               .gen            = m.gen,
+               .oldest_gen     = g->oldest_gen,
+               .data_type      = m.data_type,
+               .dirty_sectors  = m.dirty_sectors,
+               .cached_sectors = m.cached_sectors,
+               .read_time      = g->io_time[READ],
+               .write_time     = g->io_time[WRITE],
+       };
 }
 
 int bch2_alloc_read(struct bch_fs *c, struct journal_keys *journal_keys)
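alloc_mem_to_key() is the new common way of snapshotting a bucket's
in-memory state into unpacked-key form; both bch2_alloc_write() and
bch2_invalidate_one_bucket2() below use it with the same locking
pattern, c->mark_lock held for read across the bucket lookup:

percpu_down_read(&c->mark_lock);
g = bucket(ca, b);
m = READ_ONCE(g->mark);        /* one consistent snapshot of the mark */
u = alloc_mem_to_key(g, m);
percpu_up_read(&c->mark_lock);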
@@ -274,7 +232,7 @@ int bch2_alloc_read(struct bch_fs *c, struct journal_keys *journal_keys)
        bch2_trans_init(&trans, c);
 
        for_each_btree_key(&trans, iter, BTREE_ID_ALLOC, POS_MIN, 0, k, ret)
-               bch2_alloc_read_key(c, k);
+               bch2_mark_key(c, k, true, 0, NULL, 0, 0);
 
        ret = bch2_trans_exit(&trans) ?: ret;
        if (ret) {
@@ -284,7 +242,8 @@ int bch2_alloc_read(struct bch_fs *c, struct journal_keys *journal_keys)
 
        for_each_journal_key(*journal_keys, j)
                if (j->btree_id == BTREE_ID_ALLOC)
-                       bch2_alloc_read_key(c, bkey_i_to_s_c(j->k));
+                       bch2_mark_key(c, bkey_i_to_s_c(j->k),
+                                     true, 0, NULL, 0, 0);
 
        percpu_down_write(&c->mark_lock);
        bch2_dev_usage_from_buckets(c);
@@ -352,81 +311,32 @@ err:
        return ret;
 }
 
-static int __bch2_alloc_write_key(struct btree_trans *trans, struct bch_dev *ca,
-                                 size_t b, struct btree_iter *iter,
-                                 unsigned flags)
-{
-       struct bch_fs *c = trans->c;
-#if 0
-       __BKEY_PADDED(k, BKEY_ALLOC_VAL_U64s_MAX) alloc_key;
-#else
-       /* hack: */
-       __BKEY_PADDED(k, 8) alloc_key;
-#endif
-       struct bkey_i_alloc *a = bkey_alloc_init(&alloc_key.k);
-       struct bucket *g;
-       struct bucket_mark m, new;
-       int ret;
-
-       BUG_ON(BKEY_ALLOC_VAL_U64s_MAX > 8);
-
-       a->k.p = POS(ca->dev_idx, b);
-
-       bch2_btree_iter_set_pos(iter, a->k.p);
-
-       ret = bch2_btree_iter_traverse(iter);
-       if (ret)
-               return ret;
-
-       percpu_down_read(&c->mark_lock);
-       g = bucket(ca, b);
-       m = READ_ONCE(g->mark);
-
-       if (!m.dirty) {
-               percpu_up_read(&c->mark_lock);
-               return 0;
-       }
-
-       __alloc_write_key(a, g, m);
-       percpu_up_read(&c->mark_lock);
-
-       bch2_trans_update(trans, BTREE_INSERT_ENTRY(iter, &a->k_i));
-
-       ret = bch2_trans_commit(trans, NULL, NULL,
-                               BTREE_INSERT_NOFAIL|
-                               BTREE_INSERT_NOMARK|
-                               flags);
-       if (ret)
-               return ret;
-
-       new = m;
-       new.dirty = false;
-       atomic64_cmpxchg(&g->_mark.v, m.v.counter, new.v.counter);
-
-       if (ca->buckets_written)
-               set_bit(b, ca->buckets_written);
-
-       return 0;
-}
-
 int bch2_alloc_write(struct bch_fs *c, unsigned flags, bool *wrote)
 {
        struct btree_trans trans;
        struct btree_iter *iter;
        struct bucket_array *buckets;
        struct bch_dev *ca;
+       struct bucket *g;
+       struct bucket_mark m, new;
+       struct bkey_alloc_unpacked old_u, new_u;
+       __BKEY_PADDED(k, 8) alloc_key; /* hack: */
+       struct bkey_i_alloc *a;
+       struct bkey_s_c k;
        unsigned i;
        size_t b;
        int ret = 0;
 
+       BUG_ON(BKEY_ALLOC_VAL_U64s_MAX > 8);
+
        bch2_trans_init(&trans, c);
 
        iter = bch2_trans_get_iter(&trans, BTREE_ID_ALLOC, POS_MIN,
                                   BTREE_ITER_SLOTS|BTREE_ITER_INTENT);
 
        for_each_rw_member(ca, c, i) {
-relock:
                down_read(&ca->bucket_lock);
+restart:
                buckets = bucket_array(ca);
 
                for (b = buckets->first_bucket;
@@ -435,27 +345,70 @@ relock:
                        if (!buckets->b[b].mark.dirty)
                                continue;
 
+                       bch2_btree_iter_set_pos(iter, POS(i, b));
+                       k = bch2_btree_iter_peek_slot(iter);
+                       ret = bkey_err(k);
+                       if (ret)
+                               goto err;
+
+                       old_u = bch2_alloc_unpack(k);
+
+                       percpu_down_read(&c->mark_lock);
+                       g       = bucket(ca, b);
+                       m       = READ_ONCE(g->mark);
+                       new_u   = alloc_mem_to_key(g, m);
+                       percpu_up_read(&c->mark_lock);
+
+                       if (!m.dirty)
+                               continue;
+
                        if ((flags & BTREE_INSERT_LAZY_RW) &&
                            percpu_ref_is_zero(&c->writes)) {
                                up_read(&ca->bucket_lock);
                                bch2_trans_unlock(&trans);
 
                                ret = bch2_fs_read_write_early(c);
+                               down_read(&ca->bucket_lock);
+
                                if (ret)
-                                       goto out;
-                               goto relock;
+                                       goto err;
+                               goto restart;
                        }
 
-                       ret = __bch2_alloc_write_key(&trans, ca, b,
-                                                    iter, flags);
+                       a = bkey_alloc_init(&alloc_key.k);
+                       a->k.p = iter->pos;
+                       bch2_alloc_pack(a, new_u);
+
+                       bch2_trans_update(&trans, BTREE_INSERT_ENTRY(iter, &a->k_i));
+                       ret = bch2_trans_commit(&trans, NULL, NULL,
+                                               BTREE_INSERT_NOFAIL|
+                                               BTREE_INSERT_NOMARK|
+                                               flags);
+err:
+                       if (ret && !test_bit(BCH_FS_EMERGENCY_RO, &c->flags)) {
+                               bch_err(c, "error %i writing alloc info", ret);
+                               printk(KERN_CONT "dev %llu bucket %llu\n",
+                                      iter->pos.inode, iter->pos.offset);
+                               printk(KERN_CONT "gen %u -> %u\n", old_u.gen, new_u.gen);
+#define x(_name, _bits)                printk(KERN_CONT #_name " %u -> %u\n", old_u._name, new_u._name);
+                               BCH_ALLOC_FIELDS()
+#undef  x
+                       }
                        if (ret)
                                break;
 
+                       new = m;
+                       new.dirty = false;
+                       atomic64_cmpxchg(&g->_mark.v, m.v.counter, new.v.counter);
+
+                       if (ca->buckets_written)
+                               set_bit(b, ca->buckets_written);
+
                        bch2_trans_cond_resched(&trans);
                        *wrote = true;
                }
                up_read(&ca->bucket_lock);
-out:
+
                if (ret) {
                        percpu_ref_put(&ca->io_ref);
                        break;
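A note on the lockless dirty-clearing at the bottom of the loop above:
the dirty bit is cleared with a compare-and-swap against the exact mark
we wrote out, so if the bucket was redirtied concurrently the cmpxchg
simply fails and the next writeout pass picks the bucket up again -
which is why the return value is deliberately ignored. The same idiom
in isolation, with a hypothetical mark layout and the GCC/Clang __sync
builtin standing in for the kernel's atomic64_cmpxchg():

#include <stdbool.h>
#include <stdint.h>

/* Hypothetical stand-in for struct bucket_mark: flags packed in a u64. */
union mark {
        uint64_t counter;
        struct {
                uint64_t dirty:1;
                uint64_t gen:8;
        };
};

/*
 * Clear the dirty bit only if the mark is still exactly the snapshot we
 * wrote out; if anything changed in between, leave it dirty so the next
 * pass writes the bucket out again.
 */
static void clear_dirty_if_unchanged(uint64_t *v, union mark snapshot)
{
        union mark new = snapshot;

        new.dirty = false;
        (void) __sync_val_compare_and_swap(v, snapshot.counter, new.counter);
}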
@@ -922,6 +875,7 @@ static int bch2_invalidate_one_bucket2(struct btree_trans *trans,
        struct bch_fs *c = trans->c;
        struct bkey_i_alloc *a;
        struct bkey_alloc_unpacked u;
+       struct bucket *g;
        struct bucket_mark m;
        struct bkey_s_c k;
        bool invalidating_cached_data;
@@ -941,7 +895,6 @@ static int bch2_invalidate_one_bucket2(struct btree_trans *trans,
        BUG_ON(!fifo_push(&ca->free_inc, b));
 
        bch2_mark_alloc_bucket(c, ca, b, true, gc_pos_alloc(c, NULL), 0);
-       m = bucket(ca, b)->mark;
 
        spin_unlock(&c->freelist_lock);
        percpu_up_read(&c->mark_lock);
@@ -955,27 +908,26 @@ retry:
        if (ret)
                return ret;
 
-       if (k.k && k.k->type == KEY_TYPE_alloc)
-               u = bch2_alloc_unpack(bkey_s_c_to_alloc(k).v);
-       else
-               memset(&u, 0, sizeof(u));
+       /*
+        * The allocator has to start before journal replay is finished - thus,
+        * we have to trust the in memory bucket @m, not the version in the
+        * btree:
+        */
+       percpu_down_read(&c->mark_lock);
+       g = bucket(ca, b);
+       m = READ_ONCE(g->mark);
+       u = alloc_mem_to_key(g, m);
+       percpu_up_read(&c->mark_lock);
 
        invalidating_cached_data = m.cached_sectors != 0;
 
+       u.gen++;
        u.data_type     = 0;
        u.dirty_sectors = 0;
        u.cached_sectors = 0;
        u.read_time     = c->bucket_clock[READ].hand;
        u.write_time    = c->bucket_clock[WRITE].hand;
 
-       /*
-        * The allocator has to start before journal replay is finished - thus,
-        * we have to trust the in memory bucket @m, not the version in the
-        * btree:
-        */
-       //BUG_ON(u.dirty_sectors);
-       u.gen           = m.gen + 1;
-
        a = bkey_alloc_init(&alloc_key.k);
        a->k.p = iter->pos;
        bch2_alloc_pack(a, u);
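The u.gen++ above increments an 8-bit generation number that is allowed
to wrap; staleness checks like gen_after() in the buckets.c hunk below
therefore compare generations modulo 256. A sketch of that comparison,
reconstructed to match how bcachefs's gen_cmp()/gen_after() behave
rather than copied from it:

#include <stdbool.h>
#include <stdint.h>

/*
 * Compare 8-bit generations modulo 256: the difference is reinterpreted
 * as signed, so generations up to 127 apart order correctly even across
 * wraparound - e.g. gen_after(1, 255) is true.
 */
static inline bool gen_after(uint8_t a, uint8_t b)
{
        return (int8_t) (a - b) > 0;
}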
diff --git a/fs/bcachefs/alloc_background.h b/fs/bcachefs/alloc_background.h
index 02354c80a102ae3549eb2c7120112309bc2d9dfa..0c1a0f0dd2ab558835474ec18a65b9d21a596a00 100644
--- a/fs/bcachefs/alloc_background.h
+++ b/fs/bcachefs/alloc_background.h
@@ -13,7 +13,7 @@ struct bkey_alloc_unpacked {
 #undef  x
 };
 
-struct bkey_alloc_unpacked bch2_alloc_unpack(const struct bch_alloc *);
+struct bkey_alloc_unpacked bch2_alloc_unpack(struct bkey_s_c);
 void bch2_alloc_pack(struct bkey_i_alloc *,
                     const struct bkey_alloc_unpacked);
 
diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c
index 6d04474f0e3a2776ead4f1b27ffc1e69c33b3086..2479ad37775ba2fe8d7b080c41de208acde2262b 100644
--- a/fs/bcachefs/buckets.c
+++ b/fs/bcachefs/buckets.c
@@ -649,9 +649,13 @@ static int bch2_mark_alloc(struct bch_fs *c, struct bkey_s_c k,
        if (flags & BCH_BUCKET_MARK_GC)
                return 0;
 
-       u = bch2_alloc_unpack(bkey_s_c_to_alloc(k).v);
        ca = bch_dev_bkey_exists(c, k.k->p.inode);
+
+       if (k.k->p.offset >= ca->mi.nbuckets)
+               return 0;
+
        g = __bucket(ca, k.k->p.offset, gc);
+       u = bch2_alloc_unpack(k);
 
        old = bucket_data_cmpxchg(c, ca, fs_usage, g, m, ({
                m.gen                   = u.gen;
@@ -1381,7 +1385,7 @@ static int bch2_trans_mark_pointer(struct btree_trans *trans,
                goto out;
        }
 
-       u = bch2_alloc_unpack(bkey_s_c_to_alloc(k).v);
+       u = bch2_alloc_unpack(k);
 
        if (gen_after(u.gen, p.ptr.gen)) {
                ret = 1;
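This hunk shows the caller-side payoff of the new bch2_alloc_unpack()
signature; the type check and memset that the old interface forced on
callers (visible being deleted in bch2_invalidate_one_bucket2() above)
are gone. Side by side, simplified:

/* Before: every caller checked the key type itself. */
if (k.k && k.k->type == KEY_TYPE_alloc)
        u = bch2_alloc_unpack(bkey_s_c_to_alloc(k).v);
else
        memset(&u, 0, sizeof(u));

/* After: non-alloc keys unpack to zeroed fields. */
u = bch2_alloc_unpack(k);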
diff --git a/fs/bcachefs/ec.c b/fs/bcachefs/ec.c
index 6761b5c24a12a3b9882955e117e9ec179e40e589..07245717ca4e8df5b3465cf7b047ba37f4ee882a 100644
--- a/fs/bcachefs/ec.c
+++ b/fs/bcachefs/ec.c
@@ -1234,11 +1234,6 @@ int bch2_stripes_write(struct bch_fs *c, unsigned flags, bool *wrote)
        return ret;
 }
 
-static void bch2_stripe_read_key(struct bch_fs *c, struct bkey_s_c k)
-{
-       bch2_mark_key(c, k, true, 0, NULL, 0, 0);
-}
-
 int bch2_stripes_read(struct bch_fs *c, struct journal_keys *journal_keys)
 {
        struct journal_key *i;
@@ -1254,7 +1249,7 @@ int bch2_stripes_read(struct bch_fs *c, struct journal_keys *journal_keys)
        bch2_trans_init(&trans, c);
 
        for_each_btree_key(&trans, iter, BTREE_ID_EC, POS_MIN, 0, k, ret)
-               bch2_stripe_read_key(c, k);
+               bch2_mark_key(c, k, true, 0, NULL, 0, 0);
 
        ret = bch2_trans_exit(&trans) ?: ret;
        if (ret) {
@@ -1264,7 +1259,8 @@ int bch2_stripes_read(struct bch_fs *c, struct journal_keys *journal_keys)
 
        for_each_journal_key(*journal_keys, i)
                if (i->btree_id == BTREE_ID_EC)
-                       bch2_stripe_read_key(c, bkey_i_to_s_c(i->k));
+                       bch2_mark_key(c, bkey_i_to_s_c(i->k),
+                                     true, 0, NULL, 0, 0);
 
        return 0;
 }
diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c
index 4fd7b048050b802f7cd8c06723195cd45ab6d8d5..4e0c63f0076fc1079f7c834c6430080a8e5674f8 100644
--- a/fs/bcachefs/journal_io.c
+++ b/fs/bcachefs/journal_io.c
@@ -947,7 +947,6 @@ out:
        return;
 err:
        bch2_fatal_error(c);
-       bch2_journal_halt(j);
        spin_lock(&j->lock);
        goto out;
 }
@@ -1059,7 +1058,6 @@ void bch2_journal_write(struct closure *cl)
        spin_unlock(&j->lock);
 
        if (ret) {
-               bch2_journal_halt(j);
                bch_err(c, "Unable to allocate journal write");
                bch2_fatal_error(c);
                continue_at(cl, journal_write_done, system_highpri_wq);
diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c
index 959638c986a08eea3b91e126a707185671265c40..8f25c1d9b8cb704e85e058896c5495c26d7251f3 100644
--- a/fs/bcachefs/super.c
+++ b/fs/bcachefs/super.c
@@ -198,17 +198,14 @@ static void __bch2_fs_read_only(struct bch_fs *c)
        do {
                wrote = false;
 
-               ret = bch2_stripes_write(c, BTREE_INSERT_NOCHECK_RW, &wrote);
-               if (ret) {
-                       bch2_fs_inconsistent(c, "error writing out stripes");
-                       break;
-               }
+               ret = bch2_stripes_write(c, BTREE_INSERT_NOCHECK_RW, &wrote) ?:
+                       bch2_alloc_write(c, BTREE_INSERT_NOCHECK_RW, &wrote);
 
-               ret = bch2_alloc_write(c, BTREE_INSERT_NOCHECK_RW, &wrote);
-               if (ret) {
+               if (ret && !test_bit(BCH_FS_EMERGENCY_RO, &c->flags))
                        bch2_fs_inconsistent(c, "error writing out alloc info %i", ret);
+
+               if (ret)
                        break;
-               }
 
                for_each_member_device(ca, c, i)
                        bch2_dev_allocator_quiesce(c, ca);
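The chaining here (bch2_alloc_read() and bch2_stripes_read() use it
too) relies on the GNU C `a ?: b` extension: a is evaluated once, and b
only if a was zero. Chained, each step runs only if everything before
it succeeded, and the expression yields the first nonzero error code.
In isolation:

static int step_one(void)   { return 0; }
static int step_two(void)   { return -5; }
static int step_three(void) { return 0; }

int run_steps(void)
{
        /*
         * GNU C: `a ?: b` is `a ? a : b` with `a` evaluated once.  Here
         * step_three() never runs, and the result is -5 from step_two().
         */
        return step_one() ?: step_two() ?: step_three();
}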