bcachefs: Account for stripe parity sectors separately
author Kent Overstreet <kent.overstreet@gmail.com>
Thu, 9 Jul 2020 22:31:51 +0000 (18:31 -0400)
committer Kent Overstreet <kent.overstreet@linux.dev>
Sun, 22 Oct 2023 21:08:45 +0000 (17:08 -0400)
Instead of trying to charge EC parity to the data within the stripe
(which is subject to rounding errors), let's charge it to the stripe
itself. Charging for parity blocks up front should also make -ENOSPC
issues easier to deal with, and it means we can make more fine-grained
accounting available to the user.
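
To illustrate (a toy sketch, not bcachefs code: the struct and helpers
below are made up, and the "old" formula is only a rough simplification
of what __ptr_disk_sectors_delta() was doing):

#include <stdint.h>
#include <stdio.h>

/* Made-up stand-in mirroring the bch_stripe fields the patch uses: */
struct toy_stripe {
	unsigned	nr_blocks;	/* data + parity blocks */
	unsigned	nr_redundant;	/* parity blocks */
	uint16_t	sectors;	/* sectors per block */
};

/* New scheme: parity is charged to the stripe itself, exactly once */
static int64_t stripe_parity_sectors(const struct toy_stripe *s)
{
	return (int64_t) s->sectors * s->nr_redundant;
}

/* Old scheme, roughly: every extent write charged a prorated share of
 * the parity, which truncates whenever write * nr_redundant isn't a
 * multiple of nr_data */
static int64_t old_parity_share(const struct toy_stripe *s, int64_t write_sectors)
{
	unsigned nr_data = s->nr_blocks - s->nr_redundant;

	return write_sectors * s->nr_redundant / nr_data;
}

int main(void)
{
	/* 5 block stripe: 3 data + 2 parity, 128 sectors per block */
	struct toy_stripe s = { .nr_blocks = 5, .nr_redundant = 2, .sectors = 128 };

	/* Charged up front, once: 256 sectors */
	printf("parity charged to stripe: %lld\n",
	       (long long) stripe_parity_sectors(&s));

	/* Filling the 384 data sectors with writes of 100+100+100+84
	 * charges 66+66+66+56 = 254 sectors of parity: off by two */
	printf("old per-write shares: %lld %lld %lld %lld\n",
	       (long long) old_parity_share(&s, 100),
	       (long long) old_parity_share(&s, 100),
	       (long long) old_parity_share(&s, 100),
	       (long long) old_parity_share(&s, 84));
	return 0;
}

With the charge known up front, a disk reservation of blocksize *
nr_parity can be taken when the stripe is created, and the sectors show
up under the new BCH_DATA_parity type.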

Signed-off-by: Kent Overstreet <kent.overstreet@gmail.com>
Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
fs/bcachefs/bcachefs_format.h
fs/bcachefs/buckets.c
fs/bcachefs/ec.c
fs/bcachefs/ec.h
fs/bcachefs/replicas.c

index a5b0c308fc46712ff16e8925de291c2afb64b62a..5465acd9cbe8892adf2aa67a2d6efb567aaacf7c 100644 (file)
@@ -1036,7 +1036,8 @@ LE64_BITMASK(BCH_KDF_SCRYPT_P,    struct bch_sb_field_crypt, kdf_flags, 32, 48);
        x(journal,      2)              \
        x(btree,        3)              \
        x(user,         4)              \
-       x(cached,       5)
+       x(cached,       5)              \
+       x(parity,       6)
 
 enum bch_data_type {
 #define x(t, n) BCH_DATA_##t,
index 80d11decb71ebb871ea7e3d719ed34bdd26d33b3..2277143b1890762c3742960570111601265edc4a 100644 (file)
 
 #include <linux/preempt.h>
 
+static inline void fs_usage_data_type_to_base(struct bch_fs_usage *fs_usage,
+                                             enum bch_data_type data_type,
+                                             s64 sectors)
+{
+       switch (data_type) {
+       case BCH_DATA_btree:
+               fs_usage->btree         += sectors;
+               break;
+       case BCH_DATA_user:
+       case BCH_DATA_parity:
+               fs_usage->data          += sectors;
+               break;
+       case BCH_DATA_cached:
+               fs_usage->cached        += sectors;
+               break;
+       default:
+               break;
+       }
+}
+
 /*
  * Clear journal_seq_valid for buckets for which it's not needed, to prevent
  * wraparound:
@@ -132,17 +152,7 @@ void bch2_fs_usage_initialize(struct bch_fs *c)
                struct bch_replicas_entry *e =
                        cpu_replicas_entry(&c->replicas, i);
 
-               switch (e->data_type) {
-               case BCH_DATA_btree:
-                       usage->btree    += usage->replicas[i];
-                       break;
-               case BCH_DATA_user:
-                       usage->data     += usage->replicas[i];
-                       break;
-               case BCH_DATA_cached:
-                       usage->cached   += usage->replicas[i];
-                       break;
-               }
+               fs_usage_data_type_to_base(usage, e->data_type, usage->replicas[i]);
        }
 
        percpu_up_write(&c->mark_lock);
@@ -374,9 +384,14 @@ static inline int is_fragmented_bucket(struct bucket_mark m,
        return 0;
 }
 
+static inline int is_stripe_data_bucket(struct bucket_mark m)
+{
+       return m.stripe && m.data_type != BCH_DATA_parity;
+}
+
 static inline int bucket_stripe_sectors(struct bucket_mark m)
 {
-       return m.stripe ? m.dirty_sectors : 0;
+       return is_stripe_data_bucket(m) ? m.dirty_sectors : 0;
 }
 
 static inline enum bch_data_type bucket_type(struct bucket_mark m)
@@ -520,17 +535,7 @@ static inline int update_replicas(struct bch_fs *c,
        if (!fs_usage)
                return 0;
 
-       switch (r->data_type) {
-       case BCH_DATA_btree:
-               fs_usage->btree         += sectors;
-               break;
-       case BCH_DATA_user:
-               fs_usage->data          += sectors;
-               break;
-       case BCH_DATA_cached:
-               fs_usage->cached        += sectors;
-               break;
-       }
+       fs_usage_data_type_to_base(fs_usage, r->data_type, sectors);
        fs_usage->replicas[idx]         += sectors;
        return 0;
 }
@@ -958,12 +963,15 @@ static int check_bucket_ref(struct bch_fs *c, struct bkey_s_c k,
 }
 
 static int bucket_set_stripe(struct bch_fs *c, struct bkey_s_c k,
-                            const struct bch_extent_ptr *ptr,
+                            unsigned ptr_idx,
                             struct bch_fs_usage *fs_usage,
-                            u64 journal_seq,
-                            unsigned flags,
+                            u64 journal_seq, unsigned flags,
                             bool enabled)
 {
+       const struct bch_stripe *s = bkey_s_c_to_stripe(k).v;
+       unsigned nr_data = s->nr_blocks - s->nr_redundant;
+       bool parity = ptr_idx >= nr_data;
+       const struct bch_extent_ptr *ptr = s->ptrs + ptr_idx;
        bool gc = flags & BTREE_TRIGGER_GC;
        struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev);
        struct bucket *g = PTR_BUCKET(ca, ptr, gc);
@@ -990,6 +998,12 @@ static int bucket_set_stripe(struct bch_fs *c, struct bkey_s_c k,
                                      (bch2_bkey_val_to_text(&PBUF(buf), c, k), buf));
 
                new.stripe                      = enabled;
+
+               if ((flags & BTREE_TRIGGER_GC) && parity) {
+                       new.data_type = enabled ? BCH_DATA_parity : 0;
+                       new.dirty_sectors = enabled ? le16_to_cpu(s->sectors) : 0;
+               }
+
                if (journal_seq) {
                        new.journal_seq_valid   = 1;
                        new.journal_seq         = journal_seq;
@@ -1074,12 +1088,10 @@ static int bch2_mark_stripe_ptr(struct bch_fs *c,
                                struct bch_extent_stripe_ptr p,
                                enum bch_data_type data_type,
                                struct bch_fs_usage *fs_usage,
-                               s64 sectors, unsigned flags,
-                               struct bch_replicas_padded *r,
-                               unsigned *nr_data,
-                               unsigned *nr_parity)
+                               s64 sectors, unsigned flags)
 {
        bool gc = flags & BTREE_TRIGGER_GC;
+       struct bch_replicas_padded r;
        struct stripe *m;
        unsigned i, blocks_nonempty = 0;
 
@@ -1094,14 +1106,10 @@ static int bch2_mark_stripe_ptr(struct bch_fs *c,
                return -EIO;
        }
 
-       BUG_ON(m->r.e.data_type != data_type);
-
-       *nr_data        = m->nr_blocks - m->nr_redundant;
-       *nr_parity      = m->nr_redundant;
-       *r = m->r;
-
        m->block_sectors[p.block] += sectors;
 
+       r = m->r;
+
        for (i = 0; i < m->nr_blocks; i++)
                blocks_nonempty += m->block_sectors[i] != 0;
 
@@ -1113,6 +1121,9 @@ static int bch2_mark_stripe_ptr(struct bch_fs *c,
 
        spin_unlock(&c->ec_stripes_heap_lock);
 
+       r.e.data_type = data_type;
+       update_replicas(c, fs_usage, &r.e, sectors);
+
        return 0;
 }
 
@@ -1158,25 +1169,11 @@ static int bch2_mark_extent(struct bch_fs *c,
                        dirty_sectors          += disk_sectors;
                        r.e.devs[r.e.nr_devs++] = p.ptr.dev;
                } else {
-                       struct bch_replicas_padded ec_r;
-                       unsigned nr_data, nr_parity;
-                       s64 parity_sectors;
-
                        ret = bch2_mark_stripe_ptr(c, p.ec, data_type,
-                                       fs_usage, disk_sectors, flags,
-                                       &ec_r, &nr_data, &nr_parity);
+                                       fs_usage, disk_sectors, flags);
                        if (ret)
                                return ret;
 
-                       parity_sectors =
-                               __ptr_disk_sectors_delta(p.crc.live_size,
-                                       offset, sectors, flags,
-                                       p.crc.compressed_size * nr_parity,
-                                       p.crc.uncompressed_size * nr_data);
-
-                       update_replicas(c, fs_usage, &ec_r.e,
-                                       disk_sectors + parity_sectors);
-
                        /*
                         * There may be other dirty pointers in this extent, but
                         * if so they're not required for mounting if we have an
@@ -1216,7 +1213,7 @@ static int bch2_mark_stripe(struct bch_fs *c,
        if (!new_s) {
                /* Deleting: */
                for (i = 0; i < old_s->nr_blocks; i++) {
-                       ret = bucket_set_stripe(c, old, old_s->ptrs + i, fs_usage,
+                       ret = bucket_set_stripe(c, old, i, fs_usage,
                                                journal_seq, flags, false);
                        if (ret)
                                return ret;
@@ -1228,6 +1225,10 @@ static int bch2_mark_stripe(struct bch_fs *c,
                        spin_unlock(&c->ec_stripes_heap_lock);
                }
 
+               if (gc)
+                       update_replicas(c, fs_usage, &m->r.e,
+                                       -((s64) m->sectors * m->nr_redundant));
+
                memset(m, 0, sizeof(*m));
        } else {
                BUG_ON(old_s && new_s->nr_blocks != old_s->nr_blocks);
@@ -1240,12 +1241,12 @@ static int bch2_mark_stripe(struct bch_fs *c,
                                   sizeof(struct bch_extent_ptr))) {
 
                                if (old_s) {
-                                       bucket_set_stripe(c, old, old_s->ptrs + i, fs_usage,
+                                       bucket_set_stripe(c, old, i, fs_usage,
                                                          journal_seq, flags, false);
                                        if (ret)
                                                return ret;
                                }
-                               ret = bucket_set_stripe(c, new, new_s->ptrs + i, fs_usage,
+                               ret = bucket_set_stripe(c, new, i, fs_usage,
                                                        journal_seq, flags, true);
                                if (ret)
                                        return ret;
@@ -1258,8 +1259,16 @@ static int bch2_mark_stripe(struct bch_fs *c,
                m->nr_blocks    = new_s->nr_blocks;
                m->nr_redundant = new_s->nr_redundant;
 
+               if (gc && old_s)
+                       update_replicas(c, fs_usage, &m->r.e,
+                                       -((s64) m->sectors * m->nr_redundant));
+
                bch2_bkey_to_replicas(&m->r.e, new);
 
+               if (gc)
+                       update_replicas(c, fs_usage, &m->r.e,
+                                       ((s64) m->sectors * m->nr_redundant));
+
                /* gc recalculates these fields: */
                if (!(flags & BTREE_TRIGGER_GC)) {
                        m->blocks_nonempty = 0;
@@ -1648,15 +1657,13 @@ out:
 
 static int bch2_trans_mark_stripe_ptr(struct btree_trans *trans,
                        struct bch_extent_stripe_ptr p,
-                       s64 sectors, enum bch_data_type data_type,
-                       struct bch_replicas_padded *r,
-                       unsigned *nr_data,
-                       unsigned *nr_parity)
+                       s64 sectors, enum bch_data_type data_type)
 {
        struct bch_fs *c = trans->c;
        struct btree_iter *iter;
        struct bkey_s_c k;
        struct bkey_i_stripe *s;
+       struct bch_replicas_padded r;
        int ret = 0;
 
        ret = trans_get_key(trans, BTREE_ID_EC, POS(0, p.idx), &iter, &k);
@@ -1677,15 +1684,14 @@ static int bch2_trans_mark_stripe_ptr(struct btree_trans *trans,
                goto out;
 
        bkey_reassemble(&s->k_i, k);
-
        stripe_blockcount_set(&s->v, p.block,
                stripe_blockcount_get(&s->v, p.block) +
                sectors);
-
-       *nr_data        = s->v.nr_blocks - s->v.nr_redundant;
-       *nr_parity      = s->v.nr_redundant;
-       bch2_bkey_to_replicas(&r->e, bkey_i_to_s_c(&s->k_i));
        bch2_trans_update(trans, iter, &s->k_i, 0);
+
+       bch2_bkey_to_replicas(&r.e, bkey_i_to_s_c(&s->k_i));
+       r.e.data_type = data_type;
+       update_replicas_list(trans, &r.e, sectors);
 out:
        bch2_trans_iter_put(trans, iter);
        return ret;
@@ -1730,25 +1736,11 @@ static int bch2_trans_mark_extent(struct btree_trans *trans,
                        dirty_sectors          += disk_sectors;
                        r.e.devs[r.e.nr_devs++] = p.ptr.dev;
                } else {
-                       struct bch_replicas_padded ec_r;
-                       unsigned nr_data, nr_parity;
-                       s64 parity_sectors;
-
                        ret = bch2_trans_mark_stripe_ptr(trans, p.ec,
-                                       disk_sectors, data_type,
-                                       &ec_r, &nr_data, &nr_parity);
+                                       disk_sectors, data_type);
                        if (ret)
                                return ret;
 
-                       parity_sectors =
-                               __ptr_disk_sectors_delta(p.crc.live_size,
-                                       offset, sectors, flags,
-                                       p.crc.compressed_size * nr_parity,
-                                       p.crc.uncompressed_size * nr_data);
-
-                       update_replicas_list(trans, &ec_r.e,
-                                            disk_sectors + parity_sectors);
-
                        r.e.nr_required = 0;
                }
        }
@@ -1760,15 +1752,26 @@ static int bch2_trans_mark_extent(struct btree_trans *trans,
 }
 
 static int bch2_trans_mark_stripe(struct btree_trans *trans,
-                                 struct bkey_s_c k)
+                                 struct bkey_s_c k,
+                                 unsigned flags)
 {
        const struct bch_stripe *s = bkey_s_c_to_stripe(k).v;
+       unsigned nr_data = s->nr_blocks - s->nr_redundant;
+       struct bch_replicas_padded r;
        struct bkey_alloc_unpacked u;
        struct bkey_i_alloc *a;
        struct btree_iter *iter;
+       bool deleting = flags & BTREE_TRIGGER_OVERWRITE;
+       s64 sectors = le16_to_cpu(s->sectors);
        unsigned i;
        int ret = 0;
 
+       if (deleting)
+               sectors = -sectors;
+
+       bch2_bkey_to_replicas(&r.e, k);
+       update_replicas_list(trans, &r.e, sectors * s->nr_redundant);
+
        /*
         * The allocator code doesn't necessarily update bucket gens in the
         * btree when incrementing them, right before handing out new buckets -
@@ -1776,11 +1779,20 @@ static int bch2_trans_mark_stripe(struct btree_trans *trans,
         */
 
        for (i = 0; i < s->nr_blocks && !ret; i++) {
+               bool parity = i >= nr_data;
+
                ret = bch2_trans_start_alloc_update(trans, &iter,
                                                    &s->ptrs[i], &u);
                if (ret)
                        break;
 
+               if (parity) {
+                       u.dirty_sectors += sectors;
+                       u.data_type = u.dirty_sectors
+                               ? BCH_DATA_parity
+                               : 0;
+               }
+
                a = bch2_trans_kmalloc(trans, BKEY_ALLOC_U64s_MAX * 8);
                ret = PTR_ERR_OR_ZERO(a);
                if (ret)
@@ -1897,7 +1909,7 @@ int bch2_trans_mark_key(struct btree_trans *trans, struct bkey_s_c k,
                return bch2_trans_mark_extent(trans, k, offset, sectors,
                                              flags, BCH_DATA_user);
        case KEY_TYPE_stripe:
-               return bch2_trans_mark_stripe(trans, k);
+               return bch2_trans_mark_stripe(trans, k, flags);
        case KEY_TYPE_inode:
                d = replicas_deltas_realloc(trans, 0);
 
index c6d6f23d3f24b8c2675dd77dc210570d05de4772..e5033b392432559722e38c2a05c60b78bc333c94 100644 (file)
@@ -343,12 +343,17 @@ static void ec_block_io(struct bch_fs *c, struct ec_stripe_buf *buf,
        unsigned offset = 0, bytes = buf->size << 9;
        struct bch_extent_ptr *ptr = &v->ptrs[idx];
        struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev);
+       enum bch_data_type data_type = idx < buf->key.v.nr_blocks - buf->key.v.nr_redundant
+               ? BCH_DATA_user
+               : BCH_DATA_parity;
 
        if (!bch2_dev_get_ioref(ca, rw)) {
                clear_bit(idx, buf->valid);
                return;
        }
 
+       this_cpu_add(ca->io_done->sectors[rw][data_type], buf->size);
+
        while (offset < bytes) {
                unsigned nr_iovecs = min_t(size_t, BIO_MAX_VECS,
                                           DIV_ROUND_UP(bytes, PAGE_SIZE));
@@ -670,6 +675,7 @@ static void ec_stripe_delete_work(struct work_struct *work)
 /* stripe creation: */
 
 static int ec_stripe_bkey_insert(struct bch_fs *c,
+                                struct ec_stripe_new *s,
                                 struct bkey_i_stripe *stripe)
 {
        struct btree_trans trans;
@@ -711,7 +717,7 @@ found_slot:
 
        bch2_trans_update(&trans, iter, &stripe->k_i, 0);
 
-       ret = bch2_trans_commit(&trans, NULL, NULL,
+       ret = bch2_trans_commit(&trans, &s->res, NULL,
                                BTREE_INSERT_NOFAIL);
 err:
        bch2_trans_iter_put(&trans, iter);
@@ -858,8 +864,8 @@ static void ec_stripe_create(struct ec_stripe_new *s)
 
        ret = s->existing_stripe
                ? bch2_btree_insert(c, BTREE_ID_EC, &s->stripe.key.k_i,
-                                   NULL, NULL, BTREE_INSERT_NOFAIL)
-               : ec_stripe_bkey_insert(c, &s->stripe.key);
+                                   &s->res, NULL, BTREE_INSERT_NOFAIL)
+               : ec_stripe_bkey_insert(c, s, &s->stripe.key);
        if (ret) {
                bch_err(c, "error creating stripe: error creating stripe key");
                goto err_put_writes;
@@ -886,6 +892,8 @@ static void ec_stripe_create(struct ec_stripe_new *s)
 err_put_writes:
        percpu_ref_put(&c->writes);
 err:
+       bch2_disk_reservation_put(c, &s->res);
+
        open_bucket_for_each(c, &s->blocks, ob, i) {
                ob->ec = NULL;
                __bch2_open_bucket_put(c, ob);
@@ -1325,6 +1333,7 @@ struct ec_stripe_head *bch2_ec_stripe_head_get(struct bch_fs *c,
        struct open_bucket *ob;
        unsigned i, data_idx = 0;
        s64 idx;
+       int ret;
 
        closure_init_stack(&cl);
 
@@ -1356,6 +1365,22 @@ struct ec_stripe_head *bch2_ec_stripe_head_get(struct bch_fs *c,
                                }
                }
 
+               if (!h->s->existing_stripe &&
+                   !h->s->res.sectors) {
+                       ret = bch2_disk_reservation_get(c, &h->s->res,
+                                                       h->blocksize,
+                                                       h->s->nr_parity, 0);
+                       if (ret) {
+                               /* What should we do here? */
+                               bch_err(c, "unable to create new stripe: %i", ret);
+                               bch2_ec_stripe_head_put(c, h);
+                               h = NULL;
+                               goto out;
+
+                       }
+
+               }
+
                if (new_stripe_alloc_buckets(c, h)) {
                        bch2_ec_stripe_head_put(c, h);
                        h = NULL;
index 6db16cf768daa40c8c91b8e2523208c146bfeac7..15f751fc2a35d32bea03efde0cf5cbbd332bb41a 100644 (file)
@@ -3,6 +3,7 @@
 #define _BCACHEFS_EC_H
 
 #include "ec_types.h"
+#include "buckets_types.h"
 #include "keylist_types.h"
 
 const char *bch2_stripe_invalid(const struct bch_fs *, struct bkey_s_c);
@@ -105,6 +106,7 @@ struct ec_stripe_new {
        struct open_buckets     blocks;
        u8                      data_block_idx[EC_STRIPE_MAX];
        struct open_buckets     parity;
+       struct disk_reservation res;
 
        struct keylist          keys;
        u64                     inline_keys[BKEY_U64s * 8];
index db0665abd60bc40ea7df7674f94c900e2cbf671c..f46aa1d70e35df4a482ac8fcea57f8b0adb3fd62 100644 (file)
@@ -122,7 +122,7 @@ void bch2_bkey_to_replicas(struct bch_replicas_entry *e,
                extent_to_replicas(k, e);
                break;
        case KEY_TYPE_stripe:
-               e->data_type = BCH_DATA_user;
+               e->data_type = BCH_DATA_parity;
                stripe_to_replicas(k, e);
                break;
        }
@@ -449,7 +449,23 @@ static int __bch2_mark_bkey_replicas(struct bch_fs *c, struct bkey_s_c k,
 
        bch2_bkey_to_replicas(&search.e, k);
 
-       return __bch2_mark_replicas(c, &search.e, check);
+       ret = __bch2_mark_replicas(c, &search.e, check);
+       if (ret)
+               return ret;
+
+       if (search.e.data_type == BCH_DATA_parity) {
+               search.e.data_type = BCH_DATA_cached;
+               ret = __bch2_mark_replicas(c, &search.e, check);
+               if (ret)
+                       return ret;
+
+               search.e.data_type = BCH_DATA_user;
+               ret = __bch2_mark_replicas(c, &search.e, check);
+               if (ret)
+                       return ret;
+       }
+
+       return 0;
 }
 
 bool bch2_bkey_replicas_marked(struct bch_fs *c,