bcachefs: Data update path won't accidentaly grow replicas
authorKent Overstreet <kent.overstreet@linux.dev>
Sat, 25 Nov 2023 02:51:45 +0000 (21:51 -0500)
committerKent Overstreet <kent.overstreet@linux.dev>
Sun, 26 Nov 2023 02:48:42 +0000 (21:48 -0500)
Previously, there was a bug where if an extent had greater durability
than required (because we needed to move a durability=1 pointer and
ended up putting it on a durability 2 device), we would submit a write
for replicas=2 - the durability of the pointer being rewritten - instead
of the number of replicas required to bring it back up to the
data_replicas option.

This, plus the allocation path sometimes allocating on a greater
durability device than requested, meant that extents could continue
having more and more replicas added as they were being rewritten.

Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
fs/bcachefs/data_update.c
fs/bcachefs/data_update.h
fs/bcachefs/errcode.h
fs/bcachefs/io_read.c
fs/bcachefs/move.c

index 5ed66202c22654a71b79fde919f6dd425e472874..71aa5e59787b8bc6dca216add572553363a44492 100644 (file)
@@ -356,7 +356,7 @@ void bch2_data_update_exit(struct data_update *update)
        bch2_bio_free_pages_pool(c, &update->op.wbio.bio);
 }
 
-void bch2_update_unwritten_extent(struct btree_trans *trans,
+static void bch2_update_unwritten_extent(struct btree_trans *trans,
                                  struct data_update *update)
 {
        struct bch_fs *c = update->op.c;
@@ -436,7 +436,51 @@ void bch2_update_unwritten_extent(struct btree_trans *trans,
        }
 }
 
+int bch2_extent_drop_ptrs(struct btree_trans *trans,
+                         struct btree_iter *iter,
+                         struct bkey_s_c k,
+                         struct data_update_opts data_opts)
+{
+       struct bch_fs *c = trans->c;
+       struct bkey_i *n;
+       int ret;
+
+       n = bch2_bkey_make_mut_noupdate(trans, k);
+       ret = PTR_ERR_OR_ZERO(n);
+       if (ret)
+               return ret;
+
+       while (data_opts.kill_ptrs) {
+               unsigned i = 0, drop = __fls(data_opts.kill_ptrs);
+               struct bch_extent_ptr *ptr;
+
+               bch2_bkey_drop_ptrs(bkey_i_to_s(n), ptr, i++ == drop);
+               data_opts.kill_ptrs ^= 1U << drop;
+       }
+
+       /*
+        * If the new extent no longer has any pointers, bch2_extent_normalize()
+        * will do the appropriate thing with it (turning it into a
+        * KEY_TYPE_error key, or just a discard if it was a cached extent)
+        */
+       bch2_extent_normalize(c, bkey_i_to_s(n));
+
+       /*
+        * Since we're not inserting through an extent iterator
+        * (BTREE_ITER_ALL_SNAPSHOTS iterators aren't extent iterators),
+        * we aren't using the extent overwrite path to delete, we're
+        * just using the normal key deletion path:
+        */
+       if (bkey_deleted(&n->k))
+               n->k.size = 0;
+
+       return bch2_trans_relock(trans) ?:
+               bch2_trans_update(trans, iter, n, BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE) ?:
+               bch2_trans_commit(trans, NULL, NULL, BTREE_INSERT_NOFAIL);
+}
+
 int bch2_data_update_init(struct btree_trans *trans,
+                         struct btree_iter *iter,
                          struct moving_context *ctxt,
                          struct data_update *m,
                          struct write_point_specifier wp,
@@ -452,7 +496,7 @@ int bch2_data_update_init(struct btree_trans *trans,
        const struct bch_extent_ptr *ptr;
        unsigned i, reserve_sectors = k.k->size * data_opts.extra_replicas;
        unsigned ptrs_locked = 0;
-       int ret;
+       int ret = 0;
 
        bch2_bkey_buf_init(&m->k);
        bch2_bkey_buf_reassemble(&m->k, c, k);
@@ -478,6 +522,8 @@ int bch2_data_update_init(struct btree_trans *trans,
        bkey_for_each_ptr(ptrs, ptr)
                percpu_ref_get(&bch_dev_bkey_exists(c, ptr->dev)->ref);
 
+       unsigned durability_have = 0, durability_removing = 0;
+
        i = 0;
        bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {
                bool locked;
@@ -489,8 +535,11 @@ int bch2_data_update_init(struct btree_trans *trans,
                                reserve_sectors += k.k->size;
 
                        m->op.nr_replicas += bch2_extent_ptr_desired_durability(c, &p);
-               } else if (!p.ptr.cached) {
+                       durability_removing += bch2_extent_ptr_desired_durability(c, &p);
+               } else if (!p.ptr.cached &&
+                          !((1U << i) & m->data_opts.kill_ptrs)) {
                        bch2_dev_list_add_dev(&m->op.devs_have, p.ptr.dev);
+                       durability_have += bch2_extent_ptr_durability(c, &p);
                }
 
                /*
@@ -529,6 +578,29 @@ int bch2_data_update_init(struct btree_trans *trans,
                i++;
        }
 
+       /*
+        * If current extent durability is less than io_opts.data_replicas,
+        * we're not trying to rereplicate the extent up to data_replicas here -
+        * unless extra_replicas was specified
+        *
+        * Increasing replication is an explicit operation triggered by
+        * rereplicate, currently, so that users don't get an unexpected -ENOSPC
+        */
+       if (durability_have >= io_opts.data_replicas) {
+               m->data_opts.kill_ptrs |= m->data_opts.rewrite_ptrs;
+               m->data_opts.rewrite_ptrs = 0;
+               /* if iter == NULL, it's just a promote */
+               if (iter)
+                       ret = bch2_extent_drop_ptrs(trans, iter, k, data_opts);
+               goto done;
+       }
+
+       m->op.nr_replicas = min(durability_removing, io_opts.data_replicas - durability_have) +
+               m->data_opts.extra_replicas;
+       m->op.nr_replicas_required = m->op.nr_replicas;
+
+       BUG_ON(!m->op.nr_replicas);
+
        if (reserve_sectors) {
                ret = bch2_disk_reservation_add(c, &m->op.res, reserve_sectors,
                                m->data_opts.extra_replicas
@@ -538,14 +610,11 @@ int bch2_data_update_init(struct btree_trans *trans,
                        goto err;
        }
 
-       m->op.nr_replicas += m->data_opts.extra_replicas;
-       m->op.nr_replicas_required = m->op.nr_replicas;
-
-       BUG_ON(!m->op.nr_replicas);
+       if (bkey_extent_is_unwritten(k)) {
+               bch2_update_unwritten_extent(trans, m);
+               goto done;
+       }
 
-       /* Special handling required: */
-       if (bkey_extent_is_unwritten(k))
-               return -BCH_ERR_unwritten_extent_update;
        return 0;
 err:
        i = 0;
@@ -560,6 +629,9 @@ err:
        bch2_bkey_buf_exit(&m->k, c);
        bch2_bio_free_pages_pool(c, &m->op.wbio.bio);
        return ret;
+done:
+       bch2_data_update_exit(m);
+       return ret ?: -BCH_ERR_data_update_done;
 }
 
 void bch2_data_update_opts_normalize(struct bkey_s_c k, struct data_update_opts *opts)
index 9dc17b9d83795181798deb5af39401d4d6248581..991095bbd469baeb55de1c0d2636267e49d68a28 100644 (file)
@@ -32,9 +32,14 @@ int bch2_data_update_index_update(struct bch_write_op *);
 void bch2_data_update_read_done(struct data_update *,
                                struct bch_extent_crc_unpacked);
 
+int bch2_extent_drop_ptrs(struct btree_trans *,
+                         struct btree_iter *,
+                         struct bkey_s_c,
+                         struct data_update_opts);
+
 void bch2_data_update_exit(struct data_update *);
-void bch2_update_unwritten_extent(struct btree_trans *, struct data_update *);
-int bch2_data_update_init(struct btree_trans *, struct moving_context *,
+int bch2_data_update_init(struct btree_trans *, struct btree_iter *,
+                         struct moving_context *,
                          struct data_update *,
                          struct write_point_specifier,
                          struct bch_io_opts, struct data_update_opts,
index 68a1a96bb7caf526a148a988c12913151c57b6d5..69b0627c2180e967151c974a2fa9c6b75ed7cb6b 100644 (file)
        x(BCH_ERR_fsck,                 fsck_repair_unimplemented)              \
        x(BCH_ERR_fsck,                 fsck_repair_impossible)                 \
        x(0,                            restart_recovery)                       \
-       x(0,                            unwritten_extent_update)                \
+       x(0,                            data_update_done)                       \
        x(EINVAL,                       device_state_not_allowed)               \
        x(EINVAL,                       member_info_missing)                    \
        x(EINVAL,                       mismatched_block_size)                  \
index a56ed553dc15e6c709c5fed992d0a5b097170703..36763865facd46ba84731074981091e678a37d31 100644 (file)
@@ -209,7 +209,7 @@ static struct promote_op *__promote_alloc(struct btree_trans *trans,
        bio = &op->write.op.wbio.bio;
        bio_init(bio, NULL, bio->bi_inline_vecs, pages, 0);
 
-       ret = bch2_data_update_init(trans, NULL, &op->write,
+       ret = bch2_data_update_init(trans, NULL, NULL, &op->write,
                        writepoint_hashed((unsigned long) current),
                        opts,
                        (struct data_update_opts) {
index 71f86535281646c6c22d3372f5b72c3915d6284a..3b0a501b6baf0a35f1b288fcb5aa75409eb503fe 100644 (file)
@@ -229,49 +229,6 @@ void bch2_move_stats_init(struct bch_move_stats *stats, char *name)
        scnprintf(stats->name, sizeof(stats->name), "%s", name);
 }
 
-static int bch2_extent_drop_ptrs(struct btree_trans *trans,
-                                struct btree_iter *iter,
-                                struct bkey_s_c k,
-                                struct data_update_opts data_opts)
-{
-       struct bch_fs *c = trans->c;
-       struct bkey_i *n;
-       int ret;
-
-       n = bch2_bkey_make_mut_noupdate(trans, k);
-       ret = PTR_ERR_OR_ZERO(n);
-       if (ret)
-               return ret;
-
-       while (data_opts.kill_ptrs) {
-               unsigned i = 0, drop = __fls(data_opts.kill_ptrs);
-               struct bch_extent_ptr *ptr;
-
-               bch2_bkey_drop_ptrs(bkey_i_to_s(n), ptr, i++ == drop);
-               data_opts.kill_ptrs ^= 1U << drop;
-       }
-
-       /*
-        * If the new extent no longer has any pointers, bch2_extent_normalize()
-        * will do the appropriate thing with it (turning it into a
-        * KEY_TYPE_error key, or just a discard if it was a cached extent)
-        */
-       bch2_extent_normalize(c, bkey_i_to_s(n));
-
-       /*
-        * Since we're not inserting through an extent iterator
-        * (BTREE_ITER_ALL_SNAPSHOTS iterators aren't extent iterators),
-        * we aren't using the extent overwrite path to delete, we're
-        * just using the normal key deletion path:
-        */
-       if (bkey_deleted(&n->k))
-               n->k.size = 0;
-
-       return bch2_trans_relock(trans) ?:
-               bch2_trans_update(trans, iter, n, BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE) ?:
-               bch2_trans_commit(trans, NULL, NULL, BTREE_INSERT_NOFAIL);
-}
-
 int bch2_move_extent(struct moving_context *ctxt,
                     struct move_bucket_in_flight *bucket_in_flight,
                     struct btree_iter *iter,
@@ -341,19 +298,11 @@ int bch2_move_extent(struct moving_context *ctxt,
        io->rbio.bio.bi_iter.bi_sector  = bkey_start_offset(k.k);
        io->rbio.bio.bi_end_io          = move_read_endio;
 
-       ret = bch2_data_update_init(trans, ctxt, &io->write, ctxt->wp,
+       ret = bch2_data_update_init(trans, iter, ctxt, &io->write, ctxt->wp,
                                    io_opts, data_opts, iter->btree_id, k);
-       if (ret && ret != -BCH_ERR_unwritten_extent_update)
+       if (ret)
                goto err_free_pages;
 
-       if (ret == -BCH_ERR_unwritten_extent_update) {
-               bch2_update_unwritten_extent(trans, &io->write);
-               move_free(io);
-               return 0;
-       }
-
-       BUG_ON(ret);
-
        io->write.op.end_io = move_write_done;
 
        if (ctxt->rate)
@@ -397,6 +346,9 @@ err_free_pages:
 err_free:
        kfree(io);
 err:
+       if (ret == -BCH_ERR_data_update_done)
+               return 0;
+
        this_cpu_inc(c->counters[BCH_COUNTER_move_extent_alloc_mem_fail]);
        trace_move_extent_alloc_mem_fail2(c, k);
        return ret;