bcachefs: Mark overwrites from journal replay in initial gc

author     Kent Overstreet <kent.overstreet@gmail.com>
           Mon, 15 Apr 2019 18:58:00 +0000 (14:58 -0400)
committer  Kent Overstreet <kent.overstreet@linux.dev>
           Sun, 22 Oct 2023 21:08:20 +0000 (17:08 -0400)

Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
fs/bcachefs/btree_gc.c
fs/bcachefs/btree_update.h
fs/bcachefs/btree_update_leaf.c
fs/bcachefs/buckets.c
fs/bcachefs/buckets.h
fs/bcachefs/recovery.c

diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c
index cf0a2f4b22af5b1fb39c148d07f2f59d42b240f4..2650f60b7cd77c80cb7f14ca03d4131bde5accf8 100644
--- a/fs/bcachefs/btree_gc.c
+++ b/fs/bcachefs/btree_gc.c
@@ -273,11 +273,40 @@ static inline int btree_id_gc_phase_cmp(enum btree_id l, enum btree_id r)
                (int) btree_id_to_gc_phase(r);
 }
 
+static int mark_journal_key(struct bch_fs *c, enum btree_id id,
+                           struct bkey_i *insert)
+{
+       struct btree_trans trans;
+       struct btree_iter *iter;
+       struct bkey_s_c k;
+       u8 max_stale;
+       int ret = 0;
+
+       ret = bch2_gc_mark_key(c, bkey_i_to_s_c(insert), &max_stale, true);
+       if (ret)
+               return ret;
+
+       bch2_trans_init(&trans, c);
+
+       for_each_btree_key(&trans, iter, id, bkey_start_pos(&insert->k),
+                          BTREE_ITER_SLOTS, k) {
+               percpu_down_read(&c->mark_lock);
+               ret = bch2_mark_overwrite(&trans, iter, k, insert, NULL,
+                                        BCH_BUCKET_MARK_GC|
+                                        BCH_BUCKET_MARK_NOATOMIC);
+               percpu_up_read(&c->mark_lock);
+
+               if (!ret)
+                       break;
+       }
+
+       return bch2_trans_exit(&trans);
+}
+
 static int bch2_gc_btrees(struct bch_fs *c, struct journal_keys *journal_keys,
                          bool initial, bool metadata_only)
 {
        enum btree_id ids[BTREE_ID_NR];
-       u8 max_stale;
        unsigned i;
 
        for (i = 0; i < BTREE_ID_NR; i++)
@@ -299,9 +328,7 @@ static int bch2_gc_btrees(struct bch_fs *c, struct journal_keys *journal_keys,
 
                        for_each_journal_key(*journal_keys, j)
                                if (j->btree_id == id) {
-                                       ret = bch2_gc_mark_key(c,
-                                               bkey_i_to_s_c(j->k),
-                                               &max_stale, initial);
+                                       ret = mark_journal_key(c, id, j->k);
                                        if (ret)
                                                return ret;
                                }
diff --git a/fs/bcachefs/btree_update.h b/fs/bcachefs/btree_update.h
index 75ed02874767c7687cec29d85f38d9f4e802399a..7a638a76634f1105a28f52869e8f1f19bd99185e 100644
--- a/fs/bcachefs/btree_update.h
+++ b/fs/bcachefs/btree_update.h
@@ -43,6 +43,7 @@ enum {
        __BTREE_INSERT_USE_ALLOC_RESERVE,
        __BTREE_INSERT_JOURNAL_REPLAY,
        __BTREE_INSERT_JOURNAL_RESERVED,
+       __BTREE_INSERT_NOMARK_OVERWRITES,
        __BTREE_INSERT_NOMARK,
        __BTREE_INSERT_NOWAIT,
        __BTREE_INSERT_GC_LOCK_HELD,
@@ -76,6 +77,9 @@ enum {
 
 #define BTREE_INSERT_JOURNAL_RESERVED  (1 << __BTREE_INSERT_JOURNAL_RESERVED)
 
+/* Don't mark overwrites, just new key: */
+#define BTREE_INSERT_NOMARK_OVERWRITES (1 << __BTREE_INSERT_NOMARK_OVERWRITES)
+
 /* Don't call bch2_mark_key: */
 #define BTREE_INSERT_NOMARK            (1 << __BTREE_INSERT_NOMARK)
 
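The intended difference between the two flags: BTREE_INSERT_NOMARK suppresses all marking for the transaction, while the new BTREE_INSERT_NOMARK_OVERWRITES still marks the key being inserted and only skips the keys it overwrites. Below is a standalone sketch of that control flow (the bit values and helper are local to the sketch, not bcachefs definitions):

/*
 * Illustrative only: models which marking steps each flag suppresses.
 * The macros below are defined here for the sketch, not taken from bcachefs.
 */
#include <stdio.h>

#define NOMARK            (1 << 0)  /* skip marking entirely */
#define NOMARK_OVERWRITES (1 << 1)  /* mark the new key, skip what it overwrites */

static void mark_update(unsigned flags)
{
	if (flags & NOMARK)
		return;                         /* nothing is accounted */

	printf("mark inserted key\n");

	if (flags & NOMARK_OVERWRITES)
		return;                         /* overwritten keys left alone */

	printf("mark keys being overwritten\n");
}

int main(void)
{
	mark_update(NOMARK);
	mark_update(NOMARK_OVERWRITES);
	mark_update(0);
	return 0;
}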
diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c
index 48d3be5174713fc949cfce883555735e36cd54df..2633a5452b13637a818a32b6030799b3c8719f87 100644
--- a/fs/bcachefs/btree_update_leaf.c
+++ b/fs/bcachefs/btree_update_leaf.c
@@ -542,20 +542,22 @@ static inline int do_btree_insert_at(struct btree_trans *trans,
 
        btree_trans_lock_write(c, trans);
 
-       trans_for_each_update_iter(trans, i) {
-               if (i->deferred ||
-                   !btree_node_type_needs_gc(i->iter->btree_id))
-                       continue;
+       if (likely(!(trans->flags & BTREE_INSERT_NOMARK))) {
+               trans_for_each_update_iter(trans, i) {
+                       if (i->deferred ||
+                           !btree_node_type_needs_gc(i->iter->btree_id))
+                               continue;
 
-               if (!fs_usage) {
-                       percpu_down_read(&c->mark_lock);
-                       fs_usage = bch2_fs_usage_scratch_get(c);
-               }
+                       if (!fs_usage) {
+                               percpu_down_read(&c->mark_lock);
+                               fs_usage = bch2_fs_usage_scratch_get(c);
+                       }
 
-               if (!bch2_bkey_replicas_marked_locked(c,
-                               bkey_i_to_s_c(i->k), true)) {
-                       ret = BTREE_INSERT_NEED_MARK_REPLICAS;
-                       goto out;
+                       if (!bch2_bkey_replicas_marked_locked(c,
+                                       bkey_i_to_s_c(i->k), true)) {
+                               ret = BTREE_INSERT_NEED_MARK_REPLICAS;
+                               goto out;
+                       }
                }
        }
 
@@ -602,16 +604,18 @@ static inline int do_btree_insert_at(struct btree_trans *trans,
                                linked->flags |= BTREE_ITER_NOUNLOCK;
        }
 
-       trans_for_each_update_iter(trans, i)
-               bch2_mark_update(trans, i, fs_usage, 0);
-       if (fs_usage)
-               bch2_trans_fs_usage_apply(trans, fs_usage);
-
-       if (unlikely(c->gc_pos.phase)) {
+       if (likely(!(trans->flags & BTREE_INSERT_NOMARK))) {
                trans_for_each_update_iter(trans, i)
-                       if (gc_visited(c, gc_pos_btree_node(i->iter->l[0].b)))
-                               bch2_mark_update(trans, i, NULL,
-                                                BCH_BUCKET_MARK_GC);
+                       bch2_mark_update(trans, i, fs_usage, 0);
+               if (fs_usage)
+                       bch2_trans_fs_usage_apply(trans, fs_usage);
+
+               if (unlikely(c->gc_pos.phase)) {
+                       trans_for_each_update_iter(trans, i)
+                               if (gc_visited(c, gc_pos_btree_node(i->iter->l[0].b)))
+                                       bch2_mark_update(trans, i, NULL,
+                                                        BCH_BUCKET_MARK_GC);
+               }
        }
 
        trans_for_each_update(trans, i)
diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c
index 4fe66ee1f745d7b73ddbe0c1d614e81e453e0459..7a05ba5fd5891dd4fc6bba710389a87b0299b3ef 100644
--- a/fs/bcachefs/buckets.c
+++ b/fs/bcachefs/buckets.c
@@ -1035,6 +1035,56 @@ int bch2_mark_key(struct bch_fs *c, struct bkey_s_c k,
        return ret;
 }
 
+inline bool bch2_mark_overwrite(struct btree_trans *trans,
+                               struct btree_iter *iter,
+                               struct bkey_s_c old,
+                               struct bkey_i *new,
+                               struct bch_fs_usage *fs_usage,
+                               unsigned flags)
+{
+       struct bch_fs           *c = trans->c;
+       struct btree            *b = iter->l[0].b;
+       s64                     sectors = 0;
+
+       if (btree_node_is_extents(b)
+           ? bkey_cmp(new->k.p, bkey_start_pos(old.k)) <= 0
+           : bkey_cmp(new->k.p, old.k->p))
+               return false;
+
+       if (btree_node_is_extents(b)) {
+               switch (bch2_extent_overlap(&new->k, old.k)) {
+               case BCH_EXTENT_OVERLAP_ALL:
+                       sectors = -((s64) old.k->size);
+                       break;
+               case BCH_EXTENT_OVERLAP_BACK:
+                       sectors = bkey_start_offset(&new->k) -
+                               old.k->p.offset;
+                       break;
+               case BCH_EXTENT_OVERLAP_FRONT:
+                       sectors = bkey_start_offset(old.k) -
+                               new->k.p.offset;
+                       break;
+               case BCH_EXTENT_OVERLAP_MIDDLE:
+                       sectors = old.k->p.offset - new->k.p.offset;
+                       BUG_ON(sectors <= 0);
+
+                       bch2_mark_key_locked(c, old, true, sectors,
+                               fs_usage, trans->journal_res.seq,
+                               flags);
+
+                       sectors = bkey_start_offset(&new->k) -
+                               old.k->p.offset;
+                       break;
+               }
+
+               BUG_ON(sectors >= 0);
+       }
+
+       bch2_mark_key_locked(c, old, false, sectors,
+               fs_usage, trans->journal_res.seq, flags);
+       return true;
+}
+
 void bch2_mark_update(struct btree_trans *trans,
                      struct btree_insert_entry *insert,
                      struct bch_fs_usage *fs_usage,
@@ -1049,57 +1099,23 @@ void bch2_mark_update(struct btree_trans *trans,
        if (!btree_node_type_needs_gc(iter->btree_id))
                return;
 
-       if (!(trans->flags & BTREE_INSERT_NOMARK))
-               bch2_mark_key_locked(c, bkey_i_to_s_c(insert->k), true,
-                       bpos_min(insert->k->k.p, b->key.k.p).offset -
-                       bkey_start_offset(&insert->k->k),
-                       fs_usage, trans->journal_res.seq, flags);
+       bch2_mark_key_locked(c, bkey_i_to_s_c(insert->k), true,
+               bpos_min(insert->k->k.p, b->key.k.p).offset -
+               bkey_start_offset(&insert->k->k),
+               fs_usage, trans->journal_res.seq, flags);
+
+       if (unlikely(trans->flags & BTREE_INSERT_NOMARK_OVERWRITES))
+               return;
 
        while ((_k = bch2_btree_node_iter_peek_filter(&node_iter, b,
                                                      KEY_TYPE_discard))) {
                struct bkey             unpacked;
-               struct bkey_s_c         k;
-               s64                     sectors = 0;
+               struct bkey_s_c         k = bkey_disassemble(b, _k, &unpacked);
 
-               k = bkey_disassemble(b, _k, &unpacked);
-
-               if (btree_node_is_extents(b)
-                   ? bkey_cmp(insert->k->k.p, bkey_start_pos(k.k)) <= 0
-                   : bkey_cmp(insert->k->k.p, k.k->p))
+               if (!bch2_mark_overwrite(trans, iter, k, insert->k,
+                                        fs_usage, flags))
                        break;
 
-               if (btree_node_is_extents(b)) {
-                       switch (bch2_extent_overlap(&insert->k->k, k.k)) {
-                       case BCH_EXTENT_OVERLAP_ALL:
-                               sectors = -((s64) k.k->size);
-                               break;
-                       case BCH_EXTENT_OVERLAP_BACK:
-                               sectors = bkey_start_offset(&insert->k->k) -
-                                       k.k->p.offset;
-                               break;
-                       case BCH_EXTENT_OVERLAP_FRONT:
-                               sectors = bkey_start_offset(k.k) -
-                                       insert->k->k.p.offset;
-                               break;
-                       case BCH_EXTENT_OVERLAP_MIDDLE:
-                               sectors = k.k->p.offset - insert->k->k.p.offset;
-                               BUG_ON(sectors <= 0);
-
-                               bch2_mark_key_locked(c, k, true, sectors,
-                                       fs_usage, trans->journal_res.seq,
-                                       flags);
-
-                               sectors = bkey_start_offset(&insert->k->k) -
-                                       k.k->p.offset;
-                               break;
-                       }
-
-                       BUG_ON(sectors >= 0);
-               }
-
-               bch2_mark_key_locked(c, k, false, sectors,
-                       fs_usage, trans->journal_res.seq, flags);
-
                bch2_btree_node_iter_advance(&node_iter, b);
        }
 }
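The per-case sector arithmetic in bch2_mark_overwrite() above is easy to misread, so here is a standalone userspace model of it (plain integer ranges instead of bkeys; the struct and helper names are invented for the sketch and exist nowhere in bcachefs). It checks that every overlap case subtracts exactly the overwritten sectors from the old key:

/*
 * Standalone model of the sector accounting in bch2_mark_overwrite().
 * Extents are half-open ranges [start, end) in sectors; "old" is the
 * existing key, "new" is the key being inserted over it.
 */
#include <assert.h>
#include <stdint.h>
#include <stdio.h>

struct extent { int64_t start, end; };

/* Net change, in sectors, applied to the old (overwritten) key. */
static int64_t overwrite_delta(struct extent new, struct extent old)
{
	if (new.start <= old.start && new.end >= old.end)   /* OVERLAP_ALL */
		return -(old.end - old.start);
	if (new.start > old.start && new.end >= old.end)    /* OVERLAP_BACK */
		return new.start - old.end;
	if (new.start <= old.start && new.end < old.end)    /* OVERLAP_FRONT */
		return old.start - new.end;
	/*
	 * OVERLAP_MIDDLE: the old extent is split; the surviving back
	 * fragment is re-accounted (+) before the overwritten span is
	 * subtracted, so the net loss is exactly the hole punched by new.
	 */
	return (old.end - new.end) + (new.start - old.end);
}

int main(void)
{
	struct extent old = { 0, 100 };

	/* new completely covers old */
	assert(overwrite_delta((struct extent){ -10, 110 }, old) == -100);
	/* new overlaps the back of old */
	assert(overwrite_delta((struct extent){ 60, 120 }, old) == -40);
	/* new overlaps the front of old */
	assert(overwrite_delta((struct extent){ -10, 30 }, old) == -30);
	/* new punches a hole in the middle: net loss is the hole size */
	assert(overwrite_delta((struct extent){ 40, 60 }, old) == -20);

	printf("each overlap case accounts for exactly the overwritten sectors\n");
	return 0;
}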
diff --git a/fs/bcachefs/buckets.h b/fs/bcachefs/buckets.h
index 095015f17f7682ad1da4ad9acaf3c70d6a863834..90fffee1c28965c1d77d944adafc06c02228e2c9 100644
--- a/fs/bcachefs/buckets.h
+++ b/fs/bcachefs/buckets.h
@@ -254,6 +254,9 @@ int bch2_mark_key(struct bch_fs *, struct bkey_s_c,
 int bch2_fs_usage_apply(struct bch_fs *, struct bch_fs_usage *,
                        struct disk_reservation *);
 
+bool bch2_mark_overwrite(struct btree_trans *, struct btree_iter *,
+                        struct bkey_s_c, struct bkey_i *,
+                        struct bch_fs_usage *, unsigned);
 void bch2_mark_update(struct btree_trans *, struct btree_insert_entry *,
                      struct bch_fs_usage *, unsigned);
 void bch2_trans_fs_usage_apply(struct btree_trans *, struct bch_fs_usage *);
diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c
index 5bfb38c4290f0bf5558eacbb4f6d142f119550ad..d207ff7b98f49079f6b6b88e2b58a2055df750f3 100644
--- a/fs/bcachefs/recovery.c
+++ b/fs/bcachefs/recovery.c
@@ -203,63 +203,94 @@ static void replay_now_at(struct journal *j, u64 seq)
 static int bch2_extent_replay_key(struct bch_fs *c, struct bkey_i *k)
 {
        struct btree_trans trans;
-       struct btree_iter *iter;
+       struct btree_iter *iter, *split_iter;
        /*
-        * We might cause compressed extents to be
-        * split, so we need to pass in a
-        * disk_reservation:
+        * We might cause compressed extents to be split, so we need to pass in
+        * a disk_reservation:
         */
        struct disk_reservation disk_res =
                bch2_disk_reservation_init(c, 0);
-       BKEY_PADDED(k) split;
+       struct bkey_i *split;
+       bool split_compressed = false;
+       unsigned flags = BTREE_INSERT_ATOMIC|
+               BTREE_INSERT_NOFAIL|
+               BTREE_INSERT_LAZY_RW|
+               BTREE_INSERT_JOURNAL_REPLAY|
+               BTREE_INSERT_NOMARK;
        int ret;
 
        bch2_trans_init(&trans, c);
+       bch2_trans_preload_iters(&trans);
+retry:
+       bch2_trans_begin(&trans);
 
        iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS,
                                   bkey_start_pos(&k->k),
                                   BTREE_ITER_INTENT);
+
        do {
                ret = bch2_btree_iter_traverse(iter);
                if (ret)
-                       break;
+                       goto err;
 
-               bkey_copy(&split.k, k);
-               bch2_cut_front(iter->pos, &split.k);
-               bch2_extent_trim_atomic(&split.k, iter);
-
-               ret = bch2_disk_reservation_add(c, &disk_res,
-                               split.k.k.size *
-                               bch2_bkey_nr_dirty_ptrs(bkey_i_to_s_c(&split.k)),
-                               BCH_DISK_RESERVATION_NOFAIL);
-               BUG_ON(ret);
-
-               bch2_trans_update(&trans, BTREE_INSERT_ENTRY(iter, &split.k));
-               ret = bch2_trans_commit(&trans, &disk_res, NULL,
-                                       BTREE_INSERT_ATOMIC|
-                                       BTREE_INSERT_NOFAIL|
-                                       BTREE_INSERT_LAZY_RW|
-                                       BTREE_INSERT_JOURNAL_REPLAY);
-       } while ((!ret || ret == -EINTR) &&
-                bkey_cmp(k->k.p, iter->pos));
+               split_iter = bch2_trans_copy_iter(&trans, iter);
+               ret = PTR_ERR_OR_ZERO(split_iter);
+               if (ret)
+                       goto err;
 
-       bch2_disk_reservation_put(c, &disk_res);
+               split = bch2_trans_kmalloc(&trans, bkey_bytes(&k->k));
+               ret = PTR_ERR_OR_ZERO(split);
+               if (ret)
+                       goto err;
 
-       /*
-        * This isn't strictly correct - we should only be relying on the btree
-        * node lock for synchronization with gc when we've got a write lock
-        * held.
-        *
-        * but - there are other correctness issues if btree gc were to run
-        * before journal replay finishes
-        */
-       BUG_ON(c->gc_pos.phase);
+               if (!split_compressed &&
+                   bch2_extent_is_compressed(bkey_i_to_s_c(k)) &&
+                   !bch2_extent_is_atomic(k, split_iter)) {
+                       ret = bch2_disk_reservation_add(c, &disk_res,
+                                       k->k.size *
+                                       bch2_bkey_nr_dirty_ptrs(bkey_i_to_s_c(k)),
+                                       BCH_DISK_RESERVATION_NOFAIL);
+                       BUG_ON(ret);
+
+                       flags &= ~BTREE_INSERT_JOURNAL_REPLAY;
+                       flags &= ~BTREE_INSERT_NOMARK;
+                       flags |=  BTREE_INSERT_NOMARK_OVERWRITES;
+                       split_compressed = true;
+               }
 
-       bch2_mark_key(c, bkey_i_to_s_c(k), false, -((s64) k->k.size),
-                     NULL, 0, 0);
-       bch2_trans_exit(&trans);
+               bkey_copy(split, k);
+               bch2_cut_front(split_iter->pos, split);
+               bch2_extent_trim_atomic(split, split_iter);
 
-       return ret;
+               bch2_trans_update(&trans, BTREE_INSERT_ENTRY(split_iter, split));
+               bch2_btree_iter_set_pos(iter, split->k.p);
+       } while (bkey_cmp(iter->pos, k->k.p) < 0);
+
+       ret = bch2_trans_commit(&trans, &disk_res, NULL, flags);
+       if (ret)
+               goto err;
+
+       if (split_compressed) {
+               /*
+                * This isn't strictly correct - we should only be relying on
+                * the btree node lock for synchronization with gc when we've
+                * got a write lock held.
+                *
+                * but - there are other correctness issues if btree gc were to
+                * run before journal replay finishes
+                */
+               BUG_ON(c->gc_pos.phase);
+
+               bch2_mark_key(c, bkey_i_to_s_c(k), false, -((s64) k->k.size),
+                             NULL, 0, 0);
+       }
+err:
+       if (ret == -EINTR)
+               goto retry;
+
+       bch2_disk_reservation_put(c, &disk_res);
+
+       return bch2_trans_exit(&trans) ?: ret;
 }
 
 static int bch2_journal_replay(struct bch_fs *c,
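In the reworked bch2_extent_replay_key() above, the loop structure is the interesting part: on every pass the replayed key is cut at the current iterator position and trimmed to an atomic piece, the iterator advances to the end of that piece, and the whole transaction restarts from the retry: label on -EINTR. Below is a standalone model of just the splitting arithmetic (MAX_ATOMIC is a made-up stand-in for whatever bch2_extent_trim_atomic() would allow; the real limit depends on the btree node being updated):

/*
 * Standalone model of the replay split loop: a journal key is cut into
 * pieces bounded by an "atomic" size until the pieces cover the whole key.
 */
#include <assert.h>
#include <stdint.h>
#include <stdio.h>

#define MAX_ATOMIC 32	/* hypothetical per-insert size limit, in sectors */

struct range { uint64_t start, end; };

int main(void)
{
	struct range k = { 10, 150 };	/* the journal key being replayed */
	uint64_t pos = k.start;		/* models iter->pos */
	uint64_t covered = 0;
	unsigned pieces = 0;

	do {
		struct range split = k;

		/* bch2_cut_front(split_iter->pos, split) */
		split.start = pos;
		/* bch2_extent_trim_atomic(): clamp this piece's size */
		if (split.end - split.start > MAX_ATOMIC)
			split.end = split.start + MAX_ATOMIC;

		covered += split.end - split.start;
		pieces++;

		/* bch2_btree_iter_set_pos(iter, split->k.p) */
		pos = split.end;
	} while (pos < k.end);		/* bkey_cmp(iter->pos, k->k.p) < 0 */

	assert(covered == k.end - k.start);
	printf("replayed %llu sectors in %u pieces\n",
	       (unsigned long long) covered, pieces);
	return 0;
}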