bcachefs: Add (partial) support for fixing btree topology
author: Kent Overstreet <kent.overstreet@gmail.com>
Wed, 27 Jan 2021 01:59:00 +0000 (20:59 -0500)
committer: Kent Overstreet <kent.overstreet@linux.dev>
Sun, 22 Oct 2023 21:08:52 +0000 (17:08 -0400)
When we walk the btrees during recovery, part of that is checking that
btree topology is correct: for every interior btree node, its child
nodes should exactly span the range the parent node covers.

Previously, we had checks for this, but not repair code. Now that we
have the ability to do btree updates during initial GC, this patch adds
that repair code.

Signed-off-by: Kent Overstreet <kent.overstreet@gmail.com>
Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
fs/bcachefs/bcachefs.h
fs/bcachefs/btree_cache.c
fs/bcachefs/btree_cache.h
fs/bcachefs/btree_gc.c
fs/bcachefs/recovery.c

index d5fc5eed73aef33f9ea12a3f556eaed53d31ec63..19ba23f7d9dda67dcfba03e278cc9bfb475e1d69 100644 (file)
@@ -509,7 +509,8 @@ enum {
        BCH_FS_ERRORS_FIXED,
 
        /* misc: */
-       BCH_FS_FIXED_GENS,
+       BCH_FS_NEED_ANOTHER_GC,
+       BCH_FS_DELETED_NODES,
        BCH_FS_NEED_ALLOC_WRITE,
        BCH_FS_REBUILD_REPLICAS,
        BCH_FS_HOLD_BTREE_WRITES,
index 4b29be7234c79813f0e5620e22977422a8f51f7f..443d669e6a302bc81247a9bb722d169ba8f5574b 100644 (file)
@@ -7,6 +7,7 @@
 #include "btree_iter.h"
 #include "btree_locking.h"
 #include "debug.h"
+#include "error.h"
 #include "trace.h"
 
 #include <linux/prefetch.h>
@@ -813,9 +814,12 @@ lock_node:
                return ERR_PTR(-EIO);
        }
 
-       EBUG_ON(b->c.btree_id != iter->btree_id ||
-               BTREE_NODE_LEVEL(b->data) != level ||
-               bkey_cmp(b->data->max_key, k->k.p));
+       EBUG_ON(b->c.btree_id != iter->btree_id);
+       EBUG_ON(BTREE_NODE_LEVEL(b->data) != level);
+       EBUG_ON(bkey_cmp(b->data->max_key, k->k.p));
+       EBUG_ON(b->key.k.type == KEY_TYPE_btree_ptr_v2 &&
+               bkey_cmp(b->data->min_key,
+                        bkey_i_to_btree_ptr_v2(&b->key)->v.min_key));
 
        return b;
 }
@@ -823,7 +827,8 @@ lock_node:
 struct btree *bch2_btree_node_get_noiter(struct bch_fs *c,
                                         const struct bkey_i *k,
                                         enum btree_id btree_id,
-                                        unsigned level)
+                                        unsigned level,
+                                        bool nofill)
 {
        struct btree_cache *bc = &c->btree_cache;
        struct btree *b;
@@ -838,6 +843,9 @@ struct btree *bch2_btree_node_get_noiter(struct bch_fs *c,
 retry:
        b = btree_cache_find(bc, k);
        if (unlikely(!b)) {
+               if (nofill)
+                       return NULL;
+
                b = bch2_btree_node_fill(c, NULL, k, btree_id,
                                         level, SIX_LOCK_read, true);
 
@@ -884,9 +892,12 @@ lock_node:
                return ERR_PTR(-EIO);
        }
 
-       EBUG_ON(b->c.btree_id != btree_id ||
-               BTREE_NODE_LEVEL(b->data) != level ||
-               bkey_cmp(b->data->max_key, k->k.p));
+       EBUG_ON(b->c.btree_id != btree_id);
+       EBUG_ON(BTREE_NODE_LEVEL(b->data) != level);
+       EBUG_ON(bkey_cmp(b->data->max_key, k->k.p));
+       EBUG_ON(b->key.k.type == KEY_TYPE_btree_ptr_v2 &&
+               bkey_cmp(b->data->min_key,
+                        bkey_i_to_btree_ptr_v2(&b->key)->v.min_key));
 
        return b;
 }
@@ -996,8 +1007,22 @@ out:
                if (sib != btree_prev_sib)
                        swap(n1, n2);
 
-               BUG_ON(bkey_cmp(bkey_successor(n1->key.k.p),
-                               n2->data->min_key));
+               if (bkey_cmp(bkey_successor(n1->key.k.p),
+                            n2->data->min_key)) {
+                       char buf1[200], buf2[200];
+
+                       bch2_bkey_val_to_text(&PBUF(buf1), c, bkey_i_to_s_c(&n1->key));
+                       bch2_bkey_val_to_text(&PBUF(buf2), c, bkey_i_to_s_c(&n2->key));
+
+                       bch2_fs_inconsistent(c, "btree topology error at btree %s level %u:\n"
+                                            "prev: %s\n"
+                                            "next: %s\n",
+                                            bch2_btree_ids[iter->btree_id], level,
+                                            buf1, buf2);
+
+                       six_unlock_intent(&ret->c.lock);
+                       ret = NULL;
+               }
        }
 
        bch2_btree_trans_verify_locks(trans);
index 0eeca0bcc48ead34e563794f8a4bd1890faa8752..5fffae92effb35bc3ff61793e1b42660bc78a59f 100644 (file)
@@ -26,7 +26,7 @@ struct btree *bch2_btree_node_get(struct bch_fs *, struct btree_iter *,
                                  enum six_lock_type, unsigned long);
 
 struct btree *bch2_btree_node_get_noiter(struct bch_fs *, const struct bkey_i *,
-                                        enum btree_id, unsigned);
+                                        enum btree_id, unsigned, bool);
 
 struct btree *bch2_btree_node_get_sibling(struct bch_fs *, struct btree_iter *,
                                struct btree *, enum btree_node_sibling);
index 8f347ba5b4e6fdc0e4326b2fa1e3d0a8cb658ca0..0dfb1f67225d61c5226afcb5d30250324693a42d 100644 (file)
@@ -50,6 +50,10 @@ static inline void gc_pos_set(struct bch_fs *c, struct gc_pos new_pos)
        __gc_pos_set(c, new_pos);
 }
 
+/*
+ * Missing: if an interior btree node is empty, we need to do something -
+ * perhaps just kill it
+ */
 static int bch2_gc_check_topology(struct bch_fs *c,
                                  struct btree *b,
                                  struct bkey_buf *prev,
@@ -62,6 +66,8 @@ static int bch2_gc_check_topology(struct bch_fs *c,
                ? node_start
                : bkey_successor(prev->k->k.p);
        char buf1[200], buf2[200];
+       bool update_min = false;
+       bool update_max = false;
        int ret = 0;
 
        if (cur.k->k.type == KEY_TYPE_btree_ptr_v2) {
@@ -75,22 +81,79 @@ static int bch2_gc_check_topology(struct bch_fs *c,
                        bch2_bkey_val_to_text(&PBUF(buf1), c, bkey_i_to_s_c(prev->k));
 
                if (fsck_err_on(bkey_cmp(expected_start, bp->v.min_key), c,
-                               "btree node with incorrect min_key:\n  prev %s\n  cur %s",
+                               "btree node with incorrect min_key at btree %s level %u:\n"
+                               "  prev %s\n"
+                               "  cur %s",
+                               bch2_btree_ids[b->c.btree_id], b->c.level,
                                buf1,
-                               (bch2_bkey_val_to_text(&PBUF(buf2), c, bkey_i_to_s_c(cur.k)), buf2))) {
-                       BUG();
-               }
+                               (bch2_bkey_val_to_text(&PBUF(buf2), c, bkey_i_to_s_c(cur.k)), buf2)))
+                       update_min = true;
        }
 
        if (fsck_err_on(is_last &&
                        bkey_cmp(cur.k->k.p, node_end), c,
-                       "btree node with incorrect max_key:\n  %s\n  expected %s",
+                       "btree node with incorrect max_key at btree %s level %u:\n"
+                       "  %s\n"
+                       "  expected %s",
+                       bch2_btree_ids[b->c.btree_id], b->c.level,
                        (bch2_bkey_val_to_text(&PBUF(buf1), c, bkey_i_to_s_c(cur.k)), buf1),
-                       (bch2_bpos_to_text(&PBUF(buf2), node_end), buf2))) {
-               BUG();
-       }
+                       (bch2_bpos_to_text(&PBUF(buf2), node_end), buf2)))
+               update_max = true;
 
        bch2_bkey_buf_copy(prev, c, cur.k);
+
+       if (update_min || update_max) {
+               struct bkey_i *new;
+               struct bkey_i_btree_ptr_v2 *bp = NULL;
+               struct btree *n;
+
+               if (update_max) {
+                       ret = bch2_journal_key_delete(c, b->c.btree_id,
+                                                     b->c.level, cur.k->k.p);
+                       if (ret)
+                               return ret;
+               }
+
+               new = kmalloc(bkey_bytes(&cur.k->k), GFP_KERNEL);
+               if (!new)
+                       return -ENOMEM;
+
+               bkey_copy(new, cur.k);
+
+               if (new->k.type == KEY_TYPE_btree_ptr_v2)
+                       bp = bkey_i_to_btree_ptr_v2(new);
+
+               if (update_min)
+                       bp->v.min_key = expected_start;
+               if (update_max)
+                       new->k.p = node_end;
+               if (bp)
+                       SET_BTREE_PTR_RANGE_UPDATED(&bp->v, true);
+
+               ret = bch2_journal_key_insert(c, b->c.btree_id, b->c.level, new);
+               if (ret) {
+                       kfree(new);
+                       return ret;
+               }
+
+               n = bch2_btree_node_get_noiter(c, cur.k, b->c.btree_id,
+                                              b->c.level - 1, true);
+               if (n) {
+                       mutex_lock(&c->btree_cache.lock);
+                       bch2_btree_node_hash_remove(&c->btree_cache, n);
+
+                       bkey_copy(&n->key, new);
+                       if (update_min)
+                               n->data->min_key = expected_start;
+                       if (update_max)
+                               n->data->max_key = node_end;
+
+                       ret = __bch2_btree_node_hash_insert(&c->btree_cache, n);
+                       BUG_ON(ret);
+                       mutex_unlock(&c->btree_cache.lock);
+                       six_unlock_read(&n->c.lock);
+               }
+       }
 fsck_err:
        return ret;
 }
@@ -147,12 +210,13 @@ static int bch2_gc_mark_key(struct bch_fs *c, struct bkey_s_c k,
                                        ptr->dev, PTR_BUCKET_NR(ca, ptr),
                                        bch2_data_types[ptr_data_type(k.k, ptr)],
                                        ptr->gen, g->mark.gen)) {
+                               /* XXX if it's a cached ptr, drop it */
                                g2->_mark.gen   = g->_mark.gen          = ptr->gen;
                                g2->gen_valid   = g->gen_valid          = true;
                                g2->_mark.data_type             = 0;
                                g2->_mark.dirty_sectors         = 0;
                                g2->_mark.cached_sectors        = 0;
-                               set_bit(BCH_FS_FIXED_GENS, &c->flags);
+                               set_bit(BCH_FS_NEED_ANOTHER_GC, &c->flags);
                                set_bit(BCH_FS_NEED_ALLOC_WRITE, &c->flags);
                        }
                }
@@ -298,8 +362,6 @@ static int bch2_gc_btree_init_recurse(struct bch_fs *c, struct btree *b,
                        break;
 
                if (b->c.level) {
-                       struct btree *child;
-
                        bch2_bkey_buf_reassemble(&cur, c, k);
                        k = bkey_i_to_s_c(cur.k);
 
@@ -310,26 +372,49 @@ static int bch2_gc_btree_init_recurse(struct bch_fs *c, struct btree *b,
                                        !bch2_btree_and_journal_iter_peek(&iter).k);
                        if (ret)
                                break;
+               } else {
+                       bch2_btree_and_journal_iter_advance(&iter);
+               }
+       }
 
-                       if (b->c.level > target_depth) {
-                               child = bch2_btree_node_get_noiter(c, cur.k,
-                                                       b->c.btree_id, b->c.level - 1);
-                               ret = PTR_ERR_OR_ZERO(child);
-                               if (ret)
-                                       break;
+       if (b->c.level > target_depth) {
+               bch2_btree_and_journal_iter_exit(&iter);
+               bch2_btree_and_journal_iter_init_node_iter(&iter, c, b);
 
-                               ret = bch2_gc_btree_init_recurse(c, child,
-                                               target_depth);
-                               six_unlock_read(&child->c.lock);
+               while ((k = bch2_btree_and_journal_iter_peek(&iter)).k) {
+                       struct btree *child;
+
+                       bch2_bkey_buf_reassemble(&cur, c, k);
+                       bch2_btree_and_journal_iter_advance(&iter);
 
+                       child = bch2_btree_node_get_noiter(c, cur.k,
+                                               b->c.btree_id, b->c.level - 1,
+                                               false);
+                       ret = PTR_ERR_OR_ZERO(child);
+
+                       if (fsck_err_on(ret == -EIO, c,
+                                       "unreadable btree node")) {
+                               ret = bch2_journal_key_delete(c, b->c.btree_id,
+                                                             b->c.level, cur.k->k.p);
                                if (ret)
-                                       break;
+                                       return ret;
+
+                               set_bit(BCH_FS_NEED_ANOTHER_GC, &c->flags);
+                               continue;
                        }
-               } else {
-                       bch2_btree_and_journal_iter_advance(&iter);
+
+                       if (ret)
+                               break;
+
+                       ret = bch2_gc_btree_init_recurse(c, child,
+                                                        target_depth);
+                       six_unlock_read(&child->c.lock);
+
+                       if (ret)
+                               break;
                }
        }
-
+fsck_err:
        bch2_bkey_buf_exit(&cur, c);
        bch2_bkey_buf_exit(&prev, c);
        bch2_btree_and_journal_iter_exit(&iter);
@@ -816,16 +901,15 @@ again:
        bch2_mark_allocator_buckets(c);
 
        c->gc_count++;
-out:
-       if (!ret &&
-           (test_bit(BCH_FS_FIXED_GENS, &c->flags) ||
-            (!iter && bch2_test_restart_gc))) {
+
+       if (test_bit(BCH_FS_NEED_ANOTHER_GC, &c->flags) ||
+           (!iter && bch2_test_restart_gc)) {
                /*
                 * XXX: make sure gens we fixed got saved
                 */
                if (iter++ <= 2) {
-                       bch_info(c, "Fixed gens, restarting mark and sweep:");
-                       clear_bit(BCH_FS_FIXED_GENS, &c->flags);
+                       bch_info(c, "Second GC pass needed, restarting:");
+                       clear_bit(BCH_FS_NEED_ANOTHER_GC, &c->flags);
                        __gc_pos_set(c, gc_phase(GC_PHASE_NOT_RUNNING));
 
                        percpu_down_write(&c->mark_lock);
@@ -840,7 +924,7 @@ out:
                bch_info(c, "Unable to fix bucket gens, looping");
                ret = -EINVAL;
        }
-
+out:
        if (!ret) {
                bch2_journal_block(&c->journal);
 
index 88a1d47e6e4bb31d682f0a88a6b1b915a5f2ce73..f470e0e233ce949c46480cb57242fae6d34074d3 100644 (file)
@@ -342,7 +342,8 @@ static int bch2_btree_and_journal_walk_recurse(struct bch_fs *c, struct btree *b
                        bch2_btree_and_journal_iter_advance(&iter);
 
                        child = bch2_btree_node_get_noiter(c, tmp.k,
-                                               b->c.btree_id, b->c.level - 1);
+                                               b->c.btree_id, b->c.level - 1,
+                                               false);
 
                        ret = PTR_ERR_OR_ZERO(child);
                        if (ret)
@@ -766,7 +767,8 @@ static int bch2_journal_replay(struct bch_fs *c,
        bch2_journal_flush_all_pins(j);
        return bch2_journal_error(j);
 err:
-       bch_err(c, "journal replay: error %d while replaying key", ret);
+       bch_err(c, "journal replay: error %d while replaying key at btree %s level %u",
+               ret, bch2_btree_ids[i->btree_id], i->level);
        return ret;
 }