From f26c67f4a7c4951a312547790b11066bc510822e Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 25 Jun 2023 18:04:46 -0400 Subject: [PATCH] bcachefs: Snapshot depth, skiplist fields This extends KEY_TYPE_snapshot to include some new fields: - depth, to indicate depth of this particular node from the root - skip[3], skiplist entries for quickly walking back up to the root These are to improve bch2_snapshot_is_ancestor(), making it O(ln(n)) instead of O(n) in the snapshot tree depth. Skiplist nodes are picked at random from the set of ancestor nodes, not some fixed fraction. This introduces bcachefs_metadata_version 1.1, snapshot_skiplists. Signed-off-by: Kent Overstreet --- fs/bcachefs/bcachefs_format.h | 6 +- fs/bcachefs/btree_iter.h | 8 ++ fs/bcachefs/recovery.c | 13 +- fs/bcachefs/subvolume.c | 261 ++++++++++++++++++++++++++++------ fs/bcachefs/subvolume.h | 33 ++++- fs/bcachefs/subvolume_types.h | 2 + 6 files changed, 267 insertions(+), 56 deletions(-) diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h index 274e57740d741..6d693e4def5d2 100644 --- a/fs/bcachefs/bcachefs_format.h +++ b/fs/bcachefs/bcachefs_format.h @@ -1148,6 +1148,8 @@ struct bch_snapshot { __le32 children[2]; __le32 subvol; __le32 tree; + __le32 depth; + __le32 skip[3]; }; LE32_BITMASK(BCH_SNAPSHOT_DELETED, struct bch_snapshot, flags, 0, 1) @@ -1625,7 +1627,9 @@ struct bch_sb_field_journal_seq_blacklist { x(snapshot_trees, BCH_VERSION(0, 29), \ RECOVERY_PASS_ALL_FSCK) \ x(major_minor, BCH_VERSION(1, 0), \ - 0) + 0) \ + x(snapshot_skiplists, BCH_VERSION(1, 1), \ + BIT_ULL(BCH_RECOVERY_PASS_check_snapshots)) enum bcachefs_metadata_version { bcachefs_metadata_version_min = 9, diff --git a/fs/bcachefs/btree_iter.h b/fs/bcachefs/btree_iter.h index 63260f68bc674..13e92452270eb 100644 --- a/fs/bcachefs/btree_iter.h +++ b/fs/bcachefs/btree_iter.h @@ -795,6 +795,14 @@ __bch2_btree_iter_peek_upto_and_restart(struct btree_trans *trans, (_do) ?: bch2_trans_commit(_trans, 
(_disk_res),\ (_journal_seq), (_commit_flags))) +#define for_each_btree_key_reverse_commit(_trans, _iter, _btree_id, \ + _start, _iter_flags, _k, \ + _disk_res, _journal_seq, _commit_flags,\ + _do) \ + for_each_btree_key_reverse(_trans, _iter, _btree_id, _start, _iter_flags, _k,\ + (_do) ?: bch2_trans_commit(_trans, (_disk_res),\ + (_journal_seq), (_commit_flags))) + #define for_each_btree_key_upto_commit(_trans, _iter, _btree_id, \ _start, _end, _iter_flags, _k, \ _disk_res, _journal_seq, _commit_flags,\ diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c index 0486ec9d281cf..c46297bd1cf95 100644 --- a/fs/bcachefs/recovery.c +++ b/fs/bcachefs/recovery.c @@ -594,10 +594,21 @@ static int bch2_journal_replay_key(struct btree_trans *trans, unsigned iter_flags = BTREE_ITER_INTENT| BTREE_ITER_NOT_EXTENTS; + unsigned update_flags = BTREE_TRIGGER_NORUN; int ret; + /* + * BTREE_UPDATE_KEY_CACHE_RECLAIM disables key cache lookup/update to + * keep the key cache coherent with the underlying btree. Nothing + * besides the allocator is doing updates yet so we don't need key cache + * coherency for non-alloc btrees, and key cache fills for snapshots + * btrees use BTREE_ITER_FILTER_SNAPSHOTS, which isn't available until + * the snapshots recovery pass runs. 
+ */ if (!k->level && k->btree_id == BTREE_ID_alloc) iter_flags |= BTREE_ITER_CACHED; + else + update_flags |= BTREE_UPDATE_KEY_CACHE_RECLAIM; bch2_trans_node_iter_init(trans, &iter, k->btree_id, k->k->k.p, BTREE_MAX_DEPTH, k->level, @@ -610,7 +621,7 @@ static int bch2_journal_replay_key(struct btree_trans *trans, if (k->overwritten) goto out; - ret = bch2_trans_update(trans, &iter, k->k, BTREE_TRIGGER_NORUN); + ret = bch2_trans_update(trans, &iter, k->k, update_flags); out: bch2_trans_iter_exit(trans, &iter); return ret; diff --git a/fs/bcachefs/subvolume.c b/fs/bcachefs/subvolume.c index f3852c433ca98..cdaaf49d3b3e3 100644 --- a/fs/bcachefs/subvolume.c +++ b/fs/bcachefs/subvolume.c @@ -8,8 +8,41 @@ #include "fs.h" #include "subvolume.h" +#include + static int bch2_subvolume_delete(struct btree_trans *, u32); +static inline u32 get_ancestor_below(struct bch_fs *c, u32 id, u32 ancestor) +{ + struct snapshot_t *s = snapshot_t(c, id); + + if (s->skip[2] <= ancestor) + return s->skip[2]; + if (s->skip[1] <= ancestor) + return s->skip[1]; + if (s->skip[0] <= ancestor) + return s->skip[0]; + return s->parent; +} + +bool bch2_snapshot_is_ancestor(struct bch_fs *c, u32 id, u32 ancestor) +{ + EBUG_ON(c->curr_recovery_pass <= BCH_RECOVERY_PASS_check_snapshots); + + while (id && id < ancestor) + id = get_ancestor_below(c, id, ancestor); + + return id == ancestor; +} + +static bool bch2_snapshot_is_ancestor_early(struct bch_fs *c, u32 id, u32 ancestor) +{ + while (id && id < ancestor) + id = snapshot_t(c, id)->parent; + + return id == ancestor; +} + /* Snapshot tree: */ void bch2_snapshot_tree_to_text(struct printbuf *out, struct bch_fs *c, @@ -95,6 +128,13 @@ void bch2_snapshot_to_text(struct printbuf *out, struct bch_fs *c, le32_to_cpu(s.v->children[1]), le32_to_cpu(s.v->subvol), le32_to_cpu(s.v->tree)); + + if (bkey_val_bytes(k.k) > offsetof(struct bch_snapshot, depth)) + prt_printf(out, " depth %u skiplist %u %u %u", + le32_to_cpu(s.v->depth), + le32_to_cpu(s.v->skip[0]), 
+ le32_to_cpu(s.v->skip[1]), + le32_to_cpu(s.v->skip[2])); } int bch2_snapshot_invalid(const struct bch_fs *c, struct bkey_s_c k, @@ -140,6 +180,25 @@ int bch2_snapshot_invalid(const struct bch_fs *c, struct bkey_s_c k, } } + if (bkey_val_bytes(k.k) > offsetof(struct bch_snapshot, skip)) { + if (le32_to_cpu(s.v->skip[0]) > le32_to_cpu(s.v->skip[1]) || + le32_to_cpu(s.v->skip[1]) > le32_to_cpu(s.v->skip[2])) { + prt_printf(err, "skiplist not normalized"); + return -BCH_ERR_invalid_bkey; + } + + for (i = 0; i < ARRAY_SIZE(s.v->skip); i++) { + id = le32_to_cpu(s.v->skip[i]); + + if (!id != !s.v->parent || + (s.v->parent && + id <= k.k->p.offset)) { + prt_printf(err, "bad skiplist node %u)", id); + return -BCH_ERR_invalid_bkey; + } + } + } + return 0; } @@ -165,6 +224,21 @@ int bch2_mark_snapshot(struct btree_trans *trans, t->children[1] = le32_to_cpu(s.v->children[1]); t->subvol = BCH_SNAPSHOT_SUBVOL(s.v) ? le32_to_cpu(s.v->subvol) : 0; t->tree = le32_to_cpu(s.v->tree); + + if (bkey_val_bytes(s.k) > offsetof(struct bch_snapshot, depth)) { + t->depth = le32_to_cpu(s.v->depth); + t->skip[0] = le32_to_cpu(s.v->skip[0]); + t->skip[1] = le32_to_cpu(s.v->skip[1]); + t->skip[2] = le32_to_cpu(s.v->skip[2]); + } else { + t->depth = 0; + t->skip[0] = 0; + t->skip[1] = 0; + t->skip[2] = 0; + } + + if (BCH_SNAPSHOT_DELETED(s.v)) + set_bit(BCH_FS_HAVE_DELETED_SNAPSHOTS, &c->flags); } else { t->parent = 0; t->children[0] = 0; @@ -370,9 +444,9 @@ static int check_snapshot_tree(struct btree_trans *trans, "snapshot tree points to missing subvolume:\n %s", (printbuf_reset(&buf), bch2_bkey_val_to_text(&buf, c, st.s_c), buf.buf)) || - fsck_err_on(!bch2_snapshot_is_ancestor(c, - le32_to_cpu(subvol.snapshot), - root_id), c, + fsck_err_on(!bch2_snapshot_is_ancestor_early(c, + le32_to_cpu(subvol.snapshot), + root_id), c, "snapshot tree points to subvolume that does not point to snapshot in this tree:\n %s", (printbuf_reset(&buf), bch2_bkey_val_to_text(&buf, c, st.s_c), buf.buf)) || @@ -441,7 
+515,47 @@ static int snapshot_tree_ptr_good(struct btree_trans *trans, if (ret) return ret; - return bch2_snapshot_is_ancestor(trans->c, snap_id, le32_to_cpu(s_t.root_snapshot)); + return bch2_snapshot_is_ancestor_early(trans->c, snap_id, le32_to_cpu(s_t.root_snapshot)); +} + +static u32 snapshot_skiplist_get(struct bch_fs *c, u32 id) +{ + struct snapshot_t *s; + + if (!id) + return 0; + + s = snapshot_t(c, id); + if (!s->parent) + return id; + + return bch2_snapshot_nth_parent(c, id, get_random_u32_below(s->depth)); +} + +static int snapshot_skiplist_good(struct btree_trans *trans, struct bch_snapshot s) +{ + struct bch_snapshot a; + unsigned i; + int ret; + + for (i = 0; i < 3; i++) { + if (!s.parent != !s.skip[i]) + return false; + + if (!s.parent) + continue; + + ret = snapshot_lookup(trans, le32_to_cpu(s.skip[i]), &a); + if (bch2_err_matches(ret, ENOENT)) + return false; + if (ret) + return ret; + + if (a.tree != s.tree) + return false; + } + + return true; } /* @@ -451,14 +565,15 @@ static int snapshot_tree_ptr_good(struct btree_trans *trans, */ static int snapshot_tree_ptr_repair(struct btree_trans *trans, struct btree_iter *iter, - struct bkey_s_c_snapshot *s) + struct bkey_s_c k, + struct bch_snapshot *s) { struct bch_fs *c = trans->c; struct btree_iter root_iter; struct bch_snapshot_tree s_t; struct bkey_s_c_snapshot root; struct bkey_i_snapshot *u; - u32 root_id = bch2_snapshot_root(c, s->k->p.offset), tree_id; + u32 root_id = bch2_snapshot_root(c, k.k->p.offset), tree_id; int ret; root = bch2_bkey_get_iter_typed(trans, &root_iter, @@ -484,32 +599,43 @@ static int snapshot_tree_ptr_repair(struct btree_trans *trans, goto err; u->v.tree = cpu_to_le32(tree_id); - if (s->k->p.snapshot == root_id) - *s = snapshot_i_to_s_c(u); + if (k.k->p.offset == root_id) + *s = u->v; } - if (s->k->p.snapshot != root_id) { - u = bch2_bkey_make_mut_typed(trans, iter, &s->s_c, 0, snapshot); + if (k.k->p.offset != root_id) { + u = bch2_bkey_make_mut_typed(trans, iter, &k, 0, 
snapshot); ret = PTR_ERR_OR_ZERO(u); if (ret) goto err; u->v.tree = cpu_to_le32(tree_id); - *s = snapshot_i_to_s_c(u); + *s = u->v; } err: bch2_trans_iter_exit(trans, &root_iter); return ret; } +static int cmp_le32(__le32 l, __le32 r) +{ + return cmp_int(le32_to_cpu(l), le32_to_cpu(r)); +} + static int check_snapshot(struct btree_trans *trans, struct btree_iter *iter, struct bkey_s_c k) { struct bch_fs *c = trans->c; - struct bkey_s_c_snapshot s; + struct bch_snapshot s; struct bch_subvolume subvol; struct bch_snapshot v; + struct bkey_i_snapshot *u; + u32 parent_id = bch2_snapshot_parent_early(c, k.k->p.offset); + u32 real_depth; + struct snapshot_t *parent = parent_id + ? snapshot_t(c, parent_id) + : NULL; struct printbuf buf = PRINTBUF; bool should_have_subvol; u32 i, id; @@ -518,94 +644,123 @@ static int check_snapshot(struct btree_trans *trans, if (k.k->type != KEY_TYPE_snapshot) return 0; - s = bkey_s_c_to_snapshot(k); - id = le32_to_cpu(s.v->parent); + memset(&s, 0, sizeof(s)); + memcpy(&s, k.v, bkey_val_bytes(k.k)); + + id = le32_to_cpu(s.parent); if (id) { ret = snapshot_lookup(trans, id, &v); if (bch2_err_matches(ret, ENOENT)) bch_err(c, "snapshot with nonexistent parent:\n %s", - (bch2_bkey_val_to_text(&buf, c, s.s_c), buf.buf)); + (bch2_bkey_val_to_text(&buf, c, k), buf.buf)); if (ret) goto err; - if (le32_to_cpu(v.children[0]) != s.k->p.offset && - le32_to_cpu(v.children[1]) != s.k->p.offset) { + if (le32_to_cpu(v.children[0]) != k.k->p.offset && + le32_to_cpu(v.children[1]) != k.k->p.offset) { bch_err(c, "snapshot parent %u missing pointer to child %llu", - id, s.k->p.offset); + id, k.k->p.offset); ret = -EINVAL; goto err; } } - for (i = 0; i < 2 && s.v->children[i]; i++) { - id = le32_to_cpu(s.v->children[i]); + for (i = 0; i < 2 && s.children[i]; i++) { + id = le32_to_cpu(s.children[i]); ret = snapshot_lookup(trans, id, &v); if (bch2_err_matches(ret, ENOENT)) bch_err(c, "snapshot node %llu has nonexistent child %u", - s.k->p.offset, id); + 
k.k->p.offset, id); if (ret) goto err; - if (le32_to_cpu(v.parent) != s.k->p.offset) { + if (le32_to_cpu(v.parent) != k.k->p.offset) { bch_err(c, "snapshot child %u has wrong parent (got %u should be %llu)", - id, le32_to_cpu(v.parent), s.k->p.offset); + id, le32_to_cpu(v.parent), k.k->p.offset); ret = -EINVAL; goto err; } } - should_have_subvol = BCH_SNAPSHOT_SUBVOL(s.v) && - !BCH_SNAPSHOT_DELETED(s.v); + should_have_subvol = BCH_SNAPSHOT_SUBVOL(&s) && + !BCH_SNAPSHOT_DELETED(&s); if (should_have_subvol) { - id = le32_to_cpu(s.v->subvol); + id = le32_to_cpu(s.subvol); ret = bch2_subvolume_get(trans, id, 0, false, &subvol); if (bch2_err_matches(ret, ENOENT)) bch_err(c, "snapshot points to nonexistent subvolume:\n %s", - (bch2_bkey_val_to_text(&buf, c, s.s_c), buf.buf)); + (bch2_bkey_val_to_text(&buf, c, k), buf.buf)); if (ret) goto err; - if (BCH_SNAPSHOT_SUBVOL(s.v) != (le32_to_cpu(subvol.snapshot) == s.k->p.offset)) { + if (BCH_SNAPSHOT_SUBVOL(&s) != (le32_to_cpu(subvol.snapshot) == k.k->p.offset)) { bch_err(c, "snapshot node %llu has wrong BCH_SNAPSHOT_SUBVOL", - s.k->p.offset); + k.k->p.offset); ret = -EINVAL; goto err; } } else { - if (fsck_err_on(s.v->subvol, c, "snapshot should not point to subvol:\n %s", - (bch2_bkey_val_to_text(&buf, c, s.s_c), buf.buf))) { - struct bkey_i_snapshot *u = bch2_trans_kmalloc(trans, sizeof(*u)); - + if (fsck_err_on(s.subvol, c, "snapshot should not point to subvol:\n %s", + (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) { + u = bch2_bkey_make_mut_typed(trans, iter, &k, 0, snapshot); ret = PTR_ERR_OR_ZERO(u); if (ret) goto err; - bkey_reassemble(&u->k_i, s.s_c); u->v.subvol = 0; - ret = bch2_trans_update(trans, iter, &u->k_i, 0); - if (ret) - goto err; - - s = snapshot_i_to_s_c(u); + s = u->v; } } - ret = snapshot_tree_ptr_good(trans, s.k->p.offset, le32_to_cpu(s.v->tree)); + ret = snapshot_tree_ptr_good(trans, k.k->p.offset, le32_to_cpu(s.tree)); if (ret < 0) goto err; if (fsck_err_on(!ret, c, "snapshot points to 
missing/incorrect tree:\n %s", - (bch2_bkey_val_to_text(&buf, c, s.s_c), buf.buf))) { - ret = snapshot_tree_ptr_repair(trans, iter, &s); + (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) { + ret = snapshot_tree_ptr_repair(trans, iter, k, &s); if (ret) goto err; } ret = 0; - if (BCH_SNAPSHOT_DELETED(s.v)) - set_bit(BCH_FS_HAVE_DELETED_SNAPSHOTS, &c->flags); + real_depth = parent ? parent->depth + 1 : 0; + + if (le32_to_cpu(s.depth) != real_depth && + (c->sb.version_upgrade_complete < bcachefs_metadata_version_snapshot_skiplists || + fsck_err(c, "snapshot with incorrect depth field, should be %u:\n %s", + real_depth, (bch2_bkey_val_to_text(&buf, c, k), buf.buf)))) { + u = bch2_bkey_make_mut_typed(trans, iter, &k, 0, snapshot); + ret = PTR_ERR_OR_ZERO(u); + if (ret) + goto err; + + u->v.depth = cpu_to_le32(real_depth); + s = u->v; + } + + ret = snapshot_skiplist_good(trans, s); + if (ret < 0) + goto err; + + if (!ret && + (c->sb.version_upgrade_complete < bcachefs_metadata_version_snapshot_skiplists || + fsck_err(c, "snapshot with bad skiplist field:\n %s", + (bch2_bkey_val_to_text(&buf, c, k), buf.buf)))) { + u = bch2_bkey_make_mut_typed(trans, iter, &k, 0, snapshot); + ret = PTR_ERR_OR_ZERO(u); + if (ret) + goto err; + + for (i = 0; i < ARRAY_SIZE(u->v.skip); i++) + u->v.skip[i] = cpu_to_le32(snapshot_skiplist_get(c, parent_id)); + + bubble_sort(u->v.skip, ARRAY_SIZE(u->v.skip), cmp_le32); + s = u->v; + } + ret = 0; err: fsck_err: printbuf_exit(&buf); @@ -618,9 +773,13 @@ int bch2_check_snapshots(struct bch_fs *c) struct bkey_s_c k; int ret; + /* + * We iterate backwards as checking/fixing the depth field requires that + * the parent's depth already be correct: + */ ret = bch2_trans_run(c, - for_each_btree_key_commit(&trans, iter, - BTREE_ID_snapshots, POS_MIN, + for_each_btree_key_reverse_commit(&trans, iter, + BTREE_ID_snapshots, POS_MAX, BTREE_ITER_PREFETCH, k, NULL, NULL, BTREE_INSERT_LAZY_RW|BTREE_INSERT_NOFAIL, check_snapshot(&trans, &iter, k))); @@ -847,10 
+1006,12 @@ static int create_snapids(struct btree_trans *trans, u32 parent, u32 tree, u32 *snapshot_subvols, unsigned nr_snapids) { + struct bch_fs *c = trans->c; struct btree_iter iter; struct bkey_i_snapshot *n; struct bkey_s_c k; - unsigned i; + unsigned i, j; + u32 depth = parent ? snapshot_t(c, parent)->depth + 1 : 0; int ret; bch2_trans_iter_init(trans, &iter, BTREE_ID_snapshots, @@ -880,6 +1041,12 @@ static int create_snapids(struct btree_trans *trans, u32 parent, u32 tree, n->v.parent = cpu_to_le32(parent); n->v.subvol = cpu_to_le32(snapshot_subvols[i]); n->v.tree = cpu_to_le32(tree); + n->v.depth = cpu_to_le32(depth); + + for (j = 0; j < ARRAY_SIZE(n->v.skip); j++) + n->v.skip[j] = cpu_to_le32(snapshot_skiplist_get(c, parent)); + + bubble_sort(n->v.skip, ARRAY_SIZE(n->v.skip), cmp_le32); SET_BCH_SNAPSHOT_SUBVOL(&n->v, true); ret = bch2_mark_snapshot(trans, BTREE_ID_snapshots, 0, diff --git a/fs/bcachefs/subvolume.h b/fs/bcachefs/subvolume.h index daa9a6b0819bb..ab0b4a6de255f 100644 --- a/fs/bcachefs/subvolume.h +++ b/fs/bcachefs/subvolume.h @@ -37,9 +37,34 @@ static inline struct snapshot_t *snapshot_t(struct bch_fs *c, u32 id) return genradix_ptr(&c->snapshots, U32_MAX - id); } +static inline u32 bch2_snapshot_parent_early(struct bch_fs *c, u32 id) +{ + return snapshot_t(c, id)->parent; +} + static inline u32 bch2_snapshot_parent(struct bch_fs *c, u32 id) { +#ifdef CONFIG_BCACHEFS_DEBUG + u32 parent = snapshot_t(c, id)->parent; + + if (parent && + snapshot_t(c, id)->depth != snapshot_t(c, parent)->depth + 1) + panic("id %u depth=%u parent %u depth=%u\n", + id, snapshot_t(c, id)->depth, + parent, snapshot_t(c, parent)->depth); + + return parent; +#else return snapshot_t(c, id)->parent; +#endif +} + +static inline u32 bch2_snapshot_nth_parent(struct bch_fs *c, u32 id, u32 n) +{ + while (n--) + id = bch2_snapshot_parent(c, id); + + return id; } static inline u32 bch2_snapshot_root(struct bch_fs *c, u32 id) @@ -84,13 +109,7 @@ static inline u32 
bch2_snapshot_sibling(struct bch_fs *c, u32 id) return 0; } -static inline bool bch2_snapshot_is_ancestor(struct bch_fs *c, u32 id, u32 ancestor) -{ - while (id && id < ancestor) - id = bch2_snapshot_parent(c, id); - - return id == ancestor; -} +bool bch2_snapshot_is_ancestor(struct bch_fs *, u32, u32); static inline bool bch2_snapshot_has_children(struct bch_fs *c, u32 id) { diff --git a/fs/bcachefs/subvolume_types.h b/fs/bcachefs/subvolume_types.h index c6c1cbad97816..750d975ac4682 100644 --- a/fs/bcachefs/subvolume_types.h +++ b/fs/bcachefs/subvolume_types.h @@ -8,6 +8,8 @@ typedef DARRAY(u32) snapshot_id_list; struct snapshot_t { u32 parent; + u32 skip[3]; + u32 depth; u32 children[2]; u32 subvol; /* Nonzero only if a subvolume points to this node: */ u32 tree; -- 2.30.2