bcachefs: Start using bpos.snapshot field
author: Kent Overstreet <kent.overstreet@gmail.com>
Wed, 24 Mar 2021 22:02:16 +0000 (18:02 -0400)
committer: Kent Overstreet <kent.overstreet@linux.dev>
Sun, 22 Oct 2023 21:08:57 +0000 (17:08 -0400)
This patch starts treating the bpos.snapshot field like part of the key
in the btree code:

* bpos_successor() and bpos_predecessor() now include the snapshot field
* Keys in btrees that will be using snapshots (extents, inodes, dirents
  and xattrs) now always have their snapshot field set to U32_MAX

The btree iterator code gets a new flag, BTREE_ITER_ALL_SNAPSHOTS, that
determines whether we're iterating over keys in all snapshots or not -
internally, this controls whether bkey_(successor|predecessor)
increment/decrement the snapshot field, or only the higher bits of the
key.

We add a new member to struct btree_iter, iter->snapshot: when
BTREE_ITER_ALL_SNAPSHOTS is not set, iter->pos.snapshot should always
equal iter->snapshot, which will be 0 for btrees that don't use
snapshots, and always U32_MAX for btrees that will use snapshots
(until we enable snapshot creation).

This patch also introduces a new metadata version number, and compat
code for reading from/writing to older versions - this isn't a forced
upgrade (yet).

Signed-off-by: Kent Overstreet <kent.overstreet@gmail.com>
Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
22 files changed:
fs/bcachefs/bcachefs_format.h
fs/bcachefs/bkey.c
fs/bcachefs/bkey.h
fs/bcachefs/bkey_methods.c
fs/bcachefs/bset.c
fs/bcachefs/btree_cache.c
fs/bcachefs/btree_gc.c
fs/bcachefs/btree_io.c
fs/bcachefs/btree_io.h
fs/bcachefs/btree_iter.c
fs/bcachefs/btree_iter.h
fs/bcachefs/btree_types.h
fs/bcachefs/btree_update_interior.c
fs/bcachefs/btree_update_leaf.c
fs/bcachefs/debug.c
fs/bcachefs/extents.c
fs/bcachefs/fsck.c
fs/bcachefs/inode.c
fs/bcachefs/io.c
fs/bcachefs/journal_io.c
fs/bcachefs/recovery.c
fs/bcachefs/tests.c

index 111f7d3c312e20c915b0c6d9eaf612317a085a03..2172d3cf368092646c8f13e54e68190e15fa906f 100644 (file)
@@ -142,19 +142,18 @@ struct bpos {
 #define KEY_SNAPSHOT_MAX               ((__u32)~0U)
 #define KEY_SIZE_MAX                   ((__u32)~0U)
 
-static inline struct bpos POS(__u64 inode, __u64 offset)
+static inline struct bpos SPOS(__u64 inode, __u64 offset, __u32 snapshot)
 {
-       struct bpos ret;
-
-       ret.inode       = inode;
-       ret.offset      = offset;
-       ret.snapshot    = 0;
-
-       return ret;
+       return (struct bpos) {
+               .inode          = inode,
+               .offset         = offset,
+               .snapshot       = snapshot,
+       };
 }
 
-#define POS_MIN                                POS(0, 0)
-#define POS_MAX                                POS(KEY_INODE_MAX, KEY_OFFSET_MAX)
+#define POS_MIN                                SPOS(0, 0, 0)
+#define POS_MAX                                SPOS(KEY_INODE_MAX, KEY_OFFSET_MAX, KEY_SNAPSHOT_MAX)
+#define POS(_inode, _offset)           SPOS(_inode, _offset, 0)
 
 /* Empty placeholder struct, for container_of() */
 struct bch_val {
@@ -1208,7 +1207,8 @@ enum bcachefs_metadata_version {
        bcachefs_metadata_version_new_versioning        = 10,
        bcachefs_metadata_version_bkey_renumber         = 10,
        bcachefs_metadata_version_inode_btree_change    = 11,
-       bcachefs_metadata_version_max                   = 12,
+       bcachefs_metadata_version_snapshot              = 12,
+       bcachefs_metadata_version_max                   = 13,
 };
 
 #define bcachefs_metadata_version_current      (bcachefs_metadata_version_max - 1)
@@ -1749,7 +1749,7 @@ struct btree_node {
        /* Closed interval: */
        struct bpos             min_key;
        struct bpos             max_key;
-       struct bch_extent_ptr   ptr;
+       struct bch_extent_ptr   _ptr; /* not used anymore */
        struct bkey_format      format;
 
        union {
index 8b2befac95d44c0baeb243330e7cfcb406ba4ac0..a0379f980f7e806556da76ec246601af8f89163f 100644 (file)
@@ -617,15 +617,19 @@ const char *bch2_bkey_format_validate(struct bkey_format *f)
                return "incorrect number of fields";
 
        for (i = 0; i < f->nr_fields; i++) {
+               unsigned unpacked_bits = bch2_bkey_format_current.bits_per_field[i];
+               u64 unpacked_mask = ~((~0ULL << 1) << (unpacked_bits - 1));
                u64 field_offset = le64_to_cpu(f->field_offset[i]);
 
-               if (f->bits_per_field[i] > 64)
+               if (f->bits_per_field[i] > unpacked_bits)
                        return "field too large";
 
-               if (field_offset &&
-                   (f->bits_per_field[i] == 64 ||
-                   (field_offset + ((1ULL << f->bits_per_field[i]) - 1) <
-                    field_offset)))
+               if ((f->bits_per_field[i] == unpacked_bits) && field_offset)
+                       return "offset + bits overflow";
+
+               if (((field_offset + ((1ULL << f->bits_per_field[i]) - 1)) &
+                    unpacked_mask) <
+                   field_offset)
                        return "offset + bits overflow";
 
                bits += f->bits_per_field[i];
@@ -1126,11 +1130,12 @@ void bch2_bkey_pack_test(void)
        struct bkey_packed p;
 
        struct bkey_format test_format = {
-               .key_u64s       = 2,
+               .key_u64s       = 3,
                .nr_fields      = BKEY_NR_FIELDS,
                .bits_per_field = {
                        13,
                        64,
+                       32,
                },
        };
 
index df23c5b48969666bd11c04181d84edad3cb39853..72b4267031d8d67906a42e2e2f21c80b0b7b9e0a 100644 (file)
@@ -258,24 +258,46 @@ static inline unsigned bkey_format_key_bits(const struct bkey_format *format)
                format->bits_per_field[BKEY_FIELD_SNAPSHOT];
 }
 
-static inline struct bpos bkey_successor(struct bpos p)
+static inline struct bpos bpos_successor(struct bpos p)
 {
-       struct bpos ret = p;
+       if (!++p.snapshot &&
+           !++p.offset &&
+           !++p.inode)
+               BUG();
 
-       if (!++ret.offset)
-               BUG_ON(!++ret.inode);
+       return p;
+}
 
-       return ret;
+static inline struct bpos bpos_predecessor(struct bpos p)
+{
+       if (!p.snapshot-- &&
+           !p.offset-- &&
+           !p.inode--)
+               BUG();
+
+       return p;
 }
 
-static inline struct bpos bkey_predecessor(struct bpos p)
+static inline struct bpos bpos_nosnap_successor(struct bpos p)
 {
-       struct bpos ret = p;
+       p.snapshot = 0;
 
-       if (!ret.offset--)
-               BUG_ON(!ret.inode--);
+       if (!++p.offset &&
+           !++p.inode)
+               BUG();
 
-       return ret;
+       return p;
+}
+
+static inline struct bpos bpos_nosnap_predecessor(struct bpos p)
+{
+       p.snapshot = 0;
+
+       if (!p.offset-- &&
+           !p.inode--)
+               BUG();
+
+       return p;
 }
 
 static inline u64 bkey_start_offset(const struct bkey *k)
index 5e7eadeb3b573d5f4525a5bb4424c69fed84816d..6fe95b802e130060b48caa6ad54b2f2c38620c9b 100644 (file)
@@ -119,9 +119,16 @@ const char *__bch2_bkey_invalid(struct bch_fs *c, struct bkey_s_c k,
                        return "nonzero size field";
        }
 
-       if (k.k->p.snapshot)
+       if (type != BKEY_TYPE_btree &&
+           !btree_type_has_snapshots(type) &&
+           k.k->p.snapshot)
                return "nonzero snapshot";
 
+       if (type != BKEY_TYPE_btree &&
+           btree_type_has_snapshots(type) &&
+           k.k->p.snapshot != U32_MAX)
+               return "invalid snapshot field";
+
        if (type != BKEY_TYPE_btree &&
            !bkey_cmp(k.k->p, POS_MAX))
                return "POS_MAX key";
@@ -310,14 +317,15 @@ void __bch2_bkey_compat(unsigned level, enum btree_id btree_id,
        const struct bkey_ops *ops;
        struct bkey uk;
        struct bkey_s u;
+       unsigned nr_compat = 5;
        int i;
 
        /*
         * Do these operations in reverse order in the write path:
         */
 
-       for (i = 0; i < 4; i++)
-       switch (!write ? i : 3 - i) {
+       for (i = 0; i < nr_compat; i++)
+       switch (!write ? i : nr_compat - 1 - i) {
        case 0:
                if (big_endian != CPU_BIG_ENDIAN)
                        bch2_bkey_swab_key(f, k);
@@ -351,6 +359,28 @@ void __bch2_bkey_compat(unsigned level, enum btree_id btree_id,
                }
                break;
        case 3:
+               if (version < bcachefs_metadata_version_snapshot &&
+                   (level || btree_type_has_snapshots(btree_id))) {
+                       struct bkey_i *u = packed_to_bkey(k);
+
+                       if (u) {
+                               u->k.p.snapshot = write
+                                       ? 0 : U32_MAX;
+                       } else {
+                               u64 min_packed = f->field_offset[BKEY_FIELD_SNAPSHOT];
+                               u64 max_packed = min_packed +
+                                       ~(~0ULL << f->bits_per_field[BKEY_FIELD_SNAPSHOT]);
+
+                               uk = __bch2_bkey_unpack_key(f, k);
+                               uk.p.snapshot = write
+                                       ? min_packed : min_t(u64, U32_MAX, max_packed);
+
+                               BUG_ON(!bch2_bkey_pack_key(k, &uk, f));
+                       }
+               }
+
+               break;
+       case 4:
                if (!bkey_packed(k)) {
                        u = bkey_i_to_s(packed_to_bkey(k));
                } else {
index 5746199dfafb33da8df2e538992256ead9a16610..de4dc2fac1d632ff5d5cf48290f10f46ed792f19 100644 (file)
@@ -1438,7 +1438,7 @@ static void btree_node_iter_init_pack_failed(struct btree_node_iter *iter,
  *    to the search key is going to have 0 sectors after the search key.
  *
  *    But this does mean that we can't just search for
- *    bkey_successor(start_of_range) to get the first extent that overlaps with
+ *    bpos_successor(start_of_range) to get the first extent that overlaps with
  *    the range we want - if we're unlucky and there's an extent that ends
  *    exactly where we searched, then there could be a deleted key at the same
  *    position and we'd get that when we search instead of the preceding extent
index 63b8423fa87c4ef47d6df1755071e5a2ccdad306..85ac08b9270a767a855c8e8eb20f14d84ac09d03 100644 (file)
@@ -1018,7 +1018,7 @@ out:
                if (sib != btree_prev_sib)
                        swap(n1, n2);
 
-               if (bpos_cmp(bkey_successor(n1->key.k.p),
+               if (bpos_cmp(bpos_successor(n1->key.k.p),
                             n2->data->min_key)) {
                        char buf1[200], buf2[200];
 
index 2710e4b35da31eb09bd5d690ccc6e6455681a41d..842840664562d32afe7f730212daaff7ddc61e25 100644 (file)
@@ -64,7 +64,7 @@ static int bch2_gc_check_topology(struct bch_fs *c,
        struct bpos node_end    = b->data->max_key;
        struct bpos expected_start = bkey_deleted(&prev->k->k)
                ? node_start
-               : bkey_successor(prev->k->k.p);
+               : bpos_successor(prev->k->k.p);
        char buf1[200], buf2[200];
        bool update_min = false;
        bool update_max = false;
@@ -1187,7 +1187,9 @@ static int bch2_gc_btree_gens(struct bch_fs *c, enum btree_id btree_id)
        bch2_trans_init(&trans, c, 0, 0);
 
        iter = bch2_trans_get_iter(&trans, btree_id, POS_MIN,
-                                  BTREE_ITER_PREFETCH);
+                                  BTREE_ITER_PREFETCH|
+                                  BTREE_ITER_NOT_EXTENTS|
+                                  BTREE_ITER_ALL_SNAPSHOTS);
 
        while ((k = bch2_btree_iter_peek(iter)).k &&
               !(ret = bkey_err(k))) {
@@ -1405,7 +1407,7 @@ static void bch2_coalesce_nodes(struct bch_fs *c, struct btree_iter *iter,
                        n1->key.k.p = n1->data->max_key =
                                bkey_unpack_pos(n1, last);
 
-                       n2->data->min_key = bkey_successor(n1->data->max_key);
+                       n2->data->min_key = bpos_successor(n1->data->max_key);
 
                        memcpy_u64s(vstruct_last(s1),
                                    s2->start, u64s);
index 468b1a294ce9c59c16c0362cb79a7bf041e6b655..bc09f937742582c86910605e571d467136f34d74 100644 (file)
@@ -612,12 +612,6 @@ static int validate_bset(struct bch_fs *c, struct bch_dev *ca,
                             BTREE_ERR_MUST_RETRY, c, ca, b, i,
                             "incorrect level");
 
-               if (BSET_BIG_ENDIAN(i) != CPU_BIG_ENDIAN) {
-                       u64 *p = (u64 *) &bn->ptr;
-
-                       *p = swab64(*p);
-               }
-
                if (!write)
                        compat_btree_node(b->c.level, b->c.btree_id, version,
                                          BSET_BIG_ENDIAN(i), write, bn);
@@ -1328,8 +1322,8 @@ static int validate_bset_for_write(struct bch_fs *c, struct btree *b,
        if (bch2_bkey_invalid(c, bkey_i_to_s_c(&b->key), BKEY_TYPE_btree))
                return -1;
 
-       ret = validate_bset(c, NULL, b, i, sectors, WRITE, false) ?:
-               validate_bset_keys(c, b, i, &whiteout_u64s, WRITE, false);
+       ret = validate_bset_keys(c, b, i, &whiteout_u64s, WRITE, false) ?:
+               validate_bset(c, NULL, b, i, sectors, WRITE, false);
        if (ret) {
                bch2_inconsistent_error(c);
                dump_stack();
@@ -1482,7 +1476,7 @@ void __bch2_btree_node_write(struct bch_fs *c, struct btree *b,
                validate_before_checksum = true;
 
        /* validate_bset will be modifying: */
-       if (le16_to_cpu(i->version) <= bcachefs_metadata_version_inode_btree_change)
+       if (le16_to_cpu(i->version) < bcachefs_metadata_version_current)
                validate_before_checksum = true;
 
        /* if we're going to be encrypting, check metadata validity first: */
index f155a6cc175548c1315795d9cf1a858701a6caa8..9c14cd30a09e18d112c94e680cdbe80279b0852b 100644 (file)
@@ -189,8 +189,8 @@ void bch2_btree_flush_all_writes(struct bch_fs *);
 void bch2_dirty_btree_nodes_to_text(struct printbuf *, struct bch_fs *);
 
 static inline void compat_bformat(unsigned level, enum btree_id btree_id,
-                                unsigned version, unsigned big_endian,
-                                int write, struct bkey_format *f)
+                                 unsigned version, unsigned big_endian,
+                                 int write, struct bkey_format *f)
 {
        if (version < bcachefs_metadata_version_inode_btree_change &&
            btree_id == BTREE_ID_inodes) {
@@ -199,6 +199,16 @@ static inline void compat_bformat(unsigned level, enum btree_id btree_id,
                swap(f->field_offset[BKEY_FIELD_INODE],
                     f->field_offset[BKEY_FIELD_OFFSET]);
        }
+
+       if (version < bcachefs_metadata_version_snapshot &&
+           (level || btree_type_has_snapshots(btree_id))) {
+               u64 max_packed =
+                       ~(~0ULL << f->bits_per_field[BKEY_FIELD_SNAPSHOT]);
+
+               f->field_offset[BKEY_FIELD_SNAPSHOT] = write
+                       ? 0
+                       : U32_MAX - max_packed;
+       }
 }
 
 static inline void compat_bpos(unsigned level, enum btree_id btree_id,
@@ -222,16 +232,24 @@ static inline void compat_btree_node(unsigned level, enum btree_id btree_id,
            btree_node_type_is_extents(btree_id) &&
            bpos_cmp(bn->min_key, POS_MIN) &&
            write)
-               bn->min_key = bkey_predecessor(bn->min_key);
+               bn->min_key = bpos_nosnap_predecessor(bn->min_key);
+
+       if (version < bcachefs_metadata_version_snapshot &&
+           write)
+               bn->max_key.snapshot = 0;
 
        compat_bpos(level, btree_id, version, big_endian, write, &bn->min_key);
        compat_bpos(level, btree_id, version, big_endian, write, &bn->max_key);
 
+       if (version < bcachefs_metadata_version_snapshot &&
+           !write)
+               bn->max_key.snapshot = U32_MAX;
+
        if (version < bcachefs_metadata_version_inode_btree_change &&
            btree_node_type_is_extents(btree_id) &&
            bpos_cmp(bn->min_key, POS_MIN) &&
            !write)
-               bn->min_key = bkey_successor(bn->min_key);
+               bn->min_key = bpos_nosnap_successor(bn->min_key);
 }
 
 #endif /* _BCACHEFS_BTREE_IO_H */
index 8c923aa01ea14fa83deb09815ed1eae2baaa2de3..972486a1f7242ea0ca8100bce7c989c4dd01e9ac 100644 (file)
 
 static void btree_iter_set_search_pos(struct btree_iter *, struct bpos);
 
+static inline struct bpos bkey_successor(struct btree_iter *iter, struct bpos p)
+{
+       EBUG_ON(btree_iter_type(iter) == BTREE_ITER_NODES);
+
+       /* Are we iterating over keys in all snapshots? */
+       if (iter->flags & BTREE_ITER_ALL_SNAPSHOTS) {
+               p = bpos_successor(p);
+       } else {
+               p = bpos_nosnap_successor(p);
+               p.snapshot = iter->snapshot;
+       }
+
+       return p;
+}
+
+static inline struct bpos bkey_predecessor(struct btree_iter *iter, struct bpos p)
+{
+       EBUG_ON(btree_iter_type(iter) == BTREE_ITER_NODES);
+
+       /* Are we iterating over keys in all snapshots? */
+       if (iter->flags & BTREE_ITER_ALL_SNAPSHOTS) {
+               p = bpos_predecessor(p);
+       } else {
+               p = bpos_nosnap_predecessor(p);
+               p.snapshot = iter->snapshot;
+       }
+
+       return p;
+}
+
 static inline bool is_btree_node(struct btree_iter *iter, unsigned l)
 {
        return l < BTREE_MAX_DEPTH &&
@@ -30,7 +60,7 @@ static inline struct bpos btree_iter_search_key(struct btree_iter *iter)
 
        if ((iter->flags & BTREE_ITER_IS_EXTENTS) &&
            bkey_cmp(pos, POS_MAX))
-               pos = bkey_successor(pos);
+               pos = bkey_successor(iter, pos);
        return pos;
 }
 
@@ -591,10 +621,24 @@ err:
 
 static void bch2_btree_iter_verify(struct btree_iter *iter)
 {
+       enum btree_iter_type type = btree_iter_type(iter);
        unsigned i;
 
        EBUG_ON(iter->btree_id >= BTREE_ID_NR);
 
+       BUG_ON(!(iter->flags & BTREE_ITER_ALL_SNAPSHOTS) &&
+              iter->pos.snapshot != iter->snapshot);
+
+       BUG_ON((iter->flags & BTREE_ITER_IS_EXTENTS) &&
+              (iter->flags & BTREE_ITER_ALL_SNAPSHOTS));
+
+       BUG_ON(type == BTREE_ITER_NODES &&
+              !(iter->flags & BTREE_ITER_ALL_SNAPSHOTS));
+
+       BUG_ON(type != BTREE_ITER_NODES &&
+              (iter->flags & BTREE_ITER_ALL_SNAPSHOTS) &&
+              !btree_type_has_snapshots(iter->btree_id));
+
        bch2_btree_iter_verify_locks(iter);
 
        for (i = 0; i < BTREE_MAX_DEPTH; i++)
@@ -605,6 +649,9 @@ static void bch2_btree_iter_verify_entry_exit(struct btree_iter *iter)
 {
        enum btree_iter_type type = btree_iter_type(iter);
 
+       BUG_ON(!(iter->flags & BTREE_ITER_ALL_SNAPSHOTS) &&
+              iter->pos.snapshot != iter->snapshot);
+
        BUG_ON((type == BTREE_ITER_KEYS ||
                type == BTREE_ITER_CACHED) &&
               (bkey_cmp(iter->pos, bkey_start_pos(&iter->k)) < 0 ||
@@ -1434,7 +1481,7 @@ struct btree *bch2_btree_iter_next_node(struct btree_iter *iter)
                 * Haven't gotten to the end of the parent node: go back down to
                 * the next child node
                 */
-               btree_iter_set_search_pos(iter, bkey_successor(iter->pos));
+               btree_iter_set_search_pos(iter, bpos_successor(iter->pos));
 
                /* Unlock to avoid screwing up our lock invariants: */
                btree_node_unlock(iter, iter->level);
@@ -1508,7 +1555,7 @@ inline bool bch2_btree_iter_advance(struct btree_iter *iter)
        bool ret = bpos_cmp(pos, POS_MAX) != 0;
 
        if (ret && !(iter->flags & BTREE_ITER_IS_EXTENTS))
-               pos = bkey_successor(pos);
+               pos = bkey_successor(iter, pos);
        bch2_btree_iter_set_pos(iter, pos);
        return ret;
 }
@@ -1519,7 +1566,7 @@ inline bool bch2_btree_iter_rewind(struct btree_iter *iter)
        bool ret = bpos_cmp(pos, POS_MIN) != 0;
 
        if (ret && !(iter->flags & BTREE_ITER_IS_EXTENTS))
-               pos = bkey_predecessor(pos);
+               pos = bkey_predecessor(iter, pos);
        bch2_btree_iter_set_pos(iter, pos);
        return ret;
 }
@@ -1535,7 +1582,7 @@ static inline bool btree_iter_set_pos_to_next_leaf(struct btree_iter *iter)
         * btree, in that case we want iter->pos to reflect that:
         */
        if (ret)
-               btree_iter_set_search_pos(iter, bkey_successor(next_pos));
+               btree_iter_set_search_pos(iter, bpos_successor(next_pos));
        else
                bch2_btree_iter_set_pos(iter, POS_MAX);
 
@@ -1548,7 +1595,7 @@ static inline bool btree_iter_set_pos_to_prev_leaf(struct btree_iter *iter)
        bool ret = bpos_cmp(next_pos, POS_MIN) != 0;
 
        if (ret)
-               btree_iter_set_search_pos(iter, bkey_predecessor(next_pos));
+               btree_iter_set_search_pos(iter, bpos_predecessor(next_pos));
        else
                bch2_btree_iter_set_pos(iter, POS_MIN);
 
@@ -1594,13 +1641,13 @@ static inline struct bkey_s_c __btree_iter_peek(struct btree_iter *iter, bool wi
                k = btree_iter_level_peek(iter, &iter->l[0]);
 
                if (next_update &&
-                   bkey_cmp(next_update->k.p, iter->real_pos) <= 0)
+                   bpos_cmp(next_update->k.p, iter->real_pos) <= 0)
                        k = bkey_i_to_s_c(next_update);
 
                if (likely(k.k)) {
                        if (bkey_deleted(k.k)) {
                                btree_iter_set_search_pos(iter,
-                                               bkey_successor(k.k->p));
+                                               bkey_successor(iter, k.k->p));
                                continue;
                        }
 
@@ -1739,7 +1786,7 @@ __bch2_btree_iter_peek_slot_extents(struct btree_iter *iter)
                if (iter->pos.inode == KEY_INODE_MAX)
                        return bkey_s_c_null;
 
-               bch2_btree_iter_set_pos(iter, bkey_successor(iter->pos));
+               bch2_btree_iter_set_pos(iter, bkey_successor(iter, iter->pos));
        }
 
        pos = iter->pos;
@@ -1973,6 +2020,14 @@ struct btree_iter *__bch2_trans_get_iter(struct btree_trans *trans,
 {
        struct btree_iter *iter, *best = NULL;
 
+       if ((flags & BTREE_ITER_TYPE) != BTREE_ITER_NODES &&
+           !btree_type_has_snapshots(btree_id))
+               flags &= ~BTREE_ITER_ALL_SNAPSHOTS;
+
+       if (!(flags & BTREE_ITER_ALL_SNAPSHOTS))
+               pos.snapshot = btree_type_has_snapshots(btree_id)
+                       ? U32_MAX : 0;
+
        /* We always want a fresh iterator for node iterators: */
        if ((flags & BTREE_ITER_TYPE) == BTREE_ITER_NODES)
                goto alloc_iter;
@@ -2007,11 +2062,14 @@ alloc_iter:
 
        if ((flags & BTREE_ITER_TYPE) != BTREE_ITER_NODES &&
            btree_node_type_is_extents(btree_id) &&
-           !(flags & BTREE_ITER_NOT_EXTENTS))
+           !(flags & BTREE_ITER_NOT_EXTENTS) &&
+           !(flags & BTREE_ITER_ALL_SNAPSHOTS))
                flags |= BTREE_ITER_IS_EXTENTS;
 
        iter->flags = flags;
 
+       iter->snapshot = pos.snapshot;
+
        if (!(iter->flags & BTREE_ITER_INTENT))
                bch2_btree_iter_downgrade(iter);
        else if (!iter->locks_want)
@@ -2034,6 +2092,7 @@ struct btree_iter *bch2_trans_get_node_iter(struct btree_trans *trans,
                __bch2_trans_get_iter(trans, btree_id, pos,
                                       BTREE_ITER_NODES|
                                       BTREE_ITER_NOT_EXTENTS|
+                                      BTREE_ITER_ALL_SNAPSHOTS|
                                       flags);
        unsigned i;
 
index 176661b3b87954290bc43df25f6893c77930c594..7585f989ad505ba151185b1159df3d19952ad097 100644 (file)
@@ -172,6 +172,9 @@ bool bch2_btree_iter_rewind(struct btree_iter *);
 
 static inline void bch2_btree_iter_set_pos(struct btree_iter *iter, struct bpos new_pos)
 {
+       if (!(iter->flags & BTREE_ITER_ALL_SNAPSHOTS))
+               new_pos.snapshot = iter->snapshot;
+
        bkey_init(&iter->k);
        iter->k.p = iter->pos = new_pos;
 }
index bcd8db34d7ee50eaac9dd14898f2f547d0f65a90..0bcf171597442abfee45d2f330c18524a9e10888 100644 (file)
@@ -216,6 +216,7 @@ enum btree_iter_type {
 #define BTREE_ITER_CACHED_NOFILL       (1 << 9)
 #define BTREE_ITER_CACHED_NOCREATE     (1 << 10)
 #define BTREE_ITER_NOT_EXTENTS         (1 << 11)
+#define BTREE_ITER_ALL_SNAPSHOTS       (1 << 12)
 
 enum btree_iter_uptodate {
        BTREE_ITER_UPTODATE             = 0,
@@ -245,6 +246,8 @@ struct btree_iter {
        /* what we're searching for/what the iterator actually points to: */
        struct bpos             real_pos;
        struct bpos             pos_after_commit;
+       /* When we're filtering by snapshot, the snapshot ID we're looking for: */
+       unsigned                snapshot;
 
        u16                     flags;
        u8                      idx;
@@ -329,7 +332,7 @@ struct bkey_cached {
 struct btree_insert_entry {
        unsigned                trigger_flags;
        u8                      bkey_type;
-       u8                      btree_id;
+       enum btree_id           btree_id:8;
        u8                      level;
        unsigned                trans_triggers_run:1;
        unsigned                is_extent:1;
@@ -610,6 +613,17 @@ static inline bool btree_iter_is_extents(struct btree_iter *iter)
        (BTREE_NODE_TYPE_HAS_TRANS_TRIGGERS|            \
         BTREE_NODE_TYPE_HAS_MEM_TRIGGERS)
 
+#define BTREE_ID_HAS_SNAPSHOTS                         \
+       ((1U << BTREE_ID_extents)|                      \
+        (1U << BTREE_ID_inodes)|                       \
+        (1U << BTREE_ID_dirents)|                      \
+        (1U << BTREE_ID_xattrs))
+
+static inline bool btree_type_has_snapshots(enum btree_id id)
+{
+       return (1 << id) & BTREE_ID_HAS_SNAPSHOTS;
+}
+
 enum btree_trigger_flags {
        __BTREE_TRIGGER_NORUN,          /* Don't run triggers at all */
 
index ddb0d03e268cf98177eb567c46266a1fa19146f7..aad2629376459aff06b45be094d50c0d10e79909 100644 (file)
@@ -69,7 +69,7 @@ static void btree_node_interior_verify(struct bch_fs *c, struct btree *b)
                        break;
                }
 
-               next_node = bkey_successor(k.k->p);
+               next_node = bpos_successor(k.k->p);
        }
 #endif
 }
@@ -289,7 +289,6 @@ static struct btree *bch2_btree_node_alloc(struct btree_update *as, unsigned lev
        b->data->flags = 0;
        SET_BTREE_NODE_ID(b->data, as->btree_id);
        SET_BTREE_NODE_LEVEL(b->data, level);
-       b->data->ptr = bch2_bkey_ptrs_c(bkey_i_to_s_c(&b->key)).start->ptr;
 
        if (b->key.k.type == KEY_TYPE_btree_ptr_v2) {
                struct bkey_i_btree_ptr_v2 *bp = bkey_i_to_btree_ptr_v2(&b->key);
@@ -1100,6 +1099,7 @@ static struct btree *__btree_split_node(struct btree_update *as,
        struct btree *n2;
        struct bset *set1, *set2;
        struct bkey_packed *k, *set2_start, *set2_end, *out, *prev = NULL;
+       struct bpos n1_pos;
 
        n2 = bch2_btree_node_alloc(as, n1->c.level);
        bch2_btree_update_add_new_node(as, n2);
@@ -1146,8 +1146,12 @@ static struct btree *__btree_split_node(struct btree_update *as,
        n1->nr.packed_keys      = nr_packed;
        n1->nr.unpacked_keys    = nr_unpacked;
 
-       btree_set_max(n1, bkey_unpack_pos(n1, prev));
-       btree_set_min(n2, bkey_successor(n1->key.k.p));
+       n1_pos = bkey_unpack_pos(n1, prev);
+       if (as->c->sb.version < bcachefs_metadata_version_snapshot)
+               n1_pos.snapshot = U32_MAX;
+
+       btree_set_max(n1, n1_pos);
+       btree_set_min(n2, bpos_successor(n1->key.k.p));
 
        bch2_bkey_format_init(&s);
        bch2_bkey_format_add_pos(&s, n2->data->min_key);
index a32c8f34039c0267b76f5eafd55ee7e2134f4c6e..88da89e8b170a76ad713f59a9f19557ecc7613be 100644 (file)
@@ -223,9 +223,17 @@ static inline void btree_insert_entry_checks(struct btree_trans *trans,
 {
        struct bch_fs *c = trans->c;
 
-       BUG_ON(bch2_debug_check_bkeys &&
-              bch2_bkey_invalid(c, bkey_i_to_s_c(i->k), i->bkey_type));
-       BUG_ON(bpos_cmp(i->k->k.p, i->iter->real_pos));
+       if (bch2_debug_check_bkeys) {
+               const char *invalid = bch2_bkey_invalid(c,
+                               bkey_i_to_s_c(i->k), i->bkey_type);
+               if (invalid) {
+                       char buf[200];
+
+                       bch2_bkey_val_to_text(&PBUF(buf), c, bkey_i_to_s_c(i->k));
+                       panic("invalid bkey %s on insert: %s\n", buf, invalid);
+               }
+       }
+       BUG_ON(!i->is_extent && bpos_cmp(i->k->k.p, i->iter->real_pos));
        BUG_ON(i->level         != i->iter->level);
        BUG_ON(i->btree_id      != i->iter->btree_id);
 }
index 059972e5a1247a56f6b5383a4ed6d8885c47ec43..111310344cec2a5c3708c15b8bcdebd6c4de5e7e 100644 (file)
@@ -222,7 +222,9 @@ static ssize_t bch2_read_btree(struct file *file, char __user *buf,
 
        bch2_trans_init(&trans, i->c, 0, 0);
 
-       iter = bch2_trans_get_iter(&trans, i->id, i->from, BTREE_ITER_PREFETCH);
+       iter = bch2_trans_get_iter(&trans, i->id, i->from,
+                                  BTREE_ITER_PREFETCH|
+                                  BTREE_ITER_ALL_SNAPSHOTS);
        k = bch2_btree_iter_peek(iter);
 
        while (k.k && !(err = bkey_err(k))) {
@@ -290,7 +292,7 @@ static ssize_t bch2_read_btree_formats(struct file *file, char __user *buf,
                 * all nodes, meh
                 */
                i->from = bpos_cmp(POS_MAX, b->key.k.p)
-                       ? bkey_successor(b->key.k.p)
+                       ? bpos_successor(b->key.k.p)
                        : b->key.k.p;
 
                if (!i->size)
index 7ac3d75876557089b6b8a5736414683f3ffb6645..1f28dea26ca25b196d1a68effd0207fa67f76e28 100644 (file)
@@ -179,7 +179,8 @@ const char *bch2_btree_ptr_v2_invalid(const struct bch_fs *c, struct bkey_s_c k)
        if (bkey_val_u64s(k.k) > BKEY_BTREE_PTR_VAL_U64s_MAX)
                return "value too big";
 
-       if (bp.v->min_key.snapshot)
+       if (c->sb.version < bcachefs_metadata_version_snapshot &&
+           bp.v->min_key.snapshot)
                return "invalid min_key.snapshot";
 
        return bch2_bkey_ptrs_invalid(c, k);
@@ -211,8 +212,8 @@ void bch2_btree_ptr_v2_compat(enum btree_id btree_id, unsigned version,
            btree_node_type_is_extents(btree_id) &&
            bkey_cmp(bp.v->min_key, POS_MIN))
                bp.v->min_key = write
-                       ? bkey_predecessor(bp.v->min_key)
-                       : bkey_successor(bp.v->min_key);
+                       ? bpos_nosnap_predecessor(bp.v->min_key)
+                       : bpos_nosnap_successor(bp.v->min_key);
 }
 
 /* KEY_TYPE_extent: */
index ffb30ef7ef006d2c951bd69719d85d32eca5c115..a3acae0ddfa905c3e0bde336c1edaa34d957ab39 100644 (file)
@@ -1318,6 +1318,7 @@ static int check_inode(struct btree_trans *trans,
                struct bkey_inode_buf p;
 
                bch2_inode_pack(c, &p, &u);
+               p.inode.k.p = iter->pos;
 
                ret = __bch2_trans_do(trans, NULL, NULL,
                                      BTREE_INSERT_NOFAIL|
index f676daf404a2d6b105b21028f688084fea02607e..7044ab73831cd18836fdfb9b132768d7ece4149b 100644 (file)
@@ -332,6 +332,7 @@ int bch2_inode_write(struct btree_trans *trans,
                return PTR_ERR(inode_p);
 
        bch2_inode_pack(trans->c, inode_p, inode);
+       inode_p->inode.k.p.snapshot = iter->snapshot;
        bch2_trans_update(trans, iter, &inode_p->inode.k_i, 0);
        return 0;
 }
index 5ee9a6c2f4fdfa4ccfedcb6b6e46ddd3411f66d3..9c46f67c0d8e6d5d38f0459f297fbfc0b575d834 100644 (file)
@@ -332,6 +332,9 @@ int bch2_extent_update(struct btree_trans *trans,
 
                if (i_sectors_delta || new_i_size) {
                        bch2_inode_pack(trans->c, &inode_p, &inode_u);
+
+                       inode_p.inode.k.p.snapshot = iter->snapshot;
+
                        bch2_trans_update(trans, inode_iter,
                                          &inode_p.inode.k_i, 0);
                }
@@ -447,6 +450,8 @@ int bch2_write_index_default(struct bch_write_op *op)
 
                k = bch2_keylist_front(keys);
 
+               k->k.p.snapshot = iter->snapshot;
+
                bch2_bkey_buf_realloc(&sk, c, k->k.u64s);
                bkey_copy(sk.k, k);
                bch2_cut_front(iter->pos, sk.k);
index 7783a874640a1e367985e662666042e0a547958a..4ab9cebee218f2cf584f737eb918b403df41b7b4 100644 (file)
@@ -1449,7 +1449,7 @@ void bch2_journal_write(struct closure *cl)
        if (bch2_csum_type_is_encryption(JSET_CSUM_TYPE(jset)))
                validate_before_checksum = true;
 
-       if (le32_to_cpu(jset->version) <= bcachefs_metadata_version_inode_btree_change)
+       if (le32_to_cpu(jset->version) < bcachefs_metadata_version_current)
                validate_before_checksum = true;
 
        if (validate_before_checksum &&
index 596f7c1e4245bec9b65120c5e86ac38686597047..a3a6abb88d6f7276cb4360cbe448e9db6bdac2a0 100644 (file)
@@ -998,6 +998,13 @@ int bch2_fs_recovery(struct bch_fs *c)
                goto err;
        }
 
+       if (!(c->sb.compat & (1ULL << BCH_COMPAT_bformat_overflow_done))) {
+               bch_err(c, "filesystem may have incompatible bkey formats; run fsck from the compat branch to fix");
+               ret = -EINVAL;
+               goto err;
+
+       }
+
        if (!(c->sb.features & (1ULL << BCH_FEATURE_alloc_v2))) {
                bch_info(c, "alloc_v2 feature bit not set, fsck required");
                c->opts.fsck = true;
@@ -1340,6 +1347,7 @@ int bch2_fs_initialize(struct bch_fs *c)
                        S_IFDIR|S_IRWXU|S_IRUGO|S_IXUGO, 0, NULL);
        root_inode.bi_inum = BCACHEFS_ROOT_INO;
        bch2_inode_pack(c, &packed_inode, &root_inode);
+       packed_inode.inode.k.p.snapshot = U32_MAX;
 
        err = "error creating root directory";
        ret = bch2_btree_insert(c, BTREE_ID_inodes,
index 286587a118fe47a8c9ab36cae97770e991657b7b..3de48c593963752daec79d185b206bde699cafa8 100644 (file)
@@ -483,6 +483,7 @@ static int rand_insert(struct bch_fs *c, u64 nr)
        for (i = 0; i < nr; i++) {
                bkey_cookie_init(&k.k_i);
                k.k.p.offset = test_rand();
+               k.k.p.snapshot = U32_MAX;
 
                ret = __bch2_trans_do(&trans, NULL, NULL, 0,
                        __bch2_btree_insert(&trans, BTREE_ID_xattrs, &k.k_i));