bcachefs: Add journal_seq to inode & alloc keys
author: Kent Overstreet <kent.overstreet@gmail.com>
Sat, 30 Oct 2021 01:14:23 +0000 (21:14 -0400)
committer: Kent Overstreet <kent.overstreet@linux.dev>
Sun, 22 Oct 2023 21:09:16 +0000 (17:09 -0400)
Add fields to inode & alloc keys that record the journal sequence number
at which they were most recently modified.

For alloc keys, this is needed to know what journal sequence number we
have to flush before the bucket can be reused. Currently this is tracked
in memory, but we'll be getting rid of the in-memory bucket array.

For inodes, this is needed for fsync when the inode has been evicted
from the vfs cache. Currently we use a bloom filter per outstanding
journal buf - but that mechanism has been broken since we added the
ability to not issue a flush/fua for every journal write.

Signed-off-by: Kent Overstreet <kent.overstreet@gmail.com>
14 files changed:
fs/bcachefs/alloc_background.c
fs/bcachefs/alloc_background.h
fs/bcachefs/bcachefs_format.h
fs/bcachefs/bkey_methods.c
fs/bcachefs/btree_types.h
fs/bcachefs/buckets.c
fs/bcachefs/fs.c
fs/bcachefs/fsck.c
fs/bcachefs/inode.c
fs/bcachefs/inode.h
fs/bcachefs/io.c
fs/bcachefs/move.c
fs/bcachefs/quota.c
fs/bcachefs/recovery.c

index 3b6af70fa186745bf24980dbc69b3b08677f99c7..10514476cffe2d6de1390b50a30935aab3552c64 100644 (file)
@@ -147,10 +147,44 @@ static int bch2_alloc_unpack_v2(struct bkey_alloc_unpacked *out,
        return 0;
 }
 
-static void bch2_alloc_pack_v2(struct bkey_alloc_buf *dst,
+static int bch2_alloc_unpack_v3(struct bkey_alloc_unpacked *out,
+                               struct bkey_s_c k)
+{
+       struct bkey_s_c_alloc_v3 a = bkey_s_c_to_alloc_v3(k);
+       const u8 *in = a.v->data;
+       const u8 *end = bkey_val_end(a);
+       unsigned fieldnr = 0;
+       int ret;
+       u64 v;
+
+       out->gen        = a.v->gen;
+       out->oldest_gen = a.v->oldest_gen;
+       out->data_type  = a.v->data_type;
+       out->journal_seq = le64_to_cpu(a.v->journal_seq);
+
+#define x(_name, _bits)                                                        \
+       if (fieldnr < a.v->nr_fields) {                                 \
+               ret = bch2_varint_decode_fast(in, end, &v);             \
+               if (ret < 0)                                            \
+                       return ret;                                     \
+               in += ret;                                              \
+       } else {                                                        \
+               v = 0;                                                  \
+       }                                                               \
+       out->_name = v;                                                 \
+       if (v != out->_name)                                            \
+               return -1;                                              \
+       fieldnr++;
+
+       BCH_ALLOC_FIELDS_V2()
+#undef  x
+       return 0;
+}
+
+static void bch2_alloc_pack_v3(struct bkey_alloc_buf *dst,
                               const struct bkey_alloc_unpacked src)
 {
-       struct bkey_i_alloc_v2 *a = bkey_alloc_v2_init(&dst->k);
+       struct bkey_i_alloc_v3 *a = bkey_alloc_v3_init(&dst->k);
        unsigned nr_fields = 0, last_nonzero_fieldnr = 0;
        u8 *out = a->v.data;
        u8 *end = (void *) &dst[1];
@@ -161,6 +195,7 @@ static void bch2_alloc_pack_v2(struct bkey_alloc_buf *dst,
        a->v.gen        = src.gen;
        a->v.oldest_gen = src.oldest_gen;
        a->v.data_type  = src.data_type;
+       a->v.journal_seq = cpu_to_le64(src.journal_seq);
 
 #define x(_name, _bits)                                                        \
        nr_fields++;                                                    \
@@ -194,10 +229,17 @@ struct bkey_alloc_unpacked bch2_alloc_unpack(struct bkey_s_c k)
                .gen    = 0,
        };
 
-       if (k.k->type == KEY_TYPE_alloc_v2)
-               bch2_alloc_unpack_v2(&ret, k);
-       else if (k.k->type == KEY_TYPE_alloc)
+       switch (k.k->type) {
+       case KEY_TYPE_alloc:
                bch2_alloc_unpack_v1(&ret, k);
+               break;
+       case KEY_TYPE_alloc_v2:
+               bch2_alloc_unpack_v2(&ret, k);
+               break;
+       case KEY_TYPE_alloc_v3:
+               bch2_alloc_unpack_v3(&ret, k);
+               break;
+       }
 
        return ret;
 }
@@ -206,7 +248,7 @@ void bch2_alloc_pack(struct bch_fs *c,
                     struct bkey_alloc_buf *dst,
                     const struct bkey_alloc_unpacked src)
 {
-       bch2_alloc_pack_v2(dst, src);
+       bch2_alloc_pack_v3(dst, src);
 }
 
 static unsigned bch_alloc_v1_val_u64s(const struct bch_alloc *a)
@@ -249,13 +291,28 @@ const char *bch2_alloc_v2_invalid(const struct bch_fs *c, struct bkey_s_c k)
        return NULL;
 }
 
+const char *bch2_alloc_v3_invalid(const struct bch_fs *c, struct bkey_s_c k)
+{
+       struct bkey_alloc_unpacked u;
+
+       if (k.k->p.inode >= c->sb.nr_devices ||
+           !c->devs[k.k->p.inode])
+               return "invalid device";
+
+       if (bch2_alloc_unpack_v3(&u, k))
+               return "unpack error";
+
+       return NULL;
+}
+
 void bch2_alloc_to_text(struct printbuf *out, struct bch_fs *c,
                           struct bkey_s_c k)
 {
        struct bkey_alloc_unpacked u = bch2_alloc_unpack(k);
 
-       pr_buf(out, "gen %u oldest_gen %u data_type %s",
-              u.gen, u.oldest_gen, bch2_data_types[u.data_type]);
+       pr_buf(out, "gen %u oldest_gen %u data_type %s journal_seq %llu",
+              u.gen, u.oldest_gen, bch2_data_types[u.data_type],
+              u.journal_seq);
 #define x(_name, ...)  pr_buf(out, " " #_name " %llu", (u64) u._name);
        BCH_ALLOC_FIELDS_V2()
 #undef  x
@@ -268,8 +325,7 @@ static int bch2_alloc_read_fn(struct btree_trans *trans, struct bkey_s_c k)
        struct bucket *g;
        struct bkey_alloc_unpacked u;
 
-       if (k.k->type != KEY_TYPE_alloc &&
-           k.k->type != KEY_TYPE_alloc_v2)
+       if (!bkey_is_alloc(k.k))
                return 0;
 
        ca = bch_dev_bkey_exists(c, k.k->p.inode);
index a4f6bf56b18f6eee5266852e994f28f6d5b5f738..370573f8e05d7f94585210402129dcd8e5db6d94 100644 (file)
@@ -9,6 +9,7 @@
 extern const char * const bch2_allocator_states[];
 
 struct bkey_alloc_unpacked {
+       u64             journal_seq;
        u64             bucket;
        u8              dev;
        u8              gen;
@@ -21,19 +22,11 @@ struct bkey_alloc_unpacked {
 
 struct bkey_alloc_buf {
        struct bkey_i   k;
+       struct bch_alloc_v3 v;
 
-       union {
-       struct {
 #define x(_name,  _bits)               + _bits / 8
-       u8              _pad[8 + BCH_ALLOC_FIELDS_V1()];
+       u8              _pad[0 + BCH_ALLOC_FIELDS_V2()];
 #undef  x
-       } _v1;
-       struct {
-#define x(_name,  _bits)               + 8 + _bits / 8
-       u8              _pad[8 + BCH_ALLOC_FIELDS_V2()];
-#undef  x
-       } _v2;
-       };
 } __attribute__((packed, aligned(8)));
 
 /* How out of date a pointer gen is allowed to be: */
@@ -79,6 +72,7 @@ alloc_mem_to_key(struct btree_iter *iter,
 
 const char *bch2_alloc_v1_invalid(const struct bch_fs *, struct bkey_s_c);
 const char *bch2_alloc_v2_invalid(const struct bch_fs *, struct bkey_s_c);
+const char *bch2_alloc_v3_invalid(const struct bch_fs *, struct bkey_s_c);
 void bch2_alloc_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);
 
 #define bch2_bkey_ops_alloc (struct bkey_ops) {                \
@@ -91,6 +85,18 @@ void bch2_alloc_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);
        .val_to_text    = bch2_alloc_to_text,           \
 }
 
+#define bch2_bkey_ops_alloc_v3 (struct bkey_ops) {     \
+       .key_invalid    = bch2_alloc_v3_invalid,        \
+       .val_to_text    = bch2_alloc_to_text,           \
+}
+
+static inline bool bkey_is_alloc(const struct bkey *k)
+{
+       return  k->type == KEY_TYPE_alloc ||
+               k->type == KEY_TYPE_alloc_v2 ||
+               k->type == KEY_TYPE_alloc_v3;
+}
+
 int bch2_alloc_read(struct bch_fs *);
 
 static inline void bch2_wake_allocator(struct bch_dev *ca)
index 8e1423b138a65ccf6c8e9015ec00eb86be32471f..21f1948ef8d029b10e1ae8a918b786f670c58fda 100644 (file)
@@ -348,7 +348,9 @@ static inline void bkey_init(struct bkey *k)
        x(indirect_inline_data, 19)                     \
        x(alloc_v2,             20)                     \
        x(subvolume,            21)                     \
-       x(snapshot,             22)
+       x(snapshot,             22)                     \
+       x(inode_v2,             23)                     \
+       x(alloc_v3,             24)
 
 enum bch_bkey_type {
 #define x(name, nr) KEY_TYPE_##name    = nr,
@@ -685,6 +687,16 @@ struct bch_inode {
        __u8                    fields[0];
 } __attribute__((packed, aligned(8)));
 
+struct bch_inode_v2 {
+       struct bch_val          v;
+
+       __le64                  bi_journal_seq;
+       __le64                  bi_hash_seed;
+       __le64                  bi_flags;
+       __le16                  bi_mode;
+       __u8                    fields[0];
+} __attribute__((packed, aligned(8)));
+
 struct bch_inode_generation {
        struct bch_val          v;
 
@@ -776,6 +788,9 @@ LE32_BITMASK(INODE_STR_HASH,        struct bch_inode, bi_flags, 20, 24);
 LE32_BITMASK(INODE_NR_FIELDS,  struct bch_inode, bi_flags, 24, 31);
 LE32_BITMASK(INODE_NEW_VARINT, struct bch_inode, bi_flags, 31, 32);
 
+LE64_BITMASK(INODEv2_STR_HASH, struct bch_inode_v2, bi_flags, 20, 24);
+LE64_BITMASK(INODEv2_NR_FIELDS,        struct bch_inode_v2, bi_flags, 24, 31);
+
 /* Dirents */
 
 /*
@@ -870,6 +885,17 @@ struct bch_alloc_v2 {
        x(stripe,               32)             \
        x(stripe_redundancy,    8)
 
+struct bch_alloc_v3 {
+       struct bch_val          v;
+       __le64                  journal_seq;
+       __le32                  flags;
+       __u8                    nr_fields;
+       __u8                    gen;
+       __u8                    oldest_gen;
+       __u8                    data_type;
+       __u8                    data[];
+} __attribute__((packed, aligned(8)));
+
 enum {
 #define x(name, _bits) BCH_ALLOC_FIELD_V1_##name,
        BCH_ALLOC_FIELDS_V1()
@@ -1276,7 +1302,8 @@ enum bcachefs_metadata_version {
        bcachefs_metadata_version_snapshot_2            = 15,
        bcachefs_metadata_version_reflink_p_fix         = 16,
        bcachefs_metadata_version_subvol_dirent         = 17,
-       bcachefs_metadata_version_max                   = 18,
+       bcachefs_metadata_version_inode_v2              = 18,
+       bcachefs_metadata_version_max                   = 19,
 };
 
 #define bcachefs_metadata_version_current      (bcachefs_metadata_version_max - 1)
index f7f4139072b5a3cbdabcf43dff85ea2f676de5af..c93004741b877a5afb359c7eeebd08d0e6b36e7b 100644 (file)
@@ -113,6 +113,7 @@ static unsigned bch2_key_types_allowed[] = {
                (1U << KEY_TYPE_deleted)|
                (1U << KEY_TYPE_whiteout)|
                (1U << KEY_TYPE_inode)|
+               (1U << KEY_TYPE_inode_v2)|
                (1U << KEY_TYPE_inode_generation),
        [BKEY_TYPE_dirents] =
                (1U << KEY_TYPE_deleted)|
@@ -128,7 +129,8 @@ static unsigned bch2_key_types_allowed[] = {
        [BKEY_TYPE_alloc] =
                (1U << KEY_TYPE_deleted)|
                (1U << KEY_TYPE_alloc)|
-               (1U << KEY_TYPE_alloc_v2),
+               (1U << KEY_TYPE_alloc_v2)|
+               (1U << KEY_TYPE_alloc_v3),
        [BKEY_TYPE_quotas] =
                (1U << KEY_TYPE_deleted)|
                (1U << KEY_TYPE_quota),
index d8c35ba9ec8936a62718962e8a9edb666374f47f..5331626e62a51fc839a9227ccda2b3dca9a9c3ca 100644 (file)
@@ -606,6 +606,7 @@ static inline bool btree_node_is_extents(struct btree *b)
 
 #define BTREE_NODE_TYPE_HAS_MEM_TRIGGERS               \
        ((1U << BKEY_TYPE_alloc)|                       \
+        (1U << BKEY_TYPE_inodes)|                      \
         (1U << BKEY_TYPE_stripes)|                     \
         (1U << BKEY_TYPE_snapshots))
 
@@ -655,8 +656,12 @@ enum btree_update_flags {
 #define BTREE_TRIGGER_NOATOMIC         (1U << __BTREE_TRIGGER_NOATOMIC)
 
 #define BTREE_TRIGGER_WANTS_OLD_AND_NEW                \
-       ((1U << KEY_TYPE_stripe)|               \
+       ((1U << KEY_TYPE_alloc)|                \
+        (1U << KEY_TYPE_alloc_v2)|             \
+        (1U << KEY_TYPE_alloc_v3)|             \
+        (1U << KEY_TYPE_stripe)|               \
         (1U << KEY_TYPE_inode)|                \
+        (1U << KEY_TYPE_inode_v2)|             \
         (1U << KEY_TYPE_snapshot))
 
 static inline bool btree_node_type_needs_gc(enum btree_node_type type)
index 6e1837a0fc64d4dc6e057ba466b9c33004eec85c..b51b1cf3ca25ed5518f7e7279e7633c0996acabf 100644 (file)
@@ -13,6 +13,7 @@
 #include "buckets.h"
 #include "ec.h"
 #include "error.h"
+#include "inode.h"
 #include "movinggc.h"
 #include "recovery.h"
 #include "reflink.h"
@@ -541,8 +542,7 @@ static int bch2_mark_alloc(struct btree_trans *trans,
        struct bucket_mark old_m, m;
 
        /* We don't do anything for deletions - do we?: */
-       if (new.k->type != KEY_TYPE_alloc &&
-           new.k->type != KEY_TYPE_alloc_v2)
+       if (!bkey_is_alloc(new.k))
                return 0;
 
        /*
@@ -552,6 +552,15 @@ static int bch2_mark_alloc(struct btree_trans *trans,
            !(flags & BTREE_TRIGGER_BUCKET_INVALIDATE))
                return 0;
 
+       if (flags & BTREE_TRIGGER_INSERT) {
+               struct bch_alloc_v3 *v = (struct bch_alloc_v3 *) new.v;
+
+               BUG_ON(!journal_seq);
+               BUG_ON(new.k->type != KEY_TYPE_alloc_v3);
+
+               v->journal_seq = cpu_to_le64(journal_seq);
+       }
+
        ca = bch_dev_bkey_exists(c, new.k->p.inode);
 
        if (new.k->p.offset >= ca->mi.nbuckets)
@@ -1095,12 +1104,24 @@ static int bch2_mark_inode(struct btree_trans *trans,
 {
        struct bch_fs *c = trans->c;
        struct bch_fs_usage __percpu *fs_usage;
+       u64 journal_seq = trans->journal_res.seq;
 
-       preempt_disable();
-       fs_usage = fs_usage_ptr(c, trans->journal_res.seq, flags & BTREE_TRIGGER_GC);
-       fs_usage->nr_inodes += new.k->type == KEY_TYPE_inode;
-       fs_usage->nr_inodes -= old.k->type == KEY_TYPE_inode;
-       preempt_enable();
+       if (flags & BTREE_TRIGGER_INSERT) {
+               struct bch_inode_v2 *v = (struct bch_inode_v2 *) new.v;
+
+               BUG_ON(!journal_seq);
+               BUG_ON(new.k->type != KEY_TYPE_inode_v2);
+
+               v->bi_journal_seq = cpu_to_le64(journal_seq);
+       }
+
+       if (flags & BTREE_TRIGGER_GC) {
+               preempt_disable();
+               fs_usage = fs_usage_ptr(c, journal_seq, flags & BTREE_TRIGGER_GC);
+               fs_usage->nr_inodes += bkey_is_inode(new.k);
+               fs_usage->nr_inodes -= bkey_is_inode(old.k);
+               preempt_enable();
+       }
        return 0;
 }
 
@@ -1219,6 +1240,7 @@ static int bch2_mark_key_locked(struct btree_trans *trans,
        switch (k.k->type) {
        case KEY_TYPE_alloc:
        case KEY_TYPE_alloc_v2:
+       case KEY_TYPE_alloc_v3:
                return bch2_mark_alloc(trans, old, new, flags);
        case KEY_TYPE_btree_ptr:
        case KEY_TYPE_btree_ptr_v2:
@@ -1228,6 +1250,7 @@ static int bch2_mark_key_locked(struct btree_trans *trans,
        case KEY_TYPE_stripe:
                return bch2_mark_stripe(trans, old, new, flags);
        case KEY_TYPE_inode:
+       case KEY_TYPE_inode_v2:
                return bch2_mark_inode(trans, old, new, flags);
        case KEY_TYPE_reservation:
                return bch2_mark_reservation(trans, old, new, flags);
@@ -1685,8 +1708,7 @@ static int bch2_trans_mark_inode(struct btree_trans *trans,
                                 struct bkey_s_c new,
                                 unsigned flags)
 {
-       int nr = (new.k->type == KEY_TYPE_inode) -
-               (old.k->type == KEY_TYPE_inode);
+       int nr = bkey_is_inode(new.k) - bkey_is_inode(old.k);
 
        if (nr) {
                struct replicas_delta_list *d =
@@ -1834,6 +1856,7 @@ int bch2_trans_mark_key(struct btree_trans *trans, struct bkey_s_c old,
        case KEY_TYPE_stripe:
                return bch2_trans_mark_stripe(trans, old, new, flags);
        case KEY_TYPE_inode:
+       case KEY_TYPE_inode_v2:
                return bch2_trans_mark_inode(trans, old, new, flags);
        case KEY_TYPE_reservation:
                return bch2_trans_mark_reservation(trans, k, flags);
index 7647e117013d20d165110e0f9a026ddb8b7dc458..64627543fe1760342d9362a47f588e999aa03d5b 100644 (file)
@@ -1207,7 +1207,7 @@ static void bch2_vfs_inode_init(struct bch_fs *c, subvol_inum inum,
        inode->v.i_size         = bi->bi_size;
 
        inode->ei_flags         = 0;
-       inode->ei_journal_seq   = 0;
+       inode->ei_journal_seq   = bi->bi_journal_seq;
        inode->ei_quota_reserved = 0;
        inode->ei_qid           = bch_qid(bi);
        inode->ei_subvol        = inum.subvol;
index 9519ced976f2eb8f62382de4188e8c9d53d8e7fe..361dbf33802334ade01ad965be5f07a71e22155f 100644 (file)
@@ -133,7 +133,7 @@ static int lookup_first_inode(struct btree_trans *trans, u64 inode_nr,
                goto err;
        }
 
-       ret = bch2_inode_unpack(bkey_s_c_to_inode(k), inode);
+       ret = bch2_inode_unpack(k, inode);
 err:
        if (ret && ret != -EINTR)
                bch_err(trans->c, "error %i fetching inode %llu",
@@ -157,8 +157,8 @@ static int __lookup_inode(struct btree_trans *trans, u64 inode_nr,
        if (ret)
                goto err;
 
-       ret = k.k->type == KEY_TYPE_inode
-               ? bch2_inode_unpack(bkey_s_c_to_inode(k), inode)
+       ret = bkey_is_inode(k.k)
+               ? bch2_inode_unpack(k, inode)
                : -ENOENT;
        if (!ret)
                *snapshot = iter.pos.snapshot;
@@ -261,7 +261,7 @@ retry:
        if (ret)
                goto err;
 
-       if (k.k->type != KEY_TYPE_inode) {
+       if (!bkey_is_inode(k.k)) {
                bch2_fs_inconsistent(trans->c,
                                     "inode %llu:%u not found when deleting",
                                     inum, snapshot);
@@ -269,7 +269,7 @@ retry:
                goto err;
        }
 
-       bch2_inode_unpack(bkey_s_c_to_inode(k), &inode_u);
+       bch2_inode_unpack(k, &inode_u);
 
        /* Subvolume root? */
        if (inode_u.bi_subvol) {
@@ -581,7 +581,7 @@ static int inode_walker_realloc(struct inode_walker *w)
 }
 
 static int add_inode(struct bch_fs *c, struct inode_walker *w,
-                    struct bkey_s_c_inode inode)
+                    struct bkey_s_c inode)
 {
        struct bch_inode_unpacked u;
        int ret;
@@ -623,8 +623,8 @@ static int __walk_inode(struct btree_trans *trans,
                if (k.k->p.offset != pos.inode)
                        break;
 
-               if (k.k->type == KEY_TYPE_inode)
-                       add_inode(c, w, bkey_s_c_to_inode(k));
+               if (bkey_is_inode(k.k))
+                       add_inode(c, w, k);
        }
        bch2_trans_iter_exit(trans, &iter);
 
@@ -676,11 +676,11 @@ static int __get_visible_inodes(struct btree_trans *trans,
                if (k.k->p.offset != inum)
                        break;
 
-               if (k.k->type != KEY_TYPE_inode)
+               if (!bkey_is_inode(k.k))
                        continue;
 
                if (ref_visible(c, s, s->pos.snapshot, k.k->p.snapshot)) {
-                       add_inode(c, w, bkey_s_c_to_inode(k));
+                       add_inode(c, w, k);
                        if (k.k->p.snapshot >= s->pos.snapshot)
                                break;
                }
@@ -805,7 +805,6 @@ static int check_inode(struct btree_trans *trans,
 {
        struct bch_fs *c = trans->c;
        struct bkey_s_c k;
-       struct bkey_s_c_inode inode;
        struct bch_inode_unpacked u;
        bool do_update = false;
        int ret;
@@ -830,19 +829,17 @@ static int check_inode(struct btree_trans *trans,
        if (bch2_snapshot_internal_node(c, k.k->p.snapshot))
                return 0;
 
-       if (k.k->type != KEY_TYPE_inode)
+       if (!bkey_is_inode(k.k))
                return 0;
 
-       inode = bkey_s_c_to_inode(k);
+       BUG_ON(bch2_inode_unpack(k, &u));
 
        if (!full &&
-           !(inode.v->bi_flags & (BCH_INODE_I_SIZE_DIRTY|
-                                  BCH_INODE_I_SECTORS_DIRTY|
-                                  BCH_INODE_UNLINKED)))
+           !(u.bi_flags & (BCH_INODE_I_SIZE_DIRTY|
+                           BCH_INODE_I_SECTORS_DIRTY|
+                           BCH_INODE_UNLINKED)))
                return 0;
 
-       BUG_ON(bch2_inode_unpack(inode, &u));
-
        if (prev->bi_inum != u.bi_inum)
                *prev = u;
 
@@ -1963,10 +1960,10 @@ static int check_directory_structure(struct bch_fs *c)
                           BTREE_ITER_INTENT|
                           BTREE_ITER_PREFETCH|
                           BTREE_ITER_ALL_SNAPSHOTS, k, ret) {
-               if (k.k->type != KEY_TYPE_inode)
+               if (!bkey_is_inode(k.k))
                        continue;
 
-               ret = bch2_inode_unpack(bkey_s_c_to_inode(k), &u);
+               ret = bch2_inode_unpack(k, &u);
                if (ret) {
                        /* Should have been caught earlier in fsck: */
                        bch_err(c, "error unpacking inode %llu: %i", k.k->p.offset, ret);
@@ -2070,7 +2067,6 @@ static int check_nlinks_find_hardlinks(struct bch_fs *c,
        struct btree_trans trans;
        struct btree_iter iter;
        struct bkey_s_c k;
-       struct bkey_s_c_inode inode;
        struct bch_inode_unpacked u;
        int ret = 0;
 
@@ -2081,21 +2077,19 @@ static int check_nlinks_find_hardlinks(struct bch_fs *c,
                           BTREE_ITER_INTENT|
                           BTREE_ITER_PREFETCH|
                           BTREE_ITER_ALL_SNAPSHOTS, k, ret) {
-               if (k.k->type != KEY_TYPE_inode)
+               if (!bkey_is_inode(k.k))
                        continue;
 
-               inode = bkey_s_c_to_inode(k);
+               /* Should never fail, checked by bch2_inode_invalid: */
+               BUG_ON(bch2_inode_unpack(k, &u));
 
                /*
                 * Backpointer and directory structure checks are sufficient for
                 * directories, since they can't have hardlinks:
                 */
-               if (S_ISDIR(le16_to_cpu(inode.v->bi_mode)))
+               if (S_ISDIR(le16_to_cpu(u.bi_mode)))
                        continue;
 
-               /* Should never fail, checked by bch2_inode_invalid: */
-               BUG_ON(bch2_inode_unpack(inode, &u));
-
                if (!u.bi_nlink)
                        continue;
 
@@ -2169,7 +2163,6 @@ static int check_nlinks_update_hardlinks(struct bch_fs *c,
        struct btree_trans trans;
        struct btree_iter iter;
        struct bkey_s_c k;
-       struct bkey_s_c_inode inode;
        struct bch_inode_unpacked u;
        struct nlink *link = links->d;
        int ret = 0;
@@ -2184,14 +2177,13 @@ static int check_nlinks_update_hardlinks(struct bch_fs *c,
                if (k.k->p.offset >= range_end)
                        break;
 
-               if (k.k->type != KEY_TYPE_inode)
+               if (!bkey_is_inode(k.k))
                        continue;
 
-               inode = bkey_s_c_to_inode(k);
-               if (S_ISDIR(le16_to_cpu(inode.v->bi_mode)))
-                       continue;
+               BUG_ON(bch2_inode_unpack(k, &u));
 
-               BUG_ON(bch2_inode_unpack(inode, &u));
+               if (S_ISDIR(le16_to_cpu(u.bi_mode)))
+                       continue;
 
                if (!u.bi_nlink)
                        continue;
index 3ae321a99cee9a54a4047e27d8d8d6e084f742ed..728545141a3927028db5adc41a53f969b959c294 100644 (file)
@@ -35,29 +35,6 @@ static const u8 bits_table[8] = {
        13 * 8 - 8,
 };
 
-static int inode_encode_field(u8 *out, u8 *end, u64 hi, u64 lo)
-{
-       __be64 in[2] = { cpu_to_be64(hi), cpu_to_be64(lo), };
-       unsigned shift, bytes, bits = likely(!hi)
-               ? fls64(lo)
-               : fls64(hi) + 64;
-
-       for (shift = 1; shift <= 8; shift++)
-               if (bits < bits_table[shift - 1])
-                       goto got_shift;
-
-       BUG();
-got_shift:
-       bytes = byte_table[shift - 1];
-
-       BUG_ON(out + bytes > end);
-
-       memcpy(out, (u8 *) in + 16 - bytes, bytes);
-       *out |= (1 << 8) >> shift;
-
-       return bytes;
-}
-
 static int inode_decode_field(const u8 *in, const u8 *end,
                              u64 out[2], unsigned *out_bits)
 {
@@ -92,42 +69,11 @@ static int inode_decode_field(const u8 *in, const u8 *end,
        return bytes;
 }
 
-static noinline void bch2_inode_pack_v1(struct bkey_inode_buf *packed,
-                                       const struct bch_inode_unpacked *inode)
-{
-       struct bkey_i_inode *k = &packed->inode;
-       u8 *out = k->v.fields;
-       u8 *end = (void *) &packed[1];
-       u8 *last_nonzero_field = out;
-       unsigned nr_fields = 0, last_nonzero_fieldnr = 0;
-       unsigned bytes;
-
-#define x(_name, _bits)                                                        \
-       out += inode_encode_field(out, end, 0, inode->_name);           \
-       nr_fields++;                                                    \
-                                                                       \
-       if (inode->_name) {                                             \
-               last_nonzero_field = out;                               \
-               last_nonzero_fieldnr = nr_fields;                       \
-       }
-
-       BCH_INODE_FIELDS()
-#undef  x
-
-       out = last_nonzero_field;
-       nr_fields = last_nonzero_fieldnr;
-
-       bytes = out - (u8 *) &packed->inode.v;
-       set_bkey_val_bytes(&packed->inode.k, bytes);
-       memset_u64s_tail(&packed->inode.v, 0, bytes);
-
-       SET_INODE_NR_FIELDS(&k->v, nr_fields);
-}
-
-static void bch2_inode_pack_v2(struct bkey_inode_buf *packed,
-                              const struct bch_inode_unpacked *inode)
+void bch2_inode_pack(struct bch_fs *c,
+                    struct bkey_inode_buf *packed,
+                    const struct bch_inode_unpacked *inode)
 {
-       struct bkey_i_inode *k = &packed->inode;
+       struct bkey_i_inode_v2 *k = &packed->inode;
        u8 *out = k->v.fields;
        u8 *end = (void *) &packed[1];
        u8 *last_nonzero_field = out;
@@ -135,6 +81,14 @@ static void bch2_inode_pack_v2(struct bkey_inode_buf *packed,
        unsigned bytes;
        int ret;
 
+       bkey_inode_v2_init(&packed->inode.k_i);
+       packed->inode.k.p.offset        = inode->bi_inum;
+       packed->inode.v.bi_journal_seq  = cpu_to_le64(inode->bi_journal_seq);
+       packed->inode.v.bi_hash_seed    = inode->bi_hash_seed;
+       packed->inode.v.bi_flags        = cpu_to_le64(inode->bi_flags);
+       packed->inode.v.bi_flags        = cpu_to_le64(inode->bi_flags);
+       packed->inode.v.bi_mode         = cpu_to_le16(inode->bi_mode);
+
 #define x(_name, _bits)                                                        \
        nr_fields++;                                                    \
                                                                        \
@@ -165,30 +119,12 @@ static void bch2_inode_pack_v2(struct bkey_inode_buf *packed,
        set_bkey_val_bytes(&packed->inode.k, bytes);
        memset_u64s_tail(&packed->inode.v, 0, bytes);
 
-       SET_INODE_NR_FIELDS(&k->v, nr_fields);
-}
-
-void bch2_inode_pack(struct bch_fs *c,
-                    struct bkey_inode_buf *packed,
-                    const struct bch_inode_unpacked *inode)
-{
-       bkey_inode_init(&packed->inode.k_i);
-       packed->inode.k.p.offset        = inode->bi_inum;
-       packed->inode.v.bi_hash_seed    = inode->bi_hash_seed;
-       packed->inode.v.bi_flags        = cpu_to_le32(inode->bi_flags);
-       packed->inode.v.bi_mode         = cpu_to_le16(inode->bi_mode);
-
-       if (c->sb.features & (1ULL << BCH_FEATURE_new_varint)) {
-               SET_INODE_NEW_VARINT(&packed->inode.v, true);
-               bch2_inode_pack_v2(packed, inode);
-       } else {
-               bch2_inode_pack_v1(packed, inode);
-       }
+       SET_INODEv2_NR_FIELDS(&k->v, nr_fields);
 
        if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG)) {
                struct bch_inode_unpacked unpacked;
 
-               int ret = bch2_inode_unpack(inode_i_to_s_c(&packed->inode),
+               int ret = bch2_inode_unpack(bkey_i_to_s_c(&packed->inode.k_i),
                                           &unpacked);
                BUG_ON(ret);
                BUG_ON(unpacked.bi_inum         != inode->bi_inum);
@@ -237,17 +173,16 @@ static noinline int bch2_inode_unpack_v1(struct bkey_s_c_inode inode,
        return 0;
 }
 
-static int bch2_inode_unpack_v2(struct bkey_s_c_inode inode,
-                               struct bch_inode_unpacked *unpacked)
+static int bch2_inode_unpack_v2(struct bch_inode_unpacked *unpacked,
+                               const u8 *in, const u8 *end,
+                               unsigned nr_fields)
 {
-       const u8 *in = inode.v->fields;
-       const u8 *end = bkey_val_end(inode);
        unsigned fieldnr = 0;
        int ret;
        u64 v[2];
 
 #define x(_name, _bits)                                                        \
-       if (fieldnr < INODE_NR_FIELDS(inode.v)) {                       \
+       if (fieldnr < nr_fields) {                                      \
                ret = bch2_varint_decode_fast(in, end, &v[0]);          \
                if (ret < 0)                                            \
                        return ret;                                     \
@@ -277,21 +212,43 @@ static int bch2_inode_unpack_v2(struct bkey_s_c_inode inode,
        return 0;
 }
 
-int bch2_inode_unpack(struct bkey_s_c_inode inode,
+int bch2_inode_unpack(struct bkey_s_c k,
                      struct bch_inode_unpacked *unpacked)
 {
-       unpacked->bi_inum       = inode.k->p.offset;
-       unpacked->bi_hash_seed  = inode.v->bi_hash_seed;
-       unpacked->bi_flags      = le32_to_cpu(inode.v->bi_flags);
-       unpacked->bi_mode       = le16_to_cpu(inode.v->bi_mode);
-
-       if (INODE_NEW_VARINT(inode.v)) {
-               return bch2_inode_unpack_v2(inode, unpacked);
-       } else {
-               return bch2_inode_unpack_v1(inode, unpacked);
+       switch (k.k->type) {
+       case KEY_TYPE_inode: {
+               struct bkey_s_c_inode inode = bkey_s_c_to_inode(k);
+
+               unpacked->bi_inum       = inode.k->p.offset;
+               unpacked->bi_hash_seed  = inode.v->bi_hash_seed;
+               unpacked->bi_flags      = le32_to_cpu(inode.v->bi_flags);
+               unpacked->bi_mode       = le16_to_cpu(inode.v->bi_mode);
+
+               if (INODE_NEW_VARINT(inode.v)) {
+                       return bch2_inode_unpack_v2(unpacked, inode.v->fields,
+                                                   bkey_val_end(inode),
+                                                   INODE_NR_FIELDS(inode.v));
+               } else {
+                       return bch2_inode_unpack_v1(inode, unpacked);
+               }
+               break;
+       }
+       case KEY_TYPE_inode_v2: {
+               struct bkey_s_c_inode_v2 inode = bkey_s_c_to_inode_v2(k);
+
+               unpacked->bi_inum       = inode.k->p.offset;
+               unpacked->bi_journal_seq= le64_to_cpu(inode.v->bi_journal_seq);
+               unpacked->bi_hash_seed  = inode.v->bi_hash_seed;
+               unpacked->bi_flags      = le64_to_cpu(inode.v->bi_flags);
+               unpacked->bi_mode       = le16_to_cpu(inode.v->bi_mode);
+
+               return bch2_inode_unpack_v2(unpacked, inode.v->fields,
+                                           bkey_val_end(inode),
+                                           INODEv2_NR_FIELDS(inode.v));
+       }
+       default:
+               BUG();
        }
-
-       return 0;
 }
 
 int bch2_inode_peek(struct btree_trans *trans,
@@ -317,11 +274,11 @@ int bch2_inode_peek(struct btree_trans *trans,
        if (ret)
                goto err;
 
-       ret = k.k->type == KEY_TYPE_inode ? 0 : -ENOENT;
+       ret = bkey_is_inode(k.k) ? 0 : -ENOENT;
        if (ret)
                goto err;
 
-       ret = bch2_inode_unpack(bkey_s_c_to_inode(k), inode);
+       ret = bch2_inode_unpack(k, inode);
        if (ret)
                goto err;
 
@@ -363,7 +320,43 @@ const char *bch2_inode_invalid(const struct bch_fs *c, struct bkey_s_c k)
        if (INODE_STR_HASH(inode.v) >= BCH_STR_HASH_NR)
                return "invalid str hash type";
 
-       if (bch2_inode_unpack(inode, &unpacked))
+       if (bch2_inode_unpack(k, &unpacked))
+               return "invalid variable length fields";
+
+       if (unpacked.bi_data_checksum >= BCH_CSUM_OPT_NR + 1)
+               return "invalid data checksum type";
+
+       if (unpacked.bi_compression >= BCH_COMPRESSION_OPT_NR + 1)
+               return "invalid compression type";
+
+       if ((unpacked.bi_flags & BCH_INODE_UNLINKED) &&
+           unpacked.bi_nlink != 0)
+               return "flagged as unlinked but bi_nlink != 0";
+
+       if (unpacked.bi_subvol && !S_ISDIR(unpacked.bi_mode))
+               return "subvolume root but not a directory";
+
+       return NULL; /* NULL: the key passed every check above */
+}
+
+const char *bch2_inode_v2_invalid(const struct bch_fs *c, struct bkey_s_c k)
+{
+       struct bkey_s_c_inode_v2 inode = bkey_s_c_to_inode_v2(k);
+       struct bch_inode_unpacked unpacked;
+
+       if (k.k->p.inode)
+               return "nonzero k.p.inode";
+
+       if (bkey_val_bytes(k.k) < sizeof(*inode.v)) /* fixed v2 fields must fit */
+               return "incorrect value size";
+
+       if (k.k->p.offset < BLOCKDEV_INODE_MAX)
+               return "fs inode in blockdev range";
+
+       if (INODEv2_STR_HASH(inode.v) >= BCH_STR_HASH_NR)
+               return "invalid str hash type";
+
+       if (bch2_inode_unpack(k, &unpacked))
                return "invalid variable length fields";
 
        if (unpacked.bi_data_checksum >= BCH_CSUM_OPT_NR + 1)
@@ -384,10 +377,12 @@ const char *bch2_inode_invalid(const struct bch_fs *c, struct bkey_s_c k)
 
 static void __bch2_inode_unpacked_to_text(struct printbuf *out, struct bch_inode_unpacked *inode)
 {
-       pr_buf(out, "mode %o flags %x ", inode->bi_mode, inode->bi_flags);
+       pr_buf(out, "mode %o flags %x journal_seq %llu",
+              inode->bi_mode, inode->bi_flags,
+              inode->bi_journal_seq);
 
 #define x(_name, _bits)                                                \
-       pr_buf(out, #_name " %llu ", (u64) inode->_name);
+       pr_buf(out, " "#_name " %llu", (u64) inode->_name);
        BCH_INODE_FIELDS()
 #undef  x
 }
@@ -401,15 +396,14 @@ void bch2_inode_unpacked_to_text(struct printbuf *out, struct bch_inode_unpacked
 void bch2_inode_to_text(struct printbuf *out, struct bch_fs *c,
                       struct bkey_s_c k)
 {
-       struct bkey_s_c_inode inode = bkey_s_c_to_inode(k);
-       struct bch_inode_unpacked unpacked;
+       struct bch_inode_unpacked inode;
 
-       if (bch2_inode_unpack(inode, &unpacked)) {
+       if (bch2_inode_unpack(k, &inode)) {
                pr_buf(out, "(unpack error)");
                return;
        }
 
-       __bch2_inode_unpacked_to_text(out, &unpacked);
+       __bch2_inode_unpacked_to_text(out, &inode);
 }
 
 const char *bch2_inode_generation_invalid(const struct bch_fs *c,
@@ -485,6 +479,7 @@ static inline u32 bkey_generation(struct bkey_s_c k)
 {
        switch (k.k->type) {
        case KEY_TYPE_inode:
+       case KEY_TYPE_inode_v2:
                BUG();
        case KEY_TYPE_inode_generation:
                return le32_to_cpu(bkey_s_c_to_inode_generation(k).v->bi_generation);
@@ -542,7 +537,7 @@ again:
                }
 
                if (k.k->p.snapshot == snapshot &&
-                   k.k->type != KEY_TYPE_inode &&
+                   !bkey_is_inode(k.k) &&
                    !bch2_btree_key_cache_find(c, BTREE_ID_inodes, SPOS(0, pos, snapshot))) {
                        bch2_btree_iter_advance(iter);
                        continue;
@@ -585,7 +580,7 @@ found_slot:
        }
 
        /* We may have raced while the iterator wasn't pointing at pos: */
-       if (k.k->type == KEY_TYPE_inode ||
+       if (bkey_is_inode(k.k) ||
            bch2_btree_key_cache_find(c, BTREE_ID_inodes, k.k->p))
                goto again;
 
@@ -698,7 +693,7 @@ retry:
        if (ret)
                goto err;
 
-       if (k.k->type != KEY_TYPE_inode) {
+       if (!bkey_is_inode(k.k)) {
                bch2_fs_inconsistent(trans.c,
                                     "inode %llu not found when deleting",
                                     inum.inum);
@@ -706,7 +701,7 @@ retry:
                goto err;
        }
 
-       bch2_inode_unpack(bkey_s_c_to_inode(k), &inode_u);
+       bch2_inode_unpack(k, &inode_u);
 
        /* Subvolume root? */
        BUG_ON(inode_u.bi_subvol);
index 009b807cc1678fef2e7dd178520176a4027074ba..d433d48de4e0d69d45f58c364b9c8cbcde7a976f 100644 (file)
@@ -7,6 +7,7 @@
 extern const char * const bch2_inode_opts[];
 
 const char *bch2_inode_invalid(const struct bch_fs *, struct bkey_s_c);
+const char *bch2_inode_v2_invalid(const struct bch_fs *, struct bkey_s_c);
 void bch2_inode_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);
 
 #define bch2_bkey_ops_inode (struct bkey_ops) {                \
@@ -14,6 +15,17 @@ void bch2_inode_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);
        .val_to_text    = bch2_inode_to_text,           \
 }
 
+#define bch2_bkey_ops_inode_v2 (struct bkey_ops) {     \
+       .key_invalid    = bch2_inode_v2_invalid,        \
+       .val_to_text    = bch2_inode_to_text,           \
+}
+
+static inline bool bkey_is_inode(const struct bkey *k)
+{
+       /* Either inode key type (KEY_TYPE_inode or KEY_TYPE_inode_v2) counts. */
+       return k->type == KEY_TYPE_inode || k->type == KEY_TYPE_inode_v2;
+}
+
 const char *bch2_inode_generation_invalid(const struct bch_fs *,
                                          struct bkey_s_c);
 void bch2_inode_generation_to_text(struct printbuf *, struct bch_fs *,
@@ -34,6 +46,7 @@ typedef u64 u96;
 
 struct bch_inode_unpacked {
        u64                     bi_inum;
+       u64                     bi_journal_seq;
        __le64                  bi_hash_seed;
        u32                     bi_flags;
        u16                     bi_mode;
@@ -44,7 +57,7 @@ struct bch_inode_unpacked {
 };
 
 struct bkey_inode_buf {
-       struct bkey_i_inode     inode;
+       struct bkey_i_inode_v2  inode;
 
 #define x(_name, _bits)                + 8 + _bits / 8
        u8              _pad[0 + BCH_INODE_FIELDS()];
@@ -53,7 +66,7 @@ struct bkey_inode_buf {
 
 void bch2_inode_pack(struct bch_fs *, struct bkey_inode_buf *,
                     const struct bch_inode_unpacked *);
-int bch2_inode_unpack(struct bkey_s_c_inode, struct bch_inode_unpacked *);
+int bch2_inode_unpack(struct bkey_s_c, struct bch_inode_unpacked *);
 
 void bch2_inode_unpacked_to_text(struct printbuf *, struct bch_inode_unpacked *);
 
index ca4e7a5a64b9d040b1c9b7339c6c2785215634d1..0a9cb4d489f417e643e489efaf99b86bf2e7c4cf 100644 (file)
@@ -337,12 +337,12 @@ int bch2_extent_update(struct btree_trans *trans,
        if (ret)
                goto err;
 
-       ret = inode.k->type == KEY_TYPE_inode ? 0 : -ENOENT;
+       ret = bkey_is_inode(inode.k) ? 0 : -ENOENT;
        if (ret)
                goto err;
 
        if (i_sectors_delta || new_i_size) {
-               ret = bch2_inode_unpack(bkey_s_c_to_inode(inode), &inode_u);
+               ret = bch2_inode_unpack(inode, &inode_u);
                if (ret)
                        goto err;
 
index 2f260360b08987eab26b51d90ca748bfae1c5415..249d0b2be167767368b91d5bf9c99ba4a9491034 100644 (file)
@@ -623,11 +623,11 @@ static int lookup_inode(struct btree_trans *trans, struct bpos pos,
                goto err;
        }
 
-       ret = k.k->type == KEY_TYPE_inode ? 0 : -EIO;
+       ret = bkey_is_inode(k.k) ? 0 : -EIO;
        if (ret)
                goto err;
 
-       ret = bch2_inode_unpack(bkey_s_c_to_inode(k), inode);
+       ret = bch2_inode_unpack(k, inode);
        if (ret)
                goto err;
 err:
index 17fd5bf107bbdddfda284ce1b7a13d2d3526bbec..5f1216da76d05ef03014447403128dca6ec0e641 100644 (file)
@@ -439,9 +439,8 @@ int bch2_fs_quota_read(struct bch_fs *c)
 
        for_each_btree_key(&trans, iter, BTREE_ID_inodes, POS_MIN,
                           BTREE_ITER_PREFETCH, k, ret) {
-               switch (k.k->type) {
-               case KEY_TYPE_inode:
-                       ret = bch2_inode_unpack(bkey_s_c_to_inode(k), &u);
+               if (bkey_is_inode(k.k)) {
+                       ret = bch2_inode_unpack(k, &u);
                        if (ret)
                                return ret;
 
index 29fae6dbce765a6e32756f73634afb947a726944..d8e511a0664e4a77aeb0898922ee289139f6d893 100644 (file)
@@ -1015,13 +1015,13 @@ static int bch2_fs_upgrade_for_subvolumes(struct btree_trans *trans)
        if (ret)
                goto err;
 
-       if (k.k->type != KEY_TYPE_inode) {
+       if (!bkey_is_inode(k.k)) {
                bch_err(c, "root inode not found");
                ret = -ENOENT;
                goto err;
        }
 
-       ret = bch2_inode_unpack(bkey_s_c_to_inode(k), &inode);
+       ret = bch2_inode_unpack(k, &inode);
        BUG_ON(ret);
 
        inode.bi_subvol = BCACHEFS_ROOT_SUBVOL;
@@ -1093,6 +1093,9 @@ int bch2_fs_recovery(struct bch_fs *c)
                        bch_info(c, "filesystem version is prior to subvol_dirent - upgrading");
                        c->opts.version_upgrade = true;
                        c->opts.fsck            = true;
+               } else if (c->sb.version < bcachefs_metadata_version_inode_v2) {
+                       bch_info(c, "filesystem version is prior to inode_v2 - upgrading");
+                       c->opts.version_upgrade = true;
                }
        }