bcachefs: Inline data extents
author	Kent Overstreet <kent.overstreet@gmail.com>
Sat, 9 Nov 2019 21:43:16 +0000 (16:43 -0500)
committer	Kent Overstreet <kent.overstreet@linux.dev>
Sun, 22 Oct 2023 21:08:32 +0000 (17:08 -0400)
This implements extents that store their data inline, in the bkey
value, instead of the value holding pointers to the data. The read and
write paths are updated to read from this new extent type, and to
write it out when the write size is small enough.
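
As a rough illustration (a standalone sketch with simplified, assumed
names, not bcachefs code), a small write ends up as a key whose value
is the data itself, and a read simply copies out of the value and
zero-fills the remainder of the request, much like the
KEY_TYPE_inline_data path added to __bch2_read_extent():

/*
 * Standalone sketch, not bcachefs code: a value that carries the
 * extent's data inline, plus a read that copies from the value and
 * zero-fills the rest of the request.  Field names and sizes here are
 * simplified assumptions; the real key type is struct bch_inline_data.
 */
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

struct inline_val {
	uint32_t	size;		/* extent size, in 512-byte sectors */
	uint32_t	val_bytes;	/* bytes of data[] actually present */
	uint8_t		data[];		/* the inline payload */
};

/* Read req_bytes of the extent: copy what's inline, zero-fill the rest. */
static void read_inline(const struct inline_val *v, void *buf, size_t req_bytes)
{
	size_t copy = req_bytes < v->val_bytes ? req_bytes : v->val_bytes;

	memcpy(buf, v->data, copy);
	memset((uint8_t *) buf + copy, 0, req_bytes - copy);
}

int main(void)
{
	static const char msg[] = "hello, inline extent";
	struct inline_val *v = malloc(sizeof(*v) + sizeof(msg));
	char out[512];

	v->size		= 1;			/* one sector of logical space */
	v->val_bytes	= sizeof(msg);
	memcpy(v->data, msg, sizeof(msg));

	read_inline(v, out, sizeof(out));	/* tail comes back zeroed */
	printf("%s\n", out);

	free(v);
	return 0;
}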

Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
fs/bcachefs/bcachefs_format.h
fs/bcachefs/bkey.h
fs/bcachefs/bkey_methods.c
fs/bcachefs/extents.c
fs/bcachefs/extents.h
fs/bcachefs/fs-io.c
fs/bcachefs/io.c
fs/bcachefs/io.h
fs/bcachefs/recovery.c

diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h
index 6ba83058384620315c684ac40651157f9cf4b9c2..e3004593874c029b096924830a446abd056c486e 100644
@@ -342,7 +342,8 @@ static inline void bkey_init(struct bkey *k)
        x(quota,                13)                     \
        x(stripe,               14)                     \
        x(reflink_p,            15)                     \
-       x(reflink_v,            16)
+       x(reflink_v,            16)                     \
+       x(inline_data,          17)
 
 enum bch_bkey_type {
 #define x(name, nr) KEY_TYPE_##name    = nr,
@@ -915,6 +916,13 @@ struct bch_reflink_v {
        __u64                   _data[0];
 };
 
+/* Inline data */
+
+struct bch_inline_data {
+       struct bch_val          v;
+       u8                      data[0];
+};
+
 /* Optional/variable size superblock sections: */
 
 struct bch_sb_field {
@@ -1319,6 +1327,7 @@ enum bch_sb_features {
        BCH_FEATURE_JOURNAL_SEQ_BLACKLIST_V3 = 5,
        BCH_FEATURE_REFLINK             = 6,
        BCH_FEATURE_NEW_SIPHASH         = 7,
+       BCH_FEATURE_INLINE_DATA         = 8,
        BCH_FEATURE_NR,
 };
 
diff --git a/fs/bcachefs/bkey.h b/fs/bcachefs/bkey.h
index ba4d6329e37a4d9c7569b7b546d295de31d35dea..36e6ecc045146ed249096c0baf5f663d94d08160 100644
@@ -572,6 +572,7 @@ BKEY_VAL_ACCESSORS(quota);
 BKEY_VAL_ACCESSORS(stripe);
 BKEY_VAL_ACCESSORS(reflink_p);
 BKEY_VAL_ACCESSORS(reflink_v);
+BKEY_VAL_ACCESSORS(inline_data);
 
 /* byte order helpers */
 
diff --git a/fs/bcachefs/bkey_methods.c b/fs/bcachefs/bkey_methods.c
index f01405dd502bb64612f44303a0ca5daad58810f5..5312184c37f71105327af13bd9036c16c0f6f1f2 100644
@@ -63,6 +63,23 @@ static const char *key_type_cookie_invalid(const struct bch_fs *c,
        .key_invalid = empty_val_key_invalid,           \
 }
 
+static const char *key_type_inline_data_invalid(const struct bch_fs *c,
+                                          struct bkey_s_c k)
+{
+       return NULL;
+}
+
+static void key_type_inline_data_to_text(struct printbuf *out, struct bch_fs *c,
+                                        struct bkey_s_c k)
+{
+       pr_buf(out, "(%zu bytes)", bkey_val_bytes(k.k));
+}
+
+static const struct bkey_ops bch2_bkey_ops_inline_data = {
+       .key_invalid    = key_type_inline_data_invalid,
+       .val_to_text    = key_type_inline_data_to_text,
+};
+
 static const struct bkey_ops bch2_bkey_ops[] = {
 #define x(name, nr) [KEY_TYPE_##name]  = bch2_bkey_ops_##name,
        BCH_BKEY_TYPES()
@@ -83,9 +100,8 @@ const char *__bch2_bkey_invalid(struct bch_fs *c, struct bkey_s_c k,
        if (k.k->u64s < BKEY_U64s)
                return "u64s too small";
 
-       if ((btree_node_type_is_extents(type) ||
-            type == BKEY_TYPE_BTREE) &&
-           bkey_val_u64s(k.k) > BKEY_EXTENT_VAL_U64s_MAX)
+       if (type == BKEY_TYPE_BTREE &&
+           bkey_val_u64s(k.k) > BKEY_BTREE_PTR_VAL_U64s_MAX)
                return "value too big";
 
        if (btree_node_type_is_extents(type)) {
diff --git a/fs/bcachefs/extents.c b/fs/bcachefs/extents.c
index 8f511760102a4a685c31c8258e4d680c9d6b4c0f..0e25fbe65b9547eaad258a840f571225e82161e7 100644
@@ -737,11 +737,6 @@ int bch2_cut_front_s(struct bpos where, struct bkey_s k)
        }
 
        switch (k.k->type) {
-       case KEY_TYPE_deleted:
-       case KEY_TYPE_discard:
-       case KEY_TYPE_error:
-       case KEY_TYPE_cookie:
-               break;
        case KEY_TYPE_extent:
        case KEY_TYPE_reflink_v: {
                struct bkey_ptrs ptrs = bch2_bkey_ptrs(k);
@@ -779,10 +774,18 @@ int bch2_cut_front_s(struct bpos where, struct bkey_s k)
                le64_add_cpu(&p.v->idx, sub);
                break;
        }
-       case KEY_TYPE_reservation:
+       case KEY_TYPE_inline_data: {
+               struct bkey_s_inline_data d = bkey_s_to_inline_data(k);
+
+               sub = min_t(u64, sub << 9, bkey_val_bytes(d.k));
+
+               memmove(d.v->data,
+                       d.v->data + sub,
+                       bkey_val_bytes(d.k) - sub);
+
+               new_val_u64s -= sub >> 3;
                break;
-       default:
-               BUG();
+       }
        }
 
        val_u64s_delta = bkey_val_u64s(k.k) - new_val_u64s;
@@ -814,6 +817,12 @@ int bch2_cut_back_s(struct bpos where, struct bkey_s k)
                new_val_u64s = 0;
        }
 
+       switch (k.k->type) {
+       case KEY_TYPE_inline_data:
+               new_val_u64s = min(new_val_u64s, k.k->size << 6);
+               break;
+       }
+
        val_u64s_delta = bkey_val_u64s(k.k) - new_val_u64s;
        BUG_ON(val_u64s_delta < 0);
 
diff --git a/fs/bcachefs/extents.h b/fs/bcachefs/extents.h
index e360e19898125720b8bb4012a7384566214b8a06..35a66d4f4ea2e1dfca766c46416716e35953c46e 100644
@@ -456,6 +456,7 @@ static inline bool bkey_extent_is_direct_data(const struct bkey *k)
 static inline bool bkey_extent_is_data(const struct bkey *k)
 {
        return bkey_extent_is_direct_data(k) ||
+               k->type == KEY_TYPE_inline_data ||
                k->type == KEY_TYPE_reflink_p;
 }
 
@@ -469,6 +470,7 @@ static inline bool bkey_extent_is_allocation(const struct bkey *k)
        case KEY_TYPE_reservation:
        case KEY_TYPE_reflink_p:
        case KEY_TYPE_reflink_v:
+       case KEY_TYPE_inline_data:
                return true;
        default:
                return false;
diff --git a/fs/bcachefs/fs-io.c b/fs/bcachefs/fs-io.c
index fab952856e3650fd018a458b388a344b98b4f13a..7abe53be7dd39c0f1a3af13305dfe7e58b8d234b 100644
@@ -990,6 +990,18 @@ static void bch2_writepage_io_done(struct closure *cl)
                }
        }
 
+       if (io->op.flags & BCH_WRITE_WROTE_DATA_INLINE) {
+               bio_for_each_segment_all(bvec, bio, iter) {
+                       struct bch_page_state *s;
+
+                       s = __bch2_page_state(bvec->bv_page);
+                       spin_lock(&s->lock);
+                       for (i = 0; i < PAGE_SECTORS; i++)
+                               s->s[i].nr_replicas = 0;
+                       spin_unlock(&s->lock);
+               }
+       }
+
        /*
         * racing with fallocate can cause us to add fewer sectors than
         * expected - but we shouldn't add more sectors than expected:
diff --git a/fs/bcachefs/io.c b/fs/bcachefs/io.c
index ef953499e66ca12479d844983facfe2fe2441a60..8f558347ca7f5955b18573374b2e2aac7991c904 100644
@@ -539,16 +539,19 @@ static void __bch2_write_index(struct bch_write_op *op)
 
        for (src = keys->keys; src != keys->top; src = n) {
                n = bkey_next(src);
-               bkey_copy(dst, src);
 
-               bch2_bkey_drop_ptrs(bkey_i_to_s(dst), ptr,
-                       test_bit(ptr->dev, op->failed.d));
+               if (bkey_extent_is_direct_data(&src->k)) {
+                       bch2_bkey_drop_ptrs(bkey_i_to_s(src), ptr,
+                                           test_bit(ptr->dev, op->failed.d));
 
-               if (!bch2_bkey_nr_ptrs(bkey_i_to_s_c(dst))) {
-                       ret = -EIO;
-                       goto err;
+                       if (!bch2_bkey_nr_ptrs(bkey_i_to_s_c(src))) {
+                               ret = -EIO;
+                               goto err;
+                       }
                }
 
+               if (dst != src)
+                       memmove_u64s_down(dst, src, src->u64s);
                dst = bkey_next(dst);
        }
 
@@ -1092,7 +1095,7 @@ again:
 
                bio->bi_end_io  = bch2_write_endio;
                bio->bi_private = &op->cl;
-               bio->bi_opf     = REQ_OP_WRITE;
+               bio->bi_opf |= REQ_OP_WRITE;
 
                if (!skip_put)
                        closure_get(bio->bi_private);
@@ -1129,6 +1132,47 @@ flush_io:
        goto again;
 }
 
+static void bch2_write_data_inline(struct bch_write_op *op, unsigned data_len)
+{
+       struct closure *cl = &op->cl;
+       struct bio *bio = &op->wbio.bio;
+       struct bvec_iter iter;
+       struct bkey_i_inline_data *id;
+       unsigned sectors;
+       int ret;
+
+       ret = bch2_keylist_realloc(&op->insert_keys, op->inline_keys,
+                                  ARRAY_SIZE(op->inline_keys),
+                                  BKEY_U64s + DIV_ROUND_UP(data_len, 8));
+       if (ret) {
+               op->error = ret;
+               goto err;
+       }
+
+       sectors = bio_sectors(bio);
+       op->pos.offset += sectors;
+
+       id = bkey_inline_data_init(op->insert_keys.top);
+       id->k.p         = op->pos;
+       id->k.version   = op->version;
+       id->k.size      = sectors;
+
+       iter = bio->bi_iter;
+       iter.bi_size = data_len;
+       memcpy_from_bio(id->v.data, bio, iter);
+
+       while (data_len & 7)
+               id->v.data[data_len++] = '\0';
+       set_bkey_val_bytes(&id->k, data_len);
+       bch2_keylist_push(&op->insert_keys);
+
+       op->flags |= BCH_WRITE_WROTE_DATA_INLINE;
+       continue_at_nobarrier(cl, bch2_write_index, NULL);
+       return;
+err:
+       bch2_write_done(&op->cl);
+}
+
 /**
  * bch_write - handle a write to a cache device or flash only volume
  *
@@ -1150,22 +1194,22 @@ void bch2_write(struct closure *cl)
        struct bch_write_op *op = container_of(cl, struct bch_write_op, cl);
        struct bio *bio = &op->wbio.bio;
        struct bch_fs *c = op->c;
+       unsigned data_len;
 
        BUG_ON(!op->nr_replicas);
        BUG_ON(!op->write_point.v);
        BUG_ON(!bkey_cmp(op->pos, POS_MAX));
 
+       op->start_time = local_clock();
+       bch2_keylist_init(&op->insert_keys, op->inline_keys);
+       wbio_init(bio)->put_bio = false;
+
        if (bio_sectors(bio) & (c->opts.block_size - 1)) {
                __bcache_io_error(c, "misaligned write");
                op->error = -EIO;
                goto err;
        }
 
-       op->start_time = local_clock();
-
-       bch2_keylist_init(&op->insert_keys, op->inline_keys);
-       wbio_init(bio)->put_bio = false;
-
        if (c->opts.nochanges ||
            !percpu_ref_tryget(&c->writes)) {
                __bcache_io_error(c, "read only");
@@ -1175,6 +1219,14 @@ void bch2_write(struct closure *cl)
 
        bch2_increment_clock(c, bio_sectors(bio), WRITE);
 
+       data_len = min_t(u64, bio->bi_iter.bi_size,
+                        op->new_i_size - (op->pos.offset << 9));
+
+       if (data_len <= min(block_bytes(c) / 2, 1024U)) {
+               bch2_write_data_inline(op, data_len);
+               return;
+       }
+
        continue_at_nobarrier(cl, __bch2_write, NULL);
        return;
 err:
@@ -1892,6 +1944,19 @@ int __bch2_read_extent(struct bch_fs *c, struct bch_read_bio *orig,
        struct bpos pos = bkey_start_pos(k.k);
        int pick_ret;
 
+       if (k.k->type == KEY_TYPE_inline_data) {
+               struct bkey_s_c_inline_data d = bkey_s_c_to_inline_data(k);
+               unsigned bytes = min_t(unsigned, iter.bi_size,
+                                      bkey_val_bytes(d.k));
+
+               swap(iter.bi_size, bytes);
+               memcpy_to_bio(&orig->bio, iter, d.v->data);
+               swap(iter.bi_size, bytes);
+               bio_advance_iter(&orig->bio, &iter, bytes);
+               zero_fill_bio_iter(&orig->bio, iter);
+               goto out_read_done;
+       }
+
        pick_ret = bch2_bkey_pick_read_device(c, k, failed, &pick);
 
        /* hole or reservation - just zero fill: */
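
For reference, a standalone sketch of the size accounting used above
(not bcachefs code; the block size and write size are made-up example
values): a write is only inlined when its dirty payload is at most
min(block_bytes / 2, 1024) bytes, and the value is zero-padded up to a
multiple of 8 bytes so it occupies whole u64s in the bkey:

/*
 * Standalone sketch, not bcachefs code, of the size accounting in
 * bch2_write()/bch2_write_data_inline(): inline only small writes,
 * and round the value size up to whole u64s.
 */
#include <stdio.h>

#define DIV_ROUND_UP(n, d)	(((n) + (d) - 1) / (d))

static unsigned inline_threshold(unsigned block_bytes)
{
	unsigned half = block_bytes / 2;

	return half < 1024 ? half : 1024;
}

int main(void)
{
	unsigned block_bytes = 4096;	/* assumed filesystem block size */
	unsigned data_len = 300;	/* bytes of dirty data in the write */

	if (data_len <= inline_threshold(block_bytes)) {
		unsigned val_u64s = DIV_ROUND_UP(data_len, 8);

		printf("inline: %u data bytes -> %u u64s (%u padded bytes)\n",
		       data_len, val_u64s, val_u64s * 8);
	} else {
		printf("too big: fall back to a normal extent write\n");
	}
	return 0;
}
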
diff --git a/fs/bcachefs/io.h b/fs/bcachefs/io.h
index 81fc549a0c97ebb2547bf0fb6f3601c44accfed4..fa5841a86fcbd87f2e57127d75891d639ee764e5 100644
@@ -34,10 +34,11 @@ enum bch_write_flags {
        BCH_WRITE_PAGES_OWNED           = (1 << 5),
        BCH_WRITE_ONLY_SPECIFIED_DEVS   = (1 << 6),
        BCH_WRITE_NOPUT_RESERVATION     = (1 << 7),
+       BCH_WRITE_WROTE_DATA_INLINE     = (1 << 8),
 
        /* Internal: */
-       BCH_WRITE_JOURNAL_SEQ_PTR       = (1 << 8),
-       BCH_WRITE_SKIP_CLOSURE_PUT      = (1 << 9),
+       BCH_WRITE_JOURNAL_SEQ_PTR       = (1 << 9),
+       BCH_WRITE_SKIP_CLOSURE_PUT      = (1 << 10),
 };
 
 static inline u64 *op_journal_seq(struct bch_write_op *op)
diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c
index 2efe023b2f0d386b44403c5c8b7d2b247dbc0d56..9102a1ce1ec4c70065a98bde47c90f86d58d66ea 100644
@@ -913,6 +913,12 @@ int bch2_fs_recovery(struct bch_fs *c)
                write_sb = true;
        }
 
+       if (!(c->sb.features & (1ULL << BCH_FEATURE_INLINE_DATA))) {
+               c->disk_sb.sb->features[0] |=
+                       cpu_to_le64(1ULL << BCH_FEATURE_INLINE_DATA);
+               write_sb = true;
+       }
+
        if (!test_bit(BCH_FS_ERROR, &c->flags)) {
                c->disk_sb.sb->compat[0] |= 1ULL << BCH_COMPAT_FEAT_ALLOC_INFO;
                write_sb = true;