bcachefs: Rework lru btree
author Kent Overstreet <kent.overstreet@linux.dev>
Mon, 5 Dec 2022 21:49:13 +0000 (16:49 -0500)
committer Kent Overstreet <kent.overstreet@linux.dev>
Sun, 22 Oct 2023 21:09:52 +0000 (17:09 -0400)
This patch changes how the LRU index works:

Instead of using KEY_TYPE_lru, where the bucket the lru entry points to
is part of the value, this switches to KEY_TYPE_set and encodes the
bucket we refer to in the low bits of the key.

This means that we no longer have to check for collisions when inserting
LRU entries. We'll be making use of this in the next patch, which adds
a btree write buffer - a pure write buffer for btree updates, where
updates are appended to a simple array and then periodically sorted and
batch inserted.

This is a new on disk format version, and a forced upgrade.

Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
fs/bcachefs/alloc_background.c
fs/bcachefs/bcachefs_format.h
fs/bcachefs/bkey_methods.c
fs/bcachefs/lru.c
fs/bcachefs/lru.h
fs/bcachefs/recovery.c

index f515b038c14e44038a5dfcbbfd7975c309b16ed3..e81c04bc2327bfdf221a92087a340de9a2c96da6 100644 (file)
@@ -914,13 +914,11 @@ int bch2_trans_mark_alloc(struct btree_trans *trans,
        new_lru = alloc_lru_idx(*new_a);
 
        if (old_lru != new_lru) {
-               ret = bch2_lru_change(trans, new->k.p.inode, new->k.p.offset,
-                                     old_lru, &new_lru, old);
+               ret = bch2_lru_change(trans, new->k.p.inode,
+                                     bucket_to_u64(new->k.p),
+                                     old_lru, new_lru);
                if (ret)
                        return ret;
-
-               if (new_a->data_type == BCH_DATA_cached)
-                       new_a->io_time[READ] = new_lru;
        }
 
        if (old_a->gen != new_a->gen) {
@@ -1510,7 +1508,6 @@ static int bch2_check_alloc_to_lru_ref(struct btree_trans *trans,
        const struct bch_alloc_v4 *a;
        struct bkey_s_c alloc_k, k;
        struct printbuf buf = PRINTBUF;
-       struct printbuf buf2 = PRINTBUF;
        int ret;
 
        alloc_k = bch2_btree_iter_peek(alloc_iter);
@@ -1527,8 +1524,9 @@ static int bch2_check_alloc_to_lru_ref(struct btree_trans *trans,
                return 0;
 
        bch2_trans_iter_init(trans, &lru_iter, BTREE_ID_lru,
-                            POS(alloc_k.k->p.inode, a->io_time[READ]), 0);
-
+                            lru_pos(alloc_k.k->p.inode,
+                                    bucket_to_u64(alloc_k.k->p),
+                                    a->io_time[READ]), 0);
        k = bch2_btree_iter_peek_slot(&lru_iter);
        ret = bkey_err(k);
        if (ret)
@@ -1539,21 +1537,18 @@ static int bch2_check_alloc_to_lru_ref(struct btree_trans *trans,
                        "  %s",
                (printbuf_reset(&buf),
                 bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf)) ||
-           fsck_err_on(k.k->type != KEY_TYPE_lru ||
-                       le64_to_cpu(bkey_s_c_to_lru(k).v->idx) != alloc_k.k->p.offset, c,
-                       "incorrect/missing lru entry\n"
-                       "  %s\n"
+           fsck_err_on(k.k->type != KEY_TYPE_set, c,
+                       "missing lru entry\n"
                        "  %s",
                        (printbuf_reset(&buf),
-                        bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf),
-                       (bch2_bkey_val_to_text(&buf2, c, k), buf2.buf))) {
+                        bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf))) {
                u64 read_time = a->io_time[READ] ?:
                        atomic64_read(&c->io_clock[READ].now);
 
                ret = bch2_lru_set(trans,
                                   alloc_k.k->p.inode,
-                                  alloc_k.k->p.offset,
-                                  &read_time);
+                                  bucket_to_u64(alloc_k.k->p),
+                                  read_time);
                if (ret)
                        goto err;
 
@@ -1574,7 +1569,6 @@ static int bch2_check_alloc_to_lru_ref(struct btree_trans *trans,
 err:
 fsck_err:
        bch2_trans_iter_exit(trans, &lru_iter);
-       printbuf_exit(&buf2);
        printbuf_exit(&buf);
        return ret;
 }
@@ -1757,51 +1751,34 @@ void bch2_do_discards(struct bch_fs *c)
 }
 
 static int invalidate_one_bucket(struct btree_trans *trans,
-                                struct btree_iter *lru_iter, struct bkey_s_c k,
-                                unsigned dev_idx, s64 *nr_to_invalidate)
+                                struct btree_iter *lru_iter,
+                                struct bpos bucket,
+                                s64 *nr_to_invalidate)
 {
        struct bch_fs *c = trans->c;
        struct btree_iter alloc_iter = { NULL };
        struct bkey_i_alloc_v4 *a;
-       struct bpos bucket;
        struct printbuf buf = PRINTBUF;
        unsigned cached_sectors;
        int ret = 0;
 
-       if (*nr_to_invalidate <= 0 || k.k->p.inode != dev_idx)
+       if (*nr_to_invalidate <= 0)
                return 1;
 
-       if (k.k->type != KEY_TYPE_lru) {
-               prt_printf(&buf, "non lru key in lru btree:\n  ");
-               bch2_bkey_val_to_text(&buf, c, k);
-
-               if (!test_bit(BCH_FS_CHECK_LRUS_DONE, &c->flags)) {
-                       bch_err(c, "%s", buf.buf);
-               } else {
-                       bch2_trans_inconsistent(trans, "%s", buf.buf);
-                       ret = -EINVAL;
-               }
-
-               goto out;
-       }
-
-       bucket = POS(dev_idx, le64_to_cpu(bkey_s_c_to_lru(k).v->idx));
-
        a = bch2_trans_start_alloc_update(trans, &alloc_iter, bucket);
        ret = PTR_ERR_OR_ZERO(a);
        if (ret)
                goto out;
 
-       if (k.k->p.offset != alloc_lru_idx(a->v)) {
+       if (lru_pos_time(lru_iter->pos) != alloc_lru_idx(a->v)) {
                prt_printf(&buf, "alloc key does not point back to lru entry when invalidating bucket:\n  ");
-               bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&a->k_i));
+               bch2_bpos_to_text(&buf, lru_iter->pos);
                prt_printf(&buf, "\n  ");
-               bch2_bkey_val_to_text(&buf, c, k);
+               bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&a->k_i));
 
-               if (!test_bit(BCH_FS_CHECK_LRUS_DONE, &c->flags)) {
-                       bch_err(c, "%s", buf.buf);
-               } else {
-                       bch2_trans_inconsistent(trans, "%s", buf.buf);
+               bch_err(c, "%s", buf.buf);
+               if (test_bit(BCH_FS_CHECK_LRUS_DONE, &c->flags)) {
+                       bch2_inconsistent_error(c);
                        ret = -EINVAL;
                }
 
@@ -1852,9 +1829,13 @@ static void bch2_do_invalidates_work(struct work_struct *work)
                s64 nr_to_invalidate =
                        should_invalidate_buckets(ca, bch2_dev_usage_read(ca));
 
-               ret = for_each_btree_key2(&trans, iter, BTREE_ID_lru,
-                               POS(ca->dev_idx, 0), BTREE_ITER_INTENT, k,
-                       invalidate_one_bucket(&trans, &iter, k, ca->dev_idx, &nr_to_invalidate));
+               ret = for_each_btree_key2_upto(&trans, iter, BTREE_ID_lru,
+                               lru_pos(ca->dev_idx, 0, 0),
+                               lru_pos(ca->dev_idx, U64_MAX, LRU_TIME_MAX),
+                               BTREE_ITER_INTENT, k,
+                       invalidate_one_bucket(&trans, &iter,
+                                             u64_to_bucket(k.k->p.offset),
+                                             &nr_to_invalidate));
 
                if (ret < 0) {
                        percpu_ref_put(&ca->ref);
index 7e67d2e94a296f9836b9543314a863c72129de83..99f9fbd1401fa346c8d93b23fc8fd8290cd51216 100644 (file)
@@ -1562,7 +1562,8 @@ struct bch_sb_field_journal_seq_blacklist {
        x(backpointers,                 22)             \
        x(inode_v3,                     23)             \
        x(unwritten_extents,            24)             \
-       x(bucket_gens,                  25)
+       x(bucket_gens,                  25)             \
+       x(lru_v2,                       26)
 
 enum bcachefs_metadata_version {
        bcachefs_metadata_version_min = 9,
index 293188f47e8aa95340ad6efeca3bb595daf4b5a1..f40a3ea3f79b3af01a26f1e13939ce5a8f29544f 100644 (file)
@@ -186,7 +186,7 @@ static unsigned bch2_key_types_allowed[] = {
                (1U << KEY_TYPE_snapshot),
        [BKEY_TYPE_lru] =
                (1U << KEY_TYPE_deleted)|
-               (1U << KEY_TYPE_lru),
+               (1U << KEY_TYPE_set),
        [BKEY_TYPE_freespace] =
                (1U << KEY_TYPE_deleted)|
                (1U << KEY_TYPE_set),
index 12821868df71e74399c7b17f94a32c8c94c6afa0..6f7becb051bc36e9351ab0d9b514746c30ee6adf 100644 (file)
@@ -8,6 +8,7 @@
 #include "lru.h"
 #include "recovery.h"
 
+/* KEY_TYPE_lru is obsolete: */
 int bch2_lru_invalid(const struct bch_fs *c, struct bkey_s_c k,
                     int rw, struct printbuf *err)
 {
@@ -19,7 +20,7 @@ int bch2_lru_invalid(const struct bch_fs *c, struct bkey_s_c k,
                return -BCH_ERR_invalid_bkey;
        }
 
-       if (!k.k->p.offset) {
+       if (!lru_pos_time(k.k->p)) {
                prt_printf(err, "lru entry at time=0");
                return -BCH_ERR_invalid_bkey;
 
@@ -36,101 +37,57 @@ void bch2_lru_to_text(struct printbuf *out, struct bch_fs *c,
        prt_printf(out, "idx %llu", le64_to_cpu(lru->idx));
 }
 
-int bch2_lru_delete(struct btree_trans *trans, u64 id, u64 idx, u64 time,
-                   struct bkey_s_c orig_k)
+static int __bch2_lru_set(struct btree_trans *trans, u16 lru_id,
+                       u64 dev_bucket, u64 time, unsigned key_type)
 {
        struct btree_iter iter;
-       struct bkey_s_c k;
-       u64 existing_idx;
-       struct printbuf buf = PRINTBUF;
+       struct bkey_i *k;
        int ret = 0;
 
        if (!time)
                return 0;
 
-       bch2_trans_iter_init(trans, &iter, BTREE_ID_lru,
-                            POS(id, time),
-                            BTREE_ITER_INTENT|
-                            BTREE_ITER_WITH_UPDATES);
-       k = bch2_btree_iter_peek_slot(&iter);
-       ret = bkey_err(k);
-       if (ret)
-               goto err;
+       k = bch2_trans_kmalloc_nomemzero(trans, sizeof(*k));
+       ret = PTR_ERR_OR_ZERO(k);
+       if (unlikely(ret))
+               return ret;
 
-       if (k.k->type != KEY_TYPE_lru) {
-               bch2_bkey_val_to_text(&buf, trans->c, orig_k);
-               bch2_trans_inconsistent(trans,
-                       "pointer to nonexistent lru %llu:%llu\n%s",
-                       id, time, buf.buf);
-               ret = -EIO;
-               goto err;
-       }
+       bkey_init(&k->k);
+       k->k.type = key_type;
+       k->k.p = lru_pos(lru_id, dev_bucket, time);
 
-       existing_idx = le64_to_cpu(bkey_s_c_to_lru(k).v->idx);
-       if (existing_idx != idx) {
-               bch2_bkey_val_to_text(&buf, trans->c, orig_k);
-               bch2_trans_inconsistent(trans,
-                       "lru %llu:%llu with wrong backpointer: got %llu, should be %llu\n%s",
-                       id, time, existing_idx, idx, buf.buf);
-               ret = -EIO;
-               goto err;
-       }
+       EBUG_ON(lru_pos_id(k->k.p) != lru_id);
+       EBUG_ON(lru_pos_time(k->k.p) != time);
+       EBUG_ON(k->k.p.offset != dev_bucket);
 
-       ret = bch2_btree_delete_at(trans, &iter, 0);
-err:
+       bch2_trans_iter_init(trans, &iter, BTREE_ID_lru,
+                            k->k.p, BTREE_ITER_INTENT);
+
+       ret = bch2_btree_iter_traverse(&iter) ?:
+               bch2_trans_update(trans, &iter, k, 0);
        bch2_trans_iter_exit(trans, &iter);
-       printbuf_exit(&buf);
        return ret;
 }
 
-int bch2_lru_set(struct btree_trans *trans, u64 lru_id, u64 idx, u64 *time)
+int bch2_lru_del(struct btree_trans *trans, u16 lru_id, u64 dev_bucket, u64 time)
 {
-       struct btree_iter iter;
-       struct bkey_s_c k;
-       struct bkey_i_lru *lru;
-       int ret = 0;
-
-       if (!*time)
-               return 0;
-
-       for_each_btree_key_norestart(trans, iter, BTREE_ID_lru,
-                       POS(lru_id, *time),
-                       BTREE_ITER_SLOTS|
-                       BTREE_ITER_INTENT|
-                       BTREE_ITER_WITH_UPDATES, k, ret)
-               if (bkey_deleted(k.k))
-                       break;
-
-       if (ret)
-               goto err;
-
-       BUG_ON(iter.pos.inode != lru_id);
-       *time = iter.pos.offset;
-
-       lru = bch2_bkey_alloc(trans, &iter, lru);
-       ret = PTR_ERR_OR_ZERO(lru);
-       if (ret)
-               goto err;
-
-       lru->v.idx = cpu_to_le64(idx);
+       return __bch2_lru_set(trans, lru_id, dev_bucket, time, KEY_TYPE_deleted);
+}
 
-       ret = bch2_trans_update(trans, &iter, &lru->k_i, 0);
-       if (ret)
-               goto err;
-err:
-       bch2_trans_iter_exit(trans, &iter);
-       return ret;
+int bch2_lru_set(struct btree_trans *trans, u16 lru_id, u64 dev_bucket, u64 time)
+{
+       return __bch2_lru_set(trans, lru_id, dev_bucket, time, KEY_TYPE_set);
 }
 
-int bch2_lru_change(struct btree_trans *trans, u64 id, u64 idx,
-                   u64 old_time, u64 *new_time,
-                   struct bkey_s_c k)
+int bch2_lru_change(struct btree_trans *trans,
+                   u16 lru_id, u64 dev_bucket,
+                   u64 old_time, u64 new_time)
 {
-       if (old_time == *new_time)
+       if (old_time == new_time)
                return 0;
 
-       return  bch2_lru_delete(trans, id, idx, old_time, k) ?:
-               bch2_lru_set(trans, id, idx, new_time);
+       return  bch2_lru_del(trans, lru_id, dev_bucket, old_time) ?:
+               bch2_lru_set(trans, lru_id, dev_bucket, new_time);
 }
 
 static int bch2_check_lru_key(struct btree_trans *trans,
@@ -144,12 +101,9 @@ static int bch2_check_lru_key(struct btree_trans *trans,
        const struct bch_alloc_v4 *a;
        struct printbuf buf1 = PRINTBUF;
        struct printbuf buf2 = PRINTBUF;
-       struct bpos alloc_pos;
+       struct bpos alloc_pos = u64_to_bucket(lru_k.k->p.offset);
        int ret;
 
-       alloc_pos = POS(lru_k.k->p.inode,
-                       le64_to_cpu(bkey_s_c_to_lru(lru_k).v->idx));
-
        if (fsck_err_on(!bch2_dev_bucket_exists(c, alloc_pos), c,
                        "lru key points to nonexistent device:bucket %llu:%llu",
                        alloc_pos.inode, alloc_pos.offset))
@@ -163,10 +117,12 @@ static int bch2_check_lru_key(struct btree_trans *trans,
 
        a = bch2_alloc_to_v4(k, &a_convert);
 
-       if (fsck_err_on(a->data_type != BCH_DATA_cached ||
-                       a->io_time[READ] != lru_k.k->p.offset, c,
-                       "incorrect lru entry %s\n"
+       if (fsck_err_on(lru_k.k->type != KEY_TYPE_set ||
+                       a->data_type != BCH_DATA_cached ||
+                       a->io_time[READ] != lru_pos_time(lru_k.k->p), c,
+                       "incorrect lru entry (time %llu) %s\n"
                        "  for %s",
+                       lru_pos_time(lru_k.k->p),
                        (bch2_bkey_val_to_text(&buf1, c, lru_k), buf1.buf),
                        (bch2_bkey_val_to_text(&buf2, c, k), buf2.buf))) {
                ret = bch2_btree_delete_at(trans, lru_iter, 0);
index 925c29b49b867ae45d9b5baf08159e0fe9d53b53..2e22f139848ae36a21667f922140a7ad6e84c7d7 100644 (file)
@@ -2,6 +2,26 @@
 #ifndef _BCACHEFS_LRU_H
 #define _BCACHEFS_LRU_H
 
+#define LRU_TIME_BITS  48
+#define LRU_TIME_MAX   ((1ULL << LRU_TIME_BITS) - 1)
+
+static inline struct bpos lru_pos(u16 lru_id, u64 dev_bucket, u64 time)
+{
+       EBUG_ON(time > LRU_TIME_MAX);
+
+       return POS(((u64) lru_id << LRU_TIME_BITS)|time, dev_bucket);
+}
+
+static inline u64 lru_pos_id(struct bpos pos)
+{
+       return pos.inode >> LRU_TIME_BITS;
+}
+
+static inline u64 lru_pos_time(struct bpos pos)
+{
+       return pos.inode & ~(~0ULL << LRU_TIME_BITS);
+}
+
 int bch2_lru_invalid(const struct bch_fs *, struct bkey_s_c, int, struct printbuf *);
 void bch2_lru_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);
 
@@ -10,9 +30,9 @@ void bch2_lru_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);
        .val_to_text    = bch2_lru_to_text,     \
 })
 
-int bch2_lru_delete(struct btree_trans *, u64, u64, u64, struct bkey_s_c);
-int bch2_lru_set(struct btree_trans *, u64, u64, u64 *);
-int bch2_lru_change(struct btree_trans *, u64, u64, u64, u64 *, struct bkey_s_c);
+int bch2_lru_del(struct btree_trans *, u16, u64, u64);
+int bch2_lru_set(struct btree_trans *, u16, u64, u64);
+int bch2_lru_change(struct btree_trans *, u16, u64, u64, u64);
 
 int bch2_check_lrus(struct bch_fs *);
 
index b10ba8963350a3275240a846baa7bcbd6cd0878e..8a78377bf9c5991846c7e4202107cac7d1b00656 100644 (file)
@@ -1094,14 +1094,11 @@ int bch2_fs_recovery(struct bch_fs *c)
        }
 
        if (!c->opts.nochanges) {
-               if (c->sb.version < bcachefs_metadata_version_backpointers) {
+               if (c->sb.version < bcachefs_metadata_version_lru_v2) {
                        bch_info(c, "version prior to backpointers, upgrade and fsck required");
                        c->opts.version_upgrade = true;
                        c->opts.fsck            = true;
                        c->opts.fix_errors      = FSCK_OPT_YES;
-               } else if (c->sb.version < bcachefs_metadata_version_inode_v3) {
-                       bch_info(c, "version prior to inode_v3, upgrade required");
-                       c->opts.version_upgrade = true;
                }
        }