bcachefs: Btree key cache coherency
author Kent Overstreet <kent.overstreet@gmail.com>
Wed, 12 Jan 2022 06:14:47 +0000 (01:14 -0500)
committer Kent Overstreet <kent.overstreet@linux.dev>
Sun, 22 Oct 2023 21:09:23 +0000 (17:09 -0400)
 - Updates through non-key-cache iterators will now be transparently
   redirected to the key cache for cached btrees.

 - Except when creating new keys: then the update goes to the
   underlying btree.

For iterating over a cached btree to work, we need to ensure that if
a key exists in the key cache, it also exists in the btree - otherwise
the iterator code will skip past it and not check the key cache.

That aside, for consistency, all updates should go to the same place:
the key cache.
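
As a rough illustration - a toy model with invented names, not the
actual bcachefs code - the update and lookup rules look like this:

  /*
   * Toy model of the coherency rules above: keys live in a backing
   * "btree" and optionally in a "key cache" holding the newest value.
   */
  #include <stdbool.h>
  #include <stdio.h>

  #define NKEYS 16

  static bool in_btree[NKEYS], in_cache[NKEYS];
  static int  btree_val[NKEYS], cache_val[NKEYS];

  /* Update: creating a new key goes to the underlying btree;
   * overwriting an existing key is redirected to the key cache.
   * Either way, "present in cache => present in btree" holds. */
  static void trans_update(unsigned pos, int val)
  {
          if (!in_btree[pos]) {
                  in_btree[pos]  = true;
                  btree_val[pos] = val;
                  return;
          }
          in_cache[pos]  = true;
          cache_val[pos] = val;
  }

  /* Lookup: iteration walks the btree; only positions present there
   * are checked against the cache. A cache-only key would be skipped,
   * which is exactly what the invariant rules out. */
  static bool peek(unsigned pos, int *val)
  {
          if (!in_btree[pos])
                  return false;
          *val = in_cache[pos] ? cache_val[pos] : btree_val[pos];
          return true;
  }

  /* Reclaim: write the cached value back and drop the cache entry -
   * the role BTREE_UPDATE_KEY_CACHE_RECLAIM plays in the patch. */
  static void flush(unsigned pos)
  {
          if (in_cache[pos]) {
                  btree_val[pos] = cache_val[pos];
                  in_cache[pos]  = false;
          }
  }

  int main(void)
  {
          int v;

          trans_update(3, 1);     /* create: goes to the btree */
          trans_update(3, 2);     /* overwrite: goes to the cache */
          if (peek(3, &v))
                  printf("pos 3 = %d\n", v);   /* 2, from the cache */
          flush(3);
          if (peek(3, &v))
                  printf("pos 3 = %d\n", v);   /* 2, from the btree */
          return 0;
  }

In the patch itself, the create case is handled by flagging the cached
update key_cache_already_flushed and queueing a second update through a
btree path (see bch2_trans_update_by_path_trace below); at commit time,
do_btree_insert_one then drops the key cache entry instead of writing it.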

Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
fs/bcachefs/alloc_background.c
fs/bcachefs/btree_iter.c
fs/bcachefs/btree_key_cache.c
fs/bcachefs/btree_key_cache.h
fs/bcachefs/btree_types.h
fs/bcachefs/btree_update.h
fs/bcachefs/btree_update_leaf.c
fs/bcachefs/trace.h

diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c
index 1353e72bbfb0884bf852ead447fa1f283ee5bc2d..55af41a63ff7e5dcfa5e212c22daf63f3903e2f9 100644
@@ -271,7 +271,8 @@ int bch2_alloc_write(struct btree_trans *trans, struct btree_iter *iter,
                return PTR_ERR(a);
 
        bch2_alloc_pack(trans->c, a, *u);
-       return bch2_trans_update(trans, iter, &a->k, trigger_flags);
+       return bch2_trans_update(trans, iter, &a->k, trigger_flags|
+                                BTREE_UPDATE_NO_KEY_CACHE_COHERENCY);
 }
 
 static unsigned bch_alloc_v1_val_u64s(const struct bch_alloc *a)
diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c
index ff98024e76fc772413a8516134cf76af14c425ab..c7ba6ce270074e436f6e519f57289e65a69dc42b 100644
@@ -2188,6 +2188,8 @@ struct bkey_i *__bch2_btree_trans_peek_updates(struct btree_iter *iter)
                        break;
                if (bpos_cmp(i->k->k.p, iter->path->pos) < 0)
                        continue;
+               if (i->key_cache_already_flushed)
+                       continue;
                if (!ret || bpos_cmp(i->k->k.p, ret->k.p) < 0)
                        ret = i->k;
        }
diff --git a/fs/bcachefs/btree_key_cache.c b/fs/bcachefs/btree_key_cache.c
index 29d46d0aa5d3d474258b16b4bafbd6df16e684a9..72a54b9d13358334d70c823851d7e75eaf2370fd 100644
@@ -414,6 +414,7 @@ static int btree_key_cache_flush_pos(struct btree_trans *trans,
         * */
        ret   = bch2_btree_iter_traverse(&b_iter) ?:
                bch2_trans_update(trans, &b_iter, ck->k,
+                                 BTREE_UPDATE_KEY_CACHE_RECLAIM|
                                  BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE|
                                  BTREE_TRIGGER_NORUN) ?:
                bch2_trans_commit(trans, NULL, NULL,
@@ -555,13 +556,15 @@ bool bch2_btree_insert_key_cached(struct btree_trans *trans,
        return true;
 }
 
-#ifdef CONFIG_BCACHEFS_DEBUG
-void bch2_btree_key_cache_verify_clean(struct btree_trans *trans,
-                              enum btree_id id, struct bpos pos)
+void bch2_btree_key_cache_drop(struct btree_trans *trans,
+                              struct btree_path *path)
 {
-       BUG_ON(bch2_btree_key_cache_find(trans->c, id, pos));
+       struct bkey_cached *ck = (void *) path->l[0].b;
+
+       ck->valid = false;
+
+       BUG_ON(test_bit(BKEY_CACHED_DIRTY, &ck->flags));
 }
-#endif
 
 static unsigned long bch2_btree_key_cache_scan(struct shrinker *shrink,
                                           struct shrink_control *sc)
diff --git a/fs/bcachefs/btree_key_cache.h b/fs/bcachefs/btree_key_cache.h
index b3d241b134539e545a44557afd7fb16ebe87f4cf..670746e72dabae9cb3d56a5cbe69360ca4fdbe7d 100644
@@ -32,14 +32,8 @@ bool bch2_btree_insert_key_cached(struct btree_trans *,
                        struct btree_path *, struct bkey_i *);
 int bch2_btree_key_cache_flush(struct btree_trans *,
                               enum btree_id, struct bpos);
-#ifdef CONFIG_BCACHEFS_DEBUG
-void bch2_btree_key_cache_verify_clean(struct btree_trans *,
-                               enum btree_id, struct bpos);
-#else
-static inline void
-bch2_btree_key_cache_verify_clean(struct btree_trans *trans,
-                               enum btree_id id, struct bpos pos) {}
-#endif
+void bch2_btree_key_cache_drop(struct btree_trans *,
+                              struct btree_path *);
 
 void bch2_fs_btree_key_cache_exit(struct btree_key_cache *);
 void bch2_fs_btree_key_cache_init_early(struct btree_key_cache *);
diff --git a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h
index 6db2ac49ee3fa12394fcc0a42172157ff35fcaf8..0afade4f61f409f4f9b80539af54b2b7073404fa 100644
@@ -344,6 +344,7 @@ struct btree_insert_entry {
        bool                    cached:1;
        bool                    insert_trigger_run:1;
        bool                    overwrite_trigger_run:1;
+       bool                    key_cache_already_flushed:1;
        /*
         * @old_k may be a key from the journal; @old_btree_u64s always refers
         * to the size of the key being overwritten in the btree:
@@ -645,6 +646,8 @@ static inline bool btree_type_has_snapshots(enum btree_id id)
 enum btree_update_flags {
        __BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE,
        __BTREE_UPDATE_NOJOURNAL,
+       __BTREE_UPDATE_KEY_CACHE_RECLAIM,
+       __BTREE_UPDATE_NO_KEY_CACHE_COHERENCY,
 
        __BTREE_TRIGGER_NORUN,          /* Don't run triggers at all */
 
@@ -658,6 +661,9 @@ enum btree_update_flags {
 
 #define BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE (1U << __BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE)
 #define BTREE_UPDATE_NOJOURNAL         (1U << __BTREE_UPDATE_NOJOURNAL)
+#define BTREE_UPDATE_KEY_CACHE_RECLAIM (1U << __BTREE_UPDATE_KEY_CACHE_RECLAIM)
+#define BTREE_UPDATE_NO_KEY_CACHE_COHERENCY    \
+       (1U << __BTREE_UPDATE_NO_KEY_CACHE_COHERENCY)
 
 #define BTREE_TRIGGER_NORUN            (1U << __BTREE_TRIGGER_NORUN)
 
diff --git a/fs/bcachefs/btree_update.h b/fs/bcachefs/btree_update.h
index 5e5a1b5e750eb1d75552c27c31743a5326072896..d9a406a28f4728b920b74a353f606f56c57e0dc7 100644
@@ -76,8 +76,6 @@ int bch2_btree_node_update_key_get_iter(struct btree_trans *,
 int bch2_trans_update_extent(struct btree_trans *, struct btree_iter *,
                             struct bkey_i *, enum btree_update_flags);
 
-int __must_check bch2_trans_update_by_path(struct btree_trans *, struct btree_path *,
-                                  struct bkey_i *, enum btree_update_flags);
 int __must_check bch2_trans_update(struct btree_trans *, struct btree_iter *,
                                   struct bkey_i *, enum btree_update_flags);
 
diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c
index 2af2d75a06c52171a27b9d97aafff22ae47b0fb2..dc033991a4ecf7663c3a8f2e6cc2f8c511731827 100644
 #include <linux/prefetch.h>
 #include <linux/sort.h>
 
+static int __must_check
+bch2_trans_update_by_path(struct btree_trans *, struct btree_path *,
+                         struct bkey_i *, enum btree_update_flags);
+
 static inline int btree_insert_entry_cmp(const struct btree_insert_entry *l,
                                         const struct btree_insert_entry *r)
 {
        return   cmp_int(l->btree_id,   r->btree_id) ?:
+                cmp_int(l->cached,     r->cached) ?:
                 -cmp_int(l->level,     r->level) ?:
                 bpos_cmp(l->k->k.p,    r->k->k.p);
 }
@@ -378,9 +383,14 @@ static inline void do_btree_insert_one(struct btree_trans *trans,
 
        i->k->k.needs_whiteout = false;
 
-       did_work = !i->cached
-               ? btree_insert_key_leaf(trans, i)
-               : bch2_btree_insert_key_cached(trans, i->path, i->k);
+       if (!i->cached)
+               did_work = btree_insert_key_leaf(trans, i);
+       else if (!i->key_cache_already_flushed)
+               did_work = bch2_btree_insert_key_cached(trans, i->path, i->k);
+       else {
+               bch2_btree_key_cache_drop(trans, i->path);
+               did_work = false;
+       }
        if (!did_work)
                return;
 
@@ -987,18 +997,6 @@ int __bch2_trans_commit(struct btree_trans *trans)
                        goto out_reset;
        }
 
-#ifdef CONFIG_BCACHEFS_DEBUG
-       /*
-        * if BTREE_TRIGGER_NORUN is set, it means we're probably being called
-        * from the key cache flush code:
-        */
-       trans_for_each_update(trans, i)
-               if (!i->cached &&
-                   !(i->flags & BTREE_TRIGGER_NORUN))
-                       bch2_btree_key_cache_verify_clean(trans,
-                                       i->btree_id, i->k->k.p);
-#endif
-
        ret = bch2_trans_commit_run_triggers(trans);
        if (ret)
                goto out;
@@ -1369,11 +1367,14 @@ static int need_whiteout_for_snapshot(struct btree_trans *trans,
        return ret;
 }
 
-int __must_check bch2_trans_update_by_path(struct btree_trans *trans, struct btree_path *path,
-                                  struct bkey_i *k, enum btree_update_flags flags)
+static int __must_check
+bch2_trans_update_by_path_trace(struct btree_trans *trans, struct btree_path *path,
+                               struct bkey_i *k, enum btree_update_flags flags,
+                               unsigned long ip)
 {
        struct bch_fs *c = trans->c;
        struct btree_insert_entry *i, n;
+       int ret = 0;
 
        BUG_ON(!path->should_be_locked);
 
@@ -1388,7 +1389,7 @@ int __must_check bch2_trans_update_by_path(struct btree_trans *trans, struct btr
                .cached         = path->cached,
                .path           = path,
                .k              = k,
-               .ip_allocated   = _RET_IP_,
+               .ip_allocated   = ip,
        };
 
 #ifdef CONFIG_BCACHEFS_DEBUG
@@ -1409,17 +1410,6 @@ int __must_check bch2_trans_update_by_path(struct btree_trans *trans, struct btr
            !btree_insert_entry_cmp(&n, i)) {
                BUG_ON(i->insert_trigger_run || i->overwrite_trigger_run);
 
-               /*
-                * This is a hack to ensure that inode creates update the btree,
-                * not the key cache, which helps with cache coherency issues in
-                * other areas:
-                */
-               if (n.cached && !i->cached) {
-                       i->k = n.k;
-                       i->flags = n.flags;
-                       return 0;
-               }
-
                bch2_path_put(trans, i->path, true);
                i->flags        = n.flags;
                i->cached       = n.cached;
@@ -1444,19 +1434,60 @@ int __must_check bch2_trans_update_by_path(struct btree_trans *trans, struct btr
                }
        }
 
-       __btree_path_get(n.path, true);
-       return 0;
+       __btree_path_get(i->path, true);
+
+       /*
+        * If a key is present in the key cache, it must also exist in the
+        * btree - this is necessary for cache coherency. When iterating over
+        * a btree that's cached in the key cache, the btree iter code checks
+        * the key cache - but the key has to exist in the btree for that to
+        * work:
+        */
+       if (path->cached &&
+           bkey_deleted(&i->old_k) &&
+           !(flags & BTREE_UPDATE_NO_KEY_CACHE_COHERENCY)) {
+               struct btree_path *btree_path;
+
+               i->key_cache_already_flushed = true;
+               i->flags |= BTREE_TRIGGER_NORUN;
+
+               btree_path = bch2_path_get(trans, path->btree_id, path->pos,
+                                          1, 0, BTREE_ITER_INTENT);
+
+               ret = bch2_btree_path_traverse(trans, btree_path, 0);
+               if (ret)
+                       goto err;
+
+               btree_path->should_be_locked = true;
+               ret = bch2_trans_update_by_path_trace(trans, btree_path, k, flags, ip);
+err:
+               bch2_path_put(trans, btree_path, true);
+       }
+
+       return ret;
+}
+
+static int __must_check
+bch2_trans_update_by_path(struct btree_trans *trans, struct btree_path *path,
+                         struct bkey_i *k, enum btree_update_flags flags)
+{
+       return bch2_trans_update_by_path_trace(trans, path, k, flags, _RET_IP_);
 }
 
 int __must_check bch2_trans_update(struct btree_trans *trans, struct btree_iter *iter,
                                   struct bkey_i *k, enum btree_update_flags flags)
 {
+       struct btree_path *path = iter->update_path ?: iter->path;
+       struct bkey_cached *ck;
+       int ret;
+
        if (iter->flags & BTREE_ITER_IS_EXTENTS)
                return bch2_trans_update_extent(trans, iter, k, flags);
 
        if (bkey_deleted(&k->k) &&
+           !(flags & BTREE_UPDATE_KEY_CACHE_RECLAIM) &&
            (iter->flags & BTREE_ITER_FILTER_SNAPSHOTS)) {
-               int ret = need_whiteout_for_snapshot(trans, iter->btree_id, k->k.p);
+               ret = need_whiteout_for_snapshot(trans, iter->btree_id, k->k.p);
                if (unlikely(ret < 0))
                        return ret;
 
@@ -1464,8 +1495,45 @@ int __must_check bch2_trans_update(struct btree_trans *trans, struct btree_iter
                        k->k.type = KEY_TYPE_whiteout;
        }
 
-       return bch2_trans_update_by_path(trans, iter->update_path ?: iter->path,
-                                        k, flags);
+       /*
+        * Ensure that updates to cached btrees go to the key cache:
+        */
+       if (!(flags & BTREE_UPDATE_KEY_CACHE_RECLAIM) &&
+           !path->cached &&
+           !path->level &&
+           btree_id_cached(trans->c, path->btree_id)) {
+               if (!iter->key_cache_path ||
+                   !iter->key_cache_path->should_be_locked ||
+                   bpos_cmp(iter->key_cache_path->pos, k->k.p)) {
+                       if (!iter->key_cache_path)
+                               iter->key_cache_path =
+                                       bch2_path_get(trans, path->btree_id, path->pos, 1, 0,
+                                                     BTREE_ITER_INTENT|BTREE_ITER_CACHED);
+
+                       iter->key_cache_path =
+                               bch2_btree_path_set_pos(trans, iter->key_cache_path, path->pos,
+                                                       iter->flags & BTREE_ITER_INTENT);
+
+                       ret = bch2_btree_path_traverse(trans, iter->key_cache_path,
+                                                      BTREE_ITER_CACHED);
+                       if (unlikely(ret))
+                               return ret;
+
+                       ck = (void *) iter->key_cache_path->l[0].b;
+
+                       if (test_bit(BKEY_CACHED_DIRTY, &ck->flags)) {
+                               trace_trans_restart_key_cache_raced(trans->fn, _RET_IP_);
+                               btree_trans_restart(trans);
+                               return -EINTR;
+                       }
+
+                       iter->key_cache_path->should_be_locked = true;
+               }
+
+               path = iter->key_cache_path;
+       }
+
+       return bch2_trans_update_by_path(trans, path, k, flags);
 }
 
 void bch2_trans_commit_hook(struct btree_trans *trans,
diff --git a/fs/bcachefs/trace.h b/fs/bcachefs/trace.h
index d432c90a14915b37d5b59ed1530ee982b5185154..5e78c396e24cc3f47192d132fcd7532f0d64dab9 100644
@@ -658,6 +658,12 @@ DEFINE_EVENT(transaction_restart,  trans_restart_mark_replicas,
        TP_ARGS(trans_fn, caller_ip)
 );
 
+DEFINE_EVENT(transaction_restart,      trans_restart_key_cache_raced,
+       TP_PROTO(const char *trans_fn,
+                unsigned long caller_ip),
+       TP_ARGS(trans_fn, caller_ip)
+);
+
 DECLARE_EVENT_CLASS(transaction_restart_iter,
        TP_PROTO(const char *trans_fn,
                 unsigned long caller_ip,