bcachefs: BTREE_ITER_FILTER_SNAPSHOTS
authorKent Overstreet <kent.overstreet@gmail.com>
Fri, 5 Mar 2021 03:29:25 +0000 (22:29 -0500)
committerKent Overstreet <kent.overstreet@linux.dev>
Sun, 22 Oct 2023 21:09:12 +0000 (17:09 -0400)
For snapshots, we need to implement btree lookups that return the first
key that's an ancestor of the snapshot ID the lookup is being done in -
and filter out keys in unrelated snapshots. This patch adds the btree
iterator flag BTREE_ITER_FILTER_SNAPSHOTS which does that filtering.

Signed-off-by: Kent Overstreet <kent.overstreet@gmail.com>
fs/bcachefs/btree_iter.c
fs/bcachefs/btree_iter.h
fs/bcachefs/btree_key_cache.c
fs/bcachefs/btree_types.h

index 4cfd793f85e7abf8d546820698aaf1700f685002..b589b96bc9e7ab52b84a3628284f14c4a65854dd 100644 (file)
@@ -13,6 +13,7 @@
 #include "extents.h"
 #include "journal.h"
 #include "replicas.h"
+#include "subvolume.h"
 #include "trace.h"
 
 #include <linux/prefetch.h>
@@ -683,6 +684,55 @@ static void bch2_btree_iter_verify_entry_exit(struct btree_iter *iter)
               bkey_cmp(iter->pos, iter->k.p) > 0);
 }
 
+static int bch2_btree_iter_verify_ret(struct btree_iter *iter, struct bkey_s_c k)
+{
+       struct btree_trans *trans = iter->trans;
+       struct btree_iter copy;
+       struct bkey_s_c prev;
+       int ret = 0;
+
+       if (!bch2_debug_check_iterators)
+               return 0;
+
+       if (!(iter->flags & BTREE_ITER_FILTER_SNAPSHOTS))
+               return 0;
+
+       if (bkey_err(k) || !k.k)
+               return 0;
+
+       BUG_ON(!bch2_snapshot_is_ancestor(trans->c,
+                                         iter->snapshot,
+                                         k.k->p.snapshot));
+
+       bch2_trans_iter_init(trans, &copy, iter->btree_id, iter->pos,
+                            BTREE_ITER_ALL_SNAPSHOTS);
+       prev = bch2_btree_iter_prev(&copy);
+       if (!prev.k)
+               goto out;
+
+       ret = bkey_err(prev);
+       if (ret)
+               goto out;
+
+       if (!bkey_cmp(prev.k->p, k.k->p) &&
+           bch2_snapshot_is_ancestor(trans->c, iter->snapshot,
+                                     prev.k->p.snapshot) > 0) {
+               char buf1[100], buf2[200];
+
+               bch2_bkey_to_text(&PBUF(buf1), k.k);
+               bch2_bkey_to_text(&PBUF(buf2), prev.k);
+
+               panic("iter snap %u\n"
+                     "k    %s\n"
+                     "prev %s\n",
+                     iter->snapshot,
+                     buf1, buf2);
+       }
+out:
+       bch2_trans_iter_exit(trans, &copy);
+       return ret;
+}
+
 #else
 
 static inline void bch2_btree_path_verify_level(struct btree_trans *trans,
@@ -691,6 +741,7 @@ static inline void bch2_btree_path_verify(struct btree_trans *trans,
                                          struct btree_path *path) {}
 static inline void bch2_btree_iter_verify(struct btree_iter *iter) {}
 static inline void bch2_btree_iter_verify_entry_exit(struct btree_iter *iter) {}
+static inline int bch2_btree_iter_verify_ret(struct btree_iter *iter, struct bkey_s_c k) { return 0; }
 
 #endif
 
@@ -2004,11 +2055,25 @@ struct bkey_s_c bch2_btree_iter_peek(struct btree_iter *iter)
                }
 
                if (likely(k.k)) {
-                       if (likely(!bkey_deleted(k.k)))
-                               break;
+                       /*
+                        * We can never have a key in a leaf node at POS_MAX, so
+                        * we don't have to check these successor() calls:
+                        */
+                       if ((iter->flags & BTREE_ITER_FILTER_SNAPSHOTS) &&
+                           !bch2_snapshot_is_ancestor(trans->c,
+                                                      iter->snapshot,
+                                                      k.k->p.snapshot)) {
+                               search_key = bpos_successor(k.k->p);
+                               continue;
+                       }
 
-                       /* Advance to next key: */
-                       search_key = bkey_successor(iter, k.k->p);
+                       if (bkey_whiteout(k.k) &&
+                           !(iter->flags & BTREE_ITER_ALL_SNAPSHOTS)) {
+                               search_key = bkey_successor(iter, k.k->p);
+                               continue;
+                       }
+
+                       break;
                } else if (likely(bpos_cmp(iter->path->l[0].b->key.k.p, SPOS_MAX))) {
                        /* Advance to next leaf node: */
                        search_key = bpos_successor(iter->path->l[0].b->key.k.p);
@@ -2029,6 +2094,9 @@ struct bkey_s_c bch2_btree_iter_peek(struct btree_iter *iter)
        else if (bkey_cmp(bkey_start_pos(k.k), iter->pos) > 0)
                iter->pos = bkey_start_pos(k.k);
 
+       if (iter->flags & BTREE_ITER_FILTER_SNAPSHOTS)
+               iter->pos.snapshot = iter->snapshot;
+
        cmp = bpos_cmp(k.k->p, iter->path->pos);
        if (cmp) {
                iter->path = bch2_btree_path_make_mut(trans, iter->path,
@@ -2041,6 +2109,10 @@ out:
 
        bch2_btree_iter_verify_entry_exit(iter);
        bch2_btree_iter_verify(iter);
+       ret = bch2_btree_iter_verify_ret(iter, k);
+       if (unlikely(ret))
+               return bkey_s_c_err(ret);
+
        return k;
 }
 
@@ -2064,7 +2136,10 @@ struct bkey_s_c bch2_btree_iter_peek_prev(struct btree_iter *iter)
 {
        struct btree_trans *trans = iter->trans;
        struct bpos search_key = iter->pos;
+       struct btree_path *saved_path = NULL;
        struct bkey_s_c k;
+       struct bkey saved_k;
+       const struct bch_val *saved_v;
        int ret;
 
        EBUG_ON(iter->path->cached || iter->path->level);
@@ -2072,6 +2147,9 @@ struct bkey_s_c bch2_btree_iter_peek_prev(struct btree_iter *iter)
        bch2_btree_iter_verify(iter);
        bch2_btree_iter_verify_entry_exit(iter);
 
+       if (iter->flags & BTREE_ITER_FILTER_SNAPSHOTS)
+               search_key.snapshot = U32_MAX;
+
        while (1) {
                iter->path = btree_path_set_pos(trans, iter->path, search_key,
                                                iter->flags & BTREE_ITER_INTENT);
@@ -2088,12 +2166,55 @@ struct bkey_s_c bch2_btree_iter_peek_prev(struct btree_iter *iter)
                                          &iter->path->l[0], &iter->k);
                if (!k.k ||
                    ((iter->flags & BTREE_ITER_IS_EXTENTS)
-                    ? bkey_cmp(bkey_start_pos(k.k), iter->pos) >= 0
-                    : bkey_cmp(k.k->p, iter->pos) > 0))
+                    ? bpos_cmp(bkey_start_pos(k.k), search_key) >= 0
+                    : bpos_cmp(k.k->p, search_key) > 0))
                        k = btree_path_level_prev(trans, iter->path,
                                                  &iter->path->l[0], &iter->k);
 
                if (likely(k.k)) {
+                       if (iter->flags & BTREE_ITER_FILTER_SNAPSHOTS) {
+                               if (k.k->p.snapshot == iter->snapshot)
+                                       goto got_key;
+
+                               /*
+                                * If we have a saved candidate, and we're no
+                                * longer at the same _key_ (not pos), return
+                                * that candidate
+                                */
+                               if (saved_path && bkey_cmp(k.k->p, saved_k.p)) {
+                                       bch2_path_put(trans, iter->path,
+                                                     iter->flags & BTREE_ITER_INTENT);
+                                       iter->path = saved_path;
+                                       saved_path = NULL;
+                                       iter->k = saved_k;
+                                       k.v     = saved_v;
+                                       goto got_key;
+                               }
+
+                               if (bch2_snapshot_is_ancestor(iter->trans->c,
+                                                             iter->snapshot,
+                                                             k.k->p.snapshot)) {
+                                       if (saved_path)
+                                               bch2_path_put(trans, saved_path,
+                                                     iter->flags & BTREE_ITER_INTENT);
+                                       saved_path = btree_path_clone(trans, iter->path,
+                                                               iter->flags & BTREE_ITER_INTENT);
+                                       saved_k = *k.k;
+                                       saved_v = k.v;
+                               }
+
+                               search_key = bpos_predecessor(k.k->p);
+                               continue;
+                       }
+got_key:
+                       if (bkey_whiteout(k.k) &&
+                           !(iter->flags & BTREE_ITER_ALL_SNAPSHOTS)) {
+                               search_key = bkey_predecessor(iter, k.k->p);
+                               if (iter->flags & BTREE_ITER_FILTER_SNAPSHOTS)
+                                       search_key.snapshot = U32_MAX;
+                               continue;
+                       }
+
                        break;
                } else if (likely(bpos_cmp(iter->path->l[0].b->data->min_key, POS_MIN))) {
                        /* Advance to previous leaf node: */
@@ -2111,7 +2232,12 @@ struct bkey_s_c bch2_btree_iter_peek_prev(struct btree_iter *iter)
        /* Extents can straddle iter->pos: */
        if (bkey_cmp(k.k->p, iter->pos) < 0)
                iter->pos = k.k->p;
+
+       if (iter->flags & BTREE_ITER_FILTER_SNAPSHOTS)
+               iter->pos.snapshot = iter->snapshot;
 out:
+       if (saved_path)
+               bch2_path_put(trans, saved_path, iter->flags & BTREE_ITER_INTENT);
        iter->path->should_be_locked = true;
 
        bch2_btree_iter_verify_entry_exit(iter);
@@ -2160,7 +2286,8 @@ struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter)
        if (unlikely(ret))
                return bkey_s_c_err(ret);
 
-       if (!(iter->flags & BTREE_ITER_IS_EXTENTS)) {
+       if ((iter->flags & BTREE_ITER_CACHED) ||
+           !(iter->flags & (BTREE_ITER_IS_EXTENTS|BTREE_ITER_FILTER_SNAPSHOTS))) {
                struct bkey_i *next_update;
 
                next_update = btree_trans_peek_updates(iter);
@@ -2209,15 +2336,18 @@ struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter)
                if (bkey_cmp(iter->pos, next) < 0) {
                        bkey_init(&iter->k);
                        iter->k.p = iter->pos;
-                       bch2_key_resize(&iter->k,
-                                       min_t(u64, KEY_SIZE_MAX,
-                                             (next.inode == iter->pos.inode
-                                              ? next.offset
-                                              : KEY_OFFSET_MAX) -
-                                             iter->pos.offset));
+
+                       if (iter->flags & BTREE_ITER_IS_EXTENTS) {
+                               bch2_key_resize(&iter->k,
+                                               min_t(u64, KEY_SIZE_MAX,
+                                                     (next.inode == iter->pos.inode
+                                                      ? next.offset
+                                                      : KEY_OFFSET_MAX) -
+                                                     iter->pos.offset));
+                               EBUG_ON(!iter->k.size);
+                       }
 
                        k = (struct bkey_s_c) { &iter->k, NULL };
-                       EBUG_ON(!k.k->size);
                }
        }
 
@@ -2225,6 +2355,9 @@ struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter)
 
        bch2_btree_iter_verify_entry_exit(iter);
        bch2_btree_iter_verify(iter);
+       ret = bch2_btree_iter_verify_ret(iter, k);
+       if (unlikely(ret))
+               return bkey_s_c_err(ret);
 
        return k;
 }
@@ -2392,6 +2525,13 @@ static void __bch2_trans_iter_init(struct btree_trans *trans,
        if (!btree_type_has_snapshots(btree_id) &&
            !(flags & __BTREE_ITER_ALL_SNAPSHOTS))
                flags &= ~BTREE_ITER_ALL_SNAPSHOTS;
+#if 0
+       /* let's have this be explicitly set: */
+       if ((flags & BTREE_ITER_TYPE) != BTREE_ITER_NODES &&
+           btree_type_has_snapshots(btree_id) &&
+           !(flags & BTREE_ITER_ALL_SNAPSHOTS))
+               flags |= BTREE_ITER_FILTER_SNAPSHOTS;
+#endif
 
        if (!(flags & BTREE_ITER_ALL_SNAPSHOTS))
                pos.snapshot = btree_type_has_snapshots(btree_id)
index 58add0bb1c815faaf2e08558a0b901f44d74cba9..feb2fcff14854216acf634ecbd35e99e60a68e95 100644 (file)
@@ -260,6 +260,15 @@ static inline void bch2_btree_iter_set_pos_to_extent_start(struct btree_iter *it
        iter->pos = bkey_start_pos(&iter->k);
 }
 
+static inline void bch2_btree_iter_set_snapshot(struct btree_iter *iter, u32 snapshot)
+{
+       struct bpos pos = iter->pos;
+
+       iter->snapshot = snapshot;
+       pos.snapshot = snapshot;
+       bch2_btree_iter_set_pos(iter, pos);
+}
+
 /*
  * Unlocks before scheduling
  * Note: does not revalidate iterator
index 7be5805553744ddea0a20b809401ed959da1f74e..50b44e55dfe77e34a1aa157299615e5e163bc935 100644 (file)
@@ -372,7 +372,8 @@ static int btree_key_cache_flush_pos(struct btree_trans *trans,
 
        bch2_trans_iter_init(trans, &b_iter, key.btree_id, key.pos,
                             BTREE_ITER_SLOTS|
-                            BTREE_ITER_INTENT);
+                            BTREE_ITER_INTENT|
+                            BTREE_ITER_ALL_SNAPSHOTS);
        bch2_trans_iter_init(trans, &c_iter, key.btree_id, key.pos,
                             BTREE_ITER_CACHED|
                             BTREE_ITER_CACHED_NOFILL|
index 9250ac69e8b14d6c8d75398a47a3aacbcb2913f6..081b82d3848e840d17205673dee627561e0d6e71 100644 (file)
@@ -209,6 +209,7 @@ struct btree_node_iter {
 #define BTREE_ITER_WITH_UPDATES                (1 << 10)
 #define __BTREE_ITER_ALL_SNAPSHOTS     (1 << 11)
 #define BTREE_ITER_ALL_SNAPSHOTS       (1 << 12)
+#define BTREE_ITER_FILTER_SNAPSHOTS    (1 << 13)
 
 enum btree_path_uptodate {
        BTREE_ITER_UPTODATE             = 0,