bcachefs: lift ordering restriction on 0 size extents
author     Kent Overstreet <kent.overstreet@gmail.com>
           Fri, 22 Jul 2016 03:05:06 +0000 (19:05 -0800)
committer  Kent Overstreet <kent.overstreet@linux.dev>
           Sun, 22 Oct 2023 21:08:09 +0000 (17:08 -0400)

This lifts the restriction that 0-size extents must not overlap with
other extents, which means we can now sort extents and non-extents the
same way; that will also let us simplify a bunch of other code.

Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
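
(Editorial illustration; not part of the patch.) A minimal standalone
sketch of the ordering rule this change converges on: when two keys
compare equal by position, deleted keys now sort first for extents and
non-extents alike. struct key and iter_cmp() below are simplified
stand-ins for the bcachefs helpers.

    #include <stdbool.h>

    struct key {
            unsigned long long pos; /* stands in for bkey_cmp_packed() order */
            bool deleted;           /* stands in for bkey_deleted() */
    };

    /*
     * Mirrors the new __btree_node_iter_cmp(): position first, then deleted
     * keys before live keys, then address as the final tiebreak. Before this
     * patch, extents used the opposite deleted-key rule, so extents and
     * non-extents could not share one sort path.
     */
    static int iter_cmp(const struct key *l, const struct key *r)
    {
            int cmp = (l->pos > r->pos) - (l->pos < r->pos);

            if (cmp)
                    return cmp;
            cmp = (int) r->deleted - (int) l->deleted;
            if (cmp)
                    return cmp;
            return (l > r) - (l < r);
    }
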
fs/bcachefs/bset.c
fs/bcachefs/bset.h
fs/bcachefs/btree_gc.c
fs/bcachefs/btree_io.c
fs/bcachefs/btree_io.h
fs/bcachefs/btree_iter.c
fs/bcachefs/btree_types.h
fs/bcachefs/btree_update_interior.c
fs/bcachefs/btree_update_leaf.c
fs/bcachefs/extents.c

fs/bcachefs/bset.c
index cf83911b3f5d9d8f3b321a3f650f7e071b3feef4..27fa3e230e6eaaf086e0ec180350fa307f1567cf 100644
@@ -18,6 +18,9 @@
 #include <linux/random.h>
 #include <linux/prefetch.h>
 
+static inline void __bch2_btree_node_iter_advance(struct btree_node_iter *,
+                                                 struct btree *);
+
 struct bset_tree *bch2_bkey_to_bset(struct btree *b, struct bkey_packed *k)
 {
        unsigned offset = __btree_node_key_to_offset(b, k);
@@ -63,8 +66,8 @@ void bch2_dump_bset(struct btree *b, struct bset *i, unsigned set)
                _n = bkey_next(_k);
 
                bch2_bkey_to_text(buf, sizeof(buf), &k);
-               printk(KERN_ERR "block %u key %zi/%u: %s\n", set,
-                      _k->_data - i->_data, i->u64s, buf);
+               printk(KERN_ERR "block %u key %5u: %s\n", set,
+                      __btree_node_key_to_offset(b, _k), buf);
 
                if (_n == vstruct_last(i))
                        continue;
@@ -120,20 +123,6 @@ void bch2_dump_btree_node_iter(struct btree *b,
 
 #ifdef CONFIG_BCACHEFS_DEBUG
 
-static bool keys_out_of_order(struct btree *b,
-                             const struct bkey_packed *prev,
-                             const struct bkey_packed *next,
-                             bool is_extents)
-{
-       struct bkey nextu = bkey_unpack_key(b, next);
-
-       return bkey_cmp_left_packed_byval(b, prev, bkey_start_pos(&nextu)) > 0 ||
-               ((is_extents
-                 ? !bkey_deleted(next)
-                 : !bkey_deleted(prev)) &&
-                !bkey_cmp_packed(b, prev, next));
-}
-
 void __bch2_verify_btree_nr_keys(struct btree *b)
 {
        struct bset_tree *t;
@@ -150,16 +139,21 @@ void __bch2_verify_btree_nr_keys(struct btree *b)
        BUG_ON(memcmp(&nr, &b->nr, sizeof(nr)));
 }
 
-static void bch2_btree_node_iter_next_check(struct btree_node_iter *iter,
-                                          struct btree *b,
-                                          struct bkey_packed *k)
+static void bch2_btree_node_iter_next_check(struct btree_node_iter *_iter,
+                                           struct btree *b)
 {
-       const struct bkey_packed *n = bch2_btree_node_iter_peek_all(iter, b);
+       struct btree_node_iter iter = *_iter;
+       const struct bkey_packed *k, *n;
+
+       k = bch2_btree_node_iter_peek_all(&iter, b);
+       __bch2_btree_node_iter_advance(&iter, b);
+       n = bch2_btree_node_iter_peek_all(&iter, b);
 
        bkey_unpack_key(b, k);
 
        if (n &&
-           keys_out_of_order(b, k, n, iter->is_extents)) {
+           __btree_node_iter_cmp(b, k, n) > 0) {
+               struct btree_node_iter_set *set;
                struct bkey ku = bkey_unpack_key(b, k);
                struct bkey nu = bkey_unpack_key(b, n);
                char buf1[80], buf2[80];
@@ -167,12 +161,22 @@ static void bch2_btree_node_iter_next_check(struct btree_node_iter *iter,
                bch2_dump_btree_node(b);
                bch2_bkey_to_text(buf1, sizeof(buf1), &ku);
                bch2_bkey_to_text(buf2, sizeof(buf2), &nu);
-               panic("out of order/overlapping:\n%s\n%s\n", buf1, buf2);
+               printk(KERN_ERR "out of order/overlapping:\n%s\n%s\n",
+                      buf1, buf2);
+               printk(KERN_ERR "iter was:");
+
+               btree_node_iter_for_each(_iter, set) {
+                       struct bkey_packed *k = __btree_node_offset_to_key(b, set->k);
+                       struct bset_tree *t = bch2_bkey_to_bset(b, k);
+                       printk(" [%zi %zi]", t - b->set,
+                              k->_data - bset(b, t)->_data);
+               }
+               panic("\n");
        }
 }
 
 void bch2_btree_node_iter_verify(struct btree_node_iter *iter,
-                               struct btree *b)
+                                struct btree *b)
 {
        struct btree_node_iter_set *set, *s2;
        struct bset_tree *t;
@@ -196,72 +200,72 @@ found:
        /* Verify iterator is sorted: */
        btree_node_iter_for_each(iter, set)
                BUG_ON(set != iter->data &&
-                      btree_node_iter_cmp(iter, b, set[-1], set[0]) > 0);
+                      btree_node_iter_cmp(b, set[-1], set[0]) > 0);
 }
 
-void bch2_verify_key_order(struct btree *b,
-                         struct btree_node_iter *iter,
-                         struct bkey_packed *where)
+void bch2_verify_insert_pos(struct btree *b, struct bkey_packed *where,
+                           struct bkey_packed *insert, unsigned clobber_u64s)
 {
        struct bset_tree *t = bch2_bkey_to_bset(b, where);
-       struct bkey_packed *k, *prev;
-       struct bkey uk, uw = bkey_unpack_key(b, where);
-
-       k = bch2_bkey_prev_all(b, t, where);
-       if (k &&
-           keys_out_of_order(b, k, where, iter->is_extents)) {
-               char buf1[100], buf2[100];
+       struct bkey_packed *prev = bch2_bkey_prev_all(b, t, where);
+       struct bkey_packed *next = (void *) (where->_data + clobber_u64s);
+#if 0
+       BUG_ON(prev &&
+              __btree_node_iter_cmp(b, prev, insert) > 0);
+#else
+       if (prev &&
+           __btree_node_iter_cmp(b, prev, insert) > 0) {
+               struct bkey k1 = bkey_unpack_key(b, prev);
+               struct bkey k2 = bkey_unpack_key(b, insert);
+               char buf1[100];
+               char buf2[100];
 
                bch2_dump_btree_node(b);
-               uk = bkey_unpack_key(b, k);
-               bch2_bkey_to_text(buf1, sizeof(buf1), &uk);
-               bch2_bkey_to_text(buf2, sizeof(buf2), &uw);
-               panic("out of order with prev:\n%s\n%s\n",
-                     buf1, buf2);
+               bch2_bkey_to_text(buf1, sizeof(buf1), &k1);
+               bch2_bkey_to_text(buf2, sizeof(buf2), &k2);
+
+               panic("prev > insert:\n"
+                     "prev    key %5u %s\n"
+                     "insert  key %5u %s\n",
+                      __btree_node_key_to_offset(b, prev), buf1,
+                      __btree_node_key_to_offset(b, insert), buf2);
        }
+#endif
+#if 0
+       BUG_ON(next != btree_bkey_last(b, t) &&
+              __btree_node_iter_cmp(b, insert, next) > 0);
+#else
+       if (next != btree_bkey_last(b, t) &&
+           __btree_node_iter_cmp(b, insert, next) > 0) {
+               struct bkey k1 = bkey_unpack_key(b, insert);
+               struct bkey k2 = bkey_unpack_key(b, next);
+               char buf1[100];
+               char buf2[100];
 
-       k = bkey_next(where);
-       BUG_ON(k != btree_bkey_last(b, t) &&
-              keys_out_of_order(b, where, k, iter->is_extents));
-
-       for_each_bset(b, t) {
-               if (where >= btree_bkey_first(b, t) ||
-                   where < btree_bkey_last(b, t))
-                       continue;
-
-               k = bch2_btree_node_iter_bset_pos(iter, b, t);
-
-               if (k == btree_bkey_last(b, t))
-                       k = bch2_bkey_prev_all(b, t, k);
-
-               while (bkey_cmp_left_packed_byval(b, k, bkey_start_pos(&uw)) > 0 &&
-                      (prev = bch2_bkey_prev_all(b, t, k)))
-                       k = prev;
-
-               for (;
-                    k != btree_bkey_last(b, t);
-                    k = bkey_next(k)) {
-                       uk = bkey_unpack_key(b, k);
-
-                       if (iter->is_extents) {
-                               BUG_ON(!(bkey_cmp(uw.p, bkey_start_pos(&uk)) <= 0 ||
-                                        bkey_cmp(uk.p, bkey_start_pos(&uw)) <= 0));
-                       } else {
-                               BUG_ON(!bkey_cmp(uw.p, uk.p) &&
-                                      !bkey_deleted(&uk));
-                       }
-
-                       if (bkey_cmp(uw.p, bkey_start_pos(&uk)) <= 0)
-                               break;
-               }
+               bch2_dump_btree_node(b);
+               bch2_bkey_to_text(buf1, sizeof(buf1), &k1);
+               bch2_bkey_to_text(buf2, sizeof(buf2), &k2);
+
+               panic("insert > next:\n"
+                     "insert  key %5u %s\n"
+                     "next    key %5u %s\n",
+                      __btree_node_key_to_offset(b, insert), buf1,
+                      __btree_node_key_to_offset(b, next), buf2);
        }
+#endif
+}
+
+void bch2_verify_key_order(struct btree *b,
+                          struct btree_node_iter *_iter,
+                          struct bkey_packed *where)
+{
+       bch2_verify_insert_pos(b, where, where, where->u64s);
 }
 
 #else
 
 static inline void bch2_btree_node_iter_next_check(struct btree_node_iter *iter,
-                                                  struct btree *b,
-                                                  struct bkey_packed *k) {}
+                                                  struct btree *b) {}
 
 #endif
 
@@ -1229,6 +1233,7 @@ void bch2_bset_insert(struct btree *b,
        struct bkey_packed packed, *src = bkey_to_packed(insert);
 
        bch2_bset_verify_rw_aux_tree(b, t);
+       bch2_verify_insert_pos(b, where, bkey_to_packed(insert), clobber_u64s);
 
        if (bch2_bkey_pack_key(&packed, &insert->k, f))
                src = &packed;
@@ -1255,7 +1260,6 @@ void bch2_bset_insert(struct btree *b,
 
        bch2_bset_fix_lookup_table(b, t, where, clobber_u64s, src->u64s);
 
-       bch2_verify_key_order(b, iter, where);
        bch2_verify_btree_nr_keys(b);
 }
 
@@ -1461,7 +1465,7 @@ void bch2_btree_node_iter_push(struct btree_node_iter *iter,
 noinline __flatten __attribute__((cold))
 static void btree_node_iter_init_pack_failed(struct btree_node_iter *iter,
                              struct btree *b, struct bpos search,
-                             bool strictly_greater, bool is_extents)
+                             bool strictly_greater)
 {
        struct bset_tree *t;
 
@@ -1518,7 +1522,7 @@ static void btree_node_iter_init_pack_failed(struct btree_node_iter *iter,
  */
 void bch2_btree_node_iter_init(struct btree_node_iter *iter,
                               struct btree *b, struct bpos search,
-                              bool strictly_greater, bool is_extents)
+                              bool strictly_greater)
 {
        struct bset_tree *t;
        struct bkey_packed p, *packed_search = NULL;
@@ -1526,7 +1530,7 @@ void bch2_btree_node_iter_init(struct btree_node_iter *iter,
        EBUG_ON(bkey_cmp(search, b->data->min_key) < 0);
        bset_aux_tree_verify(b);
 
-       __bch2_btree_node_iter_init(iter, is_extents);
+       memset(iter, 0, sizeof(*iter));
 
        switch (bch2_bkey_pack_pos_lossy(&p, search, b)) {
        case BKEY_PACK_POS_EXACT:
@@ -1537,7 +1541,7 @@ void bch2_btree_node_iter_init(struct btree_node_iter *iter,
                break;
        case BKEY_PACK_POS_FAIL:
                btree_node_iter_init_pack_failed(iter, b, search,
-                                       strictly_greater, is_extents);
+                                                strictly_greater);
                return;
        }
 
@@ -1552,12 +1556,11 @@ void bch2_btree_node_iter_init(struct btree_node_iter *iter,
 }
 
 void bch2_btree_node_iter_init_from_start(struct btree_node_iter *iter,
-                                         struct btree *b,
-                                         bool is_extents)
+                                         struct btree *b)
 {
        struct bset_tree *t;
 
-       __bch2_btree_node_iter_init(iter, is_extents);
+       memset(iter, 0, sizeof(*iter));
 
        for_each_bset(b, t)
                __bch2_btree_node_iter_push(iter, b,
@@ -1585,7 +1588,7 @@ static inline bool btree_node_iter_sort_two(struct btree_node_iter *iter,
 {
        bool ret;
 
-       if ((ret = (btree_node_iter_cmp(iter, b,
+       if ((ret = (btree_node_iter_cmp(b,
                                        iter->data[first],
                                        iter->data[first + 1]) > 0)))
                swap(iter->data[first], iter->data[first + 1]);
@@ -1640,23 +1643,14 @@ static inline void __bch2_btree_node_iter_advance(struct btree_node_iter *iter,
        btree_node_iter_sort_two(iter, b, 1);
 }
 
-/**
- * bch_btree_node_iter_advance - advance @iter by one key
- *
- * Doesn't do debugchecks - for cases where (insert_fixup_extent()) a bset might
- * momentarily have out of order extents.
- */
 void bch2_btree_node_iter_advance(struct btree_node_iter *iter,
                                  struct btree *b)
 {
 #ifdef CONFIG_BCACHEFS_DEBUG
-       struct bkey_packed *k = bch2_btree_node_iter_peek_all(iter, b);
-
-       __bch2_btree_node_iter_advance(iter, b);
-       bch2_btree_node_iter_next_check(iter, b, k);
-#else
-       __bch2_btree_node_iter_advance(iter, b);
+       bch2_btree_node_iter_verify(iter, b);
+       bch2_btree_node_iter_next_check(iter, b);
 #endif
+       __bch2_btree_node_iter_advance(iter, b);
 }
 
 static inline unsigned __btree_node_iter_used(struct btree_node_iter *iter)
@@ -1689,8 +1683,7 @@ struct bkey_packed *bch2_btree_node_iter_prev_filter(struct btree_node_iter *ite
                        bch2_btree_node_iter_bset_pos(iter, b, t),
                        min_key_type);
                if (k &&
-                   (!prev || __btree_node_iter_cmp(iter->is_extents, b,
-                                                   k, prev) > 0)) {
+                   (!prev || __btree_node_iter_cmp(b, k, prev) > 0)) {
                        prev = k;
                        end = t->end_offset;
                }
@@ -1723,11 +1716,11 @@ out:
                struct btree_node_iter iter2 = *iter;
 
                if (prev)
-                       bch2_btree_node_iter_advance(&iter2, b);
+                       __bch2_btree_node_iter_advance(&iter2, b);
 
                while ((k = bch2_btree_node_iter_peek_all(&iter2, b)) != orig_pos) {
                        BUG_ON(k->type >= min_key_type);
-                       bch2_btree_node_iter_advance(&iter2, b);
+                       __bch2_btree_node_iter_advance(&iter2, b);
                }
        }
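
(Editorial sketch, simplified types.) The new bch2_verify_insert_pos()
above asserts prev <= insert <= next under the full
__btree_node_iter_cmp() order, where "next" is whatever key will follow
once clobber_u64s worth of key space at the insert position has been
overwritten; plain integers stand in for packed keys here:

    #include <assert.h>
    #include <stddef.h>

    /* NULL models "no previous key" / "insert at the end of the bset". */
    static void verify_insert_pos(const int *prev, int insert, const int *next)
    {
            assert(prev == NULL || *prev <= insert);        /* prev <= insert */
            assert(next == NULL || insert <= *next);        /* insert <= next */
    }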
 
fs/bcachefs/bset.h
index 2fa71d7c0e8a5c5871722d0dc0b1987d9c1daab6..0787030ccc7e7f9ae870e38d75eeb09d6d157e17 100644
@@ -369,6 +369,17 @@ static inline int bkey_cmp_p_or_unp(const struct btree *b,
        return __bch2_bkey_cmp_left_packed_format_checked(b, l, r);
 }
 
+/* Returns true if @k is after iterator position @pos */
+static inline bool btree_iter_pos_cmp(struct btree_iter *iter,
+                                     const struct bkey *k)
+{
+       int cmp = bkey_cmp(k->p, iter->pos);
+
+       return cmp > 0 ||
+               (cmp == 0 &&
+                !(iter->flags & BTREE_ITER_IS_EXTENTS) && !bkey_deleted(k));
+}
+
 /* Returns true if @k is after iterator position @pos */
 static inline bool btree_iter_pos_cmp_packed(const struct btree *b,
                                             struct bpos *pos,
@@ -419,7 +430,7 @@ enum bch_extent_overlap {
 
 /* Returns how k overlaps with m */
 static inline enum bch_extent_overlap bch2_extent_overlap(const struct bkey *k,
-                                                        const struct bkey *m)
+                                                         const struct bkey *m)
 {
        int cmp1 = bkey_cmp(k->p, m->p) < 0;
        int cmp2 = bkey_cmp(bkey_start_pos(k),
@@ -430,20 +441,13 @@ static inline enum bch_extent_overlap bch2_extent_overlap(const struct bkey *k,
 
 /* Btree key iteration */
 
-static inline void __bch2_btree_node_iter_init(struct btree_node_iter *iter,
-                                             bool is_extents)
-{
-       iter->is_extents = is_extents;
-       memset(iter->data, 0, sizeof(iter->data));
-}
-
 void bch2_btree_node_iter_push(struct btree_node_iter *, struct btree *,
                              const struct bkey_packed *,
                              const struct bkey_packed *);
 void bch2_btree_node_iter_init(struct btree_node_iter *, struct btree *,
-                             struct bpos, bool, bool);
+                              struct bpos, bool);
 void bch2_btree_node_iter_init_from_start(struct btree_node_iter *,
-                                        struct btree *, bool);
+                                         struct btree *);
 struct bkey_packed *bch2_btree_node_iter_bset_pos(struct btree_node_iter *,
                                                 struct btree *,
                                                 struct bset_tree *);
@@ -470,32 +474,21 @@ static inline bool bch2_btree_node_iter_end(struct btree_node_iter *iter)
        return __btree_node_iter_set_end(iter, 0);
 }
 
-static inline int __btree_node_iter_cmp(bool is_extents,
-                                       struct btree *b,
-                                       struct bkey_packed *l,
-                                       struct bkey_packed *r)
+static inline int __btree_node_iter_cmp(struct btree *b,
+                                       const struct bkey_packed *l,
+                                       const struct bkey_packed *r)
 {
-       /*
-        * For non extents, when keys compare equal the deleted keys have to
-        * come first - so that bch2_btree_node_iter_next_check() can detect
-        * duplicate nondeleted keys (and possibly other reasons?)
-        *
-        * For extents, bkey_deleted() is used as a proxy for k->size == 0, so
-        * deleted keys have to sort last.
-        */
+       /* When keys compare equal deleted keys come first */
        return bkey_cmp_packed(b, l, r)
-               ?: (is_extents
-                   ? (int) bkey_deleted(l) - (int) bkey_deleted(r)
-                   : (int) bkey_deleted(r) - (int) bkey_deleted(l))
+               ?: (int) bkey_deleted(r) - (int) bkey_deleted(l)
                ?: (l > r) - (l < r);
 }
 
-static inline int btree_node_iter_cmp(struct btree_node_iter *iter,
-                                     struct btree *b,
+static inline int btree_node_iter_cmp(struct btree *b,
                                      struct btree_node_iter_set l,
                                      struct btree_node_iter_set r)
 {
-       return __btree_node_iter_cmp(iter->is_extents, b,
+       return __btree_node_iter_cmp(b,
                        __btree_node_offset_to_key(b, l.k),
                        __btree_node_offset_to_key(b, r.k));
 }
@@ -582,21 +575,12 @@ bch2_btree_node_iter_prev(struct btree_node_iter *iter, struct btree *b)
        return bch2_btree_node_iter_prev_filter(iter, b, KEY_TYPE_DISCARD + 1);
 }
 
-/*
- * Iterates over all _live_ keys - skipping deleted (and potentially
- * overlapping) keys
- */
-#define for_each_btree_node_key(b, k, iter, _is_extents)               \
-       for (bch2_btree_node_iter_init_from_start((iter), (b), (_is_extents));\
-            ((k) = bch2_btree_node_iter_peek(iter, b));                        \
-            bch2_btree_node_iter_advance(iter, b))
-
 struct bkey_s_c bch2_btree_node_iter_peek_unpack(struct btree_node_iter *,
                                                struct btree *,
                                                struct bkey *);
 
-#define for_each_btree_node_key_unpack(b, k, iter, _is_extents, unpacked)\
-       for (bch2_btree_node_iter_init_from_start((iter), (b), (_is_extents));\
+#define for_each_btree_node_key_unpack(b, k, iter, unpacked)           \
+       for (bch2_btree_node_iter_init_from_start((iter), (b));         \
             (k = bch2_btree_node_iter_peek_unpack((iter), (b), (unpacked))).k;\
             bch2_btree_node_iter_advance(iter, b))
 
@@ -646,6 +630,8 @@ void bch2_dump_btree_node_iter(struct btree *, struct btree_node_iter *);
 
 void __bch2_verify_btree_nr_keys(struct btree *);
 void bch2_btree_node_iter_verify(struct btree_node_iter *, struct btree *);
+void bch2_verify_insert_pos(struct btree *, struct bkey_packed *,
+                           struct bkey_packed *, unsigned);
 void bch2_verify_key_order(struct btree *, struct btree_node_iter *,
                          struct bkey_packed *);
 
@@ -654,6 +640,10 @@ void bch2_verify_key_order(struct btree *, struct btree_node_iter *,
 static inline void __bch2_verify_btree_nr_keys(struct btree *b) {}
 static inline void bch2_btree_node_iter_verify(struct btree_node_iter *iter,
                                              struct btree *b) {}
+static inline void bch2_verify_insert_pos(struct btree *b,
+                                         struct bkey_packed *where,
+                                         struct bkey_packed *insert,
+                                         unsigned clobber_u64s) {}
 static inline void bch2_verify_key_order(struct btree *b,
                                        struct btree_node_iter *iter,
                                        struct bkey_packed *where) {}
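
(Editorial sketch.) btree_iter_pos_cmp(), moved into this header,
decides whether a key counts as at-or-after the iterator position.
Extent keys end at k->p, so for extents equality does not count;
simplified stand-in below:

    #include <stdbool.h>

    static bool pos_cmp(unsigned long long key_pos, unsigned long long iter_pos,
                        bool is_extents, bool deleted)
    {
            if (key_pos > iter_pos)         /* strictly after: always counts */
                    return true;
            /* exactly at iter_pos: only live, non-extent keys count */
            return key_pos == iter_pos && !is_extents && !deleted;
    }
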
fs/bcachefs/btree_gc.c
index 1fbb9c657fc6cccc49101476933ed2863a7f0798..2526118fe9ce51c754106432e8fab67686df9e34 100644
@@ -217,7 +217,6 @@ static unsigned btree_gc_mark_node(struct bch_fs *c, struct btree *b)
 
        if (btree_node_has_ptrs(b))
                for_each_btree_node_key_unpack(b, k, &iter,
-                                              btree_node_is_extents(b),
                                               &unpacked) {
                        bch2_bkey_debugcheck(c, b, k);
                        stale = max(stale, bch2_gc_mark_key(c, type, k, 0));
@@ -1044,7 +1043,6 @@ static int bch2_initial_gc_btree(struct bch_fs *c, enum btree_id id)
                        struct bkey_s_c k;
 
                        for_each_btree_node_key_unpack(b, k, &node_iter,
-                                                      btree_node_is_extents(b),
                                                       &unpacked) {
                                ret = bch2_btree_mark_key_initial(c,
                                                        btree_node_type(b), k);
fs/bcachefs/btree_io.c
index 5c36acef2b138cb09d4f1f65f2007c9820943250..889870582566da109324d3c0faee0863c80083de 100644
@@ -22,7 +22,7 @@
 /* btree_node_iter_large: */
 
 #define btree_node_iter_cmp_heap(h, _l, _r)                            \
-       __btree_node_iter_cmp((iter)->is_extents, b,                    \
+       __btree_node_iter_cmp(b,                                        \
                               __btree_node_offset_to_key(b, (_l).k),   \
                               __btree_node_offset_to_key(b, (_r).k))
 
@@ -248,6 +248,9 @@ static unsigned sort_extent_whiteouts(struct bkey_packed *dst,
        sort_iter_sort(iter, sort_extent_whiteouts_cmp);
 
        while ((in = sort_iter_next(iter, sort_extent_whiteouts_cmp))) {
+               if (bkey_deleted(in))
+                       continue;
+
                EBUG_ON(bkeyp_val_u64s(f, in));
                EBUG_ON(in->type != KEY_TYPE_DISCARD);
 
@@ -785,8 +788,7 @@ void bch2_btree_sort_into(struct bch_fs *c,
 
        bch2_bset_set_no_aux_tree(dst, dst->set);
 
-       bch2_btree_node_iter_init_from_start(&src_iter, src,
-                                           btree_node_is_extents(src));
+       bch2_btree_node_iter_init_from_start(&src_iter, src);
 
        if (btree_node_ops(src)->key_normalize ||
            btree_node_ops(src)->key_merge)
@@ -1171,7 +1173,7 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct btree *b, bool have_retry
        int ret, retry_read = 0, write = READ;
 
        iter = mempool_alloc(&c->fill_iter, GFP_NOIO);
-       __bch2_btree_node_iter_large_init(iter, btree_node_is_extents(b));
+       iter->used = 0;
 
        if (bch2_meta_read_fault("btree"))
                btree_err(BTREE_ERR_MUST_RETRY, c, b, NULL,
fs/bcachefs/btree_io.h
index 0688ce420610cce1142b97d4106344e79847e1ca..7835f8a9e3a032bb8bc7a00e41c60a93a2d6c897 100644
@@ -146,20 +146,11 @@ ssize_t bch2_dirty_btree_nodes_print(struct bch_fs *, char *);
 /* Sorting */
 
 struct btree_node_iter_large {
-       u8              is_extents;
        u16             used;
 
        struct btree_node_iter_set data[MAX_BSETS];
 };
 
-static inline void
-__bch2_btree_node_iter_large_init(struct btree_node_iter_large *iter,
-                                 bool is_extents)
-{
-       iter->used = 0;
-       iter->is_extents = is_extents;
-}
-
 void bch2_btree_node_iter_large_advance(struct btree_node_iter_large *,
                                        struct btree *);
 
fs/bcachefs/btree_iter.c
index 8918268f99f4dd541eb3ff70771c4a9659a20f2f..9d92826181dca995c503db01f538f5d7ebef9f11 100644
@@ -375,14 +375,20 @@ static void __bch2_btree_iter_verify(struct btree_iter *iter,
        struct btree_node_iter tmp = l->iter;
        struct bkey_packed *k;
 
+       if (iter->uptodate > BTREE_ITER_NEED_PEEK)
+               return;
+
        bch2_btree_node_iter_verify(&l->iter, b);
 
        /*
         * For interior nodes, the iterator will have skipped past
         * deleted keys:
+        *
+        * For extents, the iterator may have skipped past deleted keys (but not
+        * whiteouts)
         */
-       k = b->level
-               ? bch2_btree_node_iter_prev(&tmp, b)
+       k = b->level || iter->flags & BTREE_ITER_IS_EXTENTS
+               ? bch2_btree_node_iter_prev_filter(&tmp, b, KEY_TYPE_DISCARD)
                : bch2_btree_node_iter_prev_all(&tmp, b);
        if (k && btree_iter_pos_cmp_packed(b, &iter->pos, k,
                                iter->flags & BTREE_ITER_IS_EXTENTS)) {
@@ -390,7 +396,7 @@ static void __bch2_btree_iter_verify(struct btree_iter *iter,
                struct bkey uk = bkey_unpack_key(b, k);
 
                bch2_bkey_to_text(buf, sizeof(buf), &uk);
-               panic("prev key should be before after pos:\n%s\n%llu:%llu\n",
+               panic("prev key should be before iter pos:\n%s\n%llu:%llu\n",
                      buf, iter->pos.inode, iter->pos.offset);
        }
 
@@ -401,15 +407,16 @@ static void __bch2_btree_iter_verify(struct btree_iter *iter,
                struct bkey uk = bkey_unpack_key(b, k);
 
                bch2_bkey_to_text(buf, sizeof(buf), &uk);
-               panic("next key should be before iter pos:\n%llu:%llu\n%s\n",
+               panic("iter should be after current key:\n"
+                     "iter pos %llu:%llu\n"
+                     "cur key  %s\n",
                      iter->pos.inode, iter->pos.offset, buf);
        }
 
-       if (iter->uptodate == BTREE_ITER_UPTODATE &&
-           (iter->flags & BTREE_ITER_TYPE) != BTREE_ITER_NODES) {
-               BUG_ON(!bkey_whiteout(&iter->k) &&
-                      bch2_btree_node_iter_end(&l->iter));
-       }
+       BUG_ON(iter->uptodate == BTREE_ITER_UPTODATE &&
+              (iter->flags & BTREE_ITER_TYPE) == BTREE_ITER_KEYS &&
+              !bkey_whiteout(&iter->k) &&
+              bch2_btree_node_iter_end(&l->iter));
 }
 
 void bch2_btree_iter_verify(struct btree_iter *iter, struct btree *b)
@@ -420,6 +427,11 @@ void bch2_btree_iter_verify(struct btree_iter *iter, struct btree *b)
                __bch2_btree_iter_verify(linked, b);
 }
 
+#else
+
+static inline void __bch2_btree_iter_verify(struct btree_iter *iter,
+                                           struct btree *b) {}
+
 #endif
 
 static void __bch2_btree_node_iter_fix(struct btree_iter *iter,
@@ -434,7 +446,7 @@ static void __bch2_btree_node_iter_fix(struct btree_iter *iter,
        struct btree_node_iter_set *set;
        unsigned offset = __btree_node_key_to_offset(b, where);
        int shift = new_u64s - clobber_u64s;
-       unsigned old_end = (int) __btree_node_key_to_offset(b, end) - shift;
+       unsigned old_end = t->end_offset - shift;
 
        btree_node_iter_for_each(node_iter, set)
                if (set->end == old_end)
@@ -456,7 +468,7 @@ static void __bch2_btree_node_iter_fix(struct btree_iter *iter,
        }
        return;
 found:
-       set->end = (int) set->end + shift;
+       set->end = t->end_offset;
 
        /* Iterator hasn't gotten to the key that changed yet: */
        if (set->k < offset)
@@ -517,8 +529,7 @@ iter_current_key_not_modified:
                        k = bch2_bkey_prev_all(b, t,
                                bch2_btree_node_iter_bset_pos(node_iter, b, t));
                        if (k &&
-                           __btree_node_iter_cmp(node_iter, b,
-                                                 k, where) > 0) {
+                           __btree_node_iter_cmp(b, k, where) > 0) {
                                struct btree_node_iter_set *set;
                                unsigned offset =
                                        __btree_node_key_to_offset(b, bkey_next(k));
@@ -557,10 +568,6 @@ void bch2_btree_node_iter_fix(struct btree_iter *iter,
                __bch2_btree_node_iter_fix(linked, b,
                                          &linked->l[b->level].iter, t,
                                          where, clobber_u64s, new_u64s);
-
-       /* interior node iterators are... special... */
-       if (!b->level)
-               bch2_btree_iter_verify(iter, b);
 }
 
 static inline struct bkey_s_c __btree_iter_unpack(struct btree_iter *iter,
@@ -647,17 +654,6 @@ static void btree_iter_verify_new_node(struct btree_iter *iter, struct btree *b)
                btree_node_unlock(iter, b->level + 1);
 }
 
-/* Returns true if @k is after iterator position @pos */
-static inline bool btree_iter_pos_cmp(struct btree_iter *iter,
-                                     const struct bkey *k)
-{
-       int cmp = bkey_cmp(k->p, iter->pos);
-
-       return cmp > 0 ||
-               (cmp == 0 &&
-                !(iter->flags & BTREE_ITER_IS_EXTENTS) && !bkey_deleted(k));
-}
-
 static inline bool btree_iter_pos_after_node(struct btree_iter *iter,
                                             struct btree *b)
 {
@@ -679,8 +675,7 @@ static inline void __btree_iter_init(struct btree_iter *iter,
        struct btree_iter_level *l = &iter->l[b->level];
 
        bch2_btree_node_iter_init(&l->iter, b, iter->pos,
-                                 iter->flags & BTREE_ITER_IS_EXTENTS,
-                                 btree_node_is_extents(b));
+                                 iter->flags & BTREE_ITER_IS_EXTENTS);
 
        /* Skip to first non whiteout: */
        if (b->level)
@@ -1022,7 +1017,9 @@ int __must_check __bch2_btree_iter_traverse(struct btree_iter *iter)
        }
 
        iter->uptodate = BTREE_ITER_NEED_PEEK;
+
        bch2_btree_iter_verify_locks(iter);
+       __bch2_btree_iter_verify(iter, iter->l[iter->level].b);
        return 0;
 }
 
@@ -1363,9 +1360,10 @@ struct bkey_s_c bch2_btree_iter_prev(struct btree_iter *iter)
 }
 
 static inline struct bkey_s_c
-__bch2_btree_iter_peek_slot(struct btree_iter *iter)
+__bch2_btree_iter_peek_slot_extents(struct btree_iter *iter)
 {
        struct btree_iter_level *l = &iter->l[0];
+       struct btree_node_iter node_iter;
        struct bkey_s_c k;
        struct bkey n;
        int ret;
@@ -1376,6 +1374,17 @@ recheck:
               bkey_cmp(bkey_start_pos(k.k), iter->pos) == 0)
                __btree_iter_advance(l);
 
+       /*
+        * iterator is now at the correct position for inserting at iter->pos,
+        * but we need to keep iterating until we find the first non whiteout so
+        * we know how big a hole we have, if any:
+        */
+
+       node_iter = l->iter;
+       if (k.k && bkey_whiteout(k.k))
+               k = __btree_iter_unpack(iter, l, &iter->k,
+                       bch2_btree_node_iter_peek(&node_iter, l->b));
+
        /*
         * If we got to the end of the node, check if we need to traverse to the
         * next node:
@@ -1392,6 +1401,13 @@ recheck:
        if (k.k &&
            !bkey_whiteout(k.k) &&
            bkey_cmp(bkey_start_pos(k.k), iter->pos) <= 0) {
+               /*
+                * if we skipped forward to find the first non whiteout and
+                * there _wasn't_ actually a hole, we want the iterator to be
+                * pointed at the key we found:
+                */
+               l->iter = node_iter;
+
                EBUG_ON(bkey_cmp(k.k->p, iter->pos) < 0);
                EBUG_ON(bkey_deleted(k.k));
                iter->uptodate = BTREE_ITER_UPTODATE;
@@ -1399,41 +1415,88 @@ recheck:
        }
 
        /* hole */
+
+       /* holes can't span inode numbers: */
+       if (iter->pos.offset == KEY_OFFSET_MAX) {
+               if (iter->pos.inode == KEY_INODE_MAX)
+                       return bkey_s_c_null;
+
+               iter->pos = bkey_successor(iter->pos);
+               goto recheck;
+       }
+
+       if (!k.k)
+               k.k = &l->b->key.k;
+
        bkey_init(&n);
        n.p = iter->pos;
+       bch2_key_resize(&n,
+                       min_t(u64, KEY_SIZE_MAX,
+                             (k.k->p.inode == n.p.inode
+                              ? bkey_start_offset(k.k)
+                              : KEY_OFFSET_MAX) -
+                             n.p.offset));
+
+       //EBUG_ON(!n.size);
+       if (!n.size) {
+               char buf[100];
+               bch2_dump_btree_node(iter->l[0].b);
+
+               bch2_bkey_to_text(buf, sizeof(buf), k.k);
+               panic("iter at %llu:%llu\n"
+                     "next key %s\n",
+                     iter->pos.inode,
+                     iter->pos.offset,
+                     buf);
+       }
 
-       if (iter->flags & BTREE_ITER_IS_EXTENTS) {
-               if (n.p.offset == KEY_OFFSET_MAX) {
-                       if (n.p.inode == KEY_INODE_MAX)
-                               return bkey_s_c_null;
-
-                       iter->pos = bkey_successor(iter->pos);
-                       goto recheck;
-               }
+       iter->k = n;
+       iter->uptodate = BTREE_ITER_UPTODATE;
+       return (struct bkey_s_c) { &iter->k, NULL };
+}
 
-               if (k.k && bkey_whiteout(k.k)) {
-                       struct btree_node_iter node_iter = l->iter;
+static inline struct bkey_s_c
+__bch2_btree_iter_peek_slot(struct btree_iter *iter)
+{
+       struct btree_iter_level *l = &iter->l[0];
+       struct bkey_s_c k;
+       int ret;
 
-                       k = __btree_iter_unpack(iter, l, &iter->k,
-                               bch2_btree_node_iter_peek(&node_iter, l->b));
-               }
+       if (iter->flags & BTREE_ITER_IS_EXTENTS)
+               return __bch2_btree_iter_peek_slot_extents(iter);
 
-               if (!k.k)
-                       k.k = &l->b->key.k;
+recheck:
+       while ((k = __btree_iter_peek_all(iter, l, &iter->k)).k &&
+              bkey_deleted(k.k) &&
+              bkey_cmp(k.k->p, iter->pos) == 0)
+               __btree_iter_advance(l);
 
-               bch2_key_resize(&n,
-                               min_t(u64, KEY_SIZE_MAX,
-                                     (k.k->p.inode == n.p.inode
-                                      ? bkey_start_offset(k.k)
-                                      : KEY_OFFSET_MAX) -
-                                     n.p.offset));
+       /*
+        * If we got to the end of the node, check if we need to traverse to the
+        * next node:
+        */
+       if (unlikely(!k.k && btree_iter_pos_after_node(iter, l->b))) {
+               btree_iter_set_dirty(iter, BTREE_ITER_NEED_TRAVERSE);
+               ret = bch2_btree_iter_traverse(iter);
+               if (unlikely(ret))
+                       return bkey_s_c_err(ret);
 
-               EBUG_ON(!n.size);
+               goto recheck;
        }
 
-       iter->k = n;
-       iter->uptodate = BTREE_ITER_UPTODATE;
-       return (struct bkey_s_c) { &iter->k, NULL };
+       if (k.k &&
+           !bkey_deleted(k.k) &&
+           !bkey_cmp(iter->pos, k.k->p)) {
+               iter->uptodate = BTREE_ITER_UPTODATE;
+               return k;
+       } else {
+               /* hole */
+               bkey_init(&iter->k);
+               iter->k.p = iter->pos;
+
+               iter->uptodate = BTREE_ITER_UPTODATE;
+               return (struct bkey_s_c) { &iter->k, NULL };
+       }
 }
 
 struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter)
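
(Editorial sketch, stand-in constants and made-up numbers.) The hole
sizing in __bch2_btree_iter_peek_slot_extents() above: the synthetic
hole key placed at iter->pos runs to the start of the next key when
that key is in the same inode, otherwise to KEY_OFFSET_MAX, clamped to
KEY_SIZE_MAX. With iter->pos at (1, 100) and the next key starting at
(1, 164), this yields a 64 sector hole:

    /* Stand-ins for the bcachefs limits: */
    #define KEY_OFFSET_MAX  (~0ULL)
    #define KEY_SIZE_MAX    (~0U)

    static unsigned long long hole_size(unsigned long long pos_inode,
                                        unsigned long long pos_offset,
                                        unsigned long long next_inode,
                                        unsigned long long next_start)
    {
            unsigned long long end = next_inode == pos_inode
                    ? next_start            /* hole runs up to the next key */
                    : KEY_OFFSET_MAX;       /* holes can't span inode numbers */
            unsigned long long size = end - pos_offset;

            return size < KEY_SIZE_MAX ? size : KEY_SIZE_MAX;
    }
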
fs/bcachefs/btree_types.h
index 5376388e91e68b10bb03a219848158026683e7f6..d57ca3d08c1687e019670c2d4156a68120323d45 100644
@@ -176,8 +176,6 @@ struct btree_cache {
 };
 
 struct btree_node_iter {
-       u8              is_extents;
-
        struct btree_node_iter_set {
                u16     k, end;
        } data[MAX_BSETS];
@@ -459,9 +457,6 @@ struct btree_root {
  * we're holding the write lock and we know what key is about to be overwritten:
  */
 
-struct btree_iter;
-struct btree_node_iter;
-
 enum btree_insert_ret {
        BTREE_INSERT_OK,
        /* extent spanned multiple leaf nodes: have to traverse to next node: */
fs/bcachefs/btree_update_interior.c
index b60eb3d33c7b5c5e7486870a13d2da27aa61a1f9..1fe6f1e3e8431690e555160363209f0353ac8b50 100644
@@ -35,7 +35,7 @@ static void btree_node_interior_verify(struct btree *b)
 
        BUG_ON(!b->level);
 
-       bch2_btree_node_iter_init(&iter, b, b->key.k.p, false, false);
+       bch2_btree_node_iter_init(&iter, b, b->key.k.p, false);
 #if 1
        BUG_ON(!(k = bch2_btree_node_iter_peek(&iter, b)) ||
               bkey_cmp_left_packed(b, k, &b->key.k.p));
@@ -1322,7 +1322,7 @@ static void btree_split_insert_keys(struct btree_update *as, struct btree *b,
 
        BUG_ON(btree_node_type(b) != BKEY_TYPE_BTREE);
 
-       bch2_btree_node_iter_init(&node_iter, b, k->k.p, false, false);
+       bch2_btree_node_iter_init(&node_iter, b, k->k.p, false);
 
        while (!bch2_keylist_empty(keys)) {
                k = bch2_keylist_front(keys);
fs/bcachefs/btree_update_leaf.c
index ce0223bd52b5998e63c1b3bdeeff75be26224f40..0ef519e8feed9cb087508b2425d95d8f4a3c964d 100644
@@ -64,7 +64,8 @@ bool bch2_btree_bset_insert_key(struct btree_iter *iter,
                        if (bkey_whiteout(&insert->k) && !k->needs_whiteout) {
                                bch2_bset_delete(b, k, clobber_u64s);
                                bch2_btree_node_iter_fix(iter, b, node_iter, t,
-                                                       k, clobber_u64s, 0);
+                                                        k, clobber_u64s, 0);
+                               bch2_btree_iter_verify(iter, b);
                                return true;
                        }
 
@@ -73,7 +74,8 @@ bool bch2_btree_bset_insert_key(struct btree_iter *iter,
 
                k->type = KEY_TYPE_DELETED;
                bch2_btree_node_iter_fix(iter, b, node_iter, t, k,
-                                       k->u64s, k->u64s);
+                                        k->u64s, k->u64s);
+               bch2_btree_iter_verify(iter, b);
 
                if (bkey_whiteout(&insert->k)) {
                        reserve_whiteout(b, k);
@@ -98,7 +100,8 @@ overwrite:
        bch2_bset_insert(b, node_iter, k, insert, clobber_u64s);
        if (k->u64s != clobber_u64s || bkey_whiteout(&insert->k))
                bch2_btree_node_iter_fix(iter, b, node_iter, t, k,
-                                       clobber_u64s, k->u64s);
+                                        clobber_u64s, k->u64s);
+       bch2_btree_iter_verify(iter, b);
        return true;
 }
 
fs/bcachefs/extents.c
index 803272b10e613b3b62441e40ef988e600586f837..df04af882c16cbe84c0969f3bdf449697f90cb75 100644
@@ -858,30 +858,34 @@ void bch2_key_resize(struct bkey *k,
  * that we have to unpack the key, modify the unpacked key - then this
  * copies/repacks the unpacked to the original as necessary.
  */
-static bool __extent_save(struct btree *b, struct btree_node_iter *iter,
-                         struct bkey_packed *dst, struct bkey *src)
+static void extent_save(struct btree *b, struct bkey_packed *dst,
+                       struct bkey *src)
 {
        struct bkey_format *f = &b->format;
        struct bkey_i *dst_unpacked;
-       bool ret;
 
-       if ((dst_unpacked = packed_to_bkey(dst))) {
+       if ((dst_unpacked = packed_to_bkey(dst)))
                dst_unpacked->k = *src;
-               ret = true;
-       } else {
-               ret = bch2_bkey_pack_key(dst, src, f);
-       }
-
-       if (ret && iter)
-               bch2_verify_key_order(b, iter, dst);
-
-       return ret;
+       else
+               BUG_ON(!bch2_bkey_pack_key(dst, src, f));
 }
 
-static void extent_save(struct btree *b, struct btree_node_iter *iter,
-                       struct bkey_packed *dst, struct bkey *src)
+static bool extent_i_save(struct btree *b, struct bkey_packed *dst,
+                         struct bkey_i *src)
 {
-       BUG_ON(!__extent_save(b, iter, dst, src));
+       struct bkey_format *f = &b->format;
+       struct bkey_i *dst_unpacked;
+       struct bkey_packed tmp;
+
+       if ((dst_unpacked = packed_to_bkey(dst)))
+               dst_unpacked->k = src->k;
+       else if (bch2_bkey_pack_key(&tmp, &src->k, f))
+               memcpy_u64s(dst, &tmp, f->key_u64s);
+       else
+               return false;
+
+       memcpy_u64s(bkeyp_val(f, dst), &src->v, bkey_val_u64s(&src->k));
+       return true;
 }
 
 /*
@@ -1010,7 +1014,7 @@ struct btree_nr_keys bch2_extent_sort_fix_overlapping(struct bch_fs *c,
                                sort_key_next(iter, b, _r);
                        } else {
                                __bch2_cut_front(l.k->p, r);
-                               extent_save(b, NULL, rk, r.k);
+                               extent_save(b, rk, r.k);
                        }
 
                        extent_sort_sift(iter, b, _r - iter->data);
@@ -1024,7 +1028,7 @@ struct btree_nr_keys bch2_extent_sort_fix_overlapping(struct bch_fs *c,
                        bch2_cut_back(bkey_start_pos(r.k), &tmp.k.k);
 
                        __bch2_cut_front(r.k->p, l);
-                       extent_save(b, NULL, lk, l.k);
+                       extent_save(b, lk, l.k);
 
                        extent_sort_sift(iter, b, 0);
 
@@ -1032,7 +1036,7 @@ struct btree_nr_keys bch2_extent_sort_fix_overlapping(struct bch_fs *c,
                                           bkey_to_packed(&tmp.k));
                } else {
                        bch2_cut_back(bkey_start_pos(r.k), l.k);
-                       extent_save(b, NULL, lk, l.k);
+                       extent_save(b, lk, l.k);
                }
        }
 
@@ -1135,6 +1139,55 @@ extent_insert_should_stop(struct extent_insert_state *s)
                return BTREE_INSERT_OK;
 }
 
+static void verify_extent_nonoverlapping(struct btree *b,
+                                        struct btree_node_iter *_iter,
+                                        struct bkey_i *insert)
+{
+#ifdef CONFIG_BCACHEFS_DEBUG
+       struct btree_node_iter iter;
+       struct bkey_packed *k;
+       struct bkey uk;
+
+       iter = *_iter;
+       k = bch2_btree_node_iter_prev_filter(&iter, b, KEY_TYPE_DISCARD);
+       BUG_ON(k &&
+              (uk = bkey_unpack_key(b, k),
+               bkey_cmp(uk.p, bkey_start_pos(&insert->k)) > 0));
+
+       iter = *_iter;
+       k = bch2_btree_node_iter_peek_filter(&iter, b, KEY_TYPE_DISCARD);
+#if 0
+       BUG_ON(k &&
+              (uk = bkey_unpack_key(b, k),
+               bkey_cmp(insert->k.p, bkey_start_pos(&uk))) > 0);
+#else
+       if (k &&
+           (uk = bkey_unpack_key(b, k),
+            bkey_cmp(insert->k.p, bkey_start_pos(&uk))) > 0) {
+               char buf1[100];
+               char buf2[100];
+
+               bch2_bkey_to_text(buf1, sizeof(buf1), &insert->k);
+               bch2_bkey_to_text(buf2, sizeof(buf2), &uk);
+
+               bch2_dump_btree_node(b);
+               panic("insert > next:\n"
+                     "insert %s\n"
+                     "next   %s\n",
+                     buf1, buf2);
+       }
+#endif
+
+#endif
+}
+
+static void verify_modified_extent(struct btree_iter *iter,
+                                  struct bkey_packed *k)
+{
+       bch2_btree_iter_verify(iter, iter->l[0].b);
+       bch2_verify_insert_pos(iter->l[0].b, k, k, k->u64s);
+}
+
 static void extent_bset_insert(struct bch_fs *c, struct btree_iter *iter,
                               struct bkey_i *insert)
 {
@@ -1148,6 +1201,14 @@ static void extent_bset_insert(struct bch_fs *c, struct btree_iter *iter,
        unsigned clobber_u64s;
 
        EBUG_ON(bkey_deleted(&insert->k) || !insert->k.size);
+       verify_extent_nonoverlapping(l->b, &l->iter, insert);
+
+       if (!prev) {
+               while ((prev = bch2_bkey_prev_all(l->b, t, where)) &&
+                      (bkey_cmp_left_packed(l->b, prev, &insert->k.p) ?:
+                       ((int) bkey_deleted(&insert->k) - (int) bkey_deleted(prev))) > 0)
+                       where = prev;
+       }
 
        if (prev)
                where = bkey_next(prev);
@@ -1173,12 +1234,15 @@ static void extent_bset_insert(struct bch_fs *c, struct btree_iter *iter,
 
        bch2_bset_insert(l->b, &l->iter, where, insert, clobber_u64s);
        bch2_btree_node_iter_fix(iter, l->b, &l->iter, t, where,
-                               clobber_u64s, where->u64s);
+                                clobber_u64s, where->u64s);
+       bch2_verify_key_order(l->b, &l->iter, where);
+       bch2_btree_iter_verify(iter, l->b);
        return;
 drop_deleted_keys:
        bch2_bset_delete(l->b, where, clobber_u64s);
        bch2_btree_node_iter_fix(iter, l->b, &l->iter, t,
                                 where, clobber_u64s, 0);
+       bch2_btree_iter_verify(iter, l->b);
 }
 
 static void extent_insert_committed(struct extent_insert_state *s)
@@ -1226,8 +1290,10 @@ static void extent_insert_committed(struct extent_insert_state *s)
 
        bch2_btree_journal_key(s->trans, iter, &split.k);
 
-       if (!s->deleting)
+       if (!s->deleting) {
+               bch2_btree_iter_set_pos_same_leaf(iter, s->committed);
                extent_bset_insert(c, iter, &split.k);
+       }
 done:
        bch2_btree_iter_set_pos_same_leaf(iter, s->committed);
 
@@ -1345,22 +1411,21 @@ extent_squash(struct extent_insert_state *s, struct bkey_i *insert,
        struct btree_iter *iter = s->insert->iter;
        struct btree_iter_level *l = &iter->l[0];
        struct btree *b = l->b;
-       struct btree_node_iter *node_iter = &l->iter;
-       enum btree_insert_ret ret;
 
        switch (overlap) {
        case BCH_EXTENT_OVERLAP_FRONT:
                /* insert overlaps with start of k: */
                bch2_cut_subtract_front(s, insert->k.p, k);
                BUG_ON(bkey_deleted(k.k));
-               extent_save(b, node_iter, _k, k.k);
+               extent_save(b, _k, k.k);
+               bch2_verify_key_order(b, &l->iter, _k);
                break;
 
        case BCH_EXTENT_OVERLAP_BACK:
                /* insert overlaps with end of k: */
                bch2_cut_subtract_back(s, bkey_start_pos(&insert->k), k);
                BUG_ON(bkey_deleted(k.k));
-               extent_save(b, node_iter, _k, k.k);
+               extent_save(b, _k, k.k);
 
                /*
                 * As the auxiliary tree is indexed by the end of the
@@ -1368,46 +1433,31 @@ extent_squash(struct extent_insert_state *s, struct bkey_i *insert,
                 * auxiliary tree.
                 */
                bch2_bset_fix_invalidated_key(b, t, _k);
-               bch2_btree_node_iter_fix(iter, b, node_iter, t,
-                                       _k, _k->u64s, _k->u64s);
+               bch2_btree_node_iter_fix(iter, b, &l->iter, t,
+                                        _k, _k->u64s, _k->u64s);
+               bch2_verify_key_order(b, &l->iter, _k);
                break;
 
        case BCH_EXTENT_OVERLAP_ALL: {
-               struct bpos orig_pos = k.k->p;
-
                /* The insert key completely covers k, invalidate k */
                if (!bkey_whiteout(k.k))
                        btree_keys_account_key_drop(&b->nr,
                                                t - b->set, _k);
 
                bch2_drop_subtract(s, k);
-               k.k->p = bkey_start_pos(&insert->k);
-               if (!__extent_save(b, node_iter, _k, k.k)) {
-                       /*
-                        * Couldn't repack: we aren't necessarily able
-                        * to repack if the new key is outside the range
-                        * of the old extent, so we have to split
-                        * @insert:
-                        */
-                       k.k->p = orig_pos;
-                       extent_save(b, node_iter, _k, k.k);
 
-                       ret = extent_insert_advance_pos(s, k.s_c);
-                       if (ret != BTREE_INSERT_OK)
-                               return ret;
+               if (t == bset_tree_last(l->b)) {
+                       unsigned u64s = _k->u64s;
 
-                       extent_insert_committed(s);
-                       /*
-                        * We split and inserted upto at k.k->p - that
-                        * has to coincide with iter->pos, so that we
-                        * don't have anything more we have to insert
-                        * until we recheck our journal reservation:
-                        */
-                       EBUG_ON(bkey_cmp(s->committed, k.k->p));
+                       bch2_bset_delete(l->b, _k, _k->u64s);
+                       bch2_btree_node_iter_fix(iter, b, &l->iter, t,
+                                                _k, u64s, 0);
+                       bch2_btree_iter_verify(iter, b);
                } else {
-                       bch2_bset_fix_invalidated_key(b, t, _k);
-                       bch2_btree_node_iter_fix(iter, b, node_iter, t,
-                                               _k, _k->u64s, _k->u64s);
+                       extent_save(b, _k, k.k);
+                       bch2_btree_node_iter_fix(iter, b, &l->iter, t,
+                                                _k, _k->u64s, _k->u64s);
+                       bch2_verify_key_order(b, &l->iter, _k);
                }
 
                break;
@@ -1436,7 +1486,8 @@ extent_squash(struct extent_insert_state *s, struct bkey_i *insert,
 
                bch2_cut_subtract_front(s, insert->k.p, k);
                BUG_ON(bkey_deleted(k.k));
-               extent_save(b, node_iter, _k, k.k);
+               extent_save(b, _k, k.k);
+               bch2_verify_key_order(b, &l->iter, _k);
 
                bch2_add_sectors(s, bkey_i_to_s_c(&split.k),
                                bkey_start_offset(&split.k.k),
@@ -1450,26 +1501,20 @@ extent_squash(struct extent_insert_state *s, struct bkey_i *insert,
 }
 
 static enum btree_insert_ret
-__bch2_delete_fixup_extent(struct extent_insert_state *s)
+__bch2_insert_fixup_extent(struct extent_insert_state *s)
 {
        struct bch_fs *c = s->trans->c;
        struct btree_iter *iter = s->insert->iter;
        struct btree_iter_level *l = &iter->l[0];
        struct btree *b = l->b;
-       struct btree_node_iter *node_iter = &l->iter;
        struct bkey_packed *_k;
        struct bkey unpacked;
        struct bkey_i *insert = s->insert->k;
        enum btree_insert_ret ret = BTREE_INSERT_OK;
 
-       EBUG_ON(bkey_cmp(iter->pos, bkey_start_pos(&insert->k)));
-
-       s->whiteout = *insert;
-       s->whiteout.k.type = KEY_TYPE_DISCARD;
-
        while (bkey_cmp(s->committed, insert->k.p) < 0 &&
               (ret = extent_insert_should_stop(s)) == BTREE_INSERT_OK &&
-              (_k = bch2_btree_node_iter_peek_all(node_iter, b))) {
+              (_k = bch2_btree_node_iter_peek_filter(&l->iter, b, KEY_TYPE_DISCARD))) {
                struct bset_tree *t = bch2_bkey_to_bset(b, _k);
                struct bkey_s k = __bkey_disassemble(b, _k, &unpacked);
                enum bch_extent_overlap overlap;
@@ -1480,112 +1525,92 @@ __bch2_delete_fixup_extent(struct extent_insert_state *s)
                if (bkey_cmp(bkey_start_pos(k.k), insert->k.p) >= 0)
                        break;
 
-               if (bkey_whiteout(k.k)) {
-                       s->committed = bpos_min(insert->k.p, k.k->p);
-                       goto next;
-               }
-
-               overlap = bch2_extent_overlap(&insert->k, k.k);
-
                ret = extent_insert_advance_pos(s, k.s_c);
                if (ret)
                        break;
 
-               s->do_journal = true;
+               overlap = bch2_extent_overlap(&insert->k, k.k);
 
-               if (overlap == BCH_EXTENT_OVERLAP_ALL) {
-                       btree_keys_account_key_drop(&b->nr,
-                                               t - b->set, _k);
-                       bch2_subtract_sectors(s, k.s_c,
-                                            bkey_start_offset(k.k), k.k->size);
-                       _k->type = KEY_TYPE_DISCARD;
-                       reserve_whiteout(b, _k);
-               } else if (k.k->needs_whiteout ||
-                          bkey_written(b, _k)) {
-                       struct bkey_i discard = *insert;
-
-                       discard.k.type = KEY_TYPE_DISCARD;
-
-                       switch (overlap) {
-                       case BCH_EXTENT_OVERLAP_FRONT:
-                               bch2_cut_front(bkey_start_pos(k.k), &discard);
-                               break;
-                       case BCH_EXTENT_OVERLAP_BACK:
-                               bch2_cut_back(k.k->p, &discard.k);
-                               break;
-                       default:
-                               break;
-                       }
+               if (!s->deleting) {
+                       if (k.k->needs_whiteout || bkey_written(b, _k))
+                               insert->k.needs_whiteout = true;
 
-                       discard.k.needs_whiteout = true;
+                       if (overlap == BCH_EXTENT_OVERLAP_ALL &&
+                           bkey_whiteout(k.k) &&
+                           k.k->needs_whiteout) {
+                               unreserve_whiteout(b, _k);
+                               _k->needs_whiteout = false;
+                       }
 
                        ret = extent_squash(s, insert, t, _k, k, overlap);
-                       BUG_ON(ret != BTREE_INSERT_OK);
-
-                       extent_bset_insert(c, iter, &discard);
                } else {
-                       ret = extent_squash(s, insert, t, _k, k, overlap);
-                       BUG_ON(ret != BTREE_INSERT_OK);
-               }
-next:
-               bch2_cut_front(s->committed, insert);
-               bch2_btree_iter_set_pos_same_leaf(iter, s->committed);
-       }
-
-       return ret;
-}
-
-static enum btree_insert_ret
-__bch2_insert_fixup_extent(struct extent_insert_state *s)
-{
-       struct btree_iter *iter = s->insert->iter;
-       struct btree_iter_level *l = &iter->l[0];
-       struct btree *b = l->b;
-       struct btree_node_iter *node_iter = &l->iter;
-       struct bkey_packed *_k;
-       struct bkey unpacked;
-       struct bkey_i *insert = s->insert->k;
-       enum btree_insert_ret ret = BTREE_INSERT_OK;
-
-       while (bkey_cmp(s->committed, insert->k.p) < 0 &&
-              (ret = extent_insert_should_stop(s)) == BTREE_INSERT_OK &&
-              (_k = bch2_btree_node_iter_peek_all(node_iter, b))) {
-               struct bset_tree *t = bch2_bkey_to_bset(b, _k);
-               struct bkey_s k = __bkey_disassemble(b, _k, &unpacked);
-               enum bch_extent_overlap overlap;
-
-               EBUG_ON(bkey_cmp(iter->pos, bkey_start_pos(&insert->k)));
-               EBUG_ON(bkey_cmp(iter->pos, k.k->p) >= 0);
+                       if (bkey_whiteout(k.k))
+                               goto next;
+
+                       s->do_journal = true;
+
+                       if (overlap == BCH_EXTENT_OVERLAP_ALL) {
+                               btree_keys_account_key_drop(&b->nr,
+                                                       t - b->set, _k);
+                               bch2_subtract_sectors(s, k.s_c,
+                                                    bkey_start_offset(k.k), k.k->size);
+                               _k->type = KEY_TYPE_DISCARD;
+                               reserve_whiteout(b, _k);
+                       } else if (k.k->needs_whiteout ||
+                                  bkey_written(b, _k)) {
+                               struct bkey_i discard = *insert;
+
+                               discard.k.type = KEY_TYPE_DISCARD;
+
+                               switch (overlap) {
+                               case BCH_EXTENT_OVERLAP_FRONT:
+                                       bch2_cut_front(bkey_start_pos(k.k), &discard);
+                                       break;
+                               case BCH_EXTENT_OVERLAP_BACK:
+                                       bch2_cut_back(k.k->p, &discard.k);
+                                       break;
+                               default:
+                                       break;
+                               }
 
-               if (bkey_cmp(bkey_start_pos(k.k), insert->k.p) >= 0)
-                       break;
+                               discard.k.needs_whiteout = true;
 
-               overlap = bch2_extent_overlap(&insert->k, k.k);
+                               ret = extent_squash(s, insert, t, _k, k, overlap);
+                               BUG_ON(ret != BTREE_INSERT_OK);
 
-               if (!k.k->size)
-                       goto squash;
+                               extent_bset_insert(c, iter, &discard);
+                       } else {
+                               ret = extent_squash(s, insert, t, _k, k, overlap);
+                               BUG_ON(ret != BTREE_INSERT_OK);
+                       }
+next:
+                       bch2_cut_front(s->committed, insert);
+                       bch2_btree_iter_set_pos_same_leaf(iter, s->committed);
+               }
 
-               /*
-                * Only call advance pos & call hook for nonzero size extents:
-                */
-               ret = extent_insert_advance_pos(s, k.s_c);
-               if (ret)
+               if (ret != BTREE_INSERT_OK ||
+                   overlap == BCH_EXTENT_OVERLAP_FRONT ||
+                   overlap == BCH_EXTENT_OVERLAP_MIDDLE)
                        break;
+       }
 
-               if (k.k->size &&
-                   (k.k->needs_whiteout || bkey_written(b, _k)))
-                       insert->k.needs_whiteout = true;
+       if (ret == BTREE_INSERT_OK &&
+           bkey_cmp(s->committed, insert->k.p) < 0)
+               ret = extent_insert_advance_pos(s, bkey_s_c_null);
 
-               if (overlap == BCH_EXTENT_OVERLAP_ALL &&
-                   bkey_whiteout(k.k) &&
-                   k.k->needs_whiteout) {
-                       unreserve_whiteout(b, _k);
-                       _k->needs_whiteout = false;
-               }
-squash:
-               ret = extent_squash(s, insert, t, _k, k, overlap);
-               if (ret != BTREE_INSERT_OK)
-                       break;
+       /*
+        * We may have skipped past some deleted extents greater than the
+        * insert key before we got to a non-deleted extent and knew we could
+        * bail out; rewind the iterator a bit if necessary:
+        */
+       {
+               struct btree_node_iter node_iter = l->iter;
+               struct bkey uk;
+
+               while ((_k = bch2_btree_node_iter_prev_all(&node_iter, l->b)) &&
+                      (uk = bkey_unpack_key(l->b, _k),
+                       bkey_cmp(uk.p, s->committed) > 0))
+                       l->iter = node_iter;
        }
 
        return ret;
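
Note: the loop above dispatches on the overlap class computed by bch2_extent_overlap(). As a rough sketch - hypothetical names and plain integer offsets, not the in-tree helper - the four classes fall out of comparing the two extents' endpoints:

    #include <linux/types.h>

    enum sketch_overlap {
            SKETCH_OVERLAP_ALL,     /* insert covers all of the existing key */
            SKETCH_OVERLAP_BACK,    /* insert overlaps the back of it */
            SKETCH_OVERLAP_FRONT,   /* insert overlaps the front of it */
            SKETCH_OVERLAP_MIDDLE,  /* insert lands strictly inside it */
    };

    /* extents are half-open ranges [start, end), keyed by end position */
    static enum sketch_overlap sketch_extent_overlap(u64 ins_start, u64 ins_end,
                                                     u64 k_start, u64 k_end)
    {
            bool covers_front = ins_start <= k_start;
            bool covers_back  = ins_end   >= k_end;

            if (covers_front && covers_back)
                    return SKETCH_OVERLAP_ALL;
            if (covers_front)
                    return SKETCH_OVERLAP_FRONT;
            if (covers_back)
                    return SKETCH_OVERLAP_BACK;
            return SKETCH_OVERLAP_MIDDLE;
    }

A FRONT or MIDDLE overlap means the existing key extends past the end of the insert key, so no later key in the node can overlap it either - hence the break on those two classes above.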
@@ -1647,6 +1672,11 @@ bch2_insert_fixup_extent(struct btree_insert *trans,
                .deleting       = bkey_whiteout(&insert->k->k),
        };
 
+       if (s.deleting) {
+               s.whiteout = *insert->k;
+               s.whiteout.k.type = KEY_TYPE_DISCARD;
+       }
+
        EBUG_ON(iter->level);
        EBUG_ON(!insert->k->k.size);
 
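
Note: the whiteout here is what makes the deletion path uniform with insertion - a delete is carried through the same fixup loop as an insert whose key has been turned into a discard. A minimal sketch of the idea, using simplified stand-in types rather than the real bkey layout:

    #include <linux/types.h>

    struct sketch_bkey {
            u64 start, end;                 /* half-open extent [start, end) */
            u8  type;
    };

    #define SKETCH_TYPE_DISCARD 1           /* stand-in for KEY_TYPE_DISCARD */

    /* a delete is the same range re-inserted with no value payload */
    static struct sketch_bkey sketch_make_whiteout(struct sketch_bkey insert)
    {
            insert.type = SKETCH_TYPE_DISCARD;
            return insert;
    }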
@@ -1657,6 +1687,7 @@ bch2_insert_fixup_extent(struct btree_insert *trans,
         * @insert->k and the node iterator that we're advancing:
         */
        EBUG_ON(bkey_cmp(iter->pos, bkey_start_pos(&insert->k->k)));
+       bch2_btree_iter_verify(iter, b);
 
        if (!s.deleting &&
            !(trans->flags & BTREE_INSERT_JOURNAL_REPLAY))
@@ -1664,13 +1695,7 @@ bch2_insert_fixup_extent(struct btree_insert *trans,
                                bkey_start_offset(&insert->k->k),
                                insert->k->k.size);
 
-       ret = !s.deleting
-               ? __bch2_insert_fixup_extent(&s)
-               : __bch2_delete_fixup_extent(&s);
-
-       if (ret == BTREE_INSERT_OK &&
-           bkey_cmp(s.committed, insert->k->k.p) < 0)
-               ret = extent_insert_advance_pos(&s, bkey_s_c_null);
+       ret = __bch2_insert_fixup_extent(&s);
 
        extent_insert_committed(&s);
 
@@ -2172,130 +2197,6 @@ enum merge_result bch2_extent_merge(struct bch_fs *c, struct btree *b,
        return BCH_MERGE_MERGE;
 }
 
-static void extent_i_save(struct btree *b, struct bkey_packed *dst,
-                         struct bkey_i *src)
-{
-       struct bkey_format *f = &b->format;
-       struct bkey_i *dst_unpacked;
-
-       BUG_ON(bkeyp_val_u64s(f, dst) != bkey_val_u64s(&src->k));
-
-       /*
-        * We don't want the bch2_verify_key_order() call in extent_save(),
-        * because we may be out of order with deleted keys that are about to be
-        * removed by extent_bset_insert()
-        */
-
-       if ((dst_unpacked = packed_to_bkey(dst)))
-               bkey_copy(dst_unpacked, src);
-       else
-               BUG_ON(!bch2_bkey_pack(dst, src, f));
-}
-
-static bool extent_merge_one_overlapping(struct btree_iter *iter,
-                                        struct bpos new_pos,
-                                        struct bset_tree *t,
-                                        struct bkey_packed *k, struct bkey uk,
-                                        bool check, bool could_pack)
-{
-       struct btree_iter_level *l = &iter->l[0];
-
-       BUG_ON(!bkey_deleted(k));
-
-       if (check) {
-               return !bkey_packed(k) || could_pack;
-       } else {
-               uk.p = new_pos;
-               extent_save(l->b, &l->iter, k, &uk);
-               bch2_bset_fix_invalidated_key(l->b, t, k);
-               bch2_btree_node_iter_fix(iter, l->b, &l->iter, t,
-                                        k, k->u64s, k->u64s);
-               return true;
-       }
-}
-
-static bool extent_merge_do_overlapping(struct btree_iter *iter,
-                                       struct bkey *m, bool back_merge)
-{
-       struct btree_iter_level *l = &iter->l[0];
-       struct btree *b = l->b;
-       struct btree_node_iter *node_iter = &l->iter;
-       struct bset_tree *t;
-       struct bkey_packed *k;
-       struct bkey uk;
-       struct bpos new_pos = back_merge ? m->p : bkey_start_pos(m);
-       bool could_pack = bkey_pack_pos((void *) &uk, new_pos, b);
-       bool check = true;
-
-       /*
-        * @m is the new merged extent:
-        *
-        * The merge took place in the last bset; we know there can't be any 0
-        * size extents overlapping with m there because if so they would have
-        * been between the two extents we merged.
-        *
-        * But in the other bsets, we have to check for and fix such extents:
-        */
-do_fixup:
-       for_each_bset(b, t) {
-               if (t == bset_tree_last(b))
-                       break;
-
-               /*
-                * if we don't find this bset in the iterator we already got to
-                * the end of that bset, so start searching from the end.
-                */
-               k = bch2_btree_node_iter_bset_pos(node_iter, b, t);
-
-               if (k == btree_bkey_last(b, t))
-                       k = bch2_bkey_prev_all(b, t, k);
-               if (!k)
-                       continue;
-
-               if (back_merge) {
-                       /*
-                        * Back merge: 0 size extents will be before the key
-                        * that was just inserted (and thus the iterator
-                        * position) - walk backwards to find them
-                        */
-                       for (;
-                            k &&
-                            (uk = bkey_unpack_key(b, k),
-                             bkey_cmp(uk.p, bkey_start_pos(m)) > 0);
-                            k = bch2_bkey_prev_all(b, t, k)) {
-                               if (bkey_cmp(uk.p, m->p) >= 0)
-                                       continue;
-
-                               if (!extent_merge_one_overlapping(iter, new_pos,
-                                               t, k, uk, check, could_pack))
-                                       return false;
-                       }
-               } else {
-                       /* Front merge - walk forwards */
-                       for (;
-                            k != btree_bkey_last(b, t) &&
-                            (uk = bkey_unpack_key(b, k),
-                             bkey_cmp(uk.p, m->p) < 0);
-                            k = bkey_next(k)) {
-                               if (bkey_cmp(uk.p,
-                                            bkey_start_pos(m)) <= 0)
-                                       continue;
-
-                               if (!extent_merge_one_overlapping(iter, new_pos,
-                                               t, k, uk, check, could_pack))
-                                       return false;
-                       }
-               }
-       }
-
-       if (check) {
-               check = false;
-               goto do_fixup;
-       }
-
-       return true;
-}
-
 /*
  * When merging an extent that we're inserting into a btree node, the new merged
  * extent could overlap with an existing 0 size extent - if we don't fix that,
@@ -2312,13 +2213,13 @@ static bool bch2_extent_merge_inline(struct bch_fs *c,
 {
        struct btree *b = iter->l[0].b;
        struct btree_node_iter *node_iter = &iter->l[0].iter;
-       const struct bkey_format *f = &b->format;
-       struct bset_tree *t = bset_tree_last(b);
-       struct bkey_packed *m;
-       BKEY_PADDED(k) li;
-       BKEY_PADDED(k) ri;
-       struct bkey_i *mi;
-       struct bkey tmp;
+       BKEY_PADDED(k) li, ri;
+       struct bkey_packed *m   = back_merge ? l : r;
+       struct bkey_i *mi       = back_merge ? &li.k : &ri.k;
+       struct bset_tree *t     = bch2_bkey_to_bset(b, m);
+       enum merge_result ret;
+
+       EBUG_ON(bkey_written(b, m));
 
        /*
         * We need to save copies of both l and r, because we might get a
@@ -2327,57 +2228,49 @@ static bool bch2_extent_merge_inline(struct bch_fs *c,
        bch2_bkey_unpack(b, &li.k, l);
        bch2_bkey_unpack(b, &ri.k, r);
 
-       m = back_merge ? l : r;
-       mi = back_merge ? &li.k : &ri.k;
+       ret = bch2_extent_merge(c, b, &li.k, &ri.k);
+       if (ret == BCH_MERGE_NOMERGE)
+               return false;
 
-       /* l & r should be in last bset: */
-       EBUG_ON(bch2_bkey_to_bset(b, m) != t);
+       /*
+        * Check whether we overlap with deleted extents - that would break
+        * the sort order:
+        */
+       if (back_merge) {
+               struct bkey_packed *n = bkey_next(m);
 
-       switch (bch2_extent_merge(c, b, &li.k, &ri.k)) {
-       case BCH_MERGE_NOMERGE:
-               return false;
-       case BCH_MERGE_PARTIAL:
-               if (bkey_packed(m) && !bch2_bkey_pack_key((void *) &tmp, &mi->k, f))
+               if (n != btree_bkey_last(b, t) &&
+                   bkey_cmp_left_packed(b, n, &li.k.k.p) <= 0 &&
+                   bkey_deleted(n))
                        return false;
+       } else if (ret == BCH_MERGE_MERGE) {
+               struct bkey_packed *prev = bch2_bkey_prev_all(b, t, m);
 
-               if (!extent_merge_do_overlapping(iter, &li.k.k, back_merge))
+               if (prev &&
+                   bkey_cmp_left_packed_byval(b, prev,
+                               bkey_start_pos(&li.k.k)) > 0)
                        return false;
+       }
 
-               extent_i_save(b, m, mi);
-               bch2_bset_fix_invalidated_key(b, t, m);
-
-               /*
-                * Update iterator to reflect what we just inserted - otherwise,
-                * the iter_fix() call is going to put us _before_ the key we
-                * just partially merged with:
-                */
-               if (back_merge)
-                       bch2_btree_iter_set_pos_same_leaf(iter, li.k.k.p);
-
-               bch2_btree_node_iter_fix(iter, b, node_iter,
-                                        t, m, m->u64s, m->u64s);
+       if (ret == BCH_MERGE_PARTIAL) {
+               if (!extent_i_save(b, m, mi))
+                       return false;
 
                if (!back_merge)
                        bkey_copy(packed_to_bkey(l), &li.k);
                else
                        bkey_copy(packed_to_bkey(r), &ri.k);
-               return false;
-       case BCH_MERGE_MERGE:
-               if (bkey_packed(m) && !bch2_bkey_pack_key((void *) &tmp, &li.k.k, f))
-                       return false;
-
-               if (!extent_merge_do_overlapping(iter, &li.k.k, back_merge))
+       } else {
+               if (!extent_i_save(b, m, &li.k))
                        return false;
+       }
 
-               extent_i_save(b, m, &li.k);
-               bch2_bset_fix_invalidated_key(b, t, m);
+       bch2_bset_fix_invalidated_key(b, t, m);
+       bch2_btree_node_iter_fix(iter, b, node_iter,
+                                t, m, m->u64s, m->u64s);
+       verify_modified_extent(iter, m);
 
-               bch2_btree_node_iter_fix(iter, b, node_iter,
-                                        t, m, m->u64s, m->u64s);
-               return true;
-       default:
-               BUG();
-       }
+       return ret == BCH_MERGE_MERGE;
 }
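
Note: with extent_merge_do_overlapping() gone, the remaining ordering hazard is the key being grown in place sorting past an adjacent deleted key, and the two guards above reduce to single neighbour comparisons. Sketched with hypothetical names and simplified keys:

    #include <linux/types.h>

    struct sketch_key {
            u64  start, end;        /* half-open extent, sorted by end position */
            bool deleted;
    };

    /*
     * A back merge grows m forward; refuse it if a following deleted key
     * would no longer sort strictly after m's new end position:
     */
    static bool sketch_back_merge_ok(const struct sketch_key *next, u64 new_end)
    {
            return !next || !next->deleted || next->end > new_end;
    }

    /*
     * A full front merge grows m backward; refuse it if the preceding key
     * would now end past m's new start position:
     */
    static bool sketch_front_merge_ok(const struct sketch_key *prev, u64 new_start)
    {
            return !prev || prev->end <= new_start;
    }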
 
 int bch2_check_range_allocated(struct bch_fs *c, struct bpos pos, u64 size)