bcachefs: Btree split improvement
authorKent Overstreet <kent.overstreet@linux.dev>
Wed, 16 Nov 2022 02:52:12 +0000 (21:52 -0500)
committerKent Overstreet <kent.overstreet@linux.dev>
Sun, 22 Oct 2023 21:09:46 +0000 (17:09 -0400)
This improves the bkey_format calculation when splitting btree nodes.
Previously, we'd use a format calculated for the original node for the
lower of the two new nodes.

This was particularly bad on sequential insertions, where we iteratively
split the last btree node, whos format has to include KEY_MAX.

Now, we calculate formats precisely for the keys the two new nodes will
contain. This also should make splitting a bit more efficient, since
we're only copying keys once (from the original node to the new node,
instead of new node, replacement node, then upper split).

Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
fs/bcachefs/bkey_sort.c
fs/bcachefs/bset.h
fs/bcachefs/btree_update_interior.c

index be0d4bc1afd3404856f732de3e210a4b1124cac3..557a79cad98670b5f753f6e7d71372b4418cf7a8 100644 (file)
@@ -144,6 +144,8 @@ bch2_sort_repack(struct bset *dst, struct btree *src,
                else
                        bch2_bkey_unpack(src, (void *) out, in);
 
+               out->needs_whiteout = false;
+
                btree_keys_account_key_add(&nr, 0, out);
                out = bkey_next(out);
        }
index b352d5a40de0e28a0017e1592062e40b22d0ece7..fd2915a150708f9b9e5203d761c2ea2342e872ef 100644 (file)
@@ -447,6 +447,11 @@ struct bkey_s_c bch2_btree_node_iter_peek_unpack(struct btree_node_iter *,
                                                struct btree *,
                                                struct bkey *);
 
+#define for_each_btree_node_key(b, k, iter)                            \
+       for (bch2_btree_node_iter_init_from_start((iter), (b));         \
+            (k = bch2_btree_node_iter_peek((iter), (b)));              \
+            bch2_btree_node_iter_advance(iter, b))
+
 #define for_each_btree_node_key_unpack(b, k, iter, unpacked)           \
        for (bch2_btree_node_iter_init_from_start((iter), (b));         \
             (k = bch2_btree_node_iter_peek_unpack((iter), (b), (unpacked))).k;\
index e0483abadd72ec654281df661c88c273944fe565..ac3a5ef1b1af2ada963464080cf9fa43fcb2c295 100644 (file)
@@ -377,14 +377,19 @@ static void btree_set_max(struct btree *b, struct bpos pos)
        b->data->max_key = pos;
 }
 
-struct btree *__bch2_btree_node_alloc_replacement(struct btree_update *as,
-                                                 struct btree_trans *trans,
-                                                 struct btree *b,
-                                                 struct bkey_format format)
+static struct btree *bch2_btree_node_alloc_replacement(struct btree_update *as,
+                                                      struct btree_trans *trans,
+                                                      struct btree *b)
 {
-       struct btree *n;
+       struct btree *n = bch2_btree_node_alloc(as, trans, b->c.level);
+       struct bkey_format format = bch2_btree_calc_format(b);
 
-       n = bch2_btree_node_alloc(as, trans, b->c.level);
+       /*
+        * The keys might expand with the new format - if they wouldn't fit in
+        * the btree node anymore, use the old format for now:
+        */
+       if (!bch2_btree_node_format_fits(as->c, b, &format))
+               format = b->format;
 
        SET_BTREE_NODE_SEQ(n->data, BTREE_NODE_SEQ(b->data) + 1);
 
@@ -397,27 +402,9 @@ struct btree *__bch2_btree_node_alloc_replacement(struct btree_update *as,
        bch2_btree_sort_into(as->c, n, b);
 
        btree_node_reset_sib_u64s(n);
-
-       n->key.k.p = b->key.k.p;
        return n;
 }
 
-static struct btree *bch2_btree_node_alloc_replacement(struct btree_update *as,
-                                                      struct btree_trans *trans,
-                                                      struct btree *b)
-{
-       struct bkey_format new_f = bch2_btree_calc_format(b);
-
-       /*
-        * The keys might expand with the new format - if they wouldn't fit in
-        * the btree node anymore, use the old format for now:
-        */
-       if (!bch2_btree_node_format_fits(as->c, b, &new_f))
-               new_f = b->format;
-
-       return __bch2_btree_node_alloc_replacement(as, trans, b, new_f);
-}
-
 static struct btree *__btree_root_alloc(struct btree_update *as,
                                struct btree_trans *trans, unsigned level)
 {
@@ -1331,8 +1318,12 @@ __bch2_btree_insert_keys_interior(struct btree_update *as,
                ;
 
        while (!bch2_keylist_empty(keys)) {
-               bch2_insert_fixup_btree_ptr(as, trans, path, b,
-                               &node_iter, bch2_keylist_front(keys));
+               struct bkey_i *k = bch2_keylist_front(keys);
+
+               if (bpos_cmp(k->k.p, b->key.k.p) > 0)
+                       break;
+
+               bch2_insert_fixup_btree_ptr(as, trans, path, b, &node_iter, k);
                bch2_keylist_pop_front(keys);
        }
 }
@@ -1341,109 +1332,91 @@ __bch2_btree_insert_keys_interior(struct btree_update *as,
  * Move keys from n1 (original replacement node, now lower node) to n2 (higher
  * node)
  */
-static struct btree *__btree_split_node(struct btree_update *as,
-                                       struct btree_trans *trans,
-                                       struct btree *n1)
+static void __btree_split_node(struct btree_update *as,
+                              struct btree_trans *trans,
+                              struct btree *b,
+                              struct btree *n[2])
 {
-       struct bkey_format_state s;
-       size_t nr_packed = 0, nr_unpacked = 0;
-       struct btree *n2;
-       struct bset *set1, *set2;
-       struct bkey_packed *k, *set2_start, *set2_end, *out, *prev = NULL;
-       struct bpos n1_pos;
+       struct bkey_packed *k;
+       struct bpos n1_pos = POS_MIN;
+       struct btree_node_iter iter;
+       struct bset *bsets[2];
+       struct bkey_format_state format[2];
+       struct bkey_packed *out[2];
+       struct bkey uk;
+       unsigned u64s, n1_u64s = (b->nr.live_u64s * 3) / 5;
+       int i;
 
-       n2 = bch2_btree_node_alloc(as, trans, n1->c.level);
+       for (i = 0; i < 2; i++) {
+               BUG_ON(n[i]->nsets != 1);
 
-       n2->data->max_key       = n1->data->max_key;
-       n2->data->format        = n1->format;
-       SET_BTREE_NODE_SEQ(n2->data, BTREE_NODE_SEQ(n1->data));
-       n2->key.k.p = n1->key.k.p;
+               bsets[i] = btree_bset_first(n[i]);
+               out[i] = bsets[i]->start;
 
-       set1 = btree_bset_first(n1);
-       set2 = btree_bset_first(n2);
+               SET_BTREE_NODE_SEQ(n[i]->data, BTREE_NODE_SEQ(b->data) + 1);
+               bch2_bkey_format_init(&format[i]);
+       }
 
-       /*
-        * Has to be a linear search because we don't have an auxiliary
-        * search tree yet
-        */
-       k = set1->start;
-       while (1) {
-               struct bkey_packed *n = bkey_next(k);
+       u64s = 0;
+       for_each_btree_node_key(b, k, &iter) {
+               if (bkey_deleted(k))
+                       continue;
+
+               i = u64s >= n1_u64s;
+               u64s += k->u64s;
+               uk = bkey_unpack_key(b, k);
+               if (!i)
+                       n1_pos = uk.p;
+               bch2_bkey_format_add_key(&format[i], &uk);
+       }
 
-               if (n == vstruct_last(set1))
-                       break;
-               if (k->_data - set1->_data >= (le16_to_cpu(set1->u64s) * 3) / 5)
-                       break;
+       btree_set_min(n[0], b->data->min_key);
+       btree_set_max(n[0], n1_pos);
+       btree_set_min(n[1], bpos_successor(n1_pos));
+       btree_set_max(n[1], b->data->max_key);
 
-               if (bkey_packed(k))
-                       nr_packed++;
-               else
-                       nr_unpacked++;
+       for (i = 0; i < 2; i++) {
+               bch2_bkey_format_add_pos(&format[i], n[i]->data->min_key);
+               bch2_bkey_format_add_pos(&format[i], n[i]->data->max_key);
 
-               prev = k;
-               k = n;
+               n[i]->data->format = bch2_bkey_format_done(&format[i]);
+               btree_node_set_format(n[i], n[i]->data->format);
        }
 
-       BUG_ON(!prev);
-       set2_start      = k;
-       set2_end        = vstruct_last(set1);
-
-       set1->u64s = cpu_to_le16((u64 *) set2_start - set1->_data);
-       set_btree_bset_end(n1, n1->set);
+       u64s = 0;
+       for_each_btree_node_key(b, k, &iter) {
+               if (bkey_deleted(k))
+                       continue;
 
-       n1->nr.live_u64s        = le16_to_cpu(set1->u64s);
-       n1->nr.bset_u64s[0]     = le16_to_cpu(set1->u64s);
-       n1->nr.packed_keys      = nr_packed;
-       n1->nr.unpacked_keys    = nr_unpacked;
+               i = u64s >= n1_u64s;
+               u64s += k->u64s;
 
-       n1_pos = bkey_unpack_pos(n1, prev);
-       if (as->c->sb.version < bcachefs_metadata_version_snapshot)
-               n1_pos.snapshot = U32_MAX;
-
-       btree_set_max(n1, n1_pos);
-       btree_set_min(n2, bpos_successor(n1->key.k.p));
+               if (bch2_bkey_transform(&n[i]->format, out[i], bkey_packed(k)
+                                       ? &b->format: &bch2_bkey_format_current, k))
+                       out[i]->format = KEY_FORMAT_LOCAL_BTREE;
+               else
+                       bch2_bkey_unpack(b, (void *) out[i], k);
 
-       bch2_bkey_format_init(&s);
-       bch2_bkey_format_add_pos(&s, n2->data->min_key);
-       bch2_bkey_format_add_pos(&s, n2->data->max_key);
+               out[i]->needs_whiteout = false;
 
-       for (k = set2_start; k != set2_end; k = bkey_next(k)) {
-               struct bkey uk = bkey_unpack_key(n1, k);
-               bch2_bkey_format_add_key(&s, &uk);
+               btree_keys_account_key_add(&n[i]->nr, 0, out[i]);
+               out[i] = bkey_next(out[i]);
        }
 
-       n2->data->format = bch2_bkey_format_done(&s);
-       btree_node_set_format(n2, n2->data->format);
+       for (i = 0; i < 2; i++) {
+               bsets[i]->u64s = cpu_to_le16((u64 *) out[i] - bsets[i]->_data);
 
-       out = set2->start;
-       memset(&n2->nr, 0, sizeof(n2->nr));
+               BUG_ON(!bsets[i]->u64s);
 
-       for (k = set2_start; k != set2_end; k = bkey_next(k)) {
-               BUG_ON(!bch2_bkey_transform(&n2->format, out, bkey_packed(k)
-                                      ? &n1->format : &bch2_bkey_format_current, k));
-               out->format = KEY_FORMAT_LOCAL_BTREE;
-               btree_keys_account_key_add(&n2->nr, 0, out);
-               out = bkey_next(out);
-       }
+               set_btree_bset_end(n[i], n[i]->set);
 
-       set2->u64s = cpu_to_le16((u64 *) out - set2->_data);
-       set_btree_bset_end(n2, n2->set);
+               btree_node_reset_sib_u64s(n[i]);
 
-       BUG_ON(!set1->u64s);
-       BUG_ON(!set2->u64s);
+               bch2_verify_btree_nr_keys(n[i]);
 
-       btree_node_reset_sib_u64s(n1);
-       btree_node_reset_sib_u64s(n2);
-
-       bch2_verify_btree_nr_keys(n1);
-       bch2_verify_btree_nr_keys(n2);
-
-       if (n1->c.level) {
-               btree_node_interior_verify(as->c, n1);
-               btree_node_interior_verify(as->c, n2);
+               if (b->c.level)
+                       btree_node_interior_verify(as->c, n[i]);
        }
-
-       return n2;
 }
 
 /*
@@ -1463,41 +1436,17 @@ static void btree_split_insert_keys(struct btree_update *as,
                                    struct btree *b,
                                    struct keylist *keys)
 {
-       struct btree_node_iter node_iter;
-       struct bkey_i *k = bch2_keylist_front(keys);
-       struct bkey_packed *src, *dst, *n;
-       struct bset *i;
+       if (!bch2_keylist_empty(keys) &&
+           bpos_cmp(bch2_keylist_front(keys)->k.p,
+                    b->data->max_key) <= 0) {
+               struct btree_node_iter node_iter;
 
-       bch2_btree_node_iter_init(&node_iter, b, &k->k.p);
+               bch2_btree_node_iter_init(&node_iter, b, &bch2_keylist_front(keys)->k.p);
 
-       __bch2_btree_insert_keys_interior(as, trans, path, b, node_iter, keys);
+               __bch2_btree_insert_keys_interior(as, trans, path, b, node_iter, keys);
 
-       /*
-        * We can't tolerate whiteouts here - with whiteouts there can be
-        * duplicate keys, and it would be rather bad if we picked a duplicate
-        * for the pivot:
-        */
-       i = btree_bset_first(b);
-       src = dst = i->start;
-       while (src != vstruct_last(i)) {
-               n = bkey_next(src);
-               if (!bkey_deleted(src)) {
-                       memmove_u64s_down(dst, src, src->u64s);
-                       dst = bkey_next(dst);
-               }
-               src = n;
+               btree_node_interior_verify(as->c, b);
        }
-
-       /* Also clear out the unwritten whiteouts area: */
-       b->whiteout_u64s = 0;
-
-       i->u64s = cpu_to_le16((u64 *) dst - i->_data);
-       set_btree_bset_end(b, b->set);
-
-       BUG_ON(b->nsets != 1 ||
-              b->nr.live_u64s != le16_to_cpu(btree_bset_first(b)->u64s));
-
-       btree_node_interior_verify(as->c, b);
 }
 
 static int btree_split(struct btree_update *as, struct btree_trans *trans,
@@ -1516,15 +1465,21 @@ static int btree_split(struct btree_update *as, struct btree_trans *trans,
 
        bch2_btree_interior_update_will_free_node(as, b);
 
-       n1 = bch2_btree_node_alloc_replacement(as, trans, b);
+       if (b->nr.live_u64s > BTREE_SPLIT_THRESHOLD(c)) {
+               struct btree *n[2];
 
-       if (keys)
-               btree_split_insert_keys(as, trans, path, n1, keys);
-
-       if (bset_u64s(&n1->set[0]) > BTREE_SPLIT_THRESHOLD(c)) {
                trace_and_count(c, btree_node_split, c, b);
 
-               n2 = __btree_split_node(as, trans, n1);
+               n[0] = n1 = bch2_btree_node_alloc(as, trans, b->c.level);
+               n[1] = n2 = bch2_btree_node_alloc(as, trans, b->c.level);
+
+               __btree_split_node(as, trans, b, n);
+
+               if (keys) {
+                       btree_split_insert_keys(as, trans, path, n1, keys);
+                       btree_split_insert_keys(as, trans, path, n2, keys);
+                       BUG_ON(!bch2_keylist_empty(keys));
+               }
 
                bch2_btree_build_aux_trees(n2);
                bch2_btree_build_aux_trees(n1);
@@ -1573,6 +1528,13 @@ static int btree_split(struct btree_update *as, struct btree_trans *trans,
        } else {
                trace_and_count(c, btree_node_compact, c, b);
 
+               n1 = bch2_btree_node_alloc_replacement(as, trans, b);
+
+               if (keys) {
+                       btree_split_insert_keys(as, trans, path, n1, keys);
+                       BUG_ON(!bch2_keylist_empty(keys));
+               }
+
                bch2_btree_build_aux_trees(n1);
                bch2_btree_update_add_new_node(as, n1);
                six_unlock_write(&n1->c.lock);