bcachefs: Fix usage of six lock's percpu mode
author Kent Overstreet <kent.overstreet@gmail.com>
Sat, 5 Mar 2022 00:16:04 +0000 (19:16 -0500)
committer Kent Overstreet <kent.overstreet@linux.dev>
Sun, 22 Oct 2023 21:09:26 +0000 (17:09 -0400)
Six locks have a percpu mode, which we use for interior btree nodes, as
well as btree key cache keys for the subvolumes btree. We've been
switching locks back and forth between percpu and non-percpu mode as
needed, but it turns out this is racy - when we're reusing an existing
node, other threads could be attempting to lock it while we're switching
it between modes.
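
Concretely, the bad interleaving looks something like this (a sketch built
from the six lock calls this patch touches; the exact failure depends on
six lock internals):

	/* CPU0: re-hashing an existing node switched its lock mode: */
	six_lock_pcpu_alloc(&b->c.lock);	/* reader counts move to percpu */

	/* CPU1: concurrently taking a read lock on the same node: */
	six_lock_read(&b->c.lock, NULL, NULL);	/* can account against the
						   pre-switch reader state */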

This patch fixes this by never switching 'struct btree' between the two
modes, and instead segregating them between two different freed lists.
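
In outline, the invariant this establishes (names taken from the patch
below; shown here only as a sketch):

	/* Lock mode is chosen once, when the node is first allocated -
	 * interior nodes (level > 0) get percpu read locks: */
	b = bch2_btree_node_mem_alloc(c, level != 0);

	/* Freed nodes are filed by the mode their lock already has;
	 * lock.readers is non-NULL iff the lock is in percpu mode, so a
	 * node pulled back off a freed list never switches mode: */
	if (b->c.lock.readers)
		list_move(&b->list, &bc->freed_pcpu);
	else
		list_move(&b->list, &bc->freed_nonpcpu);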

Signed-off-by: Kent Overstreet <kent.overstreet@gmail.com>
Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
fs/bcachefs/btree_cache.c
fs/bcachefs/btree_cache.h
fs/bcachefs/btree_io.c
fs/bcachefs/btree_key_cache.c
fs/bcachefs/btree_types.h
fs/bcachefs/btree_update_interior.c
fs/bcachefs/btree_update_interior.h
fs/bcachefs/debug.c

diff --git a/fs/bcachefs/btree_cache.c b/fs/bcachefs/btree_cache.c
index 42253ca17f04345db40566eb4cb48a21517ecb9f..92a8cc704cabf301c659d09d0aa275339fd440bc 100644
--- a/fs/bcachefs/btree_cache.c
+++ b/fs/bcachefs/btree_cache.c
@@ -40,6 +40,14 @@ static inline unsigned btree_cache_can_free(struct btree_cache *bc)
        return max_t(int, 0, bc->used - bc->reserve);
 }
 
+static void btree_node_to_freedlist(struct btree_cache *bc, struct btree *b)
+{
+       if (b->c.lock.readers)
+               list_move(&b->list, &bc->freed_pcpu);
+       else
+               list_move(&b->list, &bc->freed_nonpcpu);
+}
+
 static void btree_node_data_free(struct bch_fs *c, struct btree *b)
 {
        struct btree_cache *bc = &c->btree_cache;
@@ -56,7 +64,8 @@ static void btree_node_data_free(struct bch_fs *c, struct btree *b)
        b->aux_data = NULL;
 
        bc->used--;
-       list_move(&b->list, &bc->freed);
+
+       btree_node_to_freedlist(bc, b);
 }
 
 static int bch2_btree_cache_cmp_fn(struct rhashtable_compare_arg *arg,
@@ -162,11 +171,6 @@ int bch2_btree_node_hash_insert(struct btree_cache *bc, struct btree *b,
        b->c.level      = level;
        b->c.btree_id   = id;
 
-       if (level)
-               six_lock_pcpu_alloc(&b->c.lock);
-       else
-               six_lock_pcpu_free_rcu(&b->c.lock);
-
        mutex_lock(&bc->lock);
        ret = __bch2_btree_node_hash_insert(bc, b);
        if (!ret)
@@ -432,8 +436,10 @@ void bch2_fs_btree_cache_exit(struct bch_fs *c)
 
        BUG_ON(atomic_read(&c->btree_cache.dirty));
 
-       while (!list_empty(&bc->freed)) {
-               b = list_first_entry(&bc->freed, struct btree, list);
+       list_splice(&bc->freed_pcpu, &bc->freed_nonpcpu);
+
+       while (!list_empty(&bc->freed_nonpcpu)) {
+               b = list_first_entry(&bc->freed_nonpcpu, struct btree, list);
                list_del(&b->list);
                six_lock_pcpu_free(&b->c.lock);
                kfree(b);
@@ -487,7 +493,8 @@ void bch2_fs_btree_cache_init_early(struct btree_cache *bc)
        mutex_init(&bc->lock);
        INIT_LIST_HEAD(&bc->live);
        INIT_LIST_HEAD(&bc->freeable);
-       INIT_LIST_HEAD(&bc->freed);
+       INIT_LIST_HEAD(&bc->freed_pcpu);
+       INIT_LIST_HEAD(&bc->freed_nonpcpu);
 }
 
 /*
@@ -562,9 +569,12 @@ static struct btree *btree_node_cannibalize(struct bch_fs *c)
        }
 }
 
-struct btree *bch2_btree_node_mem_alloc(struct bch_fs *c)
+struct btree *bch2_btree_node_mem_alloc(struct bch_fs *c, bool pcpu_read_locks)
 {
        struct btree_cache *bc = &c->btree_cache;
+       struct list_head *freed = pcpu_read_locks
+               ? &bc->freed_pcpu
+               : &bc->freed_nonpcpu;
        struct btree *b, *b2;
        u64 start_time = local_clock();
        unsigned flags;
@@ -576,7 +586,7 @@ struct btree *bch2_btree_node_mem_alloc(struct bch_fs *c)
         * We never free struct btree itself, just the memory that holds the on
         * disk node. Check the freed list before allocating a new one:
         */
-       list_for_each_entry(b, &bc->freed, list)
+       list_for_each_entry(b, freed, list)
                if (!btree_node_reclaim(c, b)) {
                        list_del_init(&b->list);
                        goto got_node;
@@ -586,6 +596,9 @@ struct btree *bch2_btree_node_mem_alloc(struct bch_fs *c)
        if (!b)
                goto err_locked;
 
+       if (pcpu_read_locks)
+               six_lock_pcpu_alloc(&b->c.lock);
+
        BUG_ON(!six_trylock_intent(&b->c.lock));
        BUG_ON(!six_trylock_write(&b->c.lock));
 got_node:
@@ -598,7 +611,7 @@ got_node:
                if (!btree_node_reclaim(c, b2)) {
                        swap(b->data, b2->data);
                        swap(b->aux_data, b2->aux_data);
-                       list_move(&b2->list, &bc->freed);
+                       btree_node_to_freedlist(bc, b2);
                        six_unlock_write(&b2->c.lock);
                        six_unlock_intent(&b2->c.lock);
                        goto got_mem;
@@ -643,7 +656,7 @@ err_locked:
                if (b) {
                        swap(b->data, b2->data);
                        swap(b->aux_data, b2->aux_data);
-                       list_move(&b2->list, &bc->freed);
+                       btree_node_to_freedlist(bc, b2);
                        six_unlock_write(&b2->c.lock);
                        six_unlock_intent(&b2->c.lock);
                } else {
@@ -688,7 +701,7 @@ static noinline struct btree *bch2_btree_node_fill(struct bch_fs *c,
                return ERR_PTR(-EINTR);
        }
 
-       b = bch2_btree_node_mem_alloc(c);
+       b = bch2_btree_node_mem_alloc(c, level != 0);
 
        if (trans && b == ERR_PTR(-ENOMEM)) {
                trans->memory_allocation_failure = true;
diff --git a/fs/bcachefs/btree_cache.h b/fs/bcachefs/btree_cache.h
index 96f8f90e85a1da7ce40a222406e9a113676f0eab..83723805f12a507f355a6743e952fb5042d7cfe8 100644
--- a/fs/bcachefs/btree_cache.h
+++ b/fs/bcachefs/btree_cache.h
@@ -20,7 +20,7 @@ void bch2_btree_cache_cannibalize_unlock(struct bch_fs *);
 int bch2_btree_cache_cannibalize_lock(struct bch_fs *, struct closure *);
 
 struct btree *__bch2_btree_node_mem_alloc(struct bch_fs *);
-struct btree *bch2_btree_node_mem_alloc(struct bch_fs *);
+struct btree *bch2_btree_node_mem_alloc(struct bch_fs *, bool);
 
 struct btree *bch2_btree_node_get(struct btree_trans *, struct btree_path *,
                                  const struct bkey_i *, unsigned,
diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c
index 53f83340f69a28079b004903c9c6ab0e222f0e8b..3031b566a112b3f14f4e524e5f58d2add06144c8 100644
--- a/fs/bcachefs/btree_io.c
+++ b/fs/bcachefs/btree_io.c
@@ -1542,7 +1542,7 @@ int bch2_btree_root_read(struct bch_fs *c, enum btree_id id,
                closure_sync(&cl);
        } while (ret);
 
-       b = bch2_btree_node_mem_alloc(c);
+       b = bch2_btree_node_mem_alloc(c, level != 0);
        bch2_btree_cache_cannibalize_unlock(c);
 
        BUG_ON(IS_ERR(b));
diff --git a/fs/bcachefs/btree_key_cache.c b/fs/bcachefs/btree_key_cache.c
index 70f31b5379e7077724a93854dd23c2be1b8575cd..7e41552a57dfb6ab22ae3d77c2186502ea3c4386 100644
--- a/fs/bcachefs/btree_key_cache.c
+++ b/fs/bcachefs/btree_key_cache.c
@@ -166,13 +166,13 @@ btree_key_cache_create(struct bch_fs *c,
                }
 
                was_new = false;
+       } else {
+               if (btree_id == BTREE_ID_subvolumes)
+                       six_lock_pcpu_alloc(&ck->c.lock);
+               else
+                       six_lock_pcpu_free(&ck->c.lock);
        }
 
-       if (btree_id == BTREE_ID_subvolumes)
-               six_lock_pcpu_alloc(&ck->c.lock);
-       else
-               six_lock_pcpu_free(&ck->c.lock);
-
        ck->c.level             = 0;
        ck->c.btree_id          = btree_id;
        ck->key.btree_id        = btree_id;
diff --git a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h
index 561406b4b7c26fb1685218dc2840c152b950e5bd..51eb686331bfa355603588581284db5fcc6d5563 100644
--- a/fs/bcachefs/btree_types.h
+++ b/fs/bcachefs/btree_types.h
@@ -152,7 +152,8 @@ struct btree_cache {
        struct mutex            lock;
        struct list_head        live;
        struct list_head        freeable;
-       struct list_head        freed;
+       struct list_head        freed_pcpu;
+       struct list_head        freed_nonpcpu;
 
        /* Number of elements in live + freeable lists */
        unsigned                used;
diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c
index 523d1146b2e2fae301a19028b33901c20f08dcbd..43022b340f4e69855c9444f9baf68a0b9a375862 100644
--- a/fs/bcachefs/btree_update_interior.c
+++ b/fs/bcachefs/btree_update_interior.c
@@ -181,6 +181,7 @@ static void bch2_btree_node_free_inmem(struct btree_trans *trans,
 static struct btree *__bch2_btree_node_alloc(struct bch_fs *c,
                                             struct disk_reservation *res,
                                             struct closure *cl,
+                                            bool interior_node,
                                             unsigned flags)
 {
        struct write_point *wp;
@@ -242,7 +243,7 @@ retry:
        bch2_open_bucket_get(c, wp, &ob);
        bch2_alloc_sectors_done(c, wp);
 mem_alloc:
-       b = bch2_btree_node_mem_alloc(c);
+       b = bch2_btree_node_mem_alloc(c, interior_node);
        six_unlock_write(&b->c.lock);
        six_unlock_intent(&b->c.lock);
 
@@ -260,12 +261,13 @@ static struct btree *bch2_btree_node_alloc(struct btree_update *as, unsigned lev
 {
        struct bch_fs *c = as->c;
        struct btree *b;
+       struct prealloc_nodes *p = &as->prealloc_nodes[!!level];
        int ret;
 
        BUG_ON(level >= BTREE_MAX_DEPTH);
-       BUG_ON(!as->nr_prealloc_nodes);
+       BUG_ON(!p->nr);
 
-       b = as->prealloc_nodes[--as->nr_prealloc_nodes];
+       b = p->b[--p->nr];
 
        six_lock_intent(&b->c.lock, NULL, NULL);
        six_lock_write(&b->c.lock, NULL, NULL);
@@ -377,43 +379,49 @@ static struct btree *__btree_root_alloc(struct btree_update *as, unsigned level)
 static void bch2_btree_reserve_put(struct btree_update *as)
 {
        struct bch_fs *c = as->c;
+       struct prealloc_nodes *p;
 
        mutex_lock(&c->btree_reserve_cache_lock);
 
-       while (as->nr_prealloc_nodes) {
-               struct btree *b = as->prealloc_nodes[--as->nr_prealloc_nodes];
+       for (p = as->prealloc_nodes;
+            p < as->prealloc_nodes + ARRAY_SIZE(as->prealloc_nodes);
+            p++) {
+               while (p->nr) {
+                       struct btree *b = p->b[--p->nr];
 
-               six_lock_intent(&b->c.lock, NULL, NULL);
-               six_lock_write(&b->c.lock, NULL, NULL);
+                       six_lock_intent(&b->c.lock, NULL, NULL);
+                       six_lock_write(&b->c.lock, NULL, NULL);
 
-               if (c->btree_reserve_cache_nr <
-                   ARRAY_SIZE(c->btree_reserve_cache)) {
-                       struct btree_alloc *a =
-                               &c->btree_reserve_cache[c->btree_reserve_cache_nr++];
+                       if (c->btree_reserve_cache_nr <
+                           ARRAY_SIZE(c->btree_reserve_cache)) {
+                               struct btree_alloc *a =
+                                       &c->btree_reserve_cache[c->btree_reserve_cache_nr++];
 
-                       a->ob = b->ob;
-                       b->ob.nr = 0;
-                       bkey_copy(&a->k, &b->key);
-               } else {
-                       bch2_open_buckets_put(c, &b->ob);
-               }
+                               a->ob = b->ob;
+                               b->ob.nr = 0;
+                               bkey_copy(&a->k, &b->key);
+                       } else {
+                               bch2_open_buckets_put(c, &b->ob);
+                       }
 
-               __btree_node_free(c, b);
-               six_unlock_write(&b->c.lock);
-               six_unlock_intent(&b->c.lock);
+                       __btree_node_free(c, b);
+                       six_unlock_write(&b->c.lock);
+                       six_unlock_intent(&b->c.lock);
+               }
        }
 
        mutex_unlock(&c->btree_reserve_cache_lock);
 }
 
-static int bch2_btree_reserve_get(struct btree_update *as, unsigned nr_nodes,
+static int bch2_btree_reserve_get(struct btree_update *as, unsigned nr_nodes[2],
                                  unsigned flags, struct closure *cl)
 {
        struct bch_fs *c = as->c;
        struct btree *b;
+       unsigned interior;
        int ret;
 
-       BUG_ON(nr_nodes > BTREE_RESERVE_MAX);
+       BUG_ON(nr_nodes[0] + nr_nodes[1] > BTREE_RESERVE_MAX);
 
        /*
         * Protects reaping from the btree node cache and using the btree node
@@ -423,23 +431,28 @@ static int bch2_btree_reserve_get(struct btree_update *as, unsigned nr_nodes,
        if (ret)
                return ret;
 
-       while (as->nr_prealloc_nodes < nr_nodes) {
-               b = __bch2_btree_node_alloc(c, &as->disk_res,
-                                           flags & BTREE_INSERT_NOWAIT
-                                           ? NULL : cl, flags);
-               if (IS_ERR(b)) {
-                       ret = PTR_ERR(b);
-                       goto err_free;
-               }
+       for (interior = 0; interior < 2; interior++) {
+               struct prealloc_nodes *p = as->prealloc_nodes + interior;
+
+               while (p->nr < nr_nodes[interior]) {
+                       b = __bch2_btree_node_alloc(c, &as->disk_res,
+                                                   flags & BTREE_INSERT_NOWAIT
+                                                   ? NULL : cl,
+                                                   interior, flags);
+                       if (IS_ERR(b)) {
+                               ret = PTR_ERR(b);
+                               goto err;
+                       }
 
-               as->prealloc_nodes[as->nr_prealloc_nodes++] = b;
+                       p->b[p->nr++] = b;
+               }
        }
 
        bch2_btree_cache_cannibalize_unlock(c);
        return 0;
-err_free:
+err:
        bch2_btree_cache_cannibalize_unlock(c);
-       trace_btree_reserve_get_fail(c, nr_nodes, cl);
+       trace_btree_reserve_get_fail(c, nr_nodes[0] + nr_nodes[1], cl);
        return ret;
 }
 
@@ -942,7 +955,7 @@ bch2_btree_update_start(struct btree_trans *trans, struct btree_path *path,
        u64 start_time = local_clock();
        int disk_res_flags = (flags & BTREE_INSERT_NOFAIL)
                ? BCH_DISK_RESERVATION_NOFAIL : 0;
-       unsigned nr_nodes;
+       unsigned nr_nodes[2];
        unsigned update_level = level;
        int journal_flags = 0;
        int ret = 0;
@@ -954,10 +967,11 @@ bch2_btree_update_start(struct btree_trans *trans, struct btree_path *path,
 
        closure_init_stack(&cl);
 retry:
-       nr_nodes = 0;
+       nr_nodes[0] = nr_nodes[1] = 0;
+       update_level = level;
 
        while (1) {
-               nr_nodes += 1 + split;
+               nr_nodes[!!update_level] += 1 + split;
                update_level++;
 
                if (!btree_path_node(path, update_level))
@@ -972,7 +986,7 @@ retry:
 
        /* Might have to allocate a new root: */
        if (update_level < BTREE_MAX_DEPTH)
-               nr_nodes += 1;
+               nr_nodes[1] += 1;
 
        if (!bch2_btree_path_upgrade(trans, path, U8_MAX)) {
                trace_trans_restart_iter_upgrade(trans->fn, _RET_IP_,
@@ -1050,7 +1064,7 @@ retry:
        }
 
        ret = bch2_disk_reservation_get(c, &as->disk_res,
-                       nr_nodes * btree_sectors(c),
+                       (nr_nodes[0] + nr_nodes[1]) * btree_sectors(c),
                        c->opts.metadata_replicas,
                        disk_res_flags);
        if (ret)
@@ -1085,11 +1099,6 @@ static void bch2_btree_set_root_inmem(struct bch_fs *c, struct btree *b)
        list_del_init(&b->list);
        mutex_unlock(&c->btree_cache.lock);
 
-       if (b->c.level)
-               six_lock_pcpu_alloc(&b->c.lock);
-       else
-               six_lock_pcpu_free(&b->c.lock);
-
        mutex_lock(&c->btree_root_lock);
        BUG_ON(btree_node_root(c, b) &&
               (b->c.level < btree_node_root(c, b)->c.level ||
@@ -2015,7 +2024,7 @@ int bch2_btree_node_update_key(struct btree_trans *trans, struct btree_iter *ite
                                return -EINTR;
                }
 
-               new_hash = bch2_btree_node_mem_alloc(c);
+               new_hash = bch2_btree_node_mem_alloc(c, false);
        }
 
        path->intent_ref++;
@@ -2091,7 +2100,7 @@ void bch2_btree_root_alloc(struct bch_fs *c, enum btree_id id)
                closure_sync(&cl);
        } while (ret);
 
-       b = bch2_btree_node_mem_alloc(c);
+       b = bch2_btree_node_mem_alloc(c, false);
        bch2_btree_cache_cannibalize_unlock(c);
 
        set_btree_node_fake(b);
diff --git a/fs/bcachefs/btree_update_interior.h b/fs/bcachefs/btree_update_interior.h
index 8dc86fa636d680900034d8c1b9efd0c1374d0b15..e72eb87956167d881bf3997d136ee0fef4d87d14 100644
--- a/fs/bcachefs/btree_update_interior.h
+++ b/fs/bcachefs/btree_update_interior.h
@@ -76,8 +76,10 @@ struct btree_update {
        struct journal_entry_pin        journal;
 
        /* Preallocated nodes we reserve when we start the update: */
-       struct btree                    *prealloc_nodes[BTREE_UPDATE_NODES_MAX];
-       unsigned                        nr_prealloc_nodes;
+       struct prealloc_nodes {
+               struct btree            *b[BTREE_UPDATE_NODES_MAX];
+               unsigned                nr;
+       }                               prealloc_nodes[2];
 
        /* Nodes being freed: */
        struct keylist                  old_keys;
diff --git a/fs/bcachefs/debug.c b/fs/bcachefs/debug.c
index 1fff03d301a93a59a4d757ed62280e99663f8e40..457fcee7d8e19451617eb72e693547d32f1d7318 100644
--- a/fs/bcachefs/debug.c
+++ b/fs/bcachefs/debug.c
@@ -443,6 +443,11 @@ static void bch2_cached_btree_node_to_text(struct printbuf *out, struct bch_fs *
        bch2_flags_to_text(out, bch2_btree_node_flags, b->flags);
        pr_newline(out);
 
+       pr_buf(out, "pcpu read locks: ");
+       pr_tab(out);
+       pr_buf(out, "%u", b->c.lock.readers != NULL);
+       pr_newline(out);
+
        pr_buf(out, "written:");
        pr_tab(out);
        pr_buf(out, "%u", b->written);