From: Kent Overstreet
Date: Sat, 5 Mar 2022 00:16:04 +0000 (-0500)
Subject: bcachefs: Fix usage of six lock's percpu mode
X-Git-Url: http://git.maquefel.me/?a=commitdiff_plain;h=3098553776a16c08446c408005090423d62e6b54;p=linux.git

bcachefs: Fix usage of six lock's percpu mode

Six locks have a percpu mode, which we use for interior btree nodes, as
well as btree key cache keys for the subvolumes btree. We've been
switching locks back and forth between percpu and non-percpu mode as
needed, but it turns out this is racy: when we're reusing an existing
node, other threads could be attempting to lock it while we're switching
it between modes.

This patch fixes this by never switching 'struct btree' between the two
modes, and instead segregating them between two different freed lists.

Signed-off-by: Kent Overstreet
Signed-off-by: Kent Overstreet
---
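(Not part of the patch: below is a minimal standalone sketch of the
invariant the patch establishes, for readers who don't want to trace the
diff. It is a userspace model, not kernel code: the six lock's percpu
reader state is reduced to a boolean fixed at allocation time, singly
linked lists stand in for bc->freed_pcpu/bc->freed_nonpcpu, and the two
helpers only loosely mirror btree_node_to_freedlist() and
bch2_btree_node_mem_alloc().)

/* Model: a freed node keeps its lock mode forever; reuse only matches
 * requests for the same mode, so a lock never changes mode while other
 * threads could be acquiring it. Illustrative sketch only. */
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>

struct node {
	bool		pcpu;	/* lock mode, fixed for the node's lifetime */
	struct node	*next;
};

struct cache {
	struct node	*freed_pcpu;	/* freed nodes with percpu-mode locks */
	struct node	*freed_nonpcpu;	/* freed nodes with ordinary locks */
};

/* Loosely mirrors btree_node_to_freedlist(): the list is chosen by the
 * node's existing lock mode; the mode itself is never changed here. */
static void node_to_freedlist(struct cache *bc, struct node *b)
{
	struct node **list = b->pcpu ? &bc->freed_pcpu : &bc->freed_nonpcpu;

	b->next = *list;
	*list = b;
}

/* Loosely mirrors bch2_btree_node_mem_alloc(c, pcpu_read_locks): reuse
 * only nodes whose lock mode already matches the request; otherwise
 * allocate fresh and decide the mode exactly once. */
static struct node *node_mem_alloc(struct cache *bc, bool pcpu_read_locks)
{
	struct node **list = pcpu_read_locks
		? &bc->freed_pcpu
		: &bc->freed_nonpcpu;
	struct node *b = *list;

	if (b) {
		*list = b->next;
		return b;
	}

	b = calloc(1, sizeof(*b));
	if (!b)
		abort();	/* allocation failure: not the interesting path */
	b->pcpu = pcpu_read_locks;	/* mode decided once, at allocation */
	return b;
}

int main(void)
{
	struct cache bc = { NULL, NULL };

	struct node *interior = node_mem_alloc(&bc, true);	/* interior node */
	struct node *leaf = node_mem_alloc(&bc, false);		/* leaf node */

	node_to_freedlist(&bc, interior);
	node_to_freedlist(&bc, leaf);

	/* A non-percpu request must not reuse the percpu-mode node: */
	struct node *again = node_mem_alloc(&bc, false);
	printf("reused leaf: %s, pcpu: %d\n",
	       again == leaf ? "yes" : "no", (int) again->pcpu);
	return 0;
}

Compiled with any C compiler, this prints "reused leaf: yes, pcpu: 0":
the non-percpu request reuses only the node whose lock was already in
non-percpu mode, which is exactly why the patch can drop the
six_lock_pcpu_alloc()/six_lock_pcpu_free() mode switches on reuse paths.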
diff --git a/fs/bcachefs/btree_cache.c b/fs/bcachefs/btree_cache.c
index 42253ca17f043..92a8cc704cabf 100644
--- a/fs/bcachefs/btree_cache.c
+++ b/fs/bcachefs/btree_cache.c
@@ -40,6 +40,14 @@ static inline unsigned btree_cache_can_free(struct btree_cache *bc)
 	return max_t(int, 0, bc->used - bc->reserve);
 }
 
+static void btree_node_to_freedlist(struct btree_cache *bc, struct btree *b)
+{
+	if (b->c.lock.readers)
+		list_move(&b->list, &bc->freed_pcpu);
+	else
+		list_move(&b->list, &bc->freed_nonpcpu);
+}
+
 static void btree_node_data_free(struct bch_fs *c, struct btree *b)
 {
 	struct btree_cache *bc = &c->btree_cache;
@@ -56,7 +64,8 @@ static void btree_node_data_free(struct bch_fs *c, struct btree *b)
 	b->aux_data = NULL;
 
 	bc->used--;
-	list_move(&b->list, &bc->freed);
+
+	btree_node_to_freedlist(bc, b);
 }
 
 static int bch2_btree_cache_cmp_fn(struct rhashtable_compare_arg *arg,
@@ -162,11 +171,6 @@ int bch2_btree_node_hash_insert(struct btree_cache *bc, struct btree *b,
 	b->c.level	= level;
 	b->c.btree_id	= id;
 
-	if (level)
-		six_lock_pcpu_alloc(&b->c.lock);
-	else
-		six_lock_pcpu_free_rcu(&b->c.lock);
-
 	mutex_lock(&bc->lock);
 	ret = __bch2_btree_node_hash_insert(bc, b);
 	if (!ret)
@@ -432,8 +436,10 @@ void bch2_fs_btree_cache_exit(struct bch_fs *c)
 
 	BUG_ON(atomic_read(&c->btree_cache.dirty));
 
-	while (!list_empty(&bc->freed)) {
-		b = list_first_entry(&bc->freed, struct btree, list);
+	list_splice(&bc->freed_pcpu, &bc->freed_nonpcpu);
+
+	while (!list_empty(&bc->freed_nonpcpu)) {
+		b = list_first_entry(&bc->freed_nonpcpu, struct btree, list);
 		list_del(&b->list);
 		six_lock_pcpu_free(&b->c.lock);
 		kfree(b);
@@ -487,7 +493,8 @@ void bch2_fs_btree_cache_init_early(struct btree_cache *bc)
 	mutex_init(&bc->lock);
 	INIT_LIST_HEAD(&bc->live);
 	INIT_LIST_HEAD(&bc->freeable);
-	INIT_LIST_HEAD(&bc->freed);
+	INIT_LIST_HEAD(&bc->freed_pcpu);
+	INIT_LIST_HEAD(&bc->freed_nonpcpu);
 }
 
 /*
@@ -562,9 +569,12 @@ static struct btree *btree_node_cannibalize(struct bch_fs *c)
 	}
 }
 
-struct btree *bch2_btree_node_mem_alloc(struct bch_fs *c)
+struct btree *bch2_btree_node_mem_alloc(struct bch_fs *c, bool pcpu_read_locks)
 {
 	struct btree_cache *bc = &c->btree_cache;
+	struct list_head *freed = pcpu_read_locks
+		? &bc->freed_pcpu
+		: &bc->freed_nonpcpu;
 	struct btree *b, *b2;
 	u64 start_time = local_clock();
 	unsigned flags;
@@ -576,7 +586,7 @@ struct btree *bch2_btree_node_mem_alloc(struct bch_fs *c)
 	 * We never free struct btree itself, just the memory that holds the on
 	 * disk node. Check the freed list before allocating a new one:
 	 */
-	list_for_each_entry(b, &bc->freed, list)
+	list_for_each_entry(b, freed, list)
 		if (!btree_node_reclaim(c, b)) {
 			list_del_init(&b->list);
 			goto got_node;
@@ -586,6 +596,9 @@ struct btree *bch2_btree_node_mem_alloc(struct bch_fs *c)
 	if (!b)
 		goto err_locked;
 
+	if (pcpu_read_locks)
+		six_lock_pcpu_alloc(&b->c.lock);
+
 	BUG_ON(!six_trylock_intent(&b->c.lock));
 	BUG_ON(!six_trylock_write(&b->c.lock));
 got_node:
@@ -598,7 +611,7 @@ got_node:
 		if (!btree_node_reclaim(c, b2)) {
 			swap(b->data, b2->data);
 			swap(b->aux_data, b2->aux_data);
-			list_move(&b2->list, &bc->freed);
+			btree_node_to_freedlist(bc, b2);
 			six_unlock_write(&b2->c.lock);
 			six_unlock_intent(&b2->c.lock);
 			goto got_mem;
@@ -643,7 +656,7 @@ err_locked:
 		if (b) {
 			swap(b->data, b2->data);
 			swap(b->aux_data, b2->aux_data);
-			list_move(&b2->list, &bc->freed);
+			btree_node_to_freedlist(bc, b2);
 			six_unlock_write(&b2->c.lock);
 			six_unlock_intent(&b2->c.lock);
 		} else {
@@ -688,7 +701,7 @@ static noinline struct btree *bch2_btree_node_fill(struct bch_fs *c,
 		return ERR_PTR(-EINTR);
 	}
 
-	b = bch2_btree_node_mem_alloc(c);
+	b = bch2_btree_node_mem_alloc(c, level != 0);
 
 	if (trans && b == ERR_PTR(-ENOMEM)) {
 		trans->memory_allocation_failure = true;
diff --git a/fs/bcachefs/btree_cache.h b/fs/bcachefs/btree_cache.h
index 96f8f90e85a1d..83723805f12a5 100644
--- a/fs/bcachefs/btree_cache.h
+++ b/fs/bcachefs/btree_cache.h
@@ -20,7 +20,7 @@ void bch2_btree_cache_cannibalize_unlock(struct bch_fs *);
 int bch2_btree_cache_cannibalize_lock(struct bch_fs *, struct closure *);
 
 struct btree *__bch2_btree_node_mem_alloc(struct bch_fs *);
-struct btree *bch2_btree_node_mem_alloc(struct bch_fs *);
+struct btree *bch2_btree_node_mem_alloc(struct bch_fs *, bool);
 
 struct btree *bch2_btree_node_get(struct btree_trans *, struct btree_path *,
 				  const struct bkey_i *, unsigned,
diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c
index 53f83340f69a2..3031b566a112b 100644
--- a/fs/bcachefs/btree_io.c
+++ b/fs/bcachefs/btree_io.c
@@ -1542,7 +1542,7 @@ int bch2_btree_root_read(struct bch_fs *c, enum btree_id id,
 		closure_sync(&cl);
 	} while (ret);
 
-	b = bch2_btree_node_mem_alloc(c);
+	b = bch2_btree_node_mem_alloc(c, level != 0);
 	bch2_btree_cache_cannibalize_unlock(c);
 
 	BUG_ON(IS_ERR(b));
diff --git a/fs/bcachefs/btree_key_cache.c b/fs/bcachefs/btree_key_cache.c
index 70f31b5379e70..7e41552a57dfb 100644
--- a/fs/bcachefs/btree_key_cache.c
+++ b/fs/bcachefs/btree_key_cache.c
@@ -166,13 +166,13 @@ btree_key_cache_create(struct bch_fs *c,
 		}
 
 		was_new = false;
+	} else {
+		if (btree_id == BTREE_ID_subvolumes)
+			six_lock_pcpu_alloc(&ck->c.lock);
+		else
+			six_lock_pcpu_free(&ck->c.lock);
 	}
 
-	if (btree_id == BTREE_ID_subvolumes)
-		six_lock_pcpu_alloc(&ck->c.lock);
-	else
-		six_lock_pcpu_free(&ck->c.lock);
-
 	ck->c.level		= 0;
 	ck->c.btree_id		= btree_id;
 	ck->key.btree_id	= btree_id;
diff --git a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h
index 561406b4b7c26..51eb686331bfa 100644
--- a/fs/bcachefs/btree_types.h
+++ b/fs/bcachefs/btree_types.h
@@ -152,7 +152,8 @@ struct btree_cache {
 	struct mutex		lock;
 	struct list_head	live;
 	struct list_head	freeable;
-	struct list_head	freed;
+	struct list_head	freed_pcpu;
+	struct list_head	freed_nonpcpu;
 
 	/* Number of elements in live + freeable lists */
 	unsigned		used;
diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c
index 523d1146b2e2f..43022b340f4e6 100644
--- a/fs/bcachefs/btree_update_interior.c
+++ b/fs/bcachefs/btree_update_interior.c
@@ -181,6 +181,7 @@ static void bch2_btree_node_free_inmem(struct btree_trans *trans,
 static struct btree *__bch2_btree_node_alloc(struct bch_fs *c,
 					     struct disk_reservation *res,
 					     struct closure *cl,
+					     bool interior_node,
 					     unsigned flags)
 {
 	struct write_point *wp;
@@ -242,7 +243,7 @@ retry:
 	bch2_open_bucket_get(c, wp, &ob);
 	bch2_alloc_sectors_done(c, wp);
 mem_alloc:
-	b = bch2_btree_node_mem_alloc(c);
+	b = bch2_btree_node_mem_alloc(c, interior_node);
 	six_unlock_write(&b->c.lock);
 	six_unlock_intent(&b->c.lock);
 
@@ -260,12 +261,13 @@ static struct btree *bch2_btree_node_alloc(struct btree_update *as, unsigned lev
 {
 	struct bch_fs *c = as->c;
 	struct btree *b;
+	struct prealloc_nodes *p = &as->prealloc_nodes[!!level];
 	int ret;
 
 	BUG_ON(level >= BTREE_MAX_DEPTH);
-	BUG_ON(!as->nr_prealloc_nodes);
+	BUG_ON(!p->nr);
 
-	b = as->prealloc_nodes[--as->nr_prealloc_nodes];
+	b = p->b[--p->nr];
 	six_lock_intent(&b->c.lock, NULL, NULL);
 	six_lock_write(&b->c.lock, NULL, NULL);
 
@@ -377,43 +379,49 @@ static struct btree *__btree_root_alloc(struct btree_update *as, unsigned level)
 static void bch2_btree_reserve_put(struct btree_update *as)
 {
 	struct bch_fs *c = as->c;
+	struct prealloc_nodes *p;
 
 	mutex_lock(&c->btree_reserve_cache_lock);
 
-	while (as->nr_prealloc_nodes) {
-		struct btree *b = as->prealloc_nodes[--as->nr_prealloc_nodes];
+	for (p = as->prealloc_nodes;
+	     p < as->prealloc_nodes + ARRAY_SIZE(as->prealloc_nodes);
+	     p++) {
+		while (p->nr) {
+			struct btree *b = p->b[--p->nr];
 
-		six_lock_intent(&b->c.lock, NULL, NULL);
-		six_lock_write(&b->c.lock, NULL, NULL);
+			six_lock_intent(&b->c.lock, NULL, NULL);
+			six_lock_write(&b->c.lock, NULL, NULL);
 
-		if (c->btree_reserve_cache_nr <
-		    ARRAY_SIZE(c->btree_reserve_cache)) {
-			struct btree_alloc *a =
-				&c->btree_reserve_cache[c->btree_reserve_cache_nr++];
+			if (c->btree_reserve_cache_nr <
+			    ARRAY_SIZE(c->btree_reserve_cache)) {
+				struct btree_alloc *a =
+					&c->btree_reserve_cache[c->btree_reserve_cache_nr++];
 
-			a->ob = b->ob;
-			b->ob.nr = 0;
-			bkey_copy(&a->k, &b->key);
-		} else {
-			bch2_open_buckets_put(c, &b->ob);
-		}
+				a->ob = b->ob;
+				b->ob.nr = 0;
+				bkey_copy(&a->k, &b->key);
+			} else {
+				bch2_open_buckets_put(c, &b->ob);
+			}
 
-		__btree_node_free(c, b);
-		six_unlock_write(&b->c.lock);
-		six_unlock_intent(&b->c.lock);
+			__btree_node_free(c, b);
+			six_unlock_write(&b->c.lock);
+			six_unlock_intent(&b->c.lock);
+		}
 	}
 
 	mutex_unlock(&c->btree_reserve_cache_lock);
 }
 
-static int bch2_btree_reserve_get(struct btree_update *as, unsigned nr_nodes,
+static int bch2_btree_reserve_get(struct btree_update *as, unsigned nr_nodes[2],
 				  unsigned flags, struct closure *cl)
 {
 	struct bch_fs *c = as->c;
 	struct btree *b;
+	unsigned interior;
 	int ret;
 
-	BUG_ON(nr_nodes > BTREE_RESERVE_MAX);
+	BUG_ON(nr_nodes[0] + nr_nodes[1] > BTREE_RESERVE_MAX);
 
 	/*
 	 * Protects reaping from the btree node cache and using the btree node
@@ -423,23 +431,28 @@ static int bch2_btree_reserve_get(struct btree_update *as, unsigned nr_nodes,
 	if (ret)
 		return ret;
 
-	while (as->nr_prealloc_nodes < nr_nodes) {
-		b = __bch2_btree_node_alloc(c, &as->disk_res,
-					    flags & BTREE_INSERT_NOWAIT
-					    ? NULL : cl, flags);
-		if (IS_ERR(b)) {
-			ret = PTR_ERR(b);
-			goto err_free;
-		}
+	for (interior = 0; interior < 2; interior++) {
+		struct prealloc_nodes *p = as->prealloc_nodes + interior;
+
+		while (p->nr < nr_nodes[interior]) {
+			b = __bch2_btree_node_alloc(c, &as->disk_res,
+						    flags & BTREE_INSERT_NOWAIT
+						    ? NULL : cl,
+						    interior, flags);
+			if (IS_ERR(b)) {
+				ret = PTR_ERR(b);
+				goto err;
+			}
 
-		as->prealloc_nodes[as->nr_prealloc_nodes++] = b;
+			p->b[p->nr++] = b;
+		}
 	}
 
 	bch2_btree_cache_cannibalize_unlock(c);
 	return 0;
-err_free:
+err:
 	bch2_btree_cache_cannibalize_unlock(c);
-	trace_btree_reserve_get_fail(c, nr_nodes, cl);
+	trace_btree_reserve_get_fail(c, nr_nodes[0] + nr_nodes[1], cl);
 	return ret;
 }
 
@@ -942,7 +955,7 @@ bch2_btree_update_start(struct btree_trans *trans, struct btree_path *path,
 	u64 start_time = local_clock();
 	int disk_res_flags = (flags & BTREE_INSERT_NOFAIL)
 		? BCH_DISK_RESERVATION_NOFAIL : 0;
-	unsigned nr_nodes;
+	unsigned nr_nodes[2];
 	unsigned update_level = level;
 	int journal_flags = 0;
 	int ret = 0;
@@ -954,10 +967,11 @@ bch2_btree_update_start(struct btree_trans *trans, struct btree_path *path,
 
 	closure_init_stack(&cl);
 retry:
-	nr_nodes = 0;
+	nr_nodes[0] = nr_nodes[1] = 0;
+	update_level = level;
 
 	while (1) {
-		nr_nodes += 1 + split;
+		nr_nodes[!!update_level] += 1 + split;
 		update_level++;
 
 		if (!btree_path_node(path, update_level))
@@ -972,7 +986,7 @@ retry:
 
 	/* Might have to allocate a new root: */
 	if (update_level < BTREE_MAX_DEPTH)
-		nr_nodes += 1;
+		nr_nodes[1] += 1;
 
 	if (!bch2_btree_path_upgrade(trans, path, U8_MAX)) {
 		trace_trans_restart_iter_upgrade(trans->fn, _RET_IP_,
@@ -1050,7 +1064,7 @@ retry:
 	}
 
 	ret = bch2_disk_reservation_get(c, &as->disk_res,
-			nr_nodes * btree_sectors(c),
+			(nr_nodes[0] + nr_nodes[1]) * btree_sectors(c),
 			c->opts.metadata_replicas,
 			disk_res_flags);
 	if (ret)
@@ -1085,11 +1099,6 @@ static void bch2_btree_set_root_inmem(struct bch_fs *c, struct btree *b)
 	list_del_init(&b->list);
 	mutex_unlock(&c->btree_cache.lock);
 
-	if (b->c.level)
-		six_lock_pcpu_alloc(&b->c.lock);
-	else
-		six_lock_pcpu_free(&b->c.lock);
-
 	mutex_lock(&c->btree_root_lock);
 	BUG_ON(btree_node_root(c, b) &&
 	       (b->c.level < btree_node_root(c, b)->c.level ||
@@ -2015,7 +2024,7 @@ int bch2_btree_node_update_key(struct btree_trans *trans, struct btree_iter *ite
 			return -EINTR;
 		}
 
-		new_hash = bch2_btree_node_mem_alloc(c);
+		new_hash = bch2_btree_node_mem_alloc(c, false);
 	}
 
 	path->intent_ref++;
@@ -2091,7 +2100,7 @@ void bch2_btree_root_alloc(struct bch_fs *c, enum btree_id id)
 		closure_sync(&cl);
 	} while (ret);
 
-	b = bch2_btree_node_mem_alloc(c);
+	b = bch2_btree_node_mem_alloc(c, false);
 	bch2_btree_cache_cannibalize_unlock(c);
 
 	set_btree_node_fake(b);
diff --git a/fs/bcachefs/btree_update_interior.h b/fs/bcachefs/btree_update_interior.h
index 8dc86fa636d68..e72eb87956167 100644
--- a/fs/bcachefs/btree_update_interior.h
+++ b/fs/bcachefs/btree_update_interior.h
@@ -76,8 +76,10 @@ struct btree_update {
 	struct journal_entry_pin	journal;
 
 	/* Preallocated nodes we reserve when we start the update: */
-	struct btree			*prealloc_nodes[BTREE_UPDATE_NODES_MAX];
-	unsigned			nr_prealloc_nodes;
+	struct prealloc_nodes {
+		struct btree		*b[BTREE_UPDATE_NODES_MAX];
+		unsigned		nr;
+	}				prealloc_nodes[2];
 
 	/* Nodes being freed: */
 	struct keylist			old_keys;
diff --git a/fs/bcachefs/debug.c b/fs/bcachefs/debug.c
index 1fff03d301a93..457fcee7d8e19 100644
--- a/fs/bcachefs/debug.c
+++ b/fs/bcachefs/debug.c
@@ -443,6 +443,11 @@ static void bch2_cached_btree_node_to_text(struct printbuf *out, struct bch_fs *
 	bch2_flags_to_text(out, bch2_btree_node_flags, b->flags);
 	pr_newline(out);
 
+	pr_buf(out, "pcpu read locks: ");
+	pr_tab(out);
+	pr_buf(out, "%u", b->c.lock.readers != NULL);
+	pr_newline(out);
+
 	pr_buf(out, "written:");
 	pr_tab(out);
 	pr_buf(out, "%u", b->written);