From 8a92e545597a3eaca80f2df14eb9a783d96c8445 Mon Sep 17 00:00:00 2001
From: Kent Overstreet
Date: Thu, 19 Nov 2020 19:54:40 -0500
Subject: [PATCH] bcachefs: Ensure journal reclaim runs when btree key cache
 is too dirty

Ensuring the key cache isn't too dirty is critical for ensuring that the
shrinker can reclaim memory.

Signed-off-by: Kent Overstreet
Signed-off-by: Kent Overstreet
---
 fs/bcachefs/btree_key_cache.c |  8 +++++
 fs/bcachefs/btree_key_cache.h |  9 ++++++
 fs/bcachefs/journal_reclaim.c | 53 +++++++++++++++++++++----------
 fs/bcachefs/trace.h           | 59 +++++++++++++++++++++++++++++++++++
 4 files changed, 113 insertions(+), 16 deletions(-)

diff --git a/fs/bcachefs/btree_key_cache.c b/fs/bcachefs/btree_key_cache.c
index 836bb23fe3bca..99e03852b814e 100644
--- a/fs/bcachefs/btree_key_cache.c
+++ b/fs/bcachefs/btree_key_cache.c
@@ -461,6 +461,7 @@ bool bch2_btree_insert_key_cached(struct btree_trans *trans,
 {
 	struct bch_fs *c = trans->c;
 	struct bkey_cached *ck = (void *) iter->l[0].b;
+	bool kick_reclaim = false;
 
 	BUG_ON(insert->u64s > ck->u64s);
 
@@ -485,11 +486,18 @@ bool bch2_btree_insert_key_cached(struct btree_trans *trans,
 		set_bit(BKEY_CACHED_DIRTY, &ck->flags);
 		c->btree_key_cache.nr_dirty++;
+
+		if (bch2_nr_btree_keys_need_flush(c))
+			kick_reclaim = true;
+
 		mutex_unlock(&c->btree_key_cache.lock);
 	}
 
 	bch2_journal_pin_update(&c->journal, trans->journal_res.seq,
 				&ck->journal, btree_key_cache_journal_flush);
+
+	if (kick_reclaim)
+		mod_delayed_work(c->journal_reclaim_wq, &c->journal.reclaim_work, 0);
 
 	return true;
 }
 
diff --git a/fs/bcachefs/btree_key_cache.h b/fs/bcachefs/btree_key_cache.h
index e64a8e9c726ff..7723a2178430f 100644
--- a/fs/bcachefs/btree_key_cache.h
+++ b/fs/bcachefs/btree_key_cache.h
@@ -1,6 +1,15 @@
 #ifndef _BCACHEFS_BTREE_KEY_CACHE_H
 #define _BCACHEFS_BTREE_KEY_CACHE_H
 
+static inline size_t bch2_nr_btree_keys_need_flush(struct bch_fs *c)
+{
+	size_t nr_dirty = READ_ONCE(c->btree_key_cache.nr_dirty);
+	size_t nr_keys = READ_ONCE(c->btree_key_cache.nr_keys);
+	size_t max_dirty = 1024 + (nr_keys * 3) / 4;
+
+	return max_t(ssize_t, 0, nr_dirty - max_dirty);
+}
+
 struct bkey_cached *
 bch2_btree_key_cache_find(struct bch_fs *, enum btree_id, struct bpos);
 
diff --git a/fs/bcachefs/journal_reclaim.c b/fs/bcachefs/journal_reclaim.c
index 1cd9c11a37f0e..7f8ab13256c86 100644
--- a/fs/bcachefs/journal_reclaim.c
+++ b/fs/bcachefs/journal_reclaim.c
@@ -1,11 +1,13 @@
 // SPDX-License-Identifier: GPL-2.0
 #include "bcachefs.h"
+#include "btree_key_cache.h"
 #include "journal.h"
 #include "journal_io.h"
 #include "journal_reclaim.h"
 #include "replicas.h"
 #include "super.h"
+#include "trace.h"
 
 /* Free space calculations: */
 
@@ -432,7 +434,6 @@ journal_get_next_pin(struct journal *j, u64 max_seq, u64 *seq)
 		list_move(&ret->list, &pin_list->flushed);
 		BUG_ON(j->flush_in_progress);
 		j->flush_in_progress = ret;
-		j->last_flushed = jiffies;
 	}
 
 	spin_unlock(&j->lock);
@@ -441,17 +442,24 @@
 }
 
 /* returns true if we did work */
-static bool journal_flush_pins(struct journal *j, u64 seq_to_flush,
-			       unsigned min_nr)
+static u64 journal_flush_pins(struct journal *j, u64 seq_to_flush,
+			      unsigned min_nr)
 {
 	struct journal_entry_pin *pin;
-	bool ret = false;
-	u64 seq;
+	u64 seq, ret = 0;
 
 	lockdep_assert_held(&j->reclaim_lock);
 
-	while ((pin = journal_get_next_pin(j, min_nr
-				? U64_MAX : seq_to_flush, &seq))) {
+	while (1) {
+		cond_resched();
+
+		j->last_flushed = jiffies;
+
+		pin = journal_get_next_pin(j, min_nr
+				? U64_MAX : seq_to_flush, &seq);
+		if (!pin)
+			break;
+
 		if (min_nr)
 			min_nr--;
 
@@ -460,7 +468,7 @@ static bool journal_flush_pins(struct journal *j, u64 seq_to_flush,
 		BUG_ON(j->flush_in_progress != pin);
 		j->flush_in_progress = NULL;
 		wake_up(&j->pin_flush_wait);
-		ret = true;
+		ret++;
 	}
 
 	return ret;
@@ -527,8 +535,8 @@ static u64 journal_seq_to_flush(struct journal *j)
 void bch2_journal_reclaim(struct journal *j)
 {
 	struct bch_fs *c = container_of(j, struct bch_fs, journal);
-	unsigned min_nr = 0;
-	u64 seq_to_flush = 0;
+	u64 seq_to_flush, nr_flushed = 0;
+	size_t min_nr;
 
 	lockdep_assert_held(&j->reclaim_lock);
 
@@ -549,12 +557,25 @@ void bch2_journal_reclaim(struct journal *j)
 		if (j->prereserved.reserved * 2 > j->prereserved.remaining)
 			min_nr = 1;
 
-		if ((atomic_read(&c->btree_cache.dirty) * 4 >
-		     c->btree_cache.used * 3) ||
-		    (c->btree_key_cache.nr_dirty * 4 >
-		     c->btree_key_cache.nr_keys))
+		if (atomic_read(&c->btree_cache.dirty) * 4 >
+		    c->btree_cache.used * 3)
 			min_nr = 1;
-	} while (journal_flush_pins(j, seq_to_flush, min_nr));
+
+		min_nr = max(min_nr, bch2_nr_btree_keys_need_flush(c));
+
+		trace_journal_reclaim_start(c,
+				min_nr,
+				j->prereserved.reserved,
+				j->prereserved.remaining,
+				atomic_read(&c->btree_cache.dirty),
+				c->btree_cache.used,
+				c->btree_key_cache.nr_dirty,
+				c->btree_key_cache.nr_keys);
+
+		nr_flushed += journal_flush_pins(j, seq_to_flush, min_nr);
+	} while (min_nr);
+
+	trace_journal_reclaim_finish(c, nr_flushed);
 
 	if (!bch2_journal_error(j))
 		queue_delayed_work(c->journal_reclaim_wq, &j->reclaim_work,
@@ -582,7 +603,7 @@ static int journal_flush_done(struct journal *j, u64 seq_to_flush,
 
 	mutex_lock(&j->reclaim_lock);
 
-	*did_work = journal_flush_pins(j, seq_to_flush, 0);
+	*did_work = journal_flush_pins(j, seq_to_flush, 0) != 0;
 
 	spin_lock(&j->lock);
 	/*
diff --git a/fs/bcachefs/trace.h b/fs/bcachefs/trace.h
index 09653c7ed858d..2afc09ad64ea8 100644
--- a/fs/bcachefs/trace.h
+++ b/fs/bcachefs/trace.h
@@ -121,6 +121,65 @@ DEFINE_EVENT(bio, journal_write,
 	TP_ARGS(bio)
 );
 
+TRACE_EVENT(journal_reclaim_start,
+	TP_PROTO(struct bch_fs *c, u64 min_nr,
+		 u64 prereserved, u64 prereserved_total,
+		 u64 btree_cache_dirty, u64 btree_cache_total,
+		 u64 btree_key_cache_dirty, u64 btree_key_cache_total),
+	TP_ARGS(c, min_nr, prereserved, prereserved_total,
+		btree_cache_dirty, btree_cache_total,
+		btree_key_cache_dirty, btree_key_cache_total),
+
+	TP_STRUCT__entry(
+		__array(char,		uuid,			16	)
+		__field(u64,		min_nr			)
+		__field(u64,		prereserved		)
+		__field(u64,		prereserved_total	)
+		__field(u64,		btree_cache_dirty	)
+		__field(u64,		btree_cache_total	)
+		__field(u64,		btree_key_cache_dirty	)
+		__field(u64,		btree_key_cache_total	)
+	),
+
+	TP_fast_assign(
+		memcpy(__entry->uuid, c->sb.user_uuid.b, 16);
+		__entry->min_nr			= min_nr;
+		__entry->prereserved		= prereserved;
+		__entry->prereserved_total	= prereserved_total;
+		__entry->btree_cache_dirty	= btree_cache_dirty;
+		__entry->btree_cache_total	= btree_cache_total;
+		__entry->btree_key_cache_dirty	= btree_key_cache_dirty;
+		__entry->btree_key_cache_total	= btree_key_cache_total;
+	),
+
+	TP_printk("%pU min %llu prereserved %llu/%llu btree cache %llu/%llu key cache %llu/%llu",
+		  __entry->uuid,
+		  __entry->min_nr,
+		  __entry->prereserved,
+		  __entry->prereserved_total,
+		  __entry->btree_cache_dirty,
+		  __entry->btree_cache_total,
+		  __entry->btree_key_cache_dirty,
+		  __entry->btree_key_cache_total)
+);
+
+TRACE_EVENT(journal_reclaim_finish,
+	TP_PROTO(struct bch_fs *c, u64 nr_flushed),
+	TP_ARGS(c, nr_flushed),
+
+	TP_STRUCT__entry(
+		__array(char,		uuid,			16	)
+		__field(u64,		nr_flushed		)
+	),
+
+	TP_fast_assign(
+		memcpy(__entry->uuid, c->sb.user_uuid.b, 16);
+		__entry->nr_flushed		= nr_flushed;
+	),
+
+	TP_printk("%pU flushed %llu", __entry->uuid, __entry->nr_flushed)
+);
+
 /* bset.c: */
 
 DEFINE_EVENT(bpos, bkey_pack_pos_fail,
-- 
2.30.2
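
For reference, and not part of the patch itself: the flush heuristic added in
btree_key_cache.h allows up to max_dirty = 1024 + (nr_keys * 3) / 4 dirty keys,
and bch2_nr_btree_keys_need_flush() returns how far nr_dirty is above that
threshold (zero if it is not); bch2_journal_reclaim() then uses that excess as
the minimum number of journal pins to flush. The stand-alone user-space sketch
below re-implements only that arithmetic for illustration; the function name,
the simplified clamp, and the example numbers are assumptions, not kernel code.

#include <stdio.h>
#include <stddef.h>

/*
 * Stand-alone sketch (not kernel code) of the dirty-key threshold used by
 * bch2_nr_btree_keys_need_flush(): allow 1024 dirty keys plus 3/4 of the
 * total number of cached keys, and report how many dirty keys exceed that.
 */
static size_t nr_btree_keys_need_flush(size_t nr_dirty, size_t nr_keys)
{
	size_t max_dirty = 1024 + (nr_keys * 3) / 4;

	/* same effect as the kernel's max_t(ssize_t, 0, ...) clamp */
	return nr_dirty > max_dirty ? nr_dirty - max_dirty : 0;
}

int main(void)
{
	/* below the threshold: nothing needs flushing */
	printf("%zu\n", nr_btree_keys_need_flush(50000, 100000)); /* 0 */

	/* above the threshold: 90000 - (1024 + 75000) = 13976 keys over */
	printf("%zu\n", nr_btree_keys_need_flush(90000, 100000)); /* 13976 */

	return 0;
}

With 100000 cached keys the threshold is 76024 dirty keys: reclaim is not
forced below it, and once the dirty count exceeds it the excess feeds min_nr,
which also causes journal reclaim to be kicked from bch2_btree_insert_key_cached().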