bcachefs: Add a pre-reserve mechanism for the journal
author: Kent Overstreet <kent.overstreet@gmail.com>
Tue, 19 Feb 2019 18:41:36 +0000 (13:41 -0500)
committer: Kent Overstreet <kent.overstreet@linux.dev>
Sun, 22 Oct 2023 21:08:17 +0000 (17:08 -0400)
Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
fs/bcachefs/journal.c
fs/bcachefs/journal.h
fs/bcachefs/journal_io.c
fs/bcachefs/journal_reclaim.c
fs/bcachefs/journal_types.h

index ba6adf11ef4261fa3bdab69de54224d72ae37243..0aae8fd74c8a5215a0e81ecfe9af27795818dbe6 100644 (file)
@@ -343,6 +343,16 @@ retry:
                return 0;
        }
 
+       if (!(flags & JOURNAL_RES_GET_RESERVED) &&
+           !test_bit(JOURNAL_MAY_GET_UNRESERVED, &j->flags)) {
+               /*
+                * Don't want to close current journal entry, just need to
+                * invoke reclaim:
+                */
+               ret = -ENOSPC;
+               goto unlock;
+       }
+
        /*
         * If we couldn't get a reservation because the current buf filled up,
         * and we had room for a bigger entry on disk, signal that we want to
@@ -366,7 +376,7 @@ retry:
        } else {
                ret = journal_entry_open(j);
        }
-
+unlock:
        if ((ret == -EAGAIN || ret == -ENOSPC) &&
            !j->res_get_blocked_start)
                j->res_get_blocked_start = local_clock() ?: 1;
@@ -378,6 +388,8 @@ retry:
                goto retry;
 
        if (ret == -ENOSPC) {
+               BUG_ON(!can_discard && (flags & JOURNAL_RES_GET_RESERVED));
+
                /*
                 * Journal is full - can't rely on reclaim from work item due to
                 * freezing:
@@ -423,6 +435,32 @@ int bch2_journal_res_get_slowpath(struct journal *j, struct journal_res *res,
        return ret;
 }
 
+/* journal_preres: */
+
+static bool journal_preres_available(struct journal *j,
+                                    struct journal_preres *res,
+                                    unsigned new_u64s)
+{
+       bool ret = bch2_journal_preres_get_fast(j, res, new_u64s);
+
+       if (!ret)
+               bch2_journal_reclaim_work(&j->reclaim_work.work);
+
+       return ret;
+}
+
+int __bch2_journal_preres_get(struct journal *j,
+                             struct journal_preres *res,
+                             unsigned new_u64s)
+{
+       int ret;
+
+       closure_wait_event(&j->preres_wait,
+                  (ret = bch2_journal_error(j)) ||
+                  journal_preres_available(j, res, new_u64s));
+       return ret;
+}
+
 /* journal_entry_res: */
 
 void bch2_journal_entry_res_resize(struct journal *j,
@@ -1110,11 +1148,16 @@ ssize_t bch2_journal_print_debug(struct journal *j, char *buf)
               "seq:\t\t\t%llu\n"
               "last_seq:\t\t%llu\n"
               "last_seq_ondisk:\t%llu\n"
+              "prereserved:\t\t%u/%u\n"
+              "current entry sectors:\t%u\n"
               "current entry:\t\t",
               fifo_used(&j->pin),
               journal_cur_seq(j),
               journal_last_seq(j),
-              j->last_seq_ondisk);
+              j->last_seq_ondisk,
+              j->prereserved.reserved,
+              j->prereserved.remaining,
+              j->cur_entry_sectors);
 
        switch (s.cur_entry_offset) {
        case JOURNAL_ENTRY_ERROR_VAL:
@@ -1136,8 +1179,9 @@ ssize_t bch2_journal_print_debug(struct journal *j, char *buf)
               journal_state_count(s, s.idx));
 
        if (s.prev_buf_unwritten)
-               pr_buf(&out, "yes, ref %u\n",
-                      journal_state_count(s, !s.idx));
+               pr_buf(&out, "yes, ref %u sectors %u\n",
+                      journal_state_count(s, !s.idx),
+                      journal_prev_buf(j)->sectors);
        else
                pr_buf(&out, "no\n");
 
index 77d59fb0b15133bdf1bd464e6a6265fe02dee99c..809cf25f5a03cc18341a5bf05f5f4e6108446467 100644 (file)
@@ -119,6 +119,7 @@ static inline void journal_wake(struct journal *j)
 {
        wake_up(&j->wait);
        closure_wake_up(&j->async_wait);
+       closure_wake_up(&j->preres_wait);
 }
 
 static inline struct journal_buf *journal_cur_buf(struct journal *j)
@@ -274,6 +275,7 @@ int bch2_journal_res_get_slowpath(struct journal *, struct journal_res *,
 
 #define JOURNAL_RES_GET_NONBLOCK       (1 << 0)
 #define JOURNAL_RES_GET_CHECK          (1 << 1)
+#define JOURNAL_RES_GET_RESERVED       (1 << 2)
 
 static inline int journal_res_get_fast(struct journal *j,
                                       struct journal_res *res,
@@ -294,6 +296,10 @@ static inline int journal_res_get_fast(struct journal *j,
 
                EBUG_ON(!journal_state_count(new, new.idx));
 
+               if (!(flags & JOURNAL_RES_GET_RESERVED) &&
+                   !test_bit(JOURNAL_MAY_GET_UNRESERVED, &j->flags))
+                       return 0;
+
                if (flags & JOURNAL_RES_GET_CHECK)
                        return 1;
 
@@ -333,6 +339,89 @@ out:
        return 0;
 }
 
+/* journal_preres: */
+
+static inline bool journal_check_may_get_unreserved(struct journal *j)
+{
+       union journal_preres_state s = READ_ONCE(j->prereserved);
+       bool ret = s.reserved <= s.remaining &&
+               fifo_free(&j->pin) > 8;
+
+       lockdep_assert_held(&j->lock);
+
+       if (ret != test_bit(JOURNAL_MAY_GET_UNRESERVED, &j->flags)) {
+               if (ret) {
+                       set_bit(JOURNAL_MAY_GET_UNRESERVED, &j->flags);
+                       journal_wake(j);
+               } else {
+                       clear_bit(JOURNAL_MAY_GET_UNRESERVED, &j->flags);
+               }
+       }
+       return ret;
+}
+
+static inline void bch2_journal_preres_put(struct journal *j,
+                                          struct journal_preres *res)
+{
+       union journal_preres_state s = { .reserved = res->u64s };
+
+       if (!res->u64s)
+               return;
+
+       s.v = atomic64_sub_return(s.v, &j->prereserved.counter);
+       res->u64s = 0;
+       closure_wake_up(&j->preres_wait);
+
+       if (s.reserved <= s.remaining &&
+           !test_bit(JOURNAL_MAY_GET_UNRESERVED, &j->flags)) {
+               spin_lock(&j->lock);
+               journal_check_may_get_unreserved(j);
+               spin_unlock(&j->lock);
+       }
+}
+
+int __bch2_journal_preres_get(struct journal *,
+                       struct journal_preres *, unsigned);
+
+static inline int bch2_journal_preres_get_fast(struct journal *j,
+                                              struct journal_preres *res,
+                                              unsigned new_u64s)
+{
+       int d = new_u64s - res->u64s;
+       union journal_preres_state old, new;
+       u64 v = atomic64_read(&j->prereserved.counter);
+
+       do {
+               old.v = new.v = v;
+
+               new.reserved += d;
+
+               if (new.reserved > new.remaining)
+                       return 0;
+       } while ((v = atomic64_cmpxchg(&j->prereserved.counter,
+                                      old.v, new.v)) != old.v);
+
+       res->u64s += d;
+       return 1;
+}
+
+static inline int bch2_journal_preres_get(struct journal *j,
+                                         struct journal_preres *res,
+                                         unsigned new_u64s,
+                                         unsigned flags)
+{
+       if (new_u64s <= res->u64s)
+               return 0;
+
+       if (bch2_journal_preres_get_fast(j, res, new_u64s))
+               return 0;
+
+       if (flags & JOURNAL_RES_GET_NONBLOCK)
+               return -EAGAIN;
+
+       return __bch2_journal_preres_get(j, res, new_u64s);
+}
+
 /* journal_entry_res: */
 
 void bch2_journal_entry_res_resize(struct journal *,
index 07cfbb975c37f66acb3e458fa3961c95c9197cb4..db95257cec11a282b68c87d1906b0abdf02b105c 100644 (file)
@@ -974,6 +974,12 @@ static int journal_write_alloc(struct journal *j, struct journal_buf *w,
                                        journal_space_discarded)) {
                        ja->cur_idx = (ja->cur_idx + 1) % ja->nr;
                        ja->sectors_free = ca->mi.bucket_size;
+
+                       /*
+                        * ja->bucket_seq[ja->cur_idx] must always have
+                        * something sensible:
+                        */
+                       ja->bucket_seq[ja->cur_idx] = le64_to_cpu(w->data->seq);
                }
        }
 
index a3c53b78ad10a5dcc14020d072ca4a91efc5b030..053fa4aa4f5f5a37d474a20fc47ed21060ae5215 100644 (file)
@@ -49,6 +49,18 @@ unsigned bch2_journal_dev_buckets_available(struct journal *j,
        return available;
 }
 
+static void journal_set_remaining(struct journal *j, unsigned u64s_remaining)
+{
+       union journal_preres_state old, new;
+       u64 v = atomic64_read(&j->prereserved.counter);
+
+       do {
+               old.v = new.v = v;
+               new.remaining = u64s_remaining;
+       } while ((v = atomic64_cmpxchg(&j->prereserved.counter,
+                                      old.v, new.v)) != old.v);
+}
+
 static struct journal_space {
        unsigned        next_entry;
        unsigned        remaining;
@@ -124,8 +136,9 @@ void bch2_journal_space_available(struct journal *j)
        struct bch_fs *c = container_of(j, struct bch_fs, journal);
        struct bch_dev *ca;
        struct journal_space discarded, clean_ondisk, clean;
-       unsigned max_entry_size         = min(j->buf[0].buf_size >> 9,
-                                             j->buf[1].buf_size >> 9);
+       unsigned overhead, u64s_remaining = 0;
+       unsigned max_entry_size  = min(j->buf[0].buf_size >> 9,
+                                      j->buf[1].buf_size >> 9);
        unsigned i, nr_online = 0, nr_devs_want;
        bool can_discard = false;
        int ret = 0;
@@ -176,9 +189,17 @@ void bch2_journal_space_available(struct journal *j)
 
        if (!discarded.next_entry)
                ret = -ENOSPC;
+
+       overhead = DIV_ROUND_UP(clean.remaining, max_entry_size) *
+               journal_entry_overhead(j);
+       u64s_remaining = clean.remaining << 6;
+       u64s_remaining = max_t(int, 0, u64s_remaining - overhead);
+       u64s_remaining /= 4;
 out:
        j->cur_entry_sectors    = !ret ? discarded.next_entry : 0;
        j->cur_entry_error      = ret;
+       journal_set_remaining(j, u64s_remaining);
+       journal_check_may_get_unreserved(j);
 
        if (!ret)
                journal_wake(j);
@@ -454,7 +475,7 @@ void bch2_journal_reclaim(struct journal *j)
 {
        struct bch_fs *c = container_of(j, struct bch_fs, journal);
        struct bch_dev *ca;
-       unsigned iter, bucket_to_flush, min_nr = 0;
+       unsigned iter, min_nr = 0;
        u64 seq_to_flush = 0;
 
        lockdep_assert_held(&j->reclaim_lock);
@@ -465,13 +486,22 @@ void bch2_journal_reclaim(struct journal *j)
 
        for_each_rw_member(ca, c, iter) {
                struct journal_device *ja = &ca->journal;
+               unsigned nr_buckets, bucket_to_flush;
 
                if (!ja->nr)
                        continue;
 
-
                /* Try to keep the journal at most half full: */
-               bucket_to_flush = (ja->cur_idx + (ja->nr >> 1)) % ja->nr;
+               nr_buckets = ja->nr / 2;
+
+               /* And include pre-reservations: */
+               nr_buckets += DIV_ROUND_UP(j->prereserved.reserved,
+                                          (ca->mi.bucket_size << 6) -
+                                          journal_entry_overhead(j));
+
+               nr_buckets = min(nr_buckets, ja->nr);
+
+               bucket_to_flush = (ja->cur_idx + nr_buckets) % ja->nr;
                seq_to_flush = max_t(u64, seq_to_flush,
                                     ja->bucket_seq[bucket_to_flush]);
        }
@@ -490,6 +520,9 @@ void bch2_journal_reclaim(struct journal *j)
                       msecs_to_jiffies(j->reclaim_delay_ms)))
                min_nr = 1;
 
+       if (j->prereserved.reserved * 2 > j->prereserved.remaining)
+               min_nr = 1;
+
        journal_flush_pins(j, seq_to_flush, min_nr);
 
        if (!test_bit(BCH_FS_RO, &c->flags))
index c91a21e0780985dcfeaac553027aabbb8e7ca687..85bf5e2706f7be6ca2e9a32d007826fba78d6b65 100644 (file)
@@ -80,6 +80,14 @@ struct journal_res {
        u64                     seq;
 };
 
+/*
+ * For reserving space in the journal prior to getting a reservation on a
+ * particular journal entry:
+ */
+struct journal_preres {
+       unsigned                u64s;
+};
+
 union journal_res_state {
        struct {
                atomic64_t      counter;
@@ -98,6 +106,21 @@ union journal_res_state {
        };
 };
 
+union journal_preres_state {
+       struct {
+               atomic64_t      counter;
+       };
+
+       struct {
+               u64             v;
+       };
+
+       struct {
+               u32             reserved;
+               u32             remaining;
+       };
+};
+
 /* bytes: */
 #define JOURNAL_ENTRY_SIZE_MIN         (64U << 10) /* 64k */
 #define JOURNAL_ENTRY_SIZE_MAX         (4U  << 20) /* 4M */
@@ -122,6 +145,7 @@ enum {
        JOURNAL_STARTED,
        JOURNAL_NEED_WRITE,
        JOURNAL_NOT_EMPTY,
+       JOURNAL_MAY_GET_UNRESERVED,
 };
 
 /* Embedded in struct bch_fs */
@@ -142,6 +166,8 @@ struct journal {
         */
        int                     cur_entry_error;
 
+       union journal_preres_state prereserved;
+
        /* Reserved space in journal entry to be used just prior to write */
        unsigned                entry_u64s_reserved;
 
@@ -161,6 +187,7 @@ struct journal {
        /* Used when waiting because the journal was full */
        wait_queue_head_t       wait;
        struct closure_waitlist async_wait;
+       struct closure_waitlist preres_wait;
 
        struct closure          io;
        struct delayed_work     write_work;