bcachefs: Don't require flush/fua on every journal write
author Kent Overstreet <kent.overstreet@gmail.com>
Sat, 14 Nov 2020 14:59:58 +0000 (09:59 -0500)
committer Kent Overstreet <kent.overstreet@linux.dev>
Sun, 22 Oct 2023 21:08:49 +0000 (17:08 -0400)
This patch adds a flag to journal entries which, if set, indicates that
they weren't done as flush/fua writes.
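
On disk the flag is a single bit in the jset flags word; the accessor
pair is generated by LE32_BITMASK() (see the bcachefs_format.h hunk
below):

    /* bit 5 of struct jset's flags field: */
    LE32_BITMASK(JSET_NO_FLUSH,    struct jset, flags, 5, 6);

    /* this generates JSET_NO_FLUSH(j) and SET_JSET_NO_FLUSH(j, v) */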

 - non-flush/fua journal writes don't update last_seq (i.e. they don't
   free up space in the journal), so the journal free space
   calculations now check whether non-flush journal writes are
   currently allowed (i.e. are we low on free space, or would a flush
   write free up a lot of space in the journal)

 - write_delay_ms, the user-configurable option for when open journal
   entries are automatically written, is now interpreted as the maximum
   delay between flush journal writes (default 1 second).

 - bch2_journal_flush_seq_async is changed to ensure that a flush write
   covering a sequence number >= the one requested has completed

 - journal read/replay must now ignore, and blacklist, any journal
   entries newer than the most recent flush entry in the journal. The
   handling of the read_entire_journal option has also been improved:
   struct journal_replay now has a new field, 'ignore', for entries
   that were read but should not be used (see the read-side sketch
   after this list).

 - assorted refactoring and improvements related to journal read in
   journal_io.c and recovery.c
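
For reference, a condensed sketch of the new read-side filtering
(simplified from the bch2_journal_read() changes in the journal_io.c
hunk below; the helper name is hypothetical, and locking, fsck
reporting and memory management are elided):

    /*
     * Walk the journal entries newest-to-oldest until we find the most
     * recent flush entry; anything newer was a noflush write, so mark
     * it 'ignore' (it will also be blacklisted):
     */
    static void drop_unflushed_entries(struct bch_fs *c,
                                       struct list_head *list,
                                       u64 *last_seq, u64 *blacklist_seq)
    {
            struct journal_replay *i, *t;

            list_for_each_entry_safe_reverse(i, t, list, list) {
                    if (!JSET_NO_FLUSH(&i->j)) {
                            *last_seq       = le64_to_cpu(i->j.last_seq);
                            *blacklist_seq  = le64_to_cpu(i->j.seq) + 1;
                            break;
                    }

                    journal_replay_free(c, i);
            }
    }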

Previously, we had to issue a flush/fua write every time we accumulated
a full journal entry - typically the bucket size. Now we need to issue
them much less frequently: when an fsync is requested, when it's been
more than write_delay_ms since the last flush, or when we need to free
up space in the journal. This is a significant performance improvement
on many write-heavy workloads.
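
The write-side decision reduces to a cheap check at journal write time;
here is a condensed sketch of the logic this patch adds to
bch2_journal_write() (the can_skip_flush variable is introduced for
clarity and doesn't appear in the patch):

    /* May this journal write skip flush/fua? */
    bool can_skip_flush =
            (c->sb.features & (1ULL << BCH_FEATURE_journal_no_flush)) &&
            !w->must_flush &&  /* nothing (e.g. fsync) demanded a flush */
            (jiffies - j->last_flush_write) <
                    msecs_to_jiffies(j->write_delay_ms) &&
            test_bit(JOURNAL_MAY_SKIP_FLUSH, &j->flags);

    if (can_skip_flush) {
            w->noflush = true;
            SET_JSET_NO_FLUSH(jset, true);
            /* a noflush entry can't advance last_seq: */
            jset->last_seq = cpu_to_le64(j->last_seq_ondisk);
    } else {
            j->last_flush_write = jiffies;
    }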

Signed-off-by: Kent Overstreet <kent.overstreet@gmail.com>
Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
fs/bcachefs/bcachefs_format.h
fs/bcachefs/journal.c
fs/bcachefs/journal.h
fs/bcachefs/journal_io.c
fs/bcachefs/journal_io.h
fs/bcachefs/journal_reclaim.c
fs/bcachefs/journal_seq_blacklist.c
fs/bcachefs/journal_types.h
fs/bcachefs/recovery.c

fs/bcachefs/bcachefs_format.h
index f072e865e43f70bc9c5200e73840ed188f1f6d0a..7df2bc7ecd4f49320648316db117d92f85fa9b68 100644
@@ -1336,14 +1336,16 @@ LE64_BITMASK(BCH_SB_ERASURE_CODE,       struct bch_sb, flags[3],  0, 16);
        x(extents_above_btree_updates,  12)     \
        x(btree_updates_journalled,     13)     \
        x(reflink_inline_data,          14)     \
-       x(new_varint,                   15)
+       x(new_varint,                   15)     \
+       x(journal_no_flush,             16)
 
 #define BCH_SB_FEATURES_ALL                            \
        ((1ULL << BCH_FEATURE_new_siphash)|             \
         (1ULL << BCH_FEATURE_new_extent_overwrite)|    \
         (1ULL << BCH_FEATURE_btree_ptr_v2)|            \
         (1ULL << BCH_FEATURE_extents_above_btree_updates)|\
-        (1ULL << BCH_FEATURE_new_varint))\
+        (1ULL << BCH_FEATURE_new_varint)|              \
+        (1ULL << BCH_FEATURE_journal_no_flush))
 
 enum bch_sb_feature {
 #define x(f, n) BCH_FEATURE_##f,
@@ -1582,6 +1584,7 @@ struct jset {
 
 LE32_BITMASK(JSET_CSUM_TYPE,   struct jset, flags, 0, 4);
 LE32_BITMASK(JSET_BIG_ENDIAN,  struct jset, flags, 4, 5);
+LE32_BITMASK(JSET_NO_FLUSH,    struct jset, flags, 5, 6);
 
 #define BCH_JOURNAL_BUCKETS_MIN                8
 
fs/bcachefs/journal.c
index 3bbb23d7739a78a89ef7cbf59f2ea2beffa55632..31168754d6b8395869696a66f7161252f1c3806f 100644
@@ -79,6 +79,8 @@ static void bch2_journal_buf_init(struct journal *j)
        struct journal_buf *buf = journal_cur_buf(j);
 
        bkey_extent_init(&buf->key);
+       buf->noflush    = false;
+       buf->must_flush = false;
 
        memset(buf->has_inode, 0, sizeof(buf->has_inode));
 
@@ -574,7 +576,7 @@ int bch2_journal_flush_seq_async(struct journal *j, u64 seq,
        struct journal_buf *buf;
        int ret = 0;
 
-       if (seq <= j->seq_ondisk)
+       if (seq <= j->flushed_seq_ondisk)
                return 1;
 
        spin_lock(&j->lock);
@@ -585,16 +587,53 @@ int bch2_journal_flush_seq_async(struct journal *j, u64 seq,
                goto out;
        }
 
-       if (seq <= j->seq_ondisk) {
+       if (seq <= j->flushed_seq_ondisk) {
                ret = 1;
                goto out;
        }
 
-       if (parent &&
-           (buf = journal_seq_to_buf(j, seq)))
-               if (!closure_wait(&buf->wait, parent))
+       /* if seq was written, but not flushed - flush a newer one instead */
+       seq = max(seq, last_unwritten_seq(j));
+
+recheck_need_open:
+       if (seq == journal_cur_seq(j) && !journal_entry_is_open(j)) {
+               struct journal_res res = { 0 };
+
+               spin_unlock(&j->lock);
+
+               ret = bch2_journal_res_get(j, &res, jset_u64s(0), 0);
+               if (ret)
+                       return ret;
+
+               seq = res.seq;
+               buf = j->buf + (seq & JOURNAL_BUF_MASK);
+               buf->must_flush = true;
+               set_bit(JOURNAL_NEED_WRITE, &j->flags);
+
+               if (parent && !closure_wait(&buf->wait, parent))
                        BUG();
 
+               bch2_journal_res_put(j, &res);
+
+               spin_lock(&j->lock);
+               goto want_write;
+       }
+
+       /*
+        * if write was kicked off without a flush, flush the next sequence
+        * number instead
+        */
+       buf = journal_seq_to_buf(j, seq);
+       if (buf->noflush) {
+               seq++;
+               goto recheck_need_open;
+       }
+
+       buf->must_flush = true;
+
+       if (parent && !closure_wait(&buf->wait, parent))
+               BUG();
+want_write:
        if (seq == journal_cur_seq(j))
                journal_entry_want_write(j);
 out:
@@ -979,6 +1018,7 @@ int bch2_fs_journal_start(struct journal *j, u64 cur_seq,
        spin_lock(&j->lock);
 
        set_bit(JOURNAL_STARTED, &j->flags);
+       j->last_flush_write = jiffies;
 
        journal_pin_new_entry(j, 1);
 
@@ -1116,6 +1156,8 @@ void bch2_journal_debug_to_text(struct printbuf *out, struct journal *j)
               "last_seq:\t\t%llu\n"
               "last_seq_ondisk:\t%llu\n"
               "prereserved:\t\t%u/%u\n"
+              "nr flush writes:\t%llu\n"
+              "nr noflush writes:\t%llu\n"
               "nr direct reclaim:\t%llu\n"
               "nr background reclaim:\t%llu\n"
               "current entry sectors:\t%u\n"
@@ -1127,6 +1169,8 @@ void bch2_journal_debug_to_text(struct printbuf *out, struct journal *j)
               j->last_seq_ondisk,
               j->prereserved.reserved,
               j->prereserved.remaining,
+              j->nr_flush_writes,
+              j->nr_noflush_writes,
               j->nr_direct_reclaim,
               j->nr_background_reclaim,
               j->cur_entry_sectors,
fs/bcachefs/journal.h
index 1b6175cd6f1bd3759de1e606c54ffb3d8767d0e7..2c0014c3c02f58f1e01244e0673991ea2486d673 100644
@@ -136,7 +136,7 @@ static inline u64 journal_last_seq(struct journal *j)
 
 static inline u64 journal_cur_seq(struct journal *j)
 {
-       BUG_ON(j->pin.back - 1 != atomic64_read(&j->seq));
+       EBUG_ON(j->pin.back - 1 != atomic64_read(&j->seq));
 
        return j->pin.back - 1;
 }
fs/bcachefs/journal_io.c
index 1aeeb58d3c2aeae276e793fc1ca820e73d9e5f19..26556bb381b2725506f17c760794ae1173b1b887 100644
 #include "journal.h"
 #include "journal_io.h"
 #include "journal_reclaim.h"
+#include "journal_seq_blacklist.h"
 #include "replicas.h"
 #include "trace.h"
 
+static void __journal_replay_free(struct journal_replay *i)
+{
+       list_del(&i->list);
+       kvpfree(i, offsetof(struct journal_replay, j) +
+               vstruct_bytes(&i->j));
+}
+
+static void journal_replay_free(struct bch_fs *c, struct journal_replay *i)
+{
+       i->ignore = true;
+
+       if (!c->opts.read_entire_journal)
+               __journal_replay_free(i);
+}
+
 struct journal_list {
        struct closure          cl;
        struct mutex            lock;
@@ -35,28 +52,29 @@ static int journal_entry_add(struct bch_fs *c, struct bch_dev *ca,
        struct bch_devs_list devs = { .nr = 0 };
        struct list_head *where;
        size_t bytes = vstruct_bytes(j);
-       __le64 last_seq;
+       u64 last_seq = 0;
        int ret;
 
-       last_seq = !list_empty(jlist->head)
-               ? list_last_entry(jlist->head, struct journal_replay,
-                                 list)->j.last_seq
-               : 0;
-
-       if (!c->opts.read_entire_journal) {
-               /* Is this entry older than the range we need? */
-               if (le64_to_cpu(j->seq) < le64_to_cpu(last_seq)) {
-                       ret = JOURNAL_ENTRY_ADD_OUT_OF_RANGE;
-                       goto out;
+       list_for_each_entry_reverse(i, jlist->head, list) {
+               if (!JSET_NO_FLUSH(&i->j)) {
+                       last_seq = le64_to_cpu(i->j.last_seq);
+                       break;
                }
+       }
 
-               /* Drop entries we don't need anymore */
+       /* Is this entry older than the range we need? */
+       if (!c->opts.read_entire_journal &&
+           le64_to_cpu(j->seq) < last_seq) {
+               ret = JOURNAL_ENTRY_ADD_OUT_OF_RANGE;
+               goto out;
+       }
+
+       /* Drop entries we don't need anymore */
+       if (!JSET_NO_FLUSH(j)) {
                list_for_each_entry_safe(i, pos, jlist->head, list) {
                        if (le64_to_cpu(i->j.seq) >= le64_to_cpu(j->last_seq))
                                break;
-                       list_del(&i->list);
-                       kvpfree(i, offsetof(struct journal_replay, j) +
-                               vstruct_bytes(&i->j));
+                       journal_replay_free(c, i);
                }
        }
 
@@ -80,9 +98,7 @@ add:
        if (i && le64_to_cpu(j->seq) == le64_to_cpu(i->j.seq)) {
                if (i->bad) {
                        devs = i->devs;
-                       list_del(&i->list);
-                       kvpfree(i, offsetof(struct journal_replay, j) +
-                               vstruct_bytes(&i->j));
+                       __journal_replay_free(i);
                } else if (bad) {
                        goto found;
                } else {
@@ -104,6 +120,7 @@ add:
        list_add(&i->list, where);
        i->devs = devs;
        i->bad  = bad;
+       i->ignore = false;
        unsafe_memcpy(&i->j, j, bytes, "embedded variable length struct");
 found:
        if (!bch2_dev_list_has_dev(i->devs, ca->dev_idx))
@@ -698,14 +715,16 @@ err:
        goto out;
 }
 
-int bch2_journal_read(struct bch_fs *c, struct list_head *list)
+int bch2_journal_read(struct bch_fs *c, struct list_head *list,
+                     u64 *blacklist_seq, u64 *start_seq)
 {
        struct journal_list jlist;
-       struct journal_replay *i;
+       struct journal_replay *i, *t;
        struct bch_dev *ca;
        unsigned iter;
        size_t keys = 0, entries = 0;
        bool degraded = false;
+       u64 seq, last_seq = 0;
        int ret = 0;
 
        closure_init_stack(&jlist.cl);
@@ -734,12 +753,97 @@ int bch2_journal_read(struct bch_fs *c, struct list_head *list)
        if (jlist.ret)
                return jlist.ret;
 
+       if (list_empty(list)) {
+               bch_info(c, "journal read done, but no entries found");
+               return 0;
+       }
+
+       i = list_last_entry(list, struct journal_replay, list);
+       *start_seq = le64_to_cpu(i->j.seq) + 1;
+
+       /*
+        * Find most recent flush entry, and ignore newer non flush entries -
+        * those entries will be blacklisted:
+        */
+       list_for_each_entry_safe_reverse(i, t, list, list) {
+               if (i->ignore)
+                       continue;
+
+               if (!JSET_NO_FLUSH(&i->j)) {
+                       last_seq        = le64_to_cpu(i->j.last_seq);
+                       *blacklist_seq  = le64_to_cpu(i->j.seq) + 1;
+                       break;
+               }
+
+               journal_replay_free(c, i);
+       }
+
+       if (!last_seq) {
+               fsck_err(c, "journal read done, but no entries found after dropping non-flushes");
+               return -1;
+       }
+
+       /* Drop blacklisted entries and entries older than last_seq: */
+       list_for_each_entry_safe(i, t, list, list) {
+               if (i->ignore)
+                       continue;
+
+               seq = le64_to_cpu(i->j.seq);
+               if (seq < last_seq) {
+                       journal_replay_free(c, i);
+                       continue;
+               }
+
+               if (bch2_journal_seq_is_blacklisted(c, seq, true)) {
+                       fsck_err_on(!JSET_NO_FLUSH(&i->j), c,
+                                   "found blacklisted journal entry %llu", seq);
+
+                       journal_replay_free(c, i);
+               }
+       }
+
+       /* Check for missing entries: */
+       seq = last_seq;
+       list_for_each_entry(i, list, list) {
+               if (i->ignore)
+                       continue;
+
+               BUG_ON(seq > le64_to_cpu(i->j.seq));
+
+               while (seq < le64_to_cpu(i->j.seq)) {
+                       u64 missing_start, missing_end;
+
+                       while (seq < le64_to_cpu(i->j.seq) &&
+                              bch2_journal_seq_is_blacklisted(c, seq, false))
+                               seq++;
+
+                       if (seq == le64_to_cpu(i->j.seq))
+                               break;
+
+                       missing_start = seq;
+
+                       while (seq < le64_to_cpu(i->j.seq) &&
+                              !bch2_journal_seq_is_blacklisted(c, seq, false))
+                               seq++;
+
+                       missing_end = seq - 1;
+                       fsck_err(c, "journal entries %llu-%llu missing! (replaying %llu-%llu)",
+                                missing_start, missing_end,
+                                last_seq, *blacklist_seq - 1);
+               }
+
+               seq++;
+       }
+
        list_for_each_entry(i, list, list) {
                struct jset_entry *entry;
                struct bkey_i *k, *_n;
                struct bch_replicas_padded replicas;
                char buf[80];
 
+               if (i->ignore)
+                       continue;
+
                ret = jset_validate_entries(c, &i->j, READ);
                if (ret)
                        goto fsck_err;
@@ -767,12 +871,12 @@ int bch2_journal_read(struct bch_fs *c, struct list_head *list)
                entries++;
        }
 
-       if (!list_empty(list)) {
-               i = list_last_entry(list, struct journal_replay, list);
+       bch_info(c, "journal read done, %zu keys in %zu entries, seq %llu",
+                keys, entries, *start_seq);
 
-               bch_info(c, "journal read done, %zu keys in %zu entries, seq %llu",
-                        keys, entries, le64_to_cpu(i->j.seq));
-       }
+       if (*start_seq != *blacklist_seq)
+               bch_info(c, "dropped unflushed entries %llu-%llu",
+                        *blacklist_seq, *start_seq - 1);
 fsck_err:
        return ret;
 }
@@ -990,8 +1094,12 @@ static void journal_write_done(struct closure *cl)
        j->seq_ondisk           = seq;
        if (err && (!j->err_seq || seq < j->err_seq))
                j->err_seq      = seq;
-       j->last_seq_ondisk      = last_seq;
-       bch2_journal_space_available(j);
+
+       if (!w->noflush) {
+               j->flushed_seq_ondisk = seq;
+               j->last_seq_ondisk = last_seq;
+               bch2_journal_space_available(j);
+       }
 
        /*
         * Updating last_seq_ondisk may let bch2_journal_reclaim_work() discard
@@ -1067,6 +1175,22 @@ void bch2_journal_write(struct closure *cl)
 
        j->write_start_time = local_clock();
 
+       spin_lock(&j->lock);
+       if (c->sb.features & (1ULL << BCH_FEATURE_journal_no_flush) &&
+           !w->must_flush &&
+           (jiffies - j->last_flush_write) < msecs_to_jiffies(j->write_delay_ms) &&
+           test_bit(JOURNAL_MAY_SKIP_FLUSH, &j->flags)) {
+               w->noflush = true;
+               SET_JSET_NO_FLUSH(jset, true);
+               jset->last_seq = cpu_to_le64(j->last_seq_ondisk);
+
+               j->nr_noflush_writes++;
+       } else {
+               j->last_flush_write = jiffies;
+               j->nr_flush_writes++;
+       }
+       spin_unlock(&j->lock);
+
        /*
         * New btree roots are set by journalling them; when the journal entry
         * gets written we have to propagate them to c->btree_roots
@@ -1183,11 +1307,12 @@ retry_alloc:
                             sectors);
 
                bio = ca->journal.bio;
-               bio_reset(bio, ca->disk_sb.bdev,
-                         REQ_OP_WRITE|REQ_SYNC|REQ_META|REQ_PREFLUSH|REQ_FUA);
+               bio_reset(bio, ca->disk_sb.bdev, REQ_OP_WRITE|REQ_SYNC|REQ_META);
                bio->bi_iter.bi_sector  = ptr->offset;
                bio->bi_end_io          = journal_write_endio;
                bio->bi_private         = ca;
+               if (!JSET_NO_FLUSH(jset))
+                       bio->bi_opf    |= REQ_PREFLUSH|REQ_FUA;
                bch2_bio_map(bio, jset, sectors << 9);
 
                trace_journal_write(bio);
@@ -1196,18 +1321,19 @@ retry_alloc:
                ca->journal.bucket_seq[ca->journal.cur_idx] = le64_to_cpu(jset->seq);
        }
 
-       for_each_rw_member(ca, c, i)
-               if (journal_flushes_device(ca) &&
-                   !bch2_bkey_has_device(bkey_i_to_s_c(&w->key), i)) {
-                       percpu_ref_get(&ca->io_ref);
-
-                       bio = ca->journal.bio;
-                       bio_reset(bio, ca->disk_sb.bdev, REQ_OP_FLUSH);
-                       bio->bi_end_io          = journal_write_endio;
-                       bio->bi_private         = ca;
-                       closure_bio_submit(bio, cl);
-               }
-
+       if (!JSET_NO_FLUSH(jset)) {
+               for_each_rw_member(ca, c, i)
+                       if (journal_flushes_device(ca) &&
+                           !bch2_bkey_has_device(bkey_i_to_s_c(&w->key), i)) {
+                               percpu_ref_get(&ca->io_ref);
+
+                               bio = ca->journal.bio;
+                               bio_reset(bio, ca->disk_sb.bdev, REQ_OP_FLUSH);
+                               bio->bi_end_io          = journal_write_endio;
+                               bio->bi_private         = ca;
+                               closure_bio_submit(bio, cl);
+                       }
+       }
 no_io:
        bch2_bucket_seq_cleanup(c);
 
fs/bcachefs/journal_io.h
index 6958ee0f8cf23da1ab5a9c0588fedb3d8679678c..6b4c80968f52064370c4d3b767db29cd3cb59bed 100644
@@ -11,6 +11,7 @@ struct journal_replay {
        struct bch_devs_list    devs;
        /* checksum error, but we may want to try using it anyways: */
        bool                    bad;
+       bool                    ignore;
        /* must be last: */
        struct jset             j;
 };
@@ -37,7 +38,7 @@ static inline struct jset_entry *__jset_entry_type_next(struct jset *jset,
        for_each_jset_entry_type(entry, jset, BCH_JSET_ENTRY_btree_keys)        \
                vstruct_for_each_safe(entry, k, _n)
 
-int bch2_journal_read(struct bch_fs *, struct list_head *);
+int bch2_journal_read(struct bch_fs *, struct list_head *, u64 *, u64 *);
 
 void bch2_journal_write(struct closure *);
 
fs/bcachefs/journal_reclaim.c
index c6267284a02807b24fd08acee8c5f8449e52f003..a3d5405991b9c98edbe64af71a51421d883a0c20 100644
@@ -158,7 +158,7 @@ void bch2_journal_space_available(struct journal *j)
 {
        struct bch_fs *c = container_of(j, struct bch_fs, journal);
        struct bch_dev *ca;
-       unsigned clean;
+       unsigned clean, clean_ondisk, total;
        unsigned overhead, u64s_remaining = 0;
        unsigned max_entry_size  = min(j->buf[0].buf_size >> 9,
                                       j->buf[1].buf_size >> 9);
@@ -204,13 +204,21 @@ void bch2_journal_space_available(struct journal *j)
        for (i = 0; i < journal_space_nr; i++)
                j->space[i] = __journal_space_available(j, nr_devs_want, i);
 
+       clean_ondisk    = j->space[journal_space_clean_ondisk].total;
        clean           = j->space[journal_space_clean].total;
+       total           = j->space[journal_space_total].total;
 
        if (!j->space[journal_space_discarded].next_entry)
                ret = cur_entry_journal_full;
        else if (!fifo_free(&j->pin))
                ret = cur_entry_journal_pin_full;
 
+       if ((clean - clean_ondisk <= total / 8) &&
+           (clean_ondisk * 2 > clean))
+               set_bit(JOURNAL_MAY_SKIP_FLUSH, &j->flags);
+       else
+               clear_bit(JOURNAL_MAY_SKIP_FLUSH, &j->flags);
+
        overhead = DIV_ROUND_UP(clean, max_entry_size) *
                journal_entry_overhead(j);
        u64s_remaining = clean << 6;
fs/bcachefs/journal_seq_blacklist.c
index d0f1bbf8f6a7984ff5f96d997235b49d484d2eee..e1b63f3879f44e50cc2fdd92ca3de8db03a3c7fa 100644
@@ -118,7 +118,7 @@ out_write_sb:
 out:
        mutex_unlock(&c->sb_lock);
 
-       return ret;
+       return ret ?: bch2_blacklist_table_initialize(c);
 }
 
 static int journal_seq_blacklist_table_cmp(const void *_l,
@@ -164,8 +164,6 @@ int bch2_blacklist_table_initialize(struct bch_fs *c)
        struct journal_seq_blacklist_table *t;
        unsigned i, nr = blacklist_nr_entries(bl);
 
-       BUG_ON(c->journal_seq_blacklist_table);
-
        if (!bl)
                return 0;
 
@@ -187,6 +185,7 @@ int bch2_blacklist_table_initialize(struct bch_fs *c)
                        journal_seq_blacklist_table_cmp,
                        NULL);
 
+       kfree(c->journal_seq_blacklist_table);
        c->journal_seq_blacklist_table = t;
        return 0;
 }
fs/bcachefs/journal_types.h
index 6b525dc6ab7cae73a7b5b731a244fbb083d3e051..cf9675310f2b6654b2339d55fa764b3d6ff669a0 100644
@@ -29,6 +29,8 @@ struct journal_buf {
        unsigned                disk_sectors;   /* maximum size entry could have been, if
                                                   buf_size was bigger */
        unsigned                u64s_reserved;
+       bool                    noflush;        /* write has already been kicked off, and was noflush */
+       bool                    must_flush;     /* something wants a flush */
        /* bloom filter: */
        unsigned long           has_inode[1024 / sizeof(unsigned long)];
 };
@@ -146,6 +148,7 @@ enum {
        JOURNAL_RECLAIM_STARTED,
        JOURNAL_NEED_WRITE,
        JOURNAL_MAY_GET_UNRESERVED,
+       JOURNAL_MAY_SKIP_FLUSH,
 };
 
 /* Embedded in struct bch_fs */
@@ -203,6 +206,7 @@ struct journal {
 
        /* seq, last_seq from the most recent journal entry successfully written */
        u64                     seq_ondisk;
+       u64                     flushed_seq_ondisk;
        u64                     last_seq_ondisk;
        u64                     err_seq;
        u64                     last_empty_seq;
@@ -252,11 +256,15 @@ struct journal {
 
        unsigned                write_delay_ms;
        unsigned                reclaim_delay_ms;
+       unsigned long           last_flush_write;
 
        u64                     res_get_blocked_start;
        u64                     need_write_time;
        u64                     write_start_time;
 
+       u64                     nr_flush_writes;
+       u64                     nr_noflush_writes;
+
        struct bch2_time_stats  *write_time;
        struct bch2_time_stats  *delay_time;
        struct bch2_time_stats  *blocked_time;
fs/bcachefs/recovery.c
index 7ad5b823474766397b62a8bda1cb3298ebbef353..ecd51d45743a3705da4f1a6261b2469876f73add 100644
@@ -313,7 +313,7 @@ void bch2_journal_keys_free(struct journal_keys *keys)
 
 static struct journal_keys journal_keys_sort(struct list_head *journal_entries)
 {
-       struct journal_replay *p;
+       struct journal_replay *i;
        struct jset_entry *entry;
        struct bkey_i *k, *_n;
        struct journal_keys keys = { NULL };
@@ -323,35 +323,35 @@ static struct journal_keys journal_keys_sort(struct list_head *journal_entries)
        if (list_empty(journal_entries))
                return keys;
 
-       keys.journal_seq_base =
-               le64_to_cpu(list_last_entry(journal_entries,
-                               struct journal_replay, list)->j.last_seq);
-
-       list_for_each_entry(p, journal_entries, list) {
-               if (le64_to_cpu(p->j.seq) < keys.journal_seq_base)
+       list_for_each_entry(i, journal_entries, list) {
+               if (i->ignore)
                        continue;
 
-               for_each_jset_key(k, _n, entry, &p->j)
+               if (!keys.journal_seq_base)
+                       keys.journal_seq_base = le64_to_cpu(i->j.seq);
+
+               for_each_jset_key(k, _n, entry, &i->j)
                        nr_keys++;
        }
 
-
        keys.d = kvmalloc(sizeof(keys.d[0]) * nr_keys, GFP_KERNEL);
        if (!keys.d)
                goto err;
 
-       list_for_each_entry(p, journal_entries, list) {
-               if (le64_to_cpu(p->j.seq) < keys.journal_seq_base)
+       list_for_each_entry(i, journal_entries, list) {
+               if (i->ignore)
                        continue;
 
-               for_each_jset_key(k, _n, entry, &p->j)
+               BUG_ON(le64_to_cpu(i->j.seq) - keys.journal_seq_base > U32_MAX);
+
+               for_each_jset_key(k, _n, entry, &i->j)
                        keys.d[keys.nr++] = (struct journal_key) {
                                .btree_id       = entry->btree_id,
                                .level          = entry->level,
                                .k              = k,
-                               .journal_seq    = le64_to_cpu(p->j.seq) -
+                               .journal_seq    = le64_to_cpu(i->j.seq) -
                                        keys.journal_seq_base,
-                               .journal_offset = k->_data - p->j._data,
+                               .journal_offset = k->_data - i->j._data,
                        };
        }
 
@@ -643,46 +643,6 @@ err:
        return ret;
 }
 
-static bool journal_empty(struct list_head *journal)
-{
-       return list_empty(journal) ||
-               journal_entry_empty(&list_last_entry(journal,
-                                       struct journal_replay, list)->j);
-}
-
-static int
-verify_journal_entries_not_blacklisted_or_missing(struct bch_fs *c,
-                                                 struct list_head *journal)
-{
-       struct journal_replay *i =
-               list_last_entry(journal, struct journal_replay, list);
-       u64 start_seq   = le64_to_cpu(i->j.last_seq);
-       u64 end_seq     = le64_to_cpu(i->j.seq);
-       u64 seq         = start_seq;
-       int ret = 0;
-
-       list_for_each_entry(i, journal, list) {
-               if (le64_to_cpu(i->j.seq) < start_seq)
-                       continue;
-
-               fsck_err_on(seq != le64_to_cpu(i->j.seq), c,
-                       "journal entries %llu-%llu missing! (replaying %llu-%llu)",
-                       seq, le64_to_cpu(i->j.seq) - 1,
-                       start_seq, end_seq);
-
-               seq = le64_to_cpu(i->j.seq);
-
-               fsck_err_on(bch2_journal_seq_is_blacklisted(c, seq, false), c,
-                           "found blacklisted journal entry %llu", seq);
-
-               do {
-                       seq++;
-               } while (bch2_journal_seq_is_blacklisted(c, seq, false));
-       }
-fsck_err:
-       return ret;
-}
-
 /* journal replay early: */
 
 static int journal_replay_entry_early(struct bch_fs *c,
@@ -767,6 +727,7 @@ static int journal_replay_early(struct bch_fs *c,
                                struct bch_sb_field_clean *clean,
                                struct list_head *journal)
 {
+       struct journal_replay *i;
        struct jset_entry *entry;
        int ret;
 
@@ -782,18 +743,19 @@ static int journal_replay_early(struct bch_fs *c,
                                return ret;
                }
        } else {
-               struct journal_replay *i =
-                       list_last_entry(journal, struct journal_replay, list);
+               list_for_each_entry(i, journal, list) {
+                       if (i->ignore)
+                               continue;
 
-               c->bucket_clock[READ].hand = le16_to_cpu(i->j.read_clock);
-               c->bucket_clock[WRITE].hand = le16_to_cpu(i->j.write_clock);
+                       c->bucket_clock[READ].hand = le16_to_cpu(i->j.read_clock);
+                       c->bucket_clock[WRITE].hand = le16_to_cpu(i->j.write_clock);
 
-               list_for_each_entry(i, journal, list)
                        vstruct_for_each(&i->j, entry) {
                                ret = journal_replay_entry_early(c, entry);
                                if (ret)
                                        return ret;
                        }
+               }
        }
 
        bch2_fs_usage_initialize(c);
@@ -842,9 +804,6 @@ static int verify_superblock_clean(struct bch_fs *c,
        struct bch_sb_field_clean *clean = *cleanp;
        int ret = 0;
 
-       if (!c->sb.clean || !j)
-               return 0;
-
        if (mustfix_fsck_err_on(j->seq != clean->journal_seq, c,
                        "superblock journal seq (%llu) doesn't match journal (%llu) after clean shutdown",
                        le64_to_cpu(clean->journal_seq),
@@ -971,7 +930,8 @@ int bch2_fs_recovery(struct bch_fs *c)
 {
        const char *err = "cannot allocate memory";
        struct bch_sb_field_clean *clean = NULL;
-       u64 journal_seq;
+       struct jset *last_journal_entry = NULL;
+       u64 blacklist_seq, journal_seq;
        bool write_sb = false, need_write_alloc = false;
        int ret;
 
@@ -991,24 +951,38 @@ int bch2_fs_recovery(struct bch_fs *c)
                set_bit(BCH_FS_REBUILD_REPLICAS, &c->flags);
        }
 
+       ret = bch2_blacklist_table_initialize(c);
+       if (ret) {
+               bch_err(c, "error initializing blacklist table");
+               goto err;
+       }
+
        if (!c->sb.clean || c->opts.fsck || c->opts.keep_journal) {
-               struct jset *j;
+               struct journal_replay *i;
 
-               ret = bch2_journal_read(c, &c->journal_entries);
+               ret = bch2_journal_read(c, &c->journal_entries,
+                                       &blacklist_seq, &journal_seq);
                if (ret)
                        goto err;
 
-               if (mustfix_fsck_err_on(c->sb.clean && !journal_empty(&c->journal_entries), c,
+               list_for_each_entry_reverse(i, &c->journal_entries, list)
+                       if (!i->ignore) {
+                               last_journal_entry = &i->j;
+                               break;
+                       }
+
+               if (mustfix_fsck_err_on(c->sb.clean &&
+                                       last_journal_entry &&
+                                       !journal_entry_empty(last_journal_entry), c,
                                "filesystem marked clean but journal not empty")) {
                        c->sb.compat &= ~(1ULL << BCH_COMPAT_FEAT_ALLOC_INFO);
                        SET_BCH_SB_CLEAN(c->disk_sb.sb, false);
                        c->sb.clean = false;
                }
 
-               if (!c->sb.clean && list_empty(&c->journal_entries)) {
-                       bch_err(c, "no journal entries found");
-                       ret = BCH_FSCK_REPAIR_IMPOSSIBLE;
-                       goto err;
+               if (!last_journal_entry) {
+                       fsck_err_on(!c->sb.clean, c, "no journal entries found");
+                       goto use_clean;
                }
 
                c->journal_keys = journal_keys_sort(&c->journal_entries);
@@ -1017,16 +991,21 @@ int bch2_fs_recovery(struct bch_fs *c)
                        goto err;
                }
 
-               j = &list_last_entry(&c->journal_entries,
-                                    struct journal_replay, list)->j;
-
-               ret = verify_superblock_clean(c, &clean, j);
-               if (ret)
+               if (c->sb.clean && last_journal_entry) {
+                       ret = verify_superblock_clean(c, &clean,
+                                                     last_journal_entry);
+                       if (ret)
+                               goto err;
+               }
+       } else {
+use_clean:
+               if (!clean) {
+                       bch_err(c, "no superblock clean section found");
+                       ret = BCH_FSCK_REPAIR_IMPOSSIBLE;
                        goto err;
 
-               journal_seq = le64_to_cpu(j->seq) + 1;
-       } else {
-               journal_seq = le64_to_cpu(clean->journal_seq) + 1;
+               }
+               blacklist_seq = journal_seq = le64_to_cpu(clean->journal_seq) + 1;
        }
 
        if (!c->sb.clean &&
@@ -1045,30 +1024,23 @@ int bch2_fs_recovery(struct bch_fs *c)
        if (ret)
                goto err;
 
-       if (!c->sb.clean) {
+       /*
+        * After an unclean shutdown, skip the next few journal sequence
+        * numbers as they may have been referenced by btree writes that
+        * happened before their corresponding journal writes - those btree
+        * writes need to be ignored, by skipping and blacklisting the next few
+        * journal sequence numbers:
+        */
+       if (!c->sb.clean)
+               journal_seq += 8;
+
+       if (blacklist_seq != journal_seq) {
                ret = bch2_journal_seq_blacklist_add(c,
-                                                    journal_seq,
-                                                    journal_seq + 8);
+                                       blacklist_seq, journal_seq);
                if (ret) {
                        bch_err(c, "error creating new journal seq blacklist entry");
                        goto err;
                }
-
-               journal_seq += 8;
-
-               /*
-                * The superblock needs to be written before we do any btree
-                * node writes: it will be in the read_write() path
-                */
-       }
-
-       ret = bch2_blacklist_table_initialize(c);
-
-       if (!list_empty(&c->journal_entries)) {
-               ret = verify_journal_entries_not_blacklisted_or_missing(c,
-                                                       &c->journal_entries);
-               if (ret)
-                       goto err;
        }
 
        ret = bch2_fs_journal_start(&c->journal, journal_seq,