        x(extents_above_btree_updates,  12)     \
        x(btree_updates_journalled,     13)     \
        x(reflink_inline_data,          14)     \
-       x(new_varint,                   15)
+       x(new_varint,                   15)     \
+       x(journal_no_flush,             16)
 
 #define BCH_SB_FEATURES_ALL                            \
        ((1ULL << BCH_FEATURE_new_siphash)|             \
         (1ULL << BCH_FEATURE_new_extent_overwrite)|    \
         (1ULL << BCH_FEATURE_btree_ptr_v2)|            \
         (1ULL << BCH_FEATURE_extents_above_btree_updates)|\
-        (1ULL << BCH_FEATURE_new_varint))\
+        (1ULL << BCH_FEATURE_new_varint)|              \
+        (1ULL << BCH_FEATURE_journal_no_flush))
 
 enum bch_sb_feature {
 #define x(f, n) BCH_FEATURE_##f,
 
 LE32_BITMASK(JSET_CSUM_TYPE,   struct jset, flags, 0, 4);
 LE32_BITMASK(JSET_BIG_ENDIAN,  struct jset, flags, 4, 5);
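+/* Set on journal entries that were written without a preflush/FUA: */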
+LE32_BITMASK(JSET_NO_FLUSH,    struct jset, flags, 5, 6);
 
 #define BCH_JOURNAL_BUCKETS_MIN                8
 
 
        struct journal_buf *buf = journal_cur_buf(j);
 
        bkey_extent_init(&buf->key);
+       buf->noflush    = false;
+       buf->must_flush = false;
 
        memset(buf->has_inode, 0, sizeof(buf->has_inode));
 
        struct journal_buf *buf;
        int ret = 0;
 
-       if (seq <= j->seq_ondisk)
+       if (seq <= j->flushed_seq_ondisk)
                return 1;
 
        spin_lock(&j->lock);
                goto out;
        }
 
-       if (seq <= j->seq_ondisk) {
+       if (seq <= j->flushed_seq_ondisk) {
                ret = 1;
                goto out;
        }
 
-       if (parent &&
-           (buf = journal_seq_to_buf(j, seq)))
-               if (!closure_wait(&buf->wait, parent))
+       /* if seq was written, but not flushed - flush a newer one instead */
+       seq = max(seq, last_unwritten_seq(j));
+
+recheck_need_open:
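+       /*
+        * If seq is the current, not yet open journal entry, open it and get
+        * a reservation so that there's an entry to flush:
+        */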
+       if (seq == journal_cur_seq(j) && !journal_entry_is_open(j)) {
+               struct journal_res res = { 0 };
+
+               spin_unlock(&j->lock);
+
+               ret = bch2_journal_res_get(j, &res, jset_u64s(0), 0);
+               if (ret)
+                       return ret;
+
+               seq = res.seq;
+               buf = j->buf + (seq & JOURNAL_BUF_MASK);
+               buf->must_flush = true;
+               set_bit(JOURNAL_NEED_WRITE, &j->flags);
+
+               if (parent && !closure_wait(&buf->wait, parent))
                        BUG();
 
+               bch2_journal_res_put(j, &res);
+
+               spin_lock(&j->lock);
+               goto want_write;
+       }
+
+       /*
+        * if write was kicked off without a flush, flush the next sequence
+        * number instead
+        */
+       buf = journal_seq_to_buf(j, seq);
+       if (buf->noflush) {
+               seq++;
+               goto recheck_need_open;
+       }
+
+       buf->must_flush = true;
+
+       if (parent && !closure_wait(&buf->wait, parent))
+               BUG();
+want_write:
        if (seq == journal_cur_seq(j))
                journal_entry_want_write(j);
 out:
        spin_lock(&j->lock);
 
        set_bit(JOURNAL_STARTED, &j->flags);
+       j->last_flush_write = jiffies;
 
        journal_pin_new_entry(j, 1);
 
               "last_seq:\t\t%llu\n"
               "last_seq_ondisk:\t%llu\n"
               "prereserved:\t\t%u/%u\n"
+              "nr flush writes:\t%llu\n"
+              "nr noflush writes:\t%llu\n"
               "nr direct reclaim:\t%llu\n"
               "nr background reclaim:\t%llu\n"
               "current entry sectors:\t%u\n"
               j->last_seq_ondisk,
               j->prereserved.reserved,
               j->prereserved.remaining,
+              j->nr_flush_writes,
+              j->nr_noflush_writes,
               j->nr_direct_reclaim,
               j->nr_background_reclaim,
               j->cur_entry_sectors,
 
 
 static inline u64 journal_cur_seq(struct journal *j)
 {
-       BUG_ON(j->pin.back - 1 != atomic64_read(&j->seq));
+       EBUG_ON(j->pin.back - 1 != atomic64_read(&j->seq));
 
        return j->pin.back - 1;
 }
 
 #include "journal.h"
 #include "journal_io.h"
 #include "journal_reclaim.h"
+#include "journal_seq_blacklist.h"
 #include "replicas.h"
 #include "trace.h"
 
+static void __journal_replay_free(struct journal_replay *i)
+{
+       list_del(&i->list);
+       kvpfree(i, offsetof(struct journal_replay, j) +
+               vstruct_bytes(&i->j));
+}
+
+static void journal_replay_free(struct bch_fs *c, struct journal_replay *i)
+{
+       i->ignore = true;
+
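+       /* Keep ignored entries in memory when read_entire_journal is set: */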
+       if (!c->opts.read_entire_journal)
+               __journal_replay_free(i);
+}
+
 struct journal_list {
        struct closure          cl;
        struct mutex            lock;
        struct bch_devs_list devs = { .nr = 0 };
        struct list_head *where;
        size_t bytes = vstruct_bytes(j);
-       __le64 last_seq;
+       u64 last_seq = 0;
        int ret;
 
-       last_seq = !list_empty(jlist->head)
-               ? list_last_entry(jlist->head, struct journal_replay,
-                                 list)->j.last_seq
-               : 0;
-
-       if (!c->opts.read_entire_journal) {
-               /* Is this entry older than the range we need? */
-               if (le64_to_cpu(j->seq) < le64_to_cpu(last_seq)) {
-                       ret = JOURNAL_ENTRY_ADD_OUT_OF_RANGE;
-                       goto out;
+       list_for_each_entry_reverse(i, jlist->head, list) {
+               if (!JSET_NO_FLUSH(&i->j)) {
+                       last_seq = le64_to_cpu(i->j.last_seq);
+                       break;
                }
+       }
 
-               /* Drop entries we don't need anymore */
+       /* Is this entry older than the range we need? */
+       if (!c->opts.read_entire_journal &&
+           le64_to_cpu(j->seq) < last_seq) {
+               ret = JOURNAL_ENTRY_ADD_OUT_OF_RANGE;
+               goto out;
+       }
+
+       /* Drop entries we don't need anymore */
+       if (!JSET_NO_FLUSH(j)) {
                list_for_each_entry_safe(i, pos, jlist->head, list) {
                        if (le64_to_cpu(i->j.seq) >= le64_to_cpu(j->last_seq))
                                break;
-                       list_del(&i->list);
-                       kvpfree(i, offsetof(struct journal_replay, j) +
-                               vstruct_bytes(&i->j));
+                       journal_replay_free(c, i);
                }
        }
 
        if (i && le64_to_cpu(j->seq) == le64_to_cpu(i->j.seq)) {
                if (i->bad) {
                        devs = i->devs;
-                       list_del(&i->list);
-                       kvpfree(i, offsetof(struct journal_replay, j) +
-                               vstruct_bytes(&i->j));
+                       __journal_replay_free(i);
                } else if (bad) {
                        goto found;
                } else {
        list_add(&i->list, where);
        i->devs = devs;
        i->bad  = bad;
+       i->ignore = false;
        unsafe_memcpy(&i->j, j, bytes, "embedded variable length struct");
 found:
        if (!bch2_dev_list_has_dev(i->devs, ca->dev_idx))
        goto out;
 }
 
-int bch2_journal_read(struct bch_fs *c, struct list_head *list)
+int bch2_journal_read(struct bch_fs *c, struct list_head *list,
+                     u64 *blacklist_seq, u64 *start_seq)
 {
        struct journal_list jlist;
-       struct journal_replay *i;
+       struct journal_replay *i, *t;
        struct bch_dev *ca;
        unsigned iter;
        size_t keys = 0, entries = 0;
        bool degraded = false;
+       u64 seq, last_seq = 0;
        int ret = 0;
 
        closure_init_stack(&jlist.cl);
        if (jlist.ret)
                return jlist.ret;
 
+       if (list_empty(list)) {
+               bch_info(c, "journal read done, but no entries found");
+               return 0;
+       }
+
+       i = list_last_entry(list, struct journal_replay, list);
+       *start_seq = le64_to_cpu(i->j.seq) + 1;
+
+       /*
+        * Find most recent flush entry, and ignore newer non-flush entries -
+        * those entries will be blacklisted:
+        */
+       list_for_each_entry_safe_reverse(i, t, list, list) {
+               if (i->ignore)
+                       continue;
+
+               if (!JSET_NO_FLUSH(&i->j)) {
+                       last_seq        = le64_to_cpu(i->j.last_seq);
+                       *blacklist_seq  = le64_to_cpu(i->j.seq) + 1;
+                       break;
+               }
+
+               journal_replay_free(c, i);
+       }
+
+       if (!last_seq) {
+               fsck_err(c, "journal read done, but no entries found after dropping non-flushes");
+               return -1;
+       }
+
+       /* Drop blacklisted entries and entries older than last_seq: */
+       list_for_each_entry_safe(i, t, list, list) {
+               if (i->ignore)
+                       continue;
+
+               seq = le64_to_cpu(i->j.seq);
+               if (seq < last_seq) {
+                       journal_replay_free(c, i);
+                       continue;
+               }
+
+               if (bch2_journal_seq_is_blacklisted(c, seq, true)) {
+                       fsck_err_on(!JSET_NO_FLUSH(&i->j), c,
+                                   "found blacklisted journal entry %llu", seq);
+
+                       journal_replay_free(c, i);
+               }
+       }
+
+       /* Check for missing entries: */
+       seq = last_seq;
+       list_for_each_entry(i, list, list) {
+               if (i->ignore)
+                       continue;
+
+               BUG_ON(seq > le64_to_cpu(i->j.seq));
+
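+               /*
+                * Gaps that are entirely blacklisted are expected; any other
+                * gap means journal entries were lost:
+                */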
+               while (seq < le64_to_cpu(i->j.seq)) {
+                       u64 missing_start, missing_end;
+
+                       while (seq < le64_to_cpu(i->j.seq) &&
+                              bch2_journal_seq_is_blacklisted(c, seq, false))
+                               seq++;
+
+                       if (seq == le64_to_cpu(i->j.seq))
+                               break;
+
+                       missing_start = seq;
+
+                       while (seq < le64_to_cpu(i->j.seq) &&
+                              !bch2_journal_seq_is_blacklisted(c, seq, false))
+                               seq++;
+
+                       missing_end = seq - 1;
+                       fsck_err(c, "journal entries %llu-%llu missing! (replaying %llu-%llu)",
+                                missing_start, missing_end,
+                                last_seq, *blacklist_seq - 1);
+               }
+
+               seq++;
+       }
+
        list_for_each_entry(i, list, list) {
                struct jset_entry *entry;
                struct bkey_i *k, *_n;
                struct bch_replicas_padded replicas;
                char buf[80];
 
+               if (i->ignore)
+                       continue;
+
                ret = jset_validate_entries(c, &i->j, READ);
                if (ret)
                        goto fsck_err;
                entries++;
        }
 
-       if (!list_empty(list)) {
-               i = list_last_entry(list, struct journal_replay, list);
+       bch_info(c, "journal read done, %zu keys in %zu entries, seq %llu",
+                keys, entries, *start_seq);
 
-               bch_info(c, "journal read done, %zu keys in %zu entries, seq %llu",
-                        keys, entries, le64_to_cpu(i->j.seq));
-       }
+       if (*start_seq != *blacklist_seq)
+               bch_info(c, "dropped unflushed entries %llu-%llu",
+                        *blacklist_seq, *start_seq - 1);
 fsck_err:
        return ret;
 }
        j->seq_ondisk           = seq;
        if (err && (!j->err_seq || seq < j->err_seq))
                j->err_seq      = seq;
-       j->last_seq_ondisk      = last_seq;
-       bch2_journal_space_available(j);
+
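+       /*
+        * Only flush writes update flushed_seq_ondisk and last_seq_ondisk: a
+        * noflush write isn't known to be durable until the next flush
+        * completes:
+        */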
+       if (!w->noflush) {
+               j->flushed_seq_ondisk = seq;
+               j->last_seq_ondisk = last_seq;
+               bch2_journal_space_available(j);
+       }
 
        /*
         * Updating last_seq_ondisk may let bch2_journal_reclaim_work() discard
 
        j->write_start_time = local_clock();
 
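+       /*
+        * A write may skip the flush (and be marked noflush) only when the
+        * journal_no_flush feature is enabled, nothing has requested a flush,
+        * the last flush write was recent, and journal space calculations say
+        * skipping is safe:
+        */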
+       spin_lock(&j->lock);
+       if (c->sb.features & (1ULL << BCH_FEATURE_journal_no_flush) &&
+           !w->must_flush &&
+           (jiffies - j->last_flush_write) < msecs_to_jiffies(j->write_delay_ms) &&
+           test_bit(JOURNAL_MAY_SKIP_FLUSH, &j->flags)) {
+               w->noflush = true;
+               SET_JSET_NO_FLUSH(jset, true);
+               jset->last_seq = cpu_to_le64(j->last_seq_ondisk);
+
+               j->nr_noflush_writes++;
+       } else {
+               j->last_flush_write = jiffies;
+               j->nr_flush_writes++;
+       }
+       spin_unlock(&j->lock);
+
        /*
         * New btree roots are set by journalling them; when the journal entry
         * gets written we have to propagate them to c->btree_roots
                             sectors);
 
                bio = ca->journal.bio;
-               bio_reset(bio, ca->disk_sb.bdev,
-                         REQ_OP_WRITE|REQ_SYNC|REQ_META|REQ_PREFLUSH|REQ_FUA);
+               bio_reset(bio, ca->disk_sb.bdev, REQ_OP_WRITE|REQ_SYNC|REQ_META);
                bio->bi_iter.bi_sector  = ptr->offset;
                bio->bi_end_io          = journal_write_endio;
                bio->bi_private         = ca;
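+               /* Only flush writes need preflush/FUA ordering: */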
+               if (!JSET_NO_FLUSH(jset))
+                       bio->bi_opf    |= REQ_PREFLUSH|REQ_FUA;
                bch2_bio_map(bio, jset, sectors << 9);
 
                trace_journal_write(bio);
                ca->journal.bucket_seq[ca->journal.cur_idx] = le64_to_cpu(jset->seq);
        }
 
-       for_each_rw_member(ca, c, i)
-               if (journal_flushes_device(ca) &&
-                   !bch2_bkey_has_device(bkey_i_to_s_c(&w->key), i)) {
-                       percpu_ref_get(&ca->io_ref);
-
-                       bio = ca->journal.bio;
-                       bio_reset(bio, ca->disk_sb.bdev, REQ_OP_FLUSH);
-                       bio->bi_end_io          = journal_write_endio;
-                       bio->bi_private         = ca;
-                       closure_bio_submit(bio, cl);
-               }
-
+       if (!JSET_NO_FLUSH(jset)) {
+               for_each_rw_member(ca, c, i)
+                       if (journal_flushes_device(ca) &&
+                           !bch2_bkey_has_device(bkey_i_to_s_c(&w->key), i)) {
+                               percpu_ref_get(&ca->io_ref);
+
+                               bio = ca->journal.bio;
+                               bio_reset(bio, ca->disk_sb.bdev, REQ_OP_FLUSH);
+                               bio->bi_end_io          = journal_write_endio;
+                               bio->bi_private         = ca;
+                               closure_bio_submit(bio, cl);
+                       }
+       }
 no_io:
        bch2_bucket_seq_cleanup(c);
 
 
        struct bch_devs_list    devs;
        /* checksum error, but we may want to try using it anyways: */
        bool                    bad;
+       bool                    ignore;
        /* must be last: */
        struct jset             j;
 };
        for_each_jset_entry_type(entry, jset, BCH_JSET_ENTRY_btree_keys)        \
                vstruct_for_each_safe(entry, k, _n)
 
-int bch2_journal_read(struct bch_fs *, struct list_head *);
+int bch2_journal_read(struct bch_fs *, struct list_head *, u64 *, u64 *);
 
 void bch2_journal_write(struct closure *);
 
 
 {
        struct bch_fs *c = container_of(j, struct bch_fs, journal);
        struct bch_dev *ca;
-       unsigned clean;
+       unsigned clean, clean_ondisk, total;
        unsigned overhead, u64s_remaining = 0;
        unsigned max_entry_size  = min(j->buf[0].buf_size >> 9,
                                       j->buf[1].buf_size >> 9);
        for (i = 0; i < journal_space_nr; i++)
                j->space[i] = __journal_space_available(j, nr_devs_want, i);
 
+       clean_ondisk    = j->space[journal_space_clean_ondisk].total;
        clean           = j->space[journal_space_clean].total;
+       total           = j->space[journal_space_total].total;
 
        if (!j->space[journal_space_discarded].next_entry)
                ret = cur_entry_journal_full;
        else if (!fifo_free(&j->pin))
                ret = cur_entry_journal_pin_full;
 
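+       /*
+        * Allow skipping flushes only while the space a flush would reclaim
+        * is small relative to the journal, and most clean space is already
+        * clean on disk:
+        */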
+       if ((clean - clean_ondisk <= total / 8) &&
+           (clean_ondisk * 2 > clean))
+               set_bit(JOURNAL_MAY_SKIP_FLUSH, &j->flags);
+       else
+               clear_bit(JOURNAL_MAY_SKIP_FLUSH, &j->flags);
+
        overhead = DIV_ROUND_UP(clean, max_entry_size) *
                journal_entry_overhead(j);
        u64s_remaining = clean << 6;
 
 out:
        mutex_unlock(&c->sb_lock);
 
-       return ret;
+       return ret ?: bch2_blacklist_table_initialize(c);
 }
 
 static int journal_seq_blacklist_table_cmp(const void *_l,
        struct journal_seq_blacklist_table *t;
        unsigned i, nr = blacklist_nr_entries(bl);
 
-       BUG_ON(c->journal_seq_blacklist_table);
-
        if (!bl)
                return 0;
 
                        journal_seq_blacklist_table_cmp,
                        NULL);
 
+       kfree(c->journal_seq_blacklist_table);
        c->journal_seq_blacklist_table = t;
        return 0;
 }
 
        unsigned                disk_sectors;   /* maximum size entry could have been, if
                                                   buf_size was bigger */
        unsigned                u64s_reserved;
+       bool                    noflush;        /* write has already been kicked off, and was noflush */
+       bool                    must_flush;     /* something wants a flush */
        /* bloom filter: */
        unsigned long           has_inode[1024 / sizeof(unsigned long)];
 };
        JOURNAL_RECLAIM_STARTED,
        JOURNAL_NEED_WRITE,
        JOURNAL_MAY_GET_UNRESERVED,
+       JOURNAL_MAY_SKIP_FLUSH,
 };
 
 /* Embedded in struct bch_fs */
 
        /* seq, last_seq from the most recent journal entry successfully written */
        u64                     seq_ondisk;
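+       /* seq of most recent flush write completed: */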
+       u64                     flushed_seq_ondisk;
        u64                     last_seq_ondisk;
        u64                     err_seq;
        u64                     last_empty_seq;
 
        unsigned                write_delay_ms;
        unsigned                reclaim_delay_ms;
+       unsigned long           last_flush_write;
 
        u64                     res_get_blocked_start;
        u64                     need_write_time;
        u64                     write_start_time;
 
+       u64                     nr_flush_writes;
+       u64                     nr_noflush_writes;
+
        struct bch2_time_stats  *write_time;
        struct bch2_time_stats  *delay_time;
        struct bch2_time_stats  *blocked_time;
 
 
 static struct journal_keys journal_keys_sort(struct list_head *journal_entries)
 {
-       struct journal_replay *p;
+       struct journal_replay *i;
        struct jset_entry *entry;
        struct bkey_i *k, *_n;
        struct journal_keys keys = { NULL };
        if (list_empty(journal_entries))
                return keys;
 
-       keys.journal_seq_base =
-               le64_to_cpu(list_last_entry(journal_entries,
-                               struct journal_replay, list)->j.last_seq);
-
-       list_for_each_entry(p, journal_entries, list) {
-               if (le64_to_cpu(p->j.seq) < keys.journal_seq_base)
+       list_for_each_entry(i, journal_entries, list) {
+               if (i->ignore)
                        continue;
 
-               for_each_jset_key(k, _n, entry, &p->j)
+               if (!keys.journal_seq_base)
+                       keys.journal_seq_base = le64_to_cpu(i->j.seq);
+
+               for_each_jset_key(k, _n, entry, &i->j)
                        nr_keys++;
        }
 
-
        keys.d = kvmalloc(sizeof(keys.d[0]) * nr_keys, GFP_KERNEL);
        if (!keys.d)
                goto err;
 
-       list_for_each_entry(p, journal_entries, list) {
-               if (le64_to_cpu(p->j.seq) < keys.journal_seq_base)
+       list_for_each_entry(i, journal_entries, list) {
+               if (i->ignore)
                        continue;
 
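+               /* journal_seq offsets from seq_base must fit in 32 bits: */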
-               for_each_jset_key(k, _n, entry, &p->j)
+               BUG_ON(le64_to_cpu(i->j.seq) - keys.journal_seq_base > U32_MAX);
+
+               for_each_jset_key(k, _n, entry, &i->j)
                        keys.d[keys.nr++] = (struct journal_key) {
                                .btree_id       = entry->btree_id,
                                .level          = entry->level,
                                .k              = k,
-                               .journal_seq    = le64_to_cpu(p->j.seq) -
+                               .journal_seq    = le64_to_cpu(i->j.seq) -
                                        keys.journal_seq_base,
-                               .journal_offset = k->_data - p->j._data,
+                               .journal_offset = k->_data - i->j._data,
                        };
        }
 
        return ret;
 }
 
-static bool journal_empty(struct list_head *journal)
-{
-       return list_empty(journal) ||
-               journal_entry_empty(&list_last_entry(journal,
-                                       struct journal_replay, list)->j);
-}
-
-static int
-verify_journal_entries_not_blacklisted_or_missing(struct bch_fs *c,
-                                                 struct list_head *journal)
-{
-       struct journal_replay *i =
-               list_last_entry(journal, struct journal_replay, list);
-       u64 start_seq   = le64_to_cpu(i->j.last_seq);
-       u64 end_seq     = le64_to_cpu(i->j.seq);
-       u64 seq         = start_seq;
-       int ret = 0;
-
-       list_for_each_entry(i, journal, list) {
-               if (le64_to_cpu(i->j.seq) < start_seq)
-                       continue;
-
-               fsck_err_on(seq != le64_to_cpu(i->j.seq), c,
-                       "journal entries %llu-%llu missing! (replaying %llu-%llu)",
-                       seq, le64_to_cpu(i->j.seq) - 1,
-                       start_seq, end_seq);
-
-               seq = le64_to_cpu(i->j.seq);
-
-               fsck_err_on(bch2_journal_seq_is_blacklisted(c, seq, false), c,
-                           "found blacklisted journal entry %llu", seq);
-
-               do {
-                       seq++;
-               } while (bch2_journal_seq_is_blacklisted(c, seq, false));
-       }
-fsck_err:
-       return ret;
-}
-
 /* journal replay early: */
 
 static int journal_replay_entry_early(struct bch_fs *c,
                                struct bch_sb_field_clean *clean,
                                struct list_head *journal)
 {
+       struct journal_replay *i;
        struct jset_entry *entry;
        int ret;
 
                                return ret;
                }
        } else {
-               struct journal_replay *i =
-                       list_last_entry(journal, struct journal_replay, list);
+               list_for_each_entry(i, journal, list) {
+                       if (i->ignore)
+                               continue;
 
-               c->bucket_clock[READ].hand = le16_to_cpu(i->j.read_clock);
-               c->bucket_clock[WRITE].hand = le16_to_cpu(i->j.write_clock);
+                       c->bucket_clock[READ].hand = le16_to_cpu(i->j.read_clock);
+                       c->bucket_clock[WRITE].hand = le16_to_cpu(i->j.write_clock);
 
-               list_for_each_entry(i, journal, list)
                        vstruct_for_each(&i->j, entry) {
                                ret = journal_replay_entry_early(c, entry);
                                if (ret)
                                        return ret;
                        }
+               }
        }
 
        bch2_fs_usage_initialize(c);
        struct bch_sb_field_clean *clean = *cleanp;
        int ret = 0;
 
-       if (!c->sb.clean || !j)
-               return 0;
-
        if (mustfix_fsck_err_on(j->seq != clean->journal_seq, c,
                        "superblock journal seq (%llu) doesn't match journal (%llu) after clean shutdown",
                        le64_to_cpu(clean->journal_seq),
 {
        const char *err = "cannot allocate memory";
        struct bch_sb_field_clean *clean = NULL;
-       u64 journal_seq;
+       struct jset *last_journal_entry = NULL;
+       u64 blacklist_seq, journal_seq;
        bool write_sb = false, need_write_alloc = false;
        int ret;
 
                set_bit(BCH_FS_REBUILD_REPLICAS, &c->flags);
        }
 
+       ret = bch2_blacklist_table_initialize(c);
+       if (ret) {
+               bch_err(c, "error initializing blacklist table");
+               goto err;
+       }
+
        if (!c->sb.clean || c->opts.fsck || c->opts.keep_journal) {
-               struct jset *j;
+               struct journal_replay *i;
 
-               ret = bch2_journal_read(c, &c->journal_entries);
+               ret = bch2_journal_read(c, &c->journal_entries,
+                                       &blacklist_seq, &journal_seq);
                if (ret)
                        goto err;
 
-               if (mustfix_fsck_err_on(c->sb.clean && !journal_empty(&c->journal_entries), c,
+               list_for_each_entry_reverse(i, &c->journal_entries, list)
+                       if (!i->ignore) {
+                               last_journal_entry = &i->j;
+                               break;
+                       }
+
+               if (mustfix_fsck_err_on(c->sb.clean &&
+                                       last_journal_entry &&
+                                       !journal_entry_empty(last_journal_entry), c,
                                "filesystem marked clean but journal not empty")) {
                        c->sb.compat &= ~(1ULL << BCH_COMPAT_FEAT_ALLOC_INFO);
                        SET_BCH_SB_CLEAN(c->disk_sb.sb, false);
                        c->sb.clean = false;
                }
 
-               if (!c->sb.clean && list_empty(&c->journal_entries)) {
-                       bch_err(c, "no journal entries found");
-                       ret = BCH_FSCK_REPAIR_IMPOSSIBLE;
-                       goto err;
+               if (!last_journal_entry) {
+                       fsck_err_on(!c->sb.clean, c, "no journal entries found");
+                       goto use_clean;
                }
 
                c->journal_keys = journal_keys_sort(&c->journal_entries);
                        goto err;
                }
 
-               j = &list_last_entry(&c->journal_entries,
-                                    struct journal_replay, list)->j;
-
-               ret = verify_superblock_clean(c, &clean, j);
-               if (ret)
+               if (c->sb.clean && last_journal_entry) {
+                       ret = verify_superblock_clean(c, &clean,
+                                                     last_journal_entry);
+                       if (ret)
+                               goto err;
+               }
+       } else {
+use_clean:
+               if (!clean) {
+                       bch_err(c, "no superblock clean section found");
+                       ret = BCH_FSCK_REPAIR_IMPOSSIBLE;
                        goto err;
-
-               journal_seq = le64_to_cpu(j->seq) + 1;
-       } else {
-               journal_seq = le64_to_cpu(clean->journal_seq) + 1;
+               }
+               blacklist_seq = journal_seq = le64_to_cpu(clean->journal_seq) + 1;
        }
 
        if (!c->sb.clean &&
        if (ret)
                goto err;
 
-       if (!c->sb.clean) {
+       /*
+        * After an unclean shutdown, skip the next few journal sequence
+        * numbers as they may have been referenced by btree writes that
+        * happened before their corresponding journal writes - those btree
+        * writes need to be ignored, by skipping and blacklisting those
+        * sequence numbers:
+        */
+       if (!c->sb.clean)
+               journal_seq += 8;
+
+       if (blacklist_seq != journal_seq) {
                ret = bch2_journal_seq_blacklist_add(c,
-                                                    journal_seq,
-                                                    journal_seq + 8);
+                                       blacklist_seq, journal_seq);
                if (ret) {
                        bch_err(c, "error creating new journal seq blacklist entry");
                        goto err;
                }
-
-               journal_seq += 8;
-
-               /*
-                * The superblock needs to be written before we do any btree
-                * node writes: it will be in the read_write() path
-                */
-       }
-
-       ret = bch2_blacklist_table_initialize(c);
-
-       if (!list_empty(&c->journal_entries)) {
-               ret = verify_journal_entries_not_blacklisted_or_missing(c,
-                                                       &c->journal_entries);
-               if (ret)
-                       goto err;
        }
 
        ret = bch2_fs_journal_start(&c->journal, journal_seq,