bcachefs: Don't require flush/fua on every journal write
author Kent Overstreet <kent.overstreet@gmail.com>
Sat, 14 Nov 2020 14:59:58 +0000 (09:59 -0500)
committer Kent Overstreet <kent.overstreet@linux.dev>
Sun, 22 Oct 2023 21:08:49 +0000 (17:08 -0400)
This patch adds a flag to journal entries which, if set, indicates that
they weren't done as flush/fua writes.
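
On disk the flag is a single bit in the jset flags word; the accessor
pair is generated by LE32_BITMASK() (see the bcachefs_format.h hunk
below):

    /* bit 5 of struct jset's flags field: */
    LE32_BITMASK(JSET_NO_FLUSH,    struct jset, flags, 5, 6);

    /* this generates JSET_NO_FLUSH(j) and SET_JSET_NO_FLUSH(j, v) */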

 - non-flush/fua journal writes don't update last_seq (i.e. they don't
   free up space in the journal), so the journal free space
   calculations now check whether non-flush journal writes are
   currently allowed (i.e. are we low on free space, or would a flush
   write free up a lot of space in the journal)

 - write_delay_ms, the user-configurable option for when open journal
   entries are automatically written, is now interpreted as the maximum
   delay between flush journal writes (default 1 second).

 - bch2_journal_flush_seq_async is changed to ensure that a flush write
   covering a sequence number >= the one requested has completed

 - journal read/replay must now ignore, and blacklist, any journal
   entries newer than the most recent flush entry in the journal. The
   handling of the read_entire_journal option has also been improved:
   struct journal_replay now has a new field, 'ignore', for entries
   that were read but should not be used (see the read-side sketch
   after this list).

 - assorted refactoring and improvements related to journal read in
   journal_io.c and recovery.c
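
For reference, a condensed sketch of the new read-side filtering
(simplified from the bch2_journal_read() changes in the journal_io.c
hunk below; the helper name is hypothetical, and locking, fsck
reporting and memory management are elided):

    /*
     * Walk the journal entries newest-to-oldest until we find the most
     * recent flush entry; anything newer was a noflush write, so mark
     * it 'ignore' (it will also be blacklisted):
     */
    static void drop_unflushed_entries(struct bch_fs *c,
                                       struct list_head *list,
                                       u64 *last_seq, u64 *blacklist_seq)
    {
            struct journal_replay *i, *t;

            list_for_each_entry_safe_reverse(i, t, list, list) {
                    if (!JSET_NO_FLUSH(&i->j)) {
                            *last_seq       = le64_to_cpu(i->j.last_seq);
                            *blacklist_seq  = le64_to_cpu(i->j.seq) + 1;
                            break;
                    }

                    journal_replay_free(c, i);
            }
    }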

Previously, we had to issue a flush/fua write every time we accumulated
a full journal entry - typically the bucket size. Now we need to issue
them much less frequently: when an fsync is requested, when it's been
more than write_delay_ms since the last flush, or when we need to free
up space in the journal. This is a significant performance improvement
on many write-heavy workloads.
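
The write-side decision reduces to a cheap check at journal write time;
here is a condensed sketch of the logic this patch adds to
bch2_journal_write() (the can_skip_flush variable is introduced for
clarity and doesn't appear in the patch):

    /* May this journal write skip flush/fua? */
    bool can_skip_flush =
            (c->sb.features & (1ULL << BCH_FEATURE_journal_no_flush)) &&
            !w->must_flush &&  /* nothing (e.g. fsync) demanded a flush */
            (jiffies - j->last_flush_write) <
                    msecs_to_jiffies(j->write_delay_ms) &&
            test_bit(JOURNAL_MAY_SKIP_FLUSH, &j->flags);

    if (can_skip_flush) {
            w->noflush = true;
            SET_JSET_NO_FLUSH(jset, true);
            /* a noflush entry can't advance last_seq: */
            jset->last_seq = cpu_to_le64(j->last_seq_ondisk);
    } else {
            j->last_flush_write = jiffies;
    }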

Signed-off-by: Kent Overstreet <kent.overstreet@gmail.com>
Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
fs/bcachefs/bcachefs_format.h
fs/bcachefs/journal.c
fs/bcachefs/journal.h
fs/bcachefs/journal_io.c
fs/bcachefs/journal_io.h
fs/bcachefs/journal_reclaim.c
fs/bcachefs/journal_seq_blacklist.c
fs/bcachefs/journal_types.h
fs/bcachefs/recovery.c

fs/bcachefs/bcachefs_format.h
index f072e865e43f70bc9c5200e73840ed188f1f6d0a..7df2bc7ecd4f49320648316db117d92f85fa9b68 100644
@@ -1336,14 +1336,16 @@ LE64_BITMASK(BCH_SB_ERASURE_CODE,       struct bch_sb, flags[3],  0, 16);
        x(extents_above_btree_updates,  12)     \
        x(btree_updates_journalled,     13)     \
        x(reflink_inline_data,          14)     \
-       x(new_varint,                   15)
+       x(new_varint,                   15)     \
+       x(journal_no_flush,             16)
 
 #define BCH_SB_FEATURES_ALL                            \
        ((1ULL << BCH_FEATURE_new_siphash)|             \
         (1ULL << BCH_FEATURE_new_extent_overwrite)|    \
         (1ULL << BCH_FEATURE_btree_ptr_v2)|            \
         (1ULL << BCH_FEATURE_extents_above_btree_updates)|\
-        (1ULL << BCH_FEATURE_new_varint))\
+        (1ULL << BCH_FEATURE_new_varint)|              \
+        (1ULL << BCH_FEATURE_journal_no_flush))
 
 enum bch_sb_feature {
 #define x(f, n) BCH_FEATURE_##f,
@@ -1582,6 +1584,7 @@ struct jset {
 
 LE32_BITMASK(JSET_CSUM_TYPE,   struct jset, flags, 0, 4);
 LE32_BITMASK(JSET_BIG_ENDIAN,  struct jset, flags, 4, 5);
+LE32_BITMASK(JSET_NO_FLUSH,    struct jset, flags, 5, 6);
 
 #define BCH_JOURNAL_BUCKETS_MIN                8
 
fs/bcachefs/journal.c
index 3bbb23d7739a78a89ef7cbf59f2ea2beffa55632..31168754d6b8395869696a66f7161252f1c3806f 100644
@@ -79,6 +79,8 @@ static void bch2_journal_buf_init(struct journal *j)
        struct journal_buf *buf = journal_cur_buf(j);
 
        bkey_extent_init(&buf->key);
+       buf->noflush    = false;
+       buf->must_flush = false;
 
        memset(buf->has_inode, 0, sizeof(buf->has_inode));
 
@@ -574,7 +576,7 @@ int bch2_journal_flush_seq_async(struct journal *j, u64 seq,
        struct journal_buf *buf;
        int ret = 0;
 
-       if (seq <= j->seq_ondisk)
+       if (seq <= j->flushed_seq_ondisk)
                return 1;
 
        spin_lock(&j->lock);
@@ -585,16 +587,53 @@ int bch2_journal_flush_seq_async(struct journal *j, u64 seq,
                goto out;
        }
 
-       if (seq <= j->seq_ondisk) {
+       if (seq <= j->flushed_seq_ondisk) {
                ret = 1;
                goto out;
        }
 
-       if (parent &&
-           (buf = journal_seq_to_buf(j, seq)))
-               if (!closure_wait(&buf->wait, parent))
+       /* if seq was written, but not flushed - flush a newer one instead */
+       seq = max(seq, last_unwritten_seq(j));
+
+recheck_need_open:
+       if (seq == journal_cur_seq(j) && !journal_entry_is_open(j)) {
+               struct journal_res res = { 0 };
+
+               spin_unlock(&j->lock);
+
+               ret = bch2_journal_res_get(j, &res, jset_u64s(0), 0);
+               if (ret)
+                       return ret;
+
+               seq = res.seq;
+               buf = j->buf + (seq & JOURNAL_BUF_MASK);
+               buf->must_flush = true;
+               set_bit(JOURNAL_NEED_WRITE, &j->flags);
+
+               if (parent && !closure_wait(&buf->wait, parent))
                        BUG();
 
+               bch2_journal_res_put(j, &res);
+
+               spin_lock(&j->lock);
+               goto want_write;
+       }
+
+       /*
+        * if write was kicked off without a flush, flush the next sequence
+        * number instead
+        */
+       buf = journal_seq_to_buf(j, seq);
+       if (buf->noflush) {
+               seq++;
+               goto recheck_need_open;
+       }
+
+       buf->must_flush = true;
+
+       if (parent && !closure_wait(&buf->wait, parent))
+               BUG();
+want_write:
        if (seq == journal_cur_seq(j))
                journal_entry_want_write(j);
 out:
@@ -979,6 +1018,7 @@ int bch2_fs_journal_start(struct journal *j, u64 cur_seq,
        spin_lock(&j->lock);
 
        set_bit(JOURNAL_STARTED, &j->flags);
+       j->last_flush_write = jiffies;
 
        journal_pin_new_entry(j, 1);
 
@@ -1116,6 +1156,8 @@ void bch2_journal_debug_to_text(struct printbuf *out, struct journal *j)
               "last_seq:\t\t%llu\n"
               "last_seq_ondisk:\t%llu\n"
               "prereserved:\t\t%u/%u\n"
+              "nr flush writes:\t%llu\n"
+              "nr noflush writes:\t%llu\n"
               "nr direct reclaim:\t%llu\n"
               "nr background reclaim:\t%llu\n"
               "current entry sectors:\t%u\n"
@@ -1127,6 +1169,8 @@ void bch2_journal_debug_to_text(struct printbuf *out, struct journal *j)
               j->last_seq_ondisk,
               j->prereserved.reserved,
               j->prereserved.remaining,
+              j->nr_flush_writes,
+              j->nr_noflush_writes,
               j->nr_direct_reclaim,
               j->nr_background_reclaim,
               j->cur_entry_sectors,
fs/bcachefs/journal.h
index 1b6175cd6f1bd3759de1e606c54ffb3d8767d0e7..2c0014c3c02f58f1e01244e0673991ea2486d673 100644
@@ -136,7 +136,7 @@ static inline u64 journal_last_seq(struct journal *j)
 
 static inline u64 journal_cur_seq(struct journal *j)
 {
-       BUG_ON(j->pin.back - 1 != atomic64_read(&j->seq));
+       EBUG_ON(j->pin.back - 1 != atomic64_read(&j->seq));
 
        return j->pin.back - 1;
 }
fs/bcachefs/journal_io.c
index 1aeeb58d3c2aeae276e793fc1ca820e73d9e5f19..26556bb381b2725506f17c760794ae1173b1b887 100644
 #include "journal.h"
 #include "journal_io.h"
 #include "journal_reclaim.h"
+#include "journal_seq_blacklist.h"
 #include "replicas.h"
 #include "trace.h"
 
+static void __journal_replay_free(struct journal_replay *i)
+{
+       list_del(&i->list);
+       kvpfree(i, offsetof(struct journal_replay, j) +
+               vstruct_bytes(&i->j));
+}
+
+static void journal_replay_free(struct bch_fs *c, struct journal_replay *i)
+{
+       i->ignore = true;
+
+       if (!c->opts.read_entire_journal)
+               __journal_replay_free(i);
+}
+
 struct journal_list {
        struct closure          cl;
        struct mutex            lock;
@@ -35,28 +52,29 @@ static int journal_entry_add(struct bch_fs *c, struct bch_dev *ca,
        struct bch_devs_list devs = { .nr = 0 };
        struct list_head *where;
        size_t bytes = vstruct_bytes(j);
-       __le64 last_seq;
+       u64 last_seq = 0;
        int ret;
 
-       last_seq = !list_empty(jlist->head)
-               ? list_last_entry(jlist->head, struct journal_replay,
-                                 list)->j.last_seq
-               : 0;
-
-       if (!c->opts.read_entire_journal) {
-               /* Is this entry older than the range we need? */
-               if (le64_to_cpu(j->seq) < le64_to_cpu(last_seq)) {
-                       ret = JOURNAL_ENTRY_ADD_OUT_OF_RANGE;
-                       goto out;
+       list_for_each_entry_reverse(i, jlist->head, list) {
+               if (!JSET_NO_FLUSH(&i->j)) {
+                       last_seq = le64_to_cpu(i->j.last_seq);
+                       break;
                }
+       }
 
-               /* Drop entries we don't need anymore */
+       /* Is this entry older than the range we need? */
+       if (!c->opts.read_entire_journal &&
+           le64_to_cpu(j->seq) < last_seq) {
+               ret = JOURNAL_ENTRY_ADD_OUT_OF_RANGE;
+               goto out;
+       }
+
+       /* Drop entries we don't need anymore */
+       if (!JSET_NO_FLUSH(j)) {
                list_for_each_entry_safe(i, pos, jlist->head, list) {
                        if (le64_to_cpu(i->j.seq) >= le64_to_cpu(j->last_seq))
                                break;
-                       list_del(&i->list);
-                       kvpfree(i, offsetof(struct journal_replay, j) +
-                               vstruct_bytes(&i->j));
+                       journal_replay_free(c, i);
                }
        }
 
@@ -80,9 +98,7 @@ add:
        if (i && le64_to_cpu(j->seq) == le64_to_cpu(i->j.seq)) {
                if (i->bad) {
                        devs = i->devs;
-                       list_del(&i->list);
-                       kvpfree(i, offsetof(struct journal_replay, j) +
-                               vstruct_bytes(&i->j));
+                       __journal_replay_free(i);
                } else if (bad) {
                        goto found;
                } else {
@@ -104,6 +120,7 @@ add:
        list_add(&i->list, where);
        i->devs = devs;
        i->bad  = bad;
+       i->ignore = false;
        unsafe_memcpy(&i->j, j, bytes, "embedded variable length struct");
 found:
        if (!bch2_dev_list_has_dev(i->devs, ca->dev_idx))
@@ -698,14 +715,16 @@ err:
        goto out;
 }
 
-int bch2_journal_read(struct bch_fs *c, struct list_head *list)
+int bch2_journal_read(struct bch_fs *c, struct list_head *list,
+                     u64 *blacklist_seq, u64 *start_seq)
 {
        struct journal_list jlist;
-       struct journal_replay *i;
+       struct journal_replay *i, *t;
        struct bch_dev *ca;
        unsigned iter;
        size_t keys = 0, entries = 0;
        bool degraded = false;
+       u64 seq, last_seq = 0;
        int ret = 0;
 
        closure_init_stack(&jlist.cl);
@@ -734,12 +753,97 @@ int bch2_journal_read(struct bch_fs *c, struct list_head *list)
        if (jlist.ret)
                return jlist.ret;
 
+       if (list_empty(list)) {
+               bch_info(c, "journal read done, but no entries found");
+               return 0;
+       }
+
+       i = list_last_entry(list, struct journal_replay, list);
+       *start_seq = le64_to_cpu(i->j.seq) + 1;
+
+       /*
+        * Find most recent flush entry, and ignore newer non flush entries -
+        * those entries will be blacklisted:
+        */
+       list_for_each_entry_safe_reverse(i, t, list, list) {
+               if (i->ignore)
+                       continue;
+
+               if (!JSET_NO_FLUSH(&i->j)) {
+                       last_seq        = le64_to_cpu(i->j.last_seq);
+                       *blacklist_seq  = le64_to_cpu(i->j.seq) + 1;
+                       break;
+               }
+
+               journal_replay_free(c, i);
+       }
+
+       if (!last_seq) {
+               fsck_err(c, "journal read done, but no entries found after dropping non-flushes");
+               return -1;
+       }
+
+       /* Drop blacklisted entries and entries older than last_seq: */
+       list_for_each_entry_safe(i, t, list, list) {
+               if (i->ignore)
+                       continue;
+
+               seq = le64_to_cpu(i->j.seq);
+               if (seq < last_seq) {
+                       journal_replay_free(c, i);
+                       continue;
+               }
+
+               if (bch2_journal_seq_is_blacklisted(c, seq, true)) {
+                       fsck_err_on(!JSET_NO_FLUSH(&i->j), c,
+                                   "found blacklisted journal entry %llu", seq);
+
+                       journal_replay_free(c, i);
+               }
+       }
+
+       /* Check for missing entries: */
+       seq = last_seq;
+       list_for_each_entry(i, list, list) {
+               if (i->ignore)
+                       continue;
+
+               BUG_ON(seq > le64_to_cpu(i->j.seq));
+
+               while (seq < le64_to_cpu(i->j.seq)) {
+                       u64 missing_start, missing_end;
+
+                       while (seq < le64_to_cpu(i->j.seq) &&
+                              bch2_journal_seq_is_blacklisted(c, seq, false))
+                               seq++;
+
+                       if (seq == le64_to_cpu(i->j.seq))
+                               break;
+
+                       missing_start = seq;
+
+                       while (seq < le64_to_cpu(i->j.seq) &&
+                              !bch2_journal_seq_is_blacklisted(c, seq, false))
+                               seq++;
+
+                       missing_end = seq - 1;
+                       fsck_err(c, "journal entries %llu-%llu missing! (replaying %llu-%llu)",
+                                missing_start, missing_end,
+                                last_seq, *blacklist_seq - 1);
+               }
+
+               seq++;
+       }
+
        list_for_each_entry(i, list, list) {
                struct jset_entry *entry;
                struct bkey_i *k, *_n;
                struct bch_replicas_padded replicas;
                char buf[80];
 
+               if (i->ignore)
+                       continue;
+
                ret = jset_validate_entries(c, &i->j, READ);
                if (ret)
                        goto fsck_err;
@@ -767,12 +871,12 @@ int bch2_journal_read(struct bch_fs *c, struct list_head *list)
                entries++;
        }
 
-       if (!list_empty(list)) {
-               i = list_last_entry(list, struct journal_replay, list);
+       bch_info(c, "journal read done, %zu keys in %zu entries, seq %llu",
+                keys, entries, *start_seq);
 
-               bch_info(c, "journal read done, %zu keys in %zu entries, seq %llu",
-                        keys, entries, le64_to_cpu(i->j.seq));
-       }
+       if (*start_seq != *blacklist_seq)
+               bch_info(c, "dropped unflushed entries %llu-%llu",
+                        *blacklist_seq, *start_seq - 1);
 fsck_err:
        return ret;
 }
@@ -990,8 +1094,12 @@ static void journal_write_done(struct closure *cl)
        j->seq_ondisk           = seq;
        if (err && (!j->err_seq || seq < j->err_seq))
                j->err_seq      = seq;
-       j->last_seq_ondisk      = last_seq;
-       bch2_journal_space_available(j);
+
+       if (!w->noflush) {
+               j->flushed_seq_ondisk = seq;
+               j->last_seq_ondisk = last_seq;
+               bch2_journal_space_available(j);
+       }
 
        /*
         * Updating last_seq_ondisk may let bch2_journal_reclaim_work() discard
@@ -1067,6 +1175,22 @@ void bch2_journal_write(struct closure *cl)
 
        j->write_start_time = local_clock();
 
+       spin_lock(&j->lock);
+       if (c->sb.features & (1ULL << BCH_FEATURE_journal_no_flush) &&
+           !w->must_flush &&
+           (jiffies - j->last_flush_write) < msecs_to_jiffies(j->write_delay_ms) &&
+           test_bit(JOURNAL_MAY_SKIP_FLUSH, &j->flags)) {
+               w->noflush = true;
+               SET_JSET_NO_FLUSH(jset, true);
+               jset->last_seq = cpu_to_le64(j->last_seq_ondisk);
+
+               j->nr_noflush_writes++;
+       } else {
+               j->last_flush_write = jiffies;
+               j->nr_flush_writes++;
+       }
+       spin_unlock(&j->lock);
+
        /*
         * New btree roots are set by journalling them; when the journal entry
         * gets written we have to propagate them to c->btree_roots
@@ -1183,11 +1307,12 @@ retry_alloc:
                             sectors);
 
                bio = ca->journal.bio;
-               bio_reset(bio, ca->disk_sb.bdev,
-                         REQ_OP_WRITE|REQ_SYNC|REQ_META|REQ_PREFLUSH|REQ_FUA);
+               bio_reset(bio, ca->disk_sb.bdev, REQ_OP_WRITE|REQ_SYNC|REQ_META);
                bio->bi_iter.bi_sector  = ptr->offset;
                bio->bi_end_io          = journal_write_endio;
                bio->bi_private         = ca;
+               if (!JSET_NO_FLUSH(jset))
+                       bio->bi_opf    |= REQ_PREFLUSH|REQ_FUA;
                bch2_bio_map(bio, jset, sectors << 9);
 
                trace_journal_write(bio);
@@ -1196,18 +1321,19 @@ retry_alloc:
                ca->journal.bucket_seq[ca->journal.cur_idx] = le64_to_cpu(jset->seq);
        }
 
-       for_each_rw_member(ca, c, i)
-               if (journal_flushes_device(ca) &&
-                   !bch2_bkey_has_device(bkey_i_to_s_c(&w->key), i)) {
-                       percpu_ref_get(&ca->io_ref);
-
-                       bio = ca->journal.bio;
-                       bio_reset(bio, ca->disk_sb.bdev, REQ_OP_FLUSH);
-                       bio->bi_end_io          = journal_write_endio;
-                       bio->bi_private         = ca;
-                       closure_bio_submit(bio, cl);
-               }
-
+       if (!JSET_NO_FLUSH(jset)) {
+               for_each_rw_member(ca, c, i)
+                       if (journal_flushes_device(ca) &&
+                           !bch2_bkey_has_device(bkey_i_to_s_c(&w->key), i)) {
+                               percpu_ref_get(&ca->io_ref);
+
+                               bio = ca->journal.bio;
+                               bio_reset(bio, ca->disk_sb.bdev, REQ_OP_FLUSH);
+                               bio->bi_end_io          = journal_write_endio;
+                               bio->bi_private         = ca;
+                               closure_bio_submit(bio, cl);
+                       }
+       }
 no_io:
        bch2_bucket_seq_cleanup(c);
 
fs/bcachefs/journal_io.h
index 6958ee0f8cf23da1ab5a9c0588fedb3d8679678c..6b4c80968f52064370c4d3b767db29cd3cb59bed 100644
@@ -11,6 +11,7 @@ struct journal_replay {
        struct bch_devs_list    devs;
        /* checksum error, but we may want to try using it anyways: */
        bool                    bad;
+       bool                    ignore;
        /* must be last: */
        struct jset             j;
 };
@@ -37,7 +38,7 @@ static inline struct jset_entry *__jset_entry_type_next(struct jset *jset,
        for_each_jset_entry_type(entry, jset, BCH_JSET_ENTRY_btree_keys)        \
                vstruct_for_each_safe(entry, k, _n)
 
-int bch2_journal_read(struct bch_fs *, struct list_head *);
+int bch2_journal_read(struct bch_fs *, struct list_head *, u64 *, u64 *);
 
 void bch2_journal_write(struct closure *);
 
fs/bcachefs/journal_reclaim.c
index c6267284a02807b24fd08acee8c5f8449e52f003..a3d5405991b9c98edbe64af71a51421d883a0c20 100644
@@ -158,7 +158,7 @@ void bch2_journal_space_available(struct journal *j)
 {
        struct bch_fs *c = container_of(j, struct bch_fs, journal);
        struct bch_dev *ca;
-       unsigned clean;
+       unsigned clean, clean_ondisk, total;
        unsigned overhead, u64s_remaining = 0;
        unsigned max_entry_size  = min(j->buf[0].buf_size >> 9,
                                       j->buf[1].buf_size >> 9);
@@ -204,13 +204,21 @@ void bch2_journal_space_available(struct journal *j)
        for (i = 0; i < journal_space_nr; i++)
                j->space[i] = __journal_space_available(j, nr_devs_want, i);
 
+       clean_ondisk    = j->space[journal_space_clean_ondisk].total;
        clean           = j->space[journal_space_clean].total;
+       total           = j->space[journal_space_total].total;
 
        if (!j->space[journal_space_discarded].next_entry)
                ret = cur_entry_journal_full;
        else if (!fifo_free(&j->pin))
                ret = cur_entry_journal_pin_full;
 
+       if ((clean - clean_ondisk <= total / 8) &&
+           (clean_ondisk * 2 > clean))
+               set_bit(JOURNAL_MAY_SKIP_FLUSH, &j->flags);
+       else
+               clear_bit(JOURNAL_MAY_SKIP_FLUSH, &j->flags);
+
        overhead = DIV_ROUND_UP(clean, max_entry_size) *
                journal_entry_overhead(j);
        u64s_remaining = clean << 6;
fs/bcachefs/journal_seq_blacklist.c
index d0f1bbf8f6a7984ff5f96d997235b49d484d2eee..e1b63f3879f44e50cc2fdd92ca3de8db03a3c7fa 100644
@@ -118,7 +118,7 @@ out_write_sb:
 out:
        mutex_unlock(&c->sb_lock);
 
-       return ret;
+       return ret ?: bch2_blacklist_table_initialize(c);
 }
 
 static int journal_seq_blacklist_table_cmp(const void *_l,
@@ -164,8 +164,6 @@ int bch2_blacklist_table_initialize(struct bch_fs *c)
        struct journal_seq_blacklist_table *t;
        unsigned i, nr = blacklist_nr_entries(bl);
 
-       BUG_ON(c->journal_seq_blacklist_table);
-
        if (!bl)
                return 0;
 
@@ -187,6 +185,7 @@ int bch2_blacklist_table_initialize(struct bch_fs *c)
                        journal_seq_blacklist_table_cmp,
                        NULL);
 
+       kfree(c->journal_seq_blacklist_table);
        c->journal_seq_blacklist_table = t;
        return 0;
 }
fs/bcachefs/journal_types.h
index 6b525dc6ab7cae73a7b5b731a244fbb083d3e051..cf9675310f2b6654b2339d55fa764b3d6ff669a0 100644
@@ -29,6 +29,8 @@ struct journal_buf {
        unsigned                disk_sectors;   /* maximum size entry could have been, if
                                                   buf_size was bigger */
        unsigned                u64s_reserved;
+       bool                    noflush;        /* write has already been kicked off, and was noflush */
+       bool                    must_flush;     /* something wants a flush */
        /* bloom filter: */
        unsigned long           has_inode[1024 / sizeof(unsigned long)];
 };
@@ -146,6 +148,7 @@ enum {
        JOURNAL_RECLAIM_STARTED,
        JOURNAL_NEED_WRITE,
        JOURNAL_MAY_GET_UNRESERVED,
+       JOURNAL_MAY_SKIP_FLUSH,
 };
 
 /* Embedded in struct bch_fs */
@@ -203,6 +206,7 @@ struct journal {
 
        /* seq, last_seq from the most recent journal entry successfully written */
        u64                     seq_ondisk;
+       u64                     flushed_seq_ondisk;
        u64                     last_seq_ondisk;
        u64                     err_seq;
        u64                     last_empty_seq;
@@ -252,11 +256,15 @@ struct journal {
 
        unsigned                write_delay_ms;
        unsigned                reclaim_delay_ms;
+       unsigned long           last_flush_write;
 
        u64                     res_get_blocked_start;
        u64                     need_write_time;
        u64                     write_start_time;
 
+       u64                     nr_flush_writes;
+       u64                     nr_noflush_writes;
+
        struct bch2_time_stats  *write_time;
        struct bch2_time_stats  *delay_time;
        struct bch2_time_stats  *blocked_time;
fs/bcachefs/recovery.c
index 7ad5b823474766397b62a8bda1cb3298ebbef353..ecd51d45743a3705da4f1a6261b2469876f73add 100644
@@ -313,7 +313,7 @@ void bch2_journal_keys_free(struct journal_keys *keys)
 
 static struct journal_keys journal_keys_sort(struct list_head *journal_entries)
 {
-       struct journal_replay *p;
+       struct journal_replay *i;
        struct jset_entry *entry;
        struct bkey_i *k, *_n;
        struct journal_keys keys = { NULL };
@@ -323,35 +323,35 @@ static struct journal_keys journal_keys_sort(struct list_head *journal_entries)
        if (list_empty(journal_entries))
                return keys;
 
-       keys.journal_seq_base =
-               le64_to_cpu(list_last_entry(journal_entries,
-                               struct journal_replay, list)->j.last_seq);
-
-       list_for_each_entry(p, journal_entries, list) {
-               if (le64_to_cpu(p->j.seq) < keys.journal_seq_base)
+       list_for_each_entry(i, journal_entries, list) {
+               if (i->ignore)
                        continue;
 
-               for_each_jset_key(k, _n, entry, &p->j)
+               if (!keys.journal_seq_base)
+                       keys.journal_seq_base = le64_to_cpu(i->j.seq);
+
+               for_each_jset_key(k, _n, entry, &i->j)
                        nr_keys++;
        }
 
-
        keys.d = kvmalloc(sizeof(keys.d[0]) * nr_keys, GFP_KERNEL);
        if (!keys.d)
                goto err;
 
-       list_for_each_entry(p, journal_entries, list) {
-               if (le64_to_cpu(p->j.seq) < keys.journal_seq_base)
+       list_for_each_entry(i, journal_entries, list) {
+               if (i->ignore)
                        continue;
 
-               for_each_jset_key(k, _n, entry, &p->j)
+               BUG_ON(le64_to_cpu(i->j.seq) - keys.journal_seq_base > U32_MAX);
+
+               for_each_jset_key(k, _n, entry, &i->j)
                        keys.d[keys.nr++] = (struct journal_key) {
                                .btree_id       = entry->btree_id,
                                .level          = entry->level,
                                .k              = k,
-                               .journal_seq    = le64_to_cpu(p->j.seq) -
+                               .journal_seq    = le64_to_cpu(i->j.seq) -
                                        keys.journal_seq_base,
-                               .journal_offset = k->_data - p->j._data,
+                               .journal_offset = k->_data - i->j._data,
                        };
        }
 
@@ -643,46 +643,6 @@ err:
        return ret;
 }
 
-static bool journal_empty(struct list_head *journal)
-{
-       return list_empty(journal) ||
-               journal_entry_empty(&list_last_entry(journal,
-                                       struct journal_replay, list)->j);
-}
-
-static int
-verify_journal_entries_not_blacklisted_or_missing(struct bch_fs *c,
-                                                 struct list_head *journal)
-{
-       struct journal_replay *i =
-               list_last_entry(journal, struct journal_replay, list);
-       u64 start_seq   = le64_to_cpu(i->j.last_seq);
-       u64 end_seq     = le64_to_cpu(i->j.seq);
-       u64 seq         = start_seq;
-       int ret = 0;
-
-       list_for_each_entry(i, journal, list) {
-               if (le64_to_cpu(i->j.seq) < start_seq)
-                       continue;
-
-               fsck_err_on(seq != le64_to_cpu(i->j.seq), c,
-                       "journal entries %llu-%llu missing! (replaying %llu-%llu)",
-                       seq, le64_to_cpu(i->j.seq) - 1,
-                       start_seq, end_seq);
-
-               seq = le64_to_cpu(i->j.seq);
-
-               fsck_err_on(bch2_journal_seq_is_blacklisted(c, seq, false), c,
-                           "found blacklisted journal entry %llu", seq);
-
-               do {
-                       seq++;
-               } while (bch2_journal_seq_is_blacklisted(c, seq, false));
-       }
-fsck_err:
-       return ret;
-}
-
 /* journal replay early: */
 
 static int journal_replay_entry_early(struct bch_fs *c,
@@ -767,6 +727,7 @@ static int journal_replay_early(struct bch_fs *c,
                                struct bch_sb_field_clean *clean,
                                struct list_head *journal)
 {
+       struct journal_replay *i;
        struct jset_entry *entry;
        int ret;
 
@@ -782,18 +743,19 @@ static int journal_replay_early(struct bch_fs *c,
                                return ret;
                }
        } else {
-               struct journal_replay *i =
-                       list_last_entry(journal, struct journal_replay, list);
+               list_for_each_entry(i, journal, list) {
+                       if (i->ignore)
+                               continue;
 
-               c->bucket_clock[READ].hand = le16_to_cpu(i->j.read_clock);
-               c->bucket_clock[WRITE].hand = le16_to_cpu(i->j.write_clock);
+                       c->bucket_clock[READ].hand = le16_to_cpu(i->j.read_clock);
+                       c->bucket_clock[WRITE].hand = le16_to_cpu(i->j.write_clock);
 
-               list_for_each_entry(i, journal, list)
                        vstruct_for_each(&i->j, entry) {
                                ret = journal_replay_entry_early(c, entry);
                                if (ret)
                                        return ret;
                        }
+               }
        }
 
        bch2_fs_usage_initialize(c);
@@ -842,9 +804,6 @@ static int verify_superblock_clean(struct bch_fs *c,
        struct bch_sb_field_clean *clean = *cleanp;
        int ret = 0;
 
-       if (!c->sb.clean || !j)
-               return 0;
-
        if (mustfix_fsck_err_on(j->seq != clean->journal_seq, c,
                        "superblock journal seq (%llu) doesn't match journal (%llu) after clean shutdown",
                        le64_to_cpu(clean->journal_seq),
@@ -971,7 +930,8 @@ int bch2_fs_recovery(struct bch_fs *c)
 {
        const char *err = "cannot allocate memory";
        struct bch_sb_field_clean *clean = NULL;
-       u64 journal_seq;
+       struct jset *last_journal_entry = NULL;
+       u64 blacklist_seq, journal_seq;
        bool write_sb = false, need_write_alloc = false;
        int ret;
 
@@ -991,24 +951,38 @@ int bch2_fs_recovery(struct bch_fs *c)
                set_bit(BCH_FS_REBUILD_REPLICAS, &c->flags);
        }
 
+       ret = bch2_blacklist_table_initialize(c);
+       if (ret) {
+               bch_err(c, "error initializing blacklist table");
+               goto err;
+       }
+
        if (!c->sb.clean || c->opts.fsck || c->opts.keep_journal) {
-               struct jset *j;
+               struct journal_replay *i;
 
-               ret = bch2_journal_read(c, &c->journal_entries);
+               ret = bch2_journal_read(c, &c->journal_entries,
+                                       &blacklist_seq, &journal_seq);
                if (ret)
                        goto err;
 
-               if (mustfix_fsck_err_on(c->sb.clean && !journal_empty(&c->journal_entries), c,
+               list_for_each_entry_reverse(i, &c->journal_entries, list)
+                       if (!i->ignore) {
+                               last_journal_entry = &i->j;
+                               break;
+                       }
+
+               if (mustfix_fsck_err_on(c->sb.clean &&
+                                       last_journal_entry &&
+                                       !journal_entry_empty(last_journal_entry), c,
                                "filesystem marked clean but journal not empty")) {
                        c->sb.compat &= ~(1ULL << BCH_COMPAT_FEAT_ALLOC_INFO);
                        SET_BCH_SB_CLEAN(c->disk_sb.sb, false);
                        c->sb.clean = false;
                }
 
-               if (!c->sb.clean && list_empty(&c->journal_entries)) {
-                       bch_err(c, "no journal entries found");
-                       ret = BCH_FSCK_REPAIR_IMPOSSIBLE;
-                       goto err;
+               if (!last_journal_entry) {
+                       fsck_err_on(!c->sb.clean, c, "no journal entries found");
+                       goto use_clean;
                }
 
                c->journal_keys = journal_keys_sort(&c->journal_entries);
@@ -1017,16 +991,21 @@ int bch2_fs_recovery(struct bch_fs *c)
                        goto err;
                }
 
-               j = &list_last_entry(&c->journal_entries,
-                                    struct journal_replay, list)->j;
-
-               ret = verify_superblock_clean(c, &clean, j);
-               if (ret)
+               if (c->sb.clean && last_journal_entry) {
+                       ret = verify_superblock_clean(c, &clean,
+                                                     last_journal_entry);
+                       if (ret)
+                               goto err;
+               }
+       } else {
+use_clean:
+               if (!clean) {
+                       bch_err(c, "no superblock clean section found");
+                       ret = BCH_FSCK_REPAIR_IMPOSSIBLE;
                        goto err;
 
-               journal_seq = le64_to_cpu(j->seq) + 1;
-       } else {
-               journal_seq = le64_to_cpu(clean->journal_seq) + 1;
+               }
+               blacklist_seq = journal_seq = le64_to_cpu(clean->journal_seq) + 1;
        }
 
        if (!c->sb.clean &&
@@ -1045,30 +1024,23 @@ int bch2_fs_recovery(struct bch_fs *c)
        if (ret)
                goto err;
 
-       if (!c->sb.clean) {
+       /*
+        * After an unclean shutdown, skip the next few journal sequence
+        * numbers as they may have been referenced by btree writes that
+        * happened before their corresponding journal writes - those btree
+        * writes need to be ignored, by skipping and blacklisting the next few
+        * journal sequence numbers:
+        */
+       if (!c->sb.clean)
+               journal_seq += 8;
+
+       if (blacklist_seq != journal_seq) {
                ret = bch2_journal_seq_blacklist_add(c,
-                                                    journal_seq,
-                                                    journal_seq + 8);
+                                       blacklist_seq, journal_seq);
                if (ret) {
                        bch_err(c, "error creating new journal seq blacklist entry");
                        goto err;
                }
-
-               journal_seq += 8;
-
-               /*
-                * The superblock needs to be written before we do any btree
-                * node writes: it will be in the read_write() path
-                */
-       }
-
-       ret = bch2_blacklist_table_initialize(c);
-
-       if (!list_empty(&c->journal_entries)) {
-               ret = verify_journal_entries_not_blacklisted_or_missing(c,
-                                                       &c->journal_entries);
-               if (ret)
-                       goto err;
        }
 
        ret = bch2_fs_journal_start(&c->journal, journal_seq,