bcachefs: Correctly order flushes and journal writes on multi device filesystems
author Kent Overstreet <kent.overstreet@gmail.com>
Sat, 16 Jan 2021 20:40:33 +0000 (15:40 -0500)
committer Kent Overstreet <kent.overstreet@linux.dev>
Sun, 22 Oct 2023 21:08:51 +0000 (17:08 -0400)
All writes prior to a journal write need to be flushed before the
journal write itself happens. On single device filesystems, it suffices
to mark the write with REQ_PREFLUSH|REQ_FUA, but on multi device
filesystems we need to issue flushes to every device - and wait for them
to complete - before issuing the journal writes. Previously, we were
issuing flushes to every device, but we weren't waiting for them to
complete before issuing the journal writes.

Signed-off-by: Kent Overstreet <kent.overstreet@gmail.com>
Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
fs/bcachefs/io.c
fs/bcachefs/journal.c
fs/bcachefs/journal.h
fs/bcachefs/journal_io.c
fs/bcachefs/journal_types.h

index bc1e2dc048507b8a949487a79bdba6375cd9dd47..8a4d05eee38167405408e414941a6f6c495183a7 100644 (file)
@@ -509,9 +509,6 @@ void bch2_submit_wbio_replicas(struct bch_write_bio *wbio, struct bch_fs *c,
                n->submit_time          = local_clock();
                n->bio.bi_iter.bi_sector = ptr->offset;
 
-               if (!journal_flushes_device(ca))
-                       n->bio.bi_opf |= REQ_FUA;
-
                if (likely(n->have_ioref)) {
                        this_cpu_add(ca->io_done->sectors[WRITE][type],
                                     bio_sectors(&n->bio));
index e90fe042302fff29a04607c667b824ab66e8829a..6f84a5dd06bc405cf6a89295ec401040db2a1449 100644 (file)
@@ -81,6 +81,7 @@ static void bch2_journal_buf_init(struct journal *j)
        bkey_extent_init(&buf->key);
        buf->noflush    = false;
        buf->must_flush = false;
+       buf->separate_flush = false;
 
        memset(buf->has_inode, 0, sizeof(buf->has_inode));
 
index df353a18011bd4770b79e112d519fee40bc85d9d..547c735ce3cb635b065d94bfda6500d7423b5599 100644 (file)
@@ -496,11 +496,6 @@ static inline int bch2_journal_error(struct journal *j)
 
 struct bch_dev;
 
-static inline bool journal_flushes_device(struct bch_dev *ca)
-{
-       return true;
-}
-
 static inline void bch2_journal_set_replay_done(struct journal *j)
 {
        BUG_ON(!test_bit(JOURNAL_STARTED, &j->flags));
index f6c9681badea225a38be1166cfe844587b70ad4a..40da18d778a34afb72762024eff75035785233d2 100644 (file)
@@ -1188,6 +1188,51 @@ static void journal_write_endio(struct bio *bio)
        percpu_ref_put(&ca->io_ref);
 }
 
+static void do_journal_write(struct closure *cl)
+{
+       struct journal *j = container_of(cl, struct journal, io);
+       struct bch_fs *c = container_of(j, struct bch_fs, journal);
+       struct bch_dev *ca;
+       struct journal_buf *w = journal_last_unwritten_buf(j);
+       struct bch_extent_ptr *ptr;
+       struct bio *bio;
+       unsigned sectors = vstruct_sectors(w->data, c->block_bits);
+
+       extent_for_each_ptr(bkey_i_to_s_extent(&w->key), ptr) {
+               ca = bch_dev_bkey_exists(c, ptr->dev);
+               if (!percpu_ref_tryget(&ca->io_ref)) {
+                       /* XXX: fix this */
+                       bch_err(c, "missing device for journal write\n");
+                       continue;
+               }
+
+               this_cpu_add(ca->io_done->sectors[WRITE][BCH_DATA_journal],
+                            sectors);
+
+               bio = ca->journal.bio;
+               bio_reset(bio, ca->disk_sb.bdev, REQ_OP_WRITE|REQ_SYNC|REQ_META);
+               bio->bi_iter.bi_sector  = ptr->offset;
+               bio->bi_end_io          = journal_write_endio;
+               bio->bi_private         = ca;
+
+               if (!JSET_NO_FLUSH(w->data))
+                       bio->bi_opf    |= REQ_FUA;
+               if (!JSET_NO_FLUSH(w->data) && !w->separate_flush)
+                       bio->bi_opf    |= REQ_PREFLUSH;
+
+               bch2_bio_map(bio, w->data, sectors << 9);
+
+               trace_journal_write(bio);
+               closure_bio_submit(bio, cl);
+
+               ca->journal.bucket_seq[ca->journal.cur_idx] =
+                       le64_to_cpu(w->data->seq);
+       }
+
+       continue_at(cl, journal_write_done, system_highpri_wq);
+       return;
+}
+
 void bch2_journal_write(struct closure *cl)
 {
        struct journal *j = container_of(cl, struct journal, io);
@@ -1197,9 +1242,8 @@ void bch2_journal_write(struct closure *cl)
        struct jset_entry *start, *end;
        struct jset *jset;
        struct bio *bio;
-       struct bch_extent_ptr *ptr;
        bool validate_before_checksum = false;
-       unsigned i, sectors, bytes, u64s;
+       unsigned i, sectors, bytes, u64s, nr_rw_members = 0;
        int ret;
 
        BUG_ON(BCH_SB_CLEAN(c->disk_sb.sb));
@@ -1329,45 +1373,28 @@ retry_alloc:
        if (c->opts.nochanges)
                goto no_io;
 
-       extent_for_each_ptr(bkey_i_to_s_extent(&w->key), ptr) {
-               ca = bch_dev_bkey_exists(c, ptr->dev);
-               if (!percpu_ref_tryget(&ca->io_ref)) {
-                       /* XXX: fix this */
-                       bch_err(c, "missing device for journal write\n");
-                       continue;
-               }
-
-               this_cpu_add(ca->io_done->sectors[WRITE][BCH_DATA_journal],
-                            sectors);
+       for_each_rw_member(ca, c, i)
+               nr_rw_members++;
 
-               bio = ca->journal.bio;
-               bio_reset(bio, ca->disk_sb.bdev, REQ_OP_WRITE|REQ_SYNC|REQ_META);
-               bio->bi_iter.bi_sector  = ptr->offset;
-               bio->bi_end_io          = journal_write_endio;
-               bio->bi_private         = ca;
-               if (!JSET_NO_FLUSH(jset))
-                       bio->bi_opf    |= REQ_PREFLUSH|REQ_FUA;
-               bch2_bio_map(bio, jset, sectors << 9);
+       if (nr_rw_members > 1)
+               w->separate_flush = true;
 
-               trace_journal_write(bio);
-               closure_bio_submit(bio, cl);
+       if (!JSET_NO_FLUSH(jset) && w->separate_flush) {
+               for_each_rw_member(ca, c, i) {
+                       percpu_ref_get(&ca->io_ref);
 
-               ca->journal.bucket_seq[ca->journal.cur_idx] = le64_to_cpu(jset->seq);
+                       bio = ca->journal.bio;
+                       bio_reset(bio, ca->disk_sb.bdev, REQ_OP_FLUSH);
+                       bio->bi_end_io          = journal_write_endio;
+                       bio->bi_private         = ca;
+                       closure_bio_submit(bio, cl);
+               }
        }
 
-       if (!JSET_NO_FLUSH(jset)) {
-               for_each_rw_member(ca, c, i)
-                       if (journal_flushes_device(ca) &&
-                           !bch2_bkey_has_device(bkey_i_to_s_c(&w->key), i)) {
-                               percpu_ref_get(&ca->io_ref);
-
-                               bio = ca->journal.bio;
-                               bio_reset(bio, ca->disk_sb.bdev, REQ_OP_FLUSH);
-                               bio->bi_end_io          = journal_write_endio;
-                               bio->bi_private         = ca;
-                               closure_bio_submit(bio, cl);
-                       }
-       }
+       bch2_bucket_seq_cleanup(c);
+
+       continue_at(cl, do_journal_write, system_highpri_wq);
+       return;
 no_io:
        bch2_bucket_seq_cleanup(c);
 
index 150e691d531728f36fc80a6662086b6b4e2fa1ae..8ad10e46dd5dbeb56c3d9b1e3994f1ee00ed7f96 100644 (file)
@@ -31,6 +31,7 @@ struct journal_buf {
        unsigned                u64s_reserved;
        bool                    noflush;        /* write has already been kicked off, and was noflush */
        bool                    must_flush;     /* something wants a flush */
+       bool                    separate_flush;
        /* bloom filter: */
        unsigned long           has_inode[1024 / sizeof(unsigned long)];
 };