bcachefs: Improvements to the journal read error paths
authorKent Overstreet <kent.overstreet@gmail.com>
Mon, 24 Aug 2020 19:58:26 +0000 (15:58 -0400)
committerKent Overstreet <kent.overstreet@linux.dev>
Sun, 22 Oct 2023 21:08:44 +0000 (17:08 -0400)
 - Print out more information in error messages
 - On checksum error, keep the journal entry but mark it bad so that we
   can prefer entries from other devices that don't have bad checksums

Signed-off-by: Kent Overstreet <kent.overstreet@gmail.com>
Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
fs/bcachefs/journal_io.c
fs/bcachefs/journal_io.h

index 9df8dd75f4ec6eb0633b78e3996ffeb0c7980c36..80c833f1390bb1bee93275bf11fb76f627754a36 100644 (file)
@@ -28,9 +28,11 @@ struct journal_list {
  * be replayed:
  */
 static int journal_entry_add(struct bch_fs *c, struct bch_dev *ca,
-                            struct journal_list *jlist, struct jset *j)
+                            struct journal_list *jlist, struct jset *j,
+                            bool bad)
 {
        struct journal_replay *i, *pos;
+       struct bch_devs_list devs = { .nr = 0 };
        struct list_head *where;
        size_t bytes = vstruct_bytes(j);
        __le64 last_seq;
@@ -59,8 +61,31 @@ static int journal_entry_add(struct bch_fs *c, struct bch_dev *ca,
        }
 
        list_for_each_entry_reverse(i, jlist->head, list) {
-               /* Duplicate? */
-               if (le64_to_cpu(j->seq) == le64_to_cpu(i->j.seq)) {
+               if (le64_to_cpu(j->seq) > le64_to_cpu(i->j.seq)) {
+                       where = &i->list;
+                       goto add;
+               }
+       }
+
+       where = jlist->head;
+add:
+       i = where->next != jlist->head
+               ? container_of(where->next, struct journal_replay, list)
+               : NULL;
+
+       /*
+        * Duplicate journal entries? If so we want the one that didn't have a
+        * checksum error:
+        */
+       if (i && le64_to_cpu(j->seq) == le64_to_cpu(i->j.seq)) {
+               if (i->bad) {
+                       devs = i->devs;
+                       list_del(&i->list);
+                       kvpfree(i, offsetof(struct journal_replay, j) +
+                               vstruct_bytes(&i->j));
+               } else if (bad) {
+                       goto found;
+               } else {
                        fsck_err_on(bytes != vstruct_bytes(&i->j) ||
                                    memcmp(j, &i->j, bytes), c,
                                    "found duplicate but non identical journal entries (seq %llu)",
@@ -68,14 +93,8 @@ static int journal_entry_add(struct bch_fs *c, struct bch_dev *ca,
                        goto found;
                }
 
-               if (le64_to_cpu(j->seq) > le64_to_cpu(i->j.seq)) {
-                       where = &i->list;
-                       goto add;
-               }
        }
 
-       where = jlist->head;
-add:
        i = kvpmalloc(offsetof(struct journal_replay, j) + bytes, GFP_KERNEL);
        if (!i) {
                ret = -ENOMEM;
@@ -83,7 +102,8 @@ add:
        }
 
        list_add(&i->list, where);
-       i->devs.nr = 0;
+       i->devs = devs;
+       i->bad  = bad;
        unsafe_memcpy(&i->j, j, bytes, "embedded variable length struct");
 found:
        if (!bch2_dev_list_has_dev(i->devs, ca->dev_idx))
@@ -390,6 +410,7 @@ fsck_err:
 }
 
 static int jset_validate(struct bch_fs *c,
+                        struct bch_dev *ca,
                         struct jset *jset, u64 sector,
                         unsigned bucket_sectors_left,
                         unsigned sectors_read,
@@ -404,16 +425,19 @@ static int jset_validate(struct bch_fs *c,
                return JOURNAL_ENTRY_NONE;
 
        version = le32_to_cpu(jset->version);
-       if ((version != BCH_JSET_VERSION_OLD &&
-            version < bcachefs_metadata_version_min) ||
-           version >= bcachefs_metadata_version_max) {
-               bch_err(c, "unknown journal entry version %u", jset->version);
-               return BCH_FSCK_UNKNOWN_VERSION;
+       if (journal_entry_err_on((version != BCH_JSET_VERSION_OLD &&
+                                 version < bcachefs_metadata_version_min) ||
+                                version >= bcachefs_metadata_version_max, c,
+                       "%s sector %llu seq %llu: unknown journal entry version %u",
+                       ca->name, sector, le64_to_cpu(jset->seq),
+                       version)) {
+               /* XXX: note we might have missing journal entries */
+               return JOURNAL_ENTRY_BAD;
        }
 
        if (journal_entry_err_on(bytes > bucket_sectors_left << 9, c,
-                                "journal entry too big (%zu bytes), sector %lluu",
-                                bytes, sector)) {
+                       "%s sector %llu seq %llu: journal entry too big (%zu bytes)",
+                       ca->name, sector, le64_to_cpu(jset->seq), bytes)) {
                /* XXX: note we might have missing journal entries */
                return JOURNAL_ENTRY_BAD;
        }
@@ -422,13 +446,15 @@ static int jset_validate(struct bch_fs *c,
                return JOURNAL_ENTRY_REREAD;
 
        if (fsck_err_on(!bch2_checksum_type_valid(c, JSET_CSUM_TYPE(jset)), c,
-                       "journal entry with unknown csum type %llu sector %lluu",
-                       JSET_CSUM_TYPE(jset), sector))
+                       "%s sector %llu seq %llu: journal entry with unknown csum type %llu",
+                       ca->name, sector, le64_to_cpu(jset->seq),
+                       JSET_CSUM_TYPE(jset)))
                return JOURNAL_ENTRY_BAD;
 
        csum = csum_vstruct(c, JSET_CSUM_TYPE(jset), journal_nonce(jset), jset);
        if (journal_entry_err_on(bch2_crc_cmp(csum, jset->csum), c,
-                                "journal checksum bad, sector %llu", sector)) {
+                                "%s sector %llu seq %llu: journal checksum bad",
+                                ca->name, sector, le64_to_cpu(jset->seq))) {
                /* XXX: retry IO, when we start retrying checksum errors */
                /* XXX: note we might have missing journal entries */
                return JOURNAL_ENTRY_BAD;
@@ -439,8 +465,10 @@ static int jset_validate(struct bch_fs *c,
                     vstruct_end(jset) - (void *) jset->encrypted_start);
 
        if (journal_entry_err_on(le64_to_cpu(jset->last_seq) > le64_to_cpu(jset->seq), c,
-                                "invalid journal entry: last_seq > seq"))
+                                "invalid journal entry: last_seq > seq")) {
                jset->last_seq = jset->seq;
+               return JOURNAL_ENTRY_BAD;
+       }
 
        return 0;
 fsck_err:
@@ -515,11 +543,12 @@ reread:
                        j = buf->data;
                }
 
-               ret = jset_validate(c, j, offset,
+               ret = jset_validate(c, ca, j, offset,
                                    end - offset, sectors_read,
                                    READ);
                switch (ret) {
                case BCH_FSCK_OK:
+                       sectors = vstruct_sectors(j, c->block_bits);
                        break;
                case JOURNAL_ENTRY_REREAD:
                        if (vstruct_bytes(j) > buf->size) {
@@ -536,8 +565,13 @@ reread:
                        goto next_block;
                case JOURNAL_ENTRY_BAD:
                        saw_bad = true;
+                       /*
+                        * On checksum error we don't really trust the size
+                        * field of the journal entry we read, so try reading
+                        * again at next block boundary:
+                        */
                        sectors = c->opts.block_size;
-                       goto next_block;
+                       break;
                default:
                        return ret;
                }
@@ -554,7 +588,7 @@ reread:
                ja->bucket_seq[bucket] = le64_to_cpu(j->seq);
 
                mutex_lock(&jlist->lock);
-               ret = journal_entry_add(c, ca, jlist, j);
+               ret = journal_entry_add(c, ca, jlist, j, ret != 0);
                mutex_unlock(&jlist->lock);
 
                switch (ret) {
@@ -565,8 +599,6 @@ reread:
                default:
                        return ret;
                }
-
-               sectors = vstruct_sectors(j, c->block_bits);
 next_block:
                pr_debug("next");
                offset          += sectors;
index 72e575f360afca614bb41178fc86f6558dd3e6a4..6958ee0f8cf23da1ab5a9c0588fedb3d8679678c 100644 (file)
@@ -9,6 +9,8 @@
 struct journal_replay {
        struct list_head        list;
        struct bch_devs_list    devs;
+       /* checksum error, but we may want to try using it anyways: */
+       bool                    bad;
        /* must be last: */
        struct jset             j;
 };