btrfs: scrub: introduce error reporting functionality for scrub_stripe
author Qu Wenruo <wqu@suse.com>
Mon, 20 Mar 2023 02:12:56 +0000 (10:12 +0800)
committer David Sterba <dsterba@suse.com>
Mon, 17 Apr 2023 16:01:24 +0000 (18:01 +0200)
The new helper, scrub_stripe_report_errors(), will report the result of
the scrub to the system log.

The main reporting is done by introducing a new helper,
scrub_print_common_warning(), whose content is mostly the same as
scrub_print_warning(), but without the need for a scrub_block.

Since we're reporting the errors, it's the perfect time to update the
scrub stats too.

Signed-off-by: Qu Wenruo <wqu@suse.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
fs/btrfs/scrub.c

index b2b1909862a86c3bc7ff4e22c889291c1cf8b8c8..5c21e25c83f48b4bee476d8634482e1d31bc037f 100644 (file)
@@ -105,6 +105,7 @@ enum scrub_stripe_flags {
  * Represent one contiguous range with a length of BTRFS_STRIPE_LEN.
  */
 struct scrub_stripe {
+       struct scrub_ctx *sctx;
        struct btrfs_block_group *bg;
 
        struct page *pages[SCRUB_STRIPE_PAGES];
@@ -119,6 +120,13 @@ struct scrub_stripe {
        /* Should be BTRFS_STRIPE_LEN / sectorsize. */
        u16 nr_sectors;
 
+       /*
+        * How many data/meta extents are in this stripe.  Only for scrub status
+        * reporting purposes.
+        */
+       u16 nr_data_extents;
+       u16 nr_meta_extents;
+
        atomic_t pending_io;
        wait_queue_head_t io_wait;
        wait_queue_head_t repair_wait;
@@ -377,6 +385,7 @@ static void release_scrub_stripe(struct scrub_stripe *stripe)
        kfree(stripe->csums);
        stripe->sectors = NULL;
        stripe->csums = NULL;
+       stripe->sctx = NULL;
        stripe->state = 0;
 }
 
@@ -1046,10 +1055,10 @@ err:
        return 0;
 }
 
-static void scrub_print_warning(const char *errstr, struct scrub_block *sblock)
+static void scrub_print_common_warning(const char *errstr, struct btrfs_device *dev,
+                                      bool is_super, u64 logical, u64 physical)
 {
-       struct btrfs_device *dev;
-       struct btrfs_fs_info *fs_info;
+       struct btrfs_fs_info *fs_info = dev->fs_info;
        struct btrfs_path *path;
        struct btrfs_key found_key;
        struct extent_buffer *eb;
@@ -1062,22 +1071,18 @@ static void scrub_print_warning(const char *errstr, struct scrub_block *sblock)
        u8 ref_level = 0;
        int ret;
 
-       WARN_ON(sblock->sector_count < 1);
-       dev = sblock->dev;
-       fs_info = sblock->sctx->fs_info;
-
        /* Super block error, no need to search extent tree. */
-       if (sblock->sectors[0]->flags & BTRFS_EXTENT_FLAG_SUPER) {
+       if (is_super) {
                btrfs_warn_in_rcu(fs_info, "%s on device %s, physical %llu",
-                       errstr, btrfs_dev_name(dev), sblock->physical);
+                                 errstr, btrfs_dev_name(dev), physical);
                return;
        }
        path = btrfs_alloc_path();
        if (!path)
                return;
 
-       swarn.physical = sblock->physical;
-       swarn.logical = sblock->logical;
+       swarn.physical = physical;
+       swarn.logical = logical;
        swarn.errstr = errstr;
        swarn.dev = NULL;
 
@@ -1126,6 +1131,13 @@ out:
        btrfs_free_path(path);
 }
 
+static void scrub_print_warning(const char *errstr, struct scrub_block *sblock)
+{
+       scrub_print_common_warning(errstr, sblock->dev,
+                       sblock->sectors[0]->flags & BTRFS_EXTENT_FLAG_SUPER,
+                       sblock->logical, sblock->physical);
+}
+
 static inline void scrub_get_recover(struct scrub_recover *recover)
 {
        refcount_inc(&recover->refs);
@@ -2453,6 +2465,131 @@ static void scrub_stripe_submit_repair_read(struct scrub_stripe *stripe,
        }
 }
 
+static void scrub_stripe_report_errors(struct scrub_ctx *sctx,
+                                      struct scrub_stripe *stripe)
+{
+       static DEFINE_RATELIMIT_STATE(rs, DEFAULT_RATELIMIT_INTERVAL,
+                                     DEFAULT_RATELIMIT_BURST);
+       struct btrfs_fs_info *fs_info = sctx->fs_info;
+       struct btrfs_device *dev = NULL;
+       u64 physical = 0;
+       int nr_data_sectors = 0;
+       int nr_meta_sectors = 0;
+       int nr_nodatacsum_sectors = 0;
+       int nr_repaired_sectors = 0;
+       int sector_nr;
+
+       /*
+        * Initialize the information needed for error reporting.
+        *
+        * Although our scrub_stripe infrastructure is mostly based on btrfs_submit_bio()
+        * and thus has no need for dev/physical, error reporting still needs them.
+        */
+       if (!bitmap_empty(&stripe->init_error_bitmap, stripe->nr_sectors)) {
+               u64 mapped_len = fs_info->sectorsize;
+               struct btrfs_io_context *bioc = NULL;
+               int stripe_index = stripe->mirror_num - 1;
+               int ret;
+
+               /* For scrub, our mirror_num should always start at 1. */
+               ASSERT(stripe->mirror_num >= 1);
+               ret = btrfs_map_sblock(fs_info, BTRFS_MAP_GET_READ_MIRRORS,
+                                      stripe->logical, &mapped_len, &bioc);
+               /*
+                * If we failed, dev will be NULL, and later detailed reports
+                * will just be skipped.
+                */
+               if (ret < 0)
+                       goto skip;
+               physical = bioc->stripes[stripe_index].physical;
+               dev = bioc->stripes[stripe_index].dev;
+               btrfs_put_bioc(bioc);
+       }
+
+skip:
+       for_each_set_bit(sector_nr, &stripe->extent_sector_bitmap, stripe->nr_sectors) {
+               bool repaired = false;
+
+               if (stripe->sectors[sector_nr].is_metadata) {
+                       nr_meta_sectors++;
+               } else {
+                       nr_data_sectors++;
+                       if (!stripe->sectors[sector_nr].csum)
+                               nr_nodatacsum_sectors++;
+               }
+
+               if (test_bit(sector_nr, &stripe->init_error_bitmap) &&
+                   !test_bit(sector_nr, &stripe->error_bitmap)) {
+                       nr_repaired_sectors++;
+                       repaired = true;
+               }
+
+               /* Good sector from the beginning, nothing needs to be done. */
+               if (!test_bit(sector_nr, &stripe->init_error_bitmap))
+                       continue;
+
+               /*
+                * Report errors for the corrupted sectors.  If repaired, just
+                * output the "fixed up" message.
+                */
+               if (repaired) {
+                       if (dev) {
+                               btrfs_err_rl_in_rcu(fs_info,
+                       "fixed up error at logical %llu on dev %s physical %llu",
+                                           stripe->logical, btrfs_dev_name(dev),
+                                           physical);
+                       } else {
+                               btrfs_err_rl_in_rcu(fs_info,
+                       "fixed up error at logical %llu on mirror %u",
+                                           stripe->logical, stripe->mirror_num);
+                       }
+                       continue;
+               }
+
+               /* The remaining cases are all unrepaired sectors. */
+               if (dev) {
+                       btrfs_err_rl_in_rcu(fs_info,
+       "unable to fixup (regular) error at logical %llu on dev %s physical %llu",
+                                           stripe->logical, btrfs_dev_name(dev),
+                                           physical);
+               } else {
+                       btrfs_err_rl_in_rcu(fs_info,
+       "unable to fixup (regular) error at logical %llu on mirror %u",
+                                           stripe->logical, stripe->mirror_num);
+               }
+
+               if (test_bit(sector_nr, &stripe->io_error_bitmap))
+                       if (__ratelimit(&rs) && dev)
+                               scrub_print_common_warning("i/o error", dev, false,
+                                                    stripe->logical, physical);
+               if (test_bit(sector_nr, &stripe->csum_error_bitmap))
+                       if (__ratelimit(&rs) && dev)
+                               scrub_print_common_warning("checksum error", dev, false,
+                                                    stripe->logical, physical);
+               if (test_bit(sector_nr, &stripe->meta_error_bitmap))
+                       if (__ratelimit(&rs) && dev)
+                               scrub_print_common_warning("header error", dev, false,
+                                                    stripe->logical, physical);
+       }
+
+       spin_lock(&sctx->stat_lock);
+       sctx->stat.data_extents_scrubbed += stripe->nr_data_extents;
+       sctx->stat.tree_extents_scrubbed += stripe->nr_meta_extents;
+       sctx->stat.data_bytes_scrubbed += nr_data_sectors << fs_info->sectorsize_bits;
+       sctx->stat.tree_bytes_scrubbed += nr_meta_sectors << fs_info->sectorsize_bits;
+       sctx->stat.no_csum += nr_nodatacsum_sectors;
+       sctx->stat.read_errors +=
+               bitmap_weight(&stripe->io_error_bitmap, stripe->nr_sectors);
+       sctx->stat.csum_errors +=
+               bitmap_weight(&stripe->csum_error_bitmap, stripe->nr_sectors);
+       sctx->stat.verify_errors +=
+               bitmap_weight(&stripe->meta_error_bitmap, stripe->nr_sectors);
+       sctx->stat.uncorrectable_errors +=
+               bitmap_weight(&stripe->error_bitmap, stripe->nr_sectors);
+       sctx->stat.corrected_errors += nr_repaired_sectors;
+       spin_unlock(&sctx->stat_lock);
+}
+
 /*
  * The main entrance for all read related scrub work, including:
  *
@@ -2526,6 +2663,7 @@ static void scrub_stripe_read_repair_worker(struct work_struct *work)
                        goto out;
        }
 out:
+       scrub_stripe_report_errors(stripe->sctx, stripe);
        set_bit(SCRUB_STRIPE_FLAG_REPAIR_DONE, &stripe->state);
        wake_up(&stripe->repair_wait);
 }
@@ -4189,6 +4327,10 @@ int scrub_find_fill_first_stripe(struct btrfs_block_group *bg,
        if (ret)
                goto out;
        get_extent_info(&path, &extent_start, &extent_len, &extent_flags, &extent_gen);
+       if (extent_flags & BTRFS_EXTENT_FLAG_TREE_BLOCK)
+               stripe->nr_meta_extents++;
+       if (extent_flags & BTRFS_EXTENT_FLAG_DATA)
+               stripe->nr_data_extents++;
        cur_logical = max(extent_start, cur_logical);
 
        /*
@@ -4222,6 +4364,10 @@ int scrub_find_fill_first_stripe(struct btrfs_block_group *bg,
                }
                get_extent_info(&path, &extent_start, &extent_len,
                                &extent_flags, &extent_gen);
+               if (extent_flags & BTRFS_EXTENT_FLAG_TREE_BLOCK)
+                       stripe->nr_meta_extents++;
+               if (extent_flags & BTRFS_EXTENT_FLAG_DATA)
+                       stripe->nr_data_extents++;
                fill_one_extent_info(fs_info, stripe, extent_start, extent_len,
                                     extent_flags, extent_gen);
                cur_logical = extent_start + extent_len;