btrfs: introduce a new helper to submit write bio for repair
authorQu Wenruo <wqu@suse.com>
Mon, 20 Mar 2023 02:12:49 +0000 (10:12 +0800)
committerDavid Sterba <dsterba@suse.com>
Mon, 17 Apr 2023 16:01:23 +0000 (18:01 +0200)
Both scrub and read-repair are utilizing a special repair writes that:

- Only writes back to a single device
  Even for read-repair on RAID56, we only update the corrupted data
  stripe itself, not triggering the full RMW path.

- Requires a valid @mirror_num
  For RAID56 case, only @mirror_num == 1 is valid.
  For non-RAID56 cases, we need @mirror_num to locate our stripe.

- No data csum generation needed

These two call sites still have some differences though:

- Read-repair goes plain bio
  It doesn't need a full btrfs_bio, and goes submit_bio_wait().

- New scrub repair would go btrfs_bio
  To simplify both read and write path.

So here this patch would:

- Introduce a common helper, btrfs_map_repair_block()
  Due to the single device nature, we can use an on-stack
  btrfs_io_stripe to pass device and its physical bytenr.

- Introduce a new interface, btrfs_submit_repair_bio(), for later scrub
  code
  This is for the incoming scrub code.

Signed-off-by: Qu Wenruo <wqu@suse.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
fs/btrfs/bio.c
fs/btrfs/bio.h
fs/btrfs/raid56.h
fs/btrfs/volumes.c
fs/btrfs/volumes.h

index e40d1ababa084be057d22df1895bea5dfa10a230..5379c4714905e6695a21db9338a28236b8e5a56c 100644 (file)
@@ -735,12 +735,9 @@ int btrfs_repair_io_failure(struct btrfs_fs_info *fs_info, u64 ino, u64 start,
                            u64 length, u64 logical, struct page *page,
                            unsigned int pg_offset, int mirror_num)
 {
-       struct btrfs_device *dev;
+       struct btrfs_io_stripe smap = { 0 };
        struct bio_vec bvec;
        struct bio bio;
-       u64 map_length = 0;
-       u64 sector;
-       struct btrfs_io_context *bioc = NULL;
        int ret = 0;
 
        ASSERT(!(fs_info->sb->s_flags & SB_RDONLY));
@@ -749,68 +746,38 @@ int btrfs_repair_io_failure(struct btrfs_fs_info *fs_info, u64 ino, u64 start,
        if (btrfs_repair_one_zone(fs_info, logical))
                return 0;
 
-       map_length = length;
-
        /*
         * Avoid races with device replace and make sure our bioc has devices
         * associated to its stripes that don't go away while we are doing the
         * read repair operation.
         */
        btrfs_bio_counter_inc_blocked(fs_info);
-       if (btrfs_is_parity_mirror(fs_info, logical, length)) {
-               /*
-                * Note that we don't use BTRFS_MAP_WRITE because it's supposed
-                * to update all raid stripes, but here we just want to correct
-                * bad stripe, thus BTRFS_MAP_READ is abused to only get the bad
-                * stripe's dev and sector.
-                */
-               ret = btrfs_map_block(fs_info, BTRFS_MAP_READ, logical,
-                                     &map_length, &bioc, 0);
-               if (ret)
-                       goto out_counter_dec;
-               ASSERT(bioc->mirror_num == 1);
-       } else {
-               ret = btrfs_map_block(fs_info, BTRFS_MAP_WRITE, logical,
-                                     &map_length, &bioc, mirror_num);
-               if (ret)
-                       goto out_counter_dec;
-               /*
-                * This happens when dev-replace is also running, and the
-                * mirror_num indicates the dev-replace target.
-                *
-                * In this case, we don't need to do anything, as the read
-                * error just means the replace progress hasn't reached our
-                * read range, and later replace routine would handle it well.
-                */
-               if (mirror_num != bioc->mirror_num)
-                       goto out_counter_dec;
-       }
-
-       sector = bioc->stripes[bioc->mirror_num - 1].physical >> 9;
-       dev = bioc->stripes[bioc->mirror_num - 1].dev;
-       btrfs_put_bioc(bioc);
+       ret = btrfs_map_repair_block(fs_info, &smap, logical, length, mirror_num);
+       if (ret < 0)
+               goto out_counter_dec;
 
-       if (!dev || !dev->bdev ||
-           !test_bit(BTRFS_DEV_STATE_WRITEABLE, &dev->dev_state)) {
+       if (!smap.dev->bdev ||
+           !test_bit(BTRFS_DEV_STATE_WRITEABLE, &smap.dev->dev_state)) {
                ret = -EIO;
                goto out_counter_dec;
        }
 
-       bio_init(&bio, dev->bdev, &bvec, 1, REQ_OP_WRITE | REQ_SYNC);
-       bio.bi_iter.bi_sector = sector;
+       bio_init(&bio, smap.dev->bdev, &bvec, 1, REQ_OP_WRITE | REQ_SYNC);
+       bio.bi_iter.bi_sector = smap.physical >> SECTOR_SHIFT;
        __bio_add_page(&bio, page, length, pg_offset);
 
        btrfsic_check_bio(&bio);
        ret = submit_bio_wait(&bio);
        if (ret) {
                /* try to remap that extent elsewhere? */
-               btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_WRITE_ERRS);
+               btrfs_dev_stat_inc_and_print(smap.dev, BTRFS_DEV_STAT_WRITE_ERRS);
                goto out_bio_uninit;
        }
 
        btrfs_info_rl_in_rcu(fs_info,
                "read error corrected: ino %llu off %llu (dev %s sector %llu)",
-                            ino, start, btrfs_dev_name(dev), sector);
+                            ino, start, btrfs_dev_name(smap.dev),
+                            smap.physical >> SECTOR_SHIFT);
        ret = 0;
 
 out_bio_uninit:
@@ -820,6 +787,45 @@ out_counter_dec:
        return ret;
 }
 
+/*
+ * Submit a btrfs_bio based repair write.
+ *
+ * If @dev_replace is true, the write would be submitted to dev-replace target.
+ */
+void btrfs_submit_repair_write(struct btrfs_bio *bbio, int mirror_num, bool dev_replace)
+{
+       struct btrfs_fs_info *fs_info = bbio->fs_info;
+       u64 logical = bbio->bio.bi_iter.bi_sector << SECTOR_SHIFT;
+       u64 length = bbio->bio.bi_iter.bi_size;
+       struct btrfs_io_stripe smap = { 0 };
+       int ret;
+
+       ASSERT(fs_info);
+       ASSERT(mirror_num > 0);
+       ASSERT(btrfs_op(&bbio->bio) == BTRFS_MAP_WRITE);
+       ASSERT(!bbio->inode);
+
+       btrfs_bio_counter_inc_blocked(fs_info);
+       ret = btrfs_map_repair_block(fs_info, &smap, logical, length, mirror_num);
+       if (ret < 0)
+               goto fail;
+
+       if (dev_replace) {
+               if (btrfs_op(&bbio->bio) == BTRFS_MAP_WRITE && btrfs_is_zoned(fs_info)) {
+                       bbio->bio.bi_opf &= ~REQ_OP_WRITE;
+                       bbio->bio.bi_opf |= REQ_OP_ZONE_APPEND;
+               }
+               ASSERT(smap.dev == fs_info->dev_replace.srcdev);
+               smap.dev = fs_info->dev_replace.tgtdev;
+       }
+       __btrfs_submit_bio(&bbio->bio, NULL, &smap, mirror_num);
+       return;
+
+fail:
+       btrfs_bio_counter_dec(fs_info);
+       btrfs_bio_end_io(bbio, errno_to_blk_status(ret));
+}
+
 int __init btrfs_bioset_init(void)
 {
        if (bioset_init(&btrfs_bioset, BIO_POOL_SIZE,
index 51b4f3d93f040de3616e581e7082566c3afde160..a8eca3a6567320c68103eb1288bfa8d4e64d87ee 100644 (file)
@@ -98,6 +98,7 @@ static inline void btrfs_bio_end_io(struct btrfs_bio *bbio, blk_status_t status)
 #define REQ_BTRFS_CGROUP_PUNT                  REQ_FS_PRIVATE
 
 void btrfs_submit_bio(struct btrfs_bio *bbio, int mirror_num);
+void btrfs_submit_repair_write(struct btrfs_bio *bbio, int mirror_num, bool dev_replace);
 int btrfs_repair_io_failure(struct btrfs_fs_info *fs_info, u64 ino, u64 start,
                            u64 length, u64 logical, struct page *page,
                            unsigned int pg_offset, int mirror_num);
index df0e0abdeb1f0a0489c460acef38f70e2789bf5e..6583c225b1bdbaebcd65455370316f4bac16d449 100644 (file)
@@ -170,6 +170,11 @@ static inline int nr_data_stripes(const struct map_lookup *map)
        return map->num_stripes - btrfs_nr_parity_stripes(map->type);
 }
 
+static inline int nr_bioc_data_stripes(const struct btrfs_io_context *bioc)
+{
+       return bioc->num_stripes - btrfs_nr_parity_stripes(bioc->map_type);
+}
+
 #define RAID5_P_STRIPE ((u64)-2)
 #define RAID6_Q_STRIPE ((u64)-1)
 
index c201d72f798e46d9bcb17f740791f21c217a2e8d..db6e15205be56de40a462a40ad370beaaacb4753 100644 (file)
@@ -8019,3 +8019,76 @@ bool btrfs_repair_one_zone(struct btrfs_fs_info *fs_info, u64 logical)
 
        return true;
 }
+
+static void map_raid56_repair_block(struct btrfs_io_context *bioc,
+                                   struct btrfs_io_stripe *smap,
+                                   u64 logical)
+{
+       int data_stripes = nr_bioc_data_stripes(bioc);
+       int i;
+
+       for (i = 0; i < data_stripes; i++) {
+               u64 stripe_start = bioc->full_stripe_logical +
+                                  (i << BTRFS_STRIPE_LEN_SHIFT);
+
+               if (logical >= stripe_start &&
+                   logical < stripe_start + BTRFS_STRIPE_LEN)
+                       break;
+       }
+       ASSERT(i < data_stripes);
+       smap->dev = bioc->stripes[i].dev;
+       smap->physical = bioc->stripes[i].physical +
+                       ((logical - bioc->full_stripe_logical) &
+                        BTRFS_STRIPE_LEN_MASK);
+}
+
+/*
+ * Map a repair write into a single device.
+ *
+ * A repair write is triggered by read time repair or scrub, which would only
+ * update the contents of a single device.
+ * Not update any other mirrors nor go through RMW path.
+ *
+ * Callers should ensure:
+ *
+ * - Call btrfs_bio_counter_inc_blocked() first
+ * - The range does not cross stripe boundary
+ * - Has a valid @mirror_num passed in.
+ */
+int btrfs_map_repair_block(struct btrfs_fs_info *fs_info,
+                          struct btrfs_io_stripe *smap, u64 logical,
+                          u32 length, int mirror_num)
+{
+       struct btrfs_io_context *bioc = NULL;
+       u64 map_length = length;
+       int mirror_ret = mirror_num;
+       int ret;
+
+       ASSERT(mirror_num > 0);
+
+       ret = __btrfs_map_block(fs_info, BTRFS_MAP_WRITE, logical, &map_length,
+                               &bioc, smap, &mirror_ret, true);
+       if (ret < 0)
+               return ret;
+
+       /* The map range should not cross stripe boundary. */
+       ASSERT(map_length >= length);
+
+       /* Already mapped to single stripe. */
+       if (!bioc)
+               goto out;
+
+       /* Map the RAID56 multi-stripe writes to a single one. */
+       if (bioc->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
+               map_raid56_repair_block(bioc, smap, logical);
+               goto out;
+       }
+
+       ASSERT(mirror_num <= bioc->num_stripes);
+       smap->dev = bioc->stripes[mirror_num - 1].dev;
+       smap->physical = bioc->stripes[mirror_num - 1].physical;
+out:
+       btrfs_put_bioc(bioc);
+       ASSERT(smap->dev);
+       return 0;
+}
index 650e131d079e477017a203958199923e278d0d2f..bf47a1a70813babe2c39c0de3f3e0ebc559f224d 100644 (file)
@@ -587,6 +587,9 @@ int __btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op,
                      struct btrfs_io_context **bioc_ret,
                      struct btrfs_io_stripe *smap, int *mirror_num_ret,
                      int need_raid_map);
+int btrfs_map_repair_block(struct btrfs_fs_info *fs_info,
+                          struct btrfs_io_stripe *smap, u64 logical,
+                          u32 length, int mirror_num);
 struct btrfs_discard_stripe *btrfs_map_discard(struct btrfs_fs_info *fs_info,
                                               u64 logical, u64 *length_ret,
                                               u32 *num_stripes);