btrfs: add support for inserting raid stripe extents
authorJohannes Thumshirn <johannes.thumshirn@wdc.com>
Thu, 14 Sep 2023 16:06:58 +0000 (09:06 -0700)
committerDavid Sterba <dsterba@suse.com>
Thu, 12 Oct 2023 14:44:09 +0000 (16:44 +0200)
Add support for inserting stripe extents into the raid stripe tree on
completion of every write that needs an extra logical-to-physical
translation when using RAID.

Inserting the stripe extents happens after the data I/O has completed,
this is done to

  a) support zone-append and
  b) rule out the possibility of a RAID-write-hole.

Signed-off-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
fs/btrfs/Makefile
fs/btrfs/bio.c
fs/btrfs/extent-tree.c
fs/btrfs/inode.c
fs/btrfs/ordered-data.c
fs/btrfs/ordered-data.h
fs/btrfs/raid-stripe-tree.c [new file with mode: 0644]
fs/btrfs/raid-stripe-tree.h [new file with mode: 0644]
fs/btrfs/volumes.c
fs/btrfs/volumes.h

index c57d80729d4fe5d5e8cfe71389cc2c2f9f4d3214..525af975f61cde36f51741825fe9ab6901b3d9f4 100644 (file)
@@ -33,7 +33,7 @@ btrfs-y += super.o ctree.o extent-tree.o print-tree.o root-tree.o dir-item.o \
           uuid-tree.o props.o free-space-tree.o tree-checker.o space-info.o \
           block-rsv.o delalloc-space.o block-group.o discard.o reflink.o \
           subpage.o tree-mod-log.o extent-io-tree.o fs.o messages.o bio.o \
-          lru_cache.o
+          lru_cache.o raid-stripe-tree.o
 
 btrfs-$(CONFIG_BTRFS_FS_POSIX_ACL) += acl.o
 btrfs-$(CONFIG_BTRFS_FS_REF_VERIFY) += ref-verify.o
index 964714258f8649c3dad06320dc00027b342beb5f..ac6bc4f43ffb7c9318ef547845d58c77452b595c 100644 (file)
@@ -14,6 +14,7 @@
 #include "rcu-string.h"
 #include "zoned.h"
 #include "file-item.h"
+#include "raid-stripe-tree.h"
 
 static struct bio_set btrfs_bioset;
 static struct bio_set btrfs_clone_bioset;
@@ -415,6 +416,9 @@ static void btrfs_orig_write_end_io(struct bio *bio)
        else
                bio->bi_status = BLK_STS_OK;
 
+       if (bio_op(bio) == REQ_OP_ZONE_APPEND && !bio->bi_status)
+               stripe->physical = bio->bi_iter.bi_sector << SECTOR_SHIFT;
+
        btrfs_orig_bbio_end_io(bbio);
        btrfs_put_bioc(bioc);
 }
@@ -426,6 +430,8 @@ static void btrfs_clone_write_end_io(struct bio *bio)
        if (bio->bi_status) {
                atomic_inc(&stripe->bioc->error);
                btrfs_log_dev_io_error(bio, stripe->dev);
+       } else if (bio_op(bio) == REQ_OP_ZONE_APPEND) {
+               stripe->physical = bio->bi_iter.bi_sector << SECTOR_SHIFT;
        }
 
        /* Pass on control to the original bio this one was cloned from */
@@ -487,6 +493,7 @@ static void btrfs_submit_mirrored_bio(struct btrfs_io_context *bioc, int dev_nr)
        bio->bi_private = &bioc->stripes[dev_nr];
        bio->bi_iter.bi_sector = bioc->stripes[dev_nr].physical >> SECTOR_SHIFT;
        bioc->stripes[dev_nr].bioc = bioc;
+       bioc->size = bio->bi_iter.bi_size;
        btrfs_submit_dev_bio(bioc->stripes[dev_nr].dev, bio);
 }
 
@@ -496,6 +503,8 @@ static void __btrfs_submit_bio(struct bio *bio, struct btrfs_io_context *bioc,
        if (!bioc) {
                /* Single mirror read/write fast path. */
                btrfs_bio(bio)->mirror_num = mirror_num;
+               if (bio_op(bio) != REQ_OP_READ)
+                       btrfs_bio(bio)->orig_physical = smap->physical;
                bio->bi_iter.bi_sector = smap->physical >> SECTOR_SHIFT;
                if (bio_op(bio) != REQ_OP_READ)
                        btrfs_bio(bio)->orig_physical = smap->physical;
@@ -688,6 +697,18 @@ static bool btrfs_submit_chunk(struct btrfs_bio *bbio, int mirror_num)
                        bio->bi_opf |= REQ_OP_ZONE_APPEND;
                }
 
+               if (is_data_bbio(bbio) && bioc &&
+                   btrfs_need_stripe_tree_update(bioc->fs_info, bioc->map_type)) {
+                       /*
+                        * No locking for the list update, as we only add to
+                        * the list in the I/O submission path, and list
+                        * iteration only happens in the completion path, which
+                        * can't happen until after the last submission.
+                        */
+                       btrfs_get_bioc(bioc);
+                       list_add_tail(&bioc->rst_ordered_entry, &bbio->ordered->bioc_list);
+               }
+
                /*
                 * Csum items for reloc roots have already been cloned at this
                 * point, so they are handled as part of the no-checksum case.
index 3e58c8df146363ebaf7607a193857ac619a139a5..31aae12619b71f0e2362453172bc5f78c95494cd 100644 (file)
@@ -42,6 +42,7 @@
 #include "file-item.h"
 #include "orphan.h"
 #include "tree-checker.h"
+#include "raid-stripe-tree.h"
 
 #undef SCRAMBLE_DELAYED_REFS
 
index e02a5ba5b5332dd47e67b4951a6a6273759b717d..b5e0ed3a36f78d2f54165a0e3e80798ab5daabf2 100644 (file)
@@ -71,6 +71,7 @@
 #include "super.h"
 #include "orphan.h"
 #include "backref.h"
+#include "raid-stripe-tree.h"
 
 struct btrfs_iget_args {
        u64 ino;
@@ -3091,6 +3092,10 @@ int btrfs_finish_one_ordered(struct btrfs_ordered_extent *ordered_extent)
 
        trans->block_rsv = &inode->block_rsv;
 
+       ret = btrfs_insert_raid_extent(trans, ordered_extent);
+       if (ret)
+               goto out;
+
        if (test_bit(BTRFS_ORDERED_COMPRESSED, &ordered_extent->flags))
                compress_type = ordered_extent->compress_type;
        if (test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags)) {
@@ -3224,7 +3229,8 @@ out:
 int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered)
 {
        if (btrfs_is_zoned(btrfs_sb(ordered->inode->i_sb)) &&
-           !test_bit(BTRFS_ORDERED_IOERR, &ordered->flags))
+           !test_bit(BTRFS_ORDERED_IOERR, &ordered->flags) &&
+           list_empty(&ordered->bioc_list))
                btrfs_finish_ordered_zoned(ordered);
        return btrfs_finish_one_ordered(ordered);
 }
index 345c449d588ccbdfb4e3b7cf430bd9cd6392a87f..55c7d5543265c4df8830018b89d3bbabe835058c 100644 (file)
@@ -191,6 +191,7 @@ static struct btrfs_ordered_extent *alloc_ordered_extent(
        INIT_LIST_HEAD(&entry->log_list);
        INIT_LIST_HEAD(&entry->root_extent_list);
        INIT_LIST_HEAD(&entry->work_list);
+       INIT_LIST_HEAD(&entry->bioc_list);
        init_completion(&entry->completion);
 
        /*
index 173bd5c5df2624e567f022f3b0d4a93653cc89e3..1c51ac57e5dfdee1af8f484fb65f08c6ad9ef0fe 100644 (file)
@@ -151,6 +151,8 @@ struct btrfs_ordered_extent {
        struct completion completion;
        struct btrfs_work flush_work;
        struct list_head work_list;
+
+       struct list_head bioc_list;
 };
 
 static inline void
diff --git a/fs/btrfs/raid-stripe-tree.c b/fs/btrfs/raid-stripe-tree.c
new file mode 100644 (file)
index 0000000..c093e0b
--- /dev/null
@@ -0,0 +1,87 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (C) 2023 Western Digital Corporation or its affiliates.
+ */
+
+#include <linux/btrfs_tree.h>
+#include "ctree.h"
+#include "fs.h"
+#include "accessors.h"
+#include "transaction.h"
+#include "disk-io.h"
+#include "raid-stripe-tree.h"
+#include "volumes.h"
+#include "misc.h"
+#include "print-tree.h"
+
+static int btrfs_insert_one_raid_extent(struct btrfs_trans_handle *trans,
+                                       struct btrfs_io_context *bioc)
+{
+       struct btrfs_fs_info *fs_info = trans->fs_info;
+       struct btrfs_key stripe_key;
+       struct btrfs_root *stripe_root = fs_info->stripe_root;
+       const int num_stripes = btrfs_bg_type_to_factor(bioc->map_type);
+       u8 encoding = btrfs_bg_flags_to_raid_index(bioc->map_type);
+       struct btrfs_stripe_extent *stripe_extent;
+       const size_t item_size = struct_size(stripe_extent, strides, num_stripes);
+       int ret;
+
+       stripe_extent = kzalloc(item_size, GFP_NOFS);
+       if (!stripe_extent) {
+               btrfs_abort_transaction(trans, -ENOMEM);
+               btrfs_end_transaction(trans);
+               return -ENOMEM;
+       }
+
+       btrfs_set_stack_stripe_extent_encoding(stripe_extent, encoding);
+       for (int i = 0; i < num_stripes; i++) {
+               u64 devid = bioc->stripes[i].dev->devid;
+               u64 physical = bioc->stripes[i].physical;
+               u64 length = bioc->stripes[i].length;
+               struct btrfs_raid_stride *raid_stride = &stripe_extent->strides[i];
+
+               if (length == 0)
+                       length = bioc->size;
+
+               btrfs_set_stack_raid_stride_devid(raid_stride, devid);
+               btrfs_set_stack_raid_stride_physical(raid_stride, physical);
+       }
+
+       stripe_key.objectid = bioc->logical;
+       stripe_key.type = BTRFS_RAID_STRIPE_KEY;
+       stripe_key.offset = bioc->size;
+
+       ret = btrfs_insert_item(trans, stripe_root, &stripe_key, stripe_extent,
+                               item_size);
+       if (ret)
+               btrfs_abort_transaction(trans, ret);
+
+       kfree(stripe_extent);
+
+       return ret;
+}
+
+int btrfs_insert_raid_extent(struct btrfs_trans_handle *trans,
+                            struct btrfs_ordered_extent *ordered_extent)
+{
+       struct btrfs_io_context *bioc;
+       int ret;
+
+       if (!btrfs_fs_incompat(trans->fs_info, RAID_STRIPE_TREE))
+               return 0;
+
+       list_for_each_entry(bioc, &ordered_extent->bioc_list, rst_ordered_entry) {
+               ret = btrfs_insert_one_raid_extent(trans, bioc);
+               if (ret)
+                       return ret;
+       }
+
+       while (!list_empty(&ordered_extent->bioc_list)) {
+               bioc = list_first_entry(&ordered_extent->bioc_list,
+                                       typeof(*bioc), rst_ordered_entry);
+               list_del(&bioc->rst_ordered_entry);
+               btrfs_put_bioc(bioc);
+       }
+
+       return ret;
+}
diff --git a/fs/btrfs/raid-stripe-tree.h b/fs/btrfs/raid-stripe-tree.h
new file mode 100644 (file)
index 0000000..7a169e7
--- /dev/null
@@ -0,0 +1,35 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyright (C) 2023 Western Digital Corporation or its affiliates.
+ */
+
+#ifndef BTRFS_RAID_STRIPE_TREE_H
+#define BTRFS_RAID_STRIPE_TREE_H
+
+struct btrfs_io_context;
+struct btrfs_io_stripe;
+struct btrfs_ordered_extent;
+struct btrfs_trans_handle;
+
+int btrfs_insert_raid_extent(struct btrfs_trans_handle *trans,
+                            struct btrfs_ordered_extent *ordered_extent);
+
+static inline bool btrfs_need_stripe_tree_update(struct btrfs_fs_info *fs_info,
+                                                u64 map_type)
+{
+       u64 type = map_type & BTRFS_BLOCK_GROUP_TYPE_MASK;
+       u64 profile = map_type & BTRFS_BLOCK_GROUP_PROFILE_MASK;
+
+       if (!btrfs_fs_incompat(fs_info, RAID_STRIPE_TREE))
+               return false;
+
+       if (type != BTRFS_BLOCK_GROUP_DATA)
+               return false;
+
+       if (profile & BTRFS_BLOCK_GROUP_RAID1_MASK)
+               return true;
+
+       return false;
+}
+
+#endif
index 22977246d8178ed8ed575fe07b1ad397bbfb7424..4f01dec76f1d1e7a49b3a0d61b0a1a4696a9a626 100644 (file)
@@ -5906,6 +5906,7 @@ static int find_live_mirror(struct btrfs_fs_info *fs_info,
 }
 
 static struct btrfs_io_context *alloc_btrfs_io_context(struct btrfs_fs_info *fs_info,
+                                                      u64 logical,
                                                       u16 total_stripes)
 {
        struct btrfs_io_context *bioc;
@@ -5925,6 +5926,7 @@ static struct btrfs_io_context *alloc_btrfs_io_context(struct btrfs_fs_info *fs_
        bioc->fs_info = fs_info;
        bioc->replace_stripe_src = -1;
        bioc->full_stripe_logical = (u64)-1;
+       bioc->logical = logical;
 
        return bioc;
 }
@@ -6451,7 +6453,7 @@ int btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op,
                goto out;
        }
 
-       bioc = alloc_btrfs_io_context(fs_info, num_alloc_stripes);
+       bioc = alloc_btrfs_io_context(fs_info, logical, num_alloc_stripes);
        if (!bioc) {
                ret = -ENOMEM;
                goto out;
index 0704c64cea1df89b7c048e6f443e77fa8440566b..000c56a3d2a2ebecb403ad1c3f8b601e79783fbc 100644 (file)
@@ -390,12 +390,11 @@ struct btrfs_fs_devices {
 
 struct btrfs_io_stripe {
        struct btrfs_device *dev;
-       union {
-               /* Block mapping */
-               u64 physical;
-               /* For the endio handler */
-               struct btrfs_io_context *bioc;
-       };
+       /* Block mapping. */
+       u64 physical;
+       u64 length;
+       /* For the endio handler. */
+       struct btrfs_io_context *bioc;
 };
 
 struct btrfs_discard_stripe {
@@ -428,6 +427,11 @@ struct btrfs_io_context {
        atomic_t error;
        u16 max_errors;
 
+       u64 logical;
+       u64 size;
+       /* Raid stripe tree ordered entry. */
+       struct list_head rst_ordered_entry;
+
        /*
         * The total number of stripes, including the extra duplicated
         * stripe for replace.