block: Introduce REQ_OP_ZONE_APPEND

author Keith Busch <kbusch@kernel.org>

Tue, 12 May 2020 08:55:47 +0000 (17:55 +0900)

committer Jens Axboe <axboe@kernel.dk>

Wed, 13 May 2020 02:36:28 +0000 (20:36 -0600)
author Keith Busch <kbusch@kernel.org>
Tue, 12 May 2020 08:55:47 +0000 (17:55 +0900)
committer Jens Axboe <axboe@kernel.dk>
Wed, 13 May 2020 02:36:28 +0000 (20:36 -0600)
diff --git a/block/bio.c b/block/bio.c

index aad0a6dad4f9bf8aec8c577583f1a98c10743fa7..3aa3c4ce2e5ef212f2624e8f2f099512eccad81a 100644 (file)
--- a/block/bio.c
+++ b/block/bio.c
@@ -1025,6 +1025,50 @@ static int __bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter)
         return 0;
  }
  
+/*
+ * Get the pages backing @iter and add them to the zone append @bio, passing
+ * the queue's zone append limit to bio_add_hw_page().
+ *
+ * Returns 0 on success or a negative errno. @iter is advanced only by the
+ * number of bytes that were actually added to @bio, and the references on
+ * pages that could not be added are dropped before returning.
+ */
+static int __bio_iov_append_get_pages(struct bio *bio, struct iov_iter *iter)
+{
+       unsigned short nr_pages = bio->bi_max_vecs - bio->bi_vcnt;
+       struct request_queue *q = bio->bi_disk->queue;
+       unsigned int max_append_sectors = queue_max_zone_append_sectors(q);
+       struct bio_vec *bv = bio->bi_io_vec + bio->bi_vcnt;
+       struct page **pages = (struct page **)bv;
+       ssize_t size, left, unadded;
+       unsigned len, i;
+       size_t offset;
+       int ret = 0;
+
+       /*
+        * Fail rather than return 0: bio_iov_iter_get_pages() keeps calling us
+        * for as long as we return 0 and the iterator still has data left, so
+        * reporting success without consuming anything would never terminate.
+        */
+       if (WARN_ON_ONCE(!max_append_sectors))
+               return -EINVAL;
+
+       /*
+        * Move page array up in the allocated memory for the bio vecs as far as
+        * possible so that we can start filling biovecs from the beginning
+        * without overwriting the temporary page array.
+        */
+       BUILD_BUG_ON(PAGE_PTRS_PER_BVEC < 2);
+       pages += nr_pages * (PAGE_PTRS_PER_BVEC - 1);
+
+       size = iov_iter_get_pages(iter, pages, LONG_MAX, nr_pages, &offset);
+       if (unlikely(size <= 0))
+               return size ? size : -EFAULT;
+
+       for (left = size, i = 0; left > 0; left -= len, i++) {
+               /*
+                * Read the page pointer before adding: the bio vec being
+                * filled shares memory with the temporary page array.
+                */
+               struct page *page = pages[i];
+               bool same_page = false;
+
+               len = min_t(size_t, PAGE_SIZE - offset, left);
+               if (bio_add_hw_page(q, bio, page, len, offset,
+                               max_append_sectors, &same_page) != len) {
+                       ret = -EINVAL;
+                       break;
+               }
+               if (same_page)
+                       put_page(page);
+               offset = 0;
+       }
+
+       /*
+        * A rejected page ends the loop above early with left > 0. Drop the
+        * references still held on that page and on every page after it,
+        * otherwise they would be leaked. This is a no-op on success.
+        */
+       for (unadded = left; unadded > 0; unadded -= len, i++) {
+               len = min_t(size_t, PAGE_SIZE - offset, unadded);
+               put_page(pages[i]);
+               offset = 0;
+       }
+
+       /* Consume only what made it into the bio. */
+       iov_iter_advance(iter, size - left);
+       return ret;
+}
+
  /**
   * bio_iov_iter_get_pages - add user or kernel pages to a bio
   * @bio: bio to add pages to
@@ -1054,10 +1098,16 @@ int bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter)
                 return -EINVAL;
  
         do {
-               if (is_bvec)
-                       ret = __bio_iov_bvec_add_pages(bio, iter);
-               else
-                       ret = __bio_iov_iter_get_pages(bio, iter);
+               if (bio_op(bio) == REQ_OP_ZONE_APPEND) {
+                       if (WARN_ON_ONCE(is_bvec))
+                               return -EINVAL;
+                       ret = __bio_iov_append_get_pages(bio, iter);
+               } else {
+                       if (is_bvec)
+                               ret = __bio_iov_bvec_add_pages(bio, iter);
+                       else
+                               ret = __bio_iov_iter_get_pages(bio, iter);
+               }
         } while (!ret && iov_iter_count(iter) && !bio_full(bio, 0));
  
         if (is_bvec)
@@ -1460,6 +1510,10 @@ struct bio *bio_split(struct bio *bio, int sectors,
         BUG_ON(sectors <= 0);
         BUG_ON(sectors >= bio_sectors(bio));
  
+       /* Zone append commands cannot be split */
+       if (WARN_ON_ONCE(bio_op(bio) == REQ_OP_ZONE_APPEND))
+               return NULL;
+
         split = bio_clone_fast(bio, gfp, bs);
         if (!split)
                 return NULL;
diff --git a/block/blk-core.c b/block/blk-core.c

index 409e1a6b73b05853849717e3a46fec0f2837f91f..cf5b2163edfef8ef49af61590f92e6bc9a4f5347 100644 (file)
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -135,6 +135,7 @@ static const char *const blk_op_name[] = {
         REQ_OP_NAME(ZONE_OPEN),
         REQ_OP_NAME(ZONE_CLOSE),
         REQ_OP_NAME(ZONE_FINISH),
+       REQ_OP_NAME(ZONE_APPEND),
         REQ_OP_NAME(WRITE_SAME),
         REQ_OP_NAME(WRITE_ZEROES),
         REQ_OP_NAME(SCSI_IN),
@@ -240,6 +241,17 @@ static void req_bio_endio(struct request *rq, struct bio *bio,
  
         bio_advance(bio, nbytes);
  
+       if (req_op(rq) == REQ_OP_ZONE_APPEND && error == BLK_STS_OK) {
+               /*
+                * Partial zone append completions cannot be supported as the
+                * BIO fragments may end up not being written sequentially.
+                */
+               if (bio->bi_iter.bi_size)
+                       bio->bi_status = BLK_STS_IOERR;
+               else
+                       bio->bi_iter.bi_sector = rq->__sector;
+       }
+
         /* don't actually finish bio if it's part of flush sequence */
         if (bio->bi_iter.bi_size == 0 && !(rq->rq_flags & RQF_FLUSH_SEQ))
                 bio_endio(bio);
@@ -887,6 +899,41 @@ out:
         return ret;
  }
  
+/*
+ * Validate a zone append bio against the zoned device limits of @q.
+ *
+ * Returns BLK_STS_NOTSUPP if @q is not zoned, BLK_STS_IOERR if the bio does not
+ * start a sequential zone or is too large, and BLK_STS_OK otherwise, in which
+ * case REQ_NOMERGE is also set on the bio.
+ */
+static inline blk_status_t blk_check_zone_append(struct request_queue *q,
+                                                struct bio *bio)
+{
+       sector_t start = bio->bi_iter.bi_sector;
+       int len = bio_sectors(bio);
+
+       /* Zone append is only applicable to zoned block devices */
+       if (!blk_queue_is_zoned(q))
+               return BLK_STS_NOTSUPP;
+
+       /* The target sector has to be aligned to the start of a zone ... */
+       if (start & (blk_queue_zone_sectors(q) - 1))
+               return BLK_STS_IOERR;
+
+       /* ... and that zone has to be a sequential one */
+       if (!blk_queue_zone_is_seq(q, start))
+               return BLK_STS_IOERR;
+
+       /*
+        * The bio must fit within both a zone (chunk_sectors) and the zone
+        * append limit: a larger bio would have to be split, and the pieces
+        * could end up written non-contiguously or in different zones.
+        */
+       if (len > q->limits.chunk_sectors ||
+           len > q->limits.max_zone_append_sectors)
+               return BLK_STS_IOERR;
+
+       bio->bi_opf |= REQ_NOMERGE;
+       return BLK_STS_OK;
+}
+
  static noinline_for_stack bool
  generic_make_request_checks(struct bio *bio)
  {
@@ -959,6 +1006,11 @@ generic_make_request_checks(struct bio *bio)
                 if (!q->limits.max_write_same_sectors)
                         goto not_supported;
                 break;
+       case REQ_OP_ZONE_APPEND:
+               status = blk_check_zone_append(q, bio);
+               if (status != BLK_STS_OK)
+                       goto end_io;
+               break;
         case REQ_OP_ZONE_RESET:
         case REQ_OP_ZONE_OPEN:
         case REQ_OP_ZONE_CLOSE:
diff --git a/block/blk-mq.c b/block/blk-mq.c

index d82cefb0474f21a61e3f7a912f516c0725a00197..9ee695bdf8739e224a1232f8dd0527ce8c62bdf8 100644 (file)
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -1183,6 +1183,19 @@ static void blk_mq_handle_dev_resource(struct request *rq,
         __blk_mq_requeue_request(rq);
  }
  
+/*
+ * Set aside a request that could not be dispatched to a specific zone, because
+ * of LLD level zone-write locking or another zone related resource not being
+ * available, by adding it to @zone_list so that it can be retried later.
+ */
+static void blk_mq_handle_zone_resource(struct request *rq,
+                                       struct list_head *zone_list)
+{
+       struct list_head *entry = &rq->queuelist;
+
+       list_add(entry, zone_list);
+       __blk_mq_requeue_request(rq);
+}
+
  /*
   * Returns true if we did some work AND can potentially do more.
   */
@@ -1195,6 +1208,7 @@ bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list,
         int errors, queued;
         blk_status_t ret = BLK_STS_OK;
         bool no_budget_avail = false;
+       LIST_HEAD(zone_list);
  
         if (list_empty(list))
                 return false;
@@ -1256,6 +1270,16 @@ bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list,
                 if (ret == BLK_STS_RESOURCE || ret == BLK_STS_DEV_RESOURCE) {
                         blk_mq_handle_dev_resource(rq, list);
                         break;
+               } else if (ret == BLK_STS_ZONE_RESOURCE) {
+                       /*
+                        * Move the request to zone_list and keep going through
+                        * the dispatch list to find more requests the drive can
+                        * accept.
+                        */
+                       blk_mq_handle_zone_resource(rq, &zone_list);
+                       if (list_empty(list))
+                               break;
+                       continue;
                 }
  
                 if (unlikely(ret != BLK_STS_OK)) {
@@ -1267,6 +1291,9 @@ bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list,
                 queued++;
         } while (!list_empty(list));
  
+       if (!list_empty(&zone_list))
+               list_splice_tail_init(&zone_list, list);
+
         hctx->dispatched[queued_to_index(queued)]++;
  
         /*
diff --git a/block/blk-settings.c b/block/blk-settings.c

index 2ab1967b971610418573ece91ffa4dd3478a4139..9a2c23cd97007355154316c40a92297af2e94acc 100644 (file)
--- a/block/blk-settings.c
+++ b/block/blk-settings.c
@@ -48,6 +48,7 @@ void blk_set_default_limits(struct queue_limits *lim)
         lim->chunk_sectors = 0;
         lim->max_write_same_sectors = 0;
         lim->max_write_zeroes_sectors = 0;
+       lim->max_zone_append_sectors = 0;
         lim->max_discard_sectors = 0;
         lim->max_hw_discard_sectors = 0;
         lim->discard_granularity = 0;
@@ -83,6 +84,7 @@ void blk_set_stacking_limits(struct queue_limits *lim)
         lim->max_dev_sectors = UINT_MAX;
         lim->max_write_same_sectors = UINT_MAX;
         lim->max_write_zeroes_sectors = UINT_MAX;
+       lim->max_zone_append_sectors = UINT_MAX;
  }
  EXPORT_SYMBOL(blk_set_stacking_limits);
  
@@ -221,6 +223,33 @@ void blk_queue_max_write_zeroes_sectors(struct request_queue *q,
  }
  EXPORT_SYMBOL(blk_queue_max_write_zeroes_sectors);
  
+/**
+ * blk_queue_max_zone_append_sectors - set max sectors for a single zone append
+ * @q:  the request queue for the device
+ * @max_zone_append_sectors: maximum number of sectors to write per command
+ *
+ * The stored limit is capped by the queue's max_hw_sectors and chunk_sectors
+ * limits. Warns and leaves the limit untouched if @q is not zoned.
+ **/
+void blk_queue_max_zone_append_sectors(struct request_queue *q,
+               unsigned int max_zone_append_sectors)
+{
+       struct queue_limits *lim = &q->limits;
+       unsigned int capped = max_zone_append_sectors;
+
+       if (WARN_ON(!blk_queue_is_zoned(q)))
+               return;
+
+       if (capped > lim->max_hw_sectors)
+               capped = lim->max_hw_sectors;
+       if (capped > lim->chunk_sectors)
+               capped = lim->chunk_sectors;
+
+       /*
+        * A resulting limit of 0 signals a possible driver bug: a 0 argument,
+        * the chunk_sectors limit (zone size) not set, or the max_hw_sectors
+        * limit not set.
+        */
+       WARN_ON(!capped);
+
+       lim->max_zone_append_sectors = capped;
+}
+EXPORT_SYMBOL_GPL(blk_queue_max_zone_append_sectors);
+
  /**
   * blk_queue_max_segments - set max hw segments for a request for this queue
   * @q:  the request queue for the device
@@ -470,6 +499,8 @@ int blk_stack_limits(struct queue_limits *t, struct queue_limits *b,
                                         b->max_write_same_sectors);
         t->max_write_zeroes_sectors = min(t->max_write_zeroes_sectors,
                                         b->max_write_zeroes_sectors);
+       t->max_zone_append_sectors = min(t->max_zone_append_sectors,
+                                       b->max_zone_append_sectors);
         t->bounce_pfn = min_not_zero(t->bounce_pfn, b->bounce_pfn);
  
         t->seg_boundary_mask = min_not_zero(t->seg_boundary_mask,
diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c

index fca9b158f4a097da276d9a6e28945be2ce63312c..02643e149d5e1ff3b4aca26ca181863ae56eccd4 100644 (file)
--- a/block/blk-sysfs.c
+++ b/block/blk-sysfs.c
@@ -218,6 +218,13 @@ static ssize_t queue_write_zeroes_max_show(struct request_queue *q, char *page)
                 (unsigned long long)q->limits.max_write_zeroes_sectors << 9);
  }
  
+/* Print the zone append limit of @q, converted from sectors to bytes. */
+static ssize_t queue_zone_append_max_show(struct request_queue *q, char *page)
+{
+       return sprintf(page, "%llu\n",
+               (unsigned long long)q->limits.max_zone_append_sectors <<
+                       SECTOR_SHIFT);
+}
+
  static ssize_t
  queue_max_sectors_store(struct request_queue *q, const char *page, size_t count)
  {
@@ -639,6 +646,11 @@ static struct queue_sysfs_entry queue_write_zeroes_max_entry = {
         .show = queue_write_zeroes_max_show,
  };
  
+/* Read-only "zone_append_max_bytes" queue attribute. */
+static struct queue_sysfs_entry queue_zone_append_max_entry = {
+       .attr = {
+               .name = "zone_append_max_bytes",
+               .mode = 0444,
+       },
+       .show = queue_zone_append_max_show,
+};
+
  static struct queue_sysfs_entry queue_nonrot_entry = {
         .attr = {.name = "rotational", .mode = 0644 },
         .show = queue_show_nonrot,
@@ -749,6 +761,7 @@ static struct attribute *queue_attrs[] = {
         &queue_discard_zeroes_data_entry.attr,
         &queue_write_same_max_entry.attr,
         &queue_write_zeroes_max_entry.attr,
+       &queue_zone_append_max_entry.attr,
         &queue_nonrot_entry.attr,
         &queue_zoned_entry.attr,
         &queue_nr_zones_entry.attr,
diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c

index 0a73230a8f1635d91b63bdf76c89ecb4dbb01cb9..82ad0244b3d0b94f3e32d6f4134edabbcfb8c1c7 100644 (file)
--- a/drivers/scsi/scsi_lib.c
+++ b/drivers/scsi/scsi_lib.c
@@ -1706,6 +1706,7 @@ out_put_budget:
         case BLK_STS_OK:
                 break;
         case BLK_STS_RESOURCE:
+       case BLK_STS_ZONE_RESOURCE:
                 if (atomic_read(&sdev->device_busy) ||
                     scsi_device_blocked(sdev))
                         ret = BLK_STS_DEV_RESOURCE;
diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h

index 90895d594e647d4a247696df5dd01cf5f514fd2d..b90dca1fa430471212241c8884a0d897de50308e 100644 (file)
--- a/include/linux/blk_types.h
+++ b/include/linux/blk_types.h
@@ -63,6 +63,18 @@ typedef u8 __bitwise blk_status_t;
   */
  #define BLK_STS_DEV_RESOURCE   ((__force blk_status_t)13)
  
+/*
+ * BLK_STS_ZONE_RESOURCE is returned from the driver to the block layer if zone
+ * related resources are unavailable, but the driver can guarantee the queue
+ * will be rerun in the future once the resources become available again.
+ *
+ * This is different from BLK_STS_DEV_RESOURCE in that it explicitly references
+ * a zone specific resource and IO to a different zone on the same device could
+ * still be served. For example, while a zone is write-locked, a write to that
+ * zone has to wait, but a read from the same zone can still be served.
+ */
+#define BLK_STS_ZONE_RESOURCE  ((__force blk_status_t)14)
+
  /**
   * blk_path_error - returns true if error may be path related
   * @error: status the request was completed with
@@ -296,6 +308,8 @@ enum req_opf {
         REQ_OP_ZONE_CLOSE       = 11,
         /* Transition a zone to full */
         REQ_OP_ZONE_FINISH      = 12,
+       /* write data at the current zone write pointer */
+       REQ_OP_ZONE_APPEND      = 13,
  
         /* SCSI passthrough using struct scsi_request */
         REQ_OP_SCSI_IN          = 32,
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h

index d736acf7f564ec8059825b45861699361bfcf8e2..5647c78bb876809d3bc36af1d298600d8b9ee4e7 100644 (file)
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -332,6 +332,7 @@ struct queue_limits {
         unsigned int            max_hw_discard_sectors;
         unsigned int            max_write_same_sectors;
         unsigned int            max_write_zeroes_sectors;
+       unsigned int            max_zone_append_sectors;
         unsigned int            discard_granularity;
         unsigned int            discard_alignment;
  
@@ -750,6 +751,9 @@ static inline bool rq_mergeable(struct request *rq)
         if (req_op(rq) == REQ_OP_WRITE_ZEROES)
                 return false;
  
+       if (req_op(rq) == REQ_OP_ZONE_APPEND)
+               return false;
+
         if (rq->cmd_flags & REQ_NOMERGE_FLAGS)
                 return false;
         if (rq->rq_flags & RQF_NOMERGE_FLAGS)
@@ -1084,6 +1088,8 @@ extern void blk_queue_max_write_same_sectors(struct request_queue *q,
  extern void blk_queue_max_write_zeroes_sectors(struct request_queue *q,
                 unsigned int max_write_same_sectors);
  extern void blk_queue_logical_block_size(struct request_queue *, unsigned int);
+extern void blk_queue_max_zone_append_sectors(struct request_queue *q,
+               unsigned int max_zone_append_sectors);
  extern void blk_queue_physical_block_size(struct request_queue *, unsigned int);
  extern void blk_queue_alignment_offset(struct request_queue *q,
                                        unsigned int alignment);
@@ -1301,6 +1307,11 @@ static inline unsigned int queue_max_segment_size(const struct request_queue *q)
         return q->limits.max_segment_size;
  }
  
+/* Return the max_zone_append_sectors limit stored in the limits of @q. */
+static inline unsigned int queue_max_zone_append_sectors(const struct request_queue *q)
+{
+       const struct queue_limits *lim = &q->limits;
+
+       return lim->max_zone_append_sectors;
+}
+
  static inline unsigned queue_logical_block_size(const struct request_queue *q)
  {
         int retval = 512;
author	Keith Busch <kbusch@kernel.org>
	Tue, 12 May 2020 08:55:47 +0000 (17:55 +0900)
committer	Jens Axboe <axboe@kernel.dk>
	Wed, 13 May 2020 02:36:28 +0000 (20:36 -0600)
block/bio.c		patch \| blob \| history
block/blk-core.c		patch \| blob \| history
block/blk-mq.c		patch \| blob \| history
block/blk-settings.c		patch \| blob \| history
block/blk-sysfs.c		patch \| blob \| history
drivers/scsi/scsi_lib.c		patch \| blob \| history
include/linux/blk_types.h		patch \| blob \| history
include/linux/blkdev.h		patch \| blob \| history