block: Collapse padded I/O vecs exceeding IOV_MAX

author Hanna Czenczek <hreitz@redhat.com>

Tue, 11 Apr 2023 17:34:16 +0000 (19:34 +0200)

committer Hanna Czenczek <hreitz@redhat.com>

Mon, 5 Jun 2023 11:11:06 +0000 (13:11 +0200)
author Hanna Czenczek <hreitz@redhat.com>
Tue, 11 Apr 2023 17:34:16 +0000 (19:34 +0200)
committer Hanna Czenczek <hreitz@redhat.com>
Mon, 5 Jun 2023 11:11:06 +0000 (13:11 +0200)
diff --git a/block/io.c b/block/io.c

index f2dfc7c40577801ff3ff36e1f5595b85066bb83a..30748f0b5903f845802aa565410335fb00073574 100644 (file)
--- a/block/io.c
+++ b/block/io.c
@@ -1441,6 +1441,14 @@ out:
   * @merge_reads is true for small requests,
   * if @buf_len == @head + bytes + @tail. In this case it is possible that both
   * head and tail exist but @buf_len == align and @tail_buf == @buf.
+ *
+ * @write is true for write requests, false for read requests.
+ *
+ * If padding makes the vector too long (exceeding IOV_MAX), then we need to
+ * merge existing vector elements into a single one.  @collapse_bounce_buf acts
+ * as the bounce buffer in such cases.  @pre_collapse_qiov has the pre-collapse
+ * I/O vector elements so for read requests, the data can be copied back after
+ * the read is done.
   */
  typedef struct BdrvRequestPadding {
      uint8_t *buf;
@@ -1449,11 +1457,17 @@ typedef struct BdrvRequestPadding {
      size_t head;
      size_t tail;
      bool merge_reads;
+    bool write;
      QEMUIOVector local_qiov;
+
+    uint8_t *collapse_bounce_buf;
+    size_t collapse_len;
+    QEMUIOVector pre_collapse_qiov;
  } BdrvRequestPadding;
  
  static bool bdrv_init_padding(BlockDriverState *bs,
                                int64_t offset, int64_t bytes,
+                              bool write,
                                BdrvRequestPadding *pad)
  {
      int64_t align = bs->bl.request_alignment;
@@ -1485,6 +1499,8 @@ static bool bdrv_init_padding(BlockDriverState *bs,
          pad->tail_buf = pad->buf + pad->buf_len - align;
      }
  
+    pad->write = write;
+
      return true;
  }
  
@@ -1549,8 +1565,23 @@ zero_mem:
      return 0;
  }
  
-static void bdrv_padding_destroy(BdrvRequestPadding *pad)
+/**
+ * Free *pad's associated buffers, and perform any necessary finalization steps.
+ */
+static void bdrv_padding_finalize(BdrvRequestPadding *pad)
  {
+    if (pad->collapse_bounce_buf) {
+        if (!pad->write) {
+            /*
+             * If padding required elements in the vector to be collapsed into a
+             * bounce buffer, copy the bounce buffer content back
+             */
+            qemu_iovec_from_buf(&pad->pre_collapse_qiov, 0,
+                                pad->collapse_bounce_buf, pad->collapse_len);
+        }
+        qemu_vfree(pad->collapse_bounce_buf);
+        qemu_iovec_destroy(&pad->pre_collapse_qiov);
+    }
      if (pad->buf) {
          qemu_vfree(pad->buf);
          qemu_iovec_destroy(&pad->local_qiov);
@@ -1558,6 +1589,101 @@ static void bdrv_padding_destroy(BdrvRequestPadding *pad)
      memset(pad, 0, sizeof(*pad));
  }
  
+/*
+ * Create pad->local_qiov by wrapping @iov in the padding head and tail, while
+ * ensuring that the resulting vector will not exceed IOV_MAX elements.
+ *
+ * To ensure this, when necessary, the first two or three elements of @iov are
+ * merged into pad->collapse_bounce_buf and replaced by a reference to that
+ * bounce buffer in pad->local_qiov.
+ *
+ * After performing a read request, the data from the bounce buffer must be
+ * copied back into pad->pre_collapse_qiov (e.g. by bdrv_padding_finalize()).
+ */
+static int bdrv_create_padded_qiov(BlockDriverState *bs,
+                                   BdrvRequestPadding *pad,
+                                   struct iovec *iov, int niov,
+                                   size_t iov_offset, size_t bytes)
+{
+    int padded_niov, surplus_count, collapse_count;
+
+    /* Assert this invariant */
+    assert(niov <= IOV_MAX);
+
+    /*
+     * Cannot pad if resulting length would exceed SIZE_MAX.  Returning an error
+     * to the guest is not ideal, but there is little else we can do.  At least
+     * this will practically never happen on 64-bit systems.
+     */
+    if (SIZE_MAX - pad->head < bytes ||
+        SIZE_MAX - pad->head - bytes < pad->tail)
+    {
+        return -EINVAL;
+    }
+
+    /* Length of the resulting IOV if we just concatenated everything */
+    padded_niov = !!pad->head + niov + !!pad->tail;
+
+    qemu_iovec_init(&pad->local_qiov, MIN(padded_niov, IOV_MAX));
+
+    if (pad->head) {
+        qemu_iovec_add(&pad->local_qiov, pad->buf, pad->head);
+    }
+
+    /*
+     * If padded_niov > IOV_MAX, we cannot just concatenate everything.
+     * Instead, merge the first two or three elements of @iov to reduce the
+     * number of vector elements as necessary.
+     */
+    if (padded_niov > IOV_MAX) {
+        /*
+         * Only head and tail can have lead to the number of entries exceeding
+         * IOV_MAX, so we can exceed it by the head and tail at most.  We need
+         * to reduce the number of elements by `surplus_count`, so we merge that
+         * many elements plus one into one element.
+         */
+        surplus_count = padded_niov - IOV_MAX;
+        assert(surplus_count <= !!pad->head + !!pad->tail);
+        collapse_count = surplus_count + 1;
+
+        /*
+         * Move the elements to collapse into `pad->pre_collapse_qiov`, then
+         * advance `iov` (and associated variables) by those elements.
+         */
+        qemu_iovec_init(&pad->pre_collapse_qiov, collapse_count);
+        qemu_iovec_concat_iov(&pad->pre_collapse_qiov, iov,
+                              collapse_count, iov_offset, SIZE_MAX);
+        iov += collapse_count;
+        iov_offset = 0;
+        niov -= collapse_count;
+        bytes -= pad->pre_collapse_qiov.size;
+
+        /*
+         * Construct the bounce buffer to match the length of the to-collapse
+         * vector elements, and for write requests, initialize it with the data
+         * from those elements.  Then add it to `pad->local_qiov`.
+         */
+        pad->collapse_len = pad->pre_collapse_qiov.size;
+        pad->collapse_bounce_buf = qemu_blockalign(bs, pad->collapse_len);
+        if (pad->write) {
+            qemu_iovec_to_buf(&pad->pre_collapse_qiov, 0,
+                              pad->collapse_bounce_buf, pad->collapse_len);
+        }
+        qemu_iovec_add(&pad->local_qiov,
+                       pad->collapse_bounce_buf, pad->collapse_len);
+    }
+
+    qemu_iovec_concat_iov(&pad->local_qiov, iov, niov, iov_offset, bytes);
+
+    if (pad->tail) {
+        qemu_iovec_add(&pad->local_qiov,
+                       pad->buf + pad->buf_len - pad->tail, pad->tail);
+    }
+
+    assert(pad->local_qiov.niov == MIN(padded_niov, IOV_MAX));
+    return 0;
+}
+
  /*
   * bdrv_pad_request
   *
@@ -1565,6 +1691,8 @@ static void bdrv_padding_destroy(BdrvRequestPadding *pad)
   * read of padding, bdrv_padding_rmw_read() should be called separately if
   * needed.
   *
+ * @write is true for write requests, false for read requests.
+ *
   * Request parameters (@qiov, &qiov_offset, &offset, &bytes) are in-out:
   *  - on function start they represent original request
   *  - on failure or when padding is not needed they are unchanged
@@ -1573,26 +1701,34 @@ static void bdrv_padding_destroy(BdrvRequestPadding *pad)
  static int bdrv_pad_request(BlockDriverState *bs,
                              QEMUIOVector **qiov, size_t *qiov_offset,
                              int64_t *offset, int64_t *bytes,
+                            bool write,
                              BdrvRequestPadding *pad, bool *padded,
                              BdrvRequestFlags *flags)
  {
      int ret;
+    struct iovec *sliced_iov;
+    int sliced_niov;
+    size_t sliced_head, sliced_tail;
  
      bdrv_check_qiov_request(*offset, *bytes, *qiov, *qiov_offset, &error_abort);
  
-    if (!bdrv_init_padding(bs, *offset, *bytes, pad)) {
+    if (!bdrv_init_padding(bs, *offset, *bytes, write, pad)) {
          if (padded) {
              *padded = false;
          }
          return 0;
      }
  
-    ret = qemu_iovec_init_extended(&pad->local_qiov, pad->buf, pad->head,
-                                   *qiov, *qiov_offset, *bytes,
-                                   pad->buf + pad->buf_len - pad->tail,
-                                   pad->tail);
+    sliced_iov = qemu_iovec_slice(*qiov, *qiov_offset, *bytes,
+                                  &sliced_head, &sliced_tail,
+                                  &sliced_niov);
+
+    /* Guaranteed by bdrv_check_qiov_request() */
+    assert(*bytes <= SIZE_MAX);
+    ret = bdrv_create_padded_qiov(bs, pad, sliced_iov, sliced_niov,
+                                  sliced_head, *bytes);
      if (ret < 0) {
-        bdrv_padding_destroy(pad);
+        bdrv_padding_finalize(pad);
          return ret;
      }
      *bytes += pad->head + pad->tail;
@@ -1659,8 +1795,8 @@ int coroutine_fn bdrv_co_preadv_part(BdrvChild *child,
          flags |= BDRV_REQ_COPY_ON_READ;
      }
  
-    ret = bdrv_pad_request(bs, &qiov, &qiov_offset, &offset, &bytes, &pad,
-                           NULL, &flags);
+    ret = bdrv_pad_request(bs, &qiov, &qiov_offset, &offset, &bytes, false,
+                           &pad, NULL, &flags);
      if (ret < 0) {
          goto fail;
      }
@@ -1670,7 +1806,7 @@ int coroutine_fn bdrv_co_preadv_part(BdrvChild *child,
                                bs->bl.request_alignment,
                                qiov, qiov_offset, flags);
      tracked_request_end(&req);
-    bdrv_padding_destroy(&pad);
+    bdrv_padding_finalize(&pad);
  
  fail:
      bdrv_dec_in_flight(bs);
@@ -2002,7 +2138,7 @@ bdrv_co_do_zero_pwritev(BdrvChild *child, int64_t offset, int64_t bytes,
      /* This flag doesn't make sense for padding or zero writes */
      flags &= ~BDRV_REQ_REGISTERED_BUF;
  
-    padding = bdrv_init_padding(bs, offset, bytes, &pad);
+    padding = bdrv_init_padding(bs, offset, bytes, true, &pad);
      if (padding) {
          assert(!(flags & BDRV_REQ_NO_WAIT));
          bdrv_make_request_serialising(req, align);
@@ -2050,7 +2186,7 @@ bdrv_co_do_zero_pwritev(BdrvChild *child, int64_t offset, int64_t bytes,
      }
  
  out:
-    bdrv_padding_destroy(&pad);
+    bdrv_padding_finalize(&pad);
  
      return ret;
  }
@@ -2118,8 +2254,8 @@ int coroutine_fn bdrv_co_pwritev_part(BdrvChild *child,
           * bdrv_co_do_zero_pwritev() does aligning by itself, so, we do
           * alignment only if there is no ZERO flag.
           */
-        ret = bdrv_pad_request(bs, &qiov, &qiov_offset, &offset, &bytes, &pad,
-                               &padded, &flags);
+        ret = bdrv_pad_request(bs, &qiov, &qiov_offset, &offset, &bytes, true,
+                               &pad, &padded, &flags);
          if (ret < 0) {
              return ret;
          }
@@ -2149,7 +2285,7 @@ int coroutine_fn bdrv_co_pwritev_part(BdrvChild *child,
      ret = bdrv_aligned_pwritev(child, &req, offset, bytes, align,
                                 qiov, qiov_offset, flags);
  
-    bdrv_padding_destroy(&pad);
+    bdrv_padding_finalize(&pad);
  
  out:
      tracked_request_end(&req);
author	Hanna Czenczek <hreitz@redhat.com>
	Tue, 11 Apr 2023 17:34:16 +0000 (19:34 +0200)
committer	Hanna Czenczek <hreitz@redhat.com>
	Mon, 5 Jun 2023 11:11:06 +0000 (13:11 +0200)