io_uring: add support for user mapped provided buffer ring
author Jens Axboe <axboe@kernel.dk>
Tue, 14 Mar 2023 17:07:19 +0000 (11:07 -0600)
committer Jens Axboe <axboe@kernel.dk>
Mon, 3 Apr 2023 13:14:21 +0000 (07:14 -0600)
Ring mapped provided buffer rings currently rely on the application
allocating the memory for the ring, which the kernel then maps. This
generally works fine, but runs into issues on some architectures where
the kernel and application virtual addresses for the ring must play
nicely together. This at least impacts architectures that set
SHM_COLOUR, but potentially also any architecture setting SHMLBA.

To use this variant of ring provided buffers, the application need not
allocate any memory for the ring. Instead the kernel will do so, and
the application must subsequently call mmap(2) on the ring with the
offset set to:

IORING_OFF_PBUF_RING | (bgid << IORING_OFF_PBUF_SHIFT)

to get a virtual address for the buffer ring. Normally the application
would allocate a suitably sized and correctly aligned piece of memory
and simply pass that in via io_uring_buf_reg.ring_addr, and the kernel
would map it.

Outside of the setup differences, the kernel allocated + user mapped
provided buffer ring works exactly the same.
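
For illustration only (not part of the patch), the flow could look like
the sketch below. It assumes a kernel with this change and uses raw
syscalls; setup_pbuf_ring(), ring_fd, and the bgid/entries values are
made up for the example, and error handling is minimal:

#define _GNU_SOURCE
#include <string.h>
#include <unistd.h>
#include <sys/mman.h>
#include <sys/syscall.h>
#include <linux/io_uring.h>

/* Sketch: register a kernel allocated buffer ring, then mmap(2) it */
static struct io_uring_buf_ring *setup_pbuf_ring(int ring_fd,
                                                 unsigned short bgid,
                                                 unsigned int entries)
{
        struct io_uring_buf_reg reg;
        unsigned long long off;
        void *ring;

        memset(&reg, 0, sizeof(reg));
        reg.ring_entries = entries;     /* must be a power of 2 */
        reg.bgid = bgid;
        reg.flags = IOU_PBUF_RING_MMAP; /* ring_addr must remain 0 */

        if (syscall(__NR_io_uring_register, ring_fd,
                    IORING_REGISTER_PBUF_RING, &reg, 1) < 0)
                return NULL;

        /* the buffer group ID is encoded in the mmap offset */
        off = IORING_OFF_PBUF_RING |
              ((unsigned long long) bgid << IORING_OFF_PBUF_SHIFT);
        ring = mmap(NULL, entries * sizeof(struct io_uring_buf),
                    PROT_READ | PROT_WRITE, MAP_SHARED | MAP_POPULATE,
                    ring_fd, off);
        return ring != MAP_FAILED ? ring : NULL;
}

On the kernel side, the bgid travels in the low bits of the offset and
is recovered via IORING_OFF_MMAP_MASK and IORING_OFF_PBUF_SHIFT in
io_uring_validate_mmap_request(), as the io_uring.c hunk below shows.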

Acked-by: Helge Deller <deller@gmx.de>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
include/uapi/linux/io_uring.h
io_uring/io_uring.c
io_uring/kbuf.c
io_uring/kbuf.h

index c3f3ea997f3a0f3b1bfa8ccf9d77820960f41ba0..1d59c816a5b8fa261e443cd9daeedaa9c694ba4d 100644
@@ -389,6 +389,9 @@ enum {
 #define IORING_OFF_SQ_RING             0ULL
 #define IORING_OFF_CQ_RING             0x8000000ULL
 #define IORING_OFF_SQES                        0x10000000ULL
+#define IORING_OFF_PBUF_RING           0x80000000ULL
+#define IORING_OFF_PBUF_SHIFT          16
+#define IORING_OFF_MMAP_MASK           0xf8000000ULL
 
 /*
  * Filled with the offset for mmap(2)
@@ -635,6 +638,20 @@ struct io_uring_buf_ring {
        };
 };
 
+/*
+ * Flags for IORING_REGISTER_PBUF_RING.
+ *
+ * IOU_PBUF_RING_MMAP: If set, kernel will allocate the memory for the ring.
+ *                     The application must not set a ring_addr in struct
+ *                     io_uring_buf_reg, instead it must subsequently call
+ *                     mmap(2) with the offset set as:
+ *                     IORING_OFF_PBUF_RING | (bgid << IORING_OFF_PBUF_SHIFT)
+ *                     to get a virtual mapping for the ring.
+ */
+enum {
+       IOU_PBUF_RING_MMAP      = 1,
+};
+
 /* argument for IORING_(UN)REGISTER_PBUF_RING */
 struct io_uring_buf_reg {
        __u64   ring_addr;
index b49b7ee12d60063648fb482547ea6c072e78cc05..d72aa92ce2d6dd3000041889569089497835b555 100644
@@ -3289,7 +3289,7 @@ static void *io_uring_validate_mmap_request(struct file *file,
        struct page *page;
        void *ptr;
 
-       switch (offset) {
+       switch (offset & IORING_OFF_MMAP_MASK) {
        case IORING_OFF_SQ_RING:
        case IORING_OFF_CQ_RING:
                ptr = ctx->rings;
@@ -3297,6 +3297,17 @@ static void *io_uring_validate_mmap_request(struct file *file,
        case IORING_OFF_SQES:
                ptr = ctx->sq_sqes;
                break;
+       case IORING_OFF_PBUF_RING: {
+               unsigned int bgid;
+
+               bgid = (offset & ~IORING_OFF_MMAP_MASK) >> IORING_OFF_PBUF_SHIFT;
+               mutex_lock(&ctx->uring_lock);
+               ptr = io_pbuf_get_address(ctx, bgid);
+               mutex_unlock(&ctx->uring_lock);
+               if (!ptr)
+                       return ERR_PTR(-EINVAL);
+               break;
+               }
        default:
                return ERR_PTR(-EINVAL);
        }
index 4b2f4a0ee9627fc7a43016091037589357bb2b33..cd1d9dddf58ec5f749dcdf80c78cdb9d6e967379 100644
@@ -137,7 +137,8 @@ static void __user *io_ring_buffer_select(struct io_kiocb *req, size_t *len,
                return NULL;
 
        head &= bl->mask;
-       if (head < IO_BUFFER_LIST_BUF_PER_PAGE) {
+       /* mmaped buffers are always contig */
+       if (bl->is_mmap || head < IO_BUFFER_LIST_BUF_PER_PAGE) {
                buf = &br->bufs[head];
        } else {
                int off = head & (IO_BUFFER_LIST_BUF_PER_PAGE - 1);
@@ -214,15 +215,27 @@ static int __io_remove_buffers(struct io_ring_ctx *ctx,
        if (!nbufs)
                return 0;
 
-       if (bl->is_mapped && bl->buf_nr_pages) {
-               int j;
-
+       if (bl->is_mapped) {
                i = bl->buf_ring->tail - bl->head;
-               for (j = 0; j < bl->buf_nr_pages; j++)
-                       unpin_user_page(bl->buf_pages[j]);
-               kvfree(bl->buf_pages);
-               bl->buf_pages = NULL;
-               bl->buf_nr_pages = 0;
+               if (bl->is_mmap) {
+                       if (bl->buf_ring) {
+                               struct page *page;
+
+                               page = virt_to_head_page(bl->buf_ring);
+                               if (put_page_testzero(page))
+                                       free_compound_page(page);
+                               bl->buf_ring = NULL;
+                       }
+                       bl->is_mmap = 0;
+               } else if (bl->buf_nr_pages) {
+                       int j;
+
+                       for (j = 0; j < bl->buf_nr_pages; j++)
+                               unpin_user_page(bl->buf_pages[j]);
+                       kvfree(bl->buf_pages);
+                       bl->buf_pages = NULL;
+                       bl->buf_nr_pages = 0;
+               }
                /* make sure it's seen as empty */
                INIT_LIST_HEAD(&bl->buf_list);
                bl->is_mapped = 0;
@@ -482,6 +495,25 @@ static int io_pin_pbuf_ring(struct io_uring_buf_reg *reg,
        bl->buf_nr_pages = nr_pages;
        bl->buf_ring = br;
        bl->is_mapped = 1;
+       bl->is_mmap = 0;
+       return 0;
+}
+
+static int io_alloc_pbuf_ring(struct io_uring_buf_reg *reg,
+                             struct io_buffer_list *bl)
+{
+       gfp_t gfp = GFP_KERNEL_ACCOUNT | __GFP_ZERO | __GFP_NOWARN | __GFP_COMP;
+       size_t ring_size;
+       void *ptr;
+
+       ring_size = reg->ring_entries * sizeof(struct io_uring_buf_ring);
+       ptr = (void *) __get_free_pages(gfp, get_order(ring_size));
+       if (!ptr)
+               return -ENOMEM;
+
+       bl->buf_ring = ptr;
+       bl->is_mapped = 1;
+       bl->is_mmap = 1;
        return 0;
 }
 
@@ -496,12 +528,18 @@ int io_register_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg)
 
        if (reg.resv[0] || reg.resv[1] || reg.resv[2])
                return -EINVAL;
-       if (reg.flags)
-               return -EINVAL;
-       if (!reg.ring_addr)
-               return -EFAULT;
-       if (reg.ring_addr & ~PAGE_MASK)
+       if (reg.flags & ~IOU_PBUF_RING_MMAP)
                return -EINVAL;
+       if (!(reg.flags & IOU_PBUF_RING_MMAP)) {
+               if (!reg.ring_addr)
+                       return -EFAULT;
+               if (reg.ring_addr & ~PAGE_MASK)
+                       return -EINVAL;
+       } else {
+               if (reg.ring_addr)
+                       return -EINVAL;
+       }
+
        if (!is_power_of_2(reg.ring_entries))
                return -EINVAL;
 
@@ -526,17 +564,21 @@ int io_register_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg)
                        return -ENOMEM;
        }
 
-       ret = io_pin_pbuf_ring(&reg, bl);
-       if (ret) {
-               kfree(free_bl);
-               return ret;
-       }
+       if (!(reg.flags & IOU_PBUF_RING_MMAP))
+               ret = io_pin_pbuf_ring(&reg, bl);
+       else
+               ret = io_alloc_pbuf_ring(&reg, bl);
 
-       bl->nr_entries = reg.ring_entries;
-       bl->mask = reg.ring_entries - 1;
+       if (!ret) {
+               bl->nr_entries = reg.ring_entries;
+               bl->mask = reg.ring_entries - 1;
 
-       io_buffer_add_list(ctx, bl, reg.bgid);
-       return 0;
+               io_buffer_add_list(ctx, bl, reg.bgid);
+               return 0;
+       }
+
+       kfree(free_bl);
+       return ret;
 }
 
 int io_unregister_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg)
@@ -564,3 +606,14 @@ int io_unregister_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg)
        }
        return 0;
 }
+
+void *io_pbuf_get_address(struct io_ring_ctx *ctx, unsigned long bgid)
+{
+       struct io_buffer_list *bl;
+
+       bl = io_buffer_get_list(ctx, bgid);
+       if (!bl || !bl->is_mmap)
+               return NULL;
+
+       return bl->buf_ring;
+}
index 61b9c7dade9d36ca074c7614dae4b358c6aba812..d14345ef61fc8de071da1f0e54a7ac8d24ee1dd0 100644
@@ -26,6 +26,8 @@ struct io_buffer_list {
 
        /* ring mapped provided buffers */
        __u8 is_mapped;
+       /* ring mapped provided buffers, but mmap'ed by application */
+       __u8 is_mmap;
 };
 
 struct io_buffer {
@@ -53,6 +55,8 @@ unsigned int __io_put_kbuf(struct io_kiocb *req, unsigned issue_flags);
 
 void io_kbuf_recycle_legacy(struct io_kiocb *req, unsigned issue_flags);
 
+void *io_pbuf_get_address(struct io_ring_ctx *ctx, unsigned long bgid);
+
 static inline void io_kbuf_recycle_ring(struct io_kiocb *req)
 {
        /*