netfs: Implement unbuffered/DIO read support
author    David Howells <dhowells@redhat.com>
          Fri, 14 Jan 2022 17:39:55 +0000 (17:39 +0000)
committer David Howells <dhowells@redhat.com>
          Thu, 28 Dec 2023 09:45:23 +0000 (09:45 +0000)
Implement support for unbuffered and DIO reads in the netfs library,
utilising the existing read helper code to do block splitting and
individual queuing.  The code also handles extraction of the destination
buffer from the supplied iterator, allowing async unbuffered reads to take
place.
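
A network filesystem can route its O_DIRECT reads through the new helper
from its ->read_iter() method, roughly along the lines of the following
sketch (the myfs_* naming is illustrative, not part of this patch):

    static ssize_t myfs_file_read_iter(struct kiocb *iocb, struct iov_iter *iter)
    {
            /* Unbuffered/direct reads bypass the pagecache entirely. */
            if (iocb->ki_flags & IOCB_DIRECT)
                    return netfs_unbuffered_read_iter(iocb, iter);

            return generic_file_read_iter(iocb, iter);
    }

The helper takes the inode's DIO exclusion lock itself by way of
netfs_start_io_direct() before performing the read.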

The read will be split up according to the rsize setting and, if supplied,
the ->clamp_length() method.  Note that the next subrequest will be issued
as soon as issue_op returns, without waiting for previous ones to finish.
The network filesystem needs to pause or handle queuing them if it doesn't
want to fire them all at the server simultaneously.
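
For instance, a filesystem whose protocol caps the size of a single read
RPC could supply a ->clamp_length() along the following lines
(MYFS_MAX_WIRE_READ is a made-up constant; the rsize and EOF clamps have
already been applied by the time this is called):

    static bool myfs_clamp_length(struct netfs_io_subrequest *subreq)
    {
            /* Trim the slice to what one read RPC can carry. */
            subreq->len = min_t(size_t, subreq->len, MYFS_MAX_WIRE_READ);
            return true;
    }

Returning false from ->clamp_length() indicates that the subrequest could
not be set up.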

Once all the subrequests have finished, the state will be assessed and the
amount of data to be indicated as having been obtained will be determined.
As the subrequests may finish in any order, if an intermediate subrequest
is short, any further subrequests may be copied into the buffer and then
abandoned.  For example, if the second of three subrequests comes up short,
only the data up to the end of that short transfer is counted as having
been read, even if the third subrequest completed in full.

In the future, this will also take care of doing an unbuffered read from
encrypted content, with the decryption being done by the library.

Signed-off-by: David Howells <dhowells@redhat.com>
cc: Jeff Layton <jlayton@kernel.org>
cc: linux-cachefs@redhat.com
cc: linux-fsdevel@vger.kernel.org
cc: linux-mm@kvack.org

fs/netfs/Makefile
fs/netfs/direct_read.c [new file with mode: 0644]
fs/netfs/internal.h
fs/netfs/io.c
fs/netfs/main.c
fs/netfs/objects.c
fs/netfs/stats.c
include/linux/netfs.h
include/trace/events/netfs.h
mm/filemap.c

index 85d8333a1ed463ba9db74776a0b2dc993fbaa44a..e968ab1eca400fb1ebd04570dc6d82284e404c34 100644 (file)
@@ -3,6 +3,7 @@
 netfs-y := \
        buffered_read.o \
        buffered_write.o \
+       direct_read.o \
        io.o \
        iterator.o \
        locking.o \
diff --git a/fs/netfs/direct_read.c b/fs/netfs/direct_read.c
new file mode 100644 (file)
index 0000000..ad4370b
--- /dev/null
@@ -0,0 +1,125 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/* Direct I/O support.
+ *
+ * Copyright (C) 2023 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ */
+
+#include <linux/export.h>
+#include <linux/fs.h>
+#include <linux/mm.h>
+#include <linux/pagemap.h>
+#include <linux/slab.h>
+#include <linux/uio.h>
+#include <linux/sched/mm.h>
+#include <linux/task_io_accounting_ops.h>
+#include <linux/netfs.h>
+#include "internal.h"
+
+/**
+ * netfs_unbuffered_read_iter_locked - Perform an unbuffered or direct I/O read
+ * @iocb: The I/O control descriptor describing the read
+ * @iter: The output buffer (also specifies read length)
+ *
+ * Perform an unbuffered I/O or direct I/O from the file in @iocb to the
+ * output buffer.  No use is made of the pagecache.
+ *
+ * The caller must hold any appropriate locks.
+ */
+static ssize_t netfs_unbuffered_read_iter_locked(struct kiocb *iocb, struct iov_iter *iter)
+{
+       struct netfs_io_request *rreq;
+       ssize_t ret;
+       size_t orig_count = iov_iter_count(iter);
+       bool async = !is_sync_kiocb(iocb);
+
+       _enter("");
+
+       if (!orig_count)
+               return 0; /* Don't update atime */
+
+       ret = kiocb_write_and_wait(iocb, orig_count);
+       if (ret < 0)
+               return ret;
+       file_accessed(iocb->ki_filp);
+
+       rreq = netfs_alloc_request(iocb->ki_filp->f_mapping, iocb->ki_filp,
+                                  iocb->ki_pos, orig_count,
+                                  NETFS_DIO_READ);
+       if (IS_ERR(rreq))
+               return PTR_ERR(rreq);
+
+       netfs_stat(&netfs_n_rh_dio_read);
+       trace_netfs_read(rreq, rreq->start, rreq->len, netfs_read_trace_dio_read);
+
+       /* If this is an async op, we have to keep track of the destination
+        * buffer for ourselves as the caller's iterator will be trashed when
+        * we return.
+        *
+        * In such a case, extract an iterator to represent as much of the
+        * output buffer as we can manage.  Note that the extraction might not
+        * be able to allocate a sufficiently large bvec array and may shorten
+        * the request.
+        */
+       if (user_backed_iter(iter)) {
+               ret = netfs_extract_user_iter(iter, rreq->len, &rreq->iter, 0);
+               if (ret < 0)
+                       goto out;
+               rreq->direct_bv = (struct bio_vec *)rreq->iter.bvec;
+               rreq->direct_bv_count = ret;
+               rreq->direct_bv_unpin = iov_iter_extract_will_pin(iter);
+               rreq->len = iov_iter_count(&rreq->iter);
+       } else {
+               rreq->iter = *iter;
+               rreq->len = orig_count;
+               rreq->direct_bv_unpin = false;
+               iov_iter_advance(iter, orig_count);
+       }
+
+       // TODO: Set up bounce buffer if needed
+
+       if (async)
+               rreq->iocb = iocb;
+
+       ret = netfs_begin_read(rreq, is_sync_kiocb(iocb));
+       if (ret < 0)
+               goto out; /* May be -EIOCBQUEUED */
+       if (!async) {
+               // TODO: Copy from bounce buffer
+               iocb->ki_pos += rreq->transferred;
+               ret = rreq->transferred;
+       }
+
+out:
+       netfs_put_request(rreq, false, netfs_rreq_trace_put_return);
+       if (ret > 0)
+               orig_count -= ret;
+       if (ret != -EIOCBQUEUED)
+               iov_iter_revert(iter, orig_count - iov_iter_count(iter));
+       return ret;
+}
+
+/**
+ * netfs_unbuffered_read_iter - Perform an unbuffered or direct I/O read
+ * @iocb: The I/O control descriptor describing the read
+ * @iter: The output buffer (also specifies read length)
+ *
+ * Perform an unbuffered I/O or direct I/O from the file in @iocb to the
+ * output buffer.  No use is made of the pagecache.
+ */
+ssize_t netfs_unbuffered_read_iter(struct kiocb *iocb, struct iov_iter *iter)
+{
+       struct inode *inode = file_inode(iocb->ki_filp);
+       ssize_t ret;
+
+       if (!iter->count)
+               return 0; /* Don't update atime */
+
+       ret = netfs_start_io_direct(inode);
+       if (ret == 0) {
+               ret = netfs_unbuffered_read_iter_locked(iocb, iter);
+               netfs_end_io_direct(inode);
+       }
+       return ret;
+}
+EXPORT_SYMBOL(netfs_unbuffered_read_iter);
index 17e4ea4456c7555a47cb40f3f6fdef525a50235a..886c2e8f841fbe71a3b6188166c9906c30f72d03 100644 (file)
@@ -100,6 +100,7 @@ int netfs_begin_write(struct netfs_io_request *wreq, bool may_wait,
  * stats.c
  */
 #ifdef CONFIG_NETFS_STATS
+extern atomic_t netfs_n_rh_dio_read;
 extern atomic_t netfs_n_rh_readahead;
 extern atomic_t netfs_n_rh_readpage;
 extern atomic_t netfs_n_rh_rreq;
index 774aef6ea4cbcd4a4539d70c12e05ad3c1ae0934..c972415c8aad382bf28306d19880fccfdad349f9 100644 (file)
@@ -78,7 +78,9 @@ static void netfs_read_from_server(struct netfs_io_request *rreq,
                                   struct netfs_io_subrequest *subreq)
 {
        netfs_stat(&netfs_n_rh_download);
-       if (iov_iter_count(&subreq->io_iter) != subreq->len - subreq->transferred)
+
+       if (rreq->origin != NETFS_DIO_READ &&
+           iov_iter_count(&subreq->io_iter) != subreq->len - subreq->transferred)
                pr_warn("R=%08x[%u] ITER PRE-MISMATCH %zx != %zx-%zx %lx\n",
                        rreq->debug_id, subreq->debug_index,
                        iov_iter_count(&subreq->io_iter), subreq->len,
@@ -341,6 +343,43 @@ static void netfs_rreq_is_still_valid(struct netfs_io_request *rreq)
        }
 }
 
+/*
+ * Determine how much we can admit to having read from a DIO read.
+ */
+static void netfs_rreq_assess_dio(struct netfs_io_request *rreq)
+{
+       struct netfs_io_subrequest *subreq;
+       unsigned int i;
+       size_t transferred = 0;
+
+       for (i = 0; i < rreq->direct_bv_count; i++)
+               flush_dcache_page(rreq->direct_bv[i].bv_page);
+
+       list_for_each_entry(subreq, &rreq->subrequests, rreq_link) {
+               if (subreq->error || subreq->transferred == 0)
+                       break;
+               transferred += subreq->transferred;
+               if (subreq->transferred < subreq->len)
+                       break;
+       }
+
+       for (i = 0; i < rreq->direct_bv_count; i++)
+               flush_dcache_page(rreq->direct_bv[i].bv_page);
+
+       rreq->transferred = transferred;
+       task_io_account_read(transferred);
+
+       if (rreq->iocb) {
+               rreq->iocb->ki_pos += transferred;
+               if (rreq->iocb->ki_complete)
+                       rreq->iocb->ki_complete(
+                               rreq->iocb, rreq->error ? rreq->error : transferred);
+       }
+       if (rreq->netfs_ops->done)
+               rreq->netfs_ops->done(rreq);
+       inode_dio_end(rreq->inode);
+}
+
 /*
  * Assess the state of a read request and decide what to do next.
  *
@@ -361,7 +400,10 @@ again:
                return;
        }
 
-       netfs_rreq_unlock_folios(rreq);
+       if (rreq->origin != NETFS_DIO_READ)
+               netfs_rreq_unlock_folios(rreq);
+       else
+               netfs_rreq_assess_dio(rreq);
 
        trace_netfs_rreq(rreq, netfs_rreq_trace_wake_ip);
        clear_bit_unlock(NETFS_RREQ_IN_PROGRESS, &rreq->flags);
@@ -526,14 +568,16 @@ netfs_rreq_prepare_read(struct netfs_io_request *rreq,
                        struct netfs_io_subrequest *subreq,
                        struct iov_iter *io_iter)
 {
-       enum netfs_io_source source;
+       enum netfs_io_source source = NETFS_DOWNLOAD_FROM_SERVER;
        size_t lsize;
 
        _enter("%llx-%llx,%llx", subreq->start, subreq->start + subreq->len, rreq->i_size);
 
-       source = netfs_cache_prepare_read(subreq, rreq->i_size);
-       if (source == NETFS_INVALID_READ)
-               goto out;
+       if (rreq->origin != NETFS_DIO_READ) {
+               source = netfs_cache_prepare_read(subreq, rreq->i_size);
+               if (source == NETFS_INVALID_READ)
+                       goto out;
+       }
 
        if (source == NETFS_DOWNLOAD_FROM_SERVER) {
                /* Call out to the netfs to let it shrink the request to fit
@@ -544,6 +588,8 @@ netfs_rreq_prepare_read(struct netfs_io_request *rreq,
                 */
                if (subreq->len > rreq->i_size - subreq->start)
                        subreq->len = rreq->i_size - subreq->start;
+               if (rreq->rsize && subreq->len > rreq->rsize)
+                       subreq->len = rreq->rsize;
 
                if (rreq->netfs_ops->clamp_length &&
                    !rreq->netfs_ops->clamp_length(subreq)) {
@@ -662,6 +708,10 @@ int netfs_begin_read(struct netfs_io_request *rreq, bool sync)
                return -EIO;
        }
 
+       if (rreq->origin == NETFS_DIO_READ)
+               inode_dio_begin(rreq->inode);
+
+       // TODO: Use bounce buffer if requested
        rreq->io_iter = rreq->iter;
 
        INIT_WORK(&rreq->work, netfs_rreq_work);
@@ -673,11 +723,25 @@ int netfs_begin_read(struct netfs_io_request *rreq, bool sync)
        atomic_set(&rreq->nr_outstanding, 1);
        io_iter = rreq->io_iter;
        do {
+               _debug("submit %llx + %zx >= %llx",
+                      rreq->start, rreq->submitted, rreq->i_size);
+               if (rreq->origin == NETFS_DIO_READ &&
+                   rreq->start + rreq->submitted >= rreq->i_size)
+                       break;
                if (!netfs_rreq_submit_slice(rreq, &io_iter, &debug_index))
                        break;
+               if (test_bit(NETFS_RREQ_BLOCKED, &rreq->flags) &&
+                   test_bit(NETFS_RREQ_NONBLOCK, &rreq->flags))
+                       break;
 
        } while (rreq->submitted < rreq->len);
 
+       if (!rreq->submitted) {
+               netfs_put_request(rreq, false, netfs_rreq_trace_put_no_submit);
+               ret = 0;
+               goto out;
+       }
+
        if (sync) {
                /* Keep nr_outstanding incremented so that the ref always
                 * belongs to us, and the service code isn't punted off to a
@@ -694,7 +758,8 @@ int netfs_begin_read(struct netfs_io_request *rreq, bool sync)
                            TASK_UNINTERRUPTIBLE);
 
                ret = rreq->error;
-               if (ret == 0 && rreq->submitted < rreq->len) {
+               if (ret == 0 && rreq->submitted < rreq->len &&
+                   rreq->origin != NETFS_DIO_READ) {
                        trace_netfs_failure(rreq, NULL, ret, netfs_fail_short_read);
                        ret = -EIO;
                }
@@ -702,7 +767,9 @@ int netfs_begin_read(struct netfs_io_request *rreq, bool sync)
                /* If we decrement nr_outstanding to 0, the ref belongs to us. */
                if (atomic_dec_and_test(&rreq->nr_outstanding))
                        netfs_rreq_assess(rreq, false);
-               ret = 0;
+               ret = -EIOCBQUEUED;
        }
+
+out:
        return ret;
 }
index ab6cac11067625287d1e47afa5d69ce7e07db052..abb8857486ee1420a44893e2826ee9db76b03c8b 100644 (file)
@@ -30,6 +30,7 @@ static const char *netfs_origins[nr__netfs_io_origin] = {
        [NETFS_READPAGE]        = "RP",
        [NETFS_READ_FOR_WRITE]  = "RW",
        [NETFS_WRITEBACK]       = "WB",
+       [NETFS_DIO_READ]        = "DR",
 };
 
 /*
index 3aa0bfbc04ec3b00e5e31b4a7fd540a41a44afe8..7153f24e80341ddb516c395b123c998b5b533407 100644 (file)
@@ -20,7 +20,8 @@ struct netfs_io_request *netfs_alloc_request(struct address_space *mapping,
        struct inode *inode = file ? file_inode(file) : mapping->host;
        struct netfs_inode *ctx = netfs_inode(inode);
        struct netfs_io_request *rreq;
-       bool cached = netfs_is_cache_enabled(ctx);
+       bool is_dio = (origin == NETFS_DIO_READ);
+       bool cached = !is_dio && netfs_is_cache_enabled(ctx);
        int ret;
 
        rreq = kzalloc(ctx->ops->io_request_size ?: sizeof(struct netfs_io_request),
@@ -42,6 +43,8 @@ struct netfs_io_request *netfs_alloc_request(struct address_space *mapping,
        __set_bit(NETFS_RREQ_IN_PROGRESS, &rreq->flags);
        if (cached)
                __set_bit(NETFS_RREQ_WRITE_TO_CACHE, &rreq->flags);
+       if (file && file->f_flags & O_NONBLOCK)
+               __set_bit(NETFS_RREQ_NONBLOCK, &rreq->flags);
        if (rreq->netfs_ops->init_request) {
                ret = rreq->netfs_ops->init_request(rreq, file);
                if (ret < 0) {
index c1f85cd595a47f0ddc21fb5cf083faad3957b6cd..15fd5c3f0f392bd9253e091a32bd851f16f3aef6 100644 (file)
@@ -9,6 +9,7 @@
 #include <linux/seq_file.h>
 #include "internal.h"
 
+atomic_t netfs_n_rh_dio_read;
 atomic_t netfs_n_rh_readahead;
 atomic_t netfs_n_rh_readpage;
 atomic_t netfs_n_rh_rreq;
@@ -36,7 +37,8 @@ atomic_t netfs_n_wh_write_failed;
 
 int netfs_stats_show(struct seq_file *m, void *v)
 {
-       seq_printf(m, "Netfs  : RA=%u RP=%u WB=%u WBZ=%u rr=%u sr=%u\n",
+       seq_printf(m, "Netfs  : DR=%u RA=%u RP=%u WB=%u WBZ=%u rr=%u sr=%u\n",
+                  atomic_read(&netfs_n_rh_dio_read),
                   atomic_read(&netfs_n_rh_readahead),
                   atomic_read(&netfs_n_rh_readpage),
                   atomic_read(&netfs_n_rh_write_begin),
index 70f578cf3715f97845eb38f193a98a424b48a269..7c13095684598387967de226d19b0e54dd8b862c 100644 (file)
@@ -226,6 +226,7 @@ enum netfs_io_origin {
        NETFS_READPAGE,                 /* This read is a synchronous read */
        NETFS_READ_FOR_WRITE,           /* This read is to prepare a write */
        NETFS_WRITEBACK,                /* This write was triggered by writepages */
+       NETFS_DIO_READ,                 /* This is a direct I/O read */
        nr__netfs_io_origin
 } __mode(byte);
 
@@ -240,6 +241,7 @@ struct netfs_io_request {
        };
        struct inode            *inode;         /* The file being accessed */
        struct address_space    *mapping;       /* The mapping being accessed */
+       struct kiocb            *iocb;          /* AIO completion vector */
        struct netfs_cache_resources cache_resources;
        struct list_head        proc_link;      /* Link in netfs_iorequests */
        struct list_head        subrequests;    /* Contributory I/O operations */
@@ -249,12 +251,14 @@ struct netfs_io_request {
        struct bio_vec          *direct_bv;     /* DIO buffer list (when handling iovec-iter) */
        unsigned int            direct_bv_count; /* Number of elements in direct_bv[] */
        unsigned int            debug_id;
+       unsigned int            rsize;          /* Maximum read size (0 for none) */
        unsigned int            wsize;          /* Maximum write size (0 for none) */
        unsigned int            subreq_counter; /* Next subreq->debug_index */
        atomic_t                nr_outstanding; /* Number of ops in progress */
        atomic_t                nr_copy_ops;    /* Number of copy-to-cache ops in progress */
        size_t                  submitted;      /* Amount submitted for I/O so far */
        size_t                  len;            /* Length of the request */
+       size_t                  transferred;    /* Amount to be indicated as transferred */
        short                   error;          /* 0 or error that occurred */
        enum netfs_io_origin    origin;         /* Origin of the request */
        bool                    direct_bv_unpin; /* T if direct_bv[] must be unpinned */
@@ -271,6 +275,8 @@ struct netfs_io_request {
 #define NETFS_RREQ_IN_PROGRESS         5       /* Unlocked when the request completes */
 #define NETFS_RREQ_WRITE_TO_CACHE      7       /* Need to write to the cache */
 #define NETFS_RREQ_UPLOAD_TO_SERVER    8       /* Need to write to the server */
+#define NETFS_RREQ_NONBLOCK            9       /* Don't block if possible (O_NONBLOCK) */
+#define NETFS_RREQ_BLOCKED             10      /* We blocked */
        const struct netfs_request_ops *netfs_ops;
        void (*cleanup)(struct netfs_io_request *req);
 };
@@ -367,6 +373,9 @@ struct netfs_cache_ops {
                               loff_t *_data_start, size_t *_data_len);
 };
 
+/* High-level read API. */
+ssize_t netfs_unbuffered_read_iter(struct kiocb *iocb, struct iov_iter *iter);
+
 /* High-level write API */
 ssize_t netfs_perform_write(struct kiocb *iocb, struct iov_iter *iter,
                            struct netfs_group *netfs_group);
index 082a5e717b58ff420c2cdab45fe97ad1809216b6..5a4edadf0e59d5cdaaed54f93eeff6c6453f826e 100644 (file)
@@ -16,6 +16,7 @@
  * Define enums for tracing information.
  */
 #define netfs_read_traces                                      \
+       EM(netfs_read_trace_dio_read,           "DIO-READ ")    \
        EM(netfs_read_trace_expanded,           "EXPANDED ")    \
        EM(netfs_read_trace_readahead,          "READAHEAD")    \
        EM(netfs_read_trace_readpage,           "READPAGE ")    \
@@ -31,7 +32,8 @@
        EM(NETFS_READAHEAD,                     "RA")           \
        EM(NETFS_READPAGE,                      "RP")           \
        EM(NETFS_READ_FOR_WRITE,                "RW")           \
-       E_(NETFS_WRITEBACK,                     "WB")
+       EM(NETFS_WRITEBACK,                     "WB")           \
+       E_(NETFS_DIO_READ,                      "DR")
 
 #define netfs_rreq_traces                                      \
        EM(netfs_rreq_trace_assess,             "ASSESS ")      \
@@ -70,6 +72,8 @@
 #define netfs_failures                                                 \
        EM(netfs_fail_check_write_begin,        "check-write-begin")    \
        EM(netfs_fail_copy_to_cache,            "copy-to-cache")        \
+       EM(netfs_fail_dio_read_short,           "dio-read-short")       \
+       EM(netfs_fail_dio_read_zero,            "dio-read-zero")        \
        EM(netfs_fail_read,                     "read")                 \
        EM(netfs_fail_short_read,               "short-read")           \
        EM(netfs_fail_prepare_write,            "prep-write")           \
@@ -81,6 +85,7 @@
        EM(netfs_rreq_trace_put_complete,       "PUT COMPLT ")  \
        EM(netfs_rreq_trace_put_discard,        "PUT DISCARD")  \
        EM(netfs_rreq_trace_put_failed,         "PUT FAILED ")  \
+       EM(netfs_rreq_trace_put_no_submit,      "PUT NO-SUBM")  \
        EM(netfs_rreq_trace_put_return,         "PUT RETURN ")  \
        EM(netfs_rreq_trace_put_subreq,         "PUT SUBREQ ")  \
        EM(netfs_rreq_trace_put_work,           "PUT WORK   ")  \
index f1c8c278310fd51384d0e5dcfd309c4aff9541ff..1c5271ed0cc08330c208f493e7e57cad901afcf7 100644 (file)
@@ -2678,6 +2678,7 @@ int kiocb_write_and_wait(struct kiocb *iocb, size_t count)
 
        return filemap_write_and_wait_range(mapping, pos, end);
 }
+EXPORT_SYMBOL_GPL(kiocb_write_and_wait);
 
 int kiocb_invalidate_pages(struct kiocb *iocb, size_t count)
 {