xprtrdma: Provide a buffer to pad Write chunks of unaligned length
author     Chuck Lever <chuck.lever@oracle.com>
           Tue, 5 Oct 2021 14:17:59 +0000 (10:17 -0400)
committer  Trond Myklebust <trond.myklebust@hammerspace.com>
           Wed, 20 Oct 2021 22:09:54 +0000 (18:09 -0400)
This is a buffer to be left persistently registered while a
connection is up. Connection tear-down will automatically DMA-unmap,
invalidate, and deregister the MR. A persistently registered buffer
is cheaper to provide, and it can never be coalesced into the RDMA
segment that carries the data payload.

An RPC that provisions a Write chunk with an unaligned length now
uses this MR rather than the tail buffer of the RPC's rq_rcv_buf.
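
For context, XDR encodes data in 4-byte (XDR_UNIT) units, so a Write
chunk whose payload length is not a multiple of four needs up to three
bytes of zero padding. The stand-alone sketch below is illustrative
only (pad_size() is a hypothetical helper, not the kernel's
xdr_pad_size()); it shows the arithmetic behind the
xdr_pad_size(rqst->rq_rcv_buf.page_len) test added in rpc_rdma.c below,
where a non-zero result makes the encoder append one extra
XDR_UNIT-sized segment backed by the persistently registered
re_write_pad buffer instead of the tail iovec.

/* Illustrative sketch, not kernel code: bytes of zero padding needed
 * to round a Write chunk payload up to the next XDR_UNIT boundary.
 */
#include <stdio.h>
#include <stddef.h>

#define XDR_UNIT 4	/* XDR encodes everything in 4-byte units */

static size_t pad_size(size_t len)
{
	/* non-zero iff len is not already a multiple of XDR_UNIT */
	return (XDR_UNIT - (len & (XDR_UNIT - 1))) & (XDR_UNIT - 1);
}

int main(void)
{
	printf("pad for 1024 bytes: %zu\n", pad_size(1024));	/* 0 */
	printf("pad for 1023 bytes: %zu\n", pad_size(1023));	/* 1 */
	printf("pad for 1021 bytes: %zu\n", pad_size(1021));	/* 3 */
	return 0;
}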

Reviewed-by: Tom Talpey <tom@talpey.com>
Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: Trond Myklebust <trond.myklebust@hammerspace.com>
include/trace/events/rpcrdma.h
net/sunrpc/xprtrdma/frwr_ops.c
net/sunrpc/xprtrdma/rpc_rdma.c
net/sunrpc/xprtrdma/verbs.c
net/sunrpc/xprtrdma/xprt_rdma.h

diff --git a/include/trace/events/rpcrdma.h b/include/trace/events/rpcrdma.h
index de41954995926de1ff021f7d16ffecd174779ddf..afb2e394797c017461de7700d9d6d4a8340753a6 100644
--- a/include/trace/events/rpcrdma.h
+++ b/include/trace/events/rpcrdma.h
@@ -375,10 +375,16 @@ DECLARE_EVENT_CLASS(xprtrdma_mr_class,
 
        TP_fast_assign(
                const struct rpcrdma_req *req = mr->mr_req;
-               const struct rpc_task *task = req->rl_slot.rq_task;
 
-               __entry->task_id = task->tk_pid;
-               __entry->client_id = task->tk_client->cl_clid;
+               if (req) {
+                       const struct rpc_task *task = req->rl_slot.rq_task;
+
+                       __entry->task_id = task->tk_pid;
+                       __entry->client_id = task->tk_client->cl_clid;
+               } else {
+                       __entry->task_id = 0;
+                       __entry->client_id = -1;
+               }
                __entry->mr_id  = mr->mr_ibmr->res.id;
                __entry->nents  = mr->mr_nents;
                __entry->handle = mr->mr_handle;
@@ -639,6 +645,7 @@ TRACE_EVENT(xprtrdma_nomrs_err,
 DEFINE_RDCH_EVENT(read);
 DEFINE_WRCH_EVENT(write);
 DEFINE_WRCH_EVENT(reply);
+DEFINE_WRCH_EVENT(wp);
 
 TRACE_DEFINE_ENUM(rpcrdma_noch);
 TRACE_DEFINE_ENUM(rpcrdma_noch_pullup);
diff --git a/net/sunrpc/xprtrdma/frwr_ops.c b/net/sunrpc/xprtrdma/frwr_ops.c
index f700b34a5bfd24d8260b0b3641a29195368ef48c..3eccf365fcb8c855c0ae4d142185f7ec63bcb854 100644
--- a/net/sunrpc/xprtrdma/frwr_ops.c
+++ b/net/sunrpc/xprtrdma/frwr_ops.c
@@ -666,3 +666,38 @@ void frwr_unmap_async(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req)
         */
        rpcrdma_force_disconnect(ep);
 }
+
+/**
+ * frwr_wp_create - Create an MR for padding Write chunks
+ * @r_xprt: transport resources to use
+ *
+ * Return 0 on success, negative errno on failure.
+ */
+int frwr_wp_create(struct rpcrdma_xprt *r_xprt)
+{
+       struct rpcrdma_ep *ep = r_xprt->rx_ep;
+       struct rpcrdma_mr_seg seg;
+       struct rpcrdma_mr *mr;
+
+       mr = rpcrdma_mr_get(r_xprt);
+       if (!mr)
+               return -EAGAIN;
+       mr->mr_req = NULL;
+       ep->re_write_pad_mr = mr;
+
+       seg.mr_len = XDR_UNIT;
+       seg.mr_page = virt_to_page(ep->re_write_pad);
+       seg.mr_offset = offset_in_page(ep->re_write_pad);
+       if (IS_ERR(frwr_map(r_xprt, &seg, 1, true, xdr_zero, mr)))
+               return -EIO;
+       trace_xprtrdma_mr_fastreg(mr);
+
+       mr->mr_cqe.done = frwr_wc_fastreg;
+       mr->mr_regwr.wr.next = NULL;
+       mr->mr_regwr.wr.wr_cqe = &mr->mr_cqe;
+       mr->mr_regwr.wr.num_sge = 0;
+       mr->mr_regwr.wr.opcode = IB_WR_REG_MR;
+       mr->mr_regwr.wr.send_flags = 0;
+
+       return ib_post_send(ep->re_id->qp, &mr->mr_regwr.wr, NULL);
+}
diff --git a/net/sunrpc/xprtrdma/rpc_rdma.c b/net/sunrpc/xprtrdma/rpc_rdma.c
index c335c13615645cc73478f84b3ac72ec5d3b19893..8035a983c8cec0306b0c0a7c23bcf7f85af7c621 100644
--- a/net/sunrpc/xprtrdma/rpc_rdma.c
+++ b/net/sunrpc/xprtrdma/rpc_rdma.c
@@ -255,15 +255,7 @@ rpcrdma_convert_iovs(struct rpcrdma_xprt *r_xprt, struct xdr_buf *xdrbuf,
                page_base = 0;
        }
 
-       if (type == rpcrdma_readch)
-               goto out;
-
-       /* When encoding a Write chunk, some servers need to see an
-        * extra segment for non-XDR-aligned Write chunks. The upper
-        * layer provides space in the tail iovec that may be used
-        * for this purpose.
-        */
-       if (type == rpcrdma_writech && r_xprt->rx_ep->re_implicit_roundup)
+       if (type == rpcrdma_readch || type == rpcrdma_writech)
                goto out;
 
        if (xdrbuf->tail[0].iov_len)
@@ -405,6 +397,7 @@ static int rpcrdma_encode_write_list(struct rpcrdma_xprt *r_xprt,
                                     enum rpcrdma_chunktype wtype)
 {
        struct xdr_stream *xdr = &req->rl_stream;
+       struct rpcrdma_ep *ep = r_xprt->rx_ep;
        struct rpcrdma_mr_seg *seg;
        struct rpcrdma_mr *mr;
        int nsegs, nchunks;
@@ -443,6 +436,18 @@ static int rpcrdma_encode_write_list(struct rpcrdma_xprt *r_xprt,
                nsegs -= mr->mr_nents;
        } while (nsegs);
 
+       if (xdr_pad_size(rqst->rq_rcv_buf.page_len)) {
+               if (encode_rdma_segment(xdr, ep->re_write_pad_mr) < 0)
+                       return -EMSGSIZE;
+
+               trace_xprtrdma_chunk_wp(rqst->rq_task, ep->re_write_pad_mr,
+                                       nsegs);
+               r_xprt->rx_stats.write_chunk_count++;
+               r_xprt->rx_stats.total_rdma_request += mr->mr_length;
+               nchunks++;
+               nsegs -= mr->mr_nents;
+       }
+
        /* Update count of segments in this Write chunk */
        *segcount = cpu_to_be32(nchunks);
 
diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c
index aaec3c9be8db62d28a7d76bdc408d1e4678a9695..c3784b7b685540ac2e345eb880723beac093db90 100644
--- a/net/sunrpc/xprtrdma/verbs.c
+++ b/net/sunrpc/xprtrdma/verbs.c
@@ -551,6 +551,7 @@ int rpcrdma_xprt_connect(struct rpcrdma_xprt *r_xprt)
                goto out;
        }
        rpcrdma_mrs_create(r_xprt);
+       frwr_wp_create(r_xprt);
 
 out:
        trace_xprtrdma_connect(r_xprt, rc);
diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h
index d91f54eae00b37c1312746958cf59009f0b719d8..b6d8b3e6356cb3fcbf2dd8bcd9dc3d523a56f705 100644
--- a/net/sunrpc/xprtrdma/xprt_rdma.h
+++ b/net/sunrpc/xprtrdma/xprt_rdma.h
 /*
  * RDMA Endpoint -- connection endpoint details
  */
+struct rpcrdma_mr;
 struct rpcrdma_ep {
        struct kref             re_kref;
        struct rdma_cm_id       *re_id;
        struct ib_pd            *re_pd;
        unsigned int            re_max_rdma_segs;
        unsigned int            re_max_fr_depth;
+       struct rpcrdma_mr       *re_write_pad_mr;
        bool                    re_implicit_roundup;
        enum ib_mr_type         re_mrtype;
        struct completion       re_done;
@@ -97,6 +99,8 @@ struct rpcrdma_ep {
        unsigned int            re_inline_recv; /* negotiated */
 
        atomic_t                re_completion_ids;
+
+       char                    re_write_pad[XDR_UNIT];
 };
 
 /* Pre-allocate extra Work Requests for handling reverse-direction
@@ -535,6 +539,7 @@ int frwr_send(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req);
 void frwr_reminv(struct rpcrdma_rep *rep, struct list_head *mrs);
 void frwr_unmap_sync(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req);
 void frwr_unmap_async(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req);
+int frwr_wp_create(struct rpcrdma_xprt *r_xprt);
 
 /*
  * RPC/RDMA protocol calls - xprtrdma/rpc_rdma.c