RDMA/hns: Support flexible umem page size
author Chengchang Tang <tangchengchang@huawei.com>
Sat, 13 Jan 2024 08:59:33 +0000 (16:59 +0800)
committer Leon Romanovsky <leon@kernel.org>
Thu, 25 Jan 2024 09:54:38 +0000 (11:54 +0200)
In the current implementation, a fixed page size is used to
configure the umem PBL, which is not flexible enough and is
not conducive to HW performance. Find the best page size
supported by both the HW and the umem layout to get better
performance.

Signed-off-by: Chengchang Tang <tangchengchang@huawei.com>
Signed-off-by: Junxian Huang <huangjunxian6@hisilicon.com>
Link: https://lore.kernel.org/r/20240113085935.2838701-5-huangjunxian6@hisilicon.com
Signed-off-by: Leon Romanovsky <leon@kernel.org>
drivers/infiniband/hw/hns/hns_roce_device.h
drivers/infiniband/hw/hns/hns_roce_mr.c

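To put the diff in context: the idea is to let the RDMA core pick the
largest DMA block size that both the device capabilities and the umem
layout allow, instead of always programming fixed-size pages into the
PBL. A minimal sketch of that idea follows; the helper name
pick_pbl_page_shift is hypothetical, while ib_umem_find_best_pgsz(),
ib_umem_num_dma_blocks() and order_base_2() are existing kernel APIs
that the patch below relies on.

#include <rdma/ib_umem.h>
#include <linux/log2.h>

/* Hypothetical helper: choose the largest supported page size that
 * matches the umem's physical layout and the MR's iova, and report
 * how many PBL entries are needed at that size.
 */
static int pick_pbl_page_shift(struct ib_umem *umem,
			       unsigned long pgsz_bitmap, u64 iova,
			       unsigned int *page_shift, size_t *entries)
{
	unsigned long page_sz;

	page_sz = ib_umem_find_best_pgsz(umem, pgsz_bitmap, iova);
	if (!page_sz)		/* no supported page size fits this umem */
		return -EINVAL;

	*page_shift = order_base_2(page_sz);
	*entries = ib_umem_num_dma_blocks(umem, page_sz);
	return 0;
}

With a 2M best page size, for example, a 1G MR needs 512 PBL entries
rather than the 262144 required with fixed 4K pages, which can in turn
reduce the PBL hop number.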
index dd652dc090b02f30ba4fbca872ffbf83abcac368..1a8516019516a6700016f5a2c35bd582d47b9cc4 100644 (file)
@@ -179,6 +179,7 @@ enum {
 
 #define HNS_ROCE_CMD_SUCCESS                   1
 
+#define HNS_ROCE_MAX_HOP_NUM                   3
 /* The minimum page size is 4K for hardware */
 #define HNS_HW_PAGE_SHIFT                      12
 #define HNS_HW_PAGE_SIZE                       (1 << HNS_HW_PAGE_SHIFT)
@@ -269,6 +270,11 @@ struct hns_roce_hem_list {
        dma_addr_t root_ba; /* pointer to the root ba table */
 };
 
+enum mtr_type {
+       MTR_DEFAULT = 0,
+       MTR_PBL,
+};
+
 struct hns_roce_buf_attr {
        struct {
                size_t  size;  /* region size */
@@ -277,7 +283,10 @@ struct hns_roce_buf_attr {
        unsigned int region_count; /* valid region count */
        unsigned int page_shift;  /* buffer page shift */
        unsigned int user_access; /* umem access flag */
+       u64 iova;
+       enum mtr_type type;
        bool mtt_only; /* only alloc buffer-required MTT memory */
+       bool adaptive; /* adaptive for page_shift and hopnum */
 };
 
 struct hns_roce_hem_cfg {
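Before the hns_roce_mr.c changes, here is a condensed view of how a PBL
caller is expected to fill the new fields; it mirrors the alloc_mr_pbl()
hunk below rather than adding new driver logic.

	struct hns_roce_buf_attr buf_attr = {};

	buf_attr.iova = mr->iova;	/* lets the MTR code align the best page size */
	buf_attr.type = MTR_PBL;	/* only PBL MTRs take the adaptive path */
	buf_attr.adaptive = !is_fast;	/* fast MRs keep a fixed page size and hopnum */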
index 74ea9d8482b971216f89814499a903999f204653..d00b4aa7214b5dcccceebcb35a53fed98f7535a8 100644 (file)
@@ -32,6 +32,7 @@
  */
 
 #include <linux/vmalloc.h>
+#include <linux/count_zeros.h>
 #include <rdma/ib_umem.h>
 #include <linux/math.h>
 #include "hns_roce_device.h"
@@ -103,6 +104,10 @@ static int alloc_mr_pbl(struct hns_roce_dev *hr_dev, struct hns_roce_mr *mr,
        buf_attr.user_access = mr->access;
        /* fast MR's buffer is alloced before mapping, not at creation */
        buf_attr.mtt_only = is_fast;
+       buf_attr.iova = mr->iova;
+       /* pagesize and hopnum are fixed for fast MR */
+       buf_attr.adaptive = !is_fast;
+       buf_attr.type = MTR_PBL;
 
        err = hns_roce_mtr_create(hr_dev, &mr->pbl_mtr, &buf_attr,
                                  hr_dev->caps.pbl_ba_pg_sz + PAGE_SHIFT,
@@ -721,6 +726,16 @@ static int cal_mtr_pg_cnt(struct hns_roce_mtr *mtr)
        return page_cnt;
 }
 
+static bool need_split_huge_page(struct hns_roce_mtr *mtr)
+{
+       /* When HEM buffer uses 0-level addressing, the page size is
+        * equal to the whole buffer size. If the current MTR has multiple
+        * regions, we split the buffer into small pages (4K, required by hns
+        * ROCEE). These pages will be used in multiple regions.
+        */
+       return mtr->hem_cfg.is_direct && mtr->hem_cfg.region_count > 1;
+}
+
 static int mtr_map_bufs(struct hns_roce_dev *hr_dev, struct hns_roce_mtr *mtr)
 {
        struct ib_device *ibdev = &hr_dev->ib_dev;
@@ -730,14 +745,8 @@ static int mtr_map_bufs(struct hns_roce_dev *hr_dev, struct hns_roce_mtr *mtr)
        int npage;
        int ret;
 
-       /* When HEM buffer uses 0-level addressing, the page size is
-        * equal to the whole buffer size, and we split the buffer into
-        * small pages which is used to check whether the adjacent
-        * units are in the continuous space and its size is fixed to
-        * 4K based on hns ROCEE's requirement.
-        */
-       page_shift = mtr->hem_cfg.is_direct ? HNS_HW_PAGE_SHIFT :
-                                             mtr->hem_cfg.buf_pg_shift;
+       page_shift = need_split_huge_page(mtr) ? HNS_HW_PAGE_SHIFT :
+                                                mtr->hem_cfg.buf_pg_shift;
        /* alloc a tmp array to store buffer's dma address */
        pages = kvcalloc(page_count, sizeof(dma_addr_t), GFP_KERNEL);
        if (!pages)
@@ -757,7 +766,7 @@ static int mtr_map_bufs(struct hns_roce_dev *hr_dev, struct hns_roce_mtr *mtr)
                goto err_alloc_list;
        }
 
-       if (mtr->hem_cfg.is_direct && npage > 1) {
+       if (need_split_huge_page(mtr) && npage > 1) {
                ret = mtr_check_direct_pages(pages, npage, page_shift);
                if (ret) {
                        ibdev_err(ibdev, "failed to check %s page: %d / %d.\n",
@@ -915,56 +924,89 @@ int hns_roce_mtr_find(struct hns_roce_dev *hr_dev, struct hns_roce_mtr *mtr,
        return ret;
 }
 
-static int mtr_init_buf_cfg(struct hns_roce_dev *hr_dev,
-                           struct hns_roce_buf_attr *attr,
-                           struct hns_roce_hem_cfg *cfg, u64 unalinged_size)
+static int get_best_page_shift(struct hns_roce_dev *hr_dev,
+                              struct hns_roce_mtr *mtr,
+                              struct hns_roce_buf_attr *buf_attr)
+{
+       unsigned int page_sz;
+
+       if (!buf_attr->adaptive || buf_attr->type != MTR_PBL || !mtr->umem)
+               return 0;
+
+       page_sz = ib_umem_find_best_pgsz(mtr->umem,
+                                        hr_dev->caps.page_size_cap,
+                                        buf_attr->iova);
+       if (!page_sz)
+               return -EINVAL;
+
+       buf_attr->page_shift = order_base_2(page_sz);
+       return 0;
+}
+
+static bool is_buf_attr_valid(struct hns_roce_dev *hr_dev,
+                             struct hns_roce_buf_attr *attr)
 {
        struct ib_device *ibdev = &hr_dev->ib_dev;
+
+       if (attr->region_count > ARRAY_SIZE(attr->region) ||
+           attr->region_count < 1 || attr->page_shift < HNS_HW_PAGE_SHIFT) {
+               ibdev_err(ibdev,
+                         "invalid buf attr, region count %d, page shift %u.\n",
+                         attr->region_count, attr->page_shift);
+               return false;
+       }
+
+       return true;
+}
+
+static int mtr_init_buf_cfg(struct hns_roce_dev *hr_dev,
+                           struct hns_roce_mtr *mtr,
+                           struct hns_roce_buf_attr *attr)
+{
+       struct hns_roce_hem_cfg *cfg = &mtr->hem_cfg;
        struct hns_roce_buf_region *r;
-       u64 first_region_padding;
-       int page_cnt, region_cnt;
        size_t buf_pg_sz;
        size_t buf_size;
+       int page_cnt, i;
+       u64 pgoff = 0;
+
+       if (!is_buf_attr_valid(hr_dev, attr))
+               return -EINVAL;
 
        /* If mtt is disabled, all pages must be within a continuous range */
        cfg->is_direct = !mtr_has_mtt(attr);
+       cfg->region_count = attr->region_count;
        buf_size = mtr_bufs_size(attr);
-       if (cfg->is_direct) {
+       if (need_split_huge_page(mtr)) {
                buf_pg_sz = HNS_HW_PAGE_SIZE;
                cfg->buf_pg_count = 1;
                /* The ROCEE requires the page size to be 4K * 2 ^ N. */
                cfg->buf_pg_shift = HNS_HW_PAGE_SHIFT +
                        order_base_2(DIV_ROUND_UP(buf_size, HNS_HW_PAGE_SIZE));
-               first_region_padding = 0;
        } else {
-               cfg->buf_pg_count = DIV_ROUND_UP(buf_size + unalinged_size,
-                                                1 << attr->page_shift);
+               buf_pg_sz = 1 << attr->page_shift;
+               cfg->buf_pg_count = mtr->umem ?
+                       ib_umem_num_dma_blocks(mtr->umem, buf_pg_sz) :
+                       DIV_ROUND_UP(buf_size, buf_pg_sz);
                cfg->buf_pg_shift = attr->page_shift;
-               buf_pg_sz = 1 << cfg->buf_pg_shift;
-               first_region_padding = unalinged_size;
+               pgoff = mtr->umem ? mtr->umem->address & ~PAGE_MASK : 0;
        }
 
        /* Convert buffer size to page index and page count for each region and
         * the buffer's offset needs to be appended to the first region.
         */
-       for (page_cnt = 0, region_cnt = 0; region_cnt < attr->region_count &&
-            region_cnt < ARRAY_SIZE(cfg->region); region_cnt++) {
-               r = &cfg->region[region_cnt];
+       for (page_cnt = 0, i = 0; i < attr->region_count; i++) {
+               r = &cfg->region[i];
                r->offset = page_cnt;
-               buf_size = hr_hw_page_align(attr->region[region_cnt].size +
-                                           first_region_padding);
-               r->count = DIV_ROUND_UP(buf_size, buf_pg_sz);
-               first_region_padding = 0;
-               page_cnt += r->count;
-               r->hopnum = to_hr_hem_hopnum(attr->region[region_cnt].hopnum,
-                                            r->count);
-       }
+               buf_size = hr_hw_page_align(attr->region[i].size + pgoff);
+               if (attr->type == MTR_PBL && mtr->umem)
+                       r->count = ib_umem_num_dma_blocks(mtr->umem, buf_pg_sz);
+               else
+                       r->count = DIV_ROUND_UP(buf_size, buf_pg_sz);
 
-       cfg->region_count = region_cnt;
-       if (cfg->region_count < 1 || cfg->buf_pg_shift < HNS_HW_PAGE_SHIFT) {
-               ibdev_err(ibdev, "failed to init mtr cfg, count %d shift %u.\n",
-                         cfg->region_count, cfg->buf_pg_shift);
-               return -EINVAL;
+               pgoff = 0;
+               page_cnt += r->count;
+               r->hopnum = to_hr_hem_hopnum(attr->region[i].hopnum, r->count);
        }
 
        return 0;
@@ -1054,7 +1096,6 @@ int hns_roce_mtr_create(struct hns_roce_dev *hr_dev, struct hns_roce_mtr *mtr,
                        unsigned int ba_page_shift, struct ib_udata *udata,
                        unsigned long user_addr)
 {
-       u64 pgoff = udata ? user_addr & ~PAGE_MASK : 0;
        struct ib_device *ibdev = &hr_dev->ib_dev;
        int ret;
 
@@ -1071,9 +1112,13 @@ int hns_roce_mtr_create(struct hns_roce_dev *hr_dev, struct hns_roce_mtr *mtr,
                                  "failed to alloc mtr bufs, ret = %d.\n", ret);
                        return ret;
                }
+
+               ret = get_best_page_shift(hr_dev, mtr, buf_attr);
+               if (ret)
+                       goto err_init_buf;
        }
 
-       ret = mtr_init_buf_cfg(hr_dev, buf_attr, &mtr->hem_cfg, pgoff);
+       ret = mtr_init_buf_cfg(hr_dev, mtr, buf_attr);
        if (ret)
                goto err_init_buf;
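
Taken together, the hns_roce_mtr_create() path for a user buffer now
runs roughly as outlined below; this is a simplified sketch based only
on the hunks above, with error unwinding and the non-umem cases omitted.

	/* 1. Pin/allocate the buffer as before (reported as "failed to
	 *    alloc mtr bufs" on error).
	 * 2. For adaptive PBL MTRs backed by a umem, raise
	 *    buf_attr->page_shift to the best supported page size.
	 */
	ret = get_best_page_shift(hr_dev, mtr, buf_attr);
	if (ret)
		goto err_init_buf;

	/* 3. Translate buf_attr into mtr->hem_cfg: buf_pg_shift,
	 *    buf_pg_count and per-region PBL entry counts, now computed
	 *    with ib_umem_num_dma_blocks() for umem-backed PBLs.
	 */
	ret = mtr_init_buf_cfg(hr_dev, mtr, buf_attr);
	if (ret)
		goto err_init_buf;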