drm/amdkfd: Update SDMA queue management for GFX9.4.3
authorMukul Joshi <mukul.joshi@amd.com>
Tue, 10 May 2022 02:52:39 +0000 (22:52 -0400)
committerAlex Deucher <alexander.deucher@amd.com>
Fri, 9 Jun 2023 13:43:05 +0000 (09:43 -0400)
This patch updates SDMA queue management for multi-XCC GFX9.4.3:
- Allocate/deallocate SDMA queues from the correct SDMA engines
  based on the partition mode (the engine-id math is sketched after
  the v2 note below).
- Update the kgd2kfd interface to fetch the correct SDMA register
  addresses.
- Fix debugfs to dump the correct SDMA queue info.

v2: squash in fix "drm/amdkfd: Fix XGMI SDMA user-mode queue allocation"
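For a concrete sense of the engine-id math, here is a minimal sketch
under assumed numbers (not the driver code; the helper name and the
instance/node counts are hypothetical):

	/* Mirrors the PCIe-queue allocation math in this patch; the XGMI
	 * case adds a further offset of kfd_get_num_sdma_engines().
	 * Example: 8 SDMA instances split across 4 KFD nodes gives
	 * 8/4 = 2 engines per node; node 1, sdma_id 3 -> engine 3.
	 */
	static unsigned int global_sdma_engine_id(unsigned int node_id,
						  unsigned int sdma_id,
						  unsigned int num_instances,
						  unsigned int num_nodes)
	{
		unsigned int engines_per_node = num_instances / num_nodes;

		return node_id * engines_per_node +
		       sdma_id % engines_per_node;
	}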

Signed-off-by: Mukul Joshi <mukul.joshi@amd.com>
Reviewed-by: Felix Kuehling <Felix.Kuehling@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gc_9_4_3.c
drivers/gpu/drm/amd/amdkfd/kfd_device.c
drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h
drivers/gpu/drm/amd/amdkfd/kfd_priv.h

index 49d8087e469ec6f995fe9b92820469e2e8d8cb4f..e81bdca53f42fe47ba2437edca6a157979a84025 100644 (file)
 #include "oss/osssys_4_0_sh_mask.h"
 #include "v9_structs.h"
 #include "soc15.h"
+#include "sdma/sdma_4_4_2_offset.h"
+#include "sdma/sdma_4_4_2_sh_mask.h"
+
+static inline struct v9_sdma_mqd *get_sdma_mqd(void *mqd)
+{
+       return (struct v9_sdma_mqd *)mqd;
+}
+
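+/* Each engine's RLC queue registers form a contiguous block: locate the
+ * block via the engine's RLC0 base, then step by the per-queue register
+ * stride (RLC1 base minus RLC0 base).
+ */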
+static uint32_t get_sdma_rlc_reg_offset(struct amdgpu_device *adev,
+                                       unsigned int engine_id,
+                                       unsigned int queue_id)
+{
+       uint32_t sdma_engine_reg_base = SOC15_REG_OFFSET(SDMA0, engine_id,
+                                       regSDMA_RLC0_RB_CNTL) -
+                                       regSDMA_RLC0_RB_CNTL;
+       uint32_t retval = sdma_engine_reg_base +
+                 queue_id * (regSDMA_RLC1_RB_CNTL - regSDMA_RLC0_RB_CNTL);
+
+       pr_debug("RLC register offset for SDMA%d RLC%d: 0x%x\n", engine_id,
+                                                       queue_id, retval);
+       return retval;
+}
+
+int kgd_gfx_v9_4_3_hqd_sdma_load(struct amdgpu_device *adev, void *mqd,
+                                uint32_t __user *wptr, struct mm_struct *mm)
+{
+       struct v9_sdma_mqd *m;
+       uint32_t sdma_rlc_reg_offset;
+       unsigned long end_jiffies;
+       uint32_t data;
+       uint64_t data64;
+       uint64_t __user *wptr64 = (uint64_t __user *)wptr;
+
+       m = get_sdma_mqd(mqd);
+       sdma_rlc_reg_offset = get_sdma_rlc_reg_offset(adev, m->sdma_engine_id,
+                                                       m->sdma_queue_id);
+
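+       /* Disable the ring buffer and wait for the RLC queue to report
+        * idle before reprogramming it.
+        */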
+       WREG32(sdma_rlc_reg_offset + regSDMA_RLC0_RB_CNTL,
+               m->sdmax_rlcx_rb_cntl & (~SDMA_RLC0_RB_CNTL__RB_ENABLE_MASK));
+
+       end_jiffies = msecs_to_jiffies(2000) + jiffies;
+       while (true) {
+               data = RREG32(sdma_rlc_reg_offset + regSDMA_RLC0_CONTEXT_STATUS);
+               if (data & SDMA_RLC0_CONTEXT_STATUS__IDLE_MASK)
+                       break;
+               if (time_after(jiffies, end_jiffies)) {
+                       pr_err("SDMA RLC not idle in %s\n", __func__);
+                       return -ETIME;
+               }
+               usleep_range(500, 1000);
+       }
+
+       WREG32(sdma_rlc_reg_offset + regSDMA_RLC0_DOORBELL_OFFSET,
+               m->sdmax_rlcx_doorbell_offset);
+
+       data = REG_SET_FIELD(m->sdmax_rlcx_doorbell, SDMA_RLC0_DOORBELL,
+                               ENABLE, 1);
+       WREG32(sdma_rlc_reg_offset + regSDMA_RLC0_DOORBELL, data);
+       WREG32(sdma_rlc_reg_offset + regSDMA_RLC0_RB_RPTR,
+                                       m->sdmax_rlcx_rb_rptr);
+       WREG32(sdma_rlc_reg_offset + regSDMA_RLC0_RB_RPTR_HI,
+                                       m->sdmax_rlcx_rb_rptr_hi);
+
+       WREG32(sdma_rlc_reg_offset + regSDMA_RLC0_MINOR_PTR_UPDATE, 1);
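+       /* Take the write pointer from user mode if it is safely readable;
+        * otherwise fall back to the saved read pointer so the engine does
+        * not run ahead of valid commands.
+        */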
+       if (read_user_wptr(mm, wptr64, data64)) {
+               WREG32(sdma_rlc_reg_offset + regSDMA_RLC0_RB_WPTR,
+                       lower_32_bits(data64));
+               WREG32(sdma_rlc_reg_offset + regSDMA_RLC0_RB_WPTR_HI,
+                       upper_32_bits(data64));
+       } else {
+               WREG32(sdma_rlc_reg_offset + regSDMA_RLC0_RB_WPTR,
+                       m->sdmax_rlcx_rb_rptr);
+               WREG32(sdma_rlc_reg_offset + regSDMA_RLC0_RB_WPTR_HI,
+                       m->sdmax_rlcx_rb_rptr_hi);
+       }
+       WREG32(sdma_rlc_reg_offset + regSDMA_RLC0_MINOR_PTR_UPDATE, 0);
+
+       WREG32(sdma_rlc_reg_offset + regSDMA_RLC0_RB_BASE, m->sdmax_rlcx_rb_base);
+       WREG32(sdma_rlc_reg_offset + regSDMA_RLC0_RB_BASE_HI,
+                       m->sdmax_rlcx_rb_base_hi);
+       WREG32(sdma_rlc_reg_offset + regSDMA_RLC0_RB_RPTR_ADDR_LO,
+                       m->sdmax_rlcx_rb_rptr_addr_lo);
+       WREG32(sdma_rlc_reg_offset + regSDMA_RLC0_RB_RPTR_ADDR_HI,
+                       m->sdmax_rlcx_rb_rptr_addr_hi);
+
+       data = REG_SET_FIELD(m->sdmax_rlcx_rb_cntl, SDMA_RLC0_RB_CNTL,
+                               RB_ENABLE, 1);
+       WREG32(sdma_rlc_reg_offset + regSDMA_RLC0_RB_CNTL, data);
+
+       return 0;
+}
+
+int kgd_gfx_v9_4_3_hqd_sdma_dump(struct amdgpu_device *adev,
+                                uint32_t engine_id, uint32_t queue_id,
+                                uint32_t (**dump)[2], uint32_t *n_regs)
+{
+       uint32_t sdma_rlc_reg_offset = get_sdma_rlc_reg_offset(adev,
+                                                       engine_id, queue_id);
+       uint32_t i = 0, reg;
+#undef HQD_N_REGS
+#define HQD_N_REGS (19+6+7+12)
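+/* 19 + 6 + 7 + 12 matches the four register ranges dumped below. */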
+#define DUMP_REG(addr) do {                            \
+               if (WARN_ON_ONCE(i >= HQD_N_REGS))      \
+                       break;                          \
+               (*dump)[i][0] = (addr) << 2;            \
+               (*dump)[i++][1] = RREG32(addr);         \
+       } while (0)
+
+       *dump = kmalloc_array(HQD_N_REGS * 2, sizeof(uint32_t), GFP_KERNEL);
+       if (*dump == NULL)
+               return -ENOMEM;
+
+       for (reg = regSDMA_RLC0_RB_CNTL; reg <= regSDMA_RLC0_DOORBELL; reg++)
+               DUMP_REG(sdma_rlc_reg_offset + reg);
+       for (reg = regSDMA_RLC0_STATUS; reg <= regSDMA_RLC0_CSA_ADDR_HI; reg++)
+               DUMP_REG(sdma_rlc_reg_offset + reg);
+       for (reg = regSDMA_RLC0_IB_SUB_REMAIN;
+            reg <= regSDMA_RLC0_MINOR_PTR_UPDATE; reg++)
+               DUMP_REG(sdma_rlc_reg_offset + reg);
+       for (reg = regSDMA_RLC0_MIDCMD_DATA0;
+            reg <= regSDMA_RLC0_MIDCMD_CNTL; reg++)
+               DUMP_REG(sdma_rlc_reg_offset + reg);
+
+       WARN_ON_ONCE(i != HQD_N_REGS);
+       *n_regs = i;
+
+       return 0;
+}
+
+bool kgd_gfx_v9_4_3_hqd_sdma_is_occupied(struct amdgpu_device *adev, void *mqd)
+{
+       struct v9_sdma_mqd *m;
+       uint32_t sdma_rlc_reg_offset;
+       uint32_t sdma_rlc_rb_cntl;
+
+       m = get_sdma_mqd(mqd);
+       sdma_rlc_reg_offset = get_sdma_rlc_reg_offset(adev, m->sdma_engine_id,
+                                                       m->sdma_queue_id);
+
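+       /* The queue counts as occupied while its ring buffer is enabled. */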
+       sdma_rlc_rb_cntl = RREG32(sdma_rlc_reg_offset + regSDMA_RLC0_RB_CNTL);
+
+       if (sdma_rlc_rb_cntl & SDMA_RLC0_RB_CNTL__RB_ENABLE_MASK)
+               return true;
+
+       return false;
+}
+
+int kgd_gfx_v9_4_3_hqd_sdma_destroy(struct amdgpu_device *adev, void *mqd,
+                                   unsigned int utimeout)
+{
+       struct v9_sdma_mqd *m;
+       uint32_t sdma_rlc_reg_offset;
+       uint32_t temp;
+       unsigned long end_jiffies = (utimeout * HZ / 1000) + jiffies;
+
+       m = get_sdma_mqd(mqd);
+       sdma_rlc_reg_offset = get_sdma_rlc_reg_offset(adev, m->sdma_engine_id,
+                                                       m->sdma_queue_id);
+
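+       /* Stop the ring buffer, then poll for idle within the caller's
+        * timeout before tearing the queue down.
+        */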
+       temp = RREG32(sdma_rlc_reg_offset + regSDMA_RLC0_RB_CNTL);
+       temp = temp & ~SDMA_RLC0_RB_CNTL__RB_ENABLE_MASK;
+       WREG32(sdma_rlc_reg_offset + regSDMA_RLC0_RB_CNTL, temp);
+
+       while (true) {
+               temp = RREG32(sdma_rlc_reg_offset + regSDMA_RLC0_CONTEXT_STATUS);
+               if (temp & SDMA_RLC0_CONTEXT_STATUS__IDLE_MASK)
+                       break;
+               if (time_after(jiffies, end_jiffies)) {
+                       pr_err("SDMA RLC not idle in %s\n", __func__);
+                       return -ETIME;
+               }
+               usleep_range(500, 1000);
+       }
+
+       WREG32(sdma_rlc_reg_offset + regSDMA_RLC0_DOORBELL, 0);
+       WREG32(sdma_rlc_reg_offset + regSDMA_RLC0_RB_CNTL,
+               RREG32(sdma_rlc_reg_offset + regSDMA_RLC0_RB_CNTL) |
+               SDMA_RLC0_RB_CNTL__RB_ENABLE_MASK);
+
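+       /* Save the hardware read pointer back into the MQD so the queue
+        * can later be restored where it left off.
+        */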
+       m->sdmax_rlcx_rb_rptr =
+                       RREG32(sdma_rlc_reg_offset + regSDMA_RLC0_RB_RPTR);
+       m->sdmax_rlcx_rb_rptr_hi =
+                       RREG32(sdma_rlc_reg_offset + regSDMA_RLC0_RB_RPTR_HI);
+
+       return 0;
+}
 
 static int kgd_gfx_v9_4_3_set_pasid_vmid_mapping(struct amdgpu_device *adev,
                                u32 pasid, unsigned int vmid, uint32_t inst)
@@ -166,13 +352,13 @@ const struct kfd2kgd_calls gc_9_4_3_kfd2kgd = {
        .init_interrupts = kgd_gfx_v9_init_interrupts,
        .hqd_load = kgd_gfx_v9_4_3_hqd_load,
        .hiq_mqd_load = kgd_gfx_v9_hiq_mqd_load,
-       .hqd_sdma_load = kgd_arcturus_hqd_sdma_load,
+       .hqd_sdma_load = kgd_gfx_v9_4_3_hqd_sdma_load,
        .hqd_dump = kgd_gfx_v9_hqd_dump,
-       .hqd_sdma_dump = kgd_arcturus_hqd_sdma_dump,
+       .hqd_sdma_dump = kgd_gfx_v9_4_3_hqd_sdma_dump,
        .hqd_is_occupied = kgd_gfx_v9_hqd_is_occupied,
-       .hqd_sdma_is_occupied = kgd_arcturus_hqd_sdma_is_occupied,
+       .hqd_sdma_is_occupied = kgd_gfx_v9_4_3_hqd_sdma_is_occupied,
        .hqd_destroy = kgd_gfx_v9_hqd_destroy,
-       .hqd_sdma_destroy = kgd_arcturus_hqd_sdma_destroy,
+       .hqd_sdma_destroy = kgd_gfx_v9_4_3_hqd_sdma_destroy,
        .wave_control_execute = kgd_gfx_v9_wave_control_execute,
        .get_atc_vmid_pasid_mapping_info =
                                kgd_gfx_v9_get_atc_vmid_pasid_mapping_info,
index 37c6dc5c37bfb5269c2ccfc799988dc0c2ca9f59..ec5f85ff34e5106469011aafd8ae3d952b6d6a30 100644 (file)
@@ -741,6 +741,7 @@ bool kgd2kfd_device_init(struct kfd_dev *kfd,
                if (!node)
                        goto node_alloc_error;
 
+               node->node_id = i;
                node->adev = kfd->adev;
                node->kfd = kfd;
                node->kfd2kgd = kfd->kfd2kgd;
@@ -1323,15 +1324,16 @@ unsigned int kfd_get_num_sdma_engines(struct kfd_node *node)
 {
        /* If XGMI is not supported, all SDMA engines are PCIe */
        if (!node->adev->gmc.xgmi.supported)
-               return node->adev->sdma.num_instances;
+               return node->adev->sdma.num_instances/(int)node->kfd->num_nodes;
 
-       return min(node->adev->sdma.num_instances, 2);
+       return min(node->adev->sdma.num_instances/(int)node->kfd->num_nodes, 2);
 }
 
 unsigned int kfd_get_num_xgmi_sdma_engines(struct kfd_node *node)
 {
        /* After reserved for PCIe, the rest of engines are XGMI */
-       return node->adev->sdma.num_instances - kfd_get_num_sdma_engines(node);
+       return node->adev->sdma.num_instances/(int)node->kfd->num_nodes -
+               kfd_get_num_sdma_engines(node);
 }
 
 #if defined(CONFIG_DEBUG_FS)
index f78c1e7aad57d44a74b7bab31394eb0a7f728ff6..69419a53a14e575ff5fc4526576bfef88ef739c5 100644 (file)
@@ -124,6 +124,15 @@ static inline uint64_t get_reserved_sdma_queues_bitmap(struct device_queue_manag
        return dqm->dev->kfd->device_info.reserved_sdma_queues_bitmap;
 }
 
+static void init_sdma_bitmaps(struct device_queue_manager *dqm)
+{
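+       /* All queues start out free: set one bit per available queue in
+        * each allocation bitmap.
+        */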
+       bitmap_zero(dqm->sdma_bitmap, KFD_MAX_SDMA_QUEUES);
+       bitmap_set(dqm->sdma_bitmap, 0, get_num_sdma_queues(dqm));
+
+       bitmap_zero(dqm->xgmi_sdma_bitmap, KFD_MAX_SDMA_QUEUES);
+       bitmap_set(dqm->xgmi_sdma_bitmap, 0, get_num_xgmi_sdma_queues(dqm));
+}
+
 void program_sh_mem_settings(struct device_queue_manager *dqm,
                                        struct qcm_process_device *qpd)
 {
@@ -1268,24 +1277,6 @@ static void init_interrupts(struct device_queue_manager *dqm)
        }
 }
 
-static void init_sdma_bitmaps(struct device_queue_manager *dqm)
-{
-       unsigned int num_sdma_queues =
-               min_t(unsigned int, sizeof(dqm->sdma_bitmap)*8,
-                     get_num_sdma_queues(dqm));
-       unsigned int num_xgmi_sdma_queues =
-               min_t(unsigned int, sizeof(dqm->xgmi_sdma_bitmap)*8,
-                     get_num_xgmi_sdma_queues(dqm));
-
-       if (num_sdma_queues)
-               dqm->sdma_bitmap = GENMASK_ULL(num_sdma_queues-1, 0);
-       if (num_xgmi_sdma_queues)
-               dqm->xgmi_sdma_bitmap = GENMASK_ULL(num_xgmi_sdma_queues-1, 0);
-
-       dqm->sdma_bitmap &= ~get_reserved_sdma_queues_bitmap(dqm);
-       pr_info("sdma_bitmap: %llx\n", dqm->sdma_bitmap);
-}
-
 static int initialize_nocpsch(struct device_queue_manager *dqm)
 {
        int pipe, queue;
@@ -1375,46 +1366,49 @@ static int allocate_sdma_queue(struct device_queue_manager *dqm,
        int bit;
 
        if (q->properties.type == KFD_QUEUE_TYPE_SDMA) {
-               if (dqm->sdma_bitmap == 0) {
+               if (bitmap_empty(dqm->sdma_bitmap, KFD_MAX_SDMA_QUEUES)) {
                        pr_err("No more SDMA queue to allocate\n");
                        return -ENOMEM;
                }
 
                if (restore_sdma_id) {
                        /* Re-use existing sdma_id */
-                       if (!(dqm->sdma_bitmap & (1ULL << *restore_sdma_id))) {
+                       if (!test_bit(*restore_sdma_id, dqm->sdma_bitmap)) {
                                pr_err("SDMA queue already in use\n");
                                return -EBUSY;
                        }
-                       dqm->sdma_bitmap &= ~(1ULL << *restore_sdma_id);
+                       clear_bit(*restore_sdma_id, dqm->sdma_bitmap);
                        q->sdma_id = *restore_sdma_id;
                } else {
                        /* Find first available sdma_id */
-                       bit = __ffs64(dqm->sdma_bitmap);
-                       dqm->sdma_bitmap &= ~(1ULL << bit);
+                       bit = find_first_bit(dqm->sdma_bitmap,
+                                            get_num_sdma_queues(dqm));
+                       clear_bit(bit, dqm->sdma_bitmap);
                        q->sdma_id = bit;
                }
 
-               q->properties.sdma_engine_id = q->sdma_id %
-                               kfd_get_num_sdma_engines(dqm->dev);
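+               /* Offset into this node's slice of the package-wide SDMA
+                * engines, then spread queues round-robin across the node's
+                * PCIe-optimized engines.
+                */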
+               q->properties.sdma_engine_id =
+                       dqm->dev->node_id * get_num_all_sdma_engines(dqm) +
+                       q->sdma_id % kfd_get_num_sdma_engines(dqm->dev);
                q->properties.sdma_queue_id = q->sdma_id /
                                kfd_get_num_sdma_engines(dqm->dev);
        } else if (q->properties.type == KFD_QUEUE_TYPE_SDMA_XGMI) {
-               if (dqm->xgmi_sdma_bitmap == 0) {
+               if (bitmap_empty(dqm->xgmi_sdma_bitmap, KFD_MAX_SDMA_QUEUES)) {
                        pr_err("No more XGMI SDMA queue to allocate\n");
                        return -ENOMEM;
                }
                if (restore_sdma_id) {
                        /* Re-use existing sdma_id */
-                       if (!(dqm->xgmi_sdma_bitmap & (1ULL << *restore_sdma_id))) {
+                       if (!test_bit(*restore_sdma_id, dqm->xgmi_sdma_bitmap)) {
                                pr_err("SDMA queue already in use\n");
                                return -EBUSY;
                        }
-                       dqm->xgmi_sdma_bitmap &= ~(1ULL << *restore_sdma_id);
+                       clear_bit(*restore_sdma_id, dqm->xgmi_sdma_bitmap);
                        q->sdma_id = *restore_sdma_id;
                } else {
-                       bit = __ffs64(dqm->xgmi_sdma_bitmap);
-                       dqm->xgmi_sdma_bitmap &= ~(1ULL << bit);
+                       bit = find_first_bit(dqm->xgmi_sdma_bitmap,
+                                            get_num_xgmi_sdma_queues(dqm));
+                       clear_bit(bit, dqm->xgmi_sdma_bitmap);
                        q->sdma_id = bit;
                }
                /* sdma_engine_id is sdma id including
@@ -1424,6 +1418,7 @@ static int allocate_sdma_queue(struct device_queue_manager *dqm,
                 * PCIe-optimized ones
                 */
                q->properties.sdma_engine_id =
+                       dqm->dev->node_id * get_num_all_sdma_engines(dqm) +
                        kfd_get_num_sdma_engines(dqm->dev) +
                        q->sdma_id % kfd_get_num_xgmi_sdma_engines(dqm->dev);
                q->properties.sdma_queue_id = q->sdma_id /
@@ -1442,11 +1437,11 @@ static void deallocate_sdma_queue(struct device_queue_manager *dqm,
        if (q->properties.type == KFD_QUEUE_TYPE_SDMA) {
                if (q->sdma_id >= get_num_sdma_queues(dqm))
                        return;
-               dqm->sdma_bitmap |= (1ULL << q->sdma_id);
+               set_bit(q->sdma_id, dqm->sdma_bitmap);
        } else if (q->properties.type == KFD_QUEUE_TYPE_SDMA_XGMI) {
                if (q->sdma_id >= get_num_xgmi_sdma_queues(dqm))
                        return;
-               dqm->xgmi_sdma_bitmap |= (1ULL << q->sdma_id);
+               set_bit(q->sdma_id, dqm->xgmi_sdma_bitmap);
        }
 }
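The bitmap conversion above follows the standard kernel bitmap
allocate/free pattern; a minimal standalone sketch (names and the pool
size are hypothetical, not the driver code):

	#include <linux/bitmap.h>
	#include <linux/errno.h>

	#define MAX_QUEUES 128

	static DECLARE_BITMAP(queue_bitmap, MAX_QUEUES);

	/* Mark the first num_queues ids as free. */
	static void queues_init(unsigned int num_queues)
	{
		bitmap_zero(queue_bitmap, MAX_QUEUES);
		bitmap_set(queue_bitmap, 0, num_queues);
	}

	/* Claim the lowest free id, or -ENOSPC if none remain. */
	static int queue_alloc(unsigned int num_queues)
	{
		unsigned int bit = find_first_bit(queue_bitmap, num_queues);

		if (bit >= num_queues)
			return -ENOSPC;
		clear_bit(bit, queue_bitmap);
		return bit;
	}

	/* Return a previously allocated id to the pool. */
	static void queue_free(unsigned int id)
	{
		set_bit(id, queue_bitmap);
	}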
 
index e554a48f30542edc0766eccdeffb562760e9635a..b11c474d406716160d11c1842162c494fb1caa43 100644 (file)
@@ -239,8 +239,8 @@ struct device_queue_manager {
        unsigned int            total_queue_count;
        unsigned int            next_pipe_to_allocate;
        unsigned int            *allocated_queues;
-       uint64_t                sdma_bitmap;
-       uint64_t                xgmi_sdma_bitmap;
+       DECLARE_BITMAP(sdma_bitmap, KFD_MAX_SDMA_QUEUES);
+       DECLARE_BITMAP(xgmi_sdma_bitmap, KFD_MAX_SDMA_QUEUES);
        /* the pasid mapping for each kfd vmid */
        uint16_t                vmid_pasid[VMID_NUM];
        uint64_t                pipelines_addr;
index 1337fcdf8958a0ea35f6c0e317d519f729611f13..5cfebcc8b3059304d8c776371311c84ca688471f 100644 (file)
 
 #define KFD_UNMAP_LATENCY_MS   (4000)
 
+#define KFD_MAX_SDMA_QUEUES    128
+
 /*
  * 512 = 0x200
  * The doorbell index distance between SDMA RLC (2*i) and (2*i+1) in the
@@ -260,6 +262,7 @@ struct kfd_vmid_info {
 struct kfd_dev;
 
 struct kfd_node {
+       unsigned int node_id;
        struct amdgpu_device *adev;     /* Duplicated here along with keeping
                                         * a copy in kfd_dev to save a hop
                                         */