drm/amdgpu: Support passing poison consumption ras block to SRIOV
author YiPeng Chai <YiPeng.Chai@amd.com>
Tue, 23 Jan 2024 08:08:11 +0000 (16:08 +0800)
committer Alex Deucher <alexander.deucher@amd.com>
Thu, 25 Jan 2024 19:58:03 +0000 (14:58 -0500)
Pass the RAS block that consumed the poison (e.g. GFX or SDMA) down to
the SRIOV poison handler, so the host can be notified of which block
hit the poison consumption event.

Signed-off-by: YiPeng Chai <YiPeng.Chai@amd.com>
Reviewed-by: Hawking Zhang <Hawking.Zhang@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
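For readers tracing the new parameter end to end, below is a minimal, compilable user-space sketch of the call chain this patch creates (KFD poison-consumption handler -> amdgpu_umc_poison_handler -> the SR-IOV ras_poison_handler virt op). The struct and enum definitions are simplified stand-ins rather than the kernel's real types; only the function names and the added block argument mirror the diffs below.

/*
 * Illustration only: simplified stand-in types, not kernel code.
 * The function names and the new "block" parameter follow the patch.
 */
#include <stdbool.h>
#include <stdio.h>

enum amdgpu_ras_block {
	AMDGPU_RAS_BLOCK__UMC = 0,
	AMDGPU_RAS_BLOCK__SDMA,
	AMDGPU_RAS_BLOCK__GFX,
};

/* stand-in for struct amdgpu_device, keeping only the SR-IOV hook */
struct amdgpu_device {
	void (*ras_poison_handler)(struct amdgpu_device *adev,
				   enum amdgpu_ras_block block);
};

/* stand-in for the VF-side op: forwards the consuming block to the host */
static void xgpu_ras_poison_handler(struct amdgpu_device *adev,
				    enum amdgpu_ras_block block)
{
	printf("VF -> host: IDH_RAS_POISON, block=%d\n", block);
}

static int amdgpu_umc_poison_handler(struct amdgpu_device *adev,
				     enum amdgpu_ras_block block, bool reset)
{
	(void)reset; /* bare-metal reset path omitted in this sketch */

	/* SR-IOV path: tell the host which block consumed the poison */
	if (adev->ras_poison_handler)
		adev->ras_poison_handler(adev, block);
	return 0;
}

static void amdgpu_amdkfd_ras_poison_consumption_handler(struct amdgpu_device *adev,
				enum amdgpu_ras_block block, bool reset)
{
	amdgpu_umc_poison_handler(adev, block, reset);
}

int main(void)
{
	struct amdgpu_device adev = {
		.ras_poison_handler = xgpu_ras_poison_handler,
	};

	/* e.g. a KFD interrupt handler that decoded a GFX client id */
	amdgpu_amdkfd_ras_poison_consumption_handler(&adev,
			AMDGPU_RAS_BLOCK__GFX, false);
	return 0;
}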
13 files changed:
drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c
drivers/gpu/drm/amd/amdgpu/amdgpu_umc.h
drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c
drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
drivers/gpu/drm/amd/amdgpu/gfx_v11_0_3.c
drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c
drivers/gpu/drm/amd/amdkfd/kfd_int_process_v10.c
drivers/gpu/drm/amd/amdkfd/kfd_int_process_v11.c
drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c

index 77e2636602887034c188ec695591d20e5b087b60..dfb93664e866165bfc3b8139c5fe450a5a86f1b3 100644 (file)
@@ -732,9 +732,10 @@ void amdgpu_amdkfd_debug_mem_fence(struct amdgpu_device *adev)
        amdgpu_device_flush_hdp(adev, NULL);
 }
 
-void amdgpu_amdkfd_ras_poison_consumption_handler(struct amdgpu_device *adev, bool reset)
+void amdgpu_amdkfd_ras_poison_consumption_handler(struct amdgpu_device *adev,
+       enum amdgpu_ras_block block, bool reset)
 {
-       amdgpu_umc_poison_handler(adev, reset);
+       amdgpu_umc_poison_handler(adev, block, reset);
 }
 
 int amdgpu_amdkfd_send_close_event_drain_irq(struct amdgpu_device *adev,
index 584a0cea5572d626526c4663ff262fca10fc7240..50d3e0149032a251b41bbd6e44188b61b0e8af28 100644 (file)
@@ -334,7 +334,7 @@ void amdgpu_amdkfd_debug_mem_fence(struct amdgpu_device *adev);
 int amdgpu_amdkfd_get_tile_config(struct amdgpu_device *adev,
                                struct tile_config *config);
 void amdgpu_amdkfd_ras_poison_consumption_handler(struct amdgpu_device *adev,
-                               bool reset);
+                       enum amdgpu_ras_block block, bool reset);
 bool amdgpu_amdkfd_bo_mapped_to_dev(struct amdgpu_device *adev, struct kgd_mem *mem);
 void amdgpu_amdkfd_block_mmu_notifications(void *p);
 int amdgpu_amdkfd_criu_resume(void *p);
index ebcd1cb60052207c168f56db3e240d713b9fb078..79bf6bd428a5a46dbffe191a72da02a59e4e8266 100644 (file)
@@ -2041,7 +2041,7 @@ static void amdgpu_ras_interrupt_poison_consumption_handler(struct ras_manager *
                }
        }
 
-       amdgpu_umc_poison_handler(adev, false);
+       amdgpu_umc_poison_handler(adev, obj->head.block, false);
 
        if (block_obj->hw_ops && block_obj->hw_ops->handle_poison_consumption)
                poison_stat = block_obj->hw_ops->handle_poison_consumption(adev);
index a6cdb69897f2c45be9e00cf7ae79258a0bf7b5e8..20436f81856ad280f112bf52dd42ea6157443b04 100644 (file)
@@ -246,7 +246,8 @@ int amdgpu_umc_bad_page_polling_timeout(struct amdgpu_device *adev,
        return 0;
 }
 
-int amdgpu_umc_poison_handler(struct amdgpu_device *adev, bool reset)
+int amdgpu_umc_poison_handler(struct amdgpu_device *adev,
+                       enum amdgpu_ras_block block, bool reset)
 {
        int ret = AMDGPU_RAS_SUCCESS;
 
@@ -297,7 +298,7 @@ int amdgpu_umc_poison_handler(struct amdgpu_device *adev, bool reset)
                }
        } else {
                if (adev->virt.ops && adev->virt.ops->ras_poison_handler)
-                       adev->virt.ops->ras_poison_handler(adev);
+                       adev->virt.ops->ras_poison_handler(adev, block);
                else
                        dev_warn(adev->dev,
                                "No ras_poison_handler interface in SRIOV!\n");
index 83199296ed106f79ef7c40a63330885ffc5cf97e..26d2ae498daf22bf2833cfdde9333c5f26523b44 100644 (file)
@@ -102,7 +102,8 @@ struct amdgpu_umc {
 
 int amdgpu_umc_ras_sw_init(struct amdgpu_device *adev);
 int amdgpu_umc_ras_late_init(struct amdgpu_device *adev, struct ras_common_if *ras_block);
-int amdgpu_umc_poison_handler(struct amdgpu_device *adev, bool reset);
+int amdgpu_umc_poison_handler(struct amdgpu_device *adev,
+                       enum amdgpu_ras_block block, bool reset);
 int amdgpu_umc_process_ecc_irq(struct amdgpu_device *adev,
                struct amdgpu_irq_src *source,
                struct amdgpu_iv_entry *entry);
index f4963330c772a9c717dae129e6ccf2ec1c4d3ef0..f300d4a4457d39ed613977ec620866efc10f4972 100644 (file)
@@ -1189,7 +1189,7 @@ int amdgpu_vcn_process_poison_irq(struct amdgpu_device *adev,
                amdgpu_ras_interrupt_dispatch(adev, &ih_data);
        } else {
                if (adev->virt.ops && adev->virt.ops->ras_poison_handler)
-                       adev->virt.ops->ras_poison_handler(adev);
+                       adev->virt.ops->ras_poison_handler(adev, ras_if->block);
                else
                        dev_warn(adev->dev,
                                "No ras_poison_handler interface in SRIOV for VCN!\n");
index 1b49c007ff62d4745bfeb3c8aee722f1f10bf6f6..fa7be5f277b957b2e8fa9dd9ebef2c543991aa41 100644 (file)
@@ -88,7 +88,8 @@ struct amdgpu_virt_ops {
        int (*wait_reset)(struct amdgpu_device *adev);
        void (*trans_msg)(struct amdgpu_device *adev, enum idh_request req,
                          u32 data1, u32 data2, u32 data3);
-       void (*ras_poison_handler)(struct amdgpu_device *adev);
+       void (*ras_poison_handler)(struct amdgpu_device *adev,
+                                       enum amdgpu_ras_block block);
 };
 
 /*
index 26d6286d86c9991f98c1ace2b9becb54eadeeae3..9e7ce1e6bc0613cda09a142e747344ea96877220 100644 (file)
@@ -69,7 +69,7 @@ static int gfx_v11_0_3_rlc_gc_fed_irq(struct amdgpu_device *adev,
                amdgpu_ras_interrupt_dispatch(adev, &ih_data);
        } else {
                if (adev->virt.ops && adev->virt.ops->ras_poison_handler)
-                       adev->virt.ops->ras_poison_handler(adev);
+                       adev->virt.ops->ras_poison_handler(adev, ras_if->block);
                else
                        dev_warn(adev->dev,
                                "No ras_poison_handler interface in SRIOV for %s!\n", ras_if->name);
index 63725b2ebc03733f607aaf9dd9f8a649f75d2dae..a2bd2c3b1ef9c4a4dfac7ab131a6588429d00856 100644 (file)
@@ -404,7 +404,8 @@ static int xgpu_ai_request_init_data(struct amdgpu_device *adev)
        return xgpu_ai_send_access_requests(adev, IDH_REQ_GPU_INIT_DATA);
 }
 
-static void xgpu_ai_ras_poison_handler(struct amdgpu_device *adev)
+static void xgpu_ai_ras_poison_handler(struct amdgpu_device *adev,
+                                       enum amdgpu_ras_block block)
 {
        xgpu_ai_send_access_requests(adev, IDH_RAS_POISON);
 }
index 6a68ee946f1cc3f58862a8c67fd3c53f4fdb092b..d0a018da3c7a8f40aca14bfd773bbce1ce239474 100644 (file)
@@ -152,14 +152,14 @@ static void xgpu_nv_mailbox_trans_msg (struct amdgpu_device *adev,
        xgpu_nv_mailbox_set_valid(adev, false);
 }
 
-static int xgpu_nv_send_access_requests(struct amdgpu_device *adev,
-                                       enum idh_request req)
+static int xgpu_nv_send_access_requests_with_param(struct amdgpu_device *adev,
+                       enum idh_request req, u32 data1, u32 data2, u32 data3)
 {
        int r, retry = 1;
        enum idh_event event = -1;
 
 send_request:
-       xgpu_nv_mailbox_trans_msg(adev, req, 0, 0, 0);
+       xgpu_nv_mailbox_trans_msg(adev, req, data1, data2, data3);
 
        switch (req) {
        case IDH_REQ_GPU_INIT_ACCESS:
@@ -206,6 +206,13 @@ send_request:
        return 0;
 }
 
+static int xgpu_nv_send_access_requests(struct amdgpu_device *adev,
+                                       enum idh_request req)
+{
+       return xgpu_nv_send_access_requests_with_param(adev,
+                                               req, 0, 0, 0);
+}
+
 static int xgpu_nv_request_reset(struct amdgpu_device *adev)
 {
        int ret, i = 0;
@@ -424,9 +431,15 @@ void xgpu_nv_mailbox_put_irq(struct amdgpu_device *adev)
        amdgpu_irq_put(adev, &adev->virt.rcv_irq, 0);
 }
 
-static void xgpu_nv_ras_poison_handler(struct amdgpu_device *adev)
+static void xgpu_nv_ras_poison_handler(struct amdgpu_device *adev,
+               enum amdgpu_ras_block block)
 {
-       xgpu_nv_send_access_requests(adev, IDH_RAS_POISON);
+       if (amdgpu_ip_version(adev, UMC_HWIP, 0) < IP_VERSION(12, 0, 0)) {
+               xgpu_nv_send_access_requests(adev, IDH_RAS_POISON);
+       } else {
+               xgpu_nv_send_access_requests_with_param(adev,
+                                       IDH_RAS_POISON, block, 0, 0);
+       }
 }
 
 const struct amdgpu_virt_ops xgpu_nv_virt_ops = {
index a7697ec8188e094a78807e1a6fcada06318af191..9a06c6fb6605851ae9c26ff4a81c66d358b9a69f 100644 (file)
@@ -132,6 +132,7 @@ enum SQ_INTERRUPT_ERROR_TYPE {
 static void event_interrupt_poison_consumption(struct kfd_node *dev,
                                uint16_t pasid, uint16_t client_id)
 {
+       enum amdgpu_ras_block block = 0;
        int old_poison, ret = -EINVAL;
        struct kfd_process *p = kfd_lookup_process_by_pasid(pasid);
 
@@ -151,12 +152,14 @@ static void event_interrupt_poison_consumption(struct kfd_node *dev,
        case SOC15_IH_CLIENTID_SE3SH:
        case SOC15_IH_CLIENTID_UTCL2:
                ret = kfd_dqm_evict_pasid(dev->dqm, pasid);
+               block = AMDGPU_RAS_BLOCK__GFX;
                break;
        case SOC15_IH_CLIENTID_SDMA0:
        case SOC15_IH_CLIENTID_SDMA1:
        case SOC15_IH_CLIENTID_SDMA2:
        case SOC15_IH_CLIENTID_SDMA3:
        case SOC15_IH_CLIENTID_SDMA4:
+               block = AMDGPU_RAS_BLOCK__SDMA;
                break;
        default:
                break;
@@ -171,12 +174,12 @@ static void event_interrupt_poison_consumption(struct kfd_node *dev,
                dev_warn(dev->adev->dev,
                        "RAS poison consumption, unmap queue flow succeeded: client id %d\n",
                        client_id);
-               amdgpu_amdkfd_ras_poison_consumption_handler(dev->adev, false);
+               amdgpu_amdkfd_ras_poison_consumption_handler(dev->adev, block, false);
        } else {
                dev_warn(dev->adev->dev,
                        "RAS poison consumption, fall back to gpu reset flow: client id %d\n",
                        client_id);
-               amdgpu_amdkfd_ras_poison_consumption_handler(dev->adev, true);
+               amdgpu_amdkfd_ras_poison_consumption_handler(dev->adev, block, true);
        }
 }
 
index 2a65792fd1162ba3f21f1600724897ec78449c2f..7e2859736a558fe899c8d1bb438daa07523f2c59 100644 (file)
@@ -191,6 +191,7 @@ static void print_sq_intr_info_error(uint32_t context_id0, uint32_t context_id1)
 static void event_interrupt_poison_consumption_v11(struct kfd_node *dev,
                                uint16_t pasid, uint16_t source_id)
 {
+       enum amdgpu_ras_block block = 0;
        int ret = -EINVAL;
        struct kfd_process *p = kfd_lookup_process_by_pasid(pasid);
 
@@ -210,9 +211,11 @@ static void event_interrupt_poison_consumption_v11(struct kfd_node *dev,
        case SOC15_INTSRC_SQ_INTERRUPT_MSG:
                if (dev->dqm->ops.reset_queues)
                        ret = dev->dqm->ops.reset_queues(dev->dqm, pasid);
+               block = AMDGPU_RAS_BLOCK__GFX;
                break;
        case SOC21_INTSRC_SDMA_ECC:
        default:
+               block = AMDGPU_RAS_BLOCK__GFX;
                break;
        }
 
@@ -221,9 +224,9 @@ static void event_interrupt_poison_consumption_v11(struct kfd_node *dev,
        /* resetting queue passes, do page retirement without gpu reset
           resetting queue fails, fallback to gpu reset solution */
        if (!ret)
-               amdgpu_amdkfd_ras_poison_consumption_handler(dev->adev, false);
+               amdgpu_amdkfd_ras_poison_consumption_handler(dev->adev, block, false);
        else
-               amdgpu_amdkfd_ras_poison_consumption_handler(dev->adev, true);
+               amdgpu_amdkfd_ras_poison_consumption_handler(dev->adev, block, true);
 }
 
 static bool event_interrupt_isr_v11(struct kfd_node *dev,
index 27cdaea405017aed21ff447eec833068a9f9b101..91dd5e045b511d2aaa42aa4bd9b934e018c407ab 100644 (file)
@@ -143,6 +143,7 @@ enum SQ_INTERRUPT_ERROR_TYPE {
 static void event_interrupt_poison_consumption_v9(struct kfd_node *dev,
                                uint16_t pasid, uint16_t client_id)
 {
+       enum amdgpu_ras_block block = 0;
        int old_poison, ret = -EINVAL;
        struct kfd_process *p = kfd_lookup_process_by_pasid(pasid);
 
@@ -162,12 +163,14 @@ static void event_interrupt_poison_consumption_v9(struct kfd_node *dev,
        case SOC15_IH_CLIENTID_SE3SH:
        case SOC15_IH_CLIENTID_UTCL2:
                ret = kfd_dqm_evict_pasid(dev->dqm, pasid);
+               block = AMDGPU_RAS_BLOCK__GFX;
                break;
        case SOC15_IH_CLIENTID_SDMA0:
        case SOC15_IH_CLIENTID_SDMA1:
        case SOC15_IH_CLIENTID_SDMA2:
        case SOC15_IH_CLIENTID_SDMA3:
        case SOC15_IH_CLIENTID_SDMA4:
+               block = AMDGPU_RAS_BLOCK__SDMA;
                break;
        default:
                break;
@@ -182,12 +185,12 @@ static void event_interrupt_poison_consumption_v9(struct kfd_node *dev,
                dev_warn(dev->adev->dev,
                        "RAS poison consumption, unmap queue flow succeeded: client id %d\n",
                        client_id);
-               amdgpu_amdkfd_ras_poison_consumption_handler(dev->adev, false);
+               amdgpu_amdkfd_ras_poison_consumption_handler(dev->adev, block, false);
        } else {
                dev_warn(dev->adev->dev,
                        "RAS poison consumption, fall back to gpu reset flow: client id %d\n",
                        client_id);
-               amdgpu_amdkfd_ras_poison_consumption_handler(dev->adev, true);
+               amdgpu_amdkfd_ras_poison_consumption_handler(dev->adev, block, true);
        }
 }