accel/habanalabs: add info ioctl for engine error reports
authorOfir Bitton <obitton@habana.ai>
Tue, 23 May 2023 07:42:19 +0000 (10:42 +0300)
committerOded Gabbay <ogabbay@kernel.org>
Mon, 9 Oct 2023 09:37:19 +0000 (12:37 +0300)
The user gets a notification for every engine error report, but still
lacks the exact engine information. Hence, allow the user to query
which engine reported the error.

Signed-off-by: Ofir Bitton <obitton@habana.ai>
Reviewed-by: Oded Gabbay <ogabbay@kernel.org>
Signed-off-by: Oded Gabbay <ogabbay@kernel.org>
drivers/accel/habanalabs/common/device.c
drivers/accel/habanalabs/common/habanalabs.h
drivers/accel/habanalabs/common/habanalabs_ioctl.c
drivers/accel/habanalabs/gaudi2/gaudi2.c
include/uapi/drm/habanalabs_accel.h

index 28be0fc325eaeb8e03b15b9683148b3d94a4f5a5..80cce6b74d0523b4faa644c8a275f57707fb7de5 100644 (file)
@@ -2701,6 +2701,20 @@ void hl_handle_fw_err(struct hl_device *hdev, struct hl_info_fw_err_info *info)
                *info->event_mask |= HL_NOTIFIER_EVENT_CRITICL_FW_ERR;
 }
 
+void hl_capture_engine_err(struct hl_device *hdev, u16 engine_id, u16 error_count)
+{
+       struct engine_err_info *info = &hdev->captured_err_info.engine_err;
+
+       /* Capture only the first engine error; bail out if one was already claimed */
+       if (atomic_cmpxchg(&info->event_detected, 0, 1))
+               return;
+
+       info->event.timestamp = ktime_to_ns(ktime_get()); /* capture time, in ns */
+       info->event.engine_id = engine_id;
+       info->event.error_count = error_count;
+       info->event_info_available = true; /* set last, after the event fields are filled */
+}
+
 void hl_enable_err_info_capture(struct hl_error_info *captured_err_info)
 {
        vfree(captured_err_info->page_fault_info.user_mappings);
index e69b9b195f48903b333d1e0028cafdbe52402b9c..2bd3dedfa4b59530fa86e6763187609bf8d5c504 100644 (file)
@@ -3062,6 +3062,20 @@ struct fw_err_info {
        bool                            event_info_available;
 };
 
+/**
+ * struct engine_err_info - engine error information.
+ * @event: details of the captured engine error event, as copied to the user.
+ * @event_detected: set to 1 when an engine error is detected for the first time
+ *                  after the driver finished booting-up; later ones are ignored.
+ * @event_info_available: true once @event has been filled and may be read.
+ */
+struct engine_err_info {
+       struct hl_info_engine_err_event event;
+       atomic_t                        event_detected;
+       bool                            event_info_available;
+};
+
+
 /**
  * struct hl_error_info - holds information collected during an error.
  * @cs_timeout: CS timeout error information.
@@ -3070,6 +3084,7 @@ struct fw_err_info {
  * @page_fault_info: page fault information.
  * @hw_err: (fatal) hardware error information.
  * @fw_err: firmware error information.
+ * @engine_err: engine error information.
  */
 struct hl_error_info {
        struct cs_timeout_info          cs_timeout;
@@ -3078,6 +3093,7 @@ struct hl_error_info {
        struct page_fault_info          page_fault_info;
        struct hw_err_info              hw_err;
        struct fw_err_info              fw_err;
+       struct engine_err_info          engine_err;
 };
 
 /**
@@ -3952,6 +3968,7 @@ void hl_handle_page_fault(struct hl_device *hdev, u64 addr, u16 eng_id, bool is_
                                u64 *event_mask);
 void hl_handle_critical_hw_err(struct hl_device *hdev, u16 event_id, u64 *event_mask);
 void hl_handle_fw_err(struct hl_device *hdev, struct hl_info_fw_err_info *info);
+void hl_capture_engine_err(struct hl_device *hdev, u16 engine_id, u16 error_count);
 void hl_enable_err_info_capture(struct hl_error_info *captured_err_info);
 
 #ifdef CONFIG_DEBUG_FS
index 549b2518fae0e817e4f70e93d00cad9677c2d85c..097d65e493c81a22b9198f76576102ceaeb146d9 100644 (file)
@@ -875,6 +875,28 @@ static int fw_err_info(struct hl_fpriv *hpriv, struct hl_info_args *args)
        return rc ? -EFAULT : 0;
 }
 
+static int engine_err_info(struct hl_fpriv *hpriv, struct hl_info_args *args)
+{
+       void __user *user_buf = (void __user *) (uintptr_t) args->return_pointer;
+       struct hl_device *hdev = hpriv->hdev;
+       u32 user_buf_size = args->return_size;
+       struct engine_err_info *info;
+       int rc;
+
+       if (!user_buf)
+               return -EINVAL;
+
+       info = &hdev->captured_err_info.engine_err;
+       if (!info->event_info_available)
+               return 0; /* no engine error captured: succeed, user buffer left untouched */
+
+       if (user_buf_size < sizeof(struct hl_info_engine_err_event))
+               return -ENOMEM; /* user buffer cannot hold the whole event struct */
+
+       rc = copy_to_user(user_buf, &info->event, sizeof(struct hl_info_engine_err_event));
+       return rc ? -EFAULT : 0; /* a non-zero rc is reported as -EFAULT */
+}
+
 static int send_fw_generic_request(struct hl_device *hdev, struct hl_info_args *info_args)
 {
        void __user *buff = (void __user *) (uintptr_t) info_args->return_pointer;
@@ -1001,6 +1023,9 @@ static int _hl_info_ioctl(struct hl_fpriv *hpriv, void *data,
        case HL_INFO_FW_ERR_EVENT:
                return fw_err_info(hpriv, args);
 
+       case HL_INFO_USER_ENGINE_ERR_EVENT:
+               return engine_err_info(hpriv, args);
+
        case HL_INFO_DRAM_USAGE:
                return dram_usage_info(hpriv, args);
        default:
index 22a9aee9a7c961568ebc51af3aae4a28f97ed737..c317a95c3b3498597ee3d4b765e8310c6d98dbe3 100644 (file)
@@ -9589,6 +9589,171 @@ static int hl_arc_event_handle(struct hl_device *hdev, u16 event_type,
        }
 }
 
+static u16 event_id_to_engine_id(struct hl_device *hdev, u16 event_type)
+{
+       enum gaudi2_block_types type = GAUDI2_BLOCK_TYPE_MAX; /* MAX means "no block type resolved" */
+       u16 index; /* only read below when a case has also set type */
+
+       switch (event_type) { /* stage 1: return the engine id directly, or derive (type, index) */
+       case GAUDI2_EVENT_TPC0_AXI_ERR_RSP ... GAUDI2_EVENT_TPC24_AXI_ERR_RSP:
+               index = event_type - GAUDI2_EVENT_TPC0_AXI_ERR_RSP;
+               type = GAUDI2_BLOCK_TYPE_TPC;
+               break;
+       case GAUDI2_EVENT_TPC0_QM ... GAUDI2_EVENT_TPC24_QM:
+               index = event_type - GAUDI2_EVENT_TPC0_QM;
+               type = GAUDI2_BLOCK_TYPE_TPC;
+               break;
+       case GAUDI2_EVENT_MME0_SBTE0_AXI_ERR_RSP ... GAUDI2_EVENT_MME0_CTRL_AXI_ERROR_RESPONSE:
+       case GAUDI2_EVENT_MME0_SPI_BASE ... GAUDI2_EVENT_MME0_WAP_SOURCE_RESULT_INVALID:
+       case GAUDI2_EVENT_MME0_QM:
+               index = 0;
+               type = GAUDI2_BLOCK_TYPE_MME;
+               break;
+       case GAUDI2_EVENT_MME1_SBTE0_AXI_ERR_RSP ... GAUDI2_EVENT_MME1_CTRL_AXI_ERROR_RESPONSE:
+       case GAUDI2_EVENT_MME1_SPI_BASE ... GAUDI2_EVENT_MME1_WAP_SOURCE_RESULT_INVALID:
+       case GAUDI2_EVENT_MME1_QM:
+               index = 1;
+               type = GAUDI2_BLOCK_TYPE_MME;
+               break;
+       case GAUDI2_EVENT_MME2_SBTE0_AXI_ERR_RSP ... GAUDI2_EVENT_MME2_CTRL_AXI_ERROR_RESPONSE:
+       case GAUDI2_EVENT_MME2_SPI_BASE ... GAUDI2_EVENT_MME2_WAP_SOURCE_RESULT_INVALID:
+       case GAUDI2_EVENT_MME2_QM:
+               index = 2;
+               type = GAUDI2_BLOCK_TYPE_MME;
+               break;
+       case GAUDI2_EVENT_MME3_SBTE0_AXI_ERR_RSP ... GAUDI2_EVENT_MME3_CTRL_AXI_ERROR_RESPONSE:
+       case GAUDI2_EVENT_MME3_SPI_BASE ... GAUDI2_EVENT_MME3_WAP_SOURCE_RESULT_INVALID:
+       case GAUDI2_EVENT_MME3_QM:
+               index = 3;
+               type = GAUDI2_BLOCK_TYPE_MME;
+               break;
+       case GAUDI2_EVENT_KDMA_CH0_AXI_ERR_RSP:
+       case GAUDI2_EVENT_KDMA_BM_SPMU:
+       case GAUDI2_EVENT_KDMA0_CORE:
+               return GAUDI2_ENGINE_ID_KDMA;
+       case GAUDI2_EVENT_PDMA_CH0_AXI_ERR_RSP:
+       case GAUDI2_EVENT_PDMA0_CORE:
+       case GAUDI2_EVENT_PDMA0_BM_SPMU:
+       case GAUDI2_EVENT_PDMA0_QM:
+               return GAUDI2_ENGINE_ID_PDMA_0;
+       case GAUDI2_EVENT_PDMA_CH1_AXI_ERR_RSP:
+       case GAUDI2_EVENT_PDMA1_CORE:
+       case GAUDI2_EVENT_PDMA1_BM_SPMU:
+       case GAUDI2_EVENT_PDMA1_QM:
+               return GAUDI2_ENGINE_ID_PDMA_1;
+       case GAUDI2_EVENT_DEC0_AXI_ERR_RSPONSE ... GAUDI2_EVENT_DEC9_AXI_ERR_RSPONSE:
+               index = event_type - GAUDI2_EVENT_DEC0_AXI_ERR_RSPONSE;
+               type = GAUDI2_BLOCK_TYPE_DEC;
+               break;
+       case GAUDI2_EVENT_DEC0_SPI ... GAUDI2_EVENT_DEC9_BMON_SPMU:
+               index = (event_type - GAUDI2_EVENT_DEC0_SPI) >> 1; /* presumably 2 events per decoder - confirm */
+               type = GAUDI2_BLOCK_TYPE_DEC;
+               break;
+       case GAUDI2_EVENT_NIC0_AXI_ERROR_RESPONSE ... GAUDI2_EVENT_NIC11_AXI_ERROR_RESPONSE:
+               index = event_type - GAUDI2_EVENT_NIC0_AXI_ERROR_RESPONSE;
+               return GAUDI2_ENGINE_ID_NIC0_0 + (index * 2); /* assumes 2 engine ids per NIC - TODO confirm */
+       case GAUDI2_EVENT_NIC0_QM0 ... GAUDI2_EVENT_NIC11_QM1:
+               index = event_type - GAUDI2_EVENT_NIC0_QM0;
+               return GAUDI2_ENGINE_ID_NIC0_0 + index;
+       case GAUDI2_EVENT_NIC0_BMON_SPMU ... GAUDI2_EVENT_NIC11_SW_ERROR:
+               index = event_type - GAUDI2_EVENT_NIC0_BMON_SPMU; /* NOTE(review): not halved, unlike the DEC/TPC BMON ranges - confirm */
+               return GAUDI2_ENGINE_ID_NIC0_0 + (index * 2);
+       case GAUDI2_EVENT_TPC0_BMON_SPMU ... GAUDI2_EVENT_TPC24_KERNEL_ERR:
+               index = (event_type - GAUDI2_EVENT_TPC0_BMON_SPMU) >> 1; /* presumably 2 events per TPC - confirm */
+               type = GAUDI2_BLOCK_TYPE_TPC;
+               break;
+       case GAUDI2_EVENT_ROTATOR0_AXI_ERROR_RESPONSE:
+       case GAUDI2_EVENT_ROTATOR0_BMON_SPMU:
+       case GAUDI2_EVENT_ROTATOR0_ROT0_QM:
+               return GAUDI2_ENGINE_ID_ROT_0;
+       case GAUDI2_EVENT_ROTATOR1_AXI_ERROR_RESPONSE:
+       case GAUDI2_EVENT_ROTATOR1_BMON_SPMU:
+       case GAUDI2_EVENT_ROTATOR1_ROT1_QM:
+               return GAUDI2_ENGINE_ID_ROT_1;
+       case GAUDI2_EVENT_HDMA0_BM_SPMU:
+       case GAUDI2_EVENT_HDMA0_QM:
+       case GAUDI2_EVENT_HDMA0_CORE:
+               return GAUDI2_DCORE0_ENGINE_ID_EDMA_0;
+       case GAUDI2_EVENT_HDMA1_BM_SPMU:
+       case GAUDI2_EVENT_HDMA1_QM:
+       case GAUDI2_EVENT_HDMA1_CORE:
+               return GAUDI2_DCORE0_ENGINE_ID_EDMA_1;
+       case GAUDI2_EVENT_HDMA2_BM_SPMU:
+       case GAUDI2_EVENT_HDMA2_QM:
+       case GAUDI2_EVENT_HDMA2_CORE:
+               return GAUDI2_DCORE1_ENGINE_ID_EDMA_0;
+       case GAUDI2_EVENT_HDMA3_BM_SPMU:
+       case GAUDI2_EVENT_HDMA3_QM:
+       case GAUDI2_EVENT_HDMA3_CORE:
+               return GAUDI2_DCORE1_ENGINE_ID_EDMA_1;
+       case GAUDI2_EVENT_HDMA4_BM_SPMU:
+       case GAUDI2_EVENT_HDMA4_QM:
+       case GAUDI2_EVENT_HDMA4_CORE:
+               return GAUDI2_DCORE2_ENGINE_ID_EDMA_0;
+       case GAUDI2_EVENT_HDMA5_BM_SPMU:
+       case GAUDI2_EVENT_HDMA5_QM:
+       case GAUDI2_EVENT_HDMA5_CORE:
+               return GAUDI2_DCORE2_ENGINE_ID_EDMA_1;
+       case GAUDI2_EVENT_HDMA6_BM_SPMU:
+       case GAUDI2_EVENT_HDMA6_QM:
+       case GAUDI2_EVENT_HDMA6_CORE:
+               return GAUDI2_DCORE3_ENGINE_ID_EDMA_0;
+       case GAUDI2_EVENT_HDMA7_BM_SPMU:
+       case GAUDI2_EVENT_HDMA7_QM:
+       case GAUDI2_EVENT_HDMA7_CORE:
+               return GAUDI2_DCORE3_ENGINE_ID_EDMA_1;
+       default:
+               break; /* type stays GAUDI2_BLOCK_TYPE_MAX, so U16_MAX is returned below */
+       }
+
+       switch (type) { /* stage 2: translate (type, index) into a per-dcore engine id */
+       case GAUDI2_BLOCK_TYPE_TPC:
+               switch (index) {
+               case TPC_ID_DCORE0_TPC0 ... TPC_ID_DCORE0_TPC5:
+                       return GAUDI2_DCORE0_ENGINE_ID_TPC_0 + index;
+               case TPC_ID_DCORE1_TPC0 ... TPC_ID_DCORE1_TPC5:
+                       return GAUDI2_DCORE1_ENGINE_ID_TPC_0 + index - TPC_ID_DCORE1_TPC0;
+               case TPC_ID_DCORE2_TPC0 ... TPC_ID_DCORE2_TPC5:
+                       return GAUDI2_DCORE2_ENGINE_ID_TPC_0 + index - TPC_ID_DCORE2_TPC0;
+               case TPC_ID_DCORE3_TPC0 ... TPC_ID_DCORE3_TPC5:
+                       return GAUDI2_DCORE3_ENGINE_ID_TPC_0 + index - TPC_ID_DCORE3_TPC0;
+               default:
+                       break; /* TPC index outside the four ranges handled above */
+               }
+               break;
+       case GAUDI2_BLOCK_TYPE_MME:
+               switch (index) {
+               case MME_ID_DCORE0: return GAUDI2_DCORE0_ENGINE_ID_MME;
+               case MME_ID_DCORE1: return GAUDI2_DCORE1_ENGINE_ID_MME;
+               case MME_ID_DCORE2: return GAUDI2_DCORE2_ENGINE_ID_MME;
+               case MME_ID_DCORE3: return GAUDI2_DCORE3_ENGINE_ID_MME;
+               default:
+                       break;
+               }
+               break;
+       case GAUDI2_BLOCK_TYPE_DEC:
+               switch (index) {
+               case DEC_ID_DCORE0_DEC0: return GAUDI2_DCORE0_ENGINE_ID_DEC_0;
+               case DEC_ID_DCORE0_DEC1: return GAUDI2_DCORE0_ENGINE_ID_DEC_1;
+               case DEC_ID_DCORE1_DEC0: return GAUDI2_DCORE1_ENGINE_ID_DEC_0;
+               case DEC_ID_DCORE1_DEC1: return GAUDI2_DCORE1_ENGINE_ID_DEC_1;
+               case DEC_ID_DCORE2_DEC0: return GAUDI2_DCORE2_ENGINE_ID_DEC_0;
+               case DEC_ID_DCORE2_DEC1: return GAUDI2_DCORE2_ENGINE_ID_DEC_1;
+               case DEC_ID_DCORE3_DEC0: return GAUDI2_DCORE3_ENGINE_ID_DEC_0;
+               case DEC_ID_DCORE3_DEC1: return GAUDI2_DCORE3_ENGINE_ID_DEC_1;
+               case DEC_ID_PCIE_VDEC0: return GAUDI2_PCIE_ENGINE_ID_DEC_0;
+               case DEC_ID_PCIE_VDEC1: return GAUDI2_PCIE_ENGINE_ID_DEC_1;
+               default:
+                       break;
+               }
+               break;
+       default:
+               break;
+       }
+
+       return U16_MAX; /* no engine id could be derived for this event */
+}
+
 static void gaudi2_handle_eqe(struct hl_device *hdev, struct hl_eq_entry *eq_entry)
 {
        struct gaudi2_device *gaudi2 = hdev->asic_specific;
@@ -10011,6 +10176,9 @@ static void gaudi2_handle_eqe(struct hl_device *hdev, struct hl_eq_entry *eq_ent
                }
        }
 
+       if (event_mask & HL_NOTIFIER_EVENT_USER_ENGINE_ERR)
+               hl_capture_engine_err(hdev, event_id_to_engine_id(hdev, event_type), error_count);
+
        /* Make sure to dump an error in case no error cause was printed so far.
         * Note that although we have counted the errors, we use this number as
         * a boolean.
index e6436f3e8ea606bd49250382a4acaa58be8d4653..f912869b151e41ee4abd9e1bd336df0aae26d61d 100644 (file)
@@ -809,6 +809,7 @@ enum hl_server_type {
  * HL_INFO_FW_ERR_EVENT   - Retrieve information on the reported FW error.
  *                          May return 0 even though no new data is available, in that case
  *                          timestamp will be 0.
+ * HL_INFO_USER_ENGINE_ERR_EVENT - Retrieve info on the first engine error captured by the driver.
  */
 #define HL_INFO_HW_IP_INFO                     0
 #define HL_INFO_HW_EVENTS                      1
@@ -845,6 +846,7 @@ enum hl_server_type {
 #define HL_INFO_FW_GENERIC_REQ                 35
 #define HL_INFO_HW_ERR_EVENT                   36
 #define HL_INFO_FW_ERR_EVENT                   37
+#define HL_INFO_USER_ENGINE_ERR_EVENT          38
 
 #define HL_INFO_VERSION_MAX_LEN                        128
 #define HL_INFO_CARD_NAME_MAX_LEN              16
@@ -1226,6 +1228,20 @@ struct hl_info_fw_err_event {
        __u32 pad;
 };
 
+/**
+ * struct hl_info_engine_err_event - engine error info
+ * @timestamp: timestamp of the error capture, in nanoseconds.
+ * @engine_id: id of the engine that reported the error.
+ * @error_count: number of errors reported.
+ * @pad: padding that keeps the structure size a multiple of 8 bytes.
+ */
+struct hl_info_engine_err_event {
+       __s64 timestamp;
+       __u16 engine_id;
+       __u16 error_count;
+       __u32 pad;
+};
+
 /**
  * struct hl_info_dev_memalloc_page_sizes - valid page sizes in device mem alloc information.
  * @page_order_bitmask: bitmap in which a set bit represents the order of the supported page size