bool event_info_available;
};
+/**
+ * struct engine_err_info - engine error information.
+ * @event: holds information on the event.
+ * @event_detected: if set as 1, then an engine event was discovered for the
+ * first time after the driver has finished booting-up.
+ * @event_info_available: indicates that an engine event info is now available.
+ */
+struct engine_err_info {
+ struct hl_info_engine_err_event event;
+ atomic_t event_detected;
+ bool event_info_available;
+};
+
+
/**
* struct hl_error_info - holds information collected during an error.
* @cs_timeout: CS timeout error information.
* @page_fault_info: page fault information.
* @hw_err: (fatal) hardware error information.
* @fw_err: firmware error information.
+ * @engine_err: engine error information.
*/
struct hl_error_info {
struct cs_timeout_info cs_timeout;
struct page_fault_info page_fault_info;
struct hw_err_info hw_err;
struct fw_err_info fw_err;
+ struct engine_err_info engine_err;
};
/**
u64 *event_mask);
void hl_handle_critical_hw_err(struct hl_device *hdev, u16 event_id, u64 *event_mask);
void hl_handle_fw_err(struct hl_device *hdev, struct hl_info_fw_err_info *info);
+void hl_capture_engine_err(struct hl_device *hdev, u16 engine_id, u16 error_count);
void hl_enable_err_info_capture(struct hl_error_info *captured_err_info);
#ifdef CONFIG_DEBUG_FS
}
}
+static u16 event_id_to_engine_id(struct hl_device *hdev, u16 event_type)
+{
+ enum gaudi2_block_types type = GAUDI2_BLOCK_TYPE_MAX;
+ u16 index;
+
+ switch (event_type) {
+ case GAUDI2_EVENT_TPC0_AXI_ERR_RSP ... GAUDI2_EVENT_TPC24_AXI_ERR_RSP:
+ index = event_type - GAUDI2_EVENT_TPC0_AXI_ERR_RSP;
+ type = GAUDI2_BLOCK_TYPE_TPC;
+ break;
+ case GAUDI2_EVENT_TPC0_QM ... GAUDI2_EVENT_TPC24_QM:
+ index = event_type - GAUDI2_EVENT_TPC0_QM;
+ type = GAUDI2_BLOCK_TYPE_TPC;
+ break;
+ case GAUDI2_EVENT_MME0_SBTE0_AXI_ERR_RSP ... GAUDI2_EVENT_MME0_CTRL_AXI_ERROR_RESPONSE:
+ case GAUDI2_EVENT_MME0_SPI_BASE ... GAUDI2_EVENT_MME0_WAP_SOURCE_RESULT_INVALID:
+ case GAUDI2_EVENT_MME0_QM:
+ index = 0;
+ type = GAUDI2_BLOCK_TYPE_MME;
+ break;
+ case GAUDI2_EVENT_MME1_SBTE0_AXI_ERR_RSP ... GAUDI2_EVENT_MME1_CTRL_AXI_ERROR_RESPONSE:
+ case GAUDI2_EVENT_MME1_SPI_BASE ... GAUDI2_EVENT_MME1_WAP_SOURCE_RESULT_INVALID:
+ case GAUDI2_EVENT_MME1_QM:
+ index = 1;
+ type = GAUDI2_BLOCK_TYPE_MME;
+ break;
+ case GAUDI2_EVENT_MME2_SBTE0_AXI_ERR_RSP ... GAUDI2_EVENT_MME2_CTRL_AXI_ERROR_RESPONSE:
+ case GAUDI2_EVENT_MME2_SPI_BASE ... GAUDI2_EVENT_MME2_WAP_SOURCE_RESULT_INVALID:
+ case GAUDI2_EVENT_MME2_QM:
+ index = 2;
+ type = GAUDI2_BLOCK_TYPE_MME;
+ break;
+ case GAUDI2_EVENT_MME3_SBTE0_AXI_ERR_RSP ... GAUDI2_EVENT_MME3_CTRL_AXI_ERROR_RESPONSE:
+ case GAUDI2_EVENT_MME3_SPI_BASE ... GAUDI2_EVENT_MME3_WAP_SOURCE_RESULT_INVALID:
+ case GAUDI2_EVENT_MME3_QM:
+ index = 3;
+ type = GAUDI2_BLOCK_TYPE_MME;
+ break;
+ case GAUDI2_EVENT_KDMA_CH0_AXI_ERR_RSP:
+ case GAUDI2_EVENT_KDMA_BM_SPMU:
+ case GAUDI2_EVENT_KDMA0_CORE:
+ return GAUDI2_ENGINE_ID_KDMA;
+ case GAUDI2_EVENT_PDMA_CH0_AXI_ERR_RSP:
+ case GAUDI2_EVENT_PDMA0_CORE:
+ case GAUDI2_EVENT_PDMA0_BM_SPMU:
+ case GAUDI2_EVENT_PDMA0_QM:
+ return GAUDI2_ENGINE_ID_PDMA_0;
+ case GAUDI2_EVENT_PDMA_CH1_AXI_ERR_RSP:
+ case GAUDI2_EVENT_PDMA1_CORE:
+ case GAUDI2_EVENT_PDMA1_BM_SPMU:
+ case GAUDI2_EVENT_PDMA1_QM:
+ return GAUDI2_ENGINE_ID_PDMA_1;
+ case GAUDI2_EVENT_DEC0_AXI_ERR_RSPONSE ... GAUDI2_EVENT_DEC9_AXI_ERR_RSPONSE:
+ index = event_type - GAUDI2_EVENT_DEC0_AXI_ERR_RSPONSE;
+ type = GAUDI2_BLOCK_TYPE_DEC;
+ break;
+ case GAUDI2_EVENT_DEC0_SPI ... GAUDI2_EVENT_DEC9_BMON_SPMU:
+ index = (event_type - GAUDI2_EVENT_DEC0_SPI) >> 1;
+ type = GAUDI2_BLOCK_TYPE_DEC;
+ break;
+ case GAUDI2_EVENT_NIC0_AXI_ERROR_RESPONSE ... GAUDI2_EVENT_NIC11_AXI_ERROR_RESPONSE:
+ index = event_type - GAUDI2_EVENT_NIC0_AXI_ERROR_RESPONSE;
+ return GAUDI2_ENGINE_ID_NIC0_0 + (index * 2);
+ case GAUDI2_EVENT_NIC0_QM0 ... GAUDI2_EVENT_NIC11_QM1:
+ index = event_type - GAUDI2_EVENT_NIC0_QM0;
+ return GAUDI2_ENGINE_ID_NIC0_0 + index;
+ case GAUDI2_EVENT_NIC0_BMON_SPMU ... GAUDI2_EVENT_NIC11_SW_ERROR:
+ index = event_type - GAUDI2_EVENT_NIC0_BMON_SPMU;
+ return GAUDI2_ENGINE_ID_NIC0_0 + (index * 2);
+ case GAUDI2_EVENT_TPC0_BMON_SPMU ... GAUDI2_EVENT_TPC24_KERNEL_ERR:
+ index = (event_type - GAUDI2_EVENT_TPC0_BMON_SPMU) >> 1;
+ type = GAUDI2_BLOCK_TYPE_TPC;
+ break;
+ case GAUDI2_EVENT_ROTATOR0_AXI_ERROR_RESPONSE:
+ case GAUDI2_EVENT_ROTATOR0_BMON_SPMU:
+ case GAUDI2_EVENT_ROTATOR0_ROT0_QM:
+ return GAUDI2_ENGINE_ID_ROT_0;
+ case GAUDI2_EVENT_ROTATOR1_AXI_ERROR_RESPONSE:
+ case GAUDI2_EVENT_ROTATOR1_BMON_SPMU:
+ case GAUDI2_EVENT_ROTATOR1_ROT1_QM:
+ return GAUDI2_ENGINE_ID_ROT_1;
+ case GAUDI2_EVENT_HDMA0_BM_SPMU:
+ case GAUDI2_EVENT_HDMA0_QM:
+ case GAUDI2_EVENT_HDMA0_CORE:
+ return GAUDI2_DCORE0_ENGINE_ID_EDMA_0;
+ case GAUDI2_EVENT_HDMA1_BM_SPMU:
+ case GAUDI2_EVENT_HDMA1_QM:
+ case GAUDI2_EVENT_HDMA1_CORE:
+ return GAUDI2_DCORE0_ENGINE_ID_EDMA_1;
+ case GAUDI2_EVENT_HDMA2_BM_SPMU:
+ case GAUDI2_EVENT_HDMA2_QM:
+ case GAUDI2_EVENT_HDMA2_CORE:
+ return GAUDI2_DCORE1_ENGINE_ID_EDMA_0;
+ case GAUDI2_EVENT_HDMA3_BM_SPMU:
+ case GAUDI2_EVENT_HDMA3_QM:
+ case GAUDI2_EVENT_HDMA3_CORE:
+ return GAUDI2_DCORE1_ENGINE_ID_EDMA_1;
+ case GAUDI2_EVENT_HDMA4_BM_SPMU:
+ case GAUDI2_EVENT_HDMA4_QM:
+ case GAUDI2_EVENT_HDMA4_CORE:
+ return GAUDI2_DCORE2_ENGINE_ID_EDMA_0;
+ case GAUDI2_EVENT_HDMA5_BM_SPMU:
+ case GAUDI2_EVENT_HDMA5_QM:
+ case GAUDI2_EVENT_HDMA5_CORE:
+ return GAUDI2_DCORE2_ENGINE_ID_EDMA_1;
+ case GAUDI2_EVENT_HDMA6_BM_SPMU:
+ case GAUDI2_EVENT_HDMA6_QM:
+ case GAUDI2_EVENT_HDMA6_CORE:
+ return GAUDI2_DCORE3_ENGINE_ID_EDMA_0;
+ case GAUDI2_EVENT_HDMA7_BM_SPMU:
+ case GAUDI2_EVENT_HDMA7_QM:
+ case GAUDI2_EVENT_HDMA7_CORE:
+ return GAUDI2_DCORE3_ENGINE_ID_EDMA_1;
+ default:
+ break;
+ }
+
+ switch (type) {
+ case GAUDI2_BLOCK_TYPE_TPC:
+ switch (index) {
+ case TPC_ID_DCORE0_TPC0 ... TPC_ID_DCORE0_TPC5:
+ return GAUDI2_DCORE0_ENGINE_ID_TPC_0 + index;
+ case TPC_ID_DCORE1_TPC0 ... TPC_ID_DCORE1_TPC5:
+ return GAUDI2_DCORE1_ENGINE_ID_TPC_0 + index - TPC_ID_DCORE1_TPC0;
+ case TPC_ID_DCORE2_TPC0 ... TPC_ID_DCORE2_TPC5:
+ return GAUDI2_DCORE2_ENGINE_ID_TPC_0 + index - TPC_ID_DCORE2_TPC0;
+ case TPC_ID_DCORE3_TPC0 ... TPC_ID_DCORE3_TPC5:
+ return GAUDI2_DCORE3_ENGINE_ID_TPC_0 + index - TPC_ID_DCORE3_TPC0;
+ default:
+ break;
+ }
+ break;
+ case GAUDI2_BLOCK_TYPE_MME:
+ switch (index) {
+ case MME_ID_DCORE0: return GAUDI2_DCORE0_ENGINE_ID_MME;
+ case MME_ID_DCORE1: return GAUDI2_DCORE1_ENGINE_ID_MME;
+ case MME_ID_DCORE2: return GAUDI2_DCORE2_ENGINE_ID_MME;
+ case MME_ID_DCORE3: return GAUDI2_DCORE3_ENGINE_ID_MME;
+ default:
+ break;
+ }
+ break;
+ case GAUDI2_BLOCK_TYPE_DEC:
+ switch (index) {
+ case DEC_ID_DCORE0_DEC0: return GAUDI2_DCORE0_ENGINE_ID_DEC_0;
+ case DEC_ID_DCORE0_DEC1: return GAUDI2_DCORE0_ENGINE_ID_DEC_1;
+ case DEC_ID_DCORE1_DEC0: return GAUDI2_DCORE1_ENGINE_ID_DEC_0;
+ case DEC_ID_DCORE1_DEC1: return GAUDI2_DCORE1_ENGINE_ID_DEC_1;
+ case DEC_ID_DCORE2_DEC0: return GAUDI2_DCORE2_ENGINE_ID_DEC_0;
+ case DEC_ID_DCORE2_DEC1: return GAUDI2_DCORE2_ENGINE_ID_DEC_1;
+ case DEC_ID_DCORE3_DEC0: return GAUDI2_DCORE3_ENGINE_ID_DEC_0;
+ case DEC_ID_DCORE3_DEC1: return GAUDI2_DCORE3_ENGINE_ID_DEC_1;
+ case DEC_ID_PCIE_VDEC0: return GAUDI2_PCIE_ENGINE_ID_DEC_0;
+ case DEC_ID_PCIE_VDEC1: return GAUDI2_PCIE_ENGINE_ID_DEC_1;
+ default:
+ break;
+ }
+ break;
+ default:
+ break;
+ }
+
+ return U16_MAX;
+}
+
static void gaudi2_handle_eqe(struct hl_device *hdev, struct hl_eq_entry *eq_entry)
{
struct gaudi2_device *gaudi2 = hdev->asic_specific;
}
}
+ if (event_mask & HL_NOTIFIER_EVENT_USER_ENGINE_ERR)
+ hl_capture_engine_err(hdev, event_id_to_engine_id(hdev, event_type), error_count);
+
/* Make sure to dump an error in case no error cause was printed so far.
* Note that although we have counted the errors, we use this number as
* a boolean.