u8 type;
};
+#define MAX_QMAN_STREAMS_INFO 4
+#define OPCODE_INFO_MAX_ADDR_SIZE 8
+/**
+ * struct undefined_opcode_info - info about last undefined opcode error
+ * @timestamp: timestamp of the undefined opcode error
+ * @cb_addr_streams: CB addresses (per stream) that are currently exists in the PQ
+ * entiers. In case all streams array entries are
+ * filled with values, it means the execution was in Lower-CP.
+ * @cq_addr: the address of the current handled command buffer
+ * @cq_size: the size of the current handled command buffer
+ * @cb_addr_streams_len: num of streams - actual len of cb_addr_streams array.
+ * should be equal to 1 incase of undefined opcode
+ * in Upper-CP (specific stream) and equal to 4 incase
+ * of undefined opcode in Lower-CP.
+ * @engine_id: engine-id that the error occurred on
+ * @stream_id: the stream id the error occurred on. In case the stream equals to
+ * MAX_QMAN_STREAMS_INFO it means the error occurred on a Lower-CP.
+ * @write_enable: if set, writing to undefined opcode parameters in the structure
+ * is enable so the first (root cause) undefined opcode will not be
+ * overwritten.
+ */
+struct undefined_opcode_info {
+ ktime_t timestamp;
+ u64 cb_addr_streams[MAX_QMAN_STREAMS_INFO][OPCODE_INFO_MAX_ADDR_SIZE];
+ u64 cq_addr;
+ u32 cq_size;
+ u32 cb_addr_streams_len;
+ u32 engine_id;
+ u32 stream_id;
+ bool write_enable;
+};
+
/**
* struct last_error_session_info - info about last session errors occurred.
* @cs_timeout: CS timeout error last information.
* @razwi: razwi last information.
+ * @undef_opcode: undefined opcode information
*/
struct last_error_session_info {
- struct cs_timeout_info cs_timeout;
- struct razwi_info razwi;
+ struct cs_timeout_info cs_timeout;
+ struct razwi_info razwi;
+ struct undefined_opcode_info undef_opcode;
};
/**
int hl_build_hwmon_channel_info(struct hl_device *hdev,
struct cpucp_sensor *sensors_arr);
-void hl_notifier_event_send_all(struct hl_device *hdev, u64 event);
+void hl_notifier_event_send_all(struct hl_device *hdev, u64 event_mask);
int hl_sysfs_init(struct hl_device *hdev);
void hl_sysfs_fini(struct hl_device *hdev);
[SP_NUM_CORES] = 1,
};
+static const int gaudi_queue_id_to_engine_id[] = {
+ [GAUDI_QUEUE_ID_DMA_0_0...GAUDI_QUEUE_ID_DMA_0_3] = GAUDI_ENGINE_ID_DMA_0,
+ [GAUDI_QUEUE_ID_DMA_1_0...GAUDI_QUEUE_ID_DMA_1_3] = GAUDI_ENGINE_ID_DMA_1,
+ [GAUDI_QUEUE_ID_CPU_PQ] = GAUDI_ENGINE_ID_SIZE,
+ [GAUDI_QUEUE_ID_DMA_2_0...GAUDI_QUEUE_ID_DMA_2_3] = GAUDI_ENGINE_ID_DMA_2,
+ [GAUDI_QUEUE_ID_DMA_3_0...GAUDI_QUEUE_ID_DMA_3_3] = GAUDI_ENGINE_ID_DMA_3,
+ [GAUDI_QUEUE_ID_DMA_4_0...GAUDI_QUEUE_ID_DMA_4_3] = GAUDI_ENGINE_ID_DMA_4,
+ [GAUDI_QUEUE_ID_DMA_5_0...GAUDI_QUEUE_ID_DMA_5_3] = GAUDI_ENGINE_ID_DMA_5,
+ [GAUDI_QUEUE_ID_DMA_6_0...GAUDI_QUEUE_ID_DMA_6_3] = GAUDI_ENGINE_ID_DMA_6,
+ [GAUDI_QUEUE_ID_DMA_7_0...GAUDI_QUEUE_ID_DMA_7_3] = GAUDI_ENGINE_ID_DMA_7,
+ [GAUDI_QUEUE_ID_MME_0_0...GAUDI_QUEUE_ID_MME_0_3] = GAUDI_ENGINE_ID_MME_0,
+ [GAUDI_QUEUE_ID_MME_1_0...GAUDI_QUEUE_ID_MME_1_3] = GAUDI_ENGINE_ID_MME_1,
+ [GAUDI_QUEUE_ID_TPC_0_0...GAUDI_QUEUE_ID_TPC_0_3] = GAUDI_ENGINE_ID_TPC_0,
+ [GAUDI_QUEUE_ID_TPC_1_0...GAUDI_QUEUE_ID_TPC_1_3] = GAUDI_ENGINE_ID_TPC_1,
+ [GAUDI_QUEUE_ID_TPC_2_0...GAUDI_QUEUE_ID_TPC_2_3] = GAUDI_ENGINE_ID_TPC_2,
+ [GAUDI_QUEUE_ID_TPC_3_0...GAUDI_QUEUE_ID_TPC_3_3] = GAUDI_ENGINE_ID_TPC_3,
+ [GAUDI_QUEUE_ID_TPC_4_0...GAUDI_QUEUE_ID_TPC_4_3] = GAUDI_ENGINE_ID_TPC_4,
+ [GAUDI_QUEUE_ID_TPC_5_0...GAUDI_QUEUE_ID_TPC_5_3] = GAUDI_ENGINE_ID_TPC_5,
+ [GAUDI_QUEUE_ID_TPC_6_0...GAUDI_QUEUE_ID_TPC_6_3] = GAUDI_ENGINE_ID_TPC_6,
+ [GAUDI_QUEUE_ID_TPC_7_0...GAUDI_QUEUE_ID_TPC_7_3] = GAUDI_ENGINE_ID_TPC_7,
+ [GAUDI_QUEUE_ID_NIC_0_0...GAUDI_QUEUE_ID_NIC_0_3] = GAUDI_ENGINE_ID_NIC_0,
+ [GAUDI_QUEUE_ID_NIC_1_0...GAUDI_QUEUE_ID_NIC_1_3] = GAUDI_ENGINE_ID_NIC_1,
+ [GAUDI_QUEUE_ID_NIC_2_0...GAUDI_QUEUE_ID_NIC_2_3] = GAUDI_ENGINE_ID_NIC_2,
+ [GAUDI_QUEUE_ID_NIC_3_0...GAUDI_QUEUE_ID_NIC_3_3] = GAUDI_ENGINE_ID_NIC_3,
+ [GAUDI_QUEUE_ID_NIC_4_0...GAUDI_QUEUE_ID_NIC_4_3] = GAUDI_ENGINE_ID_NIC_4,
+ [GAUDI_QUEUE_ID_NIC_5_0...GAUDI_QUEUE_ID_NIC_5_3] = GAUDI_ENGINE_ID_NIC_5,
+ [GAUDI_QUEUE_ID_NIC_6_0...GAUDI_QUEUE_ID_NIC_6_3] = GAUDI_ENGINE_ID_NIC_6,
+ [GAUDI_QUEUE_ID_NIC_7_0...GAUDI_QUEUE_ID_NIC_7_3] = GAUDI_ENGINE_ID_NIC_7,
+ [GAUDI_QUEUE_ID_NIC_8_0...GAUDI_QUEUE_ID_NIC_8_3] = GAUDI_ENGINE_ID_NIC_8,
+ [GAUDI_QUEUE_ID_NIC_9_0...GAUDI_QUEUE_ID_NIC_9_3] = GAUDI_ENGINE_ID_NIC_9,
+};
+
/* The order here is opposite to the order of the indexing in the h/w.
* i.e. SYNC_MGR_W_S is actually 0, SYNC_MGR_E_S is 1, etc.
*/
}
/**
- * gaudi_print_sw_config_stream_data - print SW config stream data
+ * gaudi_handle_sw_config_stream_data - print SW config stream data
*
* @hdev: pointer to the habanalabs device structure
* @stream: the QMAN's stream
* @qman_base: base address of QMAN registers block
+ * @event_mask: mask of the last events occurred
*/
-static void gaudi_print_sw_config_stream_data(struct hl_device *hdev, u32 stream,
- u64 qman_base)
+static void gaudi_handle_sw_config_stream_data(struct hl_device *hdev, u32 stream,
+ u64 qman_base, u64 event_mask)
{
u64 cq_ptr_lo, cq_ptr_hi, cq_tsize, cq_ptr;
u32 cq_ptr_lo_off, size;
size = RREG32(cq_tsize);
dev_info(hdev->dev, "stop on err: stream: %u, addr: %#llx, size: %u\n",
stream, cq_ptr, size);
+
+ if (event_mask & HL_NOTIFIER_EVENT_UNDEFINED_OPCODE) {
+ hdev->last_error.undef_opcode.cq_addr = cq_ptr;
+ hdev->last_error.undef_opcode.cq_size = size;
+ hdev->last_error.undef_opcode.stream_id = stream;
+ }
}
/**
- * gaudi_print_last_pqes_on_err - print last PQEs on error
+ * gaudi_handle_last_pqes_on_err - print last PQEs on error
*
* @hdev: pointer to the habanalabs device structure
* @qid_base: first QID of the QMAN (out of 4 streams)
* @stream: the QMAN's stream
* @qman_base: base address of QMAN registers block
+ * @event_mask: mask of the last events occurred
* @pr_sw_conf: if true print the SW config stream data (CQ PTR and SIZE)
*/
-static void gaudi_print_last_pqes_on_err(struct hl_device *hdev, u32 qid_base,
+static void gaudi_handle_last_pqes_on_err(struct hl_device *hdev, u32 qid_base,
u32 stream, u64 qman_base,
+ u64 event_mask,
bool pr_sw_conf)
{
u32 ci, qm_ci_stream_off, queue_len;
struct hl_hw_queue *q;
- u64 pq_ci;
+ u64 pq_ci, addr[PQ_FETCHER_CACHE_SIZE];
int i;
q = &hdev->kernel_queues[qid_base + stream];
hdev->asic_funcs->hw_queues_lock(hdev);
if (pr_sw_conf)
- gaudi_print_sw_config_stream_data(hdev, stream, qman_base);
+ gaudi_handle_sw_config_stream_data(hdev, stream, qman_base, event_mask);
ci = RREG32(pq_ci);
/* we should start printing form ci -1 */
ci = gaudi_queue_idx_dec(ci, queue_len);
+ memset(addr, 0, sizeof(addr));
for (i = 0; i < PQ_FETCHER_CACHE_SIZE; i++) {
struct hl_bd *bd;
- u64 addr;
u32 len;
bd = q->kernel_address;
if (!len)
break;
- addr = le64_to_cpu(bd->ptr);
+ addr[i] = le64_to_cpu(bd->ptr);
dev_info(hdev->dev, "stop on err PQE(stream %u): ci: %u, addr: %#llx, size: %u\n",
- stream, ci, addr, len);
+ stream, ci, addr[i], len);
/* get previous ci, wrap if needed */
ci = gaudi_queue_idx_dec(ci, queue_len);
}
+ if (event_mask & HL_NOTIFIER_EVENT_UNDEFINED_OPCODE) {
+ struct undefined_opcode_info *undef_opcode = &hdev->last_error.undef_opcode;
+ u32 arr_idx = undef_opcode->cb_addr_streams_len;
+
+ if (arr_idx == 0) {
+ undef_opcode->timestamp = ktime_get();
+ undef_opcode->engine_id = gaudi_queue_id_to_engine_id[qid_base];
+ }
+
+ memcpy(undef_opcode->cb_addr_streams[arr_idx], addr, sizeof(addr));
+ undef_opcode->cb_addr_streams_len++;
+ }
+
hdev->asic_funcs->hw_queues_unlock(hdev);
}
/**
- * print_qman_data_on_err - extract QMAN data on error
+ * handle_qman_data_on_err - extract QMAN data on error
*
* @hdev: pointer to the habanalabs device structure
* @qid_base: first QID of the QMAN (out of 4 streams)
* @stream: the QMAN's stream
* @qman_base: base address of QMAN registers block
+ * @event_mask: mask of the last events occurred
*
* This function attempt to exatract as much data as possible on QMAN error.
* On upper CP print the SW config stream data and last 8 PQEs.
* On lower CP print SW config data and last PQEs of ALL 4 upper CPs
*/
-static void print_qman_data_on_err(struct hl_device *hdev, u32 qid_base,
- u32 stream, u64 qman_base)
+static void handle_qman_data_on_err(struct hl_device *hdev, u32 qid_base,
+ u32 stream, u64 qman_base, u64 event_mask)
{
u32 i;
if (stream != QMAN_STREAMS) {
- gaudi_print_last_pqes_on_err(hdev, qid_base, stream, qman_base,
- true);
+ gaudi_handle_last_pqes_on_err(hdev, qid_base, stream,
+ qman_base, event_mask, true);
return;
}
- gaudi_print_sw_config_stream_data(hdev, stream, qman_base);
+ /* handle Lower-CP */
+ gaudi_handle_sw_config_stream_data(hdev, stream, qman_base, event_mask);
for (i = 0; i < QMAN_STREAMS; i++)
- gaudi_print_last_pqes_on_err(hdev, qid_base, i, qman_base,
- false);
+ gaudi_handle_last_pqes_on_err(hdev, qid_base, i,
+ qman_base, event_mask, false);
}
static void gaudi_handle_qman_err_generic(struct hl_device *hdev,
const char *qm_name,
u64 qman_base,
- u32 qid_base)
+ u32 qid_base,
+ u64 *event_mask)
{
u32 i, j, glbl_sts_val, arb_err_val, glbl_sts_clr_val;
u64 glbl_sts_addr, arb_err_addr;
glbl_sts_clr_val |= BIT(j);
}
}
+ /* check for undefined opcode */
+ if (glbl_sts_val & TPC0_QM_GLBL_STS1_CP_UNDEF_CMD_ERR_MASK &&
+ hdev->last_error.undef_opcode.write_enable) {
+ memset(&hdev->last_error.undef_opcode, 0,
+ sizeof(hdev->last_error.undef_opcode));
+
+ hdev->last_error.undef_opcode.write_enable = false;
+ *event_mask |= HL_NOTIFIER_EVENT_UNDEFINED_OPCODE;
+ }
/* Write 1 clear errors */
if (!hdev->stop_on_err)
WREG32(glbl_sts_addr + 4 * i, glbl_sts_clr_val);
else
- print_qman_data_on_err(hdev, qid_base, i, qman_base);
+ handle_qman_data_on_err(hdev, qid_base, i, qman_base, *event_mask);
}
arb_err_val = RREG32(arb_err_addr);
return;
}
- gaudi_handle_qman_err_generic(hdev, desc, qman_base, qid_base);
+ gaudi_handle_qman_err_generic(hdev, desc, qman_base, qid_base, event_mask);
}
static void gaudi_print_irq_info(struct hl_device *hdev, u16 event_type,