}
}
+static void gaudi_print_out_of_sync_info(struct hl_device *hdev,
+ struct cpucp_pkt_sync_err *sync_err)
+{
+ struct hl_hw_queue *q = &hdev->kernel_queues[GAUDI_QUEUE_ID_CPU_PQ];
+
+ dev_err(hdev->dev, "Out of sync with FW, FW: pi=%u, ci=%u, LKD: pi=%u, ci=%u\n",
+ sync_err->pi, sync_err->ci, q->pi, atomic_read(&q->ci));
+}
+
static int gaudi_soft_reset_late_init(struct hl_device *hdev)
{
struct gaudi_device *gaudi = hdev->asic_specific;
event_type, cause);
break;
+ case GAUDI_EVENT_PKT_QUEUE_OUT_SYNC:
+ gaudi_print_irq_info(hdev, event_type, false);
+ gaudi_print_out_of_sync_info(hdev, &eq_entry->pkt_sync_err);
+ if (hdev->hard_reset_on_fw_events)
+ hl_device_reset(hdev, true, false);
+ else
+ hl_fw_unmask_irq(hdev, event_type);
+ break;
+
default:
dev_err(hdev->dev, "Received invalid H/W interrupt %d\n",
event_type);
return "THERMAL_ENV_S";
case GOYA_ASYNC_EVENT_ID_FIX_THERMAL_ENV_E:
return "THERMAL_ENV_E";
+ case GOYA_ASYNC_EVENT_PKT_QUEUE_OUT_SYNC:
+ return "QUEUE_OUT_OF_SYNC";
default:
return "N/A";
}
index = event_type - GOYA_ASYNC_EVENT_ID_DMA_BM_CH0;
snprintf(desc, size, _goya_get_event_desc(event_type), index);
break;
+ case GOYA_ASYNC_EVENT_PKT_QUEUE_OUT_SYNC:
+ snprintf(desc, size, _goya_get_event_desc(event_type));
+ break;
default:
snprintf(desc, size, _goya_get_event_desc(event_type));
break;
}
}
+static void goya_print_out_of_sync_info(struct hl_device *hdev,
+ struct cpucp_pkt_sync_err *sync_err)
+{
+ struct hl_hw_queue *q = &hdev->kernel_queues[GOYA_QUEUE_ID_CPU_PQ];
+
+ dev_err(hdev->dev, "Out of sync with FW, FW: pi=%u, ci=%u, LKD: pi=%u, ci=%u\n",
+ sync_err->pi, sync_err->ci, q->pi, atomic_read(&q->ci));
+}
+
static void goya_print_irq_info(struct hl_device *hdev, u16 event_type,
bool razwi)
{
goya_unmask_irq(hdev, event_type);
break;
+ case GOYA_ASYNC_EVENT_PKT_QUEUE_OUT_SYNC:
+ goya_print_irq_info(hdev, event_type, false);
+ goya_print_out_of_sync_info(hdev, &eq_entry->pkt_sync_err);
+ if (hdev->hard_reset_on_fw_events)
+ hl_device_reset(hdev, true, false);
+ else
+ hl_fw_unmask_irq(hdev, event_type);
+ break;
+
default:
dev_err(hdev->dev, "Received invalid H/W interrupt %d\n",
event_type);
#define CPUCP_PKT_HBM_ECC_INFO_HBM_CH_SHIFT 6
#define CPUCP_PKT_HBM_ECC_INFO_HBM_CH_MASK 0x000007C0
+/*
+ * info of the pkt queue pointers in the first async occurrence
+ */
+struct cpucp_pkt_sync_err {
+ __le32 pi;
+ __le32 ci;
+};
+
struct hl_eq_hbm_ecc_data {
/* SERR counter */
__le32 sec_cnt;
struct hl_eq_ecc_data ecc_data;
struct hl_eq_hbm_ecc_data hbm_ecc_data;
struct hl_eq_sm_sei_data sm_sei_data;
+ struct cpucp_pkt_sync_err pkt_sync_err;
__le64 data[7];
};
};
GAUDI_EVENT_NIC3_QP1 = 619,
GAUDI_EVENT_NIC4_QP0 = 620,
GAUDI_EVENT_NIC4_QP1 = 621,
+ GAUDI_EVENT_PKT_QUEUE_OUT_SYNC = 647,
GAUDI_EVENT_FIX_POWER_ENV_S = 658,
GAUDI_EVENT_FIX_POWER_ENV_E = 659,
GAUDI_EVENT_FIX_THERMAL_ENV_S = 660,
GOYA_ASYNC_EVENT_ID_HALT_MACHINE = 485,
GOYA_ASYNC_EVENT_ID_INTS_REGISTER = 486,
GOYA_ASYNC_EVENT_ID_SOFT_RESET = 487,
+ GOYA_ASYNC_EVENT_PKT_QUEUE_OUT_SYNC = 506,
GOYA_ASYNC_EVENT_ID_FIX_POWER_ENV_S = 507,
GOYA_ASYNC_EVENT_ID_FIX_POWER_ENV_E = 508,
GOYA_ASYNC_EVENT_ID_FIX_THERMAL_ENV_S = 509,