return;
reschedule:
+ /*
+ * prev_reset_trigger tracks consecutive fatal h/w errors until first
+ * heartbeat immediately post reset.
+ * If control reached here, then at least one heartbeat work has been
+ * scheduled since last reset/init cycle.
+ * So if the device is not already in reset cycle, reset the flag
+ * prev_reset_trigger as no reset occurred with HL_RESET_FW_FATAL_ERR
+ * status for at least one heartbeat. From this point driver restarts
+ * tracking future consecutive fatal errors.
+ */
+ if (!(atomic_read(&hdev->in_reset)))
+ hdev->prev_reset_trigger = HL_RESET_TRIGGER_DEFAULT;
+
schedule_delayed_work(&hdev->work_heartbeat,
usecs_to_jiffies(HL_HEARTBEAT_PER_USEC));
}
mutex_unlock(&hdev->fpriv_list_lock);
}
+static void handle_reset_trigger(struct hl_device *hdev, u32 flags)
+{
+ u32 cur_reset_trigger = HL_RESET_TRIGGER_DEFAULT;
+
+ /*
+ * 'reset cause' is being updated here, because getting here
+ * means that it's the 1st time and the last time we're here
+ * ('in_reset' makes sure of it). This makes sure that
+ * 'reset_cause' will continue holding its 1st recorded reason!
+ */
+ if (flags & HL_RESET_HEARTBEAT) {
+ hdev->curr_reset_cause = HL_RESET_CAUSE_HEARTBEAT;
+ cur_reset_trigger = HL_RESET_HEARTBEAT;
+ } else if (flags & HL_RESET_TDR) {
+ hdev->curr_reset_cause = HL_RESET_CAUSE_TDR;
+ cur_reset_trigger = HL_RESET_TDR;
+ } else if (flags & HL_RESET_FW_FATAL_ERR) {
+ hdev->curr_reset_cause = HL_RESET_CAUSE_UNKNOWN;
+ cur_reset_trigger = HL_RESET_FW_FATAL_ERR;
+ } else {
+ hdev->curr_reset_cause = HL_RESET_CAUSE_UNKNOWN;
+ }
+
+ /*
+ * If reset cause is same twice, then reset_trigger_repeated
+ * is set and if this reset is due to a fatal FW error
+ * device is set to an unstable state.
+ */
+ if (hdev->prev_reset_trigger != cur_reset_trigger) {
+ hdev->prev_reset_trigger = cur_reset_trigger;
+ hdev->reset_trigger_repeated = 0;
+ } else {
+ hdev->reset_trigger_repeated = 1;
+ }
+
+ /* If reset is due to heartbeat, device CPU is no responsive in
+ * which case no point sending PCI disable message to it.
+ *
+ * If F/W is performing the reset, no need to send it a message to disable
+ * PCI access
+ */
+ if ((flags & HL_RESET_HARD) &&
+ !(flags & (HL_RESET_HEARTBEAT | HL_RESET_FW))) {
+ /* Disable PCI access from device F/W so he won't send
+ * us additional interrupts. We disable MSI/MSI-X at
+ * the halt_engines function and we can't have the F/W
+ * sending us interrupts after that. We need to disable
+ * the access here because if the device is marked
+ * disable, the message won't be send. Also, in case
+ * of heartbeat, the device CPU is marked as disable
+ * so this message won't be sent
+ */
+ if (hl_fw_send_pci_access_msg(hdev,
+ CPUCP_PACKET_DISABLE_PCI_ACCESS))
+ dev_warn(hdev->dev,
+ "Failed to disable PCI access by F/W\n");
+ }
+}
+
/*
* hl_device_reset - reset the device
*
if (rc)
return 0;
- /*
- * 'reset cause' is being updated here, because getting here
- * means that it's the 1st time and the last time we're here
- * ('in_reset' makes sure of it). This makes sure that
- * 'reset_cause' will continue holding its 1st recorded reason!
- */
- if (flags & HL_RESET_HEARTBEAT)
- hdev->curr_reset_cause = HL_RESET_CAUSE_HEARTBEAT;
- else if (flags & HL_RESET_TDR)
- hdev->curr_reset_cause = HL_RESET_CAUSE_TDR;
- else
- hdev->curr_reset_cause = HL_RESET_CAUSE_UNKNOWN;
-
- /* If reset is due to heartbeat, device CPU is no responsive in
- * which case no point sending PCI disable message to it.
- *
- * If F/W is performing the reset, no need to send it a message to disable
- * PCI access
- */
- if (hard_reset && !(flags & (HL_RESET_HEARTBEAT | HL_RESET_FW))) {
- /* Disable PCI access from device F/W so he won't send
- * us additional interrupts. We disable MSI/MSI-X at
- * the halt_engines function and we can't have the F/W
- * sending us interrupts after that. We need to disable
- * the access here because if the device is marked
- * disable, the message won't be send. Also, in case
- * of heartbeat, the device CPU is marked as disable
- * so this message won't be sent
- */
- if (hl_fw_send_pci_access_msg(hdev,
- CPUCP_PACKET_DISABLE_PCI_ACCESS))
- dev_warn(hdev->dev,
- "Failed to disable PCI access by F/W\n");
- }
+ handle_reset_trigger(hdev, flags);
/* This also blocks future CS/VM/JOB completion operations */
hdev->disabled = true;
hdev->device_cpu_disabled = false;
hdev->hard_reset_pending = false;
+ if (hdev->reset_trigger_repeated &&
+ (hdev->prev_reset_trigger == HL_RESET_FW_FATAL_ERR)) {
+ /* if there 2 back to back resets from FW,
+ * ensure driver puts the driver in a unusable state
+ */
+ dev_crit(hdev->dev,
+ "Consecutive FW fatal errors received, stopping hard reset\n");
+ rc = -EIO;
+ goto out_err;
+ }
+
if (hdev->kernel_ctx) {
dev_crit(hdev->dev,
"kernel ctx was alive during hard reset, something is terribly wrong\n");
#define HL_STATE_DUMP_HIST_LEN 5
+/* Default value for device reset trigger , an invalid value */
+#define HL_RESET_TRIGGER_DEFAULT 0xFF
+
#define OBJ_NAMES_HASH_TABLE_BITS 7 /* 1 << 7 buckets */
#define SYNC_TO_ENGINE_HASH_TABLE_BITS 7 /* 1 << 7 buckets */
* - HL_RESET_FW
* F/W will perform the reset. No need to ask it to reset the device. This is relevant
* only when running with secured f/w
+ *
+ * - HL_RESET_FW_FATAL_ERR
+ * Set if reset is due to a fatal error from FW
*/
+
#define HL_RESET_HARD (1 << 0)
#define HL_RESET_FROM_RESET_THREAD (1 << 1)
#define HL_RESET_HEARTBEAT (1 << 2)
#define HL_RESET_TDR (1 << 3)
#define HL_RESET_DEVICE_RELEASE (1 << 4)
#define HL_RESET_FW (1 << 5)
+#define HL_RESET_FW_FATAL_ERR (1 << 6)
#define HL_MAX_SOBS_PER_MONITOR 8
* @supports_staged_submission: true if staged submissions are supported
* @curr_reset_cause: saves an enumerated reset cause when a hard reset is
* triggered, and cleared after it is shared with preboot.
+ * @prev_reset_trigger: saves the previous trigger which caused a reset, overidden
+ * with a new value on next reset
+ * @reset_trigger_repeated: set if device reset is triggered more than once with
+ * same cause.
* @skip_reset_on_timeout: Skip device reset if CS has timed out, wait for it to
* complete instead.
* @device_cpu_is_halted: Flag to indicate whether the device CPU was already
u8 device_fini_pending;
u8 supports_staged_submission;
u8 curr_reset_cause;
+ u8 prev_reset_trigger;
+ u8 reset_trigger_repeated;
u8 skip_reset_on_timeout;
u8 device_cpu_is_halted;
u8 supports_wait_for_multi_cs;
{
struct gaudi_device *gaudi = hdev->asic_specific;
u32 ctl = le32_to_cpu(eq_entry->hdr.ctl);
+ u32 fw_fatal_err_flag = 0;
u16 event_type = ((ctl & EQ_CTL_EVENT_TYPE_MASK)
>> EQ_CTL_EVENT_TYPE_SHIFT);
bool reset_required;
case GAUDI_EVENT_NIC0_CS_DBG_DERR ... GAUDI_EVENT_NIC4_CS_DBG_DERR:
gaudi_print_irq_info(hdev, event_type, true);
gaudi_handle_ecc_event(hdev, event_type, &eq_entry->ecc_data);
+ fw_fatal_err_flag = HL_RESET_FW_FATAL_ERR;
goto reset_device;
case GAUDI_EVENT_GIC500:
case GAUDI_EVENT_L2_RAM_ECC:
case GAUDI_EVENT_PLL0 ... GAUDI_EVENT_PLL17:
gaudi_print_irq_info(hdev, event_type, false);
+ fw_fatal_err_flag = HL_RESET_FW_FATAL_ERR;
goto reset_device;
case GAUDI_EVENT_HBM0_SPI_0:
gaudi_hbm_read_interrupts(hdev,
gaudi_hbm_event_to_dev(event_type),
&eq_entry->hbm_ecc_data);
+ fw_fatal_err_flag = HL_RESET_FW_FATAL_ERR;
goto reset_device;
case GAUDI_EVENT_HBM0_SPI_1:
reset_device:
if (hdev->asic_prop.fw_security_enabled)
- hl_device_reset(hdev, HL_RESET_HARD | HL_RESET_FW);
+ hl_device_reset(hdev, HL_RESET_HARD | HL_RESET_FW | fw_fatal_err_flag);
else if (hdev->hard_reset_on_fw_events)
- hl_device_reset(hdev, HL_RESET_HARD);
+ hl_device_reset(hdev, HL_RESET_HARD | fw_fatal_err_flag);
else
hl_fw_unmask_irq(hdev, event_type);
}