accel/habanalabs/gaudi2: add eq health check using irq
author farah kassabri <fkassabri@habana.ai>
Wed, 23 Aug 2023 09:36:25 +0000 (12:36 +0300)
committer Oded Gabbay <ogabbay@kernel.org>
Mon, 9 Oct 2023 09:37:21 +0000 (12:37 +0300)
This is the second patch of the EQ health check mechanism. It adds
support for the interrupt flow on the gaudi2 ASIC.

More info about the interrupt mechanism:
Set a dedicated MSI-X vector for the EQ error interrupt and add an
interrupt handler for it.
When the FW detects an issue with the EQ, such as EQ_FULL, it raises
that interrupt and the driver should reset the device.
The driver informs the FW which MSI-X index to use through the
already existing handshake mechanism, which sends an MSI-X info
message to the FW.

Signed-off-by: farah kassabri <fkassabri@habana.ai>
Reviewed-by: Oded Gabbay <ogabbay@kernel.org>
Signed-off-by: Oded Gabbay <ogabbay@kernel.org>
drivers/accel/habanalabs/common/habanalabs.h
drivers/accel/habanalabs/common/irq.c
drivers/accel/habanalabs/gaudi2/gaudi2.c
drivers/accel/habanalabs/gaudi2/gaudi2P.h
include/linux/habanalabs/cpucp_if.h

index e5b416852996faeed53696c60388530be854c89d..6f2cbd3c2e95d3f5d894ce601805875944cae09a 100644 (file)
@@ -3689,6 +3689,7 @@ irqreturn_t hl_irq_handler_eq(int irq, void *arg);
 irqreturn_t hl_irq_handler_dec_abnrm(int irq, void *arg);
 irqreturn_t hl_irq_handler_user_interrupt(int irq, void *arg);
 irqreturn_t hl_irq_user_interrupt_thread_handler(int irq, void *arg);
+irqreturn_t hl_irq_eq_error_interrupt_thread_handler(int irq, void *arg);
 u32 hl_cq_inc_ptr(u32 ptr);
 
 int hl_asid_init(struct hl_device *hdev);
index 10ac100bf9e24e274e07c3c6f39490a6f65a99a9..f6b6c54bc868a9b4e298618dc5c13852e69ba15d 100644 (file)
@@ -401,6 +401,18 @@ irqreturn_t hl_irq_user_interrupt_thread_handler(int irq, void *arg)
        return IRQ_HANDLED;
 }
 
+irqreturn_t hl_irq_eq_error_interrupt_thread_handler(int irq, void *arg)
+{
+       u64 event_mask = HL_NOTIFIER_EVENT_DEVICE_RESET | HL_NOTIFIER_EVENT_DEVICE_UNAVAILABLE;
+       struct hl_device *hdev = arg;
+
+       dev_err(hdev->dev, "EQ error interrupt received\n");
+
+       hl_device_cond_reset(hdev, HL_DRV_RESET_HARD, event_mask);
+
+       return IRQ_HANDLED;
+}
+
 /**
  * hl_irq_handler_eq - irq handler for event queue
  *
index e507847bf460e3346a0ef9acbb7f040b020f7a37..b0ba62b691ece32aa1a666114e0967bae10423b5 100644 (file)
@@ -4175,6 +4175,8 @@ static const char *gaudi2_irq_name(u16 irq_number)
                return "gaudi2 unexpected error";
        case GAUDI2_IRQ_NUM_USER_FIRST ... GAUDI2_IRQ_NUM_USER_LAST:
                return "gaudi2 user completion";
+       case GAUDI2_IRQ_NUM_EQ_ERROR:
+               return "gaudi2 eq error";
        default:
                return "invalid";
        }
@@ -4317,6 +4319,15 @@ static int gaudi2_enable_msix(struct hl_device *hdev)
                }
        }
 
+       irq = pci_irq_vector(hdev->pdev, GAUDI2_IRQ_NUM_EQ_ERROR);
+       rc = request_threaded_irq(irq, NULL, hl_irq_eq_error_interrupt_thread_handler,
+                                       IRQF_ONESHOT, gaudi2_irq_name(GAUDI2_IRQ_NUM_EQ_ERROR),
+                                       hdev);
+       if (rc) {
+               dev_err(hdev->dev, "Failed to request IRQ %d", irq);
+               goto free_user_irq;
+       }
+
        gaudi2->hw_cap_initialized |= HW_CAP_MSIX;
 
        return 0;
@@ -4376,6 +4387,7 @@ static void gaudi2_sync_irqs(struct hl_device *hdev)
        }
 
        synchronize_irq(pci_irq_vector(hdev->pdev, GAUDI2_IRQ_NUM_EVENT_QUEUE));
+       synchronize_irq(pci_irq_vector(hdev->pdev, GAUDI2_IRQ_NUM_EQ_ERROR));
 }
 
 static void gaudi2_disable_msix(struct hl_device *hdev)
@@ -4412,6 +4424,9 @@ static void gaudi2_disable_msix(struct hl_device *hdev)
        cq = &hdev->completion_queue[GAUDI2_RESERVED_CQ_CS_COMPLETION];
        free_irq(irq, cq);
 
+       irq = pci_irq_vector(hdev->pdev, GAUDI2_IRQ_NUM_EQ_ERROR);
+       free_irq(irq, hdev);
+
        pci_free_irq_vectors(hdev->pdev);
 
        gaudi2->hw_cap_initialized &= ~HW_CAP_MSIX;
@@ -11345,6 +11360,7 @@ static int gaudi2_ack_mmu_page_fault_or_access_error(struct hl_device *hdev, u64
 static void gaudi2_get_msi_info(__le32 *table)
 {
        table[CPUCP_EVENT_QUEUE_MSI_TYPE] = cpu_to_le32(GAUDI2_EVENT_QUEUE_MSIX_IDX);
+       table[CPUCP_EVENT_QUEUE_ERR_MSI_TYPE] = cpu_to_le32(GAUDI2_IRQ_NUM_EQ_ERROR);
 }
 
 static int gaudi2_map_pll_idx_to_fw_idx(u32 pll_idx)
index 4535aa5ab5615f3d319ea58659a7c7514fe0220e..14e281fd9895f97fd1f6669b758f0b299ad912b0 100644 (file)
@@ -419,6 +419,7 @@ enum gaudi2_irq_num {
        GAUDI2_IRQ_NUM_NIC_PORT_FIRST,
        GAUDI2_IRQ_NUM_NIC_PORT_LAST = (GAUDI2_IRQ_NUM_NIC_PORT_FIRST + NIC_NUMBER_OF_PORTS - 1),
        GAUDI2_IRQ_NUM_TPC_ASSERT,
+       GAUDI2_IRQ_NUM_EQ_ERROR,
        GAUDI2_IRQ_NUM_RESERVED_FIRST,
        GAUDI2_IRQ_NUM_RESERVED_LAST = (GAUDI2_MSIX_ENTRIES - GAUDI2_TOTAL_USER_INTERRUPTS - 1),
        GAUDI2_IRQ_NUM_UNEXPECTED_ERROR = RESERVED_MSIX_UNEXPECTED_USER_ERROR_INTERRUPT,
index a18fa81aad1f5c9bbb72e2ded2e028a0e2bb6852..84d74c4ee4d3364cacba0e0db665a8e32d01428d 100644 (file)
@@ -1004,6 +1004,7 @@ enum cpucp_msi_type {
        CPUCP_NIC_PORT5_MSI_TYPE,
        CPUCP_NIC_PORT7_MSI_TYPE,
        CPUCP_NIC_PORT9_MSI_TYPE,
+       CPUCP_EVENT_QUEUE_ERR_MSI_TYPE,
        CPUCP_NUM_OF_MSI_TYPES
 };