habanalabs: improve communication protocol with cpucp
author Ofir Bitton <obitton@habana.ai>
Thu, 28 Jan 2021 14:30:25 +0000 (16:30 +0200)
committer Oded Gabbay <ogabbay@kernel.org>
Mon, 8 Feb 2021 16:20:08 +0000 (18:20 +0200)
The current messaging communication protocol with cpucp can get out
of sync due to coherency issues. In order to improve the protocol
reliability, we modify the protocol to expect a different
acknowledgment for every packet sent to cpucp.

Signed-off-by: Ofir Bitton <obitton@habana.ai>
Reviewed-by: Oded Gabbay <ogabbay@kernel.org>
Signed-off-by: Oded Gabbay <ogabbay@kernel.org>
drivers/misc/habanalabs/common/firmware_if.c
drivers/misc/habanalabs/common/habanalabs.h
drivers/misc/habanalabs/gaudi/gaudi.c
drivers/misc/habanalabs/goya/goya.c
drivers/misc/habanalabs/include/common/hl_boot_if.h

index ba6920f2b4ab6451f1a6d6e22174e13e4580d471..31b52a223f0215d869a9d05aa21fa0f3105f1ddd 100644 (file)
@@ -90,9 +90,10 @@ int hl_fw_send_pci_access_msg(struct hl_device *hdev, u32 opcode)
 int hl_fw_send_cpu_message(struct hl_device *hdev, u32 hw_queue_id, u32 *msg,
                                u16 len, u32 timeout, u64 *result)
 {
+       struct hl_hw_queue *queue = &hdev->kernel_queues[hw_queue_id];
        struct cpucp_packet *pkt;
        dma_addr_t pkt_dma_addr;
-       u32 tmp;
+       u32 tmp, expected_ack_val;
        int rc = 0;
 
        pkt = hdev->asic_funcs->cpu_accessible_dma_pool_alloc(hdev, len,
@@ -115,14 +116,22 @@ int hl_fw_send_cpu_message(struct hl_device *hdev, u32 hw_queue_id, u32 *msg,
                goto out;
        }
 
+       /* set fence to a non valid value */
+       pkt->fence = UINT_MAX;
+
        rc = hl_hw_queue_send_cb_no_cmpl(hdev, hw_queue_id, len, pkt_dma_addr);
        if (rc) {
                dev_err(hdev->dev, "Failed to send CB on CPU PQ (%d)\n", rc);
                goto out;
        }
 
+       if (hdev->asic_prop.fw_cpucp_ack_with_pi)
+               expected_ack_val = queue->pi;
+       else
+               expected_ack_val = CPUCP_PACKET_FENCE_VAL;
+
        rc = hl_poll_timeout_memory(hdev, &pkt->fence, tmp,
-                               (tmp == CPUCP_PACKET_FENCE_VAL), 1000,
+                               (tmp == expected_ack_val), 1000,
                                timeout, true);
 
        hl_hw_queue_inc_ci_kernel(hdev, hw_queue_id);
@@ -777,6 +786,10 @@ int hl_fw_init_cpu(struct hl_device *hdev, u32 cpu_boot_status_reg,
                                CPU_BOOT_DEV_STS0_FW_HARD_RST_EN)
                        prop->hard_reset_done_by_fw = true;
 
+               if (prop->fw_boot_cpu_security_map &
+                               CPU_BOOT_DEV_STS0_PKT_PI_ACK_EN)
+                       prop->fw_cpucp_ack_with_pi = true;
+
                dev_dbg(hdev->dev,
                        "Firmware boot CPU security status %#x\n",
                        prop->fw_boot_cpu_security_map);
index 30f32f2edb8a2e1b76ef9bcc531040556b3d5115..3c54010f7ab9ce9dc5cd1a15a5709a362e4a9307 100644 (file)
@@ -419,6 +419,8 @@ struct hl_mmu_properties {
  *                            from BOOT_DEV_STS0
  * @dram_supports_virtual_memory: is there an MMU towards the DRAM
  * @hard_reset_done_by_fw: true if firmware is handling hard reset flow
+ * @fw_cpucp_ack_with_pi: true if cpucp is acking messages with the PQ PI
+ *                        instead of a magic number
  * @num_functional_hbms: number of functional HBMs in each DCORE.
  */
 struct asic_fixed_properties {
@@ -479,6 +481,7 @@ struct asic_fixed_properties {
        u8                              fw_security_status_valid;
        u8                              dram_supports_virtual_memory;
        u8                              hard_reset_done_by_fw;
+       u8                              fw_cpucp_ack_with_pi;
        u8                              num_functional_hbms;
 };
 
index 52fcaf25531a588b9960066bd734cacfb031ddab..006c34ae35c2c891429c216727c4cd742ecdd5cd 100644 (file)
@@ -533,6 +533,7 @@ static int gaudi_get_fixed_properties(struct hl_device *hdev)
        prop->fw_security_disabled = true;
        prop->fw_security_status_valid = false;
        prop->hard_reset_done_by_fw = false;
+       prop->fw_cpucp_ack_with_pi = false;
 
        return 0;
 }
@@ -4438,9 +4439,12 @@ static void gaudi_ring_doorbell(struct hl_device *hdev, u32 hw_queue_id, u32 pi)
        /* ring the doorbell */
        WREG32(db_reg_offset, db_value);
 
-       if (hw_queue_id == GAUDI_QUEUE_ID_CPU_PQ)
+       if (hw_queue_id == GAUDI_QUEUE_ID_CPU_PQ) {
+               /* make sure device CPU will read latest data from host */
+               mb();
                WREG32(mmGIC_DISTRIBUTOR__5_GICD_SETSPI_NSR,
                                GAUDI_EVENT_PI_UPDATE);
+       }
 }
 
 static void gaudi_pqe_write(struct hl_device *hdev, __le64 *pqe,
index a954e7c0237598b3bd0d8dea30613ef4197ba424..53db7e96686647ef8e4796c956ea94a9007e8cd7 100644 (file)
@@ -461,6 +461,7 @@ int goya_get_fixed_properties(struct hl_device *hdev)
        prop->fw_security_disabled = true;
        prop->fw_security_status_valid = false;
        prop->hard_reset_done_by_fw = false;
+       prop->fw_cpucp_ack_with_pi = false;
 
        return 0;
 }
@@ -2806,9 +2807,12 @@ void goya_ring_doorbell(struct hl_device *hdev, u32 hw_queue_id, u32 pi)
        /* ring the doorbell */
        WREG32(db_reg_offset, db_value);
 
-       if (hw_queue_id == GOYA_QUEUE_ID_CPU_PQ)
+       if (hw_queue_id == GOYA_QUEUE_ID_CPU_PQ) {
+               /* make sure device CPU will read latest data from host */
+               mb();
                WREG32(mmGIC_DISTRIBUTOR__5_GICD_SETSPI_NSR,
                                GOYA_ASYNC_EVENT_ID_PI_UPDATE);
+       }
 }
 
 void goya_pqe_write(struct hl_device *hdev, __le64 *pqe, struct hl_bd *bd)
index 57785478a4ef840fd14ebe95e9252d0c8d58881d..e87f5a98e19391570977c5d7bcba18eb249551fe 100644 (file)
  *                                     FW handles HBM ECC indications.
  *                                     Initialized in: linux
  *
+ * CPU_BOOT_DEV_STS0_PKT_PI_ACK_EN     Packets ack value used in the armcpd
+ *                                     is set to the PI counter.
+ *                                     Initialized in: linux
+ *
  * CPU_BOOT_DEV_STS0_ENABLED           Device status register enabled.
  *                                     This is a main indication that the
  *                                     running FW populates the device status
 #define CPU_BOOT_DEV_STS0_SP_SRAM_EN                   (1 << 12)
 #define CPU_BOOT_DEV_STS0_CLK_GATE_EN                  (1 << 13)
 #define CPU_BOOT_DEV_STS0_HBM_ECC_EN                   (1 << 14)
+#define CPU_BOOT_DEV_STS0_PKT_PI_ACK_EN                        (1 << 15)
 #define CPU_BOOT_DEV_STS0_ENABLED                      (1 << 31)
 
 enum cpu_boot_status {