habanalabs/gaudi2: wait for preboot ready if HW state is dirty
author Ohad Sharabi <osharabi@habana.ai>
Wed, 30 Nov 2022 12:26:10 +0000 (14:26 +0200)
committer Oded Gabbay <ogabbay@kernel.org>
Thu, 26 Jan 2023 08:56:22 +0000 (10:56 +0200)
Instead of waiting for BTM indication we should wait for preboot ready.
Consider the below scenario:
    1. FW update is being triggered
           - setting the dirty bit
    2. hard reset will be triggered due to the dirty bit
    3. FW initiates the reset:
           - dirty bit cleared
           - BTM indication cleared
           - preboot ready indication cleared
    4. during hard reset:
           - BTM indication will be set
           - BIST test performed and another reset triggered
    5. only after this reset the preboot will set the preboot ready

When polling on the BTM indication alone we can lose sync with the FW while
trying to communicate with FW that is still in the middle of a reset.
To overcome this we will always wait for the preboot ready indication.

Signed-off-by: Ohad Sharabi <osharabi@habana.ai>
Reviewed-by: Oded Gabbay <ogabbay@kernel.org>
Signed-off-by: Oded Gabbay <ogabbay@kernel.org>
drivers/misc/habanalabs/common/firmware_if.c
drivers/misc/habanalabs/common/habanalabs.h
drivers/misc/habanalabs/gaudi2/gaudi2.c

index 537b1ae3fcb7e6a820d304f61b802b1608cdea8e..cda0bf3dbf1bdffb67bd6d74243bd6825bc4c36e 100644 (file)
@@ -1352,7 +1352,7 @@ static void detect_cpu_boot_status(struct hl_device *hdev, u32 status)
        }
 }
 
-static int hl_fw_wait_preboot_ready(struct hl_device *hdev)
+int hl_fw_wait_preboot_ready(struct hl_device *hdev)
 {
        struct pre_fw_load_props *pre_fw_load = &hdev->fw_loader.pre_fw_load;
        u32 status;
index de715c91a87e2dbb6f7e242176246696786ab441..e5443bf7fe12725661081c8c152294bcdfe60e1f 100644 (file)
@@ -3745,6 +3745,7 @@ int hl_fw_cpucp_power_get(struct hl_device *hdev, u64 *power);
 void hl_fw_ask_hard_reset_without_linux(struct hl_device *hdev);
 void hl_fw_ask_halt_machine_without_linux(struct hl_device *hdev);
 int hl_fw_init_cpu(struct hl_device *hdev);
+int hl_fw_wait_preboot_ready(struct hl_device *hdev);
 int hl_fw_read_preboot_status(struct hl_device *hdev);
 int hl_fw_dynamic_send_protocol_cmd(struct hl_device *hdev,
                                struct fw_load_mgr *fw_loader,
index 85041f33e42ac13d4ebc72b2074d76f5384c74fa..987ec44fa37831b06218a517f25a8d8dc5e398de 100644 (file)
@@ -5484,7 +5484,31 @@ static void gaudi2_hw_fini(struct hl_device *hdev, bool hard_reset, bool fw_rese
 
 skip_reset:
        if (driver_performs_reset || hard_reset)
-               gaudi2_poll_btm_indication(hdev, reset_sleep_ms, poll_timeout_us);
+               /*
+                * Instead of waiting for BTM indication we should wait for preboot ready:
+                * Consider the below scenario:
+                * 1. FW update is being triggered
+                *        - setting the dirty bit
+                * 2. hard reset will be triggered due to the dirty bit
+                * 3. FW initiates the reset:
+                *        - dirty bit cleared
+                *        - BTM indication cleared
+                *        - preboot ready indication cleared
+                * 4. during hard reset:
+                *        - BTM indication will be set
+                *        - BIST test performed and another reset triggered
+                * 5. only after this reset the preboot will set the preboot ready
+                *
+                * when polling on BTM indication alone we can lose sync with FW while trying to
+                * communicate with FW that is during reset.
+                * to overcome this we will always wait to preboot ready indication
+                */
+               if ((hdev->fw_components & FW_TYPE_PREBOOT_CPU)) {
+                       msleep(reset_sleep_ms);
+                       hl_fw_wait_preboot_ready(hdev);
+               } else {
+                       gaudi2_poll_btm_indication(hdev, reset_sleep_ms, poll_timeout_us);
+               }
        else
                gaudi2_get_soft_rst_done_indication(hdev, poll_timeout_us);