habanalabs: add reset support when user closes FD
author Ofir Bitton <obitton@habana.ai>
Wed, 10 Feb 2021 12:29:33 +0000 (14:29 +0200)
committer Oded Gabbay <ogabbay@kernel.org>
Fri, 9 Apr 2021 11:09:22 +0000 (14:09 +0300)
In order to support command submissions that are done directly from
user space, the driver must perform a soft reset once the user closes
its FD. If the soft reset fails or the device is not idle, a hard reset
should be performed.

Signed-off-by: Ofir Bitton <obitton@habana.ai>
Reviewed-by: Oded Gabbay <ogabbay@kernel.org>
Signed-off-by: Oded Gabbay <ogabbay@kernel.org>
drivers/misc/habanalabs/common/device.c
drivers/misc/habanalabs/common/habanalabs.h

index 334009e83823677b3779b805add6134c6783889e..c74bdf4ae6aa8fb9946f59aa7a97aee08c68718b 100644 (file)
@@ -103,8 +103,24 @@ static int hl_device_release(struct inode *inode, struct file *filp)
                return 0;
        }
 
-       hl_cb_mgr_fini(hpriv->hdev, &hpriv->cb_mgr);
-       hl_ctx_mgr_fini(hpriv->hdev, &hpriv->ctx_mgr);
+       hl_cb_mgr_fini(hdev, &hpriv->cb_mgr);
+       hl_ctx_mgr_fini(hdev, &hpriv->ctx_mgr);
+
+       if (hdev->reset_upon_device_release) {
+               u64 idle_mask[HL_BUSY_ENGINES_MASK_EXT_SIZE] = {0};
+
+               /* We try soft reset first */
+               hl_device_reset(hdev, false, false);
+
+               /* If device is not idle perform hard reset */
+               if (!hdev->asic_funcs->is_device_idle(hdev, idle_mask,
+                               HL_BUSY_ENGINES_MASK_EXT_SIZE, NULL)) {
+                       dev_info(hdev->dev,
+                               "device is not idle (mask %#llx %#llx) after soft reset, performing hard reset",
+                               idle_mask[0], idle_mask[1]);
+                       hl_device_reset(hdev, true, false);
+               }
+       }
 
        hl_hpriv_put(hpriv);
 
index 4b321e4f8059f6735a913534662929c0d92c1982..4fdb4fa5728df9e65f46daa0d28c01aee5bcd876 100644 (file)
@@ -1920,6 +1920,7 @@ struct hl_mmu_funcs {
  * @device_fini_pending: true if device_fini was called and might be
  *                       waiting for the reset thread to finish
  * @supports_staged_submission: true if staged submissions are supported
+ * @reset_upon_device_release: true if reset is required upon device release
  */
 struct hl_device {
        struct pci_dev                  *pdev;
@@ -2026,6 +2027,7 @@ struct hl_device {
        u8                              process_kill_trial_cnt;
        u8                              device_fini_pending;
        u8                              supports_staged_submission;
+       u8                              reset_upon_device_release;
 
        /* Parameters for bring-up */
        u64                             nic_ports_mask;