habanalabs: remove stop-on-error flag from DMA
author Omer Shpigelman <oshpigelman@habana.ai>
Sun, 22 Mar 2020 19:12:51 +0000 (21:12 +0200)
committer Oded Gabbay <oded.gabbay@gmail.com>
Sun, 17 May 2020 09:06:22 +0000 (12:06 +0300)
Stop-on-error mode in DMA is useful as it stops the transaction
immediately upon an error, e.g. a page fault.
But it may cause the next command submission to fail, as it leaves the DMA
in an unstable state.
Therefore we remove the stop-on-error configuration from the DMA.
Stop-on-err is still available for debug.

Signed-off-by: Omer Shpigelman <oshpigelman@habana.ai>
Reviewed-by: Oded Gabbay <oded.gabbay@gmail.com>
Signed-off-by: Oded Gabbay <oded.gabbay@gmail.com>
Documentation/ABI/testing/debugfs-driver-habanalabs
drivers/misc/habanalabs/debugfs.c
drivers/misc/habanalabs/goya/goya.c
drivers/misc/habanalabs/habanalabs.h
drivers/misc/habanalabs/include/goya/asic_reg/goya_masks.h

index a73601c5121e4773cbe673e876562d13fd41b6d3..67e04f2d7e1da430f83ecb268b4ac02be19d070a 100644 (file)
@@ -150,3 +150,10 @@ KernelVersion:  5.1
 Contact:        oded.gabbay@gmail.com
 Description:    Displays a list with information about all the active virtual
                 address mappings per ASID
+
+What:           /sys/kernel/debug/habanalabs/hl<n>/stop_on_err
+Date:           Mar 2020
+KernelVersion:  5.6
+Contact:        oded.gabbay@gmail.com
+Description:    Sets the stop-on-error option for the device engines. A value
+                of "0" disables it; any other value enables it.
index 756d36ed5d95f767fecd5f3eb395a1a59521ac73..37beff3096f84e508c8543b35fbad12b0d3869a1 100644 (file)
@@ -970,6 +970,49 @@ static ssize_t hl_device_write(struct file *f, const char __user *buf,
        return count;
 }
 
+/*
+ * hl_stop_on_err_read - debugfs read handler for the "stop_on_err" node
+ *
+ * @f: the opened debugfs file; its inode's i_private holds the
+ *     hl_dbg_device_entry of the device
+ * @buf: user buffer to fill
+ * @count: size of the user buffer
+ * @ppos: file position, advanced by the number of bytes copied
+ *
+ * Formats hdev->stop_on_err as a decimal number followed by a newline and
+ * copies it to the user. The terminating NUL is part of the returned data.
+ *
+ * Return: number of bytes copied, 0 once all the data was consumed, or a
+ * negative error code from simple_read_from_buffer().
+ */
+static ssize_t hl_stop_on_err_read(struct file *f, char __user *buf,
+                                       size_t count, loff_t *ppos)
+{
+       struct hl_dbg_device_entry *entry = file_inode(f)->i_private;
+       struct hl_device *hdev = entry->hdev;
+       char tmp_buf[200];
+       size_t len;
+
+       /* +1 so the terminating NUL is handed to the reader as well */
+       len = sprintf(tmp_buf, "%d\n", hdev->stop_on_err) + 1;
+
+       /*
+        * Bound the copy by the caller's count so we never write past the
+        * buffer the user actually supplied. simple_read_from_buffer() also
+        * does the *ppos bookkeeping: it resumes a short read where it left
+        * off and returns 0 once *ppos reaches len.
+        */
+       return simple_read_from_buffer(buf, count, ppos, tmp_buf, len);
+}
+
+/*
+ * hl_stop_on_err_write - debugfs write handler for the "stop_on_err" node
+ *
+ * @f: the opened debugfs file; its inode's i_private holds the
+ *     hl_dbg_device_entry of the device
+ * @buf: user buffer holding an unsigned decimal number
+ * @count: number of bytes in the user buffer
+ * @ppos: file position (unused)
+ *
+ * Stores 1 in hdev->stop_on_err for any non-zero input and 0 otherwise, and
+ * then calls hl_device_reset(). If the device is already in reset, a
+ * rate-limited warning is printed and the input is not parsed at all.
+ *
+ * Return: count on success, 0 if the device is in reset, or the negative
+ * error code returned by kstrtouint_from_user().
+ */
+static ssize_t hl_stop_on_err_write(struct file *f, const char __user *buf,
+                                    size_t count, loff_t *ppos)
+{
+       struct hl_dbg_device_entry *dbg_entry = file_inode(f)->i_private;
+       struct hl_device *hdev = dbg_entry->hdev;
+       u32 requested;
+       int err;
+
+       /*
+        * NOTE(review): a write handler that returns 0 for a non-empty buffer
+        * may be retried by user space - confirm this is intended rather than
+        * an error code such as -EBUSY.
+        */
+       if (atomic_read(&hdev->in_reset)) {
+               dev_warn_ratelimited(hdev->dev,
+                               "Can't change stop on error during reset\n");
+               return 0;
+       }
+
+       err = kstrtouint_from_user(buf, count, 10, &requested);
+       if (err)
+               return err;
+
+       /* Normalize any non-zero input to 1 */
+       hdev->stop_on_err = !!requested;
+
+       /*
+        * Presumably the reset makes the engines get re-initialized with the
+        * new setting (goya_init_dma_qman() reads this flag) - TODO confirm.
+        */
+       hl_device_reset(hdev, false, false);
+
+       return count;
+}
+
 static const struct file_operations hl_data32b_fops = {
        .owner = THIS_MODULE,
        .read = hl_data_read32,
@@ -1015,6 +1058,12 @@ static const struct file_operations hl_device_fops = {
        .write = hl_device_write
 };
 
+/* File operations backing the "stop_on_err" debugfs node */
+static const struct file_operations hl_stop_on_err_fops = {
+       .owner = THIS_MODULE,
+       .write = hl_stop_on_err_write,
+       .read = hl_stop_on_err_read,
+};
+
 static const struct hl_info_list hl_debugfs_list[] = {
        {"command_buffers", command_buffers_show, NULL},
        {"command_submission", command_submission_show, NULL},
@@ -1152,6 +1201,12 @@ void hl_debugfs_add_device(struct hl_device *hdev)
                                dev_entry,
                                &hl_device_fops);
 
+       debugfs_create_file("stop_on_err",
+                               0644,
+                               dev_entry->root,
+                               dev_entry,
+                               &hl_stop_on_err_fops);
+
        for (i = 0, entry = dev_entry->entry_arr ; i < count ; i++, entry++) {
 
                ent = debugfs_create_file(hl_debugfs_list[i].name,
index db125cf80850a8645a13b28867174cc0cec285b1..08f1d40800081cd7700caa080600808ac629006a 100644 (file)
@@ -800,6 +800,7 @@ static void goya_init_dma_qman(struct hl_device *hdev, int dma_id,
        u32 so_base_lo, so_base_hi;
        u32 gic_base_lo, gic_base_hi;
        u32 reg_off = dma_id * (mmDMA_QM_1_PQ_PI - mmDMA_QM_0_PQ_PI);
+       u32 dma_err_cfg = QMAN_DMA_ERR_MSG_EN;
 
        mtr_base_lo = lower_32_bits(CFG_BASE + mmSYNC_MNGR_MON_PAY_ADDRL_0);
        mtr_base_hi = upper_32_bits(CFG_BASE + mmSYNC_MNGR_MON_PAY_ADDRL_0);
@@ -836,7 +837,10 @@ static void goya_init_dma_qman(struct hl_device *hdev, int dma_id,
        else
                WREG32(mmDMA_QM_0_GLBL_PROT + reg_off, QMAN_DMA_FULLY_TRUSTED);
 
-       WREG32(mmDMA_QM_0_GLBL_ERR_CFG + reg_off, QMAN_DMA_ERR_MSG_EN);
+       if (hdev->stop_on_err)
+               dma_err_cfg |= 1 << DMA_QM_0_GLBL_ERR_CFG_DMA_STOP_ON_ERR_SHIFT;
+
+       WREG32(mmDMA_QM_0_GLBL_ERR_CFG + reg_off, dma_err_cfg);
        WREG32(mmDMA_QM_0_GLBL_CFG0 + reg_off, QMAN_DMA_ENABLE);
 }
 
index 31ebcf9458fe18551532d11cb4102dda0929e1e2..ae3db8eb2fb5737940c7552e5e329aa120b7c124 100644 (file)
@@ -1300,6 +1300,7 @@ struct hl_device_idle_busy_ts {
  * @in_debug: is device under debug. This, together with fpriv_list, enforces
  *            that only a single user is configuring the debug infrastructure.
  * @cdev_sysfs_created: were char devices and sysfs nodes created.
+ * @stop_on_err: true if engines should stop on error.
  */
 struct hl_device {
        struct pci_dev                  *pdev;
@@ -1380,6 +1381,7 @@ struct hl_device {
        u8                              dma_mask;
        u8                              in_debug;
        u8                              cdev_sysfs_created;
+       u8                              stop_on_err;
 
        /* Parameters for bring-up */
        u8                              mmu_enable;
index 3c44ef3a23ed8d098d9811ca47d7fd5e49058c15..067489bd048e25b0202f77b0441bd8fc4ea26d33 100644 (file)
@@ -55,8 +55,7 @@
        (1 << DMA_QM_0_GLBL_ERR_CFG_DMA_ERR_MSG_EN_SHIFT) | \
        (1 << DMA_QM_0_GLBL_ERR_CFG_PQF_STOP_ON_ERR_SHIFT) | \
        (1 << DMA_QM_0_GLBL_ERR_CFG_CQF_STOP_ON_ERR_SHIFT) | \
-       (1 << DMA_QM_0_GLBL_ERR_CFG_CP_STOP_ON_ERR_SHIFT) | \
-       (1 << DMA_QM_0_GLBL_ERR_CFG_DMA_STOP_ON_ERR_SHIFT))
+       (1 << DMA_QM_0_GLBL_ERR_CFG_CP_STOP_ON_ERR_SHIFT))
 
 #define QMAN_MME_ENABLE                (\
        (1 << MME_QM_GLBL_CFG0_PQF_EN_SHIFT) | \