habanalabs: enable stop-on-error debugfs setting per ASIC
author Tomer Tayar <ttayar@habana.ai>
Wed, 12 Jan 2022 18:08:01 +0000 (20:08 +0200)
committer Oded Gabbay <ogabbay@kernel.org>
Mon, 28 Feb 2022 12:22:05 +0000 (14:22 +0200)
On Goya and Gaudi, the stop-on-error configuration can be set via
debugfs. However, in future devices, this configuration will always be
enabled.
Restrict the debugfs node to ASICs that support this dynamic
configuration; on any other ASIC, reading or writing the node now
returns -EOPNOTSUPP.

Signed-off-by: Tomer Tayar <ttayar@habana.ai>
Reviewed-by: Oded Gabbay <ogabbay@kernel.org>
Signed-off-by: Oded Gabbay <ogabbay@kernel.org>
Documentation/ABI/testing/debugfs-driver-habanalabs
drivers/misc/habanalabs/common/debugfs.c
drivers/misc/habanalabs/common/habanalabs.h
drivers/misc/habanalabs/gaudi/gaudi.c
drivers/misc/habanalabs/goya/goya.c

index 783001a574b34491eaf576d12caf2b562b2ed72d..bcf6915987e4a489fb20b6658bfa9aec9d313ade 100644 (file)
@@ -222,6 +222,7 @@ KernelVersion:  5.6
 Contact:        ogabbay@kernel.org
 Description:    Sets the stop-on_error option for the device engines. Value of
                 "0" is for disable, otherwise enable.
+                Relevant only for GOYA and GAUDI.
 
 What:           /sys/kernel/debug/habanalabs/hl<n>/timeout_locked
 Date:           Sep 2021
index e3ee5f45d20c6d87b198914438ee323eacd3005b..9f0aaf0ef43b0803a88ec824b3f1f02889870973 100644 (file)
@@ -1071,6 +1071,9 @@ static ssize_t hl_stop_on_err_read(struct file *f, char __user *buf,
        char tmp_buf[200];
        ssize_t rc;
 
+       if (!hdev->asic_prop.configurable_stop_on_err)
+               return -EOPNOTSUPP;
+
        if (*ppos)
                return 0;
 
@@ -1089,6 +1092,9 @@ static ssize_t hl_stop_on_err_write(struct file *f, const char __user *buf,
        u32 value;
        ssize_t rc;
 
+       if (!hdev->asic_prop.configurable_stop_on_err)
+               return -EOPNOTSUPP;
+
        if (hdev->reset_info.in_reset) {
                dev_warn_ratelimited(hdev->dev,
                                "Can't change stop on error during reset\n");
index b06e2b0812b6858655365b1b7ee9713b337fbc7c..93116fe71ef66b1111776c1191892e393782db3e 100644 (file)
@@ -561,6 +561,7 @@ struct hl_hints_range {
  *                              use-case of doing soft-reset in training (due
  *                              to the fact that training runs on multiple
  *                              devices)
+ * @configurable_stop_on_err: is stop-on-error option configurable via debugfs.
  */
 struct asic_fixed_properties {
        struct hw_queue_properties      *hw_queues_props;
@@ -644,6 +645,7 @@ struct asic_fixed_properties {
        u8                              use_get_power_for_reset_history;
        u8                              supports_soft_reset;
        u8                              allow_inference_soft_reset;
+       u8                              configurable_stop_on_err;
 };
 
 /**
index f2242aa3baa226e67b02296e4b116ad505b27a18..61aa6dce6ddeeebd60d15baac5b4dda222ed67be 100644 (file)
@@ -669,6 +669,8 @@ static int gaudi_set_fixed_properties(struct hl_device *hdev)
 
        prop->use_get_power_for_reset_history = true;
 
+       prop->configurable_stop_on_err = true;
+
        return 0;
 }
 
index 3785fb33260d0bcb3bbb0ff8bcb128a887610324..c8143b6616af099e06665fd9732f7b540037037e 100644 (file)
@@ -483,6 +483,8 @@ int goya_set_fixed_properties(struct hl_device *hdev)
 
        prop->use_get_power_for_reset_history = true;
 
+       prop->configurable_stop_on_err = true;
+
        return 0;
 }