habanalabs: refactor reset information variables

author Ofir Bitton <obitton@habana.ai>

Tue, 23 Nov 2021 13:15:22 +0000 (15:15 +0200)

committer Oded Gabbay <ogabbay@kernel.org>

Sun, 26 Dec 2021 12:41:28 +0000 (14:41 +0200)
author Ofir Bitton <obitton@habana.ai>
Tue, 23 Nov 2021 13:15:22 +0000 (15:15 +0200)
committer Oded Gabbay <ogabbay@kernel.org>
Sun, 26 Dec 2021 12:41:28 +0000 (14:41 +0200)
diff --git a/drivers/misc/habanalabs/common/command_buffer.c b/drivers/misc/habanalabs/common/command_buffer.c

index e7534b5129fa11a9055219667de487635a33392e..649380bb189f6fb9664a15ac6bd6aa247130b7ef 100644 (file)
--- a/drivers/misc/habanalabs/common/command_buffer.c
+++ b/drivers/misc/habanalabs/common/command_buffer.c
@@ -250,7 +250,7 @@ int hl_cb_create(struct hl_device *hdev, struct hl_cb_mgr *mgr,
          * Can't use generic function to check this because of special case
          * where we create a CB as part of the reset process
          */
-       if ((hdev->disabled) || ((atomic_read(&hdev->in_reset)) &&
+       if ((hdev->disabled) || ((atomic_read(&hdev->reset_info.in_reset)) &&
                                         (ctx_id != HL_KERNEL_ASID_ID))) {
                 dev_warn_ratelimited(hdev->dev,
                         "Device is disabled or in reset. Can't create new CBs\n");
diff --git a/drivers/misc/habanalabs/common/command_submission.c b/drivers/misc/habanalabs/common/command_submission.c

index d39343f90bc2a252cb7f4abc15f6cad64cc92771..0a4ef13d9ac47256e233ed27d180619201d78e59 100644 (file)
--- a/drivers/misc/habanalabs/common/command_submission.c
+++ b/drivers/misc/habanalabs/common/command_submission.c
@@ -777,7 +777,7 @@ static void cs_timedout(struct work_struct *work)
                 if (hdev->reset_on_lockup)
                         hl_device_reset(hdev, HL_DRV_RESET_TDR);
                 else
-                       hdev->needs_reset = true;
+                       hdev->reset_info.needs_reset = true;
         }
  }
  
@@ -814,7 +814,7 @@ static int allocate_cs(struct hl_device *hdev, struct hl_ctx *ctx,
         cs->encaps_signals = !!(flags & HL_CS_FLAGS_ENCAP_SIGNALS);
         cs->timeout_jiffies = timeout;
         cs->skip_reset_on_timeout =
-               hdev->skip_reset_on_timeout ||
+               hdev->reset_info.skip_reset_on_timeout ||
                 !!(flags & HL_CS_FLAGS_SKIP_RESET_ON_TIMEOUT);
         cs->submission_time_jiffies = jiffies;
         INIT_LIST_HEAD(&cs->job_list);
diff --git a/drivers/misc/habanalabs/common/debugfs.c b/drivers/misc/habanalabs/common/debugfs.c

index 2e9c31d79d5e98c813e3da2a792be5d41bbcc444..746d1a18de63837c29092ab1d770df43eb6aeb29 100644 (file)
--- a/drivers/misc/habanalabs/common/debugfs.c
+++ b/drivers/misc/habanalabs/common/debugfs.c
@@ -527,7 +527,7 @@ static int engines_show(struct seq_file *s, void *data)
         struct hl_dbg_device_entry *dev_entry = entry->dev_entry;
         struct hl_device *hdev = dev_entry->hdev;
  
-       if (atomic_read(&hdev->in_reset)) {
+       if (atomic_read(&hdev->reset_info.in_reset)) {
                 dev_warn_ratelimited(hdev->dev,
                                 "Can't check device idle during reset\n");
                 return 0;
@@ -658,7 +658,7 @@ static ssize_t hl_data_read32(struct file *f, char __user *buf,
         ssize_t rc;
         u32 val;
  
-       if (atomic_read(&hdev->in_reset)) {
+       if (atomic_read(&hdev->reset_info.in_reset)) {
                 dev_warn_ratelimited(hdev->dev, "Can't read during reset\n");
                 return 0;
         }
@@ -694,7 +694,7 @@ static ssize_t hl_data_write32(struct file *f, const char __user *buf,
         u32 value;
         ssize_t rc;
  
-       if (atomic_read(&hdev->in_reset)) {
+       if (atomic_read(&hdev->reset_info.in_reset)) {
                 dev_warn_ratelimited(hdev->dev, "Can't write during reset\n");
                 return 0;
         }
@@ -731,7 +731,7 @@ static ssize_t hl_data_read64(struct file *f, char __user *buf,
         ssize_t rc;
         u64 val;
  
-       if (atomic_read(&hdev->in_reset)) {
+       if (atomic_read(&hdev->reset_info.in_reset)) {
                 dev_warn_ratelimited(hdev->dev, "Can't read during reset\n");
                 return 0;
         }
@@ -767,7 +767,7 @@ static ssize_t hl_data_write64(struct file *f, const char __user *buf,
         u64 value;
         ssize_t rc;
  
-       if (atomic_read(&hdev->in_reset)) {
+       if (atomic_read(&hdev->reset_info.in_reset)) {
                 dev_warn_ratelimited(hdev->dev, "Can't write during reset\n");
                 return 0;
         }
@@ -802,7 +802,7 @@ static ssize_t hl_dma_size_write(struct file *f, const char __user *buf,
         ssize_t rc;
         u32 size;
  
-       if (atomic_read(&hdev->in_reset)) {
+       if (atomic_read(&hdev->reset_info.in_reset)) {
                 dev_warn_ratelimited(hdev->dev, "Can't DMA during reset\n");
                 return 0;
         }
@@ -1077,7 +1077,7 @@ static ssize_t hl_clk_gate_write(struct file *f, const char __user *buf,
         u64 value;
         ssize_t rc;
  
-       if (atomic_read(&hdev->in_reset)) {
+       if (atomic_read(&hdev->reset_info.in_reset)) {
                 dev_warn_ratelimited(hdev->dev,
                                 "Can't change clock gating during reset\n");
                 return 0;
@@ -1119,7 +1119,7 @@ static ssize_t hl_stop_on_err_write(struct file *f, const char __user *buf,
         u32 value;
         ssize_t rc;
  
-       if (atomic_read(&hdev->in_reset)) {
+       if (atomic_read(&hdev->reset_info.in_reset)) {
                 dev_warn_ratelimited(hdev->dev,
                                 "Can't change stop on error during reset\n");
                 return 0;
@@ -1497,7 +1497,7 @@ void hl_debugfs_add_device(struct hl_device *hdev)
         debugfs_create_x8("skip_reset_on_timeout",
                                 0644,
                                 dev_entry->root,
-                               &hdev->skip_reset_on_timeout);
+                               &hdev->reset_info.skip_reset_on_timeout);
  
         debugfs_create_file("state_dump",
                                 0600,
diff --git a/drivers/misc/habanalabs/common/device.c b/drivers/misc/habanalabs/common/device.c

index f1f482c5cdcb67e4bfebd78423934a1f5d9da738..f8f9eb7a934f8af78706d44d493b6b292cc9ed76 100644 (file)
--- a/drivers/misc/habanalabs/common/device.c
+++ b/drivers/misc/habanalabs/common/device.c
@@ -17,9 +17,9 @@ enum hl_device_status hl_device_status(struct hl_device *hdev)
  {
         enum hl_device_status status;
  
-       if (atomic_read(&hdev->in_reset))
+       if (atomic_read(&hdev->reset_info.in_reset))
                 status = HL_DEVICE_STATUS_IN_RESET;
-       else if (hdev->needs_reset)
+       else if (hdev->reset_info.needs_reset)
                 status = HL_DEVICE_STATUS_NEEDS_RESET;
         else if (hdev->disabled)
                 status = HL_DEVICE_STATUS_MALFUNCTION;
@@ -452,7 +452,7 @@ static int device_early_init(struct hl_device *hdev)
         INIT_LIST_HEAD(&hdev->fpriv_ctrl_list);
         mutex_init(&hdev->fpriv_list_lock);
         mutex_init(&hdev->fpriv_ctrl_list_lock);
-       atomic_set(&hdev->in_reset, 0);
+       atomic_set(&hdev->reset_info.in_reset, 0);
         mutex_init(&hdev->clk_throttling.lock);
  
         return 0;
@@ -544,8 +544,8 @@ reschedule:
          * status for at least one heartbeat. From this point driver restarts
          * tracking future consecutive fatal errors.
          */
-       if (!(atomic_read(&hdev->in_reset)))
-               hdev->prev_reset_trigger = HL_RESET_TRIGGER_DEFAULT;
+       if (!(atomic_read(&hdev->reset_info.in_reset)))
+               hdev->reset_info.prev_reset_trigger = HL_RESET_TRIGGER_DEFAULT;
  
         schedule_delayed_work(&hdev->work_heartbeat,
                         usecs_to_jiffies(HL_HEARTBEAT_PER_USEC));
@@ -639,12 +639,12 @@ int hl_device_set_debug_mode(struct hl_device *hdev, struct hl_ctx *ctx, bool en
                         goto out;
                 }
  
-               if (!hdev->hard_reset_pending)
+               if (!hdev->reset_info.hard_reset_pending)
                         hdev->asic_funcs->halt_coresight(hdev, ctx);
  
                 hdev->in_debug = 0;
  
-               if (!hdev->hard_reset_pending)
+               if (!hdev->reset_info.hard_reset_pending)
                         hdev->asic_funcs->set_clock_gating(hdev);
  
                 goto out;
@@ -722,7 +722,7 @@ int hl_device_suspend(struct hl_device *hdev)
         pci_save_state(hdev->pdev);
  
         /* Block future CS/VM/JOB completion operations */
-       rc = atomic_cmpxchg(&hdev->in_reset, 0, 1);
+       rc = atomic_cmpxchg(&hdev->reset_info.in_reset, 0, 1);
         if (rc) {
                 dev_err(hdev->dev, "Can't suspend while in reset\n");
                 return -EIO;
@@ -777,7 +777,7 @@ int hl_device_resume(struct hl_device *hdev)
  
  
         hdev->disabled = false;
-       atomic_set(&hdev->in_reset, 0);
+       atomic_set(&hdev->reset_info.in_reset, 0);
  
         rc = hl_device_reset(hdev, HL_DRV_RESET_HARD);
         if (rc) {
@@ -906,16 +906,16 @@ static void handle_reset_trigger(struct hl_device *hdev, u32 flags)
          * 'reset_cause' will continue holding its 1st recorded reason!
          */
         if (flags & HL_DRV_RESET_HEARTBEAT) {
-               hdev->curr_reset_cause = HL_RESET_CAUSE_HEARTBEAT;
+               hdev->reset_info.curr_reset_cause = HL_RESET_CAUSE_HEARTBEAT;
                 cur_reset_trigger = HL_DRV_RESET_HEARTBEAT;
         } else if (flags & HL_DRV_RESET_TDR) {
-               hdev->curr_reset_cause = HL_RESET_CAUSE_TDR;
+               hdev->reset_info.curr_reset_cause = HL_RESET_CAUSE_TDR;
                 cur_reset_trigger = HL_DRV_RESET_TDR;
         } else if (flags & HL_DRV_RESET_FW_FATAL_ERR) {
-               hdev->curr_reset_cause = HL_RESET_CAUSE_UNKNOWN;
+               hdev->reset_info.curr_reset_cause = HL_RESET_CAUSE_UNKNOWN;
                 cur_reset_trigger = HL_DRV_RESET_FW_FATAL_ERR;
         } else {
-               hdev->curr_reset_cause = HL_RESET_CAUSE_UNKNOWN;
+               hdev->reset_info.curr_reset_cause = HL_RESET_CAUSE_UNKNOWN;
         }
  
         /*
@@ -923,11 +923,11 @@ static void handle_reset_trigger(struct hl_device *hdev, u32 flags)
          * is set and if this reset is due to a fatal FW error
          * device is set to an unstable state.
          */
-       if (hdev->prev_reset_trigger != cur_reset_trigger) {
-               hdev->prev_reset_trigger = cur_reset_trigger;
-               hdev->reset_trigger_repeated = 0;
+       if (hdev->reset_info.prev_reset_trigger != cur_reset_trigger) {
+               hdev->reset_info.prev_reset_trigger = cur_reset_trigger;
+               hdev->reset_info.reset_trigger_repeated = 0;
         } else {
-               hdev->reset_trigger_repeated = 1;
+               hdev->reset_info.reset_trigger_repeated = 1;
         }
  
         /* If reset is due to heartbeat, device CPU is no responsive in
@@ -987,7 +987,7 @@ int hl_device_reset(struct hl_device *hdev, u32 flags)
         from_hard_reset_thread = !!(flags & HL_DRV_RESET_FROM_RESET_THR);
         fw_reset = !!(flags & HL_DRV_RESET_BYPASS_REQ_TO_FW);
  
-       if (!hard_reset && !hdev->supports_soft_reset) {
+       if (!hard_reset && !hdev->asic_prop.supports_soft_reset) {
                 hard_instead_soft = true;
                 hard_reset = true;
         }
@@ -1004,7 +1004,7 @@ int hl_device_reset(struct hl_device *hdev, u32 flags)
                 goto do_reset;
         }
  
-       if (!hard_reset && !hdev->allow_inference_soft_reset) {
+       if (!hard_reset && !hdev->asic_prop.allow_inference_soft_reset) {
                 hard_instead_soft = true;
                 hard_reset = true;
         }
@@ -1024,13 +1024,14 @@ do_reset:
          */
         if (!from_hard_reset_thread) {
                 /* Block future CS/VM/JOB completion operations */
-               rc = atomic_cmpxchg(&hdev->in_reset, 0, 1);
+               rc = atomic_cmpxchg(&hdev->reset_info.in_reset, 0, 1);
                 if (rc)
                         return 0;
  
                 handle_reset_trigger(hdev, flags);
  
-               hdev->is_in_soft_reset = !hard_reset;
+               /* This still allows the completion of some KDMA ops */
+               hdev->reset_info.is_in_soft_reset = !hard_reset;
  
                 /* This also blocks future CS/VM/JOB completion operations */
                 hdev->disabled = true;
@@ -1047,7 +1048,7 @@ do_reset:
  
  again:
         if ((hard_reset) && (!from_hard_reset_thread)) {
-               hdev->hard_reset_pending = true;
+               hdev->reset_info.hard_reset_pending = true;
  
                 hdev->process_kill_trial_cnt = 0;
  
@@ -1128,10 +1129,11 @@ kill_processes:
  
         if (hard_reset) {
                 hdev->device_cpu_disabled = false;
-               hdev->hard_reset_pending = false;
+               hdev->reset_info.hard_reset_pending = false;
  
-               if (hdev->reset_trigger_repeated &&
-                               (hdev->prev_reset_trigger == HL_DRV_RESET_FW_FATAL_ERR)) {
+               if (hdev->reset_info.reset_trigger_repeated &&
+                               (hdev->reset_info.prev_reset_trigger ==
+                                               HL_DRV_RESET_FW_FATAL_ERR)) {
                         /* if there 2 back to back resets from FW,
                          * ensure driver puts the driver in a unusable state
                          */
@@ -1182,7 +1184,7 @@ kill_processes:
          * is required for the initialization itself
          */
         hdev->disabled = false;
-       hdev->is_in_soft_reset = false;
+       hdev->reset_info.is_in_soft_reset = false;
  
         rc = hdev->asic_funcs->hw_init(hdev);
         if (rc) {
@@ -1232,13 +1234,13 @@ kill_processes:
                 }
         }
  
-       atomic_set(&hdev->in_reset, 0);
-       hdev->needs_reset = false;
+       atomic_set(&hdev->reset_info.in_reset, 0);
+       hdev->reset_info.needs_reset = false;
  
         dev_notice(hdev->dev, "Successfully finished resetting the device\n");
  
         if (hard_reset) {
-               hdev->hard_reset_cnt++;
+               hdev->reset_info.hard_reset_cnt++;
  
                 /* After reset is done, we are ready to receive events from
                  * the F/W. We can't do it before because we will ignore events
@@ -1247,30 +1249,30 @@ kill_processes:
                  */
                 hdev->asic_funcs->enable_events_from_fw(hdev);
         } else if (!reset_upon_device_release) {
-               hdev->soft_reset_cnt++;
+               hdev->reset_info.soft_reset_cnt++;
         }
  
         return 0;
  
  out_err:
         hdev->disabled = true;
-       hdev->is_in_soft_reset = false;
+       hdev->reset_info.is_in_soft_reset = false;
  
         if (hard_reset) {
                 dev_err(hdev->dev, "Failed to reset! Device is NOT usable\n");
-               hdev->hard_reset_cnt++;
+               hdev->reset_info.hard_reset_cnt++;
         } else if (reset_upon_device_release) {
                 dev_err(hdev->dev, "Failed to reset device after user release\n");
                 hard_reset = true;
                 goto again;
         } else {
                 dev_err(hdev->dev, "Failed to do soft-reset\n");
-               hdev->soft_reset_cnt++;
+               hdev->reset_info.soft_reset_cnt++;
                 hard_reset = true;
                 goto again;
         }
  
-       atomic_set(&hdev->in_reset, 0);
+       atomic_set(&hdev->reset_info.in_reset, 0);
  
         return rc;
  }
@@ -1604,10 +1606,10 @@ void hl_device_fini(struct hl_device *hdev)
          */
  
         timeout = ktime_add_us(ktime_get(), reset_sec * 1000 * 1000);
-       rc = atomic_cmpxchg(&hdev->in_reset, 0, 1);
+       rc = atomic_cmpxchg(&hdev->reset_info.in_reset, 0, 1);
         while (rc) {
                 usleep_range(50, 200);
-               rc = atomic_cmpxchg(&hdev->in_reset, 0, 1);
+               rc = atomic_cmpxchg(&hdev->reset_info.in_reset, 0, 1);
                 if (ktime_compare(ktime_get(), timeout) > 0) {
                         dev_crit(hdev->dev,
                                 "Failed to remove device because reset function did not finish\n");
@@ -1629,7 +1631,7 @@ void hl_device_fini(struct hl_device *hdev)
  
         take_release_locks(hdev);
  
-       hdev->hard_reset_pending = true;
+       hdev->reset_info.hard_reset_pending = true;
  
         hl_hwmon_fini(hdev);
  
diff --git a/drivers/misc/habanalabs/common/firmware_if.c b/drivers/misc/habanalabs/common/firmware_if.c

index 2cc2015c24162693d11a86baf1233eefa7324d3f..6775c5c3166b4daf4ed8a7a070b3efbcd3394c84 100644 (file)
--- a/drivers/misc/habanalabs/common/firmware_if.c
+++ b/drivers/misc/habanalabs/common/firmware_if.c
@@ -2371,14 +2371,14 @@ static int hl_fw_dynamic_init_cpu(struct hl_device *hdev,
         if (rc)
                 goto protocol_err;
  
-       if (hdev->curr_reset_cause) {
+       if (hdev->reset_info.curr_reset_cause) {
                 rc = hl_fw_dynamic_send_msg(hdev, fw_loader,
-                               HL_COMMS_RESET_CAUSE_TYPE, &hdev->curr_reset_cause);
+                               HL_COMMS_RESET_CAUSE_TYPE, &hdev->reset_info.curr_reset_cause);
                 if (rc)
                         goto protocol_err;
  
                 /* Clear current reset cause */
-               hdev->curr_reset_cause = HL_RESET_CAUSE_UNKNOWN;
+               hdev->reset_info.curr_reset_cause = HL_RESET_CAUSE_UNKNOWN;
         }
  
         if (!(hdev->fw_components & FW_TYPE_BOOT_CPU)) {
diff --git a/drivers/misc/habanalabs/common/habanalabs.h b/drivers/misc/habanalabs/common/habanalabs.h

index fc1bdc07a1695e94b39a06dd0afd1475b1bdf79b..47eaeff9e924a7c36e797ece9c545879adb55cfc 100644 (file)
--- a/drivers/misc/habanalabs/common/habanalabs.h
+++ b/drivers/misc/habanalabs/common/habanalabs.h
@@ -547,6 +547,13 @@ struct hl_hints_range {
   *                         false otherwise.
   * @use_get_power_for_reset_history: To support backward compatibility for Goya
   *                                   and Gaudi
+ * @supports_soft_reset: is soft reset supported.
+ * @allow_inference_soft_reset: true if the ASIC supports soft reset that is
+ *                              initiated by user or TDR. This is only true
+ *                              in inference ASICs, as there is no real-world
+ *                              use-case of doing soft-reset in training (due
+ *                              to the fact that training runs on multiple
+ *                              devices)
   */
  struct asic_fixed_properties {
         struct hw_queue_properties      *hw_queues_props;
@@ -628,6 +635,8 @@ struct asic_fixed_properties {
         u8                              dynamic_fw_load;
         u8                              gic_interrupts_enable;
         u8                              use_get_power_for_reset_history;
+       u8                              supports_soft_reset;
+       u8                              allow_inference_soft_reset;
  };
  
  /**
@@ -2446,6 +2455,39 @@ struct last_error_session_info {
         u8              razwi_type;
  };
  
+/**
+ * struct hl_reset_info - holds current device reset information.
+ * @in_reset: is device in reset flow.
+ * @soft_reset_cnt: number of soft reset since the driver was loaded.
+ * @hard_reset_cnt: number of hard reset since the driver was loaded.
+ * @is_in_soft_reset: Device is currently in soft reset process.
+ * @needs_reset: true if reset_on_lockup is false and device should be reset
+ *               due to lockup.
+ * @hard_reset_pending: is there a hard reset work pending.
+ * @curr_reset_cause: saves an enumerated reset cause when a hard reset is
+ *                    triggered, and cleared after it is shared with preboot.
+ * @prev_reset_trigger: saves the previous trigger which caused a reset, overidden
+ *                      with a new value on next reset
+ * @reset_trigger_repeated: set if device reset is triggered more than once with
+ *                          same cause.
+ * @skip_reset_on_timeout: Skip device reset if CS has timed out, wait for it to
+ *                         complete instead.
+ */
+struct hl_reset_info {
+       atomic_t        in_reset;
+       u32             soft_reset_cnt;
+       u32             hard_reset_cnt;
+       u8              is_in_soft_reset;
+       u8              needs_reset;
+       u8              hard_reset_pending;
+
+       u8              curr_reset_cause;
+       u8              prev_reset_trigger;
+       u8              reset_trigger_repeated;
+
+       u8              skip_reset_on_timeout;
+};
+
  /**
   * struct hl_device - habanalabs device structure.
   * @pdev: pointer to PCI device, can be NULL in case of simulator device.
@@ -2514,6 +2556,7 @@ struct last_error_session_info {
   * @state_dump_specs: constants and dictionaries needed to dump system state.
   * @multi_cs_completion: array of multi-CS completion.
   * @clk_throttling: holds information about current/previous clock throttling events
+ * @reset_info: holds current device reset information.
   * @last_error: holds information about last session in which CS timeout or razwi error occurred.
   * @stream_master_qid_arr: pointer to array with QIDs of master streams.
   * @dram_used_mem: current DRAM memory consumption.
@@ -2538,13 +2581,10 @@ struct last_error_session_info {
   *                                  session.
   * @open_counter: number of successful device open operations.
   * @fw_poll_interval_usec: FW status poll interval in usec.
- * @in_reset: is device in reset flow.
   * @card_type: Various ASICs have several card types. This indicates the card
   *             type of the current device.
   * @major: habanalabs kernel driver major.
   * @high_pll: high PLL profile frequency.
- * @soft_reset_cnt: number of soft reset since the driver was loaded.
- * @hard_reset_cnt: number of hard reset since the driver was loaded.
   * @id: device minor.
   * @id_control: minor of the control device
   * @cpu_pci_msb_addr: 50-bit extension bits for the device CPU's 40-bit
@@ -2552,7 +2592,6 @@ struct last_error_session_info {
   * @disabled: is device disabled.
   * @late_init_done: is late init stage was done during initialization.
   * @hwmon_initialized: is H/W monitor sensors was initialized.
- * @hard_reset_pending: is there a hard reset work pending.
   * @heartbeat: is heartbeat sanity check towards CPU-CP enabled.
   * @reset_on_lockup: true if a reset should be done in case of stuck CS, false
   *                   otherwise.
@@ -2575,35 +2614,17 @@ struct last_error_session_info {
   * @sync_stream_queue_idx: helper index for sync stream queues initialization.
   * @collective_mon_idx: helper index for collective initialization
   * @supports_coresight: is CoreSight supported.
- * @supports_soft_reset: is soft reset supported.
- * @allow_inference_soft_reset: true if the ASIC supports soft reset that is
- *                              initiated by user or TDR. This is only true
- *                              in inference ASICs, as there is no real-world
- *                              use-case of doing soft-reset in training (due
- *                              to the fact that training runs on multiple
- *                              devices)
   * @supports_cb_mapping: is mapping a CB to the device's MMU supported.
- * @needs_reset: true if reset_on_lockup is false and device should be reset
- *               due to lockup.
   * @process_kill_trial_cnt: number of trials reset thread tried killing
   *                          user processes
   * @device_fini_pending: true if device_fini was called and might be
   *                       waiting for the reset thread to finish
   * @supports_staged_submission: true if staged submissions are supported
- * @curr_reset_cause: saves an enumerated reset cause when a hard reset is
- *                    triggered, and cleared after it is shared with preboot.
- * @prev_reset_trigger: saves the previous trigger which caused a reset, overidden
- *                      with a new value on next reset
- * @reset_trigger_repeated: set if device reset is triggered more than once with
- *                          same cause.
- * @skip_reset_on_timeout: Skip device reset if CS has timed out, wait for it to
- *                         complete instead.
   * @device_cpu_is_halted: Flag to indicate whether the device CPU was already
   *                        halted. We can't halt it again because the COMMS
   *                        protocol will throw an error. Relevant only for
   *                        cases where Linux was not loaded to device CPU
   * @supports_wait_for_multi_cs: true if wait for multi CS is supported
- * @is_in_soft_reset: Device is currently in soft reset process.
   * @is_compute_ctx_active: Whether there is an active compute context executing.
   */
  struct hl_device {
@@ -2678,6 +2699,8 @@ struct hl_device {
         struct hl_clk_throttle          clk_throttling;
         struct last_error_session_info  last_error;
  
+       struct hl_reset_info            reset_info;
+
         u32                             *stream_master_qid_arr;
         atomic64_t                      dram_used_mem;
         u64                             timeout_jiffies;
@@ -2689,20 +2712,16 @@ struct hl_device {
         u64                             last_open_session_duration_jif;
         u64                             open_counter;
         u64                             fw_poll_interval_usec;
-       atomic_t                        in_reset;
         ktime_t                         last_successful_open_ktime;
         enum cpucp_card_types           card_type;
         u32                             major;
         u32                             high_pll;
-       u32                             soft_reset_cnt;
-       u32                             hard_reset_cnt;
         u16                             id;
         u16                             id_control;
         u16                             cpu_pci_msb_addr;
         u8                              disabled;
         u8                              late_init_done;
         u8                              hwmon_initialized;
-       u8                              hard_reset_pending;
         u8                              heartbeat;
         u8                              reset_on_lockup;
         u8                              dram_default_page_mapping;
@@ -2719,21 +2738,13 @@ struct hl_device {
         u8                              sync_stream_queue_idx;
         u8                              collective_mon_idx;
         u8                              supports_coresight;
-       u8                              supports_soft_reset;
-       u8                              allow_inference_soft_reset;
         u8                              supports_cb_mapping;
-       u8                              needs_reset;
         u8                              process_kill_trial_cnt;
         u8                              device_fini_pending;
         u8                              supports_staged_submission;
-       u8                              curr_reset_cause;
-       u8                              prev_reset_trigger;
-       u8                              reset_trigger_repeated;
-       u8                              skip_reset_on_timeout;
         u8                              device_cpu_is_halted;
         u8                              supports_wait_for_multi_cs;
         u8                              stream_master_qid_arr_size;
-       u8                              is_in_soft_reset;
         u8                              is_compute_ctx_active;
  
         /* Parameters for bring-up */
diff --git a/drivers/misc/habanalabs/common/habanalabs_drv.c b/drivers/misc/habanalabs/common/habanalabs_drv.c

index aa4e07b1f83961a6b98bdad811e96dfe11b1143e..690b763c7a95a4e3c6af77cf590b2e9002f920e0 100644 (file)
--- a/drivers/misc/habanalabs/common/habanalabs_drv.c
+++ b/drivers/misc/habanalabs/common/habanalabs_drv.c
@@ -289,8 +289,8 @@ static int fixup_device_params(struct hl_device *hdev)
         hdev->fw_poll_interval_usec = HL_FW_STATUS_POLL_INTERVAL_USEC;
  
         hdev->stop_on_err = true;
-       hdev->curr_reset_cause = HL_RESET_CAUSE_UNKNOWN;
-       hdev->prev_reset_trigger = HL_RESET_TRIGGER_DEFAULT;
+       hdev->reset_info.curr_reset_cause = HL_RESET_CAUSE_UNKNOWN;
+       hdev->reset_info.prev_reset_trigger = HL_RESET_TRIGGER_DEFAULT;
  
         /* Enable only after the initialization of the device */
         hdev->disabled = true;
diff --git a/drivers/misc/habanalabs/common/habanalabs_ioctl.c b/drivers/misc/habanalabs/common/habanalabs_ioctl.c

index 7ddf70a0ca8a77be78de640aabc940fc11e66784..3ba3a8ffda3e5a5717e5dcb733afe372a7c14a4f 100644 (file)
--- a/drivers/misc/habanalabs/common/habanalabs_ioctl.c
+++ b/drivers/misc/habanalabs/common/habanalabs_ioctl.c
@@ -269,8 +269,8 @@ static int get_reset_count(struct hl_device *hdev, struct hl_info_args *args)
         if ((!max_size) || (!out))
                 return -EINVAL;
  
-       reset_count.hard_reset_cnt = hdev->hard_reset_cnt;
-       reset_count.soft_reset_cnt = hdev->soft_reset_cnt;
+       reset_count.hard_reset_cnt = hdev->reset_info.hard_reset_cnt;
+       reset_count.soft_reset_cnt = hdev->reset_info.soft_reset_cnt;
  
         return copy_to_user(out, &reset_count,
                 min((size_t) max_size, sizeof(reset_count))) ? -EFAULT : 0;
diff --git a/drivers/misc/habanalabs/common/irq.c b/drivers/misc/habanalabs/common/irq.c

index 6454ea12bf3aa2edb4e7ac955014ffceb461bb4a..1b6bdc900c26d3e863ffa1aa03e3aa977c68de0a 100644 (file)
--- a/drivers/misc/habanalabs/common/irq.c
+++ b/drivers/misc/habanalabs/common/irq.c
@@ -249,7 +249,7 @@ irqreturn_t hl_irq_handler_eq(int irq, void *arg)
                  */
                 dma_rmb();
  
-               if (hdev->disabled && !hdev->is_in_soft_reset) {
+               if (hdev->disabled && !hdev->reset_info.is_in_soft_reset) {
                         dev_warn(hdev->dev, "Device disabled but received an EQ event\n");
                         goto skip_irq;
                 }
diff --git a/drivers/misc/habanalabs/common/memory.c b/drivers/misc/habanalabs/common/memory.c

index b8596846f3dca4fefc8beaff1071d902cb7ee850..c1eefaebacb64f64990338fd1cfad68265fdef99 100644 (file)
--- a/drivers/misc/habanalabs/common/memory.c
+++ b/drivers/misc/habanalabs/common/memory.c
@@ -2624,7 +2624,7 @@ void hl_vm_ctx_fini(struct hl_ctx *ctx)
          * Clearly something went wrong on hard reset so no point in printing
          * another side effect error
          */
-       if (!hdev->hard_reset_pending && !hash_empty(ctx->mem_hash))
+       if (!hdev->reset_info.hard_reset_pending && !hash_empty(ctx->mem_hash))
                 dev_dbg(hdev->dev,
                         "user released device without removing its memory mappings\n");
  
diff --git a/drivers/misc/habanalabs/common/sysfs.c b/drivers/misc/habanalabs/common/sysfs.c

index 1af568e46f46c584fc15df2f1519cd91e7ef6ab4..45c715325e2a7fcf4b046dedd3c97ddfeb2a3ce8 100644 (file)
--- a/drivers/misc/habanalabs/common/sysfs.c
+++ b/drivers/misc/habanalabs/common/sysfs.c
@@ -211,7 +211,7 @@ static ssize_t soft_reset_store(struct device *dev,
                 goto out;
         }
  
-       if (!hdev->allow_inference_soft_reset) {
+       if (!hdev->asic_prop.allow_inference_soft_reset) {
                 dev_err(hdev->dev, "Device does not support inference soft-reset\n");
                 goto out;
         }
@@ -303,7 +303,7 @@ static ssize_t soft_reset_cnt_show(struct device *dev,
  {
         struct hl_device *hdev = dev_get_drvdata(dev);
  
-       return sprintf(buf, "%d\n", hdev->soft_reset_cnt);
+       return sprintf(buf, "%d\n", hdev->reset_info.soft_reset_cnt);
  }
  
  static ssize_t hard_reset_cnt_show(struct device *dev,
@@ -311,7 +311,7 @@ static ssize_t hard_reset_cnt_show(struct device *dev,
  {
         struct hl_device *hdev = dev_get_drvdata(dev);
  
-       return sprintf(buf, "%d\n", hdev->hard_reset_cnt);
+       return sprintf(buf, "%d\n", hdev->reset_info.hard_reset_cnt);
  }
  
  static ssize_t max_power_show(struct device *dev, struct device_attribute *attr,
@@ -478,7 +478,7 @@ int hl_sysfs_init(struct hl_device *hdev)
                 return rc;
         }
  
-       if (!hdev->allow_inference_soft_reset)
+       if (!hdev->asic_prop.allow_inference_soft_reset)
                 return 0;
  
         rc = device_add_groups(hdev->dev, hl_dev_inference_attr_groups);
@@ -495,7 +495,7 @@ void hl_sysfs_fini(struct hl_device *hdev)
  {
         device_remove_groups(hdev->dev, hl_dev_attr_groups);
  
-       if (!hdev->allow_inference_soft_reset)
+       if (!hdev->asic_prop.allow_inference_soft_reset)
                 return;
  
         device_remove_groups(hdev->dev, hl_dev_inference_attr_groups);
diff --git a/drivers/misc/habanalabs/gaudi/gaudi.c b/drivers/misc/habanalabs/gaudi/gaudi.c

index b3431eac4f0467adf2def11a3b884bd2ff90a26d..013c6da2e3ca1eede8d3feb2d0776f66fb980e9f 100644 (file)
--- a/drivers/misc/habanalabs/gaudi/gaudi.c
+++ b/drivers/misc/habanalabs/gaudi/gaudi.c
@@ -4325,7 +4325,7 @@ static void gaudi_hw_fini(struct hl_device *hdev, bool hard_reset, bool fw_reset
                  * In case watchdog hasn't expired but we still got HB, then this won't do any
                  * damage.
                  */
-               if (hdev->curr_reset_cause == HL_RESET_CAUSE_HEARTBEAT) {
+               if (hdev->reset_info.curr_reset_cause == HL_RESET_CAUSE_HEARTBEAT) {
                         if (hdev->asic_prop.hard_reset_done_by_fw)
                                 hl_fw_ask_hard_reset_without_linux(hdev);
                         else
@@ -6564,7 +6564,7 @@ static u64 gaudi_read_pte(struct hl_device *hdev, u64 addr)
  {
         struct gaudi_device *gaudi = hdev->asic_specific;
  
-       if (hdev->hard_reset_pending)
+       if (hdev->reset_info.hard_reset_pending)
                 return U64_MAX;
  
         return readq(hdev->pcie_bar[HBM_BAR_ID] +
@@ -6575,7 +6575,7 @@ static void gaudi_write_pte(struct hl_device *hdev, u64 addr, u64 val)
  {
         struct gaudi_device *gaudi = hdev->asic_specific;
  
-       if (hdev->hard_reset_pending)
+       if (hdev->reset_info.hard_reset_pending)
                 return;
  
         writeq(val, hdev->pcie_bar[HBM_BAR_ID] +
@@ -8341,7 +8341,7 @@ static int gaudi_mmu_invalidate_cache(struct hl_device *hdev, bool is_hard,
         int rc;
  
         if (!(gaudi->hw_cap_initialized & HW_CAP_MMU) ||
-               hdev->hard_reset_pending)
+               hdev->reset_info.hard_reset_pending)
                 return 0;
  
         if (hdev->pldm)
diff --git a/drivers/misc/habanalabs/goya/goya.c b/drivers/misc/habanalabs/goya/goya.c

index f4473013f1ee30d74206d9a343d36f648dbf3b80..fbcc7bbf44b3e648ddc0c55b2840a2768dabfc28 100644 (file)
--- a/drivers/misc/habanalabs/goya/goya.c
+++ b/drivers/misc/habanalabs/goya/goya.c
@@ -1033,8 +1033,8 @@ static int goya_sw_init(struct hl_device *hdev)
  
         spin_lock_init(&goya->hw_queues_lock);
         hdev->supports_coresight = true;
-       hdev->supports_soft_reset = true;
-       hdev->allow_inference_soft_reset = true;
+       hdev->asic_prop.supports_soft_reset = true;
+       hdev->asic_prop.allow_inference_soft_reset = true;
         hdev->supports_wait_for_multi_cs = false;
  
         hdev->asic_funcs->set_pci_memory_regions(hdev);
@@ -4477,7 +4477,7 @@ static u64 goya_read_pte(struct hl_device *hdev, u64 addr)
  {
         struct goya_device *goya = hdev->asic_specific;
  
-       if (hdev->hard_reset_pending)
+       if (hdev->reset_info.hard_reset_pending)
                 return U64_MAX;
  
         return readq(hdev->pcie_bar[DDR_BAR_ID] +
@@ -4488,7 +4488,7 @@ static void goya_write_pte(struct hl_device *hdev, u64 addr, u64 val)
  {
         struct goya_device *goya = hdev->asic_specific;
  
-       if (hdev->hard_reset_pending)
+       if (hdev->reset_info.hard_reset_pending)
                 return;
  
         writeq(val, hdev->pcie_bar[DDR_BAR_ID] +
@@ -5308,7 +5308,7 @@ static int goya_mmu_invalidate_cache(struct hl_device *hdev, bool is_hard,
         int rc;
  
         if (!(goya->hw_cap_initialized & HW_CAP_MMU) ||
-               hdev->hard_reset_pending)
+               hdev->reset_info.hard_reset_pending)
                 return 0;
  
         /* no need in L1 only invalidation in Goya */
author	Ofir Bitton <obitton@habana.ai>
	Tue, 23 Nov 2021 13:15:22 +0000 (15:15 +0200)
committer	Oded Gabbay <ogabbay@kernel.org>
	Sun, 26 Dec 2021 12:41:28 +0000 (14:41 +0200)
drivers/misc/habanalabs/common/command_buffer.c		patch \| blob \| history
drivers/misc/habanalabs/common/command_submission.c		patch \| blob \| history
drivers/misc/habanalabs/common/debugfs.c		patch \| blob \| history
drivers/misc/habanalabs/common/device.c		patch \| blob \| history
drivers/misc/habanalabs/common/firmware_if.c		patch \| blob \| history
drivers/misc/habanalabs/common/habanalabs.h		patch \| blob \| history
drivers/misc/habanalabs/common/habanalabs_drv.c		patch \| blob \| history
drivers/misc/habanalabs/common/habanalabs_ioctl.c		patch \| blob \| history
drivers/misc/habanalabs/common/irq.c		patch \| blob \| history
drivers/misc/habanalabs/common/memory.c		patch \| blob \| history
drivers/misc/habanalabs/common/sysfs.c		patch \| blob \| history
drivers/misc/habanalabs/gaudi/gaudi.c		patch \| blob \| history
drivers/misc/habanalabs/goya/goya.c		patch \| blob \| history