scsi: mpt3sas: Handle firmware faults during second half of IOC init
author Suganath Prabu S <suganath-prabu.subramani@broadcom.com>
Tue, 18 May 2021 05:16:25 +0000 (10:46 +0530)
committer Martin K. Petersen <martin.petersen@oracle.com>
Tue, 1 Jun 2021 02:48:20 +0000 (22:48 -0400)
If a firmware fault occurs while scanning the devices during IOC
initialization then the driver issues the hard reset operation to recover
the IOC. However, the driver is not issuing a port enable request
message as part of the hard reset operation during IOC initialization.  Due to
this, the driver will not receive any device discovery-related events
and hence devices will not be accessible.

Teach the driver to gracefully handle firmware faults while scanning for
target devices during IOC initialization. Make the driver issue a port
enable request message as part of hard reset operation. This permits
receiving device discovery-related events from the firmware after the hard
reset operation completes.

Link: https://lore.kernel.org/r/20210518051625.1596742-4-suganath-prabu.subramani@broadcom.com
Signed-off-by: Suganath Prabu S <suganath-prabu.subramani@broadcom.com>
Signed-off-by: Martin K. Petersen <martin.petersen@oracle.com>
drivers/scsi/mpt3sas/mpt3sas_base.c
drivers/scsi/mpt3sas/mpt3sas_base.h
drivers/scsi/mpt3sas/mpt3sas_scsih.c

index 4500d53f09f77a82373ec1bda1ef0a8d48056fa2..bc4ed3ed4b9ade225073ba60b254344e6e0ec149 100644 (file)
@@ -7205,7 +7205,7 @@ mpt3sas_port_enable_done(struct MPT3SAS_ADAPTER *ioc, u16 smid, u8 msix_index,
        if (ioc_status != MPI2_IOCSTATUS_SUCCESS)
                ioc->port_enable_failed = 1;
 
-       if (ioc->is_driver_loading) {
+       if (ioc->port_enable_cmds.status & MPT3_CMD_COMPLETE_ASYNC) {
                if (ioc_status == MPI2_IOCSTATUS_SUCCESS) {
                        mpt3sas_port_enable_complete(ioc);
                        return 1;
@@ -7214,6 +7214,7 @@ mpt3sas_port_enable_done(struct MPT3SAS_ADAPTER *ioc, u16 smid, u8 msix_index,
                        ioc->start_scan = 0;
                        return 1;
                }
+               ioc->port_enable_cmds.status &= ~MPT3_CMD_COMPLETE_ASYNC;
        }
        complete(&ioc->port_enable_cmds.done);
        return 1;
@@ -7308,6 +7309,7 @@ mpt3sas_port_enable(struct MPT3SAS_ADAPTER *ioc)
        }
        ioc->drv_internal_flags |= MPT_DRV_INTERNAL_FIRST_PE_ISSUED;
        ioc->port_enable_cmds.status = MPT3_CMD_PENDING;
+       ioc->port_enable_cmds.status |= MPT3_CMD_COMPLETE_ASYNC;
        mpi_request = mpt3sas_base_get_msg_frame(ioc, smid);
        ioc->port_enable_cmds.smid = smid;
        memset(mpi_request, 0, sizeof(Mpi2PortEnableRequest_t));
@@ -7856,7 +7858,7 @@ _base_make_ioc_operational(struct MPT3SAS_ADAPTER *ioc)
        if (r)
                return r;
 
-       if (ioc->is_driver_loading) {
+       if (!ioc->shost_recovery) {
 
                if (ioc->is_warpdrive && ioc->manu_pg10.OEMIdentifier
                    == 0x80) {
@@ -8276,8 +8278,6 @@ _base_clear_outstanding_mpt_commands(struct MPT3SAS_ADAPTER *ioc)
                        ioc->start_scan_failed =
                                MPI2_IOCSTATUS_INTERNAL_ERROR;
                        ioc->start_scan = 0;
-                       ioc->port_enable_cmds.status =
-                               MPT3_CMD_NOT_USED;
                } else {
                        complete(&ioc->port_enable_cmds.done);
                }
index c7b001618fc000cce8896bf2a23156a26423ef45..d4834c8ee9c0d361d279ead52b0da70653b09854 100644 (file)
@@ -500,6 +500,7 @@ struct MPT3SAS_DEVICE {
 #define MPT3_CMD_PENDING       0x0002  /* pending */
 #define MPT3_CMD_REPLY_VALID   0x0004  /* reply is valid */
 #define MPT3_CMD_RESET         0x0008  /* host reset dropped the command */
+#define MPT3_CMD_COMPLETE_ASYNC 0x0010  /* tells whether cmd completes in same thread or not */
 
 /**
  * struct _internal_cmd - internal commands struct
index 79e34b5090b1c13e476fa29de489362dd4e17660..d70ae57d897ff4b15fde8d2a7f7f008937736202 100644 (file)
@@ -78,6 +78,7 @@ static void _scsih_pcie_device_remove_from_sml(struct MPT3SAS_ADAPTER *ioc,
 static void
 _scsih_pcie_check_device(struct MPT3SAS_ADAPTER *ioc, u16 handle);
 static u8 _scsih_check_for_pending_tm(struct MPT3SAS_ADAPTER *ioc, u16 smid);
+static void _scsih_complete_devices_scanning(struct MPT3SAS_ADAPTER *ioc);
 
 /* global parameters */
 LIST_HEAD(mpt3sas_ioc_list);
@@ -3631,8 +3632,6 @@ _scsih_error_recovery_delete_devices(struct MPT3SAS_ADAPTER *ioc)
 {
        struct fw_event_work *fw_event;
 
-       if (ioc->is_driver_loading)
-               return;
        fw_event = alloc_fw_event_work(0);
        if (!fw_event)
                return;
@@ -3693,6 +3692,14 @@ _scsih_fw_event_cleanup_queue(struct MPT3SAS_ADAPTER *ioc)
        if ((list_empty(&ioc->fw_event_list) && !ioc->current_event) ||
            !ioc->firmware_event_thread)
                return;
+       /*
+        * Set current running event as ignore, so that
+        * current running event will exit quickly.
+        * As diag reset has occurred it is of no use
+        * to process remaining stale event data entries.
+        */
+       if (ioc->shost_recovery && ioc->current_event)
+               ioc->current_event->ignore = 1;
 
        ioc->fw_events_cleanup = 1;
        while ((fw_event = dequeue_next_fw_event(ioc)) ||
@@ -3719,6 +3726,19 @@ _scsih_fw_event_cleanup_queue(struct MPT3SAS_ADAPTER *ioc)
                        continue;
                }
 
+               /*
+                * Driver has to clear ioc->start_scan flag when
+                * it is cleaning up MPT3SAS_PORT_ENABLE_COMPLETE,
+                * otherwise scsi_scan_host() API waits for the
+                * 5 minute timer to expire. If we exit from
+                * scsi_scan_host() early then we can issue the
+                * new port enable request as part of current diag reset.
+                */
+               if (fw_event->event == MPT3SAS_PORT_ENABLE_COMPLETE) {
+                       ioc->port_enable_cmds.status |= MPT3_CMD_RESET;
+                       ioc->start_scan = 0;
+               }
+
                /*
                 * Wait on the fw_event to complete. If this returns 1, then
                 * the event was never executed, and we need a put for the
@@ -10140,6 +10160,17 @@ _scsih_remove_unresponding_devices(struct MPT3SAS_ADAPTER *ioc)
         * owner for the reference the list had on any object we prune.
         */
        spin_lock_irqsave(&ioc->sas_device_lock, flags);
+
+       /*
+        * Clean up the sas_device_init_list list as
+        * driver goes for fresh scan as part of diag reset.
+        */
+       list_for_each_entry_safe(sas_device, sas_device_next,
+           &ioc->sas_device_init_list, list) {
+               list_del_init(&sas_device->list);
+               sas_device_put(sas_device);
+       }
+
        list_for_each_entry_safe(sas_device, sas_device_next,
            &ioc->sas_device_list, list) {
                if (!sas_device->responding)
@@ -10161,6 +10192,16 @@ _scsih_remove_unresponding_devices(struct MPT3SAS_ADAPTER *ioc)
        ioc_info(ioc, "Removing unresponding devices: pcie end-devices\n");
        INIT_LIST_HEAD(&head);
        spin_lock_irqsave(&ioc->pcie_device_lock, flags);
+       /*
+        * Clean up the pcie_device_init_list list as
+        * driver goes for fresh scan as part of diag reset.
+        */
+       list_for_each_entry_safe(pcie_device, pcie_device_next,
+           &ioc->pcie_device_init_list, list) {
+               list_del_init(&pcie_device->list);
+               pcie_device_put(pcie_device);
+       }
+
        list_for_each_entry_safe(pcie_device, pcie_device_next,
            &ioc->pcie_device_list, list) {
                if (!pcie_device->responding)
@@ -10563,8 +10604,7 @@ void
 mpt3sas_scsih_reset_done_handler(struct MPT3SAS_ADAPTER *ioc)
 {
        dtmprintk(ioc, ioc_info(ioc, "%s: MPT3_IOC_DONE_RESET\n", __func__));
-       if ((!ioc->is_driver_loading) && !(disable_discovery > 0 &&
-                                          !ioc->sas_hba.num_phys)) {
+       if (!(disable_discovery > 0 && !ioc->sas_hba.num_phys)) {
                if (ioc->multipath_on_hba) {
                        _scsih_sas_port_refresh(ioc);
                        _scsih_update_vphys_after_reset(ioc);
@@ -10619,6 +10659,18 @@ _mpt3sas_fw_work(struct MPT3SAS_ADAPTER *ioc, struct fw_event_work *fw_event)
                _scsih_del_dirty_vphy(ioc);
                _scsih_del_dirty_port_entries(ioc);
                _scsih_scan_for_devices_after_reset(ioc);
+               /*
+                * If diag reset has occurred during the driver load
+                * then driver has to complete the driver load operation
+                * by executing the following items:
+                *- Register the devices from sas_device_init_list to SML
+                *- clear is_driver_loading flag,
+                *- start the watchdog thread.
+                * In happy driver load path, above things are taken care of when
+                * driver executes scsih_scan_finished().
+                */
+               if (ioc->is_driver_loading)
+                       _scsih_complete_devices_scanning(ioc);
                _scsih_set_nvme_max_shutdown_latency(ioc);
                break;
        case MPT3SAS_PORT_ENABLE_COMPLETE:
@@ -10764,11 +10816,23 @@ mpt3sas_scsih_event_callback(struct MPT3SAS_ADAPTER *ioc, u8 msix_index,
                _scsih_check_topo_delete_events(ioc,
                    (Mpi2EventDataSasTopologyChangeList_t *)
                    mpi_reply->EventData);
+               /*
+                * No need to add the topology change list
+                * event to fw event work queue when
+                * diag reset is going on. Since during diag
+                * reset driver scan the devices by reading
+                * sas device page0's not by processing the
+                * events.
+                */
+               if (ioc->shost_recovery)
+                       return 1;
                break;
        case MPI2_EVENT_PCIE_TOPOLOGY_CHANGE_LIST:
        _scsih_check_pcie_topo_remove_events(ioc,
                    (Mpi26EventDataPCIeTopologyChangeList_t *)
                    mpi_reply->EventData);
+               if (ioc->shost_recovery)
+                       return 1;
                break;
        case MPI2_EVENT_IR_CONFIGURATION_CHANGE_LIST:
                _scsih_check_ir_config_unhide_events(ioc,
@@ -11284,13 +11348,27 @@ _scsih_probe_boot_devices(struct MPT3SAS_ADAPTER *ioc)
 
        if (channel == RAID_CHANNEL) {
                raid_device = device;
+               /*
+                * If this boot vd is already registered with SML then
+                * no need to register it again as part of device scanning
+                * after diag reset during driver load operation.
+                */
+               if (raid_device->starget)
+                       return;
                rc = scsi_add_device(ioc->shost, RAID_CHANNEL,
                    raid_device->id, 0);
                if (rc)
                        _scsih_raid_device_remove(ioc, raid_device);
        } else if (channel == PCIE_CHANNEL) {
-               spin_lock_irqsave(&ioc->pcie_device_lock, flags);
                pcie_device = device;
+               /*
+                * If this boot NVMe device is already registered with SML then
+                * no need to register it again as part of device scanning
+                * after diag reset during driver load operation.
+                */
+               if (pcie_device->starget)
+                       return;
+               spin_lock_irqsave(&ioc->pcie_device_lock, flags);
                tid = pcie_device->id;
                list_move_tail(&pcie_device->list, &ioc->pcie_device_list);
                spin_unlock_irqrestore(&ioc->pcie_device_lock, flags);
@@ -11298,8 +11376,15 @@ _scsih_probe_boot_devices(struct MPT3SAS_ADAPTER *ioc)
                if (rc)
                        _scsih_pcie_device_remove(ioc, pcie_device);
        } else {
-               spin_lock_irqsave(&ioc->sas_device_lock, flags);
                sas_device = device;
+               /*
+                * If this boot sas/sata device is already registered with SML
+                * then no need to register it again as part of device scanning
+                * after diag reset during driver load operation.
+                */
+               if (sas_device->starget)
+                       return;
+               spin_lock_irqsave(&ioc->sas_device_lock, flags);
                handle = sas_device->handle;
                sas_address_parent = sas_device->sas_address_parent;
                sas_address = sas_device->sas_address;
@@ -11597,6 +11682,25 @@ scsih_scan_start(struct Scsi_Host *shost)
                ioc_info(ioc, "port enable: FAILED\n");
 }
 
+/**
+ * _scsih_complete_devices_scanning - add the devices to sml and
+ * complete ioc initialization.
+ * @ioc: per adapter object
+ *
+ * Return nothing.
+ */
+static void _scsih_complete_devices_scanning(struct MPT3SAS_ADAPTER *ioc)
+{
+
+       if (ioc->wait_for_discovery_to_complete) {
+               ioc->wait_for_discovery_to_complete = 0;
+               _scsih_probe_devices(ioc);
+       }
+
+       mpt3sas_base_start_watchdog(ioc);
+       ioc->is_driver_loading = 0;
+}
+
 /**
  * scsih_scan_finished - scsi lld callback for .scan_finished
  * @shost: SCSI host pointer
@@ -11610,6 +11714,8 @@ static int
 scsih_scan_finished(struct Scsi_Host *shost, unsigned long time)
 {
        struct MPT3SAS_ADAPTER *ioc = shost_priv(shost);
+       u32 ioc_state;
+       int issue_hard_reset = 0;
 
        if (disable_discovery > 0) {
                ioc->is_driver_loading = 0;
@@ -11624,9 +11730,30 @@ scsih_scan_finished(struct Scsi_Host *shost, unsigned long time)
                return 1;
        }
 
-       if (ioc->start_scan)
+       if (ioc->start_scan) {
+               ioc_state = mpt3sas_base_get_iocstate(ioc, 0);
+               if ((ioc_state & MPI2_IOC_STATE_MASK) == MPI2_IOC_STATE_FAULT) {
+                       mpt3sas_print_fault_code(ioc, ioc_state &
+                           MPI2_DOORBELL_DATA_MASK);
+                       issue_hard_reset = 1;
+                       goto out;
+               } else if ((ioc_state & MPI2_IOC_STATE_MASK) ==
+                               MPI2_IOC_STATE_COREDUMP) {
+                       mpt3sas_base_coredump_info(ioc, ioc_state &
+                           MPI2_DOORBELL_DATA_MASK);
+                       mpt3sas_base_wait_for_coredump_completion(ioc, __func__);
+                       issue_hard_reset = 1;
+                       goto out;
+               }
                return 0;
+       }
 
+       if (ioc->port_enable_cmds.status & MPT3_CMD_RESET) {
+               ioc_info(ioc,
+                   "port enable: aborted due to diag reset\n");
+               ioc->port_enable_cmds.status = MPT3_CMD_NOT_USED;
+               goto out;
+       }
        if (ioc->start_scan_failed) {
                ioc_info(ioc, "port enable: FAILED with (ioc_status=0x%08x)\n",
                         ioc->start_scan_failed);
@@ -11638,13 +11765,14 @@ scsih_scan_finished(struct Scsi_Host *shost, unsigned long time)
 
        ioc_info(ioc, "port enable: SUCCESS\n");
        ioc->port_enable_cmds.status = MPT3_CMD_NOT_USED;
+       _scsih_complete_devices_scanning(ioc);
 
-       if (ioc->wait_for_discovery_to_complete) {
-               ioc->wait_for_discovery_to_complete = 0;
-               _scsih_probe_devices(ioc);
+out:
+       if (issue_hard_reset) {
+               ioc->port_enable_cmds.status = MPT3_CMD_NOT_USED;
+               if (mpt3sas_base_hard_reset_handler(ioc, SOFT_RESET))
+                       ioc->is_driver_loading = 0;
        }
-       mpt3sas_base_start_watchdog(ioc);
-       ioc->is_driver_loading = 0;
        return 1;
 }