scsi: mpi3mr: Graceful handling of surprise removal of PCIe HBA
author Sreekanth Reddy <sreekanth.reddy@broadcom.com>
Mon, 12 Sep 2022 13:57:37 +0000 (19:27 +0530)
committer Martin K. Petersen <martin.petersen@oracle.com>
Sun, 25 Sep 2022 17:49:52 +0000 (13:49 -0400)
Implement graceful handling of surprise or orderly removal of PCIe HBA:

 - Detect a hot removal of the controller at certain critical places in the
   driver. Early detection will help to reduce the time taken for cleaning
   up the hot-removed controller at the driver level.

 - Poll the status of the port enable issued after reset once every 5
   seconds to avoid a long delay in detecting an unavailable controller.

Link: https://lore.kernel.org/r/20220912135742.11764-5-sreekanth.reddy@broadcom.com
Reported-by: kernel test robot <lkp@intel.com>
Signed-off-by: Sreekanth Reddy <sreekanth.reddy@broadcom.com>
Signed-off-by: Martin K. Petersen <martin.petersen@oracle.com>
drivers/scsi/mpi3mr/mpi3mr.h
drivers/scsi/mpi3mr/mpi3mr_fw.c
drivers/scsi/mpi3mr/mpi3mr_os.c

index 0f47b451beb32f5304e9f71f9f750437a16fdf9e..0eb0647fe58053ae90ab895da18760fab9b999f8 100644 (file)
@@ -118,6 +118,7 @@ extern atomic64_t event_counter;
 /* command/controller interaction timeout definitions in seconds */
 #define MPI3MR_INTADMCMD_TIMEOUT               60
 #define MPI3MR_PORTENABLE_TIMEOUT              300
+#define MPI3MR_PORTENABLE_POLL_INTERVAL                5
 #define MPI3MR_ABORTTM_TIMEOUT                 60
 #define MPI3MR_RESETTM_TIMEOUT                 60
 #define MPI3MR_RESET_HOST_IOWAIT_TIMEOUT       5
@@ -1389,4 +1390,6 @@ void mpi3mr_print_device_event_notice(struct mpi3mr_ioc *mrioc,
 void mpi3mr_refresh_sas_ports(struct mpi3mr_ioc *mrioc);
 void mpi3mr_refresh_expanders(struct mpi3mr_ioc *mrioc);
 void mpi3mr_add_event_wait_for_device_refresh(struct mpi3mr_ioc *mrioc);
+void mpi3mr_flush_drv_cmds(struct mpi3mr_ioc *mrioc);
+void mpi3mr_flush_cmds_for_unrecovered_controller(struct mpi3mr_ioc *mrioc);
 #endif /*MPI3MR_H_INCLUDED*/
index 78792f27b73b2e17f000fd602c7a07381fd27578..a10cffaa37aee7416fac8485fbf2d1947b4dccd8 100644 (file)
@@ -431,6 +431,9 @@ static int mpi3mr_process_admin_reply_q(struct mpi3mr_ioc *mrioc)
                return 0;
 
        do {
+               if (mrioc->unrecoverable)
+                       break;
+
                mrioc->admin_req_ci = le16_to_cpu(reply_desc->request_queue_ci);
                mpi3mr_process_admin_reply_desc(mrioc, reply_desc, &reply_dma);
                if (reply_dma)
@@ -516,6 +519,9 @@ int mpi3mr_process_op_reply_q(struct mpi3mr_ioc *mrioc,
        }
 
        do {
+               if (mrioc->unrecoverable)
+                       break;
+
                req_q_idx = le16_to_cpu(reply_desc->request_queue_id) - 1;
                op_req_q = &mrioc->req_qinfo[req_q_idx];
 
@@ -577,7 +583,8 @@ int mpi3mr_blk_mq_poll(struct Scsi_Host *shost, unsigned int queue_num)
 
        mrioc = (struct mpi3mr_ioc *)shost->hostdata;
 
-       if ((mrioc->reset_in_progress || mrioc->prepare_for_reset))
+       if ((mrioc->reset_in_progress || mrioc->prepare_for_reset ||
+           mrioc->unrecoverable))
                return 0;
 
        num_entries = mpi3mr_process_op_reply_q(mrioc,
@@ -673,7 +680,7 @@ static irqreturn_t mpi3mr_isr_poll(int irq, void *privdata)
 
        /* Poll for pending IOs completions */
        do {
-               if (!mrioc->intr_enabled)
+               if (!mrioc->intr_enabled || mrioc->unrecoverable)
                        break;
 
                if (!midx)
@@ -1220,6 +1227,14 @@ static int mpi3mr_bring_ioc_ready(struct mpi3mr_ioc *mrioc)
                        msleep(100);
                } while (--timeout);
 
+               if (!pci_device_is_present(mrioc->pdev)) {
+                       mrioc->unrecoverable = 1;
+                       ioc_err(mrioc,
+                           "controller is not present while waiting to reset\n");
+                       retval = -1;
+                       goto out_device_not_present;
+               }
+
                ioc_state = mpi3mr_get_iocstate(mrioc);
                ioc_info(mrioc,
                    "controller is in %s state after waiting to reset\n",
@@ -1277,6 +1292,13 @@ static int mpi3mr_bring_ioc_ready(struct mpi3mr_ioc *mrioc)
                            mpi3mr_iocstate_name(ioc_state));
                        return 0;
                }
+               if (!pci_device_is_present(mrioc->pdev)) {
+                       mrioc->unrecoverable = 1;
+                       ioc_err(mrioc,
+                           "controller is not present at the bringup\n");
+                       retval = -1;
+                       goto out_device_not_present;
+               }
                msleep(100);
        } while (--timeout);
 
@@ -1285,6 +1307,7 @@ out_failed:
        ioc_err(mrioc,
            "failed to bring to ready state,  current state: %s\n",
            mpi3mr_iocstate_name(ioc_state));
+out_device_not_present:
        return retval;
 }
 
@@ -2223,6 +2246,17 @@ void mpi3mr_check_rh_fault_ioc(struct mpi3mr_ioc *mrioc, u32 reason_code)
 {
        u32 ioc_status, host_diagnostic, timeout;
 
+       if (mrioc->unrecoverable) {
+               ioc_err(mrioc, "controller is unrecoverable\n");
+               return;
+       }
+
+       if (!pci_device_is_present(mrioc->pdev)) {
+               mrioc->unrecoverable = 1;
+               ioc_err(mrioc, "controller is not present\n");
+               return;
+       }
+
        ioc_status = readl(&mrioc->sysif_regs->ioc_status);
        if ((ioc_status & MPI3_SYSIF_IOC_STATUS_RESET_HISTORY) ||
            (ioc_status & MPI3_SYSIF_IOC_STATUS_FAULT)) {
@@ -2414,8 +2448,20 @@ static void mpi3mr_watchdog_work(struct work_struct *work)
        u32 fault, host_diagnostic, ioc_status;
        u32 reset_reason = MPI3MR_RESET_FROM_FAULT_WATCH;
 
-       if (mrioc->reset_in_progress || mrioc->unrecoverable)
+       if (mrioc->reset_in_progress)
+               return;
+
+       if (!mrioc->unrecoverable && !pci_device_is_present(mrioc->pdev)) {
+               ioc_err(mrioc, "watchdog could not detect the controller\n");
+               mrioc->unrecoverable = 1;
+       }
+
+       if (mrioc->unrecoverable) {
+               ioc_err(mrioc,
+                   "flush pending commands for unrecoverable controller\n");
+               mpi3mr_flush_cmds_for_unrecovered_controller(mrioc);
                return;
+       }
 
        if (mrioc->ts_update_counter++ >= MPI3MR_TSUPDATE_INTERVAL) {
                mrioc->ts_update_counter = 0;
@@ -2460,7 +2506,7 @@ static void mpi3mr_watchdog_work(struct work_struct *work)
                ioc_info(mrioc,
                    "controller requires system power cycle, marking controller as unrecoverable\n");
                mrioc->unrecoverable = 1;
-               return;
+               goto schedule_work;
        case MPI3_SYSIF_FAULT_CODE_SOFT_RESET_IN_PROGRESS:
                return;
        case MPI3_SYSIF_FAULT_CODE_CI_ACTIVATION_RESET:
@@ -3396,10 +3442,13 @@ out_failed:
 static void mpi3mr_port_enable_complete(struct mpi3mr_ioc *mrioc,
        struct mpi3mr_drv_cmd *drv_cmd)
 {
-       drv_cmd->state = MPI3MR_CMD_NOTUSED;
        drv_cmd->callback = NULL;
-       mrioc->scan_failed = drv_cmd->ioc_status;
        mrioc->scan_started = 0;
+       if (drv_cmd->state & MPI3MR_CMD_RESET)
+               mrioc->scan_failed = MPI3_IOCSTATUS_INTERNAL_ERROR;
+       else
+               mrioc->scan_failed = drv_cmd->ioc_status;
+       drv_cmd->state = MPI3MR_CMD_NOTUSED;
 }
 
 /**
@@ -3897,8 +3946,12 @@ int mpi3mr_reinit_ioc(struct mpi3mr_ioc *mrioc, u8 is_resume)
        int retval = 0;
        u8 retry = 0;
        struct mpi3_ioc_facts_data facts_data;
+       u32 pe_timeout, ioc_status;
 
 retry_init:
+       pe_timeout =
+           (MPI3MR_PORTENABLE_TIMEOUT / MPI3MR_PORTENABLE_POLL_INTERVAL);
+
        dprint_reset(mrioc, "bringing up the controller to ready state\n");
        retval = mpi3mr_bring_ioc_ready(mrioc);
        if (retval) {
@@ -3994,11 +4047,46 @@ retry_init:
        }
 
        ioc_info(mrioc, "sending port enable\n");
-       retval = mpi3mr_issue_port_enable(mrioc, 0);
+       retval = mpi3mr_issue_port_enable(mrioc, 1);
        if (retval) {
                ioc_err(mrioc, "failed to issue port enable\n");
                goto out_failed;
        }
+       do {
+               ssleep(MPI3MR_PORTENABLE_POLL_INTERVAL);
+               if (mrioc->init_cmds.state == MPI3MR_CMD_NOTUSED)
+                       break;
+               if (!pci_device_is_present(mrioc->pdev))
+                       mrioc->unrecoverable = 1;
+               if (mrioc->unrecoverable) {
+                       retval = -1;
+                       goto out_failed_noretry;
+               }
+               ioc_status = readl(&mrioc->sysif_regs->ioc_status);
+               if ((ioc_status & MPI3_SYSIF_IOC_STATUS_RESET_HISTORY) ||
+                   (ioc_status & MPI3_SYSIF_IOC_STATUS_FAULT)) {
+                       mpi3mr_print_fault_info(mrioc);
+                       mrioc->init_cmds.is_waiting = 0;
+                       mrioc->init_cmds.callback = NULL;
+                       mrioc->init_cmds.state = MPI3MR_CMD_NOTUSED;
+                       goto out_failed;
+               }
+       } while (--pe_timeout);
+
+       if (!pe_timeout) {
+               ioc_err(mrioc, "port enable timed out\n");
+               mpi3mr_check_rh_fault_ioc(mrioc,
+                   MPI3MR_RESET_FROM_PE_TIMEOUT);
+               mrioc->init_cmds.is_waiting = 0;
+               mrioc->init_cmds.callback = NULL;
+               mrioc->init_cmds.state = MPI3MR_CMD_NOTUSED;
+               goto out_failed;
+       } else if (mrioc->scan_failed) {
+               ioc_err(mrioc,
+                   "port enable failed with status=0x%04x\n",
+                   mrioc->scan_failed);
+       } else
+               ioc_info(mrioc, "port enable completed successfully\n");
 
        ioc_info(mrioc, "controller %s completed successfully\n",
            (is_resume)?"resume":"re-initialization");
@@ -4417,7 +4505,7 @@ static inline void mpi3mr_drv_cmd_comp_reset(struct mpi3mr_ioc *mrioc,
  *
  * Return: Nothing.
  */
-static void mpi3mr_flush_drv_cmds(struct mpi3mr_ioc *mrioc)
+void mpi3mr_flush_drv_cmds(struct mpi3mr_ioc *mrioc)
 {
        struct mpi3mr_drv_cmd *cmdptr;
        u8 i;
@@ -4850,6 +4938,7 @@ out:
                mrioc->unrecoverable = 1;
                mrioc->reset_in_progress = 0;
                retval = -1;
+               mpi3mr_flush_cmds_for_unrecovered_controller(mrioc);
        }
        mrioc->prev_reset_result = retval;
        mutex_unlock(&mrioc->reset_mutex);
index f1a6448e3d85c4a4ebc1eabef0023d6e66d72ef2..f983e92b69530b421cefb57296a48e015bb8bd4b 100644 (file)
@@ -582,6 +582,39 @@ void mpi3mr_flush_host_io(struct mpi3mr_ioc *mrioc)
            mrioc->flush_io_count);
 }
 
+/**
+ * mpi3mr_flush_cmds_for_unrecovered_controller - Flush all pending cmds
+ * @mrioc: Adapter instance reference
+ *
+ * This function waits for currently running IO poll threads to
+ * exit and then flushes all host I/Os and any internal pending
+ * cmds. This is executed after controller is marked as
+ * unrecoverable.
+ *
+ * Return: Nothing.
+ */
+void mpi3mr_flush_cmds_for_unrecovered_controller(struct mpi3mr_ioc *mrioc)
+{
+       struct Scsi_Host *shost = mrioc->shost;
+       int i;
+
+       if (!mrioc->unrecoverable)
+               return;
+
+       if (mrioc->op_reply_qinfo) {
+               for (i = 0; i < mrioc->num_queues; i++) {
+                       while (atomic_read(&mrioc->op_reply_qinfo[i].in_use))
+                               udelay(500);
+                       atomic_set(&mrioc->op_reply_qinfo[i].pend_ios, 0);
+               }
+       }
+       mrioc->flush_io_count = 0;
+       blk_mq_tagset_busy_iter(&shost->tag_set,
+           mpi3mr_flush_scmd, (void *)mrioc);
+       mpi3mr_flush_delayed_cmd_lists(mrioc);
+       mpi3mr_flush_drv_cmds(mrioc);
+}
+
 /**
  * mpi3mr_alloc_tgtdev - target device allocator
  *
@@ -1815,6 +1848,13 @@ static void mpi3mr_fwevt_bh(struct mpi3mr_ioc *mrioc,
        if (mrioc->stop_drv_processing)
                goto out;
 
+       if (mrioc->unrecoverable) {
+               dprint_event_bh(mrioc,
+                   "ignoring event(0x%02x) in bottom half handler due to unrecoverable controller\n",
+                   fwevt->event_id);
+               goto out;
+       }
+
        if (!fwevt->process_evt)
                goto evt_ack;
 
@@ -5024,6 +5064,11 @@ static void mpi3mr_remove(struct pci_dev *pdev)
        while (mrioc->reset_in_progress || mrioc->is_driver_loading)
                ssleep(1);
 
+       if (!pci_device_is_present(mrioc->pdev)) {
+               mrioc->unrecoverable = 1;
+               mpi3mr_flush_cmds_for_unrecovered_controller(mrioc);
+       }
+
        mpi3mr_bsg_exit(mrioc);
        mrioc->stop_drv_processing = 1;
        mpi3mr_cleanup_fwevt_list(mrioc);