scsi: core: Avoid leaving shost->last_reset with stale value if EH does not run
author: Ewan D. Milne <emilne@redhat.com>
Fri, 29 Oct 2021 19:43:10 +0000 (15:43 -0400)
committer: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Thu, 18 Nov 2021 18:15:51 +0000 (19:15 +0100)
commit 5ae17501bc62a49b0b193dcce003f16375f16654 upstream.

The changes to issue the abort from the scmd->abort_work instead of the EH
thread introduced a problem if eh_deadline is used.  If aborting the
command(s) is successful, and there are never any scmds added to the
shost->eh_cmd_q, there is no code path which will reset the ->last_reset
value back to zero.

The effect of this is that after a successful abort with no EH thread
activity, a subsequent timeout, perhaps a long time later, might
immediately be considered past a user-set eh_deadline time, and the host
will be reset with no attempt at recovery.

Fix this by resetting ->last_reset back to zero in scmd_eh_abort_handler()
if it is determined that the EH thread will not run to do this.

Thanks to Gopinath Marappan for investigating this problem.

Link: https://lore.kernel.org/r/20211029194311.17504-2-emilne@redhat.com
Fixes: e494f6a72839 ("[SCSI] improved eh timeout handler")
Cc: stable@vger.kernel.org
Signed-off-by: Ewan D. Milne <emilne@redhat.com>
Signed-off-by: Martin K. Petersen <martin.petersen@oracle.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
drivers/scsi/hosts.c
drivers/scsi/scsi_error.c
drivers/scsi/scsi_lib.c
include/scsi/scsi_cmnd.h
include/scsi/scsi_host.h

index 24b72ee4246fb66ce89b0a1ce98601fef77b8dd9..0165dad803001444522f0bb8c557ae2bedd946bb 100644 (file)
@@ -388,6 +388,7 @@ struct Scsi_Host *scsi_host_alloc(struct scsi_host_template *sht, int privsize)
        shost->shost_state = SHOST_CREATED;
        INIT_LIST_HEAD(&shost->__devices);
        INIT_LIST_HEAD(&shost->__targets);
+       INIT_LIST_HEAD(&shost->eh_abort_list);
        INIT_LIST_HEAD(&shost->eh_cmd_q);
        INIT_LIST_HEAD(&shost->starved_list);
        init_waitqueue_head(&shost->host_wait);
index b6c86cce57bfa2ecf9ce2ee52b5d39fa2f67d790..408d49c304b8dc1a8012bcbdd6268d7b451d39b3 100644 (file)
@@ -135,6 +135,23 @@ static bool scsi_eh_should_retry_cmd(struct scsi_cmnd *cmd)
        return true;
 }
 
+static void scsi_eh_complete_abort(struct scsi_cmnd *scmd, struct Scsi_Host *shost)
+{
+       unsigned long flags;
+
+       spin_lock_irqsave(shost->host_lock, flags);
+       list_del_init(&scmd->eh_entry); /* unlink scmd from its EH list */
+       /*
+        * The abort succeeded.  If no other abort is pending and nothing is
+        * queued for the EH thread, it will not run: clear ->last_reset here.
+        */
+       if (list_empty(&shost->eh_abort_list) &&
+           list_empty(&shost->eh_cmd_q))
+               if (shost->eh_deadline != -1)
+                       shost->last_reset = 0;
+       spin_unlock_irqrestore(shost->host_lock, flags);
+}
+
 /**
  * scmd_eh_abort_handler - Handle command aborts
  * @work:      command to be aborted.
@@ -152,6 +169,7 @@ scmd_eh_abort_handler(struct work_struct *work)
                container_of(work, struct scsi_cmnd, abort_work.work);
        struct scsi_device *sdev = scmd->device;
        enum scsi_disposition rtn;
+       unsigned long flags;
 
        if (scsi_host_eh_past_deadline(sdev->host)) {
                SCSI_LOG_ERROR_RECOVERY(3,
@@ -175,12 +193,14 @@ scmd_eh_abort_handler(struct work_struct *work)
                                SCSI_LOG_ERROR_RECOVERY(3,
                                        scmd_printk(KERN_WARNING, scmd,
                                                    "retry aborted command\n"));
+                               scsi_eh_complete_abort(scmd, sdev->host);
                                scsi_queue_insert(scmd, SCSI_MLQUEUE_EH_RETRY);
                                return;
                        } else {
                                SCSI_LOG_ERROR_RECOVERY(3,
                                        scmd_printk(KERN_WARNING, scmd,
                                                    "finish aborted command\n"));
+                               scsi_eh_complete_abort(scmd, sdev->host);
                                scsi_finish_command(scmd);
                                return;
                        }
@@ -193,6 +213,9 @@ scmd_eh_abort_handler(struct work_struct *work)
                }
        }
 
+       spin_lock_irqsave(sdev->host->host_lock, flags);
+       list_del_init(&scmd->eh_entry);
+       spin_unlock_irqrestore(sdev->host->host_lock, flags);
        scsi_eh_scmd_add(scmd);
 }
 
@@ -223,6 +246,8 @@ scsi_abort_command(struct scsi_cmnd *scmd)
        spin_lock_irqsave(shost->host_lock, flags);
        if (shost->eh_deadline != -1 && !shost->last_reset)
                shost->last_reset = jiffies;
+       BUG_ON(!list_empty(&scmd->eh_entry));
+       list_add_tail(&scmd->eh_entry, &shost->eh_abort_list);
        spin_unlock_irqrestore(shost->host_lock, flags);
 
        scmd->eh_eflags |= SCSI_EH_ABORT_SCHEDULED;
index 572673873ddf847f10761c4f2c68e69f6d3430f0..05f8f6e8f766e588020eb47351dfbcfd7f66d609 100644 (file)
@@ -1143,6 +1143,7 @@ void scsi_init_command(struct scsi_device *dev, struct scsi_cmnd *cmd)
        cmd->sense_buffer = buf;
        cmd->prot_sdb = prot;
        cmd->flags = flags;
+       INIT_LIST_HEAD(&cmd->eh_entry);
        INIT_DELAYED_WORK(&cmd->abort_work, scmd_eh_abort_handler);
        cmd->jiffies_at_alloc = jiffies_at_alloc;
        cmd->retries = retries;
index eaf04c9a1dfcb541e15b3316da0a698281244aef..59afe8787cf7c0a99aad55f8fb8da99cd509a7d1 100644 (file)
@@ -68,7 +68,7 @@ struct scsi_pointer {
 struct scsi_cmnd {
        struct scsi_request req;
        struct scsi_device *device;
-       struct list_head eh_entry; /* entry for the host eh_cmd_q */
+       struct list_head eh_entry; /* entry for the host eh_abort_list/eh_cmd_q */
        struct delayed_work abort_work;
 
        struct rcu_head rcu;
index 75363707b73f9cc202ab728d980cd7de679793ee..1a02e58eb4e442049318d452f3680a1142053e6f 100644 (file)
@@ -556,6 +556,7 @@ struct Scsi_Host {
 
        struct mutex            scan_mutex;/* serialize scanning activity */
 
+       struct list_head        eh_abort_list;
        struct list_head        eh_cmd_q;
        struct task_struct    * ehandler;  /* Error recovery thread. */
        struct completion     * eh_action; /* Wait for specific actions on the