scsi: qla2xxx: Add heartbeat check
author Quinn Tran <qutran@marvell.com>
Sat, 19 Jun 2021 05:24:27 +0000 (22:24 -0700)
committer Martin K. Petersen <martin.petersen@oracle.com>
Wed, 23 Jun 2021 01:54:20 +0000 (21:54 -0400)
Use "no-op" mailbox command to check if the adapter firmware is still
responsive.

Link: https://lore.kernel.org/r/20210619052427.6440-1-njavali@marvell.com
Reviewed-by: Himanshu Madhani <himanshu.madhani@oracle.com>
Signed-off-by: Quinn Tran <qutran@marvell.com>
Signed-off-by: Nilesh Javali <njavali@marvell.com>
Signed-off-by: Martin K. Petersen <martin.petersen@oracle.com>
drivers/scsi/qla2xxx/qla_def.h
drivers/scsi/qla2xxx/qla_gbl.h
drivers/scsi/qla2xxx/qla_init.c
drivers/scsi/qla2xxx/qla_iocb.c
drivers/scsi/qla2xxx/qla_isr.c
drivers/scsi/qla2xxx/qla_mbx.c
drivers/scsi/qla2xxx/qla_nvme.c
drivers/scsi/qla2xxx/qla_os.c

index def4d99f80e99a60a87e4db136a4ba9c036b93c2..2f67ec1df3e665c988db8113ea0c3fbbcef98c87 100644 (file)
@@ -3660,6 +3660,8 @@ struct qla_qpair {
        struct qla_tgt_counters tgt_counters;
        uint16_t cpuid;
        struct qla_fw_resources fwres ____cacheline_aligned;
+       u32     cmd_cnt;
+       u32     cmd_completion_cnt;
 };
 
 /* Place holder for FW buffer parameters */
@@ -4616,6 +4618,7 @@ struct qla_hw_data {
 
        struct qla_hw_data_stat stat;
        pci_error_state_t pci_error_state;
+       u64 prev_cmd_cnt;
 };
 
 struct active_regions {
@@ -4743,6 +4746,7 @@ typedef struct scsi_qla_host {
 #define SET_ZIO_THRESHOLD_NEEDED 32
 #define ISP_ABORT_TO_ROM       33
 #define VPORT_DELETE           34
+#define HEARTBEAT_CHK          38
 
 #define PROCESS_PUREX_IOCB     63
 
index 418be9a2fcf6578774ef63f0c4762369db8ce7df..2f867da822aeea5572b0c8ff6138f53cca9c3eb8 100644 (file)
@@ -549,6 +549,7 @@ extern int qla2xxx_read_remote_register(scsi_qla_host_t *, uint32_t,
     uint32_t *);
 extern int qla2xxx_write_remote_register(scsi_qla_host_t *, uint32_t,
     uint32_t);
+void qla_no_op_mb(struct scsi_qla_host *vha);
 
 /*
  * Global Function Prototypes in qla_isr.c source file.
index eb825318e3f5e33d8588053f266b528fb7698acd..f8f471157109ea93bc79b0bc8cbd7a89c1fa3a97 100644 (file)
@@ -6870,10 +6870,14 @@ qla2x00_abort_isp_cleanup(scsi_qla_host_t *vha)
        ha->flags.fw_init_done = 0;
        ha->chip_reset++;
        ha->base_qpair->chip_reset = ha->chip_reset;
+       ha->base_qpair->cmd_cnt = ha->base_qpair->cmd_completion_cnt = 0;
        for (i = 0; i < ha->max_qpairs; i++) {
-               if (ha->queue_pair_map[i])
+               if (ha->queue_pair_map[i]) {
                        ha->queue_pair_map[i]->chip_reset =
                                ha->base_qpair->chip_reset;
+                       ha->queue_pair_map[i]->cmd_cnt =
+                           ha->queue_pair_map[i]->cmd_completion_cnt = 0;
+               }
        }
 
        /* purge MBox commands */
index 38b5bdde2405531d7a8b336db502469c6b75a16f..d0ee843f6b04ffa685b78d817db5d4d7fad981d4 100644 (file)
@@ -1710,6 +1710,7 @@ qla24xx_start_scsi(srb_t *sp)
        } else
                req->ring_ptr++;
 
+       sp->qpair->cmd_cnt++;
        sp->flags |= SRB_DMA_VALID;
 
        /* Set chip new ring index. */
@@ -1912,6 +1913,7 @@ qla24xx_dif_start_scsi(srb_t *sp)
        } else
                req->ring_ptr++;
 
+       sp->qpair->cmd_cnt++;
        /* Set chip new ring index. */
        wrt_reg_dword(req->req_q_in, req->ring_index);
 
@@ -2068,6 +2070,7 @@ qla2xxx_start_scsi_mq(srb_t *sp)
        } else
                req->ring_ptr++;
 
+       sp->qpair->cmd_cnt++;
        sp->flags |= SRB_DMA_VALID;
 
        /* Set chip new ring index. */
@@ -2284,6 +2287,7 @@ qla2xxx_dif_start_scsi_mq(srb_t *sp)
        } else
                req->ring_ptr++;
 
+       sp->qpair->cmd_cnt++;
        /* Set chip new ring index. */
        wrt_reg_dword(req->req_q_in, req->ring_index);
 
index 19fe2c1659d0b53cf57847722ccdf00dea04508d..d9fb093a60a1fc9404de8876d96942e62a337731 100644 (file)
@@ -2322,6 +2322,8 @@ static void qla24xx_nvme_iocb_entry(scsi_qla_host_t *vha, struct req_que *req,
 
        if (unlikely(iocb->u.nvme.aen_op))
                atomic_dec(&sp->vha->hw->nvme_active_aen_cnt);
+       else
+               sp->qpair->cmd_completion_cnt++;
 
        if (unlikely(comp_status != CS_COMPLETE))
                logit = 1;
@@ -2967,6 +2969,8 @@ qla2x00_status_entry(scsi_qla_host_t *vha, struct rsp_que *rsp, void *pkt)
                return;
        }
 
+       sp->qpair->cmd_completion_cnt++;
+
        /* Fast path completion. */
        if (comp_status == CS_COMPLETE && scsi_status == 0) {
                qla2x00_process_completed_request(vha, req, handle);
index 0bcd8afdc0ff32e6208fb1fdef9fa6b71eade6b2..9f3ad8aa649cb031f46cc0bccb6b552d75cdabe9 100644 (file)
@@ -6939,3 +6939,30 @@ ql26xx_led_config(scsi_qla_host_t *vha, uint16_t options, uint16_t *led)
 
        return rval;
 }
+
+/**
+ * qla_no_op_mb() - probe firmware liveness with a no-op mailbox command
+ * @vha: host adapter pointer
+ *
+ * Checks that the FW is still alive and able to generate an interrupt;
+ * otherwise the mailbox timeout will trigger a FW dump + reset.
+ */
+void qla_no_op_mb(struct scsi_qla_host *vha)
+{
+       mbx_cmd_t noop;
+       int status;
+
+       /* Start from an all-zero command; mailbox opcode 0 is the no-op. */
+       memset(&noop, 0, sizeof(noop));
+       noop.mb[0] = 0;
+       noop.out_mb = MBX_0;
+       noop.in_mb = MBX_0;
+       noop.tov = 5;
+       noop.flags = 0;
+
+       status = qla2x00_mailbox_command(vha, &noop);
+       if (!status)
+               return;
+
+       ql_dbg(ql_dbg_async, vha, 0x7071, "Failed %s %x\n", __func__, status);
+}
index e119f8b24e33ef78335e297c01b78ad2ad2e83c0..3e5c70a1d969cc0157f2aa762173905707a767ca 100644 (file)
@@ -536,6 +536,10 @@ static inline int qla2x00_start_nvme_mq(srb_t *sp)
                req->ring_ptr++;
        }
 
+       /* ignore nvme async cmd due to long timeout */
+       if (!nvme->u.nvme.aen_op)
+               sp->qpair->cmd_cnt++;
+
        /* Set chip new ring index. */
        wrt_reg_dword(req->req_q_in, req->ring_index);
 
index 4eab564ea6a052bcb6d7b10a99f7607932819ea3..cedd558f65ebfac65a42bd07b65b25011871bc6f 100644 (file)
@@ -6969,6 +6969,17 @@ intr_on_check:
                        qla2x00_lip_reset(base_vha);
                }
 
+               if (test_bit(HEARTBEAT_CHK, &base_vha->dpc_flags)) {
+                       /*
+                        * if there is a mb in progress then that's
+                        * enough of a check to see if fw is still ticking.
+                        */
+                       if (!ha->flags.mbox_busy && base_vha->flags.init_done)
+                               qla_no_op_mb(base_vha);
+
+                       clear_bit(HEARTBEAT_CHK, &base_vha->dpc_flags);
+               }
+
                ha->dpc_active = 0;
 end_loop:
                set_current_state(TASK_INTERRUPTIBLE);
@@ -7025,6 +7036,61 @@ qla2x00_rst_aen(scsi_qla_host_t *vha)
        }
 }
 
+/*
+ * qla_do_heartbeat() - decide whether a firmware heartbeat is warranted.
+ * @vha: host adapter pointer
+ *
+ * Returns true when any qpair's issued and completed command counts
+ * differ, or when the issued total exceeds the total recorded by the
+ * previous call by more than 50. Always updates ha->prev_cmd_cnt.
+ */
+static bool qla_do_heartbeat(struct scsi_qla_host *vha)
+{
+       struct qla_hw_data *ha = vha->hw;
+       struct qla_qpair *qp = ha->base_qpair;
+       bool pending;
+       u64 issued, last_issued;
+       int idx;
+
+       /* Pass 1: are cmds still pending down in fw on any qpair? */
+       pending = qp->cmd_cnt != qp->cmd_completion_cnt;
+       for (idx = 0; !pending && idx < ha->max_qpairs; idx++) {
+               qp = ha->queue_pair_map[idx];
+               if (qp && qp->cmd_cnt != qp->cmd_completion_cnt)
+                       pending = true;
+       }
+
+       /* Pass 2: total commands issued so far across all qpairs. */
+       last_issued = ha->prev_cmd_cnt;
+       issued = ha->base_qpair->cmd_cnt;
+       for (idx = 0; idx < ha->max_qpairs; idx++) {
+               qp = ha->queue_pair_map[idx];
+               if (qp)
+                       issued += qp->cmd_cnt;
+       }
+       ha->prev_cmd_cnt = issued;
+
+       /*
+        * Nothing pending, yet IOs completed between periodic checks:
+        * IOs seem to be running, so do hb anyway as a sanity check.
+        */
+       return pending || (issued - last_issued) > 50;
+}
+
+/* Timer hook: flag HEARTBEAT_CHK and wake the DPC when a check is due. */
+static void qla_heart_beat(struct scsi_qla_host *vha)
+{
+       /* Not done for vp_idx != 0, during eeh_busy, or with chip down. */
+       if (vha->vp_idx || vha->hw->flags.eeh_busy ||
+           qla2x00_chip_is_down(vha))
+               return;
+
+       if (qla_do_heartbeat(vha)) {
+               set_bit(HEARTBEAT_CHK, &vha->dpc_flags);
+               qla2xxx_wake_dpc(vha);
+       }
+}
+
 /**************************************************************************
 *   qla2x00_timer
 *
@@ -7243,6 +7309,8 @@ qla2x00_timer(struct timer_list *t)
                qla2xxx_wake_dpc(vha);
        }
 
+       qla_heart_beat(vha);
+
        qla2x00_restart_timer(vha, WATCH_INTERVAL);
 }