octeon_ep: add heartbeat monitor
author Veerasenareddy Burru <vburru@marvell.com>
Fri, 24 Mar 2023 17:47:03 +0000 (10:47 -0700)
committer David S. Miller <davem@davemloft.net>
Mon, 27 Mar 2023 07:37:54 +0000 (08:37 +0100)
Monitor periodic heartbeat messages from device firmware.
Presence of heartbeat indicates the device is active and running.
If the heartbeat is missed for the configured interval, it indicates that
the firmware has crashed and the device is unusable; in this case, the PF
driver stops and uninitializes the device.

Signed-off-by: Veerasenareddy Burru <vburru@marvell.com>
Signed-off-by: Abhijit Ayarekar <aayarekar@marvell.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
drivers/net/ethernet/marvell/octeon_ep/octep_cn9k_pf.c
drivers/net/ethernet/marvell/octeon_ep/octep_config.h
drivers/net/ethernet/marvell/octeon_ep/octep_main.c
drivers/net/ethernet/marvell/octeon_ep/octep_main.h
drivers/net/ethernet/marvell/octeon_ep/octep_regs_cn9k_pf.h

index e2503c9bc8a10ec110191847cff79760b0330b60..90c3a419932d1e32466c6092fb22c9f84697baed 100644 (file)
@@ -16,6 +16,9 @@
 #define CTRL_MBOX_MAX_PF       128
 #define CTRL_MBOX_SZ           ((size_t)(0x400000 / CTRL_MBOX_MAX_PF))
 
+#define FW_HB_INTERVAL_IN_SECS         1 /* value stored in conf->hb_interval (seconds) */
+#define FW_HB_MISS_COUNT               10 /* value stored in conf->max_hb_miss_cnt */
+
 /* Names of Hardware non-queue generic interrupts */
 static char *cn93_non_ioq_msix_names[] = {
        "epf_ire_rint",
@@ -249,6 +252,10 @@ static void octep_init_config_cn93_pf(struct octep_device *oct)
        conf->ctrl_mbox_cfg.barmem_addr = (void __iomem *)oct->mmio[2].hw_addr +
                                           (0x400000ull * 7) +
                                           (link * CTRL_MBOX_SZ);
+
+       conf->hb_interval = FW_HB_INTERVAL_IN_SECS;
+       conf->max_hb_miss_cnt = FW_HB_MISS_COUNT;
+
 }
 
 /* Setup registers for a hardware Tx Queue  */
@@ -383,6 +390,8 @@ static bool octep_poll_non_ioq_interrupts_cn93_pf(struct octep_device *oct)
                octep_write_csr64(oct, CN93_SDP_EPF_OEI_RINT, reg0);
                if (reg0 & CN93_SDP_EPF_OEI_RINT_DATA_BIT_MBOX)
                        queue_work(octep_wq, &oct->ctrl_mbox_task);
+               else if (reg0 & CN93_SDP_EPF_OEI_RINT_DATA_BIT_HBEAT)
+                       atomic_set(&oct->hb_miss_cnt, 0);
 
                handled = true;
        }
index f208f3f9a44768e61ec18cad2e73965b68147da5..df7cd39d9fce139f403b3270c8e50ff553dd4599 100644 (file)
@@ -200,5 +200,11 @@ struct octep_config {
 
        /* ctrl mbox config */
        struct octep_ctrl_mbox_config ctrl_mbox_cfg;
+
+       /* Configured maximum heartbeat miss count */
+       u32 max_hb_miss_cnt;
+
+       /* Configured firmware heartbeat interval in secs */
+       u32 hb_interval;
 };
 #endif /* _OCTEP_CONFIG_H_ */
index ba0d5fe3081dad7c3ddd3781d461213790ff5138..e1853da280f961b3f14099fa2ff51bc4b2fb7312 100644 (file)
@@ -901,6 +901,37 @@ static void octep_intr_poll_task(struct work_struct *work)
                           msecs_to_jiffies(OCTEP_INTR_POLL_TIME_MSECS));
 }
 
+/**
+ * octep_hb_timeout_task - work queue task to check firmware heartbeat.
+ *
+ * @work: pointer to hb work_struct; it is the work member of the delayed
+ *        work hb_task embedded in struct octep_device.
+ *
+ * Each invocation increments the heartbeat miss count. While the count is
+ * below the configured max heartbeat miss count, the task re-queues itself
+ * on octep_wq to run again after hb_interval seconds. Once the count
+ * reaches the configured maximum, an error is logged, the task is not
+ * re-queued, and the net device is stopped under the rtnl lock if it is
+ * currently running.
+ *
+ * The miss count is cleared by the non-ioq interrupt poll path when the
+ * firmware heartbeat bit is seen, so the maximum is only reached when no
+ * heartbeat has been observed for that many consecutive intervals.
+ *
+ * NOTE(review): miss_cnt is declared int but is printed with an unsigned
+ * conversion specifier and compared against a u32; the value is positive
+ * in practice, but the types should be made consistent.
+ *
+ **/
+static void octep_hb_timeout_task(struct work_struct *work)
+{
+       struct octep_device *oct = container_of(work, struct octep_device,
+                                               hb_task.work);
+
+       int miss_cnt;
+
+       miss_cnt = atomic_inc_return(&oct->hb_miss_cnt); /* count this interval as missed */
+       if (miss_cnt < oct->conf->max_hb_miss_cnt) {
+               queue_delayed_work(octep_wq, &oct->hb_task,
+                                  msecs_to_jiffies(oct->conf->hb_interval * 1000)); /* secs to msecs */
+               return;
+       }
+
+       dev_err(&oct->pdev->dev, "Missed %u heartbeats. Uninitializing\n",
+               miss_cnt);
+       rtnl_lock(); /* held across the running check and the stop call */
+       if (netif_running(oct->netdev))
+               octep_stop(oct->netdev);
+       rtnl_unlock();
+}
+
 /**
  * octep_ctrl_mbox_task - work queue task to handle ctrl mbox messages.
  *
@@ -938,7 +969,7 @@ static const char *octep_devid_to_str(struct octep_device *oct)
 int octep_device_setup(struct octep_device *oct)
 {
        struct pci_dev *pdev = oct->pdev;
-       int i;
+       int i, ret;
 
        /* allocate memory for oct->conf */
        oct->conf = kzalloc(sizeof(*oct->conf), GFP_KERNEL);
@@ -973,7 +1004,15 @@ int octep_device_setup(struct octep_device *oct)
 
        oct->pkind = CFG_GET_IQ_PKIND(oct->conf);
 
-       return octep_ctrl_net_init(oct);
+       ret = octep_ctrl_net_init(oct);
+       if (ret)
+               return ret;
+
+       atomic_set(&oct->hb_miss_cnt, 0);
+       INIT_DELAYED_WORK(&oct->hb_task, octep_hb_timeout_task);
+       queue_delayed_work(octep_wq, &oct->hb_task,
+                          msecs_to_jiffies(oct->conf->hb_interval * 1000));
+       return 0;
 
 unsupported_dev:
        for (i = 0; i < OCTEP_MMIO_REGIONS; i++)
@@ -1002,6 +1041,7 @@ static void octep_device_cleanup(struct octep_device *oct)
        }
 
        octep_ctrl_net_uninit(oct);
+       cancel_delayed_work_sync(&oct->hb_task);
 
        oct->hw_ops.soft_reset(oct);
        for (i = 0; i < OCTEP_MMIO_REGIONS; i++) {
index 836d990ba3fa5ac0265eda6d945a86eeaa4515fd..e0907a7191330e53a810a14a00e2d02c46fde194 100644 (file)
@@ -280,6 +280,13 @@ struct octep_device {
        bool poll_non_ioq_intr;
        /* Work entry to poll non-ioq interrupts */
        struct delayed_work intr_poll_task;
+
+       /* Firmware heartbeat timer */
+       struct timer_list hb_timer;
+       /* Firmware heartbeat miss count tracked by timer */
+       atomic_t hb_miss_cnt;
+       /* Task to reset device on heartbeat miss */
+       struct delayed_work hb_task;
 };
 
 static inline u16 OCTEP_MAJOR_REV(struct octep_device *oct)
index 0466fd9a002d035fb12abaa69e758b47f1c37257..b25c3093dc7b4e78075506d86e44da7389e335d3 100644 (file)
 
 /* bit 0 for control mbox interrupt */
 #define CN93_SDP_EPF_OEI_RINT_DATA_BIT_MBOX    BIT_ULL(0)
+/* bit 1 for firmware heartbeat interrupt */
+#define CN93_SDP_EPF_OEI_RINT_DATA_BIT_HBEAT   BIT_ULL(1)
 
 #endif /* _OCTEP_REGS_CN9K_PF_H_ */