habanalabs: add information about PCIe controller
author: Ofir Bitton <obitton@habana.ai>
Tue, 21 Jul 2020 07:49:51 +0000 (10:49 +0300)
committer: Oded Gabbay <oded.gabbay@gmail.com>
Tue, 22 Sep 2020 15:49:49 +0000 (18:49 +0300)
Update the firmware header with a new API for getting PCIe information
such as tx/rx throughput and the replay counter.
These counters are needed by customers for monitoring and maintenance
of multiple devices.
Add new opcodes to the INFO ioctl to retrieve these counters.

Signed-off-by: Ofir Bitton <obitton@habana.ai>
Reviewed-by: Oded Gabbay <oded.gabbay@gmail.com>
Signed-off-by: Oded Gabbay <oded.gabbay@gmail.com>
drivers/misc/habanalabs/common/firmware_if.c
drivers/misc/habanalabs/common/habanalabs.h
drivers/misc/habanalabs/common/habanalabs_ioctl.c
drivers/misc/habanalabs/gaudi/gaudi.c
drivers/misc/habanalabs/goya/goya.c
drivers/misc/habanalabs/include/common/armcp_if.h
include/uapi/misc/habanalabs.h

index f52bc690dfc5c68601950f878dee136d12ff006b..61f5edc96e1627f78d7887945b5d097929a4436c 100644 (file)
@@ -363,6 +363,54 @@ out:
        return rc;
 }
 
+/*
+ * fw_armcp_pci_counter_fetch - send a single PCI counter request to ArmCP
+ * @hdev: pointer to the habanalabs device structure
+ * @opcode: ArmCP packet opcode to place in the packet's ctl field
+ * @index: value for the packet's index field (0 when the request has none)
+ * @result: receives the value handed back by send_cpu_message
+ *
+ * Every request is built from a zeroed packet, so no field that was set for
+ * a previous request is carried over into the next one.
+ *
+ * Return: 0 on success, otherwise the error returned by send_cpu_message
+ * (which is also logged).
+ */
+static int fw_armcp_pci_counter_fetch(struct hl_device *hdev, u32 opcode,
+               u32 index, long *result)
+{
+       struct armcp_packet pkt = {};
+       int rc;
+
+       pkt.ctl = cpu_to_le32(opcode << ARMCP_PKT_CTL_OPCODE_SHIFT);
+       pkt.index = cpu_to_le32(index);
+
+       rc = hdev->asic_funcs->send_cpu_message(hdev, (u32 *) &pkt, sizeof(pkt),
+                                       HL_ARMCP_INFO_TIMEOUT_USEC, result);
+       if (rc)
+               dev_err(hdev->dev,
+                       "Failed to handle ArmCP PCI info pkt, error %d\n", rc);
+
+       return rc;
+}
+
+/**
+ * hl_fw_armcp_pci_counters_get - read the PCI counters from ArmCP
+ * @hdev: pointer to the habanalabs device structure
+ * @counters: filled with the rx throughput, tx throughput and replay count
+ *
+ * Three requests are sent, in the order rx, tx, replay. The function stops
+ * at the first failing request, so on error @counters may have been written
+ * only partially and must not be used by the caller.
+ *
+ * Return: 0 on success, otherwise the error of the first failing request.
+ */
+int hl_fw_armcp_pci_counters_get(struct hl_device *hdev,
+               struct hl_info_pci_counters *counters)
+{
+       long result;
+       int rc;
+
+       /* Fetch PCI rx counter */
+       rc = fw_armcp_pci_counter_fetch(hdev, ARMCP_PACKET_PCIE_THROUGHPUT_GET,
+                                       armcp_pcie_throughput_rx, &result);
+       if (rc)
+               return rc;
+       counters->rx_throughput = result;
+
+       /* Fetch PCI tx counter */
+       rc = fw_armcp_pci_counter_fetch(hdev, ARMCP_PACKET_PCIE_THROUGHPUT_GET,
+                                       armcp_pcie_throughput_tx, &result);
+       if (rc)
+               return rc;
+       counters->tx_throughput = result;
+
+       /* Fetch PCI replay counter; this request carries no index */
+       rc = fw_armcp_pci_counter_fetch(hdev, ARMCP_PACKET_PCIE_REPLAY_CNT_GET,
+                                       0, &result);
+       if (rc)
+               return rc;
+
+       /* Only the low 32 bits of the reply are kept for the replay count */
+       counters->replay_cnt = (u32) result;
+
+       return 0;
+}
+
 static void fw_read_errors(struct hl_device *hdev, u32 boot_err0_reg)
 {
        u32 err_val;
index f97eebc649797bad390dba0ac225bf720b60732c..2c9fcb5132156d9fa53f1184c384fef01b8e4832 100644 (file)
@@ -1483,6 +1483,7 @@ struct hl_device_idle_busy_ts {
  * @soft_reset_cnt: number of soft reset since the driver was loaded.
  * @hard_reset_cnt: number of hard reset since the driver was loaded.
  * @idle_busy_ts_idx: index of current entry in idle_busy_ts_arr
+ * @clk_throttling_reason: bitmask represents the current clk throttling reasons
  * @id: device minor.
  * @id_control: minor of the control device
  * @cpu_pci_msb_addr: 50-bit extension bits for the device CPU's 40-bit
@@ -1587,6 +1588,7 @@ struct hl_device {
        u32                             soft_reset_cnt;
        u32                             hard_reset_cnt;
        u32                             idle_busy_ts_idx;
+       u32                             clk_throttling_reason;
        u16                             id;
        u16                             id_control;
        u16                             cpu_pci_msb_addr;
@@ -1841,6 +1843,8 @@ void hl_fw_cpu_accessible_dma_pool_free(struct hl_device *hdev, size_t size,
 int hl_fw_send_heartbeat(struct hl_device *hdev);
 int hl_fw_armcp_info_get(struct hl_device *hdev);
 int hl_fw_get_eeprom_data(struct hl_device *hdev, void *data, size_t max_size);
+int hl_fw_armcp_pci_counters_get(struct hl_device *hdev,
+               struct hl_info_pci_counters *counters);
 int hl_fw_init_cpu(struct hl_device *hdev, u32 cpu_boot_status_reg,
                        u32 msg_to_cpu_reg, u32 cpu_msg_status_reg,
                        u32 boot_err0_reg, bool skip_bmc,
index 5af1c03da47398a562078a7b975b1b24210ca025..4d838b1a3bbee5a20cbcbaed4b5af2b9a60174e5 100644 (file)
@@ -276,6 +276,41 @@ static int time_sync_info(struct hl_device *hdev, struct hl_info_args *args)
                min((size_t) max_size, sizeof(time_sync))) ? -EFAULT : 0;
 }
 
+/*
+ * pci_counters_info - handle the HL_INFO_PCI_COUNTERS request
+ *
+ * Reads the PCI counters through hl_fw_armcp_pci_counters_get() and copies
+ * them to the user buffer described by @args. At most args->return_size
+ * bytes are copied, so a shorter user buffer receives a truncated structure.
+ *
+ * Returns -EINVAL for a NULL or zero-sized user buffer, the firmware access
+ * error if fetching the counters failed, -EFAULT if the copy to user space
+ * failed, and 0 otherwise.
+ */
+static int pci_counters_info(struct hl_fpriv *hpriv, struct hl_info_args *args)
+{
+       void __user *out = (void __user *) (uintptr_t) args->return_pointer;
+       struct hl_info_pci_counters counters = {0};
+       u32 max_size = args->return_size;
+       size_t len;
+       int rc;
+
+       if (!out || !max_size)
+               return -EINVAL;
+
+       rc = hl_fw_armcp_pci_counters_get(hpriv->hdev, &counters);
+       if (rc)
+               return rc;
+
+       /* Never write past the size the caller declared for its buffer */
+       len = min((size_t) max_size, sizeof(counters));
+       if (copy_to_user(out, &counters, len))
+               return -EFAULT;
+
+       return 0;
+}
+
+/*
+ * clk_throttle_info - handle the HL_INFO_CLK_THROTTLE_REASON request
+ *
+ * Copies the device's current clk_throttling_reason bitmask to the user
+ * buffer described by @args. At most args->return_size bytes are copied.
+ *
+ * Returns -EINVAL for a NULL or zero-sized user buffer, -EFAULT if the copy
+ * to user space failed, and 0 otherwise.
+ */
+static int clk_throttle_info(struct hl_fpriv *hpriv, struct hl_info_args *args)
+{
+       void __user *out = (void __user *) (uintptr_t) args->return_pointer;
+       struct hl_info_clk_throttle info = {0};
+       u32 max_size = args->return_size;
+       size_t len;
+
+       if (!out || !max_size)
+               return -EINVAL;
+
+       info.clk_throttling_reason = hpriv->hdev->clk_throttling_reason;
+
+       /* Never write past the size the caller declared for its buffer */
+       len = min((size_t) max_size, sizeof(info));
+       if (copy_to_user(out, &info, len))
+               return -EFAULT;
+
+       return 0;
+}
+
 static int cs_counters_info(struct hl_fpriv *hpriv, struct hl_info_args *args)
 {
        struct hl_device *hdev = hpriv->hdev;
@@ -360,6 +395,12 @@ static int _hl_info_ioctl(struct hl_fpriv *hpriv, void *data,
        case HL_INFO_CS_COUNTERS:
                return cs_counters_info(hpriv, args);
 
+       case HL_INFO_PCI_COUNTERS:
+               return pci_counters_info(hpriv, args);
+
+       case HL_INFO_CLK_THROTTLE_REASON:
+               return clk_throttle_info(hpriv, args);
+
        default:
                dev_err(dev, "Invalid request %d\n", args->op);
                rc = -ENOTTY;
index 4009b7df4cafec33ae1dc8bdb7090e18def1224a..adb5c5594ac1a47230e1980050c7e8cb1234af1d 100644 (file)
@@ -5653,21 +5653,25 @@ static void gaudi_print_clk_change_info(struct hl_device *hdev,
 {
        switch (event_type) {
        case GAUDI_EVENT_FIX_POWER_ENV_S:
+               hdev->clk_throttling_reason |= HL_CLK_THROTTLE_POWER;
                dev_info_ratelimited(hdev->dev,
                        "Clock throttling due to power consumption\n");
                break;
 
        case GAUDI_EVENT_FIX_POWER_ENV_E:
+               hdev->clk_throttling_reason &= ~HL_CLK_THROTTLE_POWER;
                dev_info_ratelimited(hdev->dev,
                        "Power envelop is safe, back to optimal clock\n");
                break;
 
        case GAUDI_EVENT_FIX_THERMAL_ENV_S:
+               hdev->clk_throttling_reason |= HL_CLK_THROTTLE_THERMAL;
                dev_info_ratelimited(hdev->dev,
                        "Clock throttling due to overheating\n");
                break;
 
        case GAUDI_EVENT_FIX_THERMAL_ENV_E:
+               hdev->clk_throttling_reason &= ~HL_CLK_THROTTLE_THERMAL;
                dev_info_ratelimited(hdev->dev,
                        "Thermal envelop is safe, back to optimal clock\n");
                break;
index 33cd2ae653d23441e1686d48d8df2aa05b1c80e7..954f2c022d337724ab250405fe5ee7c7eebb3939 100644 (file)
@@ -4580,18 +4580,22 @@ static void goya_print_clk_change_info(struct hl_device *hdev, u16 event_type)
 {
        switch (event_type) {
        case GOYA_ASYNC_EVENT_ID_FIX_POWER_ENV_S:
+               hdev->clk_throttling_reason |= HL_CLK_THROTTLE_POWER;
                dev_info_ratelimited(hdev->dev,
                        "Clock throttling due to power consumption\n");
                break;
        case GOYA_ASYNC_EVENT_ID_FIX_POWER_ENV_E:
+               hdev->clk_throttling_reason &= ~HL_CLK_THROTTLE_POWER;
                dev_info_ratelimited(hdev->dev,
                        "Power envelop is safe, back to optimal clock\n");
                break;
        case GOYA_ASYNC_EVENT_ID_FIX_THERMAL_ENV_S:
+               hdev->clk_throttling_reason |= HL_CLK_THROTTLE_THERMAL;
                dev_info_ratelimited(hdev->dev,
                        "Clock throttling due to overheating\n");
                break;
        case GOYA_ASYNC_EVENT_ID_FIX_THERMAL_ENV_E:
+               hdev->clk_throttling_reason &= ~HL_CLK_THROTTLE_THERMAL;
                dev_info_ratelimited(hdev->dev,
                        "Thermal envelop is safe, back to optimal clock\n");
                break;
index 07f9972db28d9ef4bb70717a708ae3341b808bb7..1403c937253c93c9e57e4b34d9299a7a4ab39f8c 100644 (file)
@@ -243,6 +243,8 @@ enum armcp_packet_id {
        ARMCP_PACKET_TEMPERATURE_SET,           /* sysfs */
        ARMCP_PACKET_VOLTAGE_SET,               /* sysfs */
        ARMCP_PACKET_CURRENT_SET,               /* sysfs */
+       ARMCP_PACKET_PCIE_THROUGHPUT_GET,       /* internal */
+       ARMCP_PACKET_PCIE_REPLAY_CNT_GET,       /* internal */
 };
 
 #define ARMCP_PACKET_FENCE_VAL 0xFE8CE7A5
@@ -277,6 +279,9 @@ struct armcp_packet {
                        __u8 pad; /* unused */
                };
 
+               /* For any general request */
+               __le32 index;
+
                /* For frequency get/set */
                __le32 pll_index;
 
@@ -344,6 +349,11 @@ enum armcp_pwm_attributes {
        armcp_pwm_enable
 };
 
+enum armcp_pcie_throughput_attributes {
+       armcp_pcie_throughput_tx,       /* selects the PCI tx counter */
+       armcp_pcie_throughput_rx        /* selects the PCI rx counter */
+};
+
 /* Event Queue Packets */
 
 struct eq_generic_event {
index d5c4f983b7a8cb90d7e696546ef4ef90ebdeed8e..ee13b919db35a8c238847d3864fc97358eb353a9 100644 (file)
@@ -264,6 +264,8 @@ enum hl_device_status {
  * HL_INFO_TIME_SYNC     - Retrieve the device's time alongside the host's time
  *                         for synchronization.
  * HL_INFO_CS_COUNTERS   - Retrieve command submission counters
+ * HL_INFO_PCI_COUNTERS  - Retrieve PCI counters
+ * HL_INFO_CLK_THROTTLE_REASON - Retrieve clock throttling reason
  */
 #define HL_INFO_HW_IP_INFO             0
 #define HL_INFO_HW_EVENTS              1
@@ -276,6 +278,8 @@ enum hl_device_status {
 #define HL_INFO_RESET_COUNT            9
 #define HL_INFO_TIME_SYNC              10
 #define HL_INFO_CS_COUNTERS            11
+#define HL_INFO_PCI_COUNTERS           12
+#define HL_INFO_CLK_THROTTLE_REASON    13
 
 #define HL_INFO_VERSION_MAX_LEN        128
 #define HL_INFO_CARD_NAME_MAX_LEN      16
@@ -340,6 +344,29 @@ struct hl_info_time_sync {
        __u64 host_time;
 };
 
+/**
+ * struct hl_info_pci_counters - pci counters
+ * @rx_throughput: PCI rx throughput KBps
+ * @tx_throughput: PCI tx throughput KBps
+ * @replay_cnt: PCI replay counter. The driver stores only the low 32 bits
+ *              of the value it receives from the firmware.
+ *
+ * Returned to user space by the HL_INFO_PCI_COUNTERS opcode of the INFO
+ * ioctl.
+ */
+struct hl_info_pci_counters {
+       __u64 rx_throughput;
+       __u64 tx_throughput;
+       __u64 replay_cnt;
+};
+
+#define HL_CLK_THROTTLE_POWER  0x1 /* throttling due to power consumption */
+#define HL_CLK_THROTTLE_THERMAL        0x2 /* throttling due to overheating */
+
+/**
+ * struct hl_info_clk_throttle - clock throttling reason
+ * @clk_throttling_reason: each bit represents a clk throttling reason
+ *                         (HL_CLK_THROTTLE_* flags). A bit is set when the
+ *                         matching throttling-start event is handled and
+ *                         cleared when the matching end event is handled.
+ *
+ * Returned to user space by the HL_INFO_CLK_THROTTLE_REASON opcode of the
+ * INFO ioctl.
+ */
+struct hl_info_clk_throttle {
+       __u32 clk_throttling_reason;
+};
+
 /**
  * struct hl_info_cs_counters - command submission counters
  * @out_of_mem_drop_cnt: dropped due to memory allocation issue