misc: smpro-errmon: Add Ampere's SMpro error monitor driver
author Quan Nguyen <quan@os.amperecomputing.com>
Mon, 31 Oct 2022 02:44:41 +0000 (09:44 +0700)
committer Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Thu, 10 Nov 2022 18:02:43 +0000 (19:02 +0100)
Add Ampere's SMpro error monitor driver for monitoring and reporting
RAS-related errors as reported by SMpro co-processor found on Ampere's
Altra processor family.

Signed-off-by: Quan Nguyen <quan@os.amperecomputing.com>
Link: https://lore.kernel.org/r/20221031024442.2490881-3-quan@os.amperecomputing.com
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Documentation/ABI/testing/sysfs-bus-platform-devices-ampere-smpro [new file with mode: 0644]
drivers/misc/Kconfig
drivers/misc/Makefile
drivers/misc/smpro-errmon.c [new file with mode: 0644]

diff --git a/Documentation/ABI/testing/sysfs-bus-platform-devices-ampere-smpro b/Documentation/ABI/testing/sysfs-bus-platform-devices-ampere-smpro
new file mode 100644 (file)
index 0000000..2b84dc8
--- /dev/null
@@ -0,0 +1,264 @@
+What:          /sys/bus/platform/devices/smpro-errmon.*/error_[core|mem|pcie|other]_[ce|ue]
+KernelVersion: 6.1
+Contact:       Quan Nguyen <quan@os.amperecomputing.com>
+Description:
+               (RO) Contains the 48-byte Ampere (Vendor-Specific) Error Record printed
+               in hex format according to the table below:
+
+               +--------+---------------+-------------+------------------------------------------------------------+
+               | Offset |     Field     | Size (byte) |                     Description                            |
+               +--------+---------------+-------------+------------------------------------------------------------+
+               | 00     | Error Type    | 1           | See :ref:`the table below <smpro-error-types>` for details |
+               +--------+---------------+-------------+------------------------------------------------------------+
+               | 01     | Subtype       | 1           | See :ref:`the table below <smpro-error-types>` for details |
+               +--------+---------------+-------------+------------------------------------------------------------+
+               | 02     | Instance      | 2           | See :ref:`the table below <smpro-error-types>` for details |
+               +--------+---------------+-------------+------------------------------------------------------------+
+               | 04     | Error status  | 4           | See ARM RAS specification for details                      |
+               +--------+---------------+-------------+------------------------------------------------------------+
+               | 08     | Error Address | 8           | See ARM RAS specification for details                      |
+               +--------+---------------+-------------+------------------------------------------------------------+
+               | 16     | Error Misc 0  | 8           | See ARM RAS specification for details                      |
+               +--------+---------------+-------------+------------------------------------------------------------+
+               | 24     | Error Misc 1  | 8           | See ARM RAS specification for details                      |
+               +--------+---------------+-------------+------------------------------------------------------------+
+               | 32     | Error Misc 2  | 8           | See ARM RAS specification for details                      |
+               +--------+---------------+-------------+------------------------------------------------------------+
+               | 40     | Error Misc 3  | 8           | See ARM RAS specification for details                      |
+               +--------+---------------+-------------+------------------------------------------------------------+
+
+               The table below defines the value of error types, their subtype, subcomponent and instance:
+
+               .. _smpro-error-types:
+
+               +-----------------+------------+----------+----------------+----------------------------------------+
+               |   Error Group   | Error Type | Sub type | Sub component  |               Instance                 |
+               +-----------------+------------+----------+----------------+----------------------------------------+
+               | CPM (core)      | 0          | 0        | Snoop-Logic    | CPM #                                  |
+               +-----------------+------------+----------+----------------+----------------------------------------+
+               | CPM (core)      | 0          | 2        | Armv8 Core 1   | CPM #                                  |
+               +-----------------+------------+----------+----------------+----------------------------------------+
+               | MCU (mem)       | 1          | 1        | ERR1           | MCU # \| SLOT << 11                    |
+               +-----------------+------------+----------+----------------+----------------------------------------+
+               | MCU (mem)       | 1          | 2        | ERR2           | MCU # \| SLOT << 11                    |
+               +-----------------+------------+----------+----------------+----------------------------------------+
+               | MCU (mem)       | 1          | 3        | ERR3           | MCU #                                  |
+               +-----------------+------------+----------+----------------+----------------------------------------+
+               | MCU (mem)       | 1          | 4        | ERR4           | MCU #                                  |
+               +-----------------+------------+----------+----------------+----------------------------------------+
+               | MCU (mem)       | 1          | 5        | ERR5           | MCU #                                  |
+               +-----------------+------------+----------+----------------+----------------------------------------+
+               | MCU (mem)       | 1          | 6        | ERR6           | MCU #                                  |
+               +-----------------+------------+----------+----------------+----------------------------------------+
+               | MCU (mem)       | 1          | 7        | Link Error     | MCU #                                  |
+               +-----------------+------------+----------+----------------+----------------------------------------+
+               | Mesh (other)    | 2          | 0        | Cross Point    | X \| (Y << 5) \| NS <<11               |
+               +-----------------+------------+----------+----------------+----------------------------------------+
+               | Mesh (other)    | 2          | 1        | Home Node(IO)  | X \| (Y << 5) \| NS <<11               |
+               +-----------------+------------+----------+----------------+----------------------------------------+
+               | Mesh (other)    | 2          | 2        | Home Node(Mem) | X \| (Y << 5) \| NS <<11 \| device<<12 |
+               +-----------------+------------+----------+----------------+----------------------------------------+
+               | Mesh (other)    | 2          | 4        | CCIX Node      | X \| (Y << 5) \| NS <<11               |
+               +-----------------+------------+----------+----------------+----------------------------------------+
+               | 2P Link (other) | 3          | 0        | N/A            | Altra 2P Link #                        |
+               +-----------------+------------+----------+----------------+----------------------------------------+
+               | GIC (other)     | 5          | 0        | ERR0           | 0                                      |
+               +-----------------+------------+----------+----------------+----------------------------------------+
+               | GIC (other)     | 5          | 1        | ERR1           | 0                                      |
+               +-----------------+------------+----------+----------------+----------------------------------------+
+               | GIC (other)     | 5          | 2        | ERR2           | 0                                      |
+               +-----------------+------------+----------+----------------+----------------------------------------+
+               | GIC (other)     | 5          | 3        | ERR3           | 0                                      |
+               +-----------------+------------+----------+----------------+----------------------------------------+
+               | GIC (other)     | 5          | 4        | ERR4           | 0                                      |
+               +-----------------+------------+----------+----------------+----------------------------------------+
+               | GIC (other)     | 5          | 5        | ERR5           | 0                                      |
+               +-----------------+------------+----------+----------------+----------------------------------------+
+               | GIC (other)     | 5          | 6        | ERR6           | 0                                      |
+               +-----------------+------------+----------+----------------+----------------------------------------+
+               | GIC (other)     | 5          | 7        | ERR7           | 0                                      |
+               +-----------------+------------+----------+----------------+----------------------------------------+
+               | GIC (other)     | 5          | 8        | ERR8           | 0                                      |
+               +-----------------+------------+----------+----------------+----------------------------------------+
+               | GIC (other)     | 5          | 9        | ERR9           | 0                                      |
+               +-----------------+------------+----------+----------------+----------------------------------------+
+               | GIC (other)     | 5          | 10       | ERR10          | 0                                      |
+               +-----------------+------------+----------+----------------+----------------------------------------+
+               | GIC (other)     | 5          | 11       | ERR11          | 0                                      |
+               +-----------------+------------+----------+----------------+----------------------------------------+
+               | GIC (other)     | 5          | 12       | ERR12          | 0                                      |
+               +-----------------+------------+----------+----------------+----------------------------------------+
+               | GIC (other)     | 5          | 13-21    | ERR13          | RC # + 1                               |
+               +-----------------+------------+----------+----------------+----------------------------------------+
+               | SMMU (other)    | 6          | TCU      | 100            | RC #                                   |
+               +-----------------+------------+----------+----------------+----------------------------------------+
+               | SMMU (other)    | 6          | TBU0     | 0              | RC #                                   |
+               +-----------------+------------+----------+----------------+----------------------------------------+
+               | SMMU (other)    | 6          | TBU1     | 1              | RC #                                   |
+               +-----------------+------------+----------+----------------+----------------------------------------+
+               | SMMU (other)    | 6          | TBU2     | 2              | RC #                                   |
+               +-----------------+------------+----------+----------------+----------------------------------------+
+               | SMMU (other)    | 6          | TBU3     | 3              | RC #                                   |
+               +-----------------+------------+----------+----------------+----------------------------------------+
+               | SMMU (other)    | 6          | TBU4     | 4              | RC #                                   |
+               +-----------------+------------+----------+----------------+----------------------------------------+
+               | SMMU (other)    | 6          | TBU5     | 5              | RC #                                   |
+               +-----------------+------------+----------+----------------+----------------------------------------+
+               | SMMU (other)    | 6          | TBU6     | 6              | RC #                                   |
+               +-----------------+------------+----------+----------------+----------------------------------------+
+               | SMMU (other)    | 6          | TBU7     | 7              | RC #                                   |
+               +-----------------+------------+----------+----------------+----------------------------------------+
+               | SMMU (other)    | 6          | TBU8     | 8              | RC #                                   |
+               +-----------------+------------+----------+----------------+----------------------------------------+
+               | SMMU (other)    | 6          | TBU9     | 9              | RC #                                   |
+               +-----------------+------------+----------+----------------+----------------------------------------+
+               | PCIe AER (pcie) | 7          | Root     | 0              | RC #                                   |
+               +-----------------+------------+----------+----------------+----------------------------------------+
+               | PCIe AER (pcie) | 7          | Device   | 1              | RC #                                   |
+               +-----------------+------------+----------+----------------+----------------------------------------+
+               | PCIe RC (pcie)  | 8          | RCA HB   | 0              | RC #                                   |
+               +-----------------+------------+----------+----------------+----------------------------------------+
+               | PCIe RC (pcie)  | 8          | RCB HB   | 1              | RC #                                   |
+               +-----------------+------------+----------+----------------+----------------------------------------+
+               | PCIe RC (pcie)  | 8          | RASDP    | 8              | RC #                                   |
+               +-----------------+------------+----------+----------------+----------------------------------------+
+               | OCM (other)     | 9          | ERR0     | 0              | 0                                      |
+               +-----------------+------------+----------+----------------+----------------------------------------+
+               | OCM (other)     | 9          | ERR1     | 1              | 0                                      |
+               +-----------------+------------+----------+----------------+----------------------------------------+
+               | OCM (other)     | 9          | ERR2     | 2              | 0                                      |
+               +-----------------+------------+----------+----------------+----------------------------------------+
+               | SMpro (other)   | 10         | ERR0     | 0              | 0                                      |
+               +-----------------+------------+----------+----------------+----------------------------------------+
+               | SMpro (other)   | 10         | ERR1     | 1              | 0                                      |
+               +-----------------+------------+----------+----------------+----------------------------------------+
+               | SMpro (other)   | 10         | MPA_ERR  | 2              | 0                                      |
+               +-----------------+------------+----------+----------------+----------------------------------------+
+               | PMpro (other)   | 11         | ERR0     | 0              | 0                                      |
+               +-----------------+------------+----------+----------------+----------------------------------------+
+               | PMpro (other)   | 11         | ERR1     | 1              | 0                                      |
+               +-----------------+------------+----------+----------------+----------------------------------------+
+               | PMpro (other)   | 11         | MPA_ERR  | 2              | 0                                      |
+               +-----------------+------------+----------+----------------+----------------------------------------+
+
+               Example::
+
+                # cat error_other_ue
+                880807001e004010401040101500000001004010401040100c0000000000000000000000000000000000000000000000
+
+               The details of each sysfs entry are as below:
+
+               +-------------+---------------------------------------------------------+----------------------------------+
+               |   Error     |                   Sysfs entry                           |   Description (when triggered)   |
+               +-------------+---------------------------------------------------------+----------------------------------+
+               | Core's CE   | /sys/bus/platform/devices/smpro-errmon.*/error_core_ce  | Core has CE error                |
+               +-------------+---------------------------------------------------------+----------------------------------+
+               | Core's UE   | /sys/bus/platform/devices/smpro-errmon.*/error_core_ue  | Core has UE error                |
+               +-------------+---------------------------------------------------------+----------------------------------+
+               | Memory's CE | /sys/bus/platform/devices/smpro-errmon.*/error_mem_ce   | Memory has CE error              |
+               +-------------+---------------------------------------------------------+----------------------------------+
+               | Memory's UE | /sys/bus/platform/devices/smpro-errmon.*/error_mem_ue   | Memory has UE error              |
+               +-------------+---------------------------------------------------------+----------------------------------+
+               | PCIe's CE   | /sys/bus/platform/devices/smpro-errmon.*/error_pcie_ce  | any PCIe controller has CE error |
+               +-------------+---------------------------------------------------------+----------------------------------+
+               | PCIe's UE   | /sys/bus/platform/devices/smpro-errmon.*/error_pcie_ue  | any PCIe controller has UE error |
+               +-------------+---------------------------------------------------------+----------------------------------+
+               | Other's CE  | /sys/bus/platform/devices/smpro-errmon.*/error_other_ce | any other CE error               |
+               +-------------+---------------------------------------------------------+----------------------------------+
+               | Other's UE  | /sys/bus/platform/devices/smpro-errmon.*/error_other_ue | any other UE error               |
+               +-------------+---------------------------------------------------------+----------------------------------+
+
+               UE: Uncorrectable Error
+               CE: Correctable Error
+
+               For details, see section `3.3 Ampere (Vendor-Specific) Error Record Formats,
+               Altra Family RAS Supplement`.
+
+
+What:          /sys/bus/platform/devices/smpro-errmon.*/overflow_[core|mem|pcie|other]_[ce|ue]
+KernelVersion: 6.1
+Contact:       Quan Nguyen <quan@os.amperecomputing.com>
+Description:
+               (RO) Returns the overflow status of each type of HW error reported:
+
+                 - 0      : No overflow
+                 - 1      : There is an overflow and the oldest HW errors are dropped
+
+               The details of each sysfs entry are as below:
+
+               +-------------+-----------------------------------------------------------+---------------------------------------+
+               |   Overflow  |                   Sysfs entry                             |             Description               |
+               +-------------+-----------------------------------------------------------+---------------------------------------+
+               | Core's CE   | /sys/bus/platform/devices/smpro-errmon.*/overflow_core_ce | Core CE error overflow                |
+               +-------------+-----------------------------------------------------------+---------------------------------------+
+               | Core's UE   | /sys/bus/platform/devices/smpro-errmon.*/overflow_core_ue | Core UE error overflow                |
+               +-------------+-----------------------------------------------------------+---------------------------------------+
+               | Memory's CE | /sys/bus/platform/devices/smpro-errmon.*/overflow_mem_ce  | Memory CE error overflow              |
+               +-------------+-----------------------------------------------------------+---------------------------------------+
+               | Memory's UE | /sys/bus/platform/devices/smpro-errmon.*/overflow_mem_ue  | Memory UE error overflow              |
+               +-------------+-----------------------------------------------------------+---------------------------------------+
+               | PCIe's CE   | /sys/bus/platform/devices/smpro-errmon.*/overflow_pcie_ce | any PCIe controller CE error overflow |
+               +-------------+-----------------------------------------------------------+---------------------------------------+
+               | PCIe's UE   | /sys/bus/platform/devices/smpro-errmon.*/overflow_pcie_ue | any PCIe controller UE error overflow |
+               +-------------+-----------------------------------------------------------+---------------------------------------+
+               | Other's CE  | /sys/bus/platform/devices/smpro-errmon.*/overflow_other_ce| any other CE error overflow           |
+               +-------------+-----------------------------------------------------------+---------------------------------------+
+               | Other's UE  | /sys/bus/platform/devices/smpro-errmon.*/overflow_other_ue| any other UE error overflow           |
+               +-------------+-----------------------------------------------------------+---------------------------------------+
+
+               where:
+
+                 - UE: Uncorrectable Error
+                 - CE: Correctable Error
+
+What:          /sys/bus/platform/devices/smpro-errmon.*/[error|warn]_[smpro|pmpro]
+KernelVersion: 6.1
+Contact:       Quan Nguyen <quan@os.amperecomputing.com>
+Description:
+               (RO) Contains the internal firmware error/warning printed in hex format.
+
+               The details of each sysfs entry are as below:
+
+               +---------------+------------------------------------------------------+--------------------------+
+               |   Error       |                   Sysfs entry                        |        Description       |
+               +---------------+------------------------------------------------------+--------------------------+
+               | SMpro error   | /sys/bus/platform/devices/smpro-errmon.*/error_smpro | system has SMpro error   |
+               +---------------+------------------------------------------------------+--------------------------+
+               | SMpro warning | /sys/bus/platform/devices/smpro-errmon.*/warn_smpro  | system has SMpro warning |
+               +---------------+------------------------------------------------------+--------------------------+
+               | PMpro error   | /sys/bus/platform/devices/smpro-errmon.*/error_pmpro | system has PMpro error   |
+               +---------------+------------------------------------------------------+--------------------------+
+               | PMpro warning | /sys/bus/platform/devices/smpro-errmon.*/warn_pmpro  | system has PMpro warning |
+               +---------------+------------------------------------------------------+--------------------------+
+
+               For details, see section `5.10 RAS Internal Error Register Definitions,
+               Altra Family SoC BMC Interface Specification`.
+
+What:          /sys/bus/platform/devices/smpro-errmon.*/event_[vrd_warn_fault|vrd_hot|dimm_hot]
+KernelVersion: 6.1
+Contact:       Quan Nguyen <quan@os.amperecomputing.com>
+Description:
+               (RO) Contains the detailed information in case of VRD/DIMM warning/hot events
+               in hex format as below::
+
+                   AAAA
+
+               where:
+
+                 - ``AAAA``: The event detail information data
+
+               The details of each sysfs entry are as below:
+
+               +---------------+---------------------------------------------------------------+---------------------+
+               |   Event       |                        Sysfs entry                            |     Description     |
+               +---------------+---------------------------------------------------------------+---------------------+
+               | VRD HOT       | /sys/bus/platform/devices/smpro-errmon.*/event_vrd_hot        | VRD Hot             |
+               +---------------+---------------------------------------------------------------+---------------------+
+               | VR Warn/Fault | /sys/bus/platform/devices/smpro-errmon.*/event_vrd_warn_fault | VR Warning or Fault |
+               +---------------+---------------------------------------------------------------+---------------------+
+               | DIMM HOT      | /sys/bus/platform/devices/smpro-errmon.*/event_dimm_hot       | DIMM Hot            |
+               +---------------+---------------------------------------------------------------+---------------------+
+
+               For more details, see section `5.7 GPI Status Registers,
+               Altra Family SoC BMC Interface Specification`.
+
index 358ad56f65245e2094960dad0e6ee6f52f0110b1..b9ceee949dab8eb121116cc7dc74100e960f22f0 100644 (file)
@@ -176,6 +176,18 @@ config SGI_XP
          this feature will allow for direct communication between SSIs
          based on a network adapter and DMA messaging.
 
+config SMPRO_ERRMON
+       tristate "Ampere Computing SMPro error monitor driver"
+       depends on MFD_SMPRO || COMPILE_TEST
+       help
+         Say Y here to get support for the SMpro error monitor function
+         provided by Ampere Computing's Altra and Altra Max SoCs. Upon
+         loading, the driver creates sysfs files which can be used to gather
+         multiple HW error data reported via read and write system calls.
+
+         To compile this driver as a module, say M here. The driver will be
+         called smpro-errmon.
+
 config CS5535_MFGPT
        tristate "CS5535/CS5536 Geode Multi-Function General Purpose Timer (MFGPT) support"
        depends on MFD_CS5535
index ac9b3e757ba1dfbbd44d5410ab31fd335fba90e9..bbe24d4511a33bb89b6ea2db83b9ebc8d9beb6da 100644 (file)
@@ -23,6 +23,7 @@ obj-$(CONFIG_ENCLOSURE_SERVICES) += enclosure.o
 obj-$(CONFIG_KGDB_TESTS)       += kgdbts.o
 obj-$(CONFIG_SGI_XP)           += sgi-xp/
 obj-$(CONFIG_SGI_GRU)          += sgi-gru/
+obj-$(CONFIG_SMPRO_ERRMON)     += smpro-errmon.o
 obj-$(CONFIG_CS5535_MFGPT)     += cs5535-mfgpt.o
 obj-$(CONFIG_GEHC_ACHC)                += gehc-achc.o
 obj-$(CONFIG_HP_ILO)           += hpilo.o
diff --git a/drivers/misc/smpro-errmon.c b/drivers/misc/smpro-errmon.c
new file mode 100644 (file)
index 0000000..d1431d4
--- /dev/null
@@ -0,0 +1,529 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Ampere Computing SoC's SMpro Error Monitoring Driver
+ *
+ * Copyright (c) 2022, Ampere Computing LLC
+ *
+ */
+
+#include <linux/i2c.h>
+#include <linux/mod_devicetable.h>
+#include <linux/module.h>
+#include <linux/platform_device.h>
+#include <linux/regmap.h>
+
+/* GPI RAS Error Registers */
+#define GPI_RAS_ERR            0x7E
+
+/* Core and L2C Error Registers */
+#define CORE_CE_ERR_CNT                0x80
+#define CORE_CE_ERR_LEN                0x81
+#define CORE_CE_ERR_DATA       0x82
+#define CORE_UE_ERR_CNT                0x83
+#define CORE_UE_ERR_LEN                0x84
+#define CORE_UE_ERR_DATA       0x85
+
+/* Memory Error Registers */
+#define MEM_CE_ERR_CNT         0x90
+#define MEM_CE_ERR_LEN         0x91
+#define MEM_CE_ERR_DATA                0x92
+#define MEM_UE_ERR_CNT         0x93
+#define MEM_UE_ERR_LEN         0x94
+#define MEM_UE_ERR_DATA                0x95
+
+/* RAS Error/Warning Registers */
+#define ERR_SMPRO_TYPE         0xA0
+#define ERR_PMPRO_TYPE         0xA1
+#define ERR_SMPRO_INFO_LO      0xA2
+#define ERR_SMPRO_INFO_HI      0xA3
+#define ERR_SMPRO_DATA_LO      0xA4
+#define ERR_SMPRO_DATA_HI      0xA5
+#define WARN_SMPRO_INFO_LO     0xAA
+#define WARN_SMPRO_INFO_HI     0xAB
+#define ERR_PMPRO_INFO_LO      0xA6
+#define ERR_PMPRO_INFO_HI      0xA7
+#define ERR_PMPRO_DATA_LO      0xA8
+#define ERR_PMPRO_DATA_HI      0xA9
+#define WARN_PMPRO_INFO_LO     0xAC
+#define WARN_PMPRO_INFO_HI     0xAD
+
+/* PCIE Error Registers */
+#define PCIE_CE_ERR_CNT                0xC0
+#define PCIE_CE_ERR_LEN                0xC1
+#define PCIE_CE_ERR_DATA       0xC2
+#define PCIE_UE_ERR_CNT                0xC3
+#define PCIE_UE_ERR_LEN                0xC4
+#define PCIE_UE_ERR_DATA       0xC5
+
+/* Other Error Registers */
+#define OTHER_CE_ERR_CNT       0xD0
+#define OTHER_CE_ERR_LEN       0xD1
+#define OTHER_CE_ERR_DATA      0xD2
+#define OTHER_UE_ERR_CNT       0xD8
+#define OTHER_UE_ERR_LEN       0xD9
+#define OTHER_UE_ERR_DATA      0xDA
+
+/* Event Data Registers */
+#define VRD_WARN_FAULT_EVENT_DATA      0x78
+#define VRD_HOT_EVENT_DATA             0x79
+#define DIMM_HOT_EVENT_DATA            0x7A
+
+#define MAX_READ_BLOCK_LENGTH  48
+
+#define RAS_SMPRO_ERR          0
+#define RAS_PMPRO_ERR          1
+
+enum RAS_48BYTES_ERR_TYPES {   /* Each value is an index into smpro_error_table[] */
+       CORE_CE_ERR,
+       CORE_UE_ERR,
+       MEM_CE_ERR,
+       MEM_UE_ERR,
+       PCIE_CE_ERR,
+       PCIE_UE_ERR,
+       OTHER_CE_ERR,
+       OTHER_UE_ERR,
+       NUM_48BYTES_ERR_TYPE,   /* Number of error types listed above */
+};
+
+struct smpro_error_hdr {
+       u8 count;       /* Register address: number of the RAS errors */
+       u8 len;         /* Register address: number of data bytes */
+       u8 data;        /* Register address: start of 48-byte data */
+       u8 max_cnt;     /* Max num of errors (a plain count, not an address) */
+};
+
+/*
+ * Per error type (indexed by enum RAS_48BYTES_ERR_TYPES): the addresses of
+ * the registers used to read the Count, the Length of data and the Data of
+ * the 48-byte error records, plus the max number of errors for that type.
+ */
+static struct smpro_error_hdr smpro_error_table[] = {
+       [CORE_CE_ERR] = {
+               .count = CORE_CE_ERR_CNT,
+               .len = CORE_CE_ERR_LEN,
+               .data = CORE_CE_ERR_DATA,
+               .max_cnt = 32
+       },
+       [CORE_UE_ERR] = {
+               .count = CORE_UE_ERR_CNT,
+               .len = CORE_UE_ERR_LEN,
+               .data = CORE_UE_ERR_DATA,
+               .max_cnt = 32
+       },
+       [MEM_CE_ERR] = {
+               .count = MEM_CE_ERR_CNT,
+               .len = MEM_CE_ERR_LEN,
+               .data = MEM_CE_ERR_DATA,
+               .max_cnt = 16
+       },
+       [MEM_UE_ERR] = {
+               .count = MEM_UE_ERR_CNT,
+               .len = MEM_UE_ERR_LEN,
+               .data = MEM_UE_ERR_DATA,
+               .max_cnt = 16
+       },
+       [PCIE_CE_ERR] = {
+               .count = PCIE_CE_ERR_CNT,
+               .len = PCIE_CE_ERR_LEN,
+               .data = PCIE_CE_ERR_DATA,
+               .max_cnt = 96
+       },
+       [PCIE_UE_ERR] = {
+               .count = PCIE_UE_ERR_CNT,
+               .len = PCIE_UE_ERR_LEN,
+               .data = PCIE_UE_ERR_DATA,
+               .max_cnt = 96
+       },
+       [OTHER_CE_ERR] = {
+               .count = OTHER_CE_ERR_CNT,
+               .len = OTHER_CE_ERR_LEN,
+               .data = OTHER_CE_ERR_DATA,
+               .max_cnt = 8
+       },
+       [OTHER_UE_ERR] = {
+               .count = OTHER_UE_ERR_CNT,
+               .len = OTHER_UE_ERR_LEN,
+               .data = OTHER_UE_ERR_DATA,
+               .max_cnt = 8
+       },
+};
+
+/*
+ * Addresses of the SCP registers used to read one kind of RAS internal
+ * error: the type/status register, the low and high halves of the error
+ * info and error data, and the low and high halves of the warning info.
+ */
+struct smpro_int_error_hdr {
+       u8 type;
+       u8 info_l;
+       u8 info_h;
+       u8 data_l;
+       u8 data_h;
+       u8 warn_l;
+       u8 warn_h;
+};
+
+/*
+ * Register sets for the two internal error sources, indexed by
+ * RAS_SMPRO_ERR and RAS_PMPRO_ERR.
+ */
+static struct smpro_int_error_hdr list_smpro_int_error_hdr[] = {
+       [RAS_SMPRO_ERR] = {
+               .type = ERR_SMPRO_TYPE,
+               .info_l = ERR_SMPRO_INFO_LO,
+               .info_h = ERR_SMPRO_INFO_HI,
+               .data_l = ERR_SMPRO_DATA_LO,
+               .data_h = ERR_SMPRO_DATA_HI,
+               .warn_l = WARN_SMPRO_INFO_LO,
+               .warn_h = WARN_SMPRO_INFO_HI,
+       },
+       [RAS_PMPRO_ERR] = {
+               .type = ERR_PMPRO_TYPE,
+               .info_l = ERR_PMPRO_INFO_LO,
+               .info_h = ERR_PMPRO_INFO_HI,
+               .data_l = ERR_PMPRO_DATA_LO,
+               .data_h = ERR_PMPRO_DATA_HI,
+               .warn_l = WARN_PMPRO_INFO_LO,
+               .warn_h = WARN_PMPRO_INFO_HI,
+       },
+};
+
+/* Per-device driver state, stored as the platform device's drvdata */
+struct smpro_errmon {
+       struct regmap *regmap;  /* regmap looked up on the parent device in probe */
+};
+
+/* Event kinds; the values are used as indices into smpro_event_table[] */
+enum EVENT_TYPES {
+       VRD_WARN_FAULT_EVENT,
+       VRD_HOT_EVENT,
+       DIMM_HOT_EVENT,
+       NUM_EVENTS_TYPE,
+};
+
+/* Address of the event data register for each event type */
+static u8 smpro_event_table[NUM_EVENTS_TYPE] = {
+       VRD_WARN_FAULT_EVENT_DATA,
+       VRD_HOT_EVENT_DATA,
+       DIMM_HOT_EVENT_DATA,
+};
+
+/*
+ * Read one event data register and print it as hex, clearing the event
+ * when the value is non-zero.
+ *
+ * @channel indexes smpro_event_table[] (enum EVENT_TYPES).
+ *
+ * Returns the number of bytes written to @buf, or the negative error
+ * code from regmap_read().
+ */
+static ssize_t smpro_event_data_read(struct device *dev,
+                                    struct device_attribute *da, char *buf,
+                                    int channel)
+{
+       struct smpro_errmon *errmon = dev_get_drvdata(dev);
+       /* regmap_read() takes an unsigned int * and %04x expects unsigned */
+       unsigned int event_data;
+       int ret;
+
+       ret = regmap_read(errmon->regmap, smpro_event_table[channel], &event_data);
+       if (ret)
+               return ret;
+
+       /*
+        * Clear event after read by writing the value back to the same
+        * register. The write is best effort: its result is deliberately
+        * not checked so that the data already read is still reported.
+        */
+       if (event_data != 0)
+               regmap_write(errmon->regmap, smpro_event_table[channel], event_data);
+
+       return sysfs_emit(buf, "%04x\n", event_data);
+}
+
+/*
+ * Print "1" or "0" depending on whether the overflow bit is set in the
+ * error count register of the given error type.
+ *
+ * @channel indexes smpro_error_table[] (enum RAS_48BYTES_ERR_TYPES).
+ *
+ * Returns the number of bytes written to @buf, or the negative error
+ * code from regmap_read().
+ */
+static ssize_t smpro_overflow_data_read(struct device *dev, struct device_attribute *da,
+                                       char *buf, int channel)
+{
+       struct smpro_errmon *errmon = dev_get_drvdata(dev);
+       struct smpro_error_hdr *err_info;
+       /* regmap_read() takes an unsigned int * */
+       unsigned int err_count;
+       int ret;
+
+       err_info = &smpro_error_table[channel];
+
+       ret = regmap_read(errmon->regmap, err_info->count, &err_count);
+       if (ret)
+               return ret;
+
+       /* Bit 8 indicates the overflow status */
+       return sysfs_emit(buf, "%d\n", (err_count & BIT(8)) ? 1 : 0);
+}
+
+/*
+ * Read, clear and print one 48-byte error record of the given type.
+ *
+ * @channel indexes smpro_error_table[] (enum RAS_48BYTES_ERR_TYPES).
+ *
+ * Returns the number of bytes written to @buf, 0 (empty output) when the
+ * error count is zero or above max_cnt or when the reported length is
+ * zero, or a negative error code from the regmap accessors.
+ */
+static ssize_t smpro_error_data_read(struct device *dev, struct device_attribute *da,
+                                    char *buf, int channel)
+{
+       struct smpro_errmon *errmon = dev_get_drvdata(dev);
+       unsigned char err_data[MAX_READ_BLOCK_LENGTH];
+       struct smpro_error_hdr *err_info;
+       /* regmap_read() takes an unsigned int * */
+       unsigned int err_count, err_length;
+       int ret;
+
+       err_info = &smpro_error_table[channel];
+
+       /*
+        * Check the read status before touching err_count: on failure the
+        * variable has not been written and must not be used.
+        */
+       ret = regmap_read(errmon->regmap, err_info->count, &err_count);
+       if (ret)
+               return ret;
+
+       /* Error count is the low byte */
+       err_count &= 0xff;
+       if (!err_count || err_count > err_info->max_cnt)
+               return 0;
+
+       ret = regmap_read(errmon->regmap, err_info->len, &err_length);
+       if (ret)
+               return ret;
+       if (!err_length)
+               return 0;
+
+       /* Never read more than the local buffer can hold */
+       if (err_length > MAX_READ_BLOCK_LENGTH)
+               err_length = MAX_READ_BLOCK_LENGTH;
+
+       /* Zero-fill so a short record is padded with zeroes in the output */
+       memset(err_data, 0x00, MAX_READ_BLOCK_LENGTH);
+       ret = regmap_noinc_read(errmon->regmap, err_info->data, err_data, err_length);
+       if (ret < 0)
+               return ret;
+
+       /* clear the error */
+       ret = regmap_write(errmon->regmap, err_info->count, 0x100);
+       if (ret)
+               return ret;
+       /*
+        * The output of Core/Memory/PCIe/Others UE/CE errors follows the format
+        * specified in section 5.8.1 CE/UE Error Data record in
+        * Altra SOC BMC Interface specification.
+        */
+       return sysfs_emit(buf, "%*phN\n", MAX_READ_BLOCK_LENGTH, err_data);
+}
+
+/*
+ * Report and clear a pending RAS internal error for one source.
+ *
+ * @channel is RAS_SMPRO_ERR or RAS_PMPRO_ERR and selects both the status
+ * bit checked in GPI_RAS_ERR and the register set in
+ * list_smpro_int_error_hdr[].
+ *
+ * Output: the raw bytes of a four-element unsigned int array, printed as
+ * hex in memory order. The elements are, in order: error info (high),
+ * error info (low), error data (high), error data (low). The data
+ * elements stay zero unless the pending error is of the "with data" kind.
+ * NOTE(review): the amount of output depends on sizeof(unsigned int) and
+ * the byte order on the host endianness - confirm this matches the
+ * documented ABI.
+ *
+ * Returns the number of bytes written to @buf, 0 (empty output) when no
+ * error is pending, or a negative error code from the regmap accessors.
+ *
+ * Reference to section 5.10 RAS Internal Error Register Definition in
+ * Altra SOC BMC Interface specification
+ */
+static ssize_t smpro_internal_err_read(struct device *dev, struct device_attribute *da,
+                                      char *buf, int channel)
+{
+       struct smpro_errmon *errmon = dev_get_drvdata(dev);
+       struct smpro_int_error_hdr *regs;
+       unsigned int record[4] = { 0 };
+       unsigned int pending;
+       unsigned int status;
+       int rc;
+
+       /* Is anything flagged for this source at all? */
+       rc = regmap_read(errmon->regmap, GPI_RAS_ERR, &status);
+       if (rc)
+               return rc;
+
+       if (channel == RAS_SMPRO_ERR && !(status & BIT(0)))
+               return 0;
+       if (channel == RAS_PMPRO_ERR && !(status & BIT(1)))
+               return 0;
+
+       regs = &list_smpro_int_error_hdr[channel];
+       rc = regmap_read(errmon->regmap, regs->type, &status);
+       if (rc)
+               return rc;
+
+       /* Bit 1 takes precedence over bit 2; only one is handled per read */
+       if (status & BIT(1))
+               pending = BIT(1);
+       else if (status & BIT(2))
+               pending = BIT(2);
+       else
+               return 0;
+
+       rc = regmap_read(errmon->regmap, regs->info_l, &record[1]);
+       if (rc)
+               return rc;
+
+       rc = regmap_read(errmon->regmap, regs->info_h, &record[0]);
+       if (rc)
+               return rc;
+
+       if (pending == BIT(2)) {
+               /* Error with data type */
+               rc = regmap_read(errmon->regmap, regs->data_l, &record[3]);
+               if (rc)
+                       return rc;
+
+               rc = regmap_read(errmon->regmap, regs->data_h, &record[2]);
+               if (rc)
+                       return rc;
+       }
+
+       /* Acknowledge only the error bit that was just read out */
+       rc = regmap_write(errmon->regmap, regs->type, pending);
+       if (rc)
+               return rc;
+
+       return sysfs_emit(buf, "%*phN\n", (int)sizeof(record), record);
+}
+
+/*
+ * Report and clear a pending RAS internal warning for one source.
+ *
+ * @channel is RAS_SMPRO_ERR or RAS_PMPRO_ERR and selects both the status
+ * bit checked in GPI_RAS_ERR and the register set in
+ * list_smpro_int_error_hdr[].
+ *
+ * Output: the raw bytes of a two-element unsigned int array, printed as
+ * hex in memory order: warning info (high) followed by warning info (low).
+ *
+ * Returns the number of bytes written to @buf, 0 (empty output) when no
+ * warning is pending, or a negative error code from the regmap accessors.
+ *
+ * Reference to section 5.10 RAS Internal Error Register Definition in
+ * Altra SOC BMC Interface specification
+ */
+static ssize_t smpro_internal_warn_read(struct device *dev, struct device_attribute *da,
+                                       char *buf, int channel)
+{
+       struct smpro_errmon *errmon = dev_get_drvdata(dev);
+       struct smpro_int_error_hdr *regs;
+       unsigned int words[2] = { 0 };
+       unsigned int status;
+       int rc;
+
+       /* Is anything flagged for this source at all? */
+       rc = regmap_read(errmon->regmap, GPI_RAS_ERR, &status);
+       if (rc)
+               return rc;
+
+       if (channel == RAS_SMPRO_ERR && !(status & BIT(0)))
+               return 0;
+       if (channel == RAS_PMPRO_ERR && !(status & BIT(1)))
+               return 0;
+
+       regs = &list_smpro_int_error_hdr[channel];
+       rc = regmap_read(errmon->regmap, regs->type, &status);
+       if (rc)
+               return rc;
+
+       /* Bit 0 of the type register marks a pending warning */
+       if (!(status & BIT(0)))
+               return 0;
+
+       rc = regmap_read(errmon->regmap, regs->warn_l, &words[1]);
+       if (rc)
+               return rc;
+
+       rc = regmap_read(errmon->regmap, regs->warn_h, &words[0]);
+       if (rc)
+               return rc;
+
+       /* clear the warning */
+       rc = regmap_write(errmon->regmap, regs->type, BIT(0));
+       if (rc)
+               return rc;
+
+       return sysfs_emit(buf, "%*phN\n", (int)sizeof(words), words);
+}
+
+/*
+ * Define overflow_<_error>_show(), which forwards to
+ * smpro_overflow_data_read() with @_index, and declare the matching
+ * read-only device attribute overflow_<_error>.
+ */
+#define ERROR_OVERFLOW_RO(_error, _index) \
+       static ssize_t overflow_##_error##_show(struct device *dev,            \
+                                               struct device_attribute *da,   \
+                                               char *buf)                     \
+       {                                                                      \
+               return smpro_overflow_data_read(dev, da, buf, _index);         \
+       }                                                                      \
+       static DEVICE_ATTR_RO(overflow_##_error)
+
+ERROR_OVERFLOW_RO(core_ce, CORE_CE_ERR);
+ERROR_OVERFLOW_RO(core_ue, CORE_UE_ERR);
+ERROR_OVERFLOW_RO(mem_ce, MEM_CE_ERR);
+ERROR_OVERFLOW_RO(mem_ue, MEM_UE_ERR);
+ERROR_OVERFLOW_RO(pcie_ce, PCIE_CE_ERR);
+ERROR_OVERFLOW_RO(pcie_ue, PCIE_UE_ERR);
+ERROR_OVERFLOW_RO(other_ce, OTHER_CE_ERR);
+ERROR_OVERFLOW_RO(other_ue, OTHER_UE_ERR);
+
+/*
+ * Define error_<_error>_show(), which forwards to smpro_error_data_read()
+ * with @_index, and declare the matching read-only device attribute
+ * error_<_error>.
+ */
+#define ERROR_RO(_error, _index) \
+       static ssize_t error_##_error##_show(struct device *dev,            \
+                                            struct device_attribute *da,   \
+                                            char *buf)                     \
+       {                                                                   \
+               return smpro_error_data_read(dev, da, buf, _index);         \
+       }                                                                   \
+       static DEVICE_ATTR_RO(error_##_error)
+
+ERROR_RO(core_ce, CORE_CE_ERR);
+ERROR_RO(core_ue, CORE_UE_ERR);
+ERROR_RO(mem_ce, MEM_CE_ERR);
+ERROR_RO(mem_ue, MEM_UE_ERR);
+ERROR_RO(pcie_ce, PCIE_CE_ERR);
+ERROR_RO(pcie_ue, PCIE_UE_ERR);
+ERROR_RO(other_ce, OTHER_CE_ERR);
+ERROR_RO(other_ue, OTHER_UE_ERR);
+
+/* sysfs show handler for error_smpro: internal error of the RAS_SMPRO_ERR source */
+static ssize_t error_smpro_show(struct device *dev, struct device_attribute *da, char *buf)
+{
+       return smpro_internal_err_read(dev, da, buf, RAS_SMPRO_ERR);
+}
+static DEVICE_ATTR_RO(error_smpro);
+
+/* sysfs show handler for error_pmpro: internal error of the RAS_PMPRO_ERR source */
+static ssize_t error_pmpro_show(struct device *dev, struct device_attribute *da, char *buf)
+{
+       return smpro_internal_err_read(dev, da, buf, RAS_PMPRO_ERR);
+}
+static DEVICE_ATTR_RO(error_pmpro);
+
+/* sysfs show handler for warn_smpro: internal warning of the RAS_SMPRO_ERR source */
+static ssize_t warn_smpro_show(struct device *dev, struct device_attribute *da, char *buf)
+{
+       return smpro_internal_warn_read(dev, da, buf, RAS_SMPRO_ERR);
+}
+static DEVICE_ATTR_RO(warn_smpro);
+
+/* sysfs show handler for warn_pmpro: internal warning of the RAS_PMPRO_ERR source */
+static ssize_t warn_pmpro_show(struct device *dev, struct device_attribute *da, char *buf)
+{
+       return smpro_internal_warn_read(dev, da, buf, RAS_PMPRO_ERR);
+}
+static DEVICE_ATTR_RO(warn_pmpro);
+
+/*
+ * Define event_<_event>_show(), which forwards to smpro_event_data_read()
+ * with @_index, and declare the matching read-only device attribute
+ * event_<_event>.
+ */
+#define EVENT_RO(_event, _index) \
+       static ssize_t event_##_event##_show(struct device *dev,            \
+                                            struct device_attribute *da,   \
+                                            char *buf)                     \
+       {                                                                   \
+               return smpro_event_data_read(dev, da, buf, _index);         \
+       }                                                                   \
+       static DEVICE_ATTR_RO(event_##_event)
+
+EVENT_RO(vrd_warn_fault, VRD_WARN_FAULT_EVENT);
+EVENT_RO(vrd_hot, VRD_HOT_EVENT);
+EVENT_RO(dimm_hot, DIMM_HOT_EVENT);
+
+/*
+ * All sysfs attributes exposed by this driver; the list is NULL
+ * terminated and turned into smpro_errmon_groups by ATTRIBUTE_GROUPS().
+ */
+static struct attribute *smpro_errmon_attrs[] = {
+       &dev_attr_overflow_core_ce.attr,
+       &dev_attr_overflow_core_ue.attr,
+       &dev_attr_overflow_mem_ce.attr,
+       &dev_attr_overflow_mem_ue.attr,
+       &dev_attr_overflow_pcie_ce.attr,
+       &dev_attr_overflow_pcie_ue.attr,
+       &dev_attr_overflow_other_ce.attr,
+       &dev_attr_overflow_other_ue.attr,
+       &dev_attr_error_core_ce.attr,
+       &dev_attr_error_core_ue.attr,
+       &dev_attr_error_mem_ce.attr,
+       &dev_attr_error_mem_ue.attr,
+       &dev_attr_error_pcie_ce.attr,
+       &dev_attr_error_pcie_ue.attr,
+       &dev_attr_error_other_ce.attr,
+       &dev_attr_error_other_ue.attr,
+       &dev_attr_error_smpro.attr,
+       &dev_attr_error_pmpro.attr,
+       &dev_attr_warn_smpro.attr,
+       &dev_attr_warn_pmpro.attr,
+       &dev_attr_event_vrd_warn_fault.attr,
+       &dev_attr_event_vrd_hot.attr,
+       &dev_attr_event_dimm_hot.attr,
+       NULL
+};
+
+ATTRIBUTE_GROUPS(smpro_errmon);
+
+/*
+ * Allocate the per-device state, attach it as driver data and look up
+ * the regmap on the parent device.
+ *
+ * Returns 0 on success, -ENOMEM if the allocation fails, or -ENODEV if
+ * the parent device provides no regmap.
+ */
+static int smpro_errmon_probe(struct platform_device *pdev)
+{
+       struct device *dev = &pdev->dev;
+       struct smpro_errmon *priv;
+
+       priv = devm_kzalloc(dev, sizeof(*priv), GFP_KERNEL);
+       if (!priv)
+               return -ENOMEM;
+
+       platform_set_drvdata(pdev, priv);
+
+       priv->regmap = dev_get_regmap(dev->parent, NULL);
+
+       return priv->regmap ? 0 : -ENODEV;
+}
+
+/*
+ * Platform driver bound by the name "smpro-errmon"; the sysfs attributes
+ * are attached through .dev_groups.
+ */
+static struct platform_driver smpro_errmon_driver = {
+       .probe          = smpro_errmon_probe,
+       .driver = {
+               .name   = "smpro-errmon",
+               .dev_groups = smpro_errmon_groups,
+       },
+};
+
+module_platform_driver(smpro_errmon_driver);
+
+/* Module metadata */
+MODULE_AUTHOR("Tung Nguyen <tung.nguyen@amperecomputing.com>");
+MODULE_AUTHOR("Thinh Pham <thinh.pham@amperecomputing.com>");
+MODULE_AUTHOR("Hoang Nguyen <hnguyen@amperecomputing.com>");
+MODULE_AUTHOR("Thu Nguyen <thu@os.amperecomputing.com>");
+MODULE_AUTHOR("Quan Nguyen <quan@os.amperecomputing.com>");
+MODULE_DESCRIPTION("Ampere Altra SMpro driver");
+MODULE_LICENSE("GPL");