misc: smpro-errmon: Add Ampere's SMpro error monitor driver
Add Ampere's SMpro error monitor driver for monitoring and reporting RAS-related errors as reported by SMpro co-processor found on Ampere's Altra processor family. Signed-off-by: Quan Nguyen <quan@os.amperecomputing.com> Link: https://lore.kernel.org/r/20221031024442.2490881-3-quan@os.amperecomputing.com Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
This commit is contained in:
parent
c08645ea21
commit
4a4a4e9eba
@ -0,0 +1,264 @@
|
||||
What: /sys/bus/platform/devices/smpro-errmon.*/error_[core|mem|pcie|other]_[ce|ue]
|
||||
KernelVersion: 6.1
|
||||
Contact: Quan Nguyen <quan@os.amperecomputing.com>
|
||||
Description:
|
||||
(RO) Contains the 48-byte Ampere (Vendor-Specific) Error Record printed
|
||||
in hex format according to the table below:
|
||||
|
||||
+--------+---------------+-------------+------------------------------------------------------------+
|
||||
| Offset | Field | Size (byte) | Description |
|
||||
+--------+---------------+-------------+------------------------------------------------------------+
|
||||
| 00 | Error Type | 1 | See :ref:`the table below <smpro-error-types>` for details |
|
||||
+--------+---------------+-------------+------------------------------------------------------------+
|
||||
| 01 | Subtype | 1 | See :ref:`the table below <smpro-error-types>` for details |
|
||||
+--------+---------------+-------------+------------------------------------------------------------+
|
||||
| 02 | Instance | 2 | See :ref:`the table below <smpro-error-types>` for details |
|
||||
+--------+---------------+-------------+------------------------------------------------------------+
|
||||
| 04 | Error status | 4 | See ARM RAS specification for details |
|
||||
+--------+---------------+-------------+------------------------------------------------------------+
|
||||
| 08 | Error Address | 8 | See ARM RAS specification for details |
|
||||
+--------+---------------+-------------+------------------------------------------------------------+
|
||||
| 16 | Error Misc 0 | 8 | See ARM RAS specification for details |
|
||||
+--------+---------------+-------------+------------------------------------------------------------+
|
||||
| 24 | Error Misc 1 | 8 | See ARM RAS specification for details |
|
||||
+--------+---------------+-------------+------------------------------------------------------------+
|
||||
| 32 | Error Misc 2 | 8 | See ARM RAS specification for details |
|
||||
+--------+---------------+-------------+------------------------------------------------------------+
|
||||
| 40 | Error Misc 3 | 8 | See ARM RAS specification for details |
|
||||
+--------+---------------+-------------+------------------------------------------------------------+
|
||||
|
||||
The table below defines the value of error types, their subtype, subcomponent and instance:
|
||||
|
||||
.. _smpro-error-types:
|
||||
|
||||
+-----------------+------------+----------+----------------+----------------------------------------+
|
||||
| Error Group | Error Type | Sub type | Sub component | Instance |
|
||||
+-----------------+------------+----------+----------------+----------------------------------------+
|
||||
| CPM (core) | 0 | 0 | Snoop-Logic | CPM # |
|
||||
+-----------------+------------+----------+----------------+----------------------------------------+
|
||||
| CPM (core) | 0 | 2 | Armv8 Core 1 | CPM # |
|
||||
+-----------------+------------+----------+----------------+----------------------------------------+
|
||||
| MCU (mem) | 1 | 1 | ERR1 | MCU # \| SLOT << 11 |
|
||||
+-----------------+------------+----------+----------------+----------------------------------------+
|
||||
| MCU (mem) | 1 | 2 | ERR2 | MCU # \| SLOT << 11 |
|
||||
+-----------------+------------+----------+----------------+----------------------------------------+
|
||||
| MCU (mem) | 1 | 3 | ERR3 | MCU # |
|
||||
+-----------------+------------+----------+----------------+----------------------------------------+
|
||||
| MCU (mem) | 1 | 4 | ERR4 | MCU # |
|
||||
+-----------------+------------+----------+----------------+----------------------------------------+
|
||||
| MCU (mem) | 1 | 5 | ERR5 | MCU # |
|
||||
+-----------------+------------+----------+----------------+----------------------------------------+
|
||||
| MCU (mem) | 1 | 6 | ERR6 | MCU # |
|
||||
+-----------------+------------+----------+----------------+----------------------------------------+
|
||||
| MCU (mem) | 1 | 7 | Link Error | MCU # |
|
||||
+-----------------+------------+----------+----------------+----------------------------------------+
|
||||
| Mesh (other) | 2 | 0 | Cross Point | X \| (Y << 5) \| NS <<11 |
|
||||
+-----------------+------------+----------+----------------+----------------------------------------+
|
||||
| Mesh (other) | 2 | 1 | Home Node(IO) | X \| (Y << 5) \| NS <<11 |
|
||||
+-----------------+------------+----------+----------------+----------------------------------------+
|
||||
| Mesh (other) | 2 | 2 | Home Node(Mem) | X \| (Y << 5) \| NS <<11 \| device<<12 |
|
||||
+-----------------+------------+----------+----------------+----------------------------------------+
|
||||
| Mesh (other) | 2 | 4 | CCIX Node | X \| (Y << 5) \| NS <<11 |
|
||||
+-----------------+------------+----------+----------------+----------------------------------------+
|
||||
| 2P Link (other) | 3 | 0 | N/A | Altra 2P Link # |
|
||||
+-----------------+------------+----------+----------------+----------------------------------------+
|
||||
| GIC (other) | 5 | 0 | ERR0 | 0 |
|
||||
+-----------------+------------+----------+----------------+----------------------------------------+
|
||||
| GIC (other) | 5 | 1 | ERR1 | 0 |
|
||||
+-----------------+------------+----------+----------------+----------------------------------------+
|
||||
| GIC (other) | 5 | 2 | ERR2 | 0 |
|
||||
+-----------------+------------+----------+----------------+----------------------------------------+
|
||||
| GIC (other) | 5 | 3 | ERR3 | 0 |
|
||||
+-----------------+------------+----------+----------------+----------------------------------------+
|
||||
| GIC (other) | 5 | 4 | ERR4 | 0 |
|
||||
+-----------------+------------+----------+----------------+----------------------------------------+
|
||||
| GIC (other) | 5 | 5 | ERR5 | 0 |
|
||||
+-----------------+------------+----------+----------------+----------------------------------------+
|
||||
| GIC (other) | 5 | 6 | ERR6 | 0 |
|
||||
+-----------------+------------+----------+----------------+----------------------------------------+
|
||||
| GIC (other) | 5 | 7 | ERR7 | 0 |
|
||||
+-----------------+------------+----------+----------------+----------------------------------------+
|
||||
| GIC (other) | 5 | 8 | ERR8 | 0 |
|
||||
+-----------------+------------+----------+----------------+----------------------------------------+
|
||||
| GIC (other) | 5 | 9 | ERR9 | 0 |
|
||||
+-----------------+------------+----------+----------------+----------------------------------------+
|
||||
| GIC (other) | 5 | 10 | ERR10 | 0 |
|
||||
+-----------------+------------+----------+----------------+----------------------------------------+
|
||||
| GIC (other) | 5 | 11 | ERR11 | 0 |
|
||||
+-----------------+------------+----------+----------------+----------------------------------------+
|
||||
| GIC (other) | 5 | 12 | ERR12 | 0 |
|
||||
+-----------------+------------+----------+----------------+----------------------------------------+
|
||||
| GIC (other) | 5 | 13-21 | ERR13 | RC # + 1 |
|
||||
+-----------------+------------+----------+----------------+----------------------------------------+
|
||||
| SMMU (other) | 6 | TCU | 100 | RC # |
|
||||
+-----------------+------------+----------+----------------+----------------------------------------+
|
||||
| SMMU (other) | 6 | TBU0 | 0 | RC # |
|
||||
+-----------------+------------+----------+----------------+----------------------------------------+
|
||||
| SMMU (other) | 6 | TBU1 | 1 | RC # |
|
||||
+-----------------+------------+----------+----------------+----------------------------------------+
|
||||
| SMMU (other) | 6 | TBU2 | 2 | RC # |
|
||||
+-----------------+------------+----------+----------------+----------------------------------------+
|
||||
| SMMU (other) | 6 | TBU3 | 3 | RC # |
|
||||
+-----------------+------------+----------+----------------+----------------------------------------+
|
||||
| SMMU (other) | 6 | TBU4 | 4 | RC # |
|
||||
+-----------------+------------+----------+----------------+----------------------------------------+
|
||||
| SMMU (other) | 6 | TBU5 | 5 | RC # |
|
||||
+-----------------+------------+----------+----------------+----------------------------------------+
|
||||
| SMMU (other) | 6 | TBU6 | 6 | RC # |
|
||||
+-----------------+------------+----------+----------------+----------------------------------------+
|
||||
| SMMU (other) | 6 | TBU7 | 7 | RC # |
|
||||
+-----------------+------------+----------+----------------+----------------------------------------+
|
||||
| SMMU (other) | 6 | TBU8 | 8 | RC # |
|
||||
+-----------------+------------+----------+----------------+----------------------------------------+
|
||||
| SMMU (other) | 6 | TBU9 | 9 | RC # |
|
||||
+-----------------+------------+----------+----------------+----------------------------------------+
|
||||
| PCIe AER (pcie) | 7 | Root | 0 | RC # |
|
||||
+-----------------+------------+----------+----------------+----------------------------------------+
|
||||
| PCIe AER (pcie) | 7 | Device | 1 | RC # |
|
||||
+-----------------+------------+----------+----------------+----------------------------------------+
|
||||
| PCIe RC (pcie) | 8 | RCA HB | 0 | RC # |
|
||||
+-----------------+------------+----------+----------------+----------------------------------------+
|
||||
| PCIe RC (pcie) | 8 | RCB HB | 1 | RC # |
|
||||
+-----------------+------------+----------+----------------+----------------------------------------+
|
||||
| PCIe RC (pcie) | 8 | RASDP | 8 | RC # |
|
||||
+-----------------+------------+----------+----------------+----------------------------------------+
|
||||
| OCM (other) | 9 | ERR0 | 0 | 0 |
|
||||
+-----------------+------------+----------+----------------+----------------------------------------+
|
||||
| OCM (other) | 9 | ERR1 | 1 | 0 |
|
||||
+-----------------+------------+----------+----------------+----------------------------------------+
|
||||
| OCM (other) | 9 | ERR2 | 2 | 0 |
|
||||
+-----------------+------------+----------+----------------+----------------------------------------+
|
||||
| SMpro (other) | 10 | ERR0 | 0 | 0 |
|
||||
+-----------------+------------+----------+----------------+----------------------------------------+
|
||||
| SMpro (other) | 10 | ERR1 | 1 | 0 |
|
||||
+-----------------+------------+----------+----------------+----------------------------------------+
|
||||
| SMpro (other) | 10 | MPA_ERR | 2 | 0 |
|
||||
+-----------------+------------+----------+----------------+----------------------------------------+
|
||||
| PMpro (other) | 11 | ERR0 | 0 | 0 |
|
||||
+-----------------+------------+----------+----------------+----------------------------------------+
|
||||
| PMpro (other) | 11 | ERR1 | 1 | 0 |
|
||||
+-----------------+------------+----------+----------------+----------------------------------------+
|
||||
| PMpro (other) | 11 | MPA_ERR | 2 | 0 |
|
||||
+-----------------+------------+----------+----------------+----------------------------------------+
|
||||
|
||||
Example::
|
||||
|
||||
# cat error_other_ue
|
||||
880807001e004010401040101500000001004010401040100c0000000000000000000000000000000000000000000000
|
||||
|
||||
The details of each sysfs entry are as below:
|
||||
|
||||
+-------------+---------------------------------------------------------+----------------------------------+
|
||||
| Error | Sysfs entry | Description (when triggered) |
|
||||
+-------------+---------------------------------------------------------+----------------------------------+
|
||||
| Core's CE | /sys/bus/platform/devices/smpro-errmon.*/error_core_ce | Core has CE error |
|
||||
+-------------+---------------------------------------------------------+----------------------------------+
|
||||
| Core's UE | /sys/bus/platform/devices/smpro-errmon.*/error_core_ue | Core has UE error |
|
||||
+-------------+---------------------------------------------------------+----------------------------------+
|
||||
| Memory's CE | /sys/bus/platform/devices/smpro-errmon.*/error_mem_ce | Memory has CE error |
|
||||
+-------------+---------------------------------------------------------+----------------------------------+
|
||||
| Memory's UE | /sys/bus/platform/devices/smpro-errmon.*/error_mem_ue | Memory has UE error |
|
||||
+-------------+---------------------------------------------------------+----------------------------------+
|
||||
| PCIe's CE | /sys/bus/platform/devices/smpro-errmon.*/error_pcie_ce | any PCIe controller has CE error |
|
||||
+-------------+---------------------------------------------------------+----------------------------------+
|
||||
| PCIe's UE | /sys/bus/platform/devices/smpro-errmon.*/error_pcie_ue | any PCIe controller has UE error |
|
||||
+-------------+---------------------------------------------------------+----------------------------------+
|
||||
| Other's CE | /sys/bus/platform/devices/smpro-errmon.*/error_other_ce | any other CE error |
|
||||
+-------------+---------------------------------------------------------+----------------------------------+
|
||||
| Other's UE | /sys/bus/platform/devices/smpro-errmon.*/error_other_ue | any other UE error |
|
||||
+-------------+---------------------------------------------------------+----------------------------------+
|
||||
|
||||
UE: Uncorrectable Error
|
||||
CE: Correctable Error
|
||||
|
||||
For details, see section `3.3 Ampere (Vendor-Specific) Error Record Formats,
|
||||
Altra Family RAS Supplement`.
|
||||
|
||||
|
||||
What: /sys/bus/platform/devices/smpro-errmon.*/overflow_[core|mem|pcie|other]_[ce|ue]
|
||||
KernelVersion: 6.1
|
||||
Contact: Quan Nguyen <quan@os.amperecomputing.com>
|
||||
Description:
|
||||
(RO) Returns the overflow status for each type of HW error reported:
|
||||
|
||||
- 0 : No overflow
|
||||
- 1 : There is an overflow and the oldest HW errors are dropped
|
||||
|
||||
The details of each sysfs entry are as below:
|
||||
|
||||
+-------------+-----------------------------------------------------------+---------------------------------------+
|
||||
| Overflow | Sysfs entry | Description |
|
||||
+-------------+-----------------------------------------------------------+---------------------------------------+
|
||||
| Core's CE | /sys/bus/platform/devices/smpro-errmon.*/overflow_core_ce | Core CE error overflow |
|
||||
+-------------+-----------------------------------------------------------+---------------------------------------+
|
||||
| Core's UE | /sys/bus/platform/devices/smpro-errmon.*/overflow_core_ue | Core UE error overflow |
|
||||
+-------------+-----------------------------------------------------------+---------------------------------------+
|
||||
| Memory's CE | /sys/bus/platform/devices/smpro-errmon.*/overflow_mem_ce | Memory CE error overflow |
|
||||
+-------------+-----------------------------------------------------------+---------------------------------------+
|
||||
| Memory's UE | /sys/bus/platform/devices/smpro-errmon.*/overflow_mem_ue | Memory UE error overflow |
|
||||
+-------------+-----------------------------------------------------------+---------------------------------------+
|
||||
| PCIe's CE | /sys/bus/platform/devices/smpro-errmon.*/overflow_pcie_ce | any PCIe controller CE error overflow |
|
||||
+-------------+-----------------------------------------------------------+---------------------------------------+
|
||||
| PCIe's UE | /sys/bus/platform/devices/smpro-errmon.*/overflow_pcie_ue | any PCIe controller UE error overflow |
|
||||
+-------------+-----------------------------------------------------------+---------------------------------------+
|
||||
| Other's CE | /sys/bus/platform/devices/smpro-errmon.*/overflow_other_ce| any other CE error overflow |
|
||||
+-------------+-----------------------------------------------------------+---------------------------------------+
|
||||
| Other's UE | /sys/bus/platform/devices/smpro-errmon.*/overflow_other_ue| any other UE error overflow |
|
||||
+-------------+-----------------------------------------------------------+---------------------------------------+
|
||||
|
||||
where:
|
||||
|
||||
- UE: Uncorrectable Error
|
||||
- CE: Correctable Error
|
||||
|
||||
What: /sys/bus/platform/devices/smpro-errmon.*/[error|warn]_[smpro|pmpro]
|
||||
KernelVersion: 6.1
|
||||
Contact: Quan Nguyen <quan@os.amperecomputing.com>
|
||||
Description:
|
||||
(RO) Contains the internal firmware error/warning printed as hex format.
|
||||
|
||||
The details of each sysfs entry are as below:
|
||||
|
||||
+---------------+------------------------------------------------------+--------------------------+
|
||||
| Error | Sysfs entry | Description |
|
||||
+---------------+------------------------------------------------------+--------------------------+
|
||||
| SMpro error | /sys/bus/platform/devices/smpro-errmon.*/error_smpro | system has SMpro error |
|
||||
+---------------+------------------------------------------------------+--------------------------+
|
||||
| SMpro warning | /sys/bus/platform/devices/smpro-errmon.*/warn_smpro | system has SMpro warning |
|
||||
+---------------+------------------------------------------------------+--------------------------+
|
||||
| PMpro error | /sys/bus/platform/devices/smpro-errmon.*/error_pmpro | system has PMpro error |
|
||||
+---------------+------------------------------------------------------+--------------------------+
|
||||
| PMpro warning | /sys/bus/platform/devices/smpro-errmon.*/warn_pmpro | system has PMpro warning |
|
||||
+---------------+------------------------------------------------------+--------------------------+
|
||||
|
||||
For details, see section `5.10 RAS Internal Error Register Definitions,
|
||||
Altra Family SoC BMC Interface Specification`.
|
||||
|
||||
What: /sys/bus/platform/devices/smpro-errmon.*/event_[vrd_warn_fault|vrd_hot|dimm_hot]
|
||||
KernelVersion: 6.1
|
||||
Contact: Quan Nguyen <quan@os.amperecomputing.com>
|
||||
Description:
|
||||
(RO) Contains the detailed information in case of VRD/DIMM warning/hot events
|
||||
in hex format as below::
|
||||
|
||||
AAAA
|
||||
|
||||
where:
|
||||
|
||||
- ``AAAA``: The event detail information data
|
||||
|
||||
The details of each sysfs entry are as below:
|
||||
|
||||
+---------------+---------------------------------------------------------------+---------------------+
|
||||
| Event | Sysfs entry | Description |
|
||||
+---------------+---------------------------------------------------------------+---------------------+
|
||||
| VRD HOT | /sys/bus/platform/devices/smpro-errmon.*/event_vrd_hot | VRD Hot |
|
||||
+---------------+---------------------------------------------------------------+---------------------+
|
||||
| VR Warn/Fault | /sys/bus/platform/devices/smpro-errmon.*/event_vrd_warn_fault | VR Warning or Fault |
|
||||
+---------------+---------------------------------------------------------------+---------------------+
|
||||
| DIMM HOT | /sys/bus/platform/devices/smpro-errmon.*/event_dimm_hot | DIMM Hot |
|
||||
+---------------+---------------------------------------------------------------+---------------------+
|
||||
|
||||
For more details, see section `5.7 GPI Status Registers,
|
||||
Altra Family SoC BMC Interface Specification`.
|
||||
|
@ -176,6 +176,18 @@ config SGI_XP
|
||||
this feature will allow for direct communication between SSIs
|
||||
based on a network adapter and DMA messaging.
|
||||
|
||||
config SMPRO_ERRMON
|
||||
tristate "Ampere Computing SMPro error monitor driver"
|
||||
depends on MFD_SMPRO || COMPILE_TEST
|
||||
help
|
||||
Say Y here to get support for the SMpro error monitor function
|
||||
provided by Ampere Computing's Altra and Altra Max SoCs. Upon
|
||||
loading, the driver creates sysfs files which can be used to gather
|
||||
multiple HW error data reported via read and write system calls.
|
||||
|
||||
To compile this driver as a module, say M here. The driver will be
|
||||
called smpro-errmon.
|
||||
|
||||
config CS5535_MFGPT
|
||||
tristate "CS5535/CS5536 Geode Multi-Function General Purpose Timer (MFGPT) support"
|
||||
depends on MFD_CS5535
|
||||
|
@ -23,6 +23,7 @@ obj-$(CONFIG_ENCLOSURE_SERVICES) += enclosure.o
|
||||
obj-$(CONFIG_KGDB_TESTS) += kgdbts.o
|
||||
obj-$(CONFIG_SGI_XP) += sgi-xp/
|
||||
obj-$(CONFIG_SGI_GRU) += sgi-gru/
|
||||
obj-$(CONFIG_SMPRO_ERRMON) += smpro-errmon.o
|
||||
obj-$(CONFIG_CS5535_MFGPT) += cs5535-mfgpt.o
|
||||
obj-$(CONFIG_GEHC_ACHC) += gehc-achc.o
|
||||
obj-$(CONFIG_HP_ILO) += hpilo.o
|
||||
|
529
drivers/misc/smpro-errmon.c
Normal file
529
drivers/misc/smpro-errmon.c
Normal file
@ -0,0 +1,529 @@
|
||||
// SPDX-License-Identifier: GPL-2.0-only
|
||||
/*
|
||||
* Ampere Computing SoC's SMpro Error Monitoring Driver
|
||||
*
|
||||
* Copyright (c) 2022, Ampere Computing LLC
|
||||
*
|
||||
*/
|
||||
|
||||
#include <linux/i2c.h>
|
||||
#include <linux/mod_devicetable.h>
|
||||
#include <linux/module.h>
|
||||
#include <linux/platform_device.h>
|
||||
#include <linux/regmap.h>
|
||||
|
||||
/*
 * SMpro register addresses used by this driver, listed in ascending
 * address order within each group.
 */

/* Event data registers */
#define VRD_WARN_FAULT_EVENT_DATA	0x78
#define VRD_HOT_EVENT_DATA		0x79
#define DIMM_HOT_EVENT_DATA		0x7A

/* GPI RAS error register */
#define GPI_RAS_ERR			0x7E

/* Core and L2C error registers */
#define CORE_CE_ERR_CNT			0x80
#define CORE_CE_ERR_LEN			0x81
#define CORE_CE_ERR_DATA		0x82
#define CORE_UE_ERR_CNT			0x83
#define CORE_UE_ERR_LEN			0x84
#define CORE_UE_ERR_DATA		0x85

/* Memory error registers */
#define MEM_CE_ERR_CNT			0x90
#define MEM_CE_ERR_LEN			0x91
#define MEM_CE_ERR_DATA			0x92
#define MEM_UE_ERR_CNT			0x93
#define MEM_UE_ERR_LEN			0x94
#define MEM_UE_ERR_DATA			0x95

/* RAS internal error/warning registers (SMpro and PMpro) */
#define ERR_SMPRO_TYPE			0xA0
#define ERR_PMPRO_TYPE			0xA1
#define ERR_SMPRO_INFO_LO		0xA2
#define ERR_SMPRO_INFO_HI		0xA3
#define ERR_SMPRO_DATA_LO		0xA4
#define ERR_SMPRO_DATA_HI		0xA5
#define ERR_PMPRO_INFO_LO		0xA6
#define ERR_PMPRO_INFO_HI		0xA7
#define ERR_PMPRO_DATA_LO		0xA8
#define ERR_PMPRO_DATA_HI		0xA9
#define WARN_SMPRO_INFO_LO		0xAA
#define WARN_SMPRO_INFO_HI		0xAB
#define WARN_PMPRO_INFO_LO		0xAC
#define WARN_PMPRO_INFO_HI		0xAD

/* PCIe error registers */
#define PCIE_CE_ERR_CNT			0xC0
#define PCIE_CE_ERR_LEN			0xC1
#define PCIE_CE_ERR_DATA		0xC2
#define PCIE_UE_ERR_CNT			0xC3
#define PCIE_UE_ERR_LEN			0xC4
#define PCIE_UE_ERR_DATA		0xC5

/* Other error registers */
#define OTHER_CE_ERR_CNT		0xD0
#define OTHER_CE_ERR_LEN		0xD1
#define OTHER_CE_ERR_DATA		0xD2
#define OTHER_UE_ERR_CNT		0xD8
#define OTHER_UE_ERR_LEN		0xD9
#define OTHER_UE_ERR_DATA		0xDA

/* Size in bytes of one error record as emitted through sysfs */
#define MAX_READ_BLOCK_LENGTH		48

/* Indexes into list_smpro_int_error_hdr[] */
#define RAS_SMPRO_ERR			0
#define RAS_PMPRO_ERR			1
|
||||
|
||||
/*
 * Classes of 48-byte RAS error record.  The values are used as array
 * indexes into smpro_error_table[].
 */
enum RAS_48BYTES_ERR_TYPES {
	CORE_CE_ERR = 0,
	CORE_UE_ERR,
	MEM_CE_ERR,
	MEM_UE_ERR,
	PCIE_CE_ERR,
	PCIE_UE_ERR,
	OTHER_CE_ERR,
	OTHER_UE_ERR,
	NUM_48BYTES_ERR_TYPE,	/* number of classes; keep last */
};
|
||||
|
||||
struct smpro_error_hdr {
|
||||
u8 count; /* Number of the RAS errors */
|
||||
u8 len; /* Number of data bytes */
|
||||
u8 data; /* Start of 48-byte data */
|
||||
u8 max_cnt; /* Max num of errors */
|
||||
};
|
||||
|
||||
/*
|
||||
* Included Address of registers to get Count, Length of data and Data
|
||||
* of the 48 bytes error data
|
||||
*/
|
||||
static struct smpro_error_hdr smpro_error_table[] = {
|
||||
[CORE_CE_ERR] = {
|
||||
.count = CORE_CE_ERR_CNT,
|
||||
.len = CORE_CE_ERR_LEN,
|
||||
.data = CORE_CE_ERR_DATA,
|
||||
.max_cnt = 32
|
||||
},
|
||||
[CORE_UE_ERR] = {
|
||||
.count = CORE_UE_ERR_CNT,
|
||||
.len = CORE_UE_ERR_LEN,
|
||||
.data = CORE_UE_ERR_DATA,
|
||||
.max_cnt = 32
|
||||
},
|
||||
[MEM_CE_ERR] = {
|
||||
.count = MEM_CE_ERR_CNT,
|
||||
.len = MEM_CE_ERR_LEN,
|
||||
.data = MEM_CE_ERR_DATA,
|
||||
.max_cnt = 16
|
||||
},
|
||||
[MEM_UE_ERR] = {
|
||||
.count = MEM_UE_ERR_CNT,
|
||||
.len = MEM_UE_ERR_LEN,
|
||||
.data = MEM_UE_ERR_DATA,
|
||||
.max_cnt = 16
|
||||
},
|
||||
[PCIE_CE_ERR] = {
|
||||
.count = PCIE_CE_ERR_CNT,
|
||||
.len = PCIE_CE_ERR_LEN,
|
||||
.data = PCIE_CE_ERR_DATA,
|
||||
.max_cnt = 96
|
||||
},
|
||||
[PCIE_UE_ERR] = {
|
||||
.count = PCIE_UE_ERR_CNT,
|
||||
.len = PCIE_UE_ERR_LEN,
|
||||
.data = PCIE_UE_ERR_DATA,
|
||||
.max_cnt = 96
|
||||
},
|
||||
[OTHER_CE_ERR] = {
|
||||
.count = OTHER_CE_ERR_CNT,
|
||||
.len = OTHER_CE_ERR_LEN,
|
||||
.data = OTHER_CE_ERR_DATA,
|
||||
.max_cnt = 8
|
||||
},
|
||||
[OTHER_UE_ERR] = {
|
||||
.count = OTHER_UE_ERR_CNT,
|
||||
.len = OTHER_UE_ERR_LEN,
|
||||
.data = OTHER_UE_ERR_DATA,
|
||||
.max_cnt = 8
|
||||
},
|
||||
};
|
||||
|
||||
/*
|
||||
* List of SCP registers which are used to get
|
||||
* one type of RAS Internal errors.
|
||||
*/
|
||||
struct smpro_int_error_hdr {
|
||||
u8 type;
|
||||
u8 info_l;
|
||||
u8 info_h;
|
||||
u8 data_l;
|
||||
u8 data_h;
|
||||
u8 warn_l;
|
||||
u8 warn_h;
|
||||
};
|
||||
|
||||
static struct smpro_int_error_hdr list_smpro_int_error_hdr[] = {
|
||||
[RAS_SMPRO_ERR] = {
|
||||
.type = ERR_SMPRO_TYPE,
|
||||
.info_l = ERR_SMPRO_INFO_LO,
|
||||
.info_h = ERR_SMPRO_INFO_HI,
|
||||
.data_l = ERR_SMPRO_DATA_LO,
|
||||
.data_h = ERR_SMPRO_DATA_HI,
|
||||
.warn_l = WARN_SMPRO_INFO_LO,
|
||||
.warn_h = WARN_SMPRO_INFO_HI,
|
||||
},
|
||||
[RAS_PMPRO_ERR] = {
|
||||
.type = ERR_PMPRO_TYPE,
|
||||
.info_l = ERR_PMPRO_INFO_LO,
|
||||
.info_h = ERR_PMPRO_INFO_HI,
|
||||
.data_l = ERR_PMPRO_DATA_LO,
|
||||
.data_h = ERR_PMPRO_DATA_HI,
|
||||
.warn_l = WARN_PMPRO_INFO_LO,
|
||||
.warn_h = WARN_PMPRO_INFO_HI,
|
||||
},
|
||||
};
|
||||
|
||||
/**
 * struct smpro_errmon - per-device driver state
 * @regmap: register map the sysfs read helpers use to access the SMpro
 */
struct smpro_errmon {
	struct regmap *regmap;
};
|
||||
|
||||
/* Event sources; the values are used as indexes into smpro_event_table[] */
enum EVENT_TYPES {
	VRD_WARN_FAULT_EVENT = 0,
	VRD_HOT_EVENT,
	DIMM_HOT_EVENT,
	NUM_EVENTS_TYPE,	/* number of event sources; keep last */
};
|
||||
|
||||
/* Included Address of event source and data registers */
|
||||
static u8 smpro_event_table[NUM_EVENTS_TYPE] = {
|
||||
VRD_WARN_FAULT_EVENT_DATA,
|
||||
VRD_HOT_EVENT_DATA,
|
||||
DIMM_HOT_EVENT_DATA,
|
||||
};
|
||||
|
||||
static ssize_t smpro_event_data_read(struct device *dev,
|
||||
struct device_attribute *da, char *buf,
|
||||
int channel)
|
||||
{
|
||||
struct smpro_errmon *errmon = dev_get_drvdata(dev);
|
||||
s32 event_data;
|
||||
int ret;
|
||||
|
||||
ret = regmap_read(errmon->regmap, smpro_event_table[channel], &event_data);
|
||||
if (ret)
|
||||
return ret;
|
||||
/* Clear event after read */
|
||||
if (event_data != 0)
|
||||
regmap_write(errmon->regmap, smpro_event_table[channel], event_data);
|
||||
|
||||
return sysfs_emit(buf, "%04x\n", event_data);
|
||||
}
|
||||
|
||||
static ssize_t smpro_overflow_data_read(struct device *dev, struct device_attribute *da,
|
||||
char *buf, int channel)
|
||||
{
|
||||
struct smpro_errmon *errmon = dev_get_drvdata(dev);
|
||||
struct smpro_error_hdr *err_info;
|
||||
s32 err_count;
|
||||
int ret;
|
||||
|
||||
err_info = &smpro_error_table[channel];
|
||||
|
||||
ret = regmap_read(errmon->regmap, err_info->count, &err_count);
|
||||
if (ret)
|
||||
return ret;
|
||||
|
||||
/* Bit 8 indicates the overflow status */
|
||||
return sysfs_emit(buf, "%d\n", (err_count & BIT(8)) ? 1 : 0);
|
||||
}
|
||||
|
||||
static ssize_t smpro_error_data_read(struct device *dev, struct device_attribute *da,
|
||||
char *buf, int channel)
|
||||
{
|
||||
struct smpro_errmon *errmon = dev_get_drvdata(dev);
|
||||
unsigned char err_data[MAX_READ_BLOCK_LENGTH];
|
||||
struct smpro_error_hdr *err_info;
|
||||
s32 err_count, err_length;
|
||||
int ret;
|
||||
|
||||
err_info = &smpro_error_table[channel];
|
||||
|
||||
ret = regmap_read(errmon->regmap, err_info->count, &err_count);
|
||||
/* Error count is the low byte */
|
||||
err_count &= 0xff;
|
||||
if (ret || !err_count || err_count > err_info->max_cnt)
|
||||
return ret;
|
||||
|
||||
ret = regmap_read(errmon->regmap, err_info->len, &err_length);
|
||||
if (ret || err_length <= 0)
|
||||
return ret;
|
||||
|
||||
if (err_length > MAX_READ_BLOCK_LENGTH)
|
||||
err_length = MAX_READ_BLOCK_LENGTH;
|
||||
|
||||
memset(err_data, 0x00, MAX_READ_BLOCK_LENGTH);
|
||||
ret = regmap_noinc_read(errmon->regmap, err_info->data, err_data, err_length);
|
||||
if (ret < 0)
|
||||
return ret;
|
||||
|
||||
/* clear the error */
|
||||
ret = regmap_write(errmon->regmap, err_info->count, 0x100);
|
||||
if (ret)
|
||||
return ret;
|
||||
/*
|
||||
* The output of Core/Memory/PCIe/Others UE/CE errors follows the format
|
||||
* specified in section 5.8.1 CE/UE Error Data record in
|
||||
* Altra SOC BMC Interface specification.
|
||||
*/
|
||||
return sysfs_emit(buf, "%*phN\n", MAX_READ_BLOCK_LENGTH, err_data);
|
||||
}
|
||||
|
||||
/*
|
||||
* Output format:
|
||||
* <4-byte hex value of error info><4-byte hex value of error extensive data>
|
||||
* Where:
|
||||
* + error info : The error information
|
||||
* + error data : Extensive data (32 bits)
|
||||
* Reference to section 5.10 RAS Internal Error Register Definition in
|
||||
* Altra SOC BMC Interface specification
|
||||
*/
|
||||
static ssize_t smpro_internal_err_read(struct device *dev, struct device_attribute *da,
|
||||
char *buf, int channel)
|
||||
{
|
||||
struct smpro_errmon *errmon = dev_get_drvdata(dev);
|
||||
struct smpro_int_error_hdr *err_info;
|
||||
unsigned int err[4] = { 0 };
|
||||
unsigned int err_type;
|
||||
unsigned int val;
|
||||
int ret;
|
||||
|
||||
/* read error status */
|
||||
ret = regmap_read(errmon->regmap, GPI_RAS_ERR, &val);
|
||||
if (ret)
|
||||
return ret;
|
||||
|
||||
if ((channel == RAS_SMPRO_ERR && !(val & BIT(0))) ||
|
||||
(channel == RAS_PMPRO_ERR && !(val & BIT(1))))
|
||||
return 0;
|
||||
|
||||
err_info = &list_smpro_int_error_hdr[channel];
|
||||
ret = regmap_read(errmon->regmap, err_info->type, &val);
|
||||
if (ret)
|
||||
return ret;
|
||||
|
||||
err_type = (val & BIT(1)) ? BIT(1) :
|
||||
(val & BIT(2)) ? BIT(2) : 0;
|
||||
|
||||
if (!err_type)
|
||||
return 0;
|
||||
|
||||
ret = regmap_read(errmon->regmap, err_info->info_l, err + 1);
|
||||
if (ret)
|
||||
return ret;
|
||||
|
||||
ret = regmap_read(errmon->regmap, err_info->info_h, err);
|
||||
if (ret)
|
||||
return ret;
|
||||
|
||||
if (err_type & BIT(2)) {
|
||||
/* Error with data type */
|
||||
ret = regmap_read(errmon->regmap, err_info->data_l, err + 3);
|
||||
if (ret)
|
||||
return ret;
|
||||
|
||||
ret = regmap_read(errmon->regmap, err_info->data_h, err + 2);
|
||||
if (ret)
|
||||
return ret;
|
||||
}
|
||||
|
||||
/* clear the read errors */
|
||||
ret = regmap_write(errmon->regmap, err_info->type, err_type);
|
||||
if (ret)
|
||||
return ret;
|
||||
|
||||
return sysfs_emit(buf, "%*phN\n", (int)sizeof(err), err);
|
||||
}
|
||||
|
||||
/*
|
||||
* Output format:
|
||||
* <4-byte hex value of warining info>
|
||||
* Reference to section 5.10 RAS Internal Error Register Definition in
|
||||
* Altra SOC BMC Interface specification
|
||||
*/
|
||||
static ssize_t smpro_internal_warn_read(struct device *dev, struct device_attribute *da,
|
||||
char *buf, int channel)
|
||||
{
|
||||
struct smpro_errmon *errmon = dev_get_drvdata(dev);
|
||||
struct smpro_int_error_hdr *err_info;
|
||||
unsigned int warn[2] = { 0 };
|
||||
unsigned int val;
|
||||
int ret;
|
||||
|
||||
/* read error status */
|
||||
ret = regmap_read(errmon->regmap, GPI_RAS_ERR, &val);
|
||||
if (ret)
|
||||
return ret;
|
||||
|
||||
if ((channel == RAS_SMPRO_ERR && !(val & BIT(0))) ||
|
||||
(channel == RAS_PMPRO_ERR && !(val & BIT(1))))
|
||||
return 0;
|
||||
|
||||
err_info = &list_smpro_int_error_hdr[channel];
|
||||
ret = regmap_read(errmon->regmap, err_info->type, &val);
|
||||
if (ret)
|
||||
return ret;
|
||||
|
||||
if (!(val & BIT(0)))
|
||||
return 0;
|
||||
|
||||
ret = regmap_read(errmon->regmap, err_info->warn_l, warn + 1);
|
||||
if (ret)
|
||||
return ret;
|
||||
|
||||
ret = regmap_read(errmon->regmap, err_info->warn_h, warn);
|
||||
if (ret)
|
||||
return ret;
|
||||
|
||||
/* clear the warning */
|
||||
ret = regmap_write(errmon->regmap, err_info->type, BIT(0));
|
||||
if (ret)
|
||||
return ret;
|
||||
|
||||
return sysfs_emit(buf, "%*phN\n", (int)sizeof(warn), warn);
|
||||
}
|
||||
|
||||
#define ERROR_OVERFLOW_RO(_error, _index) \
|
||||
static ssize_t overflow_##_error##_show(struct device *dev, \
|
||||
struct device_attribute *da, \
|
||||
char *buf) \
|
||||
{ \
|
||||
return smpro_overflow_data_read(dev, da, buf, _index); \
|
||||
} \
|
||||
static DEVICE_ATTR_RO(overflow_##_error)
|
||||
|
||||
ERROR_OVERFLOW_RO(core_ce, CORE_CE_ERR);
|
||||
ERROR_OVERFLOW_RO(core_ue, CORE_UE_ERR);
|
||||
ERROR_OVERFLOW_RO(mem_ce, MEM_CE_ERR);
|
||||
ERROR_OVERFLOW_RO(mem_ue, MEM_UE_ERR);
|
||||
ERROR_OVERFLOW_RO(pcie_ce, PCIE_CE_ERR);
|
||||
ERROR_OVERFLOW_RO(pcie_ue, PCIE_UE_ERR);
|
||||
ERROR_OVERFLOW_RO(other_ce, OTHER_CE_ERR);
|
||||
ERROR_OVERFLOW_RO(other_ue, OTHER_UE_ERR);
|
||||
|
||||
#define ERROR_RO(_error, _index) \
|
||||
static ssize_t error_##_error##_show(struct device *dev, \
|
||||
struct device_attribute *da, \
|
||||
char *buf) \
|
||||
{ \
|
||||
return smpro_error_data_read(dev, da, buf, _index); \
|
||||
} \
|
||||
static DEVICE_ATTR_RO(error_##_error)
|
||||
|
||||
ERROR_RO(core_ce, CORE_CE_ERR);
|
||||
ERROR_RO(core_ue, CORE_UE_ERR);
|
||||
ERROR_RO(mem_ce, MEM_CE_ERR);
|
||||
ERROR_RO(mem_ue, MEM_UE_ERR);
|
||||
ERROR_RO(pcie_ce, PCIE_CE_ERR);
|
||||
ERROR_RO(pcie_ue, PCIE_UE_ERR);
|
||||
ERROR_RO(other_ce, OTHER_CE_ERR);
|
||||
ERROR_RO(other_ue, OTHER_UE_ERR);
|
||||
|
||||
static ssize_t error_smpro_show(struct device *dev, struct device_attribute *da, char *buf)
|
||||
{
|
||||
return smpro_internal_err_read(dev, da, buf, RAS_SMPRO_ERR);
|
||||
}
|
||||
static DEVICE_ATTR_RO(error_smpro);
|
||||
|
||||
static ssize_t error_pmpro_show(struct device *dev, struct device_attribute *da, char *buf)
|
||||
{
|
||||
return smpro_internal_err_read(dev, da, buf, RAS_PMPRO_ERR);
|
||||
}
|
||||
static DEVICE_ATTR_RO(error_pmpro);
|
||||
|
||||
static ssize_t warn_smpro_show(struct device *dev, struct device_attribute *da, char *buf)
|
||||
{
|
||||
return smpro_internal_warn_read(dev, da, buf, RAS_SMPRO_ERR);
|
||||
}
|
||||
static DEVICE_ATTR_RO(warn_smpro);
|
||||
|
||||
static ssize_t warn_pmpro_show(struct device *dev, struct device_attribute *da, char *buf)
|
||||
{
|
||||
return smpro_internal_warn_read(dev, da, buf, RAS_PMPRO_ERR);
|
||||
}
|
||||
static DEVICE_ATTR_RO(warn_pmpro);
|
||||
|
||||
#define EVENT_RO(_event, _index) \
|
||||
static ssize_t event_##_event##_show(struct device *dev, \
|
||||
struct device_attribute *da, \
|
||||
char *buf) \
|
||||
{ \
|
||||
return smpro_event_data_read(dev, da, buf, _index); \
|
||||
} \
|
||||
static DEVICE_ATTR_RO(event_##_event)
|
||||
|
||||
EVENT_RO(vrd_warn_fault, VRD_WARN_FAULT_EVENT);
|
||||
EVENT_RO(vrd_hot, VRD_HOT_EVENT);
|
||||
EVENT_RO(dimm_hot, DIMM_HOT_EVENT);
|
||||
|
||||
static struct attribute *smpro_errmon_attrs[] = {
|
||||
&dev_attr_overflow_core_ce.attr,
|
||||
&dev_attr_overflow_core_ue.attr,
|
||||
&dev_attr_overflow_mem_ce.attr,
|
||||
&dev_attr_overflow_mem_ue.attr,
|
||||
&dev_attr_overflow_pcie_ce.attr,
|
||||
&dev_attr_overflow_pcie_ue.attr,
|
||||
&dev_attr_overflow_other_ce.attr,
|
||||
&dev_attr_overflow_other_ue.attr,
|
||||
&dev_attr_error_core_ce.attr,
|
||||
&dev_attr_error_core_ue.attr,
|
||||
&dev_attr_error_mem_ce.attr,
|
||||
&dev_attr_error_mem_ue.attr,
|
||||
&dev_attr_error_pcie_ce.attr,
|
||||
&dev_attr_error_pcie_ue.attr,
|
||||
&dev_attr_error_other_ce.attr,
|
||||
&dev_attr_error_other_ue.attr,
|
||||
&dev_attr_error_smpro.attr,
|
||||
&dev_attr_error_pmpro.attr,
|
||||
&dev_attr_warn_smpro.attr,
|
||||
&dev_attr_warn_pmpro.attr,
|
||||
&dev_attr_event_vrd_warn_fault.attr,
|
||||
&dev_attr_event_vrd_hot.attr,
|
||||
&dev_attr_event_dimm_hot.attr,
|
||||
NULL
|
||||
};
|
||||
|
||||
ATTRIBUTE_GROUPS(smpro_errmon);
|
||||
|
||||
static int smpro_errmon_probe(struct platform_device *pdev)
|
||||
{
|
||||
struct smpro_errmon *errmon;
|
||||
|
||||
errmon = devm_kzalloc(&pdev->dev, sizeof(struct smpro_errmon), GFP_KERNEL);
|
||||
if (!errmon)
|
||||
return -ENOMEM;
|
||||
|
||||
platform_set_drvdata(pdev, errmon);
|
||||
|
||||
errmon->regmap = dev_get_regmap(pdev->dev.parent, NULL);
|
||||
if (!errmon->regmap)
|
||||
return -ENODEV;
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
static struct platform_driver smpro_errmon_driver = {
|
||||
.probe = smpro_errmon_probe,
|
||||
.driver = {
|
||||
.name = "smpro-errmon",
|
||||
.dev_groups = smpro_errmon_groups,
|
||||
},
|
||||
};
|
||||
|
||||
module_platform_driver(smpro_errmon_driver);
|
||||
|
||||
/* Module metadata. */
MODULE_AUTHOR("Tung Nguyen <tung.nguyen@amperecomputing.com>");
MODULE_AUTHOR("Thinh Pham <thinh.pham@amperecomputing.com>");
MODULE_AUTHOR("Hoang Nguyen <hnguyen@amperecomputing.com>");
MODULE_AUTHOR("Thu Nguyen <thu@os.amperecomputing.com>");
MODULE_AUTHOR("Quan Nguyen <quan@os.amperecomputing.com>");
MODULE_DESCRIPTION("Ampere Altra SMpro driver");
MODULE_LICENSE("GPL");
|
Loading…
Reference in New Issue
Block a user