[linux-yocto] [PATCH 30/48] drivers/edac: CMEM EDAC support on config load.
Daniel Dragomir
daniel.dragomir at windriver.com
Mon Dec 11 05:14:00 PST 2017
From: Marek Majtyka <marekx.majtyka at intel.com>
Implemented CMEM EDAC support on rte config load/unload.
- fixed bugs found in CMEM/SMEM:
- work queue hang on driver removal
- unnecessary edac sysfs entries (valid only for ddr3)
Signed-off-by: Marek Majtyka <marekx.majtyka at intel.com>
---
drivers/edac/axxia_edac-cmc_56xx.c | 796 ++++++++++++++++++++++++++++---------
drivers/edac/axxia_edac-mc_56xx.c | 228 +++++++----
2 files changed, 760 insertions(+), 264 deletions(-)
diff --git a/drivers/edac/axxia_edac-cmc_56xx.c b/drivers/edac/axxia_edac-cmc_56xx.c
index f99c46d..c4bf2d0 100644
--- a/drivers/edac/axxia_edac-cmc_56xx.c
+++ b/drivers/edac/axxia_edac-cmc_56xx.c
@@ -25,6 +25,9 @@
#include <linux/mfd/syscon.h>
#include <linux/regmap.h>
#include <linux/interrupt.h>
+#include <linux/proc_fs.h>
+#include <linux/seq_file.h>
+#include <linux/uaccess.h>
#include "edac_core.h"
#include "edac_module.h"
@@ -78,12 +81,17 @@
#define INT_BIT_5 (0x00000020)
#define INT_BIT_6 (0x00000040)
#define INT_BIT_7 (0x00000080)
+#define INT_BIT_8 (0x00000100)
#define INT_BIT_11 (0x00000800)
#define INT_BIT_21 (0x00200000)
#define INT_BIT_25 (0x02000000)
#define INT_BIT_30 (0x40000000)
#define INT_BIT_31 (0x80000000)
+#define CM_INT_MASK_BASE_PROBE (~(\
+ INT_BIT_8 |\
+ INT_BIT_31))
+
#define CM_INT_MASK_BASE (~(\
INT_BIT_1 |\
INT_BIT_2 |\
@@ -110,70 +118,70 @@
INT_BIT_30 |\
INT_BIT_31))
-#define CM_INT_MASK_ALL (0xffffffff)
+#define CM_INT_MASK_ALL (0x7fffffff)
#define ALIVE_NOTIFICATION_PERIOD (90*1000)
static int log = 1;
-module_param(log, int, S_IRUGO|S_IWUSR);
+module_param(log, int, 0644);
MODULE_PARM_DESC(log, "Log each error to kernel log.");
static int force_restart = 1;
-module_param(force_restart, int, S_IRUGO|S_IWUSR);
+module_param(force_restart, int, 0644);
MODULE_PARM_DESC(force_restart, "Machine restart on fatal error.");
static atomic64_t mc_counter = ATOMIC_INIT(0);
/*
- Bit [31] = Logical OR of all lower bits.
- Bit [30] = A CRC error occurred on the write data bus.
- Bit [29] = The user-initiated DLL resync has completed.
- Bit [28] = A state change has been detected on the dfi_init_complete
- signal after initialization.
- Bit [27] = The assertion of the INHIBIT_DRAM_CMD parameter has successfully
- inhibited the command queue.
- Bit [26] = The register interface-initiated mode register write has completed
- and another mode register write may be issued.
- Bit [25] = MPR read command, initiated with a software MPR_READ request, is
- complete.
- Bit [24] = Error received from the PHY on the DFI bus.
- Bit [23] = RESERVED
- Bit [22] = RESERVED
- Bit [21] = A parity error has been detected on the address/control bus on
- a registered DIMM.
- Bit [20] = The leveling operation has completed.
- Bit [19] = A read leveling gate training operation has been requested.
- Bit [18] = A read leveling operation has been requested.
- Bit [17] = A write leveling operation has been requested.
- Bit [16] = A DFI update error has occurred. Error information can be found in
- the UPDATE_ERROR_STATUS parameter.
- Bit [15] = A write leveling error has occurred. Error information can be found
- in the WRLVL_ERROR_STATUS parameter.
- Bit [14] = A read leveling gate training error has occurred. Error information
- can be found in the RDLVL_ERROR_STATUS parameter.
- Bit [13] = A read leveling error has occurred. Error information can be found
- in the RDLVL_ERROR_STATUS parameter.
- Bit [12] = The user has programmed an invalid setting associated with user
- words per burst.
- Examples:
- Setting param_reduc when burst length = 2.
- A 1:2 MC:PHY clock ratio with burst length = 2.
- Bit [11] = A wrap cycle crossing a DRAM page has been detected. This is
- unsupported & may result in memory data corruption.
- Bit [10] = The BIST operation has been completed.
- Bit [9] = The low power operation has been completed.
- Bit [8] = The MC initialization has been completed.
- Bit [7] = An error occurred on the port command channel.
- Bit [6] = Multiple uncorrectable ECC events have been detected.
- Bit [5] = An uncorrectable ECC event has been detected.
- Bit [4] = Multiple correctable ECC events have been detected.
- Bit [3] = A correctable ECC event has been detected.
- Bit [2] = Multiple accesses outside the defined PHYSICAL memory space
- have occurred.
- Bit [1] = A memory access outside the defined PHYSICAL memory space
- has occurred.
- Bit [0] = The memory reset is valid on the DFI bus.
-
- Of these 1, 2, 3, 4, 5, 6, 7, 11, 21, 25, and 30 are of interest.
-*/
+ * Bit [31] = Logical OR of all lower bits.
+ * Bit [30] = A CRC error occurred on the write data bus.
+ * Bit [29] = The user-initiated DLL resync has completed.
+ * Bit [28] = A state change has been detected on the dfi_init_complete
+ * signal after initialization.
+ * Bit [27] = The assertion of the INHIBIT_DRAM_CMD parameter has successfully
+ * inhibited the command queue.
+ * Bit [26] = The register interface-initiated mode register write has completed
+ * and another mode register write may be issued.
+ * Bit [25] = MPR read command, initiated with a software MPR_READ request, is
+ * complete.
+ * Bit [24] = Error received from the PHY on the DFI bus.
+ * Bit [23] = RESERVED
+ * Bit [22] = RESERVED
+ * Bit [21] = A parity error has been detected on the address/control bus on
+ * a registered DIMM.
+ * Bit [20] = The leveling operation has completed.
+ * Bit [19] = A read leveling gate training operation has been requested.
+ * Bit [18] = A read leveling operation has been requested.
+ * Bit [17] = A write leveling operation has been requested.
+ * Bit [16] = A DFI update error has occurred. Error information can be found
+ * in the UPDATE_ERROR_STATUS parameter.
+ * Bit [15] = A write leveling error has occurred. Error information can be
+ * found in the WRLVL_ERROR_STATUS parameter.
+ * Bit [14] = A read leveling gate training error has occurred. Error
+ * information can be found in the RDLVL_ERROR_STATUS parameter.
+ * Bit [13] = A read leveling error has occurred. Error information can be
+ * found in the RDLVL_ERROR_STATUS parameter.
+ * Bit [12] = The user has programmed an invalid setting associated with user
+ * words per burst.
+ * Examples:
+ * Setting param_reduc when burst length = 2.
+ * A 1:2 MC:PHY clock ratio with burst length = 2.
+ * Bit [11] = A wrap cycle crossing a DRAM page has been detected. This is
+ * unsupported & may result in memory data corruption.
+ * Bit [10] = The BIST operation has been completed.
+ * Bit [9] = The low power operation has been completed.
+ * Bit [8] = The MC initialization has been completed.
+ * Bit [7] = An error occurred on the port command channel.
+ * Bit [6] = Multiple uncorrectable ECC events have been detected.
+ * Bit [5] = An uncorrectable ECC event has been detected.
+ * Bit [4] = Multiple correctable ECC events have been detected.
+ * Bit [3] = A correctable ECC event has been detected.
+ * Bit [2] = Multiple accesses outside the defined PHYSICAL memory space
+ * have occurred.
+ * Bit [1] = A memory access outside the defined PHYSICAL memory space
+ * has occurred.
+ * Bit [0] = The memory reset is valid on the DFI bus.
+ *
+ * Of these 1, 2, 3, 4, 5, 6, 7, 11, 21, 25, and 30 are of interest.
+ */
/*
* MPR dump processing - overview.
@@ -182,7 +190,7 @@ static atomic64_t mc_counter = ATOMIC_INIT(0);
* one need to collect dumps for all available cs. Below given example
* for two cs0/cs1.
*
- * CMEM MC cmmon_isr cmmon_wq
+ * CMEM MC cmmon_isr_sw cmmon_wq
* | | |
* | | |
* |ALERT_N - int_status bit [30] |
@@ -337,6 +345,15 @@ struct __packed mpr_dump {
u8 cs;
};
+enum init_return_codes {ERR_STAGE_8 = -8, /* uninitialize() entry: disable and free the IRQ, then fall through */
+ ERR_STAGE_7 = -7, /* mask all controller interrupts; stops here when only_disable is set */
+ ERR_STAGE_6 = -6, /* stop the alerts (DDR4 only) and events workers */
+ ERR_STAGE_5 = -5, /* destroy the alerts workqueue (DDR4 only) */
+ ERR_STAGE_4 = -4, /* destroy the events workqueue */
+ ERR_STAGE_3 = -3, /* delete the EDAC device */
+ ERR_STAGE_2 = -2, /* free the EDAC control info */
+ ERR_STAGE_1 = -1 /* nothing to undo */
+};
enum events {
EV_ILLEGAL = 0,
EV_MULT_ILLEGAL,
@@ -437,9 +454,19 @@ struct intel_edac_dev_info {
struct mc_edac_data *data;
char *ctl_name;
char *blk_name;
+ char *proc_name;
+#ifdef CONFIG_DEBUG_CMEM
+ struct proc_dir_entry *dir_entry;
+#endif
+ struct mutex state_machine_lock;
struct work_struct offload_alerts;
struct work_struct offload_events;
+ int finish_alerts;
+ int finish_events;
+ struct workqueue_struct *wq_alerts;
+ struct workqueue_struct *wq_events;
int is_ddr4;
+ int is_controller_configured;
int edac_idx;
u32 cm_region;
struct regmap *syscon;
@@ -448,7 +475,7 @@ struct intel_edac_dev_info {
void (*check)(struct edac_device_ctl_info *edac_dev);
};
-
+#ifdef CONFIG_DEBUG_CMEM
static ssize_t mpr1_dump_show(struct edac_device_ctl_info
*edac_dev, char *data)
{
@@ -534,14 +561,17 @@ static ssize_t mpr1_dump_show(struct edac_device_ctl_info
return 0;
}
+
+
static struct edac_dev_sysfs_attribute device_block_attr[] = {
{
.attr = {
.name = "mpr_page1",
- .mode = (S_IRUGO | S_IWUSR)
+ .mode = (0644)
},
.show = mpr1_dump_show,
- .store = NULL},
+ .store = NULL
+ },
/* End of list */
{
.attr = {.name = NULL}
@@ -552,6 +582,7 @@ static void axxia_mc_sysfs_attributes(struct edac_device_ctl_info *edac_dev)
{
edac_dev->sysfs_attributes = &device_block_attr[0];
}
+#endif
static inline void __attribute__((always_inline))
handle_events(struct intel_edac_dev_info *edac_dev,
@@ -679,11 +710,24 @@ collect_mpr_dump(struct intel_edac_dev_info *edac_dev, u8 page, int cs)
}
static irqreturn_t
-cmmon_isr(int interrupt, void *device)
+cmmon_isr_hw(int interrupt, void *device)
+{
+ return IRQ_WAKE_THREAD;
+}
+
+static int initialize(struct intel_edac_dev_info *dev_info);
+static int enable_workers(struct intel_edac_dev_info *dev_info);
+static void uninitialize(struct intel_edac_dev_info *dev_info,
+ int ret, int only_disable);
+
+static irqreturn_t
+cmmon_isr_sw(int interrupt, void *device)
{
struct intel_edac_dev_info *dev_info = device;
struct cm_56xx_denali_ctl_84 denali_ctl_84;
struct cm_56xx_denali_ctl_85 denali_ctl_85 = {0};
+ struct cm_56xx_denali_ctl_86 denali_ctl_86;
+ int ret = 0;
/*
* NOTE:
@@ -702,12 +746,47 @@ cmmon_isr(int interrupt, void *device)
4, (u32 *) &denali_ctl_84))
goto error_read;
+ if (denali_ctl_84.int_status & INT_BIT_8) {
+ if (dev_info->is_controller_configured == 0) {
+ ret = initialize(dev_info);
+ if (ret)
+ goto error_init;
+
+ ret = enable_workers(dev_info);
+ if (ret)
+ goto error_init;
+
+ dev_info->is_controller_configured = 1;
+ }
+
+ if (dev_info->is_ddr4)
+ denali_ctl_86.int_mask = CM_INT_MASK_FULL;
+ else
+ denali_ctl_86.int_mask = CM_INT_MASK_BASE;
+
+ if (ncr_write(dev_info->cm_region,
+ CM_56XX_DENALI_CTL_86,
+ 4, (u32 *) &denali_ctl_86)) {
+ goto error_write;
+ }
+ return IRQ_HANDLED;
+ }
+
+ /*
+ * SAFETY CHECK
+ * one cannot go further if driver is not fully functional!!!
+ */
+ if (dev_info->is_controller_configured == 0)
+ return IRQ_HANDLED;
+
+
handle_events(dev_info, &denali_ctl_84);
atomic_set(&dev_info->data->event_ready, 1);
wake_up(&dev_info->data->event_wq);
denali_ctl_85.int_ack =
- (denali_ctl_84.int_status & (~(INT_BIT_25 | INT_BIT_31)));
+ (denali_ctl_84.int_status &
+ (~(INT_BIT_25 | INT_BIT_31 | INT_BIT_8)));
if (dev_info->is_ddr4) {
if (denali_ctl_84.int_status & INT_BIT_25) {
@@ -737,6 +816,12 @@ cmmon_isr(int interrupt, void *device)
printk_ratelimited("%s: Error reading interrupt status\n",
dev_name(&dev_info->pdev->dev));
return IRQ_HANDLED;
+error_init:
+ printk_ratelimited("%s: Error during driver initialization\n",
+ dev_name(&dev_info->pdev->dev));
+ uninitialize(dev_info, ret,
+ dev_info->is_controller_configured == 0 ? 1 : 0);
+ return IRQ_HANDLED;
}
@@ -747,16 +832,19 @@ static void intel_cm_alerts_error_check(struct edac_device_ctl_info *edac_dev)
struct event_counter (*alerts)[MAX_DQ][MPR_ERRORS] =
dev_info->data->alerts;
struct cm_56xx_denali_ctl_34 denali_ctl_34;
- int i, j, k, l;
+ int i, j, k, l, ret;
u32 counter;
start:
/* keep hung up monitor happy 90 sec's */
- if (0 == wait_event_timeout(dev_info->data->dump_wq,
+ if (wait_event_timeout(dev_info->data->dump_wq,
atomic_read(&dev_info->data->dump_in_progress),
- msecs_to_jiffies(ALIVE_NOTIFICATION_PERIOD)))
+ msecs_to_jiffies(ALIVE_NOTIFICATION_PERIOD)) == 0)
goto start;
+ if (dev_info->finish_alerts)
+ goto finish;
+
for (i = 0; i < dev_info->data->cs_count; ++i) {
/* trigger dump */
@@ -779,9 +867,15 @@ static void intel_cm_alerts_error_check(struct edac_device_ctl_info *edac_dev)
CM_56XX_DENALI_CTL_34,
4, (u32 *) &denali_ctl_34))
goto error_write;
+
/* wait */
- wait_event(dev_info->data->dump_wq,
- atomic_read(&dev_info->data->dump_ready));
+ ret = wait_event_timeout(dev_info->data->dump_wq,
+ atomic_read(&dev_info->data->dump_ready),
+ msecs_to_jiffies(1000));
+ if (dev_info->finish_alerts)
+ goto finish;
+ if (ret == 0)
+ goto timeout_error;
atomic_set(&dev_info->data->dump_ready, 0);
/* collect data */
@@ -811,11 +905,21 @@ static void intel_cm_alerts_error_check(struct edac_device_ctl_info *edac_dev)
atomic_set(&dev_info->data->dump_in_progress, 0);
goto start;
+timeout_error:
+ printk_ratelimited("Timeout occurred during MPR dump.\n");
+ atomic_set(&dev_info->data->dump_ready, 0);
+ atomic_set(&dev_info->data->dump_in_progress, 0);
+ goto start;
+
error_read:
error_write:
printk_ratelimited("Could not collect MPR dump.\n");
atomic_set(&dev_info->data->dump_in_progress, 0);
goto start;
+
+finish:
+ atomic_set(&dev_info->data->dump_ready, 0);
+ atomic_set(&dev_info->data->dump_in_progress, 0);
}
static void intel_cm_events_error_check(struct edac_device_ctl_info *edac_dev)
@@ -827,13 +931,16 @@ static void intel_cm_events_error_check(struct edac_device_ctl_info *edac_dev)
u32 counter;
while (1) {
- if (0 == wait_event_timeout(dev_info->data->event_wq,
+ if (wait_event_timeout(dev_info->data->event_wq,
atomic_read(&dev_info->data->event_ready),
- msecs_to_jiffies(ALIVE_NOTIFICATION_PERIOD)))
+ msecs_to_jiffies(ALIVE_NOTIFICATION_PERIOD)) == 0)
continue;
atomic_set(&dev_info->data->event_ready, 0);
+ if (dev_info->finish_events)
+ break;
+
mutex_lock(&dev_info->data->edac_sysfs_data_lock);
for (i = 0; i < NR_EVENTS; ++i) {
counter = atomic_xchg(&events[i].counter, 0);
@@ -911,119 +1018,133 @@ static int get_active_dram(struct intel_edac_dev_info *dev_info)
if (ncr_read(dev_info->cm_region, CM_56XX_DENALI_CTL_74,
4, (u32 *) &denali_ctl_74)) {
pr_err("Could not read number of lanes.\n");
+ return dram;
}
- if (0 == denali_ctl_74.bank_diff)
+ if (denali_ctl_74.bank_diff == 0)
dram = MAX_DQ/2;
- if (1 == denali_ctl_74.bank_diff)
+ if (denali_ctl_74.bank_diff == 1)
dram = MAX_DQ;
return dram;
}
-static int intel_edac_mc_probe(struct platform_device *pdev)
+static int get_ddr4(struct intel_edac_dev_info *dev_info)
{
- struct edac_device_instance *instance;
- struct edac_device_block *block;
- int i, j, k, l;
- int count;
- struct intel_edac_dev_info *dev_info = NULL;
- struct resource *io;
- struct device_node *np = pdev->dev.of_node;
- int irq = -1, rc = 0;
struct cm_56xx_denali_ctl_00 denali_ctl_00;
- struct cm_56xx_denali_ctl_86 denali_ctl_86;
- int cs_count = MAX_CS;
- int dram_count = MAX_DQ;
-
- count = atomic64_inc_return(&mc_counter);
- if ((count - 1) == MEMORY_CONTROLLERS)
- goto err_nodev;
-
- dev_info = devm_kzalloc(&pdev->dev, sizeof(*dev_info), GFP_KERNEL);
- if (!dev_info)
- goto err_nomem;
- dev_info->data =
- devm_kzalloc(&pdev->dev, sizeof(*dev_info->data), GFP_KERNEL);
- if (!dev_info->data)
- goto err_noctlinfo;
-
- init_waitqueue_head(&dev_info->data->dump_wq);
- init_waitqueue_head(&dev_info->data->event_wq);
-
- raw_spin_lock_init(&dev_info->data->mpr_data_lock);
- mutex_init(&dev_info->data->edac_sysfs_data_lock);
+ if (ncr_read(dev_info->cm_region, CM_56XX_DENALI_CTL_00,
+ 4, (u32 *) &denali_ctl_00)) {
+ pr_err("Could not read ddr version.\n");
+ return -1;
+ }
- dev_info->ctl_name = kstrdup(np->name, GFP_KERNEL);
- dev_info->blk_name = "ECC";
- edac_op_state = EDAC_OPSTATE_POLL;
+ if (denali_ctl_00.dram_class == 0xa)
+ return 1;
- dev_info->pdev = pdev;
- dev_info->edac_idx = edac_device_alloc_index();
- dev_info->data->irq = 0;
+ return 0;
+}
- /* setup all counters */
- for (i = 0; i < NR_EVENTS; ++i)
- atomic_set(&dev_info->data->events[i].counter, 0);
+static void uninitialize(struct intel_edac_dev_info *dev_info,
+ int ret, int only_disable)
+{
+ struct cm_56xx_denali_ctl_86 denali_ctl_86;
- for (j = 0; j < MAX_CS; ++j) {
- for (l = 0; l < MAX_DQ; ++l) {
- for (k = 0; k < MPR_ERRORS; ++k, ++i) {
- atomic_set(&dev_info->data->
- alerts[j][l][k].counter, 0);
+ switch (ret) {
+ case ERR_STAGE_8:
+ if (dev_info->data->irq) {
+ disable_irq(dev_info->data->irq);
+ devm_free_irq(&dev_info->pdev->dev,
+ dev_info->data->irq, dev_info);
+ dev_info->data->irq = 0;
+ }
+ /* fall-through */
+ case ERR_STAGE_7:
+ denali_ctl_86.int_mask = CM_INT_MASK_ALL;
+ if (ncr_write(dev_info->cm_region,
+ CM_56XX_DENALI_CTL_86,
+ 4, (u32 *) &denali_ctl_86)) {
+ pr_err("Could not mask interrupts (%s - ctl_86).\n",
+ dev_info->ctl_name);
+ }
+ if (only_disable)
+ break;
+ /* fall-through */
+ case ERR_STAGE_6:
+ if (dev_info->is_ddr4) {
+ dev_info->finish_alerts = 1;
+ atomic_inc(&dev_info->data->dump_in_progress);
+ atomic_set(&dev_info->data->dump_ready, 1);
+ wake_up(&dev_info->data->dump_wq);
+ cancel_work_sync(&dev_info->offload_alerts);
+ }
+ dev_info->finish_events = 1;
+ atomic_set(&dev_info->data->event_ready, 1);
+ wake_up(&dev_info->data->event_wq);
+ cancel_work_sync(&dev_info->offload_events);
+ /* fall-through */
+ case ERR_STAGE_5:
+ if (dev_info->is_ddr4)
+ if (dev_info->wq_alerts) {
+ destroy_workqueue(dev_info->wq_alerts);
+ dev_info->wq_alerts = NULL;
}
+ /* fall-through */
+ case ERR_STAGE_4:
+ if (dev_info->wq_events) {
+ destroy_workqueue(dev_info->wq_events);
+ dev_info->wq_events = NULL;
+
+ }
+ /* fall-through */
+ case ERR_STAGE_3:
+ edac_device_del_device(&dev_info->pdev->dev);
+ /* fall-through */
+ case ERR_STAGE_2:
+ if (dev_info->edac_dev) {
+ edac_device_free_ctl_info(dev_info->edac_dev);
+ dev_info->edac_dev = NULL;
}
+ /* fall-through */
+ case ERR_STAGE_1:
+ /* fall-through */
+ default:
+ break;
}
+}
- /* set up dump in progress flag */
- atomic_set(&dev_info->data->dump_in_progress, 0);
+static int initialize(struct intel_edac_dev_info *dev_info)
+{
+ struct edac_device_instance *instance;
+ struct edac_device_block *block;
+ int i, j, k, l;
- io = platform_get_resource(pdev, IORESOURCE_MEM, 0);
- if (!io) {
- dev_err(&pdev->dev, "Unable to get mem resource\n");
- goto err_noctlinfo;
- }
- dev_info->cm_region = io->start;
- dev_info->syscon =
- syscon_regmap_lookup_by_phandle(np, "syscon");
- if (IS_ERR(dev_info->syscon)) {
- pr_info(FMT, np->name);
- dev_info->axi2ser3_region = ioremap(AXI2_SER3_PHY_ADDR,
- AXI2_SER3_PHY_SIZE);
- if (!dev_info->axi2ser3_region) {
- pr_err("ioremap of axi2ser3 region failed\n");
- goto err_noctlinfo;
- }
- }
+ int cs_count = MAX_CS;
+ int dram_count = MAX_DQ;
- if (ncr_read(dev_info->cm_region, CM_56XX_DENALI_CTL_00,
- 4, (u32 *) &denali_ctl_00)) {
- pr_err("Could not read ddr version.\n");
- goto err_noctlinfo;
+ cs_count = get_active_cs(dev_info);
+ if (cs_count == 0) {
+ pr_err("Could not get cs number. Is config loaded?\n");
+ return ERR_STAGE_1;
}
- if (0xa == denali_ctl_00.dram_class) {
- pr_info("%s supports mpr dump (DDR4).\n", dev_info->ctl_name);
- dev_info->is_ddr4 = 1;
- } else {
- if (0x6 == denali_ctl_00.dram_class) {
- pr_info("%s doesn't support mpr dump (DDR3).\n",
- dev_info->ctl_name);
- } else {
- pr_err("CMEM is not configured. Check uboot settings.\n");
- goto err_noctlinfo;
- }
+ dram_count = get_active_dram(dev_info);
+ if (dram_count == 0) {
+ pr_err("Could not get dram number. Is config loaded?\n");
+ return ERR_STAGE_1;
}
- cs_count = get_active_cs(dev_info);
- if (cs_count == 0)
- goto err_noctlinfo;
+ dev_info->is_ddr4 = get_ddr4(dev_info);
- dram_count = get_active_dram(dev_info);
- if (dram_count == 0)
- goto err_noctlinfo;
+ if (dev_info->is_ddr4 == -1) {
+ pr_err("Could not get dram version. Is config loaded?\n");
+ return ERR_STAGE_1;
+ }
+ /*dev_info->is_ddr4 = 1;*/
+
+ dev_info->finish_alerts = 0;
+ dev_info->finish_events = 0;
dev_info->data->cs_count = cs_count;
dev_info->data->dram_count = dram_count;
@@ -1031,13 +1152,15 @@ static int intel_edac_mc_probe(struct platform_device *pdev)
dev_info->edac_dev =
edac_device_alloc_ctl_info(0, dev_info->ctl_name,
1, dev_info->blk_name,
- NR_EVENTS +
- cs_count * dram_count * MPR_ERRORS,
+ NR_EVENTS + (dev_info->is_ddr4 ?
+ cs_count * dram_count * MPR_ERRORS
+ :
+ 0),
0, NULL, 0, dev_info->edac_idx);
if (!dev_info->edac_dev) {
pr_info("No memory for edac device\n");
- goto err_noctlinfo;
+ return ERR_STAGE_1;
}
instance = &dev_info->edac_dev->instances[0];
@@ -1086,51 +1209,85 @@ static int intel_edac_mc_probe(struct platform_device *pdev)
dev_info->edac_dev->dev_name = dev_name(&dev_info->pdev->dev);
dev_info->edac_dev->edac_check = NULL;
+#ifdef CONFIG_DEBUG_CMEM
if (dev_info->is_ddr4)
axxia_mc_sysfs_attributes(dev_info->edac_dev);
+#endif
if (edac_device_add_device(dev_info->edac_dev) != 0) {
pr_info("Unable to add edac device for %s\n",
dev_info->ctl_name);
- goto err_nosysfs;
+ return ERR_STAGE_2;
}
- snprintf(&dev_info->data->irq_name[0], IRQ_NAME_LEN,
- "%s-mon", dev_info->ctl_name);
+ return 0;
+}
+
+static int enable_workers(struct intel_edac_dev_info *dev_info)
+{
+ atomic_set(&dev_info->data->dump_ready, 0);
+ atomic_set(&dev_info->data->event_ready, 0);
+ atomic_set(&dev_info->data->dump_in_progress, 0);
+
+ dev_info->wq_events = alloc_workqueue("%s-events", WQ_MEM_RECLAIM, 1,
+ (dev_info->ctl_name));
+ if (!dev_info->wq_events)
+ return ERR_STAGE_3;
+ if (dev_info->is_ddr4) {
+ dev_info->wq_alerts =
+ alloc_workqueue("%s-alerts", WQ_MEM_RECLAIM, 1,
+ (dev_info->ctl_name));
+ if (!dev_info->wq_alerts)
+ return ERR_STAGE_4;
+ }
if (dev_info->is_ddr4)
INIT_WORK(&dev_info->offload_alerts, axxia_alerts_work);
INIT_WORK(&dev_info->offload_events, axxia_events_work);
if (dev_info->is_ddr4)
- schedule_work(&dev_info->offload_alerts);
- schedule_work(&dev_info->offload_events);
+ queue_work(dev_info->wq_alerts, &dev_info->offload_alerts);
+ queue_work(dev_info->wq_events, &dev_info->offload_events);
+
+ return 0;
+}
- irq = platform_get_irq(pdev, 0);
+static int enable_driver_irq(struct intel_edac_dev_info *dev_info)
+{
+ int irq = -1, rc = 0;
+ struct cm_56xx_denali_ctl_86 denali_ctl_86;
+
+ snprintf(&dev_info->data->irq_name[0], IRQ_NAME_LEN,
+ "%s-mon", dev_info->ctl_name);
+
+ irq = platform_get_irq(dev_info->pdev, 0);
if (irq < 0) {
pr_err("Could not get irq number.\n");
- goto err_noirq;
+ return ERR_STAGE_5;
}
/*
* Enable memory controller interrupts.
*/
- if (dev_info->is_ddr4)
- denali_ctl_86.int_mask = CM_INT_MASK_FULL;
- else
- denali_ctl_86.int_mask = CM_INT_MASK_BASE;
+ if (dev_info->is_controller_configured) {
+ if (dev_info->is_ddr4)
+ denali_ctl_86.int_mask = CM_INT_MASK_FULL;
+ else
+ denali_ctl_86.int_mask = CM_INT_MASK_BASE;
+ } else
+ denali_ctl_86.int_mask = CM_INT_MASK_BASE_PROBE;
if (ncr_write(dev_info->cm_region, CM_56XX_DENALI_CTL_86,
4, (u32 *) &denali_ctl_86)) {
pr_err("Could not write interrupt mask reg (%s - ctl_86).\n",
dev_info->ctl_name);
- goto err_noirq;
+ return ERR_STAGE_6;
}
dev_info->data->irq = irq;
- rc = devm_request_irq(&pdev->dev, irq,
- cmmon_isr, IRQF_ONESHOT,
+ rc = devm_request_threaded_irq(&dev_info->pdev->dev, irq,
+ cmmon_isr_hw, cmmon_isr_sw, IRQF_ONESHOT,
&dev_info->data->irq_name[0], dev_info);
if (rc) {
@@ -1145,23 +1302,285 @@ static int intel_edac_mc_probe(struct platform_device *pdev)
4, (u32 *) &denali_ctl_86)) {
pr_err("Could not mask interrupts (%s - ctl_86).\n",
dev_info->ctl_name);
+ return ERR_STAGE_7;
}
- goto err_noirq;
+ return ERR_STAGE_6;
}
return 0;
+}
-err_noirq:
- if (dev_info->is_ddr4)
- cancel_work_sync(&dev_info->offload_alerts);
- cancel_work_sync(&dev_info->offload_events);
- edac_device_del_device(&dev_info->pdev->dev);
+#ifdef CONFIG_DEBUG_CMEM
+/*
+ * axxia_cmem_read - procfs read handler for the CMEM EDAC control file.
+ * @filp:   open file; private_data holds the controller's dev_info
+ * @buffer: user buffer to fill
+ * @length: size of @buffer
+ * @offset: file position, advanced by the number of bytes copied
+ *
+ * Reports the node id (cm_region >> 16) and the supported commands.
+ * simple_read_from_buffer() honours both the caller's buffer length and
+ * the file offset, so a short user buffer is never overrun, partial reads
+ * resume where they stopped and a read at EOF returns 0.
+ *
+ * Returns the number of bytes copied, 0 at EOF, or -EFAULT if the user
+ * buffer cannot be written.
+ */
+static ssize_t
+axxia_cmem_read(struct file *filp, char __user *buffer, size_t length,
+		loff_t *offset)
+{
+	struct intel_edac_dev_info *dev_info = filp->private_data;
+	char buf[128];
+	int len;
+
+	mutex_lock(&dev_info->state_machine_lock);
+
+	/*
+	 * Do not modify this. Content is used by rte driver.
+	 * Once changed modify rte code.
+	 */
+	len = scnprintf(buf, sizeof(buf), "Node: 0x%x\n"
+			"Command available:\n"
+			" dump - triggers mpr_page1 dump.\n",
+			(int) dev_info->cm_region >> 16);
+
+	mutex_unlock(&dev_info->state_machine_lock);
+
+	return simple_read_from_buffer(buffer, length, offset, buf, len);
+}
-err_nosysfs:
- edac_device_free_ctl_info(dev_info->edac_dev);
-err_noctlinfo:
+
+
+/*
+ * axxia_cmem_write - procfs write handler for the CMEM EDAC control file.
+ * @file:   open file; private_data holds the controller's dev_info
+ * @buffer: user data holding the command text
+ * @count:  number of bytes supplied by the caller
+ * @ppos:   file position (unused)
+ *
+ * The only command is "dump", which requests an MPR page 1 dump by bumping
+ * dump_in_progress and waking the waiters on dump_wq.  Only a command-sized
+ * prefix of the input is copied, so the caller's @count never sizes a
+ * kernel allocation.  Unrecognised input is accepted and ignored.
+ *
+ * Returns @count on success or -EFAULT if the user buffer is unreadable.
+ */
+static ssize_t
+axxia_cmem_write(struct file *file, const char __user *buffer,
+		 size_t count, loff_t *ppos)
+{
+	struct intel_edac_dev_info *dev_info = file->private_data;
+	char cmd[8] = "";
+	size_t len = min(count, sizeof(cmd) - 1);
+
+	/* Only the leading command word matters; cmd stays NUL-terminated. */
+	if (copy_from_user(cmd, buffer, len)) {
+		pr_err("Could not copy data from user.\n");
+		return -EFAULT;
+	}
+
+	if (!strncmp(cmd, "dump", 4)) {
+		atomic_inc(&dev_info->data->dump_in_progress);
+		wake_up(&dev_info->data->dump_wq);
+	}
+
+	return count;
+}
+
+/*
+ * axxia_cmem_open - open handler for the CMEM EDAC procfs control file.
+ *
+ * Takes a module reference for the lifetime of the open file and stores
+ * the per-controller data attached to the proc entry (PDE_DATA) in
+ * filp->private_data for the read/write handlers.
+ *
+ * Returns 0 on success or -ENODEV if the module reference cannot be taken.
+ * Failing the open keeps the module_put() in axxia_cmem_close() balanced,
+ * since release only runs for opens that succeeded.
+ */
+int axxia_cmem_open(struct inode *inode, struct file *filp)
+{
+	if (!try_module_get(THIS_MODULE))
+		return -ENODEV;
+
+	filp->private_data = PDE_DATA(inode);
+	return 0;
+}
+
+/*
+ * axxia_cmem_close - release handler; drops the reference taken at open
+ * and clears the per-file private pointer.
+ *
+ * Always returns 0.
+ */
+int axxia_cmem_close(struct inode *inode, struct file *filp)
+{
+	module_put(THIS_MODULE);
+	filp->private_data = NULL;
+	return 0;
+}
+
+static const struct file_operations axxia_edac_cmem_proc_ops = {
+ .owner = THIS_MODULE,
+ .open = axxia_cmem_open, /* stores PDE_DATA(inode) in private_data */
+ .read = axxia_cmem_read, /* node id plus command help text */
+ .write = axxia_cmem_write, /* "dump" requests an MPR dump */
+ .release = axxia_cmem_close, /* drops the module reference */
+ .llseek = noop_llseek
+};
+
+/* Remove the controller's procfs control file, if one was created. */
+static void remove_procfs_entry(struct intel_edac_dev_info *dev_info)
+{
+	if (!dev_info || !dev_info->dir_entry)
+		return;
+
+	proc_remove(dev_info->dir_entry);
+	dev_info->dir_entry = NULL;
+}
+#endif
+
+static int intel_edac_mc_probe(struct platform_device *pdev)
+{
+ int i, j, k, l;
+ int count;
+ struct intel_edac_dev_info *dev_info = NULL;
+ struct resource *io;
+ struct device_node *np = pdev->dev.of_node;
+ struct cm_56xx_denali_ctl_00 denali_ctl_00;
+ int ret = -1;
+
+ count = atomic64_inc_return(&mc_counter);
+ if ((count - 1) == MEMORY_CONTROLLERS)
+ goto err_nodev;
+
+ dev_info = devm_kzalloc(&pdev->dev, sizeof(*dev_info), GFP_KERNEL);
+ if (!dev_info)
+ goto err_nomem;
+
+ dev_info->ctl_name =
+ devm_kzalloc(&pdev->dev, 32*sizeof(char), GFP_KERNEL);
+ if (!dev_info->ctl_name)
+ goto err_nomem;
+
+ dev_info->blk_name =
+ devm_kzalloc(&pdev->dev, 32*sizeof(char), GFP_KERNEL);
+ if (!dev_info->blk_name)
+ goto err_nomem;
+
+ dev_info->data =
+ devm_kzalloc(&pdev->dev, sizeof(*dev_info->data), GFP_KERNEL);
+ if (!dev_info->data)
+ goto err_nomem;
+
+ init_waitqueue_head(&dev_info->data->dump_wq);
+ init_waitqueue_head(&dev_info->data->event_wq);
+
+ raw_spin_lock_init(&dev_info->data->mpr_data_lock);
+ mutex_init(&dev_info->data->edac_sysfs_data_lock);
+ mutex_init(&dev_info->state_machine_lock);
+
+ strncpy(dev_info->ctl_name, np->name, 32);
+ dev_info->ctl_name[31] = '\0';
+
+ strncpy(dev_info->blk_name, "ECC", 32);
+ dev_info->blk_name[31] = '\0';
+
+ edac_op_state = EDAC_OPSTATE_POLL;
+
+ dev_info->pdev = pdev;
+ dev_info->edac_idx = edac_device_alloc_index();
+ dev_info->data->irq = 0;
+
+ /* setup all counters */
+ for (i = 0; i < NR_EVENTS; ++i)
+ atomic_set(&dev_info->data->events[i].counter, 0);
+
+ for (j = 0; j < MAX_CS; ++j) {
+ for (l = 0; l < MAX_DQ; ++l) {
+ for (k = 0; k < MPR_ERRORS; ++k, ++i) {
+ atomic_set(&dev_info->data->
+ alerts[j][l][k].counter, 0);
+ }
+ }
+ }
+
+ io = platform_get_resource(pdev, IORESOURCE_MEM, 0);
+ if (!io) {
+ dev_err(&pdev->dev, "Unable to get mem resource\n");
+ goto err_init;
+ }
+ dev_info->cm_region = io->start;
+ dev_info->syscon =
+ syscon_regmap_lookup_by_phandle(np, "syscon");
+ if (IS_ERR(dev_info->syscon)) {
+ pr_info(FMT, np->name);
+ dev_info->axi2ser3_region = ioremap(AXI2_SER3_PHY_ADDR,
+ AXI2_SER3_PHY_SIZE);
+ if (!dev_info->axi2ser3_region) {
+ pr_err("ioremap of axi2ser3 region failed\n");
+ goto err_init;
+ }
+ }
+
+ if (ncr_read(dev_info->cm_region, CM_56XX_DENALI_CTL_00,
+ 4, (u32 *) &denali_ctl_00)) {
+ pr_err("Could not read ddr version.\n");
+ goto err_init;
+ }
+
+ if (denali_ctl_00.start == 1) {
+ /* uboot has configured CMEM */
+ if (denali_ctl_00.dram_class == 0xa) {
+ pr_info("%s supports mpr dump (DDR4).\n",
+ dev_info->ctl_name);
+ dev_info->is_ddr4 = 1;
+ }
+ if (denali_ctl_00.dram_class == 0x6) {
+ pr_info("%s doesn't support mpr dump (DDR3).\n",
+ dev_info->ctl_name);
+ }
+ dev_info->is_controller_configured = 1;
+
+ ret = initialize(dev_info);
+ if (ret)
+ goto err_uninit;
+
+ ret = enable_workers(dev_info);
+ if (ret)
+ goto err_uninit;
+
+ ret = enable_driver_irq(dev_info);
+ if (ret)
+ goto err_uninit;
+
+ } else {
+ /* CMEM is not configured */
+ dev_info->is_controller_configured = 0;
+
+ ret = enable_driver_irq(dev_info);
+ if (ret)
+ goto err_uninit;
+
+ pr_info("CMEM base init: controller: %s DEV %s (INTERRUPT).\n",
+ dev_info->ctl_name,
+ dev_name(&dev_info->pdev->dev));
+ }
+
+#ifdef CONFIG_DEBUG_CMEM
+ /* in this case create procfs file to be used by rte */
+ dev_info->proc_name =
+ devm_kzalloc(&pdev->dev, 32*sizeof(char),
+ GFP_KERNEL);
+ if (!dev_info->proc_name)
+ goto err_uninit;
+
+ snprintf(dev_info->proc_name, 31*sizeof(char),
+ "driver/axxia_edac_%s_control",
+ dev_info->ctl_name);
+
+ /* each instance shall know each private data */
+ dev_info->dir_entry =
+ proc_create_data(dev_info->proc_name, 0200,
+ NULL, &axxia_edac_cmem_proc_ops,
+ dev_info);
+
+ if (dev_info->dir_entry == NULL) {
+ pr_err("Could not create proc entry for %s.\n",
+ dev_info->ctl_name);
+ goto err_uninit;
+ }
+#endif
+ return 0;
+
+
+err_uninit:
+ uninitialize(dev_info, ret,
+ dev_info->is_controller_configured == 0 ? 1 : 0);
+err_init:
mutex_destroy(&dev_info->data->edac_sysfs_data_lock);
+ mutex_destroy(&dev_info->state_machine_lock);
atomic64_dec(&mc_counter);
return 1;
err_nomem:
@@ -1172,23 +1591,20 @@ static int intel_edac_mc_probe(struct platform_device *pdev)
return -ENODEV;
}
+
+
static int intel_edac_mc_remove(struct platform_device *pdev)
{
struct intel_edac_dev_info *dev_info =
(struct intel_edac_dev_info *) &pdev->dev;
if (dev_info) {
- if (dev_info->data->irq > 0) {
- disable_irq(dev_info->data->irq);
- devm_free_irq(&pdev->dev,
- dev_info->data->irq, dev_info);
+#ifdef CONFIG_DEBUG_CMEM
+ remove_procfs_entry(dev_info);
+#endif
- dev_info->data->irq = 0;
-
- if (dev_info->is_ddr4)
- cancel_work_sync(&dev_info->offload_alerts);
- cancel_work_sync(&dev_info->offload_events);
- }
+ uninitialize(dev_info, ERR_STAGE_8,
+ dev_info->is_controller_configured == 0 ? 1 : 0);
if (dev_info->edac_dev != NULL) {
edac_device_del_device(&dev_info->pdev->dev);
@@ -1196,6 +1612,8 @@ static int intel_edac_mc_remove(struct platform_device *pdev)
}
mutex_destroy(&dev_info->data->edac_sysfs_data_lock);
+ mutex_destroy(&dev_info->state_machine_lock);
+
atomic64_dec(&mc_counter);
}
platform_device_unregister(pdev);
diff --git a/drivers/edac/axxia_edac-mc_56xx.c b/drivers/edac/axxia_edac-mc_56xx.c
index f850ebe..fb1e742 100644
--- a/drivers/edac/axxia_edac-mc_56xx.c
+++ b/drivers/edac/axxia_edac-mc_56xx.c
@@ -92,68 +92,68 @@
#define ALIVE_NOTIFICATION_PERIOD (90*1000)
static int log = 1;
-module_param(log, int, S_IRUGO|S_IWUSR);
+module_param(log, int, 0644);
MODULE_PARM_DESC(log, "Log each error to kernel log.");
static int force_restart = 1;
-module_param(force_restart, int, S_IRUGO|S_IWUSR);
+module_param(force_restart, int, 0644);
MODULE_PARM_DESC(force_restart, "Machine restart on fatal error.");
static atomic64_t mc_counter = ATOMIC_INIT(0);
/*
- Bit [34] = Logical OR of all lower bits.
- Bit [33] = A CRC error occurred on the write data bus.
- Bit [32] = The software-initiated control word write has completed.
- Bit [31] = The user-initiated DLL resync has completed.
- Bit [30] = A state change has been detected on the
- dfi_init_complete signal after initialization.
- Bit [29] = The assertion of the INHIBIT_DRAM_CMD parameter has
- successfully inhibited the command queue.
- Bit [28] = The register interface-initiated mode register write has
- completed and another mode register write may be issued.
- Bit [27] = A Low Power Interface (LPI) timeout error has occurred.
- Bit [26] = MPR read command, initiated with a software MPR_READ request,
- is complete.
- Bit [25] = Error received from the PHY on the DFI bus.
- Bit [24] = RESERVED
- Bit [23] = RESERVED
- Bit [22] = A parity error has been detected on the address/control bus
- on a registered DIMM.
- Bit [21] = The leveling operation has completed.
- Bit [20] = A read leveling gate training operation has been requested.
- Bit [19] = A read leveling operation has been requested.
- Bit [18] = A write leveling operation has been requested.
- Bit [17] = A DFI update error has occurred. Error information can be
- found in the UPDATE_ERROR_STATUS parameter.
- Bit [16] = A write leveling error has occurred. Error information can
- be found in the WRLVL_ERROR_STATUS parameter.
- Bit [15] = A read leveling gate training error has occurred. Error
- information can be found in the RDLVL_ERROR_STATUS parameter.
- Bit [14] = A read leveling error has occurred. Error information can be
- found in the RDLVL_ERROR_STATUS parameter.
- Bit [13] = The user has programmed an invalid setting associated with
- user words per burst.
- Examples: Setting param_reduc when burst length = 2. A 1:2
- MC:PHY clock ratio with burst length = 2.
- Bit [12] = A wrap cycle crossing a DRAM page has been detected. This
- is unsupported & may result in memory data corruption.
- Bit [11] = A write was attempted to a writeprotected region.
- Bit [10] = The BIST operation has been completed.
- Bit [9] = The low power operation has been completed.
- Bit [8] = The MC initialization has been completed.
- Bit [7] = An error occurred on the port command channel.
- Bit [6] = Multiple uncorrectable ECC events have been detected.
- Bit [5] = An uncorrectable ECC event has been detected.
- Bit [4] = Multiple correctable ECC events have been detected.
- Bit [3] = A correctable ECC event has been detected.
- Bit [2] = Multiple accesses outside the defined PHYSICAL memory space
- have occurred.
- Bit [1] = A memory access outside the defined PHYSICAL memory space
- has occurred.
- Bit [0] = The memory reset is valid on the DFI bus.
-
- Of these 1, 2, 3, 4, 5, 6, 7, 12, 22 and 26 are of interest.
-*/
+ * Bit [34] = Logical OR of all lower bits.
+ * Bit [33] = A CRC error occurred on the write data bus.
+ * Bit [32] = The software-initiated control word write has completed.
+ * Bit [31] = The user-initiated DLL resync has completed.
+ * Bit [30] = A state change has been detected on the
+ * dfi_init_complete signal after initialization.
+ * Bit [29] = The assertion of the INHIBIT_DRAM_CMD parameter has
+ * successfully inhibited the command queue.
+ * Bit [28] = The register interface-initiated mode register write has
+ * completed and another mode register write may be issued.
+ * Bit [27] = A Low Power Interface (LPI) timeout error has occurred.
+ * Bit [26] = MPR read command, initiated with a software MPR_READ request,
+ * is complete.
+ * Bit [25] = Error received from the PHY on the DFI bus.
+ * Bit [24] = RESERVED
+ * Bit [23] = RESERVED
+ * Bit [22] = A parity error has been detected on the address/control bus
+ * on a registered DIMM.
+ * Bit [21] = The leveling operation has completed.
+ * Bit [20] = A read leveling gate training operation has been requested.
+ * Bit [19] = A read leveling operation has been requested.
+ * Bit [18] = A write leveling operation has been requested.
+ * Bit [17] = A DFI update error has occurred. Error information can be
+ * found in the UPDATE_ERROR_STATUS parameter.
+ * Bit [16] = A write leveling error has occurred. Error information can
+ * be found in the WRLVL_ERROR_STATUS parameter.
+ * Bit [15] = A read leveling gate training error has occurred. Error
+ * information can be found in the RDLVL_ERROR_STATUS parameter.
+ * Bit [14] = A read leveling error has occurred. Error information can be
+ * found in the RDLVL_ERROR_STATUS parameter.
+ * Bit [13] = The user has programmed an invalid setting associated with
+ * user words per burst.
+ * Examples: Setting param_reduc when burst length = 2. A 1:2
+ * MC:PHY clock ratio with burst length = 2.
+ * Bit [12] = A wrap cycle crossing a DRAM page has been detected. This
+ * is unsupported & may result in memory data corruption.
+ * Bit [11] = A write was attempted to a write-protected region.
+ * Bit [10] = The BIST operation has been completed.
+ * Bit [9] = The low power operation has been completed.
+ * Bit [8] = The MC initialization has been completed.
+ * Bit [7] = An error occurred on the port command channel.
+ * Bit [6] = Multiple uncorrectable ECC events have been detected.
+ * Bit [5] = An uncorrectable ECC event has been detected.
+ * Bit [4] = Multiple correctable ECC events have been detected.
+ * Bit [3] = A correctable ECC event has been detected.
+ * Bit [2] = Multiple accesses outside the defined PHYSICAL memory space
+ * have occurred.
+ * Bit [1] = A memory access outside the defined PHYSICAL memory space
+ * has occurred.
+ * Bit [0] = The memory reset is valid on the DFI bus.
+ *
+ * Of these 1, 2, 3, 4, 5, 6, 7, 12, 22 and 26 are of interest.
+ */
/*
* MPR dump processing - overview.
@@ -162,7 +162,7 @@ static atomic64_t mc_counter = ATOMIC_INIT(0);
* one need to collect dumps for all available cs. Below given example
* for two cs0/cs1.
*
- * SMEM MC smmon_isr smmon_wq
+ * SMEM MC smmon_isr_sw smmon_wq
* | | |
* | | |
* |ALERT_N - int_status bit [33] |
@@ -612,6 +612,10 @@ struct intel_edac_dev_info {
char *blk_name;
struct work_struct offload_alerts;
struct work_struct offload_events;
+ struct workqueue_struct *wq_alerts;
+ struct workqueue_struct *wq_events;
+ int finish_alerts;
+ int finish_events;
int is_ddr4;
int edac_idx;
u32 sm_region;
@@ -725,7 +729,7 @@ static struct edac_dev_sysfs_attribute device_block_attr[] = {
{
.attr = {
.name = "mpr_page1",
- .mode = (S_IRUGO | S_IWUSR)
+ .mode = (0644)
},
.show = mpr1_dump_show,
.store = NULL},
@@ -904,7 +908,13 @@ collect_mpr_dump(struct intel_edac_dev_info *edac_dev, u8 page, int cs)
}
static irqreturn_t
-smmon_isr(int interrupt, void *device)
+smmon_isr_hw(int interrupt, void *device)
+{
+ return IRQ_WAKE_THREAD; /* no work in the primary handler; wake the threaded handler (smmon_isr_sw) */
+}
+
+static irqreturn_t
+smmon_isr_sw(int interrupt, void *device)
{
struct intel_edac_dev_info *dev_info = device;
struct sm_56xx_denali_ctl_366 denali_ctl_366;
@@ -990,11 +1000,14 @@ static void intel_sm_alerts_error_check(struct edac_device_ctl_info *edac_dev)
start:
/* keep hung up monitor happy 90 sec's */
- if (0 == wait_event_timeout(dev_info->data->dump_wq,
+ if (wait_event_timeout(dev_info->data->dump_wq,
atomic_read(&dev_info->data->dump_in_progress),
- msecs_to_jiffies(ALIVE_NOTIFICATION_PERIOD)))
+ msecs_to_jiffies(ALIVE_NOTIFICATION_PERIOD)) == 0)
goto start;
+ if (dev_info->finish_alerts)
+ goto finish;
+
/* the only one running workqueue */
for (i = 0; i < dev_info->data->cs_count; ++i) {
@@ -1021,6 +1034,9 @@ static void intel_sm_alerts_error_check(struct edac_device_ctl_info *edac_dev)
wait_event(dev_info->data->dump_wq,
atomic_read(&dev_info->data->dump_ready));
+ if (dev_info->finish_alerts)
+ goto finish;
+
atomic_set(&dev_info->data->dump_ready, 0);
/* collect data */
collect_mpr_dump(dev_info, SM_MPR_PAGE, i);
@@ -1054,6 +1070,10 @@ static void intel_sm_alerts_error_check(struct edac_device_ctl_info *edac_dev)
printk_ratelimited("Could not collect MPR dump.\n");
atomic_set(&dev_info->data->dump_in_progress, 0);
goto start;
+
+finish:
+ atomic_set(&dev_info->data->dump_ready, 0);
+ atomic_set(&dev_info->data->dump_in_progress, 0);
}
static void intel_sm_events_error_check(struct edac_device_ctl_info *edac_dev)
@@ -1065,13 +1085,16 @@ static void intel_sm_events_error_check(struct edac_device_ctl_info *edac_dev)
u32 counter;
while (1) {
- if (0 == wait_event_timeout(dev_info->data->event_wq,
+ if (wait_event_timeout(dev_info->data->event_wq,
atomic_read(&dev_info->data->event_ready),
- msecs_to_jiffies(ALIVE_NOTIFICATION_PERIOD)))
+ msecs_to_jiffies(ALIVE_NOTIFICATION_PERIOD)) == 0)
continue;
atomic_set(&dev_info->data->event_ready, 0);
+ if (dev_info->finish_events)
+ break;
+
mutex_lock(&dev_info->data->edac_sysfs_data_lock);
for (i = 0; i < NR_EVENTS; ++i) {
counter = atomic_xchg(&events[i].counter, 0);
@@ -1160,6 +1183,22 @@ static int get_active_dram(struct intel_edac_dev_info *dev_info)
return dram;
}
+static void finish_workqueues(struct intel_edac_dev_info *dev_info)
+{ /* Ask the alerts (DDR4 only) and events workers to leave their loops, then wait for them. */
+ if (dev_info->is_ddr4) {
+ dev_info->finish_alerts = 1; /* checked by the alerts worker after each wait */
+ atomic_inc(&dev_info->data->dump_in_progress); /* satisfies the worker's wait_event_timeout() condition */
+ atomic_set(&dev_info->data->dump_ready, 1); /* satisfies the worker's inner wait_event() condition */
+ wake_up(&dev_info->data->dump_wq);
+ cancel_work_sync(&dev_info->offload_alerts); /* returns once the alerts work item is no longer running */
+ }
+
+ dev_info->finish_events = 1; /* checked by the events worker right after it wakes */
+ atomic_set(&dev_info->data->event_ready, 1); /* satisfies the worker's wait_event_timeout() condition */
+ wake_up(&dev_info->data->event_wq);
+ cancel_work_sync(&dev_info->offload_events); /* returns once the events work item is no longer running */
+}
+
static int intel_edac_mc_probe(struct platform_device *pdev)
{
struct edac_device_instance *instance;
@@ -1184,10 +1223,20 @@ static int intel_edac_mc_probe(struct platform_device *pdev)
if (!dev_info)
goto err_nomem;
+ dev_info->ctl_name =
+ devm_kzalloc(&pdev->dev, 32*sizeof(char), GFP_KERNEL);
+ if (!dev_info->ctl_name)
+ goto err_nomem;
+
+ dev_info->blk_name =
+ devm_kzalloc(&pdev->dev, 32*sizeof(char), GFP_KERNEL);
+ if (!dev_info->blk_name)
+ goto err_nomem;
+
dev_info->data =
devm_kzalloc(&pdev->dev, sizeof(*dev_info->data), GFP_KERNEL);
if (!dev_info->data)
- goto err_noctlinfo;
+ goto err_nomem;
init_waitqueue_head(&dev_info->data->dump_wq);
init_waitqueue_head(&dev_info->data->event_wq);
@@ -1195,6 +1244,12 @@ static int intel_edac_mc_probe(struct platform_device *pdev)
raw_spin_lock_init(&dev_info->data->mpr_data_lock);
mutex_init(&dev_info->data->edac_sysfs_data_lock);
+ strncpy(dev_info->ctl_name, np->name, 32);
+ dev_info->ctl_name[31] = '\0';
+
+ strncpy(dev_info->blk_name, "ECC", 32);
+ dev_info->blk_name[31] = '\0'; /* strncpy() does not guarantee NUL termination */
+
dev_info->ctl_name = kstrdup(np->name, GFP_KERNEL);
dev_info->blk_name = "ECC";
edac_op_state = EDAC_OPSTATE_POLL;
@@ -1268,8 +1323,10 @@ static int intel_edac_mc_probe(struct platform_device *pdev)
dev_info->edac_dev =
edac_device_alloc_ctl_info(0, dev_info->ctl_name,
1, dev_info->blk_name,
- NR_EVENTS +
- cs_count * dram_count * MPR_ERRORS,
+ NR_EVENTS + (dev_info->is_ddr4 ?
+ cs_count * dram_count * MPR_ERRORS
+ :
+ 0),
0, NULL, 0, dev_info->edac_idx);
if (!dev_info->edac_dev) {
@@ -1329,20 +1386,34 @@ static int intel_edac_mc_probe(struct platform_device *pdev)
if (edac_device_add_device(dev_info->edac_dev) != 0) {
pr_info("Unable to add edac device for %s\n",
dev_info->ctl_name);
- goto err_nosysfs;
+ goto err_noctlinfo;
}
snprintf(&dev_info->data->irq_name[0], IRQ_NAME_LEN,
"%s-mon", dev_info->ctl_name);
+ dev_info->wq_events =
+ alloc_workqueue("%s-events", WQ_MEM_RECLAIM, 1,
+ (dev_info->ctl_name));
+ if (!dev_info->wq_events)
+ goto err_nosysfs;
+
+ if (dev_info->is_ddr4) {
+ dev_info->wq_alerts =
+ alloc_workqueue("%s-alerts", WQ_MEM_RECLAIM, 1,
+ (dev_info->ctl_name));
+
+ if (!dev_info->wq_alerts)
+ goto err_noevents;
+ }
if (dev_info->is_ddr4)
INIT_WORK(&dev_info->offload_alerts, axxia_alerts_work);
INIT_WORK(&dev_info->offload_events, axxia_events_work);
if (dev_info->is_ddr4)
- schedule_work(&dev_info->offload_alerts);
- schedule_work(&dev_info->offload_events);
+ queue_work(dev_info->wq_alerts, &dev_info->offload_alerts);
+ queue_work(dev_info->wq_events, &dev_info->offload_events);
irq = platform_get_irq(pdev, 0);
if (irq < 0) {
@@ -1382,8 +1453,8 @@ static int intel_edac_mc_probe(struct platform_device *pdev)
}
dev_info->data->irq = irq;
- rc = devm_request_irq(&pdev->dev, irq,
- smmon_isr, IRQF_ONESHOT,
+ rc = devm_request_threaded_irq(&pdev->dev, irq,
+ smmon_isr_hw, smmon_isr_sw, IRQF_ONESHOT,
&dev_info->data->irq_name[0], dev_info);
if (rc) {
@@ -1411,18 +1482,23 @@ static int intel_edac_mc_probe(struct platform_device *pdev)
return 0;
err_noirq:
+ finish_workqueues(dev_info);
+ edac_device_del_device(&dev_info->pdev->dev);
+
if (dev_info->is_ddr4)
- cancel_work_sync(&dev_info->offload_alerts);
- cancel_work_sync(&dev_info->offload_events);
+ destroy_workqueue(dev_info->wq_alerts);
- edac_device_del_device(&dev_info->pdev->dev);
+err_noevents:
+ destroy_workqueue(dev_info->wq_events);
err_nosysfs:
edac_device_free_ctl_info(dev_info->edac_dev);
+
err_noctlinfo:
mutex_destroy(&dev_info->data->edac_sysfs_data_lock);
atomic64_dec(&mc_counter);
return 1;
+
err_nomem:
atomic64_dec(&mc_counter);
return -ENOMEM;
@@ -1444,9 +1520,11 @@ static int intel_edac_mc_remove(struct platform_device *pdev)
dev_info->data->irq = 0;
+ finish_workqueues(dev_info);
+
if (dev_info->is_ddr4)
- cancel_work_sync(&dev_info->offload_alerts);
- cancel_work_sync(&dev_info->offload_events);
+ destroy_workqueue(dev_info->wq_alerts);
+ destroy_workqueue(dev_info->wq_events);
}
if (dev_info->edac_dev != NULL) {
--
2.7.4
More information about the linux-yocto
mailing list