[linux-yocto] [PATCH 30/48] drivers/edac: CMEM EDAC support on config load.

Daniel Dragomir daniel.dragomir at windriver.com
Mon Dec 11 05:14:00 PST 2017


From: Marek Majtyka <marekx.majtyka at intel.com>

Implemented CMEM EDAC support on rte config load/unload.
 - Fixed bugs found in CMEM/SMEM:
        - work queue hang on driver removal
        - unnecessary EDAC sysfs entries created on DDR3 (the MPR dump
          attribute is valid only for DDR4)

Signed-off-by: Marek Majtyka <marekx.majtyka at intel.com>
---
 drivers/edac/axxia_edac-cmc_56xx.c | 796 ++++++++++++++++++++++++++++---------
 drivers/edac/axxia_edac-mc_56xx.c  | 228 +++++++----
 2 files changed, 760 insertions(+), 264 deletions(-)
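
Not part of the diff below: a minimal, hedged sketch of the deferred-init flow
this patch adds.  Everything prefixed my_*, plus MY_PROBE_MASK/MY_RUNTIME_MASK,
is a hypothetical stand-in for the driver's ncr_read()/ncr_write() register
helpers and CM_INT_MASK_* values; only the kernel APIs used here
(devm_request_threaded_irq(), platform_get_irq(), devm_kzalloc()) are real.
It shows why the ISR is split into a hw half that only wakes the thread and a
sw half that may sleep: the one-time EDAC setup (workqueues,
edac_device_add_device()) runs from the threaded handler when the controller
raises "MC initialization complete" (bit 8), and every other event is ignored
until then.

#include <linux/interrupt.h>
#include <linux/platform_device.h>
#include <linux/slab.h>

#define MY_INT_MC_INIT_DONE	(1u << 8)	/* mirrors INT_BIT_8 */
#define MY_INT_SUMMARY		(1u << 31)	/* mirrors INT_BIT_31 */
/* probe time: mask everything except "init done" and the summary bit */
#define MY_PROBE_MASK		(~(MY_INT_MC_INIT_DONE | MY_INT_SUMMARY))
#define MY_RUNTIME_MASK		(~0x000000feu)	/* illustrative value only */

struct my_dev {
	bool configured;	/* set once the rte config has been loaded */
};

/* hypothetical register helpers standing in for ncr_read()/ncr_write() */
u32 my_read_int_status(struct my_dev *d);
void my_write_int_mask(struct my_dev *d, u32 mask);
int my_late_init(struct my_dev *d);		/* may sleep */
void my_handle_ecc_events(struct my_dev *d, u32 status);

/* hard handler: no work in hard-IRQ context, just defer to the thread */
static irqreturn_t my_isr_hw(int irq, void *dev_id)
{
	return IRQ_WAKE_THREAD;
}

/* threaded handler: sleeping is allowed, so late init can run here */
static irqreturn_t my_isr_sw(int irq, void *dev_id)
{
	struct my_dev *d = dev_id;
	u32 status = my_read_int_status(d);

	if (status & MY_INT_MC_INIT_DONE) {
		if (!d->configured && my_late_init(d) == 0)
			d->configured = true;
		/* widen the mask from probe-time to run-time interrupts */
		my_write_int_mask(d, d->configured ?
				  MY_RUNTIME_MASK : MY_PROBE_MASK);
		return IRQ_HANDLED;
	}

	if (!d->configured)	/* driver not functional yet, drop the event */
		return IRQ_HANDLED;

	my_handle_ecc_events(d, status);
	return IRQ_HANDLED;
}

static int my_probe(struct platform_device *pdev)
{
	struct my_dev *d = devm_kzalloc(&pdev->dev, sizeof(*d), GFP_KERNEL);
	int irq = platform_get_irq(pdev, 0);

	if (!d || irq < 0)
		return -ENODEV;
	my_write_int_mask(d, MY_PROBE_MASK);	/* wait for config load */
	return devm_request_threaded_irq(&pdev->dev, irq, my_isr_hw, my_isr_sw,
					 IRQF_ONESHOT, "cmem-mon", d);
}

IRQF_ONESHOT keeps the line masked until my_isr_sw() returns, so the blocking
initialization cannot race with further controller interrupts.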

diff --git a/drivers/edac/axxia_edac-cmc_56xx.c b/drivers/edac/axxia_edac-cmc_56xx.c
index f99c46d..c4bf2d0 100644
--- a/drivers/edac/axxia_edac-cmc_56xx.c
+++ b/drivers/edac/axxia_edac-cmc_56xx.c
@@ -25,6 +25,9 @@
 #include <linux/mfd/syscon.h>
 #include <linux/regmap.h>
 #include <linux/interrupt.h>
+#include <linux/proc_fs.h>
+#include <linux/seq_file.h>
+#include <linux/uaccess.h>
 #include "edac_core.h"
 #include "edac_module.h"
 
@@ -78,12 +81,17 @@
 #define INT_BIT_5  (0x00000020)
 #define INT_BIT_6  (0x00000040)
 #define INT_BIT_7  (0x00000080)
+#define INT_BIT_8  (0x00000100)
 #define INT_BIT_11 (0x00000800)
 #define INT_BIT_21 (0x00200000)
 #define INT_BIT_25 (0x02000000)
 #define INT_BIT_30 (0x40000000)
 #define INT_BIT_31 (0x80000000)
 
+#define CM_INT_MASK_BASE_PROBE (~(\
+			INT_BIT_8 |\
+			INT_BIT_31))
+
 #define CM_INT_MASK_BASE (~(\
 			INT_BIT_1 |\
 			INT_BIT_2 |\
@@ -110,70 +118,70 @@
 			INT_BIT_30 |\
 			INT_BIT_31))
 
-#define CM_INT_MASK_ALL (0xffffffff)
+#define CM_INT_MASK_ALL (0x7fffffff)
 #define ALIVE_NOTIFICATION_PERIOD (90*1000)
 
 static int log = 1;
-module_param(log, int, S_IRUGO|S_IWUSR);
+module_param(log, int, 0644);
 MODULE_PARM_DESC(log, "Log each error to kernel log.");
 
 static int force_restart = 1;
-module_param(force_restart, int, S_IRUGO|S_IWUSR);
+module_param(force_restart, int, 0644);
 MODULE_PARM_DESC(force_restart, "Machine restart on fatal error.");
 
 static atomic64_t mc_counter = ATOMIC_INIT(0);
 /*
- Bit [31] = Logical OR of all lower bits.
- Bit [30] = A CRC error occurred on the write data bus.
- Bit [29] = The user-initiated DLL resync has completed.
- Bit [28] = A state change has been detected on the dfi_init_complete
-	signal after initialization.
- Bit [27] = The assertion of the INHIBIT_DRAM_CMD parameter has successfully
-	inhibited the command queue.
- Bit [26] = The register interface-initiated mode register write has completed
-	and another mode register write may be issued.
- Bit [25] = MPR read command, initiated with a software MPR_READ request, is
-	complete.
- Bit [24] = Error received from the PHY on the DFI bus.
- Bit [23] = RESERVED
- Bit [22] = RESERVED
- Bit [21] = A parity error has been detected on the address/control bus on
-	a registered DIMM.
- Bit [20] = The leveling operation has completed.
- Bit [19] = A read leveling gate training operation has been requested.
- Bit [18] = A read leveling operation has been requested.
- Bit [17] = A write leveling operation has been requested.
- Bit [16] = A DFI update error has occurred.  Error information can be found in
-	the UPDATE_ERROR_STATUS parameter.
- Bit [15] = A write leveling error has occurred. Error information can be found
-	in the WRLVL_ERROR_STATUS parameter.
- Bit [14] = A read leveling gate training error has occurred. Error information
-	can be found in the RDLVL_ERROR_STATUS parameter.
- Bit [13] = A read leveling error has occurred. Error information can be found
-	in the RDLVL_ERROR_STATUS parameter.
- Bit [12] = The user has programmed an invalid setting associated with user
-	words per burst.
-	Examples:
-		Setting param_reduc when burst length = 2.
-		A 1:2 MC:PHY clock ratio with burst length = 2.
- Bit [11] = A wrap cycle crossing a DRAM page has been detected. This is
-	unsupported & may result in memory data corruption.
- Bit [10] = The BIST operation has been completed.
- Bit [9] = The low power operation has been completed.
- Bit [8] = The MC initialization has been completed.
- Bit [7] = An error occurred on the port command channel.
- Bit [6] = Multiple uncorrectable ECC events have been detected.
- Bit [5] = An uncorrectable ECC event has been detected.
- Bit [4] = Multiple correctable ECC events have been detected.
- Bit [3] = A correctable ECC event has been detected.
- Bit [2] = Multiple accesses outside the defined PHYSICAL memory space
-	have occurred.
- Bit [1] = A memory access outside the defined PHYSICAL memory space
-	has occurred.
- Bit [0] = The memory reset is valid on the DFI bus.
-
- Of these 1, 2, 3, 4, 5, 6, 7, 11, 21, 25, and 30 are of interest.
-*/
+ * Bit [31] = Logical OR of all lower bits.
+ * Bit [30] = A CRC error occurred on the write data bus.
+ * Bit [29] = The user-initiated DLL resync has completed.
+ * Bit [28] = A state change has been detected on the dfi_init_complete
+ *        signal after initialization.
+ * Bit [27] = The assertion of the INHIBIT_DRAM_CMD parameter has successfully
+ *        inhibited the command queue.
+ * Bit [26] = The register interface-initiated mode register write has completed
+ *        and another mode register write may be issued.
+ * Bit [25] = MPR read command, initiated with a software MPR_READ request, is
+ *        complete.
+ * Bit [24] = Error received from the PHY on the DFI bus.
+ * Bit [23] = RESERVED
+ * Bit [22] = RESERVED
+ * Bit [21] = A parity error has been detected on the address/control bus on
+ *        a registered DIMM.
+ * Bit [20] = The leveling operation has completed.
+ * Bit [19] = A read leveling gate training operation has been requested.
+ * Bit [18] = A read leveling operation has been requested.
+ * Bit [17] = A write leveling operation has been requested.
+ * Bit [16] = A DFI update error has occurred.  Error information can be found
+ *        in the UPDATE_ERROR_STATUS parameter.
+ * Bit [15] = A write leveling error has occurred. Error information can be
+ *        found in the WRLVL_ERROR_STATUS parameter.
+ * Bit [14] = A read leveling gate training error has occurred. Error
+ *        information can be found in the RDLVL_ERROR_STATUS parameter.
+ * Bit [13] = A read leveling error has occurred. Error information can be
+ *        found in the RDLVL_ERROR_STATUS parameter.
+ * Bit [12] = The user has programmed an invalid setting associated with user
+ *        words per burst.
+ *        Examples:
+ *          Setting param_reduc when burst length = 2.
+ *          A 1:2 MC:PHY clock ratio with burst length = 2.
+ * Bit [11] = A wrap cycle crossing a DRAM page has been detected. This is
+ *        unsupported & may result in memory data corruption.
+ * Bit [10] = The BIST operation has been completed.
+ * Bit [9] = The low power operation has been completed.
+ * Bit [8] = The MC initialization has been completed.
+ * Bit [7] = An error occurred on the port command channel.
+ * Bit [6] = Multiple uncorrectable ECC events have been detected.
+ * Bit [5] = An uncorrectable ECC event has been detected.
+ * Bit [4] = Multiple correctable ECC events have been detected.
+ * Bit [3] = A correctable ECC event has been detected.
+ * Bit [2] = Multiple accesses outside the defined PHYSICAL memory space
+ *        have occurred.
+ * Bit [1] = A memory access outside the defined PHYSICAL memory space
+ *        has occurred.
+ * Bit [0] = The memory reset is valid on the DFI bus.
+ *
+ * Of these 1, 2, 3, 4, 5, 6, 7, 11, 21, 25, and 30 are of interest.
+ */
 
 /*
  *   MPR dump processing - overview.
@@ -182,7 +190,7 @@ static atomic64_t mc_counter = ATOMIC_INIT(0);
  * one need to collect dumps for all available cs. Below given example
  * for two cs0/cs1.
  *
- *   CMEM MC           cmmon_isr           cmmon_wq
+ *   CMEM MC           cmmon_isr_sw         cmmon_wq
  *     |                   |                   |
  *     |                   |                   |
  *     |ALERT_N - int_status bit [30]          |
@@ -337,6 +345,15 @@ struct __packed mpr_dump {
 	u8	cs;
 };
 
+enum init_return_codes {ERR_STAGE_8 = -8,
+			ERR_STAGE_7 = -7,
+			ERR_STAGE_6 = -6,
+			ERR_STAGE_5 = -5,
+			ERR_STAGE_4 = -4,
+			ERR_STAGE_3 = -3,
+			ERR_STAGE_2 = -2,
+			ERR_STAGE_1 = -1
+};
 enum events {
 	EV_ILLEGAL = 0,
 	EV_MULT_ILLEGAL,
@@ -437,9 +454,19 @@ struct intel_edac_dev_info {
 	struct mc_edac_data *data;
 	char *ctl_name;
 	char *blk_name;
+	char *proc_name;
+#ifdef CONFIG_DEBUG_CMEM
+	struct proc_dir_entry *dir_entry;
+#endif
+	struct mutex state_machine_lock;
 	struct work_struct offload_alerts;
 	struct work_struct offload_events;
+	int finish_alerts;
+	int finish_events;
+	struct workqueue_struct *wq_alerts;
+	struct workqueue_struct *wq_events;
 	int is_ddr4;
+	int is_controller_configured;
 	int edac_idx;
 	u32 cm_region;
 	struct regmap *syscon;
@@ -448,7 +475,7 @@ struct intel_edac_dev_info {
 	void (*check)(struct edac_device_ctl_info *edac_dev);
 };
 
-
+#ifdef CONFIG_DEBUG_CMEM
 static ssize_t mpr1_dump_show(struct edac_device_ctl_info
 				 *edac_dev, char *data)
 {
@@ -534,14 +561,17 @@ static ssize_t mpr1_dump_show(struct edac_device_ctl_info
 	return 0;
 }
 
+
+
 static struct edac_dev_sysfs_attribute device_block_attr[] = {
 	{
 		.attr = {
 			.name = "mpr_page1",
-			.mode = (S_IRUGO | S_IWUSR)
+			.mode = (0644)
 		},
 		.show = mpr1_dump_show,
-		.store = NULL},
+		.store = NULL
+	},
 	/* End of list */
 	{
 		.attr = {.name = NULL}
@@ -552,6 +582,7 @@ static void axxia_mc_sysfs_attributes(struct edac_device_ctl_info *edac_dev)
 {
 		edac_dev->sysfs_attributes = &device_block_attr[0];
 }
+#endif
 
 static inline void __attribute__((always_inline))
 handle_events(struct intel_edac_dev_info *edac_dev,
@@ -679,11 +710,24 @@ collect_mpr_dump(struct intel_edac_dev_info *edac_dev, u8 page, int cs)
 }
 
 static irqreturn_t
-cmmon_isr(int interrupt, void *device)
+cmmon_isr_hw(int interrupt, void *device)
+{
+	return IRQ_WAKE_THREAD;
+}
+
+static int initialize(struct intel_edac_dev_info *dev_info);
+static int enable_workers(struct intel_edac_dev_info *dev_info);
+static void uninitialize(struct intel_edac_dev_info *dev_info,
+			int ret, int only_disable);
+
+static irqreturn_t
+cmmon_isr_sw(int interrupt, void *device)
 {
 	struct intel_edac_dev_info *dev_info = device;
 	struct cm_56xx_denali_ctl_84 denali_ctl_84;
 	struct cm_56xx_denali_ctl_85 denali_ctl_85 = {0};
+	struct cm_56xx_denali_ctl_86 denali_ctl_86;
+	int ret = 0;
 
 	/*
 	 * NOTE:
@@ -702,12 +746,47 @@ cmmon_isr(int interrupt, void *device)
 		4, (u32 *) &denali_ctl_84))
 		goto error_read;
 
+	if (denali_ctl_84.int_status & INT_BIT_8) {
+		if (dev_info->is_controller_configured == 0) {
+			ret = initialize(dev_info);
+			if (ret)
+				goto error_init;
+
+			ret = enable_workers(dev_info);
+			if (ret)
+				goto error_init;
+
+			dev_info->is_controller_configured = 1;
+		}
+
+		if (dev_info->is_ddr4)
+			denali_ctl_86.int_mask = CM_INT_MASK_FULL;
+		else
+			denali_ctl_86.int_mask = CM_INT_MASK_BASE;
+
+		if (ncr_write(dev_info->cm_region,
+					CM_56XX_DENALI_CTL_86,
+					4, (u32 *) &denali_ctl_86)) {
+			goto error_write;
+		}
+		return IRQ_HANDLED;
+	}
+
+	/*
+	 * SAFETY CHECK
+	 * one cannot go further if driver is not fully functional!!!
+	 */
+	if (dev_info->is_controller_configured == 0)
+		return IRQ_HANDLED;
+
+
 	handle_events(dev_info, &denali_ctl_84);
 	atomic_set(&dev_info->data->event_ready, 1);
 	wake_up(&dev_info->data->event_wq);
 
 	denali_ctl_85.int_ack =
-		(denali_ctl_84.int_status & (~(INT_BIT_25 | INT_BIT_31)));
+		(denali_ctl_84.int_status &
+		   (~(INT_BIT_25 | INT_BIT_31 | INT_BIT_8)));
 
 	if (dev_info->is_ddr4) {
 		if (denali_ctl_84.int_status & INT_BIT_25) {
@@ -737,6 +816,12 @@ cmmon_isr(int interrupt, void *device)
 	printk_ratelimited("%s: Error reading interrupt status\n",
 		       dev_name(&dev_info->pdev->dev));
 	return IRQ_HANDLED;
+error_init:
+	printk_ratelimited("%s: Error during driver initialization\n",
+		       dev_name(&dev_info->pdev->dev));
+	uninitialize(dev_info, ret,
+			dev_info->is_controller_configured == 0 ? 1 : 0);
+	return IRQ_HANDLED;
 }
 
 
@@ -747,16 +832,19 @@ static void intel_cm_alerts_error_check(struct edac_device_ctl_info *edac_dev)
 	struct event_counter (*alerts)[MAX_DQ][MPR_ERRORS] =
 			dev_info->data->alerts;
 	struct cm_56xx_denali_ctl_34 denali_ctl_34;
-	int i, j, k, l;
+	int i, j, k, l, ret;
 	u32 counter;
 
 start:
 	/* keep hung up monitor happy 90 sec's */
-	if (0 == wait_event_timeout(dev_info->data->dump_wq,
+	if (wait_event_timeout(dev_info->data->dump_wq,
 		atomic_read(&dev_info->data->dump_in_progress),
-		msecs_to_jiffies(ALIVE_NOTIFICATION_PERIOD)))
+		msecs_to_jiffies(ALIVE_NOTIFICATION_PERIOD)) == 0)
 		goto start;
 
+	if (dev_info->finish_alerts)
+		goto finish;
+
 	for (i = 0; i < dev_info->data->cs_count; ++i) {
 
 		/* trigger dump */
@@ -779,9 +867,15 @@ static void intel_cm_alerts_error_check(struct edac_device_ctl_info *edac_dev)
 			CM_56XX_DENALI_CTL_34,
 			4, (u32 *) &denali_ctl_34))
 			goto error_write;
+
 		/* wait */
-		wait_event(dev_info->data->dump_wq,
-			   atomic_read(&dev_info->data->dump_ready));
+		ret = wait_event_timeout(dev_info->data->dump_wq,
+			   atomic_read(&dev_info->data->dump_ready),
+			   msecs_to_jiffies(1000));
+		if (dev_info->finish_alerts)
+			goto finish;
+		if (ret == 0)
+			goto timeout_error;
 
 		atomic_set(&dev_info->data->dump_ready, 0);
 		/* collect data */
@@ -811,11 +905,21 @@ static void intel_cm_alerts_error_check(struct edac_device_ctl_info *edac_dev)
 	atomic_set(&dev_info->data->dump_in_progress, 0);
 	goto start;
 
+timeout_error:
+	printk_ratelimited("Timeout occurred during MPR dump.\n");
+	atomic_set(&dev_info->data->dump_ready, 0);
+	atomic_set(&dev_info->data->dump_in_progress, 0);
+	goto start;
+
 error_read:
 error_write:
 	printk_ratelimited("Could not collect MPR dump.\n");
 	atomic_set(&dev_info->data->dump_in_progress, 0);
 	goto start;
+
+finish:
+	atomic_set(&dev_info->data->dump_ready, 0);
+	atomic_set(&dev_info->data->dump_in_progress, 0);
 }
 
 static void intel_cm_events_error_check(struct edac_device_ctl_info *edac_dev)
@@ -827,13 +931,16 @@ static void intel_cm_events_error_check(struct edac_device_ctl_info *edac_dev)
 	u32 counter;
 
 	while (1) {
-		if (0 == wait_event_timeout(dev_info->data->event_wq,
+		if (wait_event_timeout(dev_info->data->event_wq,
 			atomic_read(&dev_info->data->event_ready),
-			msecs_to_jiffies(ALIVE_NOTIFICATION_PERIOD)))
+			msecs_to_jiffies(ALIVE_NOTIFICATION_PERIOD)) == 0)
 			continue;
 
 		atomic_set(&dev_info->data->event_ready, 0);
 
+		if (dev_info->finish_events)
+			break;
+
 		mutex_lock(&dev_info->data->edac_sysfs_data_lock);
 		for (i = 0; i < NR_EVENTS; ++i) {
 			counter = atomic_xchg(&events[i].counter, 0);
@@ -911,119 +1018,133 @@ static int get_active_dram(struct intel_edac_dev_info *dev_info)
 	if (ncr_read(dev_info->cm_region, CM_56XX_DENALI_CTL_74,
 		4, (u32 *) &denali_ctl_74)) {
 		pr_err("Could not read number of lanes.\n");
+		return dram;
 	}
 
-	if (0 == denali_ctl_74.bank_diff)
+	if (denali_ctl_74.bank_diff == 0)
 		dram = MAX_DQ/2;
 
-	if (1 == denali_ctl_74.bank_diff)
+	if (denali_ctl_74.bank_diff == 1)
 		dram = MAX_DQ;
 
 	return dram;
 }
 
-static int intel_edac_mc_probe(struct platform_device *pdev)
+static int get_ddr4(struct intel_edac_dev_info *dev_info)
 {
-	struct edac_device_instance *instance;
-	struct edac_device_block *block;
-	int i, j, k, l;
-	int count;
-	struct intel_edac_dev_info *dev_info = NULL;
-	struct resource *io;
-	struct device_node *np = pdev->dev.of_node;
-	int irq = -1, rc = 0;
 	struct cm_56xx_denali_ctl_00 denali_ctl_00;
-	struct cm_56xx_denali_ctl_86 denali_ctl_86;
-	int cs_count = MAX_CS;
-	int dram_count = MAX_DQ;
-
-	count = atomic64_inc_return(&mc_counter);
-	if ((count - 1) == MEMORY_CONTROLLERS)
-		goto err_nodev;
-
-	dev_info = devm_kzalloc(&pdev->dev, sizeof(*dev_info), GFP_KERNEL);
-	if (!dev_info)
-		goto err_nomem;
 
-	dev_info->data =
-		devm_kzalloc(&pdev->dev, sizeof(*dev_info->data), GFP_KERNEL);
-	if (!dev_info->data)
-		goto err_noctlinfo;
-
-	init_waitqueue_head(&dev_info->data->dump_wq);
-	init_waitqueue_head(&dev_info->data->event_wq);
-
-	raw_spin_lock_init(&dev_info->data->mpr_data_lock);
-	mutex_init(&dev_info->data->edac_sysfs_data_lock);
+	if (ncr_read(dev_info->cm_region, CM_56XX_DENALI_CTL_00,
+		4, (u32 *) &denali_ctl_00)) {
+		pr_err("Could not read ddr version.\n");
+		return -1;
+	}
 
-	dev_info->ctl_name = kstrdup(np->name, GFP_KERNEL);
-	dev_info->blk_name = "ECC";
-	edac_op_state = EDAC_OPSTATE_POLL;
+	if (denali_ctl_00.dram_class == 0xa)
+		return 1;
 
-	dev_info->pdev = pdev;
-	dev_info->edac_idx = edac_device_alloc_index();
-	dev_info->data->irq = 0;
+	return 0;
+}
 
-	/* setup all counters */
-	for (i = 0; i < NR_EVENTS; ++i)
-		atomic_set(&dev_info->data->events[i].counter, 0);
+static void uninitialize(struct intel_edac_dev_info *dev_info,
+			int ret, int only_disable)
+{
+	struct cm_56xx_denali_ctl_86 denali_ctl_86;
 
-	for (j = 0; j < MAX_CS; ++j) {
-		for (l = 0; l < MAX_DQ; ++l) {
-			for (k = 0; k < MPR_ERRORS; ++k, ++i) {
-				atomic_set(&dev_info->data->
-						alerts[j][l][k].counter, 0);
+	switch (ret) {
+	case ERR_STAGE_8:
+		if (dev_info->data->irq) {
+			disable_irq(dev_info->data->irq);
+			devm_free_irq(&dev_info->pdev->dev,
+					dev_info->data->irq, dev_info);
+			dev_info->data->irq = 0;
+		}
+		/* fall-through */
+	case ERR_STAGE_7:
+		denali_ctl_86.int_mask = CM_INT_MASK_ALL;
+		if (ncr_write(dev_info->cm_region,
+					CM_56XX_DENALI_CTL_86,
+					4, (u32 *) &denali_ctl_86)) {
+			pr_err("Could not mask interrupts (%s - ctl_86).\n",
+				dev_info->ctl_name);
+		}
+		if (only_disable)
+			break;
+		/* fall-through */
+	case ERR_STAGE_6:
+		if (dev_info->is_ddr4) {
+			dev_info->finish_alerts = 1;
+			atomic_inc(&dev_info->data->dump_in_progress);
+			atomic_set(&dev_info->data->dump_ready, 1);
+			wake_up(&dev_info->data->dump_wq);
+			cancel_work_sync(&dev_info->offload_alerts);
+		}
+		dev_info->finish_events = 1;
+		atomic_set(&dev_info->data->event_ready, 1);
+		wake_up(&dev_info->data->event_wq);
+		cancel_work_sync(&dev_info->offload_events);
+		/* fall-through */
+	case ERR_STAGE_5:
+		if (dev_info->is_ddr4)
+			if (dev_info->wq_alerts) {
+				destroy_workqueue(dev_info->wq_alerts);
+				dev_info->wq_alerts = NULL;
 			}
+		/* fall-through */
+	case ERR_STAGE_4:
+		if (dev_info->wq_events) {
+			destroy_workqueue(dev_info->wq_events);
+			dev_info->wq_events = NULL;
+
+		}
+		/* fall-through */
+	case ERR_STAGE_3:
+		edac_device_del_device(&dev_info->pdev->dev);
+		/* fall-through */
+	case ERR_STAGE_2:
+		if (dev_info->edac_dev) {
+			edac_device_free_ctl_info(dev_info->edac_dev);
+			dev_info->edac_dev = NULL;
 		}
+		/* fall-through */
+	case ERR_STAGE_1:
+		/* fall-through */
+	default:
+		break;
 	}
+}
 
-	/* set up dump in progress flag */
-	atomic_set(&dev_info->data->dump_in_progress, 0);
+static int initialize(struct intel_edac_dev_info *dev_info)
+{
+	struct edac_device_instance *instance;
+	struct edac_device_block *block;
+	int i, j, k, l;
 
-	io = platform_get_resource(pdev, IORESOURCE_MEM, 0);
-	if (!io) {
-		dev_err(&pdev->dev, "Unable to get mem resource\n");
-		goto err_noctlinfo;
-	}
-	dev_info->cm_region = io->start;
-	dev_info->syscon =
-		syscon_regmap_lookup_by_phandle(np, "syscon");
-	if (IS_ERR(dev_info->syscon)) {
-		pr_info(FMT, np->name);
-		dev_info->axi2ser3_region = ioremap(AXI2_SER3_PHY_ADDR,
-			AXI2_SER3_PHY_SIZE);
-		if (!dev_info->axi2ser3_region) {
-			pr_err("ioremap of axi2ser3 region failed\n");
-			goto err_noctlinfo;
-		}
-	}
+	int cs_count = MAX_CS;
+	int dram_count = MAX_DQ;
 
-	if (ncr_read(dev_info->cm_region, CM_56XX_DENALI_CTL_00,
-		4, (u32 *) &denali_ctl_00)) {
-		pr_err("Could not read ddr version.\n");
-		goto err_noctlinfo;
+	cs_count = get_active_cs(dev_info);
+	if (cs_count == 0) {
+		pr_err("Could not get cs number. Is config loaded?\n");
+		return ERR_STAGE_1;
 	}
 
-	if (0xa == denali_ctl_00.dram_class) {
-		pr_info("%s supports mpr dump (DDR4).\n", dev_info->ctl_name);
-		dev_info->is_ddr4 = 1;
-	} else {
-		if (0x6 == denali_ctl_00.dram_class) {
-			pr_info("%s doesn't support mpr dump (DDR3).\n",
-				dev_info->ctl_name);
-		} else {
-			pr_err("CMEM is not configured. Check uboot settings.\n");
-			goto err_noctlinfo;
-		}
+	dram_count = get_active_dram(dev_info);
+	if (dram_count == 0) {
+		pr_err("Could not get dram number. Is config loaded?\n");
+		return ERR_STAGE_1;
 	}
 
-	cs_count = get_active_cs(dev_info);
-	if (cs_count == 0)
-		goto err_noctlinfo;
+	dev_info->is_ddr4 = get_ddr4(dev_info);
 
-	dram_count = get_active_dram(dev_info);
-	if (dram_count == 0)
-		goto err_noctlinfo;
+	if (dev_info->is_ddr4 == -1) {
+		pr_err("Could not get dram version. Is config loaded?\n");
+		return ERR_STAGE_1;
+	}
+	/*dev_info->is_ddr4 = 1;*/
+
+	dev_info->finish_alerts = 0;
+	dev_info->finish_events = 0;
 
 	dev_info->data->cs_count = cs_count;
 	dev_info->data->dram_count = dram_count;
@@ -1031,13 +1152,15 @@ static int intel_edac_mc_probe(struct platform_device *pdev)
 	dev_info->edac_dev =
 		edac_device_alloc_ctl_info(0, dev_info->ctl_name,
 					 1, dev_info->blk_name,
-					 NR_EVENTS +
-					 cs_count * dram_count * MPR_ERRORS,
+					 NR_EVENTS + (dev_info->is_ddr4 ?
+					 cs_count * dram_count * MPR_ERRORS
+					 :
+					 0),
 					 0, NULL, 0, dev_info->edac_idx);
 
 	if (!dev_info->edac_dev) {
 		pr_info("No memory for edac device\n");
-		goto err_noctlinfo;
+		return ERR_STAGE_1;
 	}
 
 	instance = &dev_info->edac_dev->instances[0];
@@ -1086,51 +1209,85 @@ static int intel_edac_mc_probe(struct platform_device *pdev)
 	dev_info->edac_dev->dev_name = dev_name(&dev_info->pdev->dev);
 	dev_info->edac_dev->edac_check = NULL;
 
+#ifdef CONFIG_DEBUG_CMEM
 	if (dev_info->is_ddr4)
 		axxia_mc_sysfs_attributes(dev_info->edac_dev);
+#endif
 
 	if (edac_device_add_device(dev_info->edac_dev) != 0) {
 		pr_info("Unable to add edac device for %s\n",
 			dev_info->ctl_name);
-		goto err_nosysfs;
+		return ERR_STAGE_2;
 	}
 
-	snprintf(&dev_info->data->irq_name[0], IRQ_NAME_LEN,
-			"%s-mon", dev_info->ctl_name);
+	return 0;
+}
+
+static int enable_workers(struct intel_edac_dev_info *dev_info)
+{
+	atomic_set(&dev_info->data->dump_ready, 0);
+	atomic_set(&dev_info->data->event_ready, 0);
+	atomic_set(&dev_info->data->dump_in_progress, 0);
+
+	dev_info->wq_events = alloc_workqueue("%s-events", WQ_MEM_RECLAIM, 1,
+						(dev_info->ctl_name));
+	if (!dev_info->wq_events)
+		return ERR_STAGE_3;
 
+	if (dev_info->is_ddr4) {
+		dev_info->wq_alerts =
+			alloc_workqueue("%s-alerts", WQ_MEM_RECLAIM, 1,
+					(dev_info->ctl_name));
+		if (!dev_info->wq_alerts)
+			return ERR_STAGE_4;
+	}
 	if (dev_info->is_ddr4)
 		INIT_WORK(&dev_info->offload_alerts, axxia_alerts_work);
 
 	INIT_WORK(&dev_info->offload_events, axxia_events_work);
 
 	if (dev_info->is_ddr4)
-		schedule_work(&dev_info->offload_alerts);
-	schedule_work(&dev_info->offload_events);
+		queue_work(dev_info->wq_alerts, &dev_info->offload_alerts);
+	queue_work(dev_info->wq_events, &dev_info->offload_events);
+
+	return 0;
+}
 
-	irq = platform_get_irq(pdev, 0);
+static int enable_driver_irq(struct intel_edac_dev_info *dev_info)
+{
+	int irq = -1, rc = 0;
+	struct cm_56xx_denali_ctl_86 denali_ctl_86;
+
+	snprintf(&dev_info->data->irq_name[0], IRQ_NAME_LEN,
+			"%s-mon", dev_info->ctl_name);
+
+	irq = platform_get_irq(dev_info->pdev, 0);
 	if (irq < 0) {
 		pr_err("Could not get irq number.\n");
-		goto err_noirq;
+		return ERR_STAGE_5;
 	}
 
 	/*
 	 * Enable memory controller interrupts.
 	 */
-	if (dev_info->is_ddr4)
-		denali_ctl_86.int_mask = CM_INT_MASK_FULL;
-	else
-		denali_ctl_86.int_mask = CM_INT_MASK_BASE;
+	if (dev_info->is_controller_configured) {
+		if (dev_info->is_ddr4)
+			denali_ctl_86.int_mask = CM_INT_MASK_FULL;
+		else
+			denali_ctl_86.int_mask = CM_INT_MASK_BASE;
+	} else
+		denali_ctl_86.int_mask = CM_INT_MASK_BASE_PROBE;
 
 	if (ncr_write(dev_info->cm_region, CM_56XX_DENALI_CTL_86,
 		4, (u32 *) &denali_ctl_86)) {
 		pr_err("Could not write interrupt mask reg (%s - ctl_86).\n",
 			dev_info->ctl_name);
-		goto err_noirq;
+		return ERR_STAGE_6;
 	}
 
 	dev_info->data->irq = irq;
-	rc = devm_request_irq(&pdev->dev, irq,
-			cmmon_isr, IRQF_ONESHOT,
+	rc = devm_request_threaded_irq(&dev_info->pdev->dev, irq,
+			cmmon_isr_hw, cmmon_isr_sw, IRQF_ONESHOT,
 			&dev_info->data->irq_name[0], dev_info);
 
 	if (rc) {
@@ -1145,23 +1302,285 @@ static int intel_edac_mc_probe(struct platform_device *pdev)
 					4, (u32 *) &denali_ctl_86)) {
 			pr_err("Could not mask interrupts (%s - ctl_86).\n",
 					dev_info->ctl_name);
+			return ERR_STAGE_7;
 		}
 
-		goto err_noirq;
+		return ERR_STAGE_6;
 	}
 	return 0;
+}
 
-err_noirq:
-	if (dev_info->is_ddr4)
-		cancel_work_sync(&dev_info->offload_alerts);
-	cancel_work_sync(&dev_info->offload_events);
 
-	edac_device_del_device(&dev_info->pdev->dev);
+#ifdef CONFIG_DEBUG_CMEM
+static ssize_t
+axxia_cmem_read(struct file *filp, char *buffer, size_t length, loff_t *offset)
+{
+	char *buf = NULL;
+	struct intel_edac_dev_info *dev_info =
+		(struct intel_edac_dev_info *) filp->private_data;
+	ssize_t len;
+
+	if (*offset > 0)
+		return 0;
+
+	buf = kmalloc(PAGE_SIZE, __GFP_WAIT);
+	if (buf == NULL)
+		goto no_mem_buffer;
+
+	mutex_lock(&dev_info->state_machine_lock);
+
+	/*
+	 * Do not modify this. Content is used by rte driver.
+	 * Once changed modify rte code.
+	 */
+	len = snprintf(buf, PAGE_SIZE-1, "Node: 0x%x\n"
+		"Command available:\n"
+		"          dump - triggers mpr_page1 dump.\n",
+		(int) dev_info->cm_region >> 16);
+
+	mutex_unlock(&dev_info->state_machine_lock);
+
+	buf[len] = '\0';
+	if (copy_to_user(buffer, buf, len))
+		len = -EFAULT;
+
+	kfree(buf);
+	*offset += len;
+	return len;
 
-err_nosysfs:
-	edac_device_free_ctl_info(dev_info->edac_dev);
-err_noctlinfo:
+no_mem_buffer:
+	pr_err("Could not allocate memory for cmem edac control buffer.\n");
+	return -ENOSPC;
+}
+
+static ssize_t
+axxia_cmem_write(struct file *file, const char __user *buffer,
+		 size_t count, loff_t *ppos)
+{
+	char *buf = NULL;
+	struct intel_edac_dev_info *dev_info =
+		(struct intel_edac_dev_info *) file->private_data;
+
+	buf = kmalloc(count + 1, __GFP_WAIT);
+	if (buf == NULL)
+		goto no_mem_buffer;
+
+	memset(buf, 0, count + 1);
+
+	if (copy_from_user(buf, buffer, count)) {
+		pr_err("Could not copy data from user.\n");
+		goto cfu_failed;
+	}
+
+	if (!strncmp(buf, "dump", 4)) {
+		atomic_inc(&dev_info->data->dump_in_progress);
+		wake_up(&dev_info->data->dump_wq);
+	}
+
+	kfree(buf);
+	return count;
+
+cfu_failed:
+	kfree(buf);
+	return -EFAULT;
+
+no_mem_buffer:
+	pr_err("Could not allocate memory for cmem edac control buffer.\n");
+	return -ENOSPC;
+}
+
+int axxia_cmem_open(struct inode *inode, struct file *filp)
+{
+	try_module_get(THIS_MODULE);
+	filp->private_data = PDE_DATA(inode);
+	return 0;
+}
+
+int axxia_cmem_close(struct inode *inode, struct file *filp)
+{
+	module_put(THIS_MODULE);
+	filp->private_data = 0;
+	return 0;
+}
+
+static const struct file_operations axxia_edac_cmem_proc_ops = {
+	.owner      = THIS_MODULE,
+	.open       = axxia_cmem_open,
+	.read       = axxia_cmem_read,
+	.write      = axxia_cmem_write,
+	.release    = axxia_cmem_close,
+	.llseek     = noop_llseek
+};
+
+static void remove_procfs_entry(struct intel_edac_dev_info *dev_info)
+{
+	if (dev_info && dev_info->dir_entry) {
+		proc_remove(dev_info->dir_entry);
+		dev_info->dir_entry = NULL;
+	}
+}
+#endif
+
+static int intel_edac_mc_probe(struct platform_device *pdev)
+{
+	int i, j, k, l;
+	int count;
+	struct intel_edac_dev_info *dev_info = NULL;
+	struct resource *io;
+	struct device_node *np = pdev->dev.of_node;
+	struct cm_56xx_denali_ctl_00 denali_ctl_00;
+	int ret = -1;
+
+	count = atomic64_inc_return(&mc_counter);
+	if ((count - 1) == MEMORY_CONTROLLERS)
+		goto err_nodev;
+
+	dev_info = devm_kzalloc(&pdev->dev, sizeof(*dev_info), GFP_KERNEL);
+	if (!dev_info)
+		goto err_nomem;
+
+	dev_info->ctl_name =
+		devm_kzalloc(&pdev->dev, 32*sizeof(char), GFP_KERNEL);
+	if (!dev_info->ctl_name)
+		goto err_nomem;
+
+	dev_info->blk_name =
+		devm_kzalloc(&pdev->dev, 32*sizeof(char), GFP_KERNEL);
+	if (!dev_info->blk_name)
+		goto err_nomem;
+
+	dev_info->data =
+		devm_kzalloc(&pdev->dev, sizeof(*dev_info->data), GFP_KERNEL);
+	if (!dev_info->data)
+		goto err_nomem;
+
+	init_waitqueue_head(&dev_info->data->dump_wq);
+	init_waitqueue_head(&dev_info->data->event_wq);
+
+	raw_spin_lock_init(&dev_info->data->mpr_data_lock);
+	mutex_init(&dev_info->data->edac_sysfs_data_lock);
+	mutex_init(&dev_info->state_machine_lock);
+
+	strncpy(dev_info->ctl_name, np->name, 32);
+	dev_info->ctl_name[31] = '\0';
+
+	strncpy(dev_info->blk_name, "ECC", 32);
+	dev_info->blk_name[31] = '\0';
+
+	edac_op_state = EDAC_OPSTATE_POLL;
+
+	dev_info->pdev = pdev;
+	dev_info->edac_idx = edac_device_alloc_index();
+	dev_info->data->irq = 0;
+
+	/* setup all counters */
+	for (i = 0; i < NR_EVENTS; ++i)
+		atomic_set(&dev_info->data->events[i].counter, 0);
+
+	for (j = 0; j < MAX_CS; ++j) {
+		for (l = 0; l < MAX_DQ; ++l) {
+			for (k = 0; k < MPR_ERRORS; ++k, ++i) {
+				atomic_set(&dev_info->data->
+						alerts[j][l][k].counter, 0);
+			}
+		}
+	}
+
+	io = platform_get_resource(pdev, IORESOURCE_MEM, 0);
+	if (!io) {
+		dev_err(&pdev->dev, "Unable to get mem resource\n");
+		goto err_init;
+	}
+	dev_info->cm_region = io->start;
+	dev_info->syscon =
+		syscon_regmap_lookup_by_phandle(np, "syscon");
+	if (IS_ERR(dev_info->syscon)) {
+		pr_info(FMT, np->name);
+		dev_info->axi2ser3_region = ioremap(AXI2_SER3_PHY_ADDR,
+			AXI2_SER3_PHY_SIZE);
+		if (!dev_info->axi2ser3_region) {
+			pr_err("ioremap of axi2ser3 region failed\n");
+			goto err_init;
+		}
+	}
+
+	if (ncr_read(dev_info->cm_region, CM_56XX_DENALI_CTL_00,
+		4, (u32 *) &denali_ctl_00)) {
+		pr_err("Could not read ddr version.\n");
+		goto err_init;
+	}
+
+	if (denali_ctl_00.start == 1) {
+		/* uboot has configured CMEM */
+		if (denali_ctl_00.dram_class == 0xa) {
+			pr_info("%s supports mpr dump (DDR4).\n",
+					dev_info->ctl_name);
+			dev_info->is_ddr4 = 1;
+		}
+		if (denali_ctl_00.dram_class == 0x6) {
+			pr_info("%s doesn't support mpr dump (DDR3).\n",
+				dev_info->ctl_name);
+		}
+		dev_info->is_controller_configured = 1;
+
+		ret = initialize(dev_info);
+		if (ret)
+			goto err_uninit;
+
+		ret = enable_workers(dev_info);
+		if (ret)
+			goto err_uninit;
+
+		ret = enable_driver_irq(dev_info);
+		if (ret)
+			goto err_uninit;
+
+	} else {
+		/* CMEM is not configured */
+		dev_info->is_controller_configured = 0;
+
+		ret = enable_driver_irq(dev_info);
+		if (ret)
+			goto err_uninit;
+
+		pr_info("CMEM base init: controller: %s DEV %s (INTERRUPT).\n",
+			dev_info->ctl_name,
+			dev_name(&dev_info->pdev->dev));
+	}
+
+#ifdef CONFIG_DEBUG_CMEM
+	/* in this case create procfs file to be used by rte */
+	dev_info->proc_name =
+		devm_kzalloc(&pdev->dev, 32*sizeof(char),
+				GFP_KERNEL);
+	if (!dev_info->proc_name)
+		goto err_uninit;
+
+	snprintf(dev_info->proc_name, 31*sizeof(char),
+		"driver/axxia_edac_%s_control",
+		dev_info->ctl_name);
+
+	/* each instance shall know each private data */
+	dev_info->dir_entry =
+		proc_create_data(dev_info->proc_name, 0200,
+				NULL, &axxia_edac_cmem_proc_ops,
+				dev_info);
+
+	if (dev_info->dir_entry == NULL) {
+		pr_err("Could not create proc entry for %s.\n",
+				dev_info->ctl_name);
+		goto err_uninit;
+	}
+#endif
+	return 0;
+
+
+err_uninit:
+	uninitialize(dev_info, ret,
+			dev_info->is_controller_configured == 0 ? 1 : 0);
+err_init:
 	mutex_destroy(&dev_info->data->edac_sysfs_data_lock);
+	mutex_destroy(&dev_info->state_machine_lock);
 	atomic64_dec(&mc_counter);
 	return 1;
 err_nomem:
@@ -1172,23 +1591,20 @@ static int intel_edac_mc_probe(struct platform_device *pdev)
 	return -ENODEV;
 }
 
+
+
 static int intel_edac_mc_remove(struct platform_device *pdev)
 {
 	struct intel_edac_dev_info *dev_info =
 		(struct intel_edac_dev_info *) &pdev->dev;
 
 	if (dev_info) {
-		if (dev_info->data->irq > 0) {
-			disable_irq(dev_info->data->irq);
-			devm_free_irq(&pdev->dev,
-					dev_info->data->irq, dev_info);
+#ifdef CONFIG_DEBUG_CMEM
+		remove_procfs_entry(dev_info);
+#endif
 
-			dev_info->data->irq = 0;
-
-			if (dev_info->is_ddr4)
-				cancel_work_sync(&dev_info->offload_alerts);
-			cancel_work_sync(&dev_info->offload_events);
-		}
+		uninitialize(dev_info, ERR_STAGE_8,
+			dev_info->is_controller_configured == 0 ? 1 : 0);
 
 		if (dev_info->edac_dev != NULL) {
 			edac_device_del_device(&dev_info->pdev->dev);
@@ -1196,6 +1612,8 @@ static int intel_edac_mc_remove(struct platform_device *pdev)
 		}
 
 		mutex_destroy(&dev_info->data->edac_sysfs_data_lock);
+		mutex_destroy(&dev_info->state_machine_lock);
+
 		atomic64_dec(&mc_counter);
 	}
 	platform_device_unregister(pdev);
diff --git a/drivers/edac/axxia_edac-mc_56xx.c b/drivers/edac/axxia_edac-mc_56xx.c
index f850ebe..fb1e742 100644
--- a/drivers/edac/axxia_edac-mc_56xx.c
+++ b/drivers/edac/axxia_edac-mc_56xx.c
@@ -92,68 +92,68 @@
 #define ALIVE_NOTIFICATION_PERIOD (90*1000)
 
 static int log = 1;
-module_param(log, int, S_IRUGO|S_IWUSR);
+module_param(log, int, 0644);
 MODULE_PARM_DESC(log, "Log each error to kernel log.");
 
 static int force_restart = 1;
-module_param(force_restart, int, S_IRUGO|S_IWUSR);
+module_param(force_restart, int, 0644);
 MODULE_PARM_DESC(force_restart, "Machine restart on fatal error.");
 
 static atomic64_t mc_counter = ATOMIC_INIT(0);
 /*
- Bit [34] = Logical OR of all lower bits.
- Bit [33] = A CRC error occurred on the write data bus.
- Bit [32] = The software-initiated control word write has completed.
- Bit [31] = The user-initiated DLL resync has completed.
- Bit [30] = A state change has been detected on the
-	dfi_init_complete signal after initialization.
- Bit [29] = The assertion of the INHIBIT_DRAM_CMD parameter has
-	successfully inhibited the command queue.
- Bit [28] = The register interface-initiated mode register write has
-	completed and another mode register write may be issued.
- Bit [27] = A Low Power Interface (LPI) timeout error has occurred.
- Bit [26] = MPR read command, initiated with a software MPR_READ request,
-	 is complete.
- Bit [25] = Error received from the PHY on the DFI bus.
- Bit [24] = RESERVED
- Bit [23] = RESERVED
- Bit [22] = A parity error has been detected on the address/control bus
-	on a registered DIMM.
- Bit [21] = The leveling operation has completed.
- Bit [20] = A read leveling gate training operation has been requested.
- Bit [19] = A read leveling operation has been requested.
- Bit [18] = A write leveling operation has been requested.
- Bit [17] = A DFI update error has occurred. Error information can be
-	found in the UPDATE_ERROR_STATUS parameter.
- Bit [16] = A write leveling error has occurred. Error information can
-	be found in the WRLVL_ERROR_STATUS parameter.
- Bit [15] = A read leveling gate training error has occurred. Error
-	information can be found in the RDLVL_ERROR_STATUS parameter.
- Bit [14] = A read leveling error has occurred. Error information can be
-	found in the RDLVL_ERROR_STATUS parameter.
- Bit [13] = The user has programmed an invalid setting associated with
-	user words per burst.
-	Examples: Setting param_reduc when burst length = 2. A 1:2
-	MC:PHY clock ratio with burst length = 2.
- Bit [12] = A wrap cycle crossing a DRAM page has been detected. This
-	is unsupported & may result in memory data corruption.
- Bit [11] = A write was attempted to a writeprotected region.
- Bit [10] = The BIST operation has been completed.
- Bit [9] = The low power operation has been completed.
- Bit [8] = The MC initialization has been completed.
- Bit [7] = An error occurred on the port command channel.
- Bit [6] = Multiple uncorrectable ECC events have been detected.
- Bit [5] = An uncorrectable ECC event has been detected.
- Bit [4] = Multiple correctable ECC events have been detected.
- Bit [3] = A correctable ECC event has been detected.
- Bit [2] = Multiple accesses outside the defined PHYSICAL memory space
-	have occurred.
- Bit [1] = A memory access outside the defined PHYSICAL memory space
-	has occurred.
- Bit [0] = The memory reset is valid on the DFI bus.
-
- Of these 1, 2, 3, 4, 5, 6, 7, 12, 22 and 26 are of interest.
-*/
+ * Bit [34] = Logical OR of all lower bits.
+ * Bit [33] = A CRC error occurred on the write data bus.
+ * Bit [32] = The software-initiated control word write has completed.
+ * Bit [31] = The user-initiated DLL resync has completed.
+ * Bit [30] = A state change has been detected on the
+ *        dfi_init_complete signal after initialization.
+ * Bit [29] = The assertion of the INHIBIT_DRAM_CMD parameter has
+ *        successfully inhibited the command queue.
+ * Bit [28] = The register interface-initiated mode register write has
+ *        completed and another mode register write may be issued.
+ * Bit [27] = A Low Power Interface (LPI) timeout error has occurred.
+ * Bit [26] = MPR read command, initiated with a software MPR_READ request,
+ *         is complete.
+ * Bit [25] = Error received from the PHY on the DFI bus.
+ * Bit [24] = RESERVED
+ * Bit [23] = RESERVED
+ * Bit [22] = A parity error has been detected on the address/control bus
+ *        on a registered DIMM.
+ * Bit [21] = The leveling operation has completed.
+ * Bit [20] = A read leveling gate training operation has been requested.
+ * Bit [19] = A read leveling operation has been requested.
+ * Bit [18] = A write leveling operation has been requested.
+ * Bit [17] = A DFI update error has occurred. Error information can be
+ *        found in the UPDATE_ERROR_STATUS parameter.
+ * Bit [16] = A write leveling error has occurred. Error information can
+ *        be found in the WRLVL_ERROR_STATUS parameter.
+ * Bit [15] = A read leveling gate training error has occurred. Error
+ *        information can be found in the RDLVL_ERROR_STATUS parameter.
+ * Bit [14] = A read leveling error has occurred. Error information can be
+ *        found in the RDLVL_ERROR_STATUS parameter.
+ * Bit [13] = The user has programmed an invalid setting associated with
+ *        user words per burst.
+ *        Examples: Setting param_reduc when burst length = 2. A 1:2
+ *        MC:PHY clock ratio with burst length = 2.
+ * Bit [12] = A wrap cycle crossing a DRAM page has been detected. This
+ *        is unsupported & may result in memory data corruption.
+ * Bit [11] = A write was attempted to a write-protected region.
+ * Bit [10] = The BIST operation has been completed.
+ * Bit [9] = The low power operation has been completed.
+ * Bit [8] = The MC initialization has been completed.
+ * Bit [7] = An error occurred on the port command channel.
+ * Bit [6] = Multiple uncorrectable ECC events have been detected.
+ * Bit [5] = An uncorrectable ECC event has been detected.
+ * Bit [4] = Multiple correctable ECC events have been detected.
+ * Bit [3] = A correctable ECC event has been detected.
+ * Bit [2] = Multiple accesses outside the defined PHYSICAL memory space
+ *        have occurred.
+ * Bit [1] = A memory access outside the defined PHYSICAL memory space
+ *        has occurred.
+ * Bit [0] = The memory reset is valid on the DFI bus.
+ *
+ * Of these 1, 2, 3, 4, 5, 6, 7, 12, 22 and 26 are of interest.
+ */
 
 /*
  *   MPR dump processing - overview.
@@ -162,7 +162,7 @@ static atomic64_t mc_counter = ATOMIC_INIT(0);
  * one need to collect dumps for all available cs. Below given example
  * for two cs0/cs1.
  *
- *   SMEM MC           smmon_isr           smmon_wq
+ *   SMEM MC           smmon_isr_sw           smmon_wq
  *     |                   |                   |
  *     |                   |                   |
  *     |ALERT_N - int_status bit [33]          |
@@ -612,6 +612,10 @@ struct intel_edac_dev_info {
 	char *blk_name;
 	struct work_struct offload_alerts;
 	struct work_struct offload_events;
+	struct workqueue_struct *wq_alerts;
+	struct workqueue_struct *wq_events;
+	int finish_alerts;
+	int finish_events;
 	int is_ddr4;
 	int edac_idx;
 	u32 sm_region;
@@ -725,7 +729,7 @@ static struct edac_dev_sysfs_attribute device_block_attr[] = {
 	{
 		.attr = {
 			.name = "mpr_page1",
-			.mode = (S_IRUGO | S_IWUSR)
+			.mode = (0644)
 		},
 		.show = mpr1_dump_show,
 		.store = NULL},
@@ -904,7 +908,13 @@ collect_mpr_dump(struct intel_edac_dev_info *edac_dev, u8 page, int cs)
 }
 
 static irqreturn_t
-smmon_isr(int interrupt, void *device)
+smmon_isr_hw(int interrupt, void *device)
+{
+	return IRQ_WAKE_THREAD;
+}
+
+static irqreturn_t
+smmon_isr_sw(int interrupt, void *device)
 {
 	struct intel_edac_dev_info *dev_info = device;
 	struct sm_56xx_denali_ctl_366 denali_ctl_366;
@@ -990,11 +1000,14 @@ static void intel_sm_alerts_error_check(struct edac_device_ctl_info *edac_dev)
 
 start:
 	/* keep hung up monitor happy 90 sec's */
-	if (0 == wait_event_timeout(dev_info->data->dump_wq,
+	if (wait_event_timeout(dev_info->data->dump_wq,
 		atomic_read(&dev_info->data->dump_in_progress),
-		msecs_to_jiffies(ALIVE_NOTIFICATION_PERIOD)))
+		msecs_to_jiffies(ALIVE_NOTIFICATION_PERIOD)) == 0)
 		goto start;
 
+	if (dev_info->finish_alerts)
+		goto finish;
+
 		/* the only one running workqueue */
 	for (i = 0; i < dev_info->data->cs_count; ++i) {
 
@@ -1021,6 +1034,9 @@ static void intel_sm_alerts_error_check(struct edac_device_ctl_info *edac_dev)
 		wait_event(dev_info->data->dump_wq,
 			   atomic_read(&dev_info->data->dump_ready));
 
+		if (dev_info->finish_alerts)
+			goto finish;
+
 		atomic_set(&dev_info->data->dump_ready, 0);
 		/* collect data */
 		collect_mpr_dump(dev_info, SM_MPR_PAGE, i);
@@ -1054,6 +1070,10 @@ static void intel_sm_alerts_error_check(struct edac_device_ctl_info *edac_dev)
 	printk_ratelimited("Could not collect MPR dump.\n");
 	atomic_set(&dev_info->data->dump_in_progress, 0);
 	goto start;
+
+finish:
+	atomic_set(&dev_info->data->dump_ready, 0);
+	atomic_set(&dev_info->data->dump_in_progress, 0);
 }
 
 static void intel_sm_events_error_check(struct edac_device_ctl_info *edac_dev)
@@ -1065,13 +1085,16 @@ static void intel_sm_events_error_check(struct edac_device_ctl_info *edac_dev)
 	u32 counter;
 
 	while (1) {
-		if (0 == wait_event_timeout(dev_info->data->event_wq,
+		if (wait_event_timeout(dev_info->data->event_wq,
 			atomic_read(&dev_info->data->event_ready),
-			msecs_to_jiffies(ALIVE_NOTIFICATION_PERIOD)))
+			msecs_to_jiffies(ALIVE_NOTIFICATION_PERIOD)) == 0)
 			continue;
 
 		atomic_set(&dev_info->data->event_ready, 0);
 
+		if (dev_info->finish_events)
+			break;
+
 		mutex_lock(&dev_info->data->edac_sysfs_data_lock);
 		for (i = 0; i < NR_EVENTS; ++i) {
 			counter = atomic_xchg(&events[i].counter, 0);
@@ -1160,6 +1183,22 @@ static int get_active_dram(struct intel_edac_dev_info *dev_info)
 	return dram;
 }
 
+static void finish_workqueues(struct intel_edac_dev_info *dev_info)
+{
+	if (dev_info->is_ddr4) {
+		dev_info->finish_alerts = 1;
+		atomic_inc(&dev_info->data->dump_in_progress);
+		atomic_set(&dev_info->data->dump_ready, 1);
+		wake_up(&dev_info->data->dump_wq);
+		cancel_work_sync(&dev_info->offload_alerts);
+	}
+
+	dev_info->finish_events = 1;
+	atomic_set(&dev_info->data->event_ready, 1);
+	wake_up(&dev_info->data->event_wq);
+	cancel_work_sync(&dev_info->offload_events);
+}
+
 static int intel_edac_mc_probe(struct platform_device *pdev)
 {
 	struct edac_device_instance *instance;
@@ -1184,10 +1223,20 @@ static int intel_edac_mc_probe(struct platform_device *pdev)
 	if (!dev_info)
 		goto err_nomem;
 
+	dev_info->ctl_name =
+		devm_kzalloc(&pdev->dev, 32*sizeof(char), GFP_KERNEL);
+	if (!dev_info->ctl_name)
+		goto err_nomem;
+
+	dev_info->blk_name =
+		devm_kzalloc(&pdev->dev, 32*sizeof(char), GFP_KERNEL);
+	if (!dev_info->blk_name)
+		goto err_nomem;
+
 	dev_info->data =
 		devm_kzalloc(&pdev->dev, sizeof(*dev_info->data), GFP_KERNEL);
 	if (!dev_info->data)
-		goto err_noctlinfo;
+		goto err_nomem;
 
 	init_waitqueue_head(&dev_info->data->dump_wq);
 	init_waitqueue_head(&dev_info->data->event_wq);
@@ -1195,6 +1244,12 @@ static int intel_edac_mc_probe(struct platform_device *pdev)
 	raw_spin_lock_init(&dev_info->data->mpr_data_lock);
 	mutex_init(&dev_info->data->edac_sysfs_data_lock);
 
+	strncpy(dev_info->ctl_name, np->name, 32);
+	dev_info->ctl_name[31] = '\0';
+
+	strncpy(dev_info->blk_name, "ECC", 32);
+	dev_info->blk_name[31] = '\0';
+
 	dev_info->ctl_name = kstrdup(np->name, GFP_KERNEL);
 	dev_info->blk_name = "ECC";
 	edac_op_state = EDAC_OPSTATE_POLL;
@@ -1268,8 +1323,10 @@ static int intel_edac_mc_probe(struct platform_device *pdev)
 	dev_info->edac_dev =
 		edac_device_alloc_ctl_info(0, dev_info->ctl_name,
 					 1, dev_info->blk_name,
-					 NR_EVENTS +
-					 cs_count * dram_count * MPR_ERRORS,
+					 NR_EVENTS + (dev_info->is_ddr4 ?
+					 cs_count * dram_count * MPR_ERRORS
+					 :
+					 0),
 					 0, NULL, 0, dev_info->edac_idx);
 
 	if (!dev_info->edac_dev) {
@@ -1329,20 +1386,34 @@ static int intel_edac_mc_probe(struct platform_device *pdev)
 	if (edac_device_add_device(dev_info->edac_dev) != 0) {
 		pr_info("Unable to add edac device for %s\n",
 			dev_info->ctl_name);
-		goto err_nosysfs;
+		goto err_noctlinfo;
 	}
 
 	snprintf(&dev_info->data->irq_name[0], IRQ_NAME_LEN,
 			"%s-mon", dev_info->ctl_name);
 
+	dev_info->wq_events =
+		alloc_workqueue("%s-events", WQ_MEM_RECLAIM, 1,
+				   (dev_info->ctl_name));
+	if (!dev_info->wq_events)
+		goto err_nosysfs;
+
+	if (dev_info->is_ddr4) {
+		dev_info->wq_alerts =
+		  alloc_workqueue("%s-alerts", WQ_MEM_RECLAIM, 1,
+				   (dev_info->ctl_name));
+
+		if (!dev_info->wq_alerts)
+			goto err_noevents;
+	}
 	if (dev_info->is_ddr4)
 		INIT_WORK(&dev_info->offload_alerts, axxia_alerts_work);
 
 	INIT_WORK(&dev_info->offload_events, axxia_events_work);
 
 	if (dev_info->is_ddr4)
-		schedule_work(&dev_info->offload_alerts);
-	schedule_work(&dev_info->offload_events);
+		queue_work(dev_info->wq_alerts, &dev_info->offload_alerts);
+	queue_work(dev_info->wq_events, &dev_info->offload_events);
 
 	irq = platform_get_irq(pdev, 0);
 	if (irq < 0) {
@@ -1382,8 +1453,8 @@ static int intel_edac_mc_probe(struct platform_device *pdev)
 	}
 
 	dev_info->data->irq = irq;
-	rc = devm_request_irq(&pdev->dev, irq,
-			smmon_isr, IRQF_ONESHOT,
+	rc = devm_request_threaded_irq(&pdev->dev, irq,
+			smmon_isr_hw, smmon_isr_sw, IRQF_ONESHOT,
 			&dev_info->data->irq_name[0], dev_info);
 
 	if (rc) {
@@ -1411,18 +1482,23 @@ static int intel_edac_mc_probe(struct platform_device *pdev)
 	return 0;
 
 err_noirq:
+	finish_workqueues(dev_info);
+	edac_device_del_device(&dev_info->pdev->dev);
+
 	if (dev_info->is_ddr4)
-		cancel_work_sync(&dev_info->offload_alerts);
-	cancel_work_sync(&dev_info->offload_events);
+		destroy_workqueue(dev_info->wq_alerts);
 
-	edac_device_del_device(&dev_info->pdev->dev);
+err_noevents:
+	destroy_workqueue(dev_info->wq_events);
 
 err_nosysfs:
 	edac_device_free_ctl_info(dev_info->edac_dev);
+
 err_noctlinfo:
 	mutex_destroy(&dev_info->data->edac_sysfs_data_lock);
 	atomic64_dec(&mc_counter);
 	return 1;
+
 err_nomem:
 	atomic64_dec(&mc_counter);
 	return -ENOMEM;
@@ -1444,9 +1520,11 @@ static int intel_edac_mc_remove(struct platform_device *pdev)
 
 			dev_info->data->irq = 0;
 
+			finish_workqueues(dev_info);
+
 			if (dev_info->is_ddr4)
-				cancel_work_sync(&dev_info->offload_alerts);
-			cancel_work_sync(&dev_info->offload_events);
+				destroy_workqueue(dev_info->wq_alerts);
+			destroy_workqueue(dev_info->wq_events);
 		}
 
 		if (dev_info->edac_dev != NULL) {
-- 
2.7.4


