1// SPDX-License-Identifier: GPL-2.0-or-later
2/*
3 * OPAL hypervisor Maintenance interrupt handling support in PowerNV.
4 *
5 * Copyright 2014 IBM Corporation
6 * Author: Mahesh Salgaonkar <mahesh@linux.vnet.ibm.com>
7 */
8
9#undef DEBUG
10
11#include <linux/kernel.h>
12#include <linux/init.h>
13#include <linux/of.h>
14#include <linux/mm.h>
15#include <linux/slab.h>
16
17#include <asm/opal.h>
18#include <asm/cputable.h>
19#include <asm/machdep.h>
20
21#include "powernv.h"
22
23static int opal_hmi_handler_nb_init;
24struct OpalHmiEvtNode {
25	struct list_head list;
26	struct OpalHMIEvent hmi_evt;
27};
28
29struct xstop_reason {
30	uint32_t xstop_reason;
31	const char *unit_failed;
32	const char *description;
33};
34
35static LIST_HEAD(opal_hmi_evt_list);
36static DEFINE_SPINLOCK(opal_hmi_evt_lock);
37
38static void print_core_checkstop_reason(const char *level,
39					struct OpalHMIEvent *hmi_evt)
40{
41	int i;
42	static const struct xstop_reason xstop_reason[] = {
43		{ CORE_CHECKSTOP_IFU_REGFILE, "IFU",
44				"RegFile core check stop" },
45		{ CORE_CHECKSTOP_IFU_LOGIC, "IFU", "Logic core check stop" },
46		{ CORE_CHECKSTOP_PC_DURING_RECOV, "PC",
47				"Core checkstop during recovery" },
48		{ CORE_CHECKSTOP_ISU_REGFILE, "ISU",
49				"RegFile core check stop (mapper error)" },
50		{ CORE_CHECKSTOP_ISU_LOGIC, "ISU", "Logic core check stop" },
51		{ CORE_CHECKSTOP_FXU_LOGIC, "FXU", "Logic core check stop" },
52		{ CORE_CHECKSTOP_VSU_LOGIC, "VSU", "Logic core check stop" },
53		{ CORE_CHECKSTOP_PC_RECOV_IN_MAINT_MODE, "PC",
54				"Recovery in maintenance mode" },
55		{ CORE_CHECKSTOP_LSU_REGFILE, "LSU",
56				"RegFile core check stop" },
57		{ CORE_CHECKSTOP_PC_FWD_PROGRESS, "PC",
58				"Forward Progress Error" },
59		{ CORE_CHECKSTOP_LSU_LOGIC, "LSU", "Logic core check stop" },
60		{ CORE_CHECKSTOP_PC_LOGIC, "PC", "Logic core check stop" },
61		{ CORE_CHECKSTOP_PC_HYP_RESOURCE, "PC",
62				"Hypervisor Resource error - core check stop" },
63		{ CORE_CHECKSTOP_PC_HANG_RECOV_FAILED, "PC",
64				"Hang Recovery Failed (core check stop)" },
65		{ CORE_CHECKSTOP_PC_AMBI_HANG_DETECTED, "PC",
66				"Ambiguous Hang Detected (unknown source)" },
67		{ CORE_CHECKSTOP_PC_DEBUG_TRIG_ERR_INJ, "PC",
68				"Debug Trigger Error inject" },
69		{ CORE_CHECKSTOP_PC_SPRD_HYP_ERR_INJ, "PC",
70				"Hypervisor check stop via SPRC/SPRD" },
71	};
72
73	/* Validity check */
74	if (!hmi_evt->u.xstop_error.xstop_reason) {
75		printk("%s	Unknown Core check stop.\n", level);
76		return;
77	}
78
79	printk("%s	CPU PIR: %08x\n", level,
80			be32_to_cpu(hmi_evt->u.xstop_error.u.pir));
81	for (i = 0; i < ARRAY_SIZE(xstop_reason); i++)
82		if (be32_to_cpu(hmi_evt->u.xstop_error.xstop_reason) &
83					xstop_reason[i].xstop_reason)
84			printk("%s	[Unit: %-3s] %s\n", level,
85					xstop_reason[i].unit_failed,
86					xstop_reason[i].description);
87}
88
89static void print_nx_checkstop_reason(const char *level,
90					struct OpalHMIEvent *hmi_evt)
91{
92	int i;
93	static const struct xstop_reason xstop_reason[] = {
94		{ NX_CHECKSTOP_SHM_INVAL_STATE_ERR, "DMA & Engine",
95					"SHM invalid state error" },
96		{ NX_CHECKSTOP_DMA_INVAL_STATE_ERR_1, "DMA & Engine",
97					"DMA invalid state error bit 15" },
98		{ NX_CHECKSTOP_DMA_INVAL_STATE_ERR_2, "DMA & Engine",
99					"DMA invalid state error bit 16" },
100		{ NX_CHECKSTOP_DMA_CH0_INVAL_STATE_ERR, "DMA & Engine",
101					"Channel 0 invalid state error" },
102		{ NX_CHECKSTOP_DMA_CH1_INVAL_STATE_ERR, "DMA & Engine",
103					"Channel 1 invalid state error" },
104		{ NX_CHECKSTOP_DMA_CH2_INVAL_STATE_ERR, "DMA & Engine",
105					"Channel 2 invalid state error" },
106		{ NX_CHECKSTOP_DMA_CH3_INVAL_STATE_ERR, "DMA & Engine",
107					"Channel 3 invalid state error" },
108		{ NX_CHECKSTOP_DMA_CH4_INVAL_STATE_ERR, "DMA & Engine",
109					"Channel 4 invalid state error" },
110		{ NX_CHECKSTOP_DMA_CH5_INVAL_STATE_ERR, "DMA & Engine",
111					"Channel 5 invalid state error" },
112		{ NX_CHECKSTOP_DMA_CH6_INVAL_STATE_ERR, "DMA & Engine",
113					"Channel 6 invalid state error" },
114		{ NX_CHECKSTOP_DMA_CH7_INVAL_STATE_ERR, "DMA & Engine",
115					"Channel 7 invalid state error" },
116		{ NX_CHECKSTOP_DMA_CRB_UE, "DMA & Engine",
117					"UE error on CRB(CSB address, CCB)" },
118		{ NX_CHECKSTOP_DMA_CRB_SUE, "DMA & Engine",
119					"SUE error on CRB(CSB address, CCB)" },
120		{ NX_CHECKSTOP_PBI_ISN_UE, "PowerBus Interface",
121		"CRB Kill ISN received while holding ISN with UE error" },
122	};
123
124	/* Validity check */
125	if (!hmi_evt->u.xstop_error.xstop_reason) {
126		printk("%s	Unknown NX check stop.\n", level);
127		return;
128	}
129
130	printk("%s	NX checkstop on CHIP ID: %x\n", level,
131			be32_to_cpu(hmi_evt->u.xstop_error.u.chip_id));
132	for (i = 0; i < ARRAY_SIZE(xstop_reason); i++)
133		if (be32_to_cpu(hmi_evt->u.xstop_error.xstop_reason) &
134					xstop_reason[i].xstop_reason)
135			printk("%s	[Unit: %-3s] %s\n", level,
136					xstop_reason[i].unit_failed,
137					xstop_reason[i].description);
138}
139
140static void print_npu_checkstop_reason(const char *level,
141					struct OpalHMIEvent *hmi_evt)
142{
143	uint8_t reason, reason_count, i;
144
145	/*
146	 * We may not have a checkstop reason on some combination of
147	 * hardware and/or skiboot version
148	 */
149	if (!hmi_evt->u.xstop_error.xstop_reason) {
150		printk("%s	NPU checkstop on chip %x\n", level,
151			be32_to_cpu(hmi_evt->u.xstop_error.u.chip_id));
152		return;
153	}
154
155	/*
156	 * NPU2 has 3 FIRs. Reason encoded on a byte as:
157	 *   2 bits for the FIR number
158	 *   6 bits for the bit number
159	 * It may be possible to find several reasons.
160	 *
161	 * We don't display a specific message per FIR bit as there
162	 * are too many and most are meaningless without the workbook
163	 * and/or hw team help anyway.
164	 */
165	reason_count = sizeof(hmi_evt->u.xstop_error.xstop_reason) /
166		sizeof(reason);
167	for (i = 0; i < reason_count; i++) {
168		reason = (hmi_evt->u.xstop_error.xstop_reason >> (8 * i)) & 0xFF;
169		if (reason)
170			printk("%s	NPU checkstop on chip %x: FIR%d bit %d is set\n",
171				level,
172				be32_to_cpu(hmi_evt->u.xstop_error.u.chip_id),
173				reason >> 6, reason & 0x3F);
174	}
175}
176
177static void print_checkstop_reason(const char *level,
178					struct OpalHMIEvent *hmi_evt)
179{
180	uint8_t type = hmi_evt->u.xstop_error.xstop_type;
181	switch (type) {
182	case CHECKSTOP_TYPE_CORE:
183		print_core_checkstop_reason(level, hmi_evt);
184		break;
185	case CHECKSTOP_TYPE_NX:
186		print_nx_checkstop_reason(level, hmi_evt);
187		break;
188	case CHECKSTOP_TYPE_NPU:
189		print_npu_checkstop_reason(level, hmi_evt);
190		break;
191	default:
192		printk("%s	Unknown Malfunction Alert of type %d\n",
193		       level, type);
194		break;
195	}
196}
197
198static void print_hmi_event_info(struct OpalHMIEvent *hmi_evt)
199{
200	const char *level, *sevstr, *error_info;
201	static const char *hmi_error_types[] = {
202		"Malfunction Alert",
203		"Processor Recovery done",
204		"Processor recovery occurred again",
205		"Processor recovery occurred for masked error",
206		"Timer facility experienced an error",
207		"TFMR SPR is corrupted",
208		"UPS (Uninterrupted Power System) Overflow indication",
209		"An XSCOM operation failure",
210		"An XSCOM operation completed",
211		"SCOM has set a reserved FIR bit to cause recovery",
212		"Debug trigger has set a reserved FIR bit to cause recovery",
213		"A hypervisor resource error occurred",
214		"CAPP recovery process is in progress",
215	};
216
217	/* Print things out */
218	if (hmi_evt->version < OpalHMIEvt_V1) {
219		pr_err("HMI Interrupt, Unknown event version %d !\n",
220			hmi_evt->version);
221		return;
222	}
223	switch (hmi_evt->severity) {
224	case OpalHMI_SEV_NO_ERROR:
225		level = KERN_INFO;
226		sevstr = "Harmless";
227		break;
228	case OpalHMI_SEV_WARNING:
229		level = KERN_WARNING;
230		sevstr = "";
231		break;
232	case OpalHMI_SEV_ERROR_SYNC:
233		level = KERN_ERR;
234		sevstr = "Severe";
235		break;
236	case OpalHMI_SEV_FATAL:
237	default:
238		level = KERN_ERR;
239		sevstr = "Fatal";
240		break;
241	}
242
243	printk("%s%s Hypervisor Maintenance interrupt [%s]\n",
244		level, sevstr,
245		hmi_evt->disposition == OpalHMI_DISPOSITION_RECOVERED ?
246		"Recovered" : "Not recovered");
247	error_info = hmi_evt->type < ARRAY_SIZE(hmi_error_types) ?
248			hmi_error_types[hmi_evt->type]
249			: "Unknown";
250	printk("%s Error detail: %s\n", level, error_info);
251	printk("%s	HMER: %016llx\n", level, be64_to_cpu(hmi_evt->hmer));
252	if ((hmi_evt->type == OpalHMI_ERROR_TFAC) ||
253		(hmi_evt->type == OpalHMI_ERROR_TFMR_PARITY))
254		printk("%s	TFMR: %016llx\n", level,
255						be64_to_cpu(hmi_evt->tfmr));
256
257	if (hmi_evt->version < OpalHMIEvt_V2)
258		return;
259
260	/* OpalHMIEvt_V2 and above provides reason for malfunction alert. */
261	if (hmi_evt->type == OpalHMI_ERROR_MALFUNC_ALERT)
262		print_checkstop_reason(level, hmi_evt);
263}
264
265static void hmi_event_handler(struct work_struct *work)
266{
267	unsigned long flags;
268	struct OpalHMIEvent *hmi_evt;
269	struct OpalHmiEvtNode *msg_node;
270	uint8_t disposition;
271	struct opal_msg msg;
272	int unrecoverable = 0;
273
274	spin_lock_irqsave(&opal_hmi_evt_lock, flags);
275	while (!list_empty(&opal_hmi_evt_list)) {
276		msg_node = list_entry(opal_hmi_evt_list.next,
277					   struct OpalHmiEvtNode, list);
278		list_del(&msg_node->list);
279		spin_unlock_irqrestore(&opal_hmi_evt_lock, flags);
280
281		hmi_evt = (struct OpalHMIEvent *) &msg_node->hmi_evt;
282		print_hmi_event_info(hmi_evt);
283		disposition = hmi_evt->disposition;
284		kfree(msg_node);
285
286		/*
287		 * Check if HMI event has been recovered or not. If not
288		 * then kernel can't continue, we need to panic.
289		 * But before we do that, display all the HMI event
290		 * available on the list and set unrecoverable flag to 1.
291		 */
292		if (disposition != OpalHMI_DISPOSITION_RECOVERED)
293			unrecoverable = 1;
294
295		spin_lock_irqsave(&opal_hmi_evt_lock, flags);
296	}
297	spin_unlock_irqrestore(&opal_hmi_evt_lock, flags);
298
299	if (unrecoverable) {
300		/* Pull all HMI events from OPAL before we panic. */
301		while (opal_get_msg(__pa(&msg), sizeof(msg)) == OPAL_SUCCESS) {
302			u32 type;
303
304			type = be32_to_cpu(msg.msg_type);
305
306			/* skip if not HMI event */
307			if (type != OPAL_MSG_HMI_EVT)
308				continue;
309
310			/* HMI event info starts from param[0] */
311			hmi_evt = (struct OpalHMIEvent *)&msg.params[0];
312			print_hmi_event_info(hmi_evt);
313		}
314
315		pnv_platform_error_reboot(NULL, "Unrecoverable HMI exception");
316	}
317}
318
319static DECLARE_WORK(hmi_event_work, hmi_event_handler);
320/*
321 * opal_handle_hmi_event - notifier handler that queues up HMI events
322 * to be preocessed later.
323 */
324static int opal_handle_hmi_event(struct notifier_block *nb,
325			  unsigned long msg_type, void *msg)
326{
327	unsigned long flags;
328	struct OpalHMIEvent *hmi_evt;
329	struct opal_msg *hmi_msg = msg;
330	struct OpalHmiEvtNode *msg_node;
331
332	/* Sanity Checks */
333	if (msg_type != OPAL_MSG_HMI_EVT)
334		return 0;
335
336	/* HMI event info starts from param[0] */
337	hmi_evt = (struct OpalHMIEvent *)&hmi_msg->params[0];
338
339	/* Delay the logging of HMI events to workqueue. */
340	msg_node = kzalloc(sizeof(*msg_node), GFP_ATOMIC);
341	if (!msg_node) {
342		pr_err("HMI: out of memory, Opal message event not handled\n");
343		return -ENOMEM;
344	}
345	memcpy(&msg_node->hmi_evt, hmi_evt, sizeof(*hmi_evt));
346
347	spin_lock_irqsave(&opal_hmi_evt_lock, flags);
348	list_add(&msg_node->list, &opal_hmi_evt_list);
349	spin_unlock_irqrestore(&opal_hmi_evt_lock, flags);
350
351	schedule_work(&hmi_event_work);
352	return 0;
353}
354
355static struct notifier_block opal_hmi_handler_nb = {
356	.notifier_call	= opal_handle_hmi_event,
357	.next		= NULL,
358	.priority	= 0,
359};
360
361int __init opal_hmi_handler_init(void)
362{
363	int ret;
364
365	if (!opal_hmi_handler_nb_init) {
366		ret = opal_message_notifier_register(
367				OPAL_MSG_HMI_EVT, &opal_hmi_handler_nb);
368		if (ret) {
369			pr_err("%s: Can't register OPAL event notifier (%d)\n",
370			       __func__, ret);
371			return ret;
372		}
373		opal_hmi_handler_nb_init = 1;
374	}
375	return 0;
376}
377