1// SPDX-License-Identifier: GPL-2.0-only
2/* Copyright(c) 2022 Intel Corporation. */
3
4#include <linux/cpu.h>
5#include <linux/delay.h>
6#include <linux/fs.h>
7#include <linux/nmi.h>
8#include <linux/slab.h>
9#include <linux/stop_machine.h>
10
11#include "ifs.h"
12
/*
 * Note: all code and data in this file are protected by
 * ifs_sem. On HT systems all threads on a core will
 * execute together, but only the first thread on the
 * core will update results of the test.
 */
19
20#define CREATE_TRACE_POINTS
21#include <trace/events/intel_ifs.h>
22
/*
 * Max retries on the same chunk. The scan loop gives up once this many
 * consecutive attempts report no progress past the starting chunk.
 */
#define MAX_IFS_RETRIES  5

/*
 * Number of TSC cycles that a logical CPU will wait for the other
 * logical CPU on the core in the WRMSR(ACTIVATE_SCAN). This value is
 * programmed into the delay field of the activate command.
 */
#define IFS_THREAD_WAIT 100000
31
/*
 * Values of the error_code field of the scan status that this file
 * interprets. They index scan_test_status[] and are classified as
 * retryable or not by can_restart().
 */
enum ifs_status_err_code {
	IFS_NO_ERROR				= 0,
	IFS_OTHER_THREAD_COULD_NOT_JOIN		= 1,
	IFS_INTERRUPTED_BEFORE_RENDEZVOUS	= 2,
	IFS_POWER_MGMT_INADEQUATE_FOR_SCAN	= 3,
	IFS_INVALID_CHUNK_RANGE			= 4,
	IFS_MISMATCH_ARGUMENTS_BETWEEN_THREADS	= 5,
	IFS_CORE_NOT_CAPABLE_CURRENTLY		= 6,
	IFS_UNASSIGNED_ERROR_CODE		= 7,
	IFS_EXCEED_NUMBER_OF_THREADS_CONCURRENT	= 8,
	IFS_INTERRUPTED_DURING_EXECUTION	= 9,
};
44
/*
 * Human-readable description for each enum ifs_status_err_code value,
 * indexed by the error code. message_not_tested() bounds-checks the
 * code against ARRAY_SIZE() of this table before using it.
 */
static const char * const scan_test_status[] = {
	[IFS_NO_ERROR] = "SCAN no error",
	[IFS_OTHER_THREAD_COULD_NOT_JOIN] = "Other thread could not join.",
	[IFS_INTERRUPTED_BEFORE_RENDEZVOUS] = "Interrupt occurred prior to SCAN coordination.",
	[IFS_POWER_MGMT_INADEQUATE_FOR_SCAN] =
	"Core Abort SCAN Response due to power management condition.",
	[IFS_INVALID_CHUNK_RANGE] = "Non valid chunks in the range",
	[IFS_MISMATCH_ARGUMENTS_BETWEEN_THREADS] = "Mismatch in arguments between threads T0/T1.",
	[IFS_CORE_NOT_CAPABLE_CURRENTLY] = "Core not capable of performing SCAN currently",
	[IFS_UNASSIGNED_ERROR_CODE] = "Unassigned error code 0x7",
	[IFS_EXCEED_NUMBER_OF_THREADS_CONCURRENT] =
	"Exceeded number of Logical Processors (LP) allowed to run Scan-At-Field concurrently",
	[IFS_INTERRUPTED_DURING_EXECUTION] = "Interrupt occurred prior to SCAN start",
};
59
60static void message_not_tested(struct device *dev, int cpu, union ifs_status status)
61{
62	if (status.error_code < ARRAY_SIZE(scan_test_status)) {
63		dev_info(dev, "CPU(s) %*pbl: SCAN operation did not start. %s\n",
64			 cpumask_pr_args(cpu_smt_mask(cpu)),
65			 scan_test_status[status.error_code]);
66	} else if (status.error_code == IFS_SW_TIMEOUT) {
67		dev_info(dev, "CPU(s) %*pbl: software timeout during scan\n",
68			 cpumask_pr_args(cpu_smt_mask(cpu)));
69	} else if (status.error_code == IFS_SW_PARTIAL_COMPLETION) {
70		dev_info(dev, "CPU(s) %*pbl: %s\n",
71			 cpumask_pr_args(cpu_smt_mask(cpu)),
72			 "Not all scan chunks were executed. Maximum forward progress retries exceeded");
73	} else {
74		dev_info(dev, "CPU(s) %*pbl: SCAN unknown status %llx\n",
75			 cpumask_pr_args(cpu_smt_mask(cpu)), status.data);
76	}
77}
78
79static void message_fail(struct device *dev, int cpu, union ifs_status status)
80{
81	struct ifs_data *ifsd = ifs_get_data(dev);
82
83	/*
84	 * control_error is set when the microcode runs into a problem
85	 * loading the image from the reserved BIOS memory, or it has
86	 * been corrupted. Reloading the image may fix this issue.
87	 */
88	if (status.control_error) {
89		dev_err(dev, "CPU(s) %*pbl: could not execute from loaded scan image. Batch: %02x version: 0x%x\n",
90			cpumask_pr_args(cpu_smt_mask(cpu)), ifsd->cur_batch, ifsd->loaded_version);
91	}
92
93	/*
94	 * signature_error is set when the output from the scan chains does not
95	 * match the expected signature. This might be a transient problem (e.g.
96	 * due to a bit flip from an alpha particle or neutron). If the problem
97	 * repeats on a subsequent test, then it indicates an actual problem in
98	 * the core being tested.
99	 */
100	if (status.signature_error) {
101		dev_err(dev, "CPU(s) %*pbl: test signature incorrect. Batch: %02x version: 0x%x\n",
102			cpumask_pr_args(cpu_smt_mask(cpu)), ifsd->cur_batch, ifsd->loaded_version);
103	}
104}
105
106static bool can_restart(union ifs_status status)
107{
108	enum ifs_status_err_code err_code = status.error_code;
109
110	/* Signature for chunk is bad, or scan test failed */
111	if (status.signature_error || status.control_error)
112		return false;
113
114	switch (err_code) {
115	case IFS_NO_ERROR:
116	case IFS_OTHER_THREAD_COULD_NOT_JOIN:
117	case IFS_INTERRUPTED_BEFORE_RENDEZVOUS:
118	case IFS_POWER_MGMT_INADEQUATE_FOR_SCAN:
119	case IFS_EXCEED_NUMBER_OF_THREADS_CONCURRENT:
120	case IFS_INTERRUPTED_DURING_EXECUTION:
121		return true;
122	case IFS_INVALID_CHUNK_RANGE:
123	case IFS_MISMATCH_ARGUMENTS_BETWEEN_THREADS:
124	case IFS_CORE_NOT_CAPABLE_CURRENTLY:
125	case IFS_UNASSIGNED_ERROR_CODE:
126		break;
127	}
128	return false;
129}
130
131/*
132 * Execute the scan. Called "simultaneously" on all threads of a core
133 * at high priority using the stop_cpus mechanism.
134 */
135static int doscan(void *data)
136{
137	int cpu = smp_processor_id();
138	u64 *msrs = data;
139	int first;
140
141	/* Only the first logical CPU on a core reports result */
142	first = cpumask_first(cpu_smt_mask(cpu));
143
144	/*
145	 * This WRMSR will wait for other HT threads to also write
146	 * to this MSR (at most for activate.delay cycles). Then it
147	 * starts scan of each requested chunk. The core scan happens
148	 * during the "execution" of the WRMSR. This instruction can
149	 * take up to 200 milliseconds (in the case where all chunks
150	 * are processed in a single pass) before it retires.
151	 */
152	wrmsrl(MSR_ACTIVATE_SCAN, msrs[0]);
153
154	if (cpu == first) {
155		/* Pass back the result of the scan */
156		rdmsrl(MSR_SCAN_STATUS, msrs[1]);
157	}
158
159	return 0;
160}
161
162/*
163 * Use stop_core_cpuslocked() to synchronize writing to MSR_ACTIVATE_SCAN
164 * on all threads of the core to be tested. Loop if necessary to complete
165 * run of all chunks. Include some defensive tests to make sure forward
166 * progress is made, and that the whole test completes in a reasonable time.
167 */
168static void ifs_test_core(int cpu, struct device *dev)
169{
170	union ifs_scan activate;
171	union ifs_status status;
172	unsigned long timeout;
173	struct ifs_data *ifsd;
174	u64 msrvals[2];
175	int retries;
176
177	ifsd = ifs_get_data(dev);
178
179	activate.rsvd = 0;
180	activate.delay = IFS_THREAD_WAIT;
181	activate.sigmce = 0;
182	activate.start = 0;
183	activate.stop = ifsd->valid_chunks - 1;
184
185	timeout = jiffies + HZ / 2;
186	retries = MAX_IFS_RETRIES;
187
188	while (activate.start <= activate.stop) {
189		if (time_after(jiffies, timeout)) {
190			status.error_code = IFS_SW_TIMEOUT;
191			break;
192		}
193
194		msrvals[0] = activate.data;
195		stop_core_cpuslocked(cpu, doscan, msrvals);
196
197		status.data = msrvals[1];
198
199		trace_ifs_status(cpu, activate, status);
200
201		/* Some cases can be retried, give up for others */
202		if (!can_restart(status))
203			break;
204
205		if (status.chunk_num == activate.start) {
206			/* Check for forward progress */
207			if (--retries == 0) {
208				if (status.error_code == IFS_NO_ERROR)
209					status.error_code = IFS_SW_PARTIAL_COMPLETION;
210				break;
211			}
212		} else {
213			retries = MAX_IFS_RETRIES;
214			activate.start = status.chunk_num;
215		}
216	}
217
218	/* Update status for this core */
219	ifsd->scan_details = status.data;
220
221	if (status.control_error || status.signature_error) {
222		ifsd->status = SCAN_TEST_FAIL;
223		message_fail(dev, cpu, status);
224	} else if (status.error_code) {
225		ifsd->status = SCAN_NOT_TESTED;
226		message_not_tested(dev, cpu, status);
227	} else {
228		ifsd->status = SCAN_TEST_PASS;
229	}
230}
231
/* Polling granularity of wait_for_sibling_cpu(): 100 nsec per spin */
#define SPINUNIT 100 /* 100 nsec */
/*
 * Count of CPUs that have reached the rendezvous in do_array_test();
 * reset to zero by ifs_array_test_core() before each test run.
 */
static atomic_t array_cpus_out;
234
235/*
236 * Simplified cpu sibling rendezvous loop based on microcode loader __wait_for_cpus()
237 */
238static void wait_for_sibling_cpu(atomic_t *t, long long timeout)
239{
240	int cpu = smp_processor_id();
241	const struct cpumask *smt_mask = cpu_smt_mask(cpu);
242	int all_cpus = cpumask_weight(smt_mask);
243
244	atomic_inc(t);
245	while (atomic_read(t) < all_cpus) {
246		if (timeout < SPINUNIT)
247			return;
248		ndelay(SPINUNIT);
249		timeout -= SPINUNIT;
250		touch_nmi_watchdog();
251	}
252}
253
254static int do_array_test(void *data)
255{
256	union ifs_array *command = data;
257	int cpu = smp_processor_id();
258	int first;
259
260	/*
261	 * Only one logical CPU on a core needs to trigger the Array test via MSR write.
262	 */
263	first = cpumask_first(cpu_smt_mask(cpu));
264
265	if (cpu == first) {
266		wrmsrl(MSR_ARRAY_BIST, command->data);
267		/* Pass back the result of the test */
268		rdmsrl(MSR_ARRAY_BIST, command->data);
269	}
270
271	/* Tests complete faster if the sibling is spinning here */
272	wait_for_sibling_cpu(&array_cpus_out, NSEC_PER_SEC);
273
274	return 0;
275}
276
277static void ifs_array_test_core(int cpu, struct device *dev)
278{
279	union ifs_array command = {};
280	bool timed_out = false;
281	struct ifs_data *ifsd;
282	unsigned long timeout;
283
284	ifsd = ifs_get_data(dev);
285
286	command.array_bitmask = ~0U;
287	timeout = jiffies + HZ / 2;
288
289	do {
290		if (time_after(jiffies, timeout)) {
291			timed_out = true;
292			break;
293		}
294		atomic_set(&array_cpus_out, 0);
295		stop_core_cpuslocked(cpu, do_array_test, &command);
296
297		if (command.ctrl_result)
298			break;
299	} while (command.array_bitmask);
300
301	ifsd->scan_details = command.data;
302
303	if (command.ctrl_result)
304		ifsd->status = SCAN_TEST_FAIL;
305	else if (timed_out || command.array_bitmask)
306		ifsd->status = SCAN_NOT_TESTED;
307	else
308		ifsd->status = SCAN_TEST_PASS;
309}
310
311/*
312 * Initiate per core test. It wakes up work queue threads on the target cpu and
313 * its sibling cpu. Once all sibling threads wake up, the scan test gets executed and
314 * wait for all sibling threads to finish the scan test.
315 */
316int do_core_test(int cpu, struct device *dev)
317{
318	const struct ifs_test_caps *test = ifs_get_test_caps(dev);
319	struct ifs_data *ifsd = ifs_get_data(dev);
320	int ret = 0;
321
322	/* Prevent CPUs from being taken offline during the scan test */
323	cpus_read_lock();
324
325	if (!cpu_online(cpu)) {
326		dev_info(dev, "cannot test on the offline cpu %d\n", cpu);
327		ret = -EINVAL;
328		goto out;
329	}
330
331	switch (test->test_num) {
332	case IFS_TYPE_SAF:
333		if (!ifsd->loaded)
334			ret = -EPERM;
335		else
336			ifs_test_core(cpu, dev);
337		break;
338	case IFS_TYPE_ARRAY_BIST:
339		ifs_array_test_core(cpu, dev);
340		break;
341	default:
342		ret = -EINVAL;
343	}
344out:
345	cpus_read_unlock();
346	return ret;
347}
348