1// SPDX-License-Identifier: GPL-2.0-or-later
2/*
3 * This file contains an ECC algorithm that detects and corrects 1 bit
4 * errors in a 256 byte block of data.
5 *
6 * Copyright © 2008 Koninklijke Philips Electronics NV.
7 *                  Author: Frans Meulenbroeks
8 *
9 * Completely replaces the previous ECC implementation which was written by:
10 *   Steven J. Hill (sjhill@realitydiluted.com)
11 *   Thomas Gleixner (tglx@linutronix.de)
12 *
13 * Information on how this algorithm works and how it was developed
14 * can be found in Documentation/driver-api/mtd/nand_ecc.rst
15 */
16
17#include <linux/types.h>
18#include <linux/kernel.h>
19#include <linux/module.h>
20#include <linux/mtd/mtd.h>
21#include <linux/mtd/rawnand.h>
22#include <linux/mtd/nand_ecc.h>
23#include <asm/byteorder.h>
24
25/*
26 * invparity is a 256 byte table that contains the odd parity
27 * for each byte. So if the number of bits in a byte is even,
28 * the array element is 1, and when the number of bits is odd
29 * the array eleemnt is 0.
30 */
31static const char invparity[256] = {
32	1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1,
33	0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0,
34	0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0,
35	1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1,
36	0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0,
37	1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1,
38	1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1,
39	0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0,
40	0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0,
41	1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1,
42	1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1,
43	0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0,
44	1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1,
45	0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0,
46	0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0,
47	1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1
48};
49
50/*
51 * bitsperbyte contains the number of bits per byte
52 * this is only used for testing and repairing parity
53 * (a precalculated value slightly improves performance)
54 */
55static const char bitsperbyte[256] = {
56	0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4,
57	1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
58	1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
59	2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
60	1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
61	2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
62	2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
63	3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
64	1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
65	2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
66	2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
67	3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
68	2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
69	3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
70	3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
71	4, 5, 5, 6, 5, 6, 6, 7, 5, 6, 6, 7, 6, 7, 7, 8,
72};
73
74/*
75 * addressbits is a lookup table to filter out the bits from the xor-ed
76 * ECC data that identify the faulty location.
77 * this is only used for repairing parity
78 * see the comments in nand_correct_data for more details
79 */
80static const char addressbits[256] = {
81	0x00, 0x00, 0x01, 0x01, 0x00, 0x00, 0x01, 0x01,
82	0x02, 0x02, 0x03, 0x03, 0x02, 0x02, 0x03, 0x03,
83	0x00, 0x00, 0x01, 0x01, 0x00, 0x00, 0x01, 0x01,
84	0x02, 0x02, 0x03, 0x03, 0x02, 0x02, 0x03, 0x03,
85	0x04, 0x04, 0x05, 0x05, 0x04, 0x04, 0x05, 0x05,
86	0x06, 0x06, 0x07, 0x07, 0x06, 0x06, 0x07, 0x07,
87	0x04, 0x04, 0x05, 0x05, 0x04, 0x04, 0x05, 0x05,
88	0x06, 0x06, 0x07, 0x07, 0x06, 0x06, 0x07, 0x07,
89	0x00, 0x00, 0x01, 0x01, 0x00, 0x00, 0x01, 0x01,
90	0x02, 0x02, 0x03, 0x03, 0x02, 0x02, 0x03, 0x03,
91	0x00, 0x00, 0x01, 0x01, 0x00, 0x00, 0x01, 0x01,
92	0x02, 0x02, 0x03, 0x03, 0x02, 0x02, 0x03, 0x03,
93	0x04, 0x04, 0x05, 0x05, 0x04, 0x04, 0x05, 0x05,
94	0x06, 0x06, 0x07, 0x07, 0x06, 0x06, 0x07, 0x07,
95	0x04, 0x04, 0x05, 0x05, 0x04, 0x04, 0x05, 0x05,
96	0x06, 0x06, 0x07, 0x07, 0x06, 0x06, 0x07, 0x07,
97	0x08, 0x08, 0x09, 0x09, 0x08, 0x08, 0x09, 0x09,
98	0x0a, 0x0a, 0x0b, 0x0b, 0x0a, 0x0a, 0x0b, 0x0b,
99	0x08, 0x08, 0x09, 0x09, 0x08, 0x08, 0x09, 0x09,
100	0x0a, 0x0a, 0x0b, 0x0b, 0x0a, 0x0a, 0x0b, 0x0b,
101	0x0c, 0x0c, 0x0d, 0x0d, 0x0c, 0x0c, 0x0d, 0x0d,
102	0x0e, 0x0e, 0x0f, 0x0f, 0x0e, 0x0e, 0x0f, 0x0f,
103	0x0c, 0x0c, 0x0d, 0x0d, 0x0c, 0x0c, 0x0d, 0x0d,
104	0x0e, 0x0e, 0x0f, 0x0f, 0x0e, 0x0e, 0x0f, 0x0f,
105	0x08, 0x08, 0x09, 0x09, 0x08, 0x08, 0x09, 0x09,
106	0x0a, 0x0a, 0x0b, 0x0b, 0x0a, 0x0a, 0x0b, 0x0b,
107	0x08, 0x08, 0x09, 0x09, 0x08, 0x08, 0x09, 0x09,
108	0x0a, 0x0a, 0x0b, 0x0b, 0x0a, 0x0a, 0x0b, 0x0b,
109	0x0c, 0x0c, 0x0d, 0x0d, 0x0c, 0x0c, 0x0d, 0x0d,
110	0x0e, 0x0e, 0x0f, 0x0f, 0x0e, 0x0e, 0x0f, 0x0f,
111	0x0c, 0x0c, 0x0d, 0x0d, 0x0c, 0x0c, 0x0d, 0x0d,
112	0x0e, 0x0e, 0x0f, 0x0f, 0x0e, 0x0e, 0x0f, 0x0f
113};
114
115/**
116 * __nand_calculate_ecc - [NAND Interface] Calculate 3-byte ECC for 256/512-byte
117 *			 block
118 * @buf:	input buffer with raw data
119 * @eccsize:	data bytes per ECC step (256 or 512)
120 * @code:	output buffer with ECC
121 * @sm_order:	Smart Media byte ordering
122 */
123void __nand_calculate_ecc(const unsigned char *buf, unsigned int eccsize,
124			  unsigned char *code, bool sm_order)
125{
126	int i;
127	const uint32_t *bp = (uint32_t *)buf;
128	/* 256 or 512 bytes/ecc  */
129	const uint32_t eccsize_mult = eccsize >> 8;
130	uint32_t cur;		/* current value in buffer */
131	/* rp0..rp15..rp17 are the various accumulated parities (per byte) */
132	uint32_t rp0, rp1, rp2, rp3, rp4, rp5, rp6, rp7;
133	uint32_t rp8, rp9, rp10, rp11, rp12, rp13, rp14, rp15, rp16;
134	uint32_t rp17;
135	uint32_t par;		/* the cumulative parity for all data */
136	uint32_t tmppar;	/* the cumulative parity for this iteration;
137				   for rp12, rp14 and rp16 at the end of the
138				   loop */
139
140	par = 0;
141	rp4 = 0;
142	rp6 = 0;
143	rp8 = 0;
144	rp10 = 0;
145	rp12 = 0;
146	rp14 = 0;
147	rp16 = 0;
148
149	/*
150	 * The loop is unrolled a number of times;
151	 * This avoids if statements to decide on which rp value to update
152	 * Also we process the data by longwords.
153	 * Note: passing unaligned data might give a performance penalty.
154	 * It is assumed that the buffers are aligned.
155	 * tmppar is the cumulative sum of this iteration.
156	 * needed for calculating rp12, rp14, rp16 and par
157	 * also used as a performance improvement for rp6, rp8 and rp10
158	 */
159	for (i = 0; i < eccsize_mult << 2; i++) {
160		cur = *bp++;
161		tmppar = cur;
162		rp4 ^= cur;
163		cur = *bp++;
164		tmppar ^= cur;
165		rp6 ^= tmppar;
166		cur = *bp++;
167		tmppar ^= cur;
168		rp4 ^= cur;
169		cur = *bp++;
170		tmppar ^= cur;
171		rp8 ^= tmppar;
172
173		cur = *bp++;
174		tmppar ^= cur;
175		rp4 ^= cur;
176		rp6 ^= cur;
177		cur = *bp++;
178		tmppar ^= cur;
179		rp6 ^= cur;
180		cur = *bp++;
181		tmppar ^= cur;
182		rp4 ^= cur;
183		cur = *bp++;
184		tmppar ^= cur;
185		rp10 ^= tmppar;
186
187		cur = *bp++;
188		tmppar ^= cur;
189		rp4 ^= cur;
190		rp6 ^= cur;
191		rp8 ^= cur;
192		cur = *bp++;
193		tmppar ^= cur;
194		rp6 ^= cur;
195		rp8 ^= cur;
196		cur = *bp++;
197		tmppar ^= cur;
198		rp4 ^= cur;
199		rp8 ^= cur;
200		cur = *bp++;
201		tmppar ^= cur;
202		rp8 ^= cur;
203
204		cur = *bp++;
205		tmppar ^= cur;
206		rp4 ^= cur;
207		rp6 ^= cur;
208		cur = *bp++;
209		tmppar ^= cur;
210		rp6 ^= cur;
211		cur = *bp++;
212		tmppar ^= cur;
213		rp4 ^= cur;
214		cur = *bp++;
215		tmppar ^= cur;
216
217		par ^= tmppar;
218		if ((i & 0x1) == 0)
219			rp12 ^= tmppar;
220		if ((i & 0x2) == 0)
221			rp14 ^= tmppar;
222		if (eccsize_mult == 2 && (i & 0x4) == 0)
223			rp16 ^= tmppar;
224	}
225
226	/*
227	 * handle the fact that we use longword operations
228	 * we'll bring rp4..rp14..rp16 back to single byte entities by
229	 * shifting and xoring first fold the upper and lower 16 bits,
230	 * then the upper and lower 8 bits.
231	 */
232	rp4 ^= (rp4 >> 16);
233	rp4 ^= (rp4 >> 8);
234	rp4 &= 0xff;
235	rp6 ^= (rp6 >> 16);
236	rp6 ^= (rp6 >> 8);
237	rp6 &= 0xff;
238	rp8 ^= (rp8 >> 16);
239	rp8 ^= (rp8 >> 8);
240	rp8 &= 0xff;
241	rp10 ^= (rp10 >> 16);
242	rp10 ^= (rp10 >> 8);
243	rp10 &= 0xff;
244	rp12 ^= (rp12 >> 16);
245	rp12 ^= (rp12 >> 8);
246	rp12 &= 0xff;
247	rp14 ^= (rp14 >> 16);
248	rp14 ^= (rp14 >> 8);
249	rp14 &= 0xff;
250	if (eccsize_mult == 2) {
251		rp16 ^= (rp16 >> 16);
252		rp16 ^= (rp16 >> 8);
253		rp16 &= 0xff;
254	}
255
256	/*
257	 * we also need to calculate the row parity for rp0..rp3
258	 * This is present in par, because par is now
259	 * rp3 rp3 rp2 rp2 in little endian and
260	 * rp2 rp2 rp3 rp3 in big endian
261	 * as well as
262	 * rp1 rp0 rp1 rp0 in little endian and
263	 * rp0 rp1 rp0 rp1 in big endian
264	 * First calculate rp2 and rp3
265	 */
266#ifdef __BIG_ENDIAN
267	rp2 = (par >> 16);
268	rp2 ^= (rp2 >> 8);
269	rp2 &= 0xff;
270	rp3 = par & 0xffff;
271	rp3 ^= (rp3 >> 8);
272	rp3 &= 0xff;
273#else
274	rp3 = (par >> 16);
275	rp3 ^= (rp3 >> 8);
276	rp3 &= 0xff;
277	rp2 = par & 0xffff;
278	rp2 ^= (rp2 >> 8);
279	rp2 &= 0xff;
280#endif
281
282	/* reduce par to 16 bits then calculate rp1 and rp0 */
283	par ^= (par >> 16);
284#ifdef __BIG_ENDIAN
285	rp0 = (par >> 8) & 0xff;
286	rp1 = (par & 0xff);
287#else
288	rp1 = (par >> 8) & 0xff;
289	rp0 = (par & 0xff);
290#endif
291
292	/* finally reduce par to 8 bits */
293	par ^= (par >> 8);
294	par &= 0xff;
295
296	/*
297	 * and calculate rp5..rp15..rp17
298	 * note that par = rp4 ^ rp5 and due to the commutative property
299	 * of the ^ operator we can say:
300	 * rp5 = (par ^ rp4);
301	 * The & 0xff seems superfluous, but benchmarking learned that
302	 * leaving it out gives slightly worse results. No idea why, probably
303	 * it has to do with the way the pipeline in pentium is organized.
304	 */
305	rp5 = (par ^ rp4) & 0xff;
306	rp7 = (par ^ rp6) & 0xff;
307	rp9 = (par ^ rp8) & 0xff;
308	rp11 = (par ^ rp10) & 0xff;
309	rp13 = (par ^ rp12) & 0xff;
310	rp15 = (par ^ rp14) & 0xff;
311	if (eccsize_mult == 2)
312		rp17 = (par ^ rp16) & 0xff;
313
314	/*
315	 * Finally calculate the ECC bits.
316	 * Again here it might seem that there are performance optimisations
317	 * possible, but benchmarks showed that on the system this is developed
318	 * the code below is the fastest
319	 */
320	if (sm_order) {
321		code[0] = (invparity[rp7] << 7) | (invparity[rp6] << 6) |
322			  (invparity[rp5] << 5) | (invparity[rp4] << 4) |
323			  (invparity[rp3] << 3) | (invparity[rp2] << 2) |
324			  (invparity[rp1] << 1) | (invparity[rp0]);
325		code[1] = (invparity[rp15] << 7) | (invparity[rp14] << 6) |
326			  (invparity[rp13] << 5) | (invparity[rp12] << 4) |
327			  (invparity[rp11] << 3) | (invparity[rp10] << 2) |
328			  (invparity[rp9] << 1) | (invparity[rp8]);
329	} else {
330		code[1] = (invparity[rp7] << 7) | (invparity[rp6] << 6) |
331			  (invparity[rp5] << 5) | (invparity[rp4] << 4) |
332			  (invparity[rp3] << 3) | (invparity[rp2] << 2) |
333			  (invparity[rp1] << 1) | (invparity[rp0]);
334		code[0] = (invparity[rp15] << 7) | (invparity[rp14] << 6) |
335			  (invparity[rp13] << 5) | (invparity[rp12] << 4) |
336			  (invparity[rp11] << 3) | (invparity[rp10] << 2) |
337			  (invparity[rp9] << 1) | (invparity[rp8]);
338	}
339
340	if (eccsize_mult == 1)
341		code[2] =
342		    (invparity[par & 0xf0] << 7) |
343		    (invparity[par & 0x0f] << 6) |
344		    (invparity[par & 0xcc] << 5) |
345		    (invparity[par & 0x33] << 4) |
346		    (invparity[par & 0xaa] << 3) |
347		    (invparity[par & 0x55] << 2) |
348		    3;
349	else
350		code[2] =
351		    (invparity[par & 0xf0] << 7) |
352		    (invparity[par & 0x0f] << 6) |
353		    (invparity[par & 0xcc] << 5) |
354		    (invparity[par & 0x33] << 4) |
355		    (invparity[par & 0xaa] << 3) |
356		    (invparity[par & 0x55] << 2) |
357		    (invparity[rp17] << 1) |
358		    (invparity[rp16] << 0);
359}
360EXPORT_SYMBOL(__nand_calculate_ecc);
361
362/**
363 * nand_calculate_ecc - [NAND Interface] Calculate 3-byte ECC for 256/512-byte
364 *			 block
365 * @chip:	NAND chip object
366 * @buf:	input buffer with raw data
367 * @code:	output buffer with ECC
368 */
369int nand_calculate_ecc(struct nand_chip *chip, const unsigned char *buf,
370		       unsigned char *code)
371{
372	bool sm_order = chip->ecc.options & NAND_ECC_SOFT_HAMMING_SM_ORDER;
373
374	__nand_calculate_ecc(buf, chip->ecc.size, code, sm_order);
375
376	return 0;
377}
378EXPORT_SYMBOL(nand_calculate_ecc);
379
380/**
381 * __nand_correct_data - [NAND Interface] Detect and correct bit error(s)
382 * @buf:	raw data read from the chip
383 * @read_ecc:	ECC from the chip
384 * @calc_ecc:	the ECC calculated from raw data
385 * @eccsize:	data bytes per ECC step (256 or 512)
386 * @sm_order:	Smart Media byte order
387 *
388 * Detect and correct a 1 bit error for eccsize byte block
389 */
390int __nand_correct_data(unsigned char *buf,
391			unsigned char *read_ecc, unsigned char *calc_ecc,
392			unsigned int eccsize, bool sm_order)
393{
394	unsigned char b0, b1, b2, bit_addr;
395	unsigned int byte_addr;
396	/* 256 or 512 bytes/ecc  */
397	const uint32_t eccsize_mult = eccsize >> 8;
398
399	/*
400	 * b0 to b2 indicate which bit is faulty (if any)
401	 * we might need the xor result  more than once,
402	 * so keep them in a local var
403	*/
404	if (sm_order) {
405		b0 = read_ecc[0] ^ calc_ecc[0];
406		b1 = read_ecc[1] ^ calc_ecc[1];
407	} else {
408		b0 = read_ecc[1] ^ calc_ecc[1];
409		b1 = read_ecc[0] ^ calc_ecc[0];
410	}
411
412	b2 = read_ecc[2] ^ calc_ecc[2];
413
414	/* check if there are any bitfaults */
415
416	/* repeated if statements are slightly more efficient than switch ... */
417	/* ordered in order of likelihood */
418
419	if ((b0 | b1 | b2) == 0)
420		return 0;	/* no error */
421
422	if ((((b0 ^ (b0 >> 1)) & 0x55) == 0x55) &&
423	    (((b1 ^ (b1 >> 1)) & 0x55) == 0x55) &&
424	    ((eccsize_mult == 1 && ((b2 ^ (b2 >> 1)) & 0x54) == 0x54) ||
425	     (eccsize_mult == 2 && ((b2 ^ (b2 >> 1)) & 0x55) == 0x55))) {
426	/* single bit error */
427		/*
428		 * rp17/rp15/13/11/9/7/5/3/1 indicate which byte is the faulty
429		 * byte, cp 5/3/1 indicate the faulty bit.
430		 * A lookup table (called addressbits) is used to filter
431		 * the bits from the byte they are in.
432		 * A marginal optimisation is possible by having three
433		 * different lookup tables.
434		 * One as we have now (for b0), one for b2
435		 * (that would avoid the >> 1), and one for b1 (with all values
436		 * << 4). However it was felt that introducing two more tables
437		 * hardly justify the gain.
438		 *
439		 * The b2 shift is there to get rid of the lowest two bits.
440		 * We could also do addressbits[b2] >> 1 but for the
441		 * performance it does not make any difference
442		 */
443		if (eccsize_mult == 1)
444			byte_addr = (addressbits[b1] << 4) + addressbits[b0];
445		else
446			byte_addr = (addressbits[b2 & 0x3] << 8) +
447				    (addressbits[b1] << 4) + addressbits[b0];
448		bit_addr = addressbits[b2 >> 2];
449		/* flip the bit */
450		buf[byte_addr] ^= (1 << bit_addr);
451		return 1;
452
453	}
454	/* count nr of bits; use table lookup, faster than calculating it */
455	if ((bitsperbyte[b0] + bitsperbyte[b1] + bitsperbyte[b2]) == 1)
456		return 1;	/* error in ECC data; no action needed */
457
458	pr_err("%s: uncorrectable ECC error\n", __func__);
459	return -EBADMSG;
460}
461EXPORT_SYMBOL(__nand_correct_data);
462
463/**
464 * nand_correct_data - [NAND Interface] Detect and correct bit error(s)
465 * @chip:	NAND chip object
466 * @buf:	raw data read from the chip
467 * @read_ecc:	ECC from the chip
468 * @calc_ecc:	the ECC calculated from raw data
469 *
470 * Detect and correct a 1 bit error for 256/512 byte block
471 */
472int nand_correct_data(struct nand_chip *chip, unsigned char *buf,
473		      unsigned char *read_ecc, unsigned char *calc_ecc)
474{
475	bool sm_order = chip->ecc.options & NAND_ECC_SOFT_HAMMING_SM_ORDER;
476
477	return __nand_correct_data(buf, read_ecc, calc_ecc, chip->ecc.size,
478				   sm_order);
479}
480EXPORT_SYMBOL(nand_correct_data);
481
/* Module metadata */
MODULE_LICENSE("GPL");
MODULE_AUTHOR("Frans Meulenbroeks <fransmeulenbroeks@gmail.com>");
MODULE_DESCRIPTION("Generic NAND ECC support");
485