1// SPDX-License-Identifier: LGPL-2.1+
2/*
3 * Copyright 2016 Tom aan de Wiel
4 * Copyright 2018 Cisco Systems, Inc. and/or its affiliates. All rights reserved.
5 *
6 * 8x8 Fast Walsh Hadamard Transform in sequency order based on the paper:
7 *
8 * A Recursive Algorithm for Sequency-Ordered Fast Walsh Transforms,
9 * R.D. Brown, 1977
10 */
11
12#include <linux/string.h>
13#include <linux/kernel.h>
14#include "codec-fwht.h"
15
16#define OVERFLOW_BIT BIT(14)
17
18/*
19 * Note: bit 0 of the header must always be 0. Otherwise it cannot
20 * be guaranteed that the magic 8 byte sequence (see below) can
21 * never occur in the rlc output.
22 */
23#define PFRAME_BIT BIT(15)
24#define DUPS_MASK 0x1ffe
25
26#define PBLOCK 0
27#define IBLOCK 1
28
29#define ALL_ZEROS 15
30
/*
 * Zigzag scan order: maps the i'th coefficient in scan order to its
 * raster position within the 8x8 block.  rlc()/derlc() (de)serialize
 * coefficients in this order so that the high-frequency zeros cluster
 * into long runs.
 */
static const uint8_t zigzag[64] = {
	0,
	1,  8,
	2,  9, 16,
	3, 10, 17, 24,
	4, 11, 18, 25, 32,
	5, 12, 19, 26, 33, 40,
	6, 13, 20, 27, 34, 41, 48,
	7, 14, 21, 28, 35, 42, 49, 56,
	15, 22, 29, 36, 43, 50, 57,
	23, 30, 37, 44, 51, 58,
	31, 38, 45, 52, 59,
	39, 46, 53, 60,
	47, 54, 61,
	55, 62,
	63,
};
48
/*
 * noinline_for_stack to work around
 * https://bugs.llvm.org/show_bug.cgi?id=38809
 */
/*
 * Run-length encode one quantized 8x8 block.
 *
 * @in:        64 quantized coefficients in raster order
 * @output:    destination for big-endian 16-bit code words
 * @blocktype: IBLOCK or PBLOCK; a PBLOCK gets PFRAME_BIT set in its header
 *
 * Returns the number of 16-bit words written to @output
 * (1 header word + the coefficient words).
 */
static int noinline_for_stack
rlc(const s16 *in, __be16 *output, int blocktype)
{
	s16 block[8 * 8];
	s16 *wp = block;
	int i = 0;
	int x, y;
	int ret = 0;

	/* read in block from framebuffer */
	int lastzero_run = 0;
	int to_encode;

	for (y = 0; y < 8; y++) {
		for (x = 0; x < 8; x++) {
			*wp = in[x + y * 8];
			wp++;
		}
	}

	/* keep track of amount of trailing zeros */
	for (i = 63; i >= 0 && !block[zigzag[i]]; i--)
		lastzero_run++;

	/* block header: only PFRAME_BIT is used, the rest stays 0 */
	*output++ = (blocktype == PBLOCK ? htons(PFRAME_BIT) : 0);
	ret++;

	/*
	 * A trailing zero run longer than 14 is cut off here and replaced
	 * by a single ALL_ZEROS marker word at the end.
	 */
	to_encode = 8 * 8 - (lastzero_run > 14 ? lastzero_run : 0);

	i = 0;
	while (i < to_encode) {
		int cnt = 0;
		int tmp;

		/* count leading zeros */
		while ((tmp = block[zigzag[i]]) == 0 && cnt < 14) {
			cnt++;
			i++;
			if (i == to_encode) {
				cnt--;
				break;
			}
		}
		/* 4 bits for run, 12 for coefficient (quantization by 4) */
		*output++ = htons((cnt | tmp << 4));
		i++;
		ret++;
	}
	/* marker word: "the remainder of the block is all zeros" */
	if (lastzero_run > 14) {
		*output = htons(ALL_ZEROS | 0);
		ret++;
	}

	return ret;
}
108
/*
 * De-RLC one block from *rlc_in into dwht_out (raster order, i.e. the
 * zigzag scan is undone here).  On success *rlc_in is advanced past the
 * consumed words.
 *
 * This function will worst-case increase rlc_in by 65*2 bytes:
 * one s16 value for the header and 8 * 8 coefficients of type s16.
 *
 * Returns the block header (PFRAME_BIT plus the duplicate count in
 * DUPS_MASK), or OVERFLOW_BIT if decoding would read past end_of_input.
 */
static noinline_for_stack u16
derlc(const __be16 **rlc_in, s16 *dwht_out, const __be16 *end_of_input)
{
	/* header */
	const __be16 *input = *rlc_in;
	u16 stat;
	int dec_count = 0;
	s16 block[8 * 8 + 16];
	s16 *wp = block;
	int i;

	if (input > end_of_input)
		return OVERFLOW_BIT;
	stat = ntohs(*input++);

	/*
	 * Now de-compress: each 16-bit code word expands to up to 15
	 * coefficients (or fills the remainder of the 64 coefficients
	 * with zeroes if it is the ALL_ZEROS marker).
	 *
	 * So block has to be 8 * 8 + 16 bytes, the '+ 16' is to
	 * allow for overflow if the incoming data was malformed.
	 */
	while (dec_count < 8 * 8) {
		s16 in;
		int length;
		int coeff;

		if (input > end_of_input)
			return OVERFLOW_BIT;
		in = ntohs(*input++);
		length = in & 0xf;
		/* signed shift: coefficients can be negative */
		coeff = in >> 4;

		/* fill remainder with zeros */
		if (length == 15) {
			for (i = 0; i < 64 - dec_count; i++)
				*wp++ = 0;
			break;
		}

		for (i = 0; i < length; i++)
			*wp++ = 0;
		*wp++ = coeff;
		dec_count += length + 1;
	}

	wp = block;

	/* undo the zigzag scan into the output block */
	for (i = 0; i < 64; i++) {
		int pos = zigzag[i];
		int y = pos / 8;
		int x = pos % 8;

		dwht_out[x + y * 8] = *wp++;
	}
	*rlc_in = input;
	return stat;
}
172
/*
 * Per-coefficient quantization shift amounts for intra blocks, in
 * raster order; the higher-frequency coefficients (towards the
 * bottom-right) are quantized more coarsely.
 */
static const int quant_table[] = {
	2, 2, 2, 2, 2, 2,  2,  2,
	2, 2, 2, 2, 2, 2,  2,  2,
	2, 2, 2, 2, 2, 2,  2,  3,
	2, 2, 2, 2, 2, 2,  3,  6,
	2, 2, 2, 2, 2, 3,  6,  6,
	2, 2, 2, 2, 3, 6,  6,  6,
	2, 2, 2, 3, 6, 6,  6,  6,
	2, 2, 3, 6, 6, 6,  6,  8,
};
183
/*
 * Per-coefficient quantization shift amounts for inter (P) blocks, in
 * raster order; coarser overall than quant_table since P blocks encode
 * deltas with twice the value range.
 */
static const int quant_table_p[] = {
	3, 3, 3, 3, 3, 3,  3,  3,
	3, 3, 3, 3, 3, 3,  3,  3,
	3, 3, 3, 3, 3, 3,  3,  3,
	3, 3, 3, 3, 3, 3,  3,  6,
	3, 3, 3, 3, 3, 3,  6,  6,
	3, 3, 3, 3, 3, 6,  6,  9,
	3, 3, 3, 3, 6, 6,  9,  9,
	3, 3, 3, 6, 6, 9,  9,  10,
};
194
195static void quantize_intra(s16 *coeff, s16 *de_coeff, u16 qp)
196{
197	const int *quant = quant_table;
198	int i, j;
199
200	for (j = 0; j < 8; j++) {
201		for (i = 0; i < 8; i++, quant++, coeff++, de_coeff++) {
202			*coeff >>= *quant;
203			if (*coeff >= -qp && *coeff <= qp)
204				*coeff = *de_coeff = 0;
205			else
206				*de_coeff = *coeff << *quant;
207		}
208	}
209}
210
211static void dequantize_intra(s16 *coeff)
212{
213	const int *quant = quant_table;
214	int i, j;
215
216	for (j = 0; j < 8; j++)
217		for (i = 0; i < 8; i++, quant++, coeff++)
218			*coeff <<= *quant;
219}
220
221static void quantize_inter(s16 *coeff, s16 *de_coeff, u16 qp)
222{
223	const int *quant = quant_table_p;
224	int i, j;
225
226	for (j = 0; j < 8; j++) {
227		for (i = 0; i < 8; i++, quant++, coeff++, de_coeff++) {
228			*coeff >>= *quant;
229			if (*coeff >= -qp && *coeff <= qp)
230				*coeff = *de_coeff = 0;
231			else
232				*de_coeff = *coeff << *quant;
233		}
234	}
235}
236
237static void dequantize_inter(s16 *coeff)
238{
239	const int *quant = quant_table_p;
240	int i, j;
241
242	for (j = 0; j < 8; j++)
243		for (i = 0; i < 8; i++, quant++, coeff++)
244			*coeff <<= *quant;
245}
246
/*
 * Forward 8x8 Walsh-Hadamard transform of a block of u8 samples.
 *
 * @block:        points at the top-left sample of the 8x8 input block
 * @output_block: receives the 64 s16 coefficients in raster order
 * @stride:       distance in bytes between vertically adjacent samples
 * @input_step:   distance in bytes between horizontally adjacent samples
 *                (1, 2 or 3; any other value is handled as 4 — presumably
 *                for packed multi-component formats, confirm with callers)
 * @intra:        for intra blocks 256 is subtracted from each pairwise
 *                sum in the first stage, i.e. the samples are centered
 *                around 128 before transforming
 */
static void noinline_for_stack fwht(const u8 *block, s16 *output_block,
				    unsigned int stride,
				    unsigned int input_step, bool intra)
{
	/* we'll need more than 8 bits for the transformed coefficients */
	s32 workspace1[8], workspace2[8];
	const u8 *tmp = block;
	s16 *out = output_block;
	int add = intra ? 256 : 0;
	unsigned int i;

	/* horizontal pass: transform each of the 8 rows */
	/* stage 1 */
	for (i = 0; i < 8; i++, tmp += stride, out += 8) {
		switch (input_step) {
		case 1:
			workspace1[0]  = tmp[0] + tmp[1] - add;
			workspace1[1]  = tmp[0] - tmp[1];

			workspace1[2]  = tmp[2] + tmp[3] - add;
			workspace1[3]  = tmp[2] - tmp[3];

			workspace1[4]  = tmp[4] + tmp[5] - add;
			workspace1[5]  = tmp[4] - tmp[5];

			workspace1[6]  = tmp[6] + tmp[7] - add;
			workspace1[7]  = tmp[6] - tmp[7];
			break;
		case 2:
			workspace1[0]  = tmp[0] + tmp[2] - add;
			workspace1[1]  = tmp[0] - tmp[2];

			workspace1[2]  = tmp[4] + tmp[6] - add;
			workspace1[3]  = tmp[4] - tmp[6];

			workspace1[4]  = tmp[8] + tmp[10] - add;
			workspace1[5]  = tmp[8] - tmp[10];

			workspace1[6]  = tmp[12] + tmp[14] - add;
			workspace1[7]  = tmp[12] - tmp[14];
			break;
		case 3:
			workspace1[0]  = tmp[0] + tmp[3] - add;
			workspace1[1]  = tmp[0] - tmp[3];

			workspace1[2]  = tmp[6] + tmp[9] - add;
			workspace1[3]  = tmp[6] - tmp[9];

			workspace1[4]  = tmp[12] + tmp[15] - add;
			workspace1[5]  = tmp[12] - tmp[15];

			workspace1[6]  = tmp[18] + tmp[21] - add;
			workspace1[7]  = tmp[18] - tmp[21];
			break;
		default:
			workspace1[0]  = tmp[0] + tmp[4] - add;
			workspace1[1]  = tmp[0] - tmp[4];

			workspace1[2]  = tmp[8] + tmp[12] - add;
			workspace1[3]  = tmp[8] - tmp[12];

			workspace1[4]  = tmp[16] + tmp[20] - add;
			workspace1[5]  = tmp[16] - tmp[20];

			workspace1[6]  = tmp[24] + tmp[28] - add;
			workspace1[7]  = tmp[24] - tmp[28];
			break;
		}

		/* stage 2 */
		workspace2[0] = workspace1[0] + workspace1[2];
		workspace2[1] = workspace1[0] - workspace1[2];
		workspace2[2] = workspace1[1] - workspace1[3];
		workspace2[3] = workspace1[1] + workspace1[3];

		workspace2[4] = workspace1[4] + workspace1[6];
		workspace2[5] = workspace1[4] - workspace1[6];
		workspace2[6] = workspace1[5] - workspace1[7];
		workspace2[7] = workspace1[5] + workspace1[7];

		/* stage 3 */
		out[0] = workspace2[0] + workspace2[4];
		out[1] = workspace2[0] - workspace2[4];
		out[2] = workspace2[1] - workspace2[5];
		out[3] = workspace2[1] + workspace2[5];
		out[4] = workspace2[2] + workspace2[6];
		out[5] = workspace2[2] - workspace2[6];
		out[6] = workspace2[3] - workspace2[7];
		out[7] = workspace2[3] + workspace2[7];
	}

	out = output_block;

	/* vertical pass: transform each column of the intermediate result */
	for (i = 0; i < 8; i++, out++) {
		/* stage 1 */
		workspace1[0]  = out[0] + out[1 * 8];
		workspace1[1]  = out[0] - out[1 * 8];

		workspace1[2]  = out[2 * 8] + out[3 * 8];
		workspace1[3]  = out[2 * 8] - out[3 * 8];

		workspace1[4]  = out[4 * 8] + out[5 * 8];
		workspace1[5]  = out[4 * 8] - out[5 * 8];

		workspace1[6]  = out[6 * 8] + out[7 * 8];
		workspace1[7]  = out[6 * 8] - out[7 * 8];

		/* stage 2 */
		workspace2[0] = workspace1[0] + workspace1[2];
		workspace2[1] = workspace1[0] - workspace1[2];
		workspace2[2] = workspace1[1] - workspace1[3];
		workspace2[3] = workspace1[1] + workspace1[3];

		workspace2[4] = workspace1[4] + workspace1[6];
		workspace2[5] = workspace1[4] - workspace1[6];
		workspace2[6] = workspace1[5] - workspace1[7];
		workspace2[7] = workspace1[5] + workspace1[7];
		/* stage 3 */
		out[0 * 8] = workspace2[0] + workspace2[4];
		out[1 * 8] = workspace2[0] - workspace2[4];
		out[2 * 8] = workspace2[1] - workspace2[5];
		out[3 * 8] = workspace2[1] + workspace2[5];
		out[4 * 8] = workspace2[2] + workspace2[6];
		out[5 * 8] = workspace2[2] - workspace2[6];
		out[6 * 8] = workspace2[3] - workspace2[7];
		out[7 * 8] = workspace2[3] + workspace2[7];
	}
}
374
/*
 * Not the nicest way of doing it, but P-blocks get twice the range of
 * that of the I-blocks. Therefore we need a type bigger than 8 bits.
 * Furthermore values can be negative... This is just a version that
 * works with 16 signed data
 *
 * @block:        top-left element of the 8x8 s16 input block
 * @output_block: receives the 64 s16 coefficients in raster order
 * @stride:       distance in s16 elements between vertically adjacent
 *                input elements
 * @intra:        currently unused in this variant
 */
static void noinline_for_stack
fwht16(const s16 *block, s16 *output_block, int stride, int intra)
{
	/* we'll need more than 8 bits for the transformed coefficients */
	s32 workspace1[8], workspace2[8];
	const s16 *tmp = block;
	s16 *out = output_block;
	int i;

	/* horizontal pass: transform each of the 8 rows */
	for (i = 0; i < 8; i++, tmp += stride, out += 8) {
		/* stage 1 */
		workspace1[0]  = tmp[0] + tmp[1];
		workspace1[1]  = tmp[0] - tmp[1];

		workspace1[2]  = tmp[2] + tmp[3];
		workspace1[3]  = tmp[2] - tmp[3];

		workspace1[4]  = tmp[4] + tmp[5];
		workspace1[5]  = tmp[4] - tmp[5];

		workspace1[6]  = tmp[6] + tmp[7];
		workspace1[7]  = tmp[6] - tmp[7];

		/* stage 2 */
		workspace2[0] = workspace1[0] + workspace1[2];
		workspace2[1] = workspace1[0] - workspace1[2];
		workspace2[2] = workspace1[1] - workspace1[3];
		workspace2[3] = workspace1[1] + workspace1[3];

		workspace2[4] = workspace1[4] + workspace1[6];
		workspace2[5] = workspace1[4] - workspace1[6];
		workspace2[6] = workspace1[5] - workspace1[7];
		workspace2[7] = workspace1[5] + workspace1[7];

		/* stage 3 */
		out[0] = workspace2[0] + workspace2[4];
		out[1] = workspace2[0] - workspace2[4];
		out[2] = workspace2[1] - workspace2[5];
		out[3] = workspace2[1] + workspace2[5];
		out[4] = workspace2[2] + workspace2[6];
		out[5] = workspace2[2] - workspace2[6];
		out[6] = workspace2[3] - workspace2[7];
		out[7] = workspace2[3] + workspace2[7];
	}

	out = output_block;

	/* vertical pass: transform each column of the intermediate result */
	for (i = 0; i < 8; i++, out++) {
		/* stage 1 */
		workspace1[0]  = out[0] + out[1*8];
		workspace1[1]  = out[0] - out[1*8];

		workspace1[2]  = out[2*8] + out[3*8];
		workspace1[3]  = out[2*8] - out[3*8];

		workspace1[4]  = out[4*8] + out[5*8];
		workspace1[5]  = out[4*8] - out[5*8];

		workspace1[6]  = out[6*8] + out[7*8];
		workspace1[7]  = out[6*8] - out[7*8];

		/* stage 2 */
		workspace2[0] = workspace1[0] + workspace1[2];
		workspace2[1] = workspace1[0] - workspace1[2];
		workspace2[2] = workspace1[1] - workspace1[3];
		workspace2[3] = workspace1[1] + workspace1[3];

		workspace2[4] = workspace1[4] + workspace1[6];
		workspace2[5] = workspace1[4] - workspace1[6];
		workspace2[6] = workspace1[5] - workspace1[7];
		workspace2[7] = workspace1[5] + workspace1[7];

		/* stage 3 */
		out[0*8] = workspace2[0] + workspace2[4];
		out[1*8] = workspace2[0] - workspace2[4];
		out[2*8] = workspace2[1] - workspace2[5];
		out[3*8] = workspace2[1] + workspace2[5];
		out[4*8] = workspace2[2] + workspace2[6];
		out[5*8] = workspace2[2] - workspace2[6];
		out[6*8] = workspace2[3] - workspace2[7];
		out[7*8] = workspace2[3] + workspace2[7];
	}
}
464
/*
 * Inverse 8x8 Walsh-Hadamard transform.
 *
 * @block:        64 s16 coefficients in raster order
 * @output_block: receives the 64 reconstructed s16 samples
 * @intra:        non-zero for intra blocks; in that case 128 is added
 *                back to every output sample (undoing the centering
 *                done in fwht()).  Both paths scale the result down by
 *                2^6 (arithmetic >> 6), the normalization for the two
 *                8-point passes.
 */
static noinline_for_stack void
ifwht(const s16 *block, s16 *output_block, int intra)
{
	/*
	 * we'll need more than 8 bits for the transformed coefficients
	 * use native unit of cpu
	 */
	int workspace1[8], workspace2[8];
	int inter = intra ? 0 : 1;
	const s16 *tmp = block;
	s16 *out = output_block;
	int i;

	/* horizontal pass: inverse-transform each of the 8 rows */
	for (i = 0; i < 8; i++, tmp += 8, out += 8) {
		/* stage 1 */
		workspace1[0]  = tmp[0] + tmp[1];
		workspace1[1]  = tmp[0] - tmp[1];

		workspace1[2]  = tmp[2] + tmp[3];
		workspace1[3]  = tmp[2] - tmp[3];

		workspace1[4]  = tmp[4] + tmp[5];
		workspace1[5]  = tmp[4] - tmp[5];

		workspace1[6]  = tmp[6] + tmp[7];
		workspace1[7]  = tmp[6] - tmp[7];

		/* stage 2 */
		workspace2[0] = workspace1[0] + workspace1[2];
		workspace2[1] = workspace1[0] - workspace1[2];
		workspace2[2] = workspace1[1] - workspace1[3];
		workspace2[3] = workspace1[1] + workspace1[3];

		workspace2[4] = workspace1[4] + workspace1[6];
		workspace2[5] = workspace1[4] - workspace1[6];
		workspace2[6] = workspace1[5] - workspace1[7];
		workspace2[7] = workspace1[5] + workspace1[7];

		/* stage 3 */
		out[0] = workspace2[0] + workspace2[4];
		out[1] = workspace2[0] - workspace2[4];
		out[2] = workspace2[1] - workspace2[5];
		out[3] = workspace2[1] + workspace2[5];
		out[4] = workspace2[2] + workspace2[6];
		out[5] = workspace2[2] - workspace2[6];
		out[6] = workspace2[3] - workspace2[7];
		out[7] = workspace2[3] + workspace2[7];
	}

	out = output_block;

	/* vertical pass: inverse-transform each column, then scale */
	for (i = 0; i < 8; i++, out++) {
		/* stage 1 */
		workspace1[0]  = out[0] + out[1 * 8];
		workspace1[1]  = out[0] - out[1 * 8];

		workspace1[2]  = out[2 * 8] + out[3 * 8];
		workspace1[3]  = out[2 * 8] - out[3 * 8];

		workspace1[4]  = out[4 * 8] + out[5 * 8];
		workspace1[5]  = out[4 * 8] - out[5 * 8];

		workspace1[6]  = out[6 * 8] + out[7 * 8];
		workspace1[7]  = out[6 * 8] - out[7 * 8];

		/* stage 2 */
		workspace2[0] = workspace1[0] + workspace1[2];
		workspace2[1] = workspace1[0] - workspace1[2];
		workspace2[2] = workspace1[1] - workspace1[3];
		workspace2[3] = workspace1[1] + workspace1[3];

		workspace2[4] = workspace1[4] + workspace1[6];
		workspace2[5] = workspace1[4] - workspace1[6];
		workspace2[6] = workspace1[5] - workspace1[7];
		workspace2[7] = workspace1[5] + workspace1[7];

		/* stage 3 */
		if (inter) {
			int d;

			out[0 * 8] = workspace2[0] + workspace2[4];
			out[1 * 8] = workspace2[0] - workspace2[4];
			out[2 * 8] = workspace2[1] - workspace2[5];
			out[3 * 8] = workspace2[1] + workspace2[5];
			out[4 * 8] = workspace2[2] + workspace2[6];
			out[5 * 8] = workspace2[2] - workspace2[6];
			out[6 * 8] = workspace2[3] - workspace2[7];
			out[7 * 8] = workspace2[3] + workspace2[7];

			/* normalize only: deltas stay centered around 0 */
			for (d = 0; d < 8; d++)
				out[8 * d] >>= 6;
		} else {
			int d;

			out[0 * 8] = workspace2[0] + workspace2[4];
			out[1 * 8] = workspace2[0] - workspace2[4];
			out[2 * 8] = workspace2[1] - workspace2[5];
			out[3 * 8] = workspace2[1] + workspace2[5];
			out[4 * 8] = workspace2[2] + workspace2[6];
			out[5 * 8] = workspace2[2] - workspace2[6];
			out[6 * 8] = workspace2[3] - workspace2[7];
			out[7 * 8] = workspace2[3] + workspace2[7];

			/* normalize and restore the 128 offset for intra */
			for (d = 0; d < 8; d++) {
				out[8 * d] >>= 6;
				out[8 * d] += 128;
			}
		}
	}
}
575
576static void fill_encoder_block(const u8 *input, s16 *dst,
577			       unsigned int stride, unsigned int input_step)
578{
579	int i, j;
580
581	for (i = 0; i < 8; i++) {
582		for (j = 0; j < 8; j++, input += input_step)
583			*dst++ = *input;
584		input += stride - 8 * input_step;
585	}
586}
587
588static int var_intra(const s16 *input)
589{
590	int32_t mean = 0;
591	int32_t ret = 0;
592	const s16 *tmp = input;
593	int i;
594
595	for (i = 0; i < 8 * 8; i++, tmp++)
596		mean += *tmp;
597	mean /= 64;
598	tmp = input;
599	for (i = 0; i < 8 * 8; i++, tmp++)
600		ret += (*tmp - mean) < 0 ? -(*tmp - mean) : (*tmp - mean);
601	return ret;
602}
603
604static int var_inter(const s16 *old, const s16 *new)
605{
606	int32_t ret = 0;
607	int i;
608
609	for (i = 0; i < 8 * 8; i++, old++, new++)
610		ret += (*old - *new) < 0 ? -(*old - *new) : (*old - *new);
611	return ret;
612}
613
614static noinline_for_stack int
615decide_blocktype(const u8 *cur, const u8 *reference, s16 *deltablock,
616		 unsigned int stride, unsigned int input_step)
617{
618	s16 tmp[64];
619	s16 old[64];
620	s16 *work = tmp;
621	unsigned int k, l;
622	int vari;
623	int vard;
624
625	fill_encoder_block(cur, tmp, stride, input_step);
626	fill_encoder_block(reference, old, 8, 1);
627	vari = var_intra(tmp);
628
629	for (k = 0; k < 8; k++) {
630		for (l = 0; l < 8; l++) {
631			*deltablock = *work - *reference;
632			deltablock++;
633			work++;
634			reference++;
635		}
636	}
637	deltablock -= 64;
638	vard = var_inter(old, tmp);
639	return vari <= vard ? IBLOCK : PBLOCK;
640}
641
642static void fill_decoder_block(u8 *dst, const s16 *input, int stride,
643			       unsigned int dst_step)
644{
645	int i, j;
646
647	for (i = 0; i < 8; i++) {
648		for (j = 0; j < 8; j++, input++, dst += dst_step) {
649			if (*input < 0)
650				*dst = 0;
651			else if (*input > 255)
652				*dst = 255;
653			else
654				*dst = *input;
655		}
656		dst += stride - (8 * dst_step);
657	}
658}
659
660static void add_deltas(s16 *deltas, const u8 *ref, int stride,
661		       unsigned int ref_step)
662{
663	int k, l;
664
665	for (k = 0; k < 8; k++) {
666		for (l = 0; l < 8; l++) {
667			*deltas += *ref;
668			ref += ref_step;
669			/*
670			 * Due to quantizing, it might possible that the
671			 * decoded coefficients are slightly out of range
672			 */
673			if (*deltas < 0)
674				*deltas = 0;
675			else if (*deltas > 255)
676				*deltas = 255;
677			deltas++;
678		}
679		ref += stride - (8 * ref_step);
680	}
681}
682
/*
 * Encode one plane as a sequence of RLC'd 8x8 blocks, writing the
 * compressed words through *rlco (which is advanced).
 *
 * Returns a mask of FWHT_FRAME_* flags: FWHT_FRAME_PCODED if at least
 * one block was inter (P) coded, FWHT_FRAME_UNENCODED if the compressed
 * data would not fit before rlco_max — in that case the plane is stored
 * as raw samples instead (with 0xff escaped to 0xfe).
 */
static u32 encode_plane(u8 *input, u8 *refp, __be16 **rlco, __be16 *rlco_max,
			struct fwht_cframe *cf, u32 height, u32 width,
			u32 stride, unsigned int input_step,
			bool is_intra, bool next_is_intra)
{
	u8 *input_start = input;
	__be16 *rlco_start = *rlco;
	s16 deltablock[64];
	__be16 pframe_bit = htons(PFRAME_BIT);
	u32 encoding = 0;
	unsigned int last_size = 0;
	unsigned int i, j;

	/* dimensions are processed in whole 8x8 macroblocks */
	width = round_up(width, 8);
	height = round_up(height, 8);

	for (j = 0; j < height / 8; j++) {
		input = input_start + j * 8 * stride;
		for (i = 0; i < width / 8; i++) {
			/* intra code, first frame is always intra coded. */
			int blocktype = IBLOCK;
			unsigned int size;

			if (!is_intra)
				blocktype = decide_blocktype(input, refp,
					deltablock, stride, input_step);
			if (blocktype == IBLOCK) {
				fwht(input, cf->coeffs, stride, input_step, 1);
				quantize_intra(cf->coeffs, cf->de_coeffs,
					       cf->i_frame_qp);
			} else {
				/* inter code */
				encoding |= FWHT_FRAME_PCODED;
				fwht16(deltablock, cf->coeffs, 8, 0);
				quantize_inter(cf->coeffs, cf->de_coeffs,
					       cf->p_frame_qp);
			}
			/*
			 * Reconstruct the block into the reference frame,
			 * but only if the next frame can actually use it
			 * (an intra frame ignores the reference).
			 */
			if (!next_is_intra) {
				ifwht(cf->de_coeffs, cf->de_fwht, blocktype);

				if (blocktype == PBLOCK)
					add_deltas(cf->de_fwht, refp, 8, 1);
				fill_decoder_block(refp, cf->de_fwht, 8, 1);
			}

			input += 8 * input_step;
			refp += 8 * 8;

			size = rlc(cf->coeffs, *rlco, blocktype);
			/*
			 * Duplicate-block merging: if this block's RLC
			 * output has the same size and payload as the
			 * previous block's, bump the duplicate counter
			 * (DUPS_MASK bits, stored shifted left by one so
			 * header bit 0 stays 0) in the previous header
			 * instead of emitting the block again.
			 */
			if (last_size == size &&
			    !memcmp(*rlco + 1, *rlco - size + 1, 2 * size - 2)) {
				__be16 *last_rlco = *rlco - size;
				s16 hdr = ntohs(*last_rlco);

				if (!((*last_rlco ^ **rlco) & pframe_bit) &&
				    (hdr & DUPS_MASK) < DUPS_MASK)
					*last_rlco = htons(hdr + 2);
				else
					*rlco += size;
			} else {
				*rlco += size;
			}
			if (*rlco >= rlco_max) {
				encoding |= FWHT_FRAME_UNENCODED;
				goto exit_loop;
			}
			last_size = size;
		}
	}

exit_loop:
	if (encoding & FWHT_FRAME_UNENCODED) {
		u8 *out = (u8 *)rlco_start;
		u8 *p;

		input = input_start;
		/*
		 * The compressed stream should never contain the magic
		 * header, so when we copy the YUV data we replace 0xff
		 * by 0xfe. Since YUV is limited range such values
		 * shouldn't appear anyway.
		 */
		for (j = 0; j < height; j++) {
			for (i = 0, p = input; i < width; i++, p += input_step)
				*out++ = (*p == 0xff) ? 0xfe : *p;
			input += stride;
		}
		*rlco = (__be16 *)out;
		encoding &= ~FWHT_FRAME_PCODED;
	}
	return encoding;
}
775
/*
 * Encode a raw frame into cf->rlc_data, plane by plane (luma, then
 * Cb/Cr if present, then alpha if present).
 *
 * Returns a mask of FWHT_* flags; the per-plane FWHT_FRAME_UNENCODED
 * result of encode_plane() is translated into the plane-specific
 * FWHT_{LUMA,CB,CR,ALPHA}_UNENCODED bit and then cleared, so only
 * plane-specific bits (plus FWHT_FRAME_PCODED) survive in the result.
 * On return cf->size holds the number of bytes written.
 */
u32 fwht_encode_frame(struct fwht_raw_frame *frm,
		      struct fwht_raw_frame *ref_frm,
		      struct fwht_cframe *cf,
		      bool is_intra, bool next_is_intra,
		      unsigned int width, unsigned int height,
		      unsigned int stride, unsigned int chroma_stride)
{
	unsigned int size = height * width;
	__be16 *rlco = cf->rlc_data;
	__be16 *rlco_max;
	u32 encoding;

	/*
	 * Leave some headroom before the end of the buffer; encode_plane()
	 * checks against rlco_max after each macroblock.
	 */
	rlco_max = rlco + size / 2 - 256;
	encoding = encode_plane(frm->luma, ref_frm->luma, &rlco, rlco_max, cf,
				height, width, stride,
				frm->luma_alpha_step, is_intra, next_is_intra);
	if (encoding & FWHT_FRAME_UNENCODED)
		encoding |= FWHT_LUMA_UNENCODED;
	encoding &= ~FWHT_FRAME_UNENCODED;

	if (frm->components_num >= 3) {
		/* chroma planes may be subsampled by width_div/height_div */
		u32 chroma_h = height / frm->height_div;
		u32 chroma_w = width / frm->width_div;
		unsigned int chroma_size = chroma_h * chroma_w;

		rlco_max = rlco + chroma_size / 2 - 256;
		encoding |= encode_plane(frm->cb, ref_frm->cb, &rlco, rlco_max,
					 cf, chroma_h, chroma_w,
					 chroma_stride, frm->chroma_step,
					 is_intra, next_is_intra);
		if (encoding & FWHT_FRAME_UNENCODED)
			encoding |= FWHT_CB_UNENCODED;
		encoding &= ~FWHT_FRAME_UNENCODED;
		rlco_max = rlco + chroma_size / 2 - 256;
		encoding |= encode_plane(frm->cr, ref_frm->cr, &rlco, rlco_max,
					 cf, chroma_h, chroma_w,
					 chroma_stride, frm->chroma_step,
					 is_intra, next_is_intra);
		if (encoding & FWHT_FRAME_UNENCODED)
			encoding |= FWHT_CR_UNENCODED;
		encoding &= ~FWHT_FRAME_UNENCODED;
	}

	if (frm->components_num == 4) {
		rlco_max = rlco + size / 2 - 256;
		encoding |= encode_plane(frm->alpha, ref_frm->alpha, &rlco,
					 rlco_max, cf, height, width,
					 stride, frm->luma_alpha_step,
					 is_intra, next_is_intra);
		if (encoding & FWHT_FRAME_UNENCODED)
			encoding |= FWHT_ALPHA_UNENCODED;
		encoding &= ~FWHT_FRAME_UNENCODED;
	}

	cf->size = (rlco - cf->rlc_data) * sizeof(*rlco);
	return encoding;
}
833
/*
 * Decode one plane from the compressed stream *rlco into dst.
 *
 * @ref is NULL for intra-only decoding; otherwise P-coded blocks add
 * their deltas to the corresponding reference block.  If @uncompressed
 * the plane is a raw sample copy instead of RLC data.
 *
 * Returns false if the input is malformed (reading would run past
 * end_of_rlco_buf), true on success.  *rlco is advanced past the
 * consumed data.
 */
static bool decode_plane(struct fwht_cframe *cf, const __be16 **rlco,
			 u32 height, u32 width, const u8 *ref, u32 ref_stride,
			 unsigned int ref_step, u8 *dst,
			 unsigned int dst_stride, unsigned int dst_step,
			 bool uncompressed, const __be16 *end_of_rlco_buf)
{
	unsigned int copies = 0;
	s16 copy[8 * 8];
	u16 stat;
	unsigned int i, j;
	bool is_intra = !ref;

	width = round_up(width, 8);
	height = round_up(height, 8);

	if (uncompressed) {
		int i;

		/* bounds check the raw copy before touching the buffer */
		if (end_of_rlco_buf + 1 < *rlco + width * height / 2)
			return false;
		for (i = 0; i < height; i++) {
			memcpy(dst, *rlco, width);
			dst += dst_stride;
			*rlco += width / 2;
		}
		return true;
	}

	/*
	 * When decoding each macroblock the rlco pointer will be increased
	 * by 65 * 2 bytes worst-case.
	 * To avoid overflow the buffer has to be 65/64th of the actual raw
	 * image size, just in case someone feeds it malicious data.
	 */
	for (j = 0; j < height / 8; j++) {
		for (i = 0; i < width / 8; i++) {
			const u8 *refp = ref + j * 8 * ref_stride +
				i * 8 * ref_step;
			u8 *dstp = dst + j * 8 * dst_stride + i * 8 * dst_step;

			/*
			 * Replay a previously decoded duplicate block
			 * (see the DUPS_MASK count in the block header).
			 */
			if (copies) {
				memcpy(cf->de_fwht, copy, sizeof(copy));
				if ((stat & PFRAME_BIT) && !is_intra)
					add_deltas(cf->de_fwht, refp,
						   ref_stride, ref_step);
				fill_decoder_block(dstp, cf->de_fwht,
						   dst_stride, dst_step);
				copies--;
				continue;
			}

			stat = derlc(rlco, cf->coeffs, end_of_rlco_buf);
			if (stat & OVERFLOW_BIT)
				return false;
			if ((stat & PFRAME_BIT) && !is_intra)
				dequantize_inter(cf->coeffs);
			else
				dequantize_intra(cf->coeffs);

			ifwht(cf->coeffs, cf->de_fwht,
			      ((stat & PFRAME_BIT) && !is_intra) ? 0 : 1);

			/*
			 * Save the pre-delta decoded block so duplicates
			 * can re-add their own reference deltas.
			 */
			copies = (stat & DUPS_MASK) >> 1;
			if (copies)
				memcpy(copy, cf->de_fwht, sizeof(copy));
			if ((stat & PFRAME_BIT) && !is_intra)
				add_deltas(cf->de_fwht, refp,
					   ref_stride, ref_step);
			fill_decoder_block(dstp, cf->de_fwht, dst_stride,
					   dst_step);
		}
	}
	return true;
}
908
/*
 * Decode a compressed frame, plane by plane (luma, then Cb/Cr if
 * components_num >= 3, then alpha if components_num == 4).
 *
 * hdr_flags selects per-plane uncompressed mode and the chroma
 * subsampling (full width/height or halved).
 *
 * Returns false as soon as any plane fails to decode, true on success.
 */
bool fwht_decode_frame(struct fwht_cframe *cf, u32 hdr_flags,
		       unsigned int components_num, unsigned int width,
		       unsigned int height, const struct fwht_raw_frame *ref,
		       unsigned int ref_stride, unsigned int ref_chroma_stride,
		       struct fwht_raw_frame *dst, unsigned int dst_stride,
		       unsigned int dst_chroma_stride)
{
	const __be16 *rlco = cf->rlc_data;
	/* last valid 16-bit word of the compressed buffer */
	const __be16 *end_of_rlco_buf = cf->rlc_data +
			(cf->size / sizeof(*rlco)) - 1;

	if (!decode_plane(cf, &rlco, height, width, ref->luma, ref_stride,
			  ref->luma_alpha_step, dst->luma, dst_stride,
			  dst->luma_alpha_step,
			  hdr_flags & FWHT_FL_LUMA_IS_UNCOMPRESSED,
			  end_of_rlco_buf))
		return false;

	if (components_num >= 3) {
		u32 h = height;
		u32 w = width;

		/* chroma planes may be subsampled in either dimension */
		if (!(hdr_flags & FWHT_FL_CHROMA_FULL_HEIGHT))
			h /= 2;
		if (!(hdr_flags & FWHT_FL_CHROMA_FULL_WIDTH))
			w /= 2;

		if (!decode_plane(cf, &rlco, h, w, ref->cb, ref_chroma_stride,
				  ref->chroma_step, dst->cb, dst_chroma_stride,
				  dst->chroma_step,
				  hdr_flags & FWHT_FL_CB_IS_UNCOMPRESSED,
				  end_of_rlco_buf))
			return false;
		if (!decode_plane(cf, &rlco, h, w, ref->cr, ref_chroma_stride,
				  ref->chroma_step, dst->cr, dst_chroma_stride,
				  dst->chroma_step,
				  hdr_flags & FWHT_FL_CR_IS_UNCOMPRESSED,
				  end_of_rlco_buf))
			return false;
	}

	if (components_num == 4)
		if (!decode_plane(cf, &rlco, height, width, ref->alpha, ref_stride,
				  ref->luma_alpha_step, dst->alpha, dst_stride,
				  dst->luma_alpha_step,
				  hdr_flags & FWHT_FL_ALPHA_IS_UNCOMPRESSED,
				  end_of_rlco_buf))
			return false;
	return true;
}
959