162306a36Sopenharmony_ci// SPDX-License-Identifier: LGPL-2.1+
262306a36Sopenharmony_ci/*
362306a36Sopenharmony_ci * Copyright 2016 Tom aan de Wiel
462306a36Sopenharmony_ci * Copyright 2018 Cisco Systems, Inc. and/or its affiliates. All rights reserved.
562306a36Sopenharmony_ci *
662306a36Sopenharmony_ci * 8x8 Fast Walsh Hadamard Transform in sequency order based on the paper:
762306a36Sopenharmony_ci *
862306a36Sopenharmony_ci * A Recursive Algorithm for Sequency-Ordered Fast Walsh Transforms,
962306a36Sopenharmony_ci * R.D. Brown, 1977
1062306a36Sopenharmony_ci */
1162306a36Sopenharmony_ci
1262306a36Sopenharmony_ci#include <linux/string.h>
1362306a36Sopenharmony_ci#include <linux/kernel.h>
1462306a36Sopenharmony_ci#include <linux/videodev2.h>
1562306a36Sopenharmony_ci#include "codec-fwht.h"
1662306a36Sopenharmony_ci
/* Returned by derlc() instead of a block header when the input runs out. */
#define OVERFLOW_BIT BIT(14)

/*
 * Note: bit 0 of the header must always be 0. Otherwise it cannot
 * be guaranteed that the magic 8 byte sequence (see below) can
 * never occur in the rlc output.
 */
/* Set by rlc() in the header word of a block encoded as PBLOCK. */
#define PFRAME_BIT BIT(15)
/*
 * NOTE(review): not referenced in this part of the file; presumably masks
 * a duplicate-block count stored in the header word - confirm at the users.
 */
#define DUPS_MASK 0x1ffe

/* Block types: accepted by rlc() and returned by decide_blocktype(). */
#define PBLOCK 0
#define IBLOCK 1

/*
 * Run-length value that means "every remaining coefficient of this block
 * is zero" (written by rlc(), interpreted by derlc()).
 */
#define ALL_ZEROS 15
3162306a36Sopenharmony_ci
/*
 * Zigzag scan order for an 8x8 block: zigzag[i] is the row-major index
 * (x + y * 8) of the i-th coefficient visited. Each line of the
 * initializer lists one anti-diagonal of the block. Used by rlc() to
 * order coefficients before run-length coding and by derlc() to put
 * them back.
 */
static const uint8_t zigzag[64] = {
	0,
	1,  8,
	2,  9, 16,
	3, 10, 17, 24,
	4, 11, 18, 25, 32,
	5, 12, 19, 26, 33, 40,
	6, 13, 20, 27, 34, 41, 48,
	7, 14, 21, 28, 35, 42, 49, 56,
	15, 22, 29, 36, 43, 50, 57,
	23, 30, 37, 44, 51, 58,
	31, 38, 45, 52, 59,
	39, 46, 53, 60,
	47, 54, 61,
	55, 62,
	63,
};
4962306a36Sopenharmony_ci
5062306a36Sopenharmony_ci/*
5162306a36Sopenharmony_ci * noinline_for_stack to work around
5262306a36Sopenharmony_ci * https://bugs.llvm.org/show_bug.cgi?id=38809
5362306a36Sopenharmony_ci */
5462306a36Sopenharmony_cistatic int noinline_for_stack
5562306a36Sopenharmony_cirlc(const s16 *in, __be16 *output, int blocktype)
5662306a36Sopenharmony_ci{
5762306a36Sopenharmony_ci	s16 block[8 * 8];
5862306a36Sopenharmony_ci	s16 *wp = block;
5962306a36Sopenharmony_ci	int i = 0;
6062306a36Sopenharmony_ci	int x, y;
6162306a36Sopenharmony_ci	int ret = 0;
6262306a36Sopenharmony_ci
6362306a36Sopenharmony_ci	/* read in block from framebuffer */
6462306a36Sopenharmony_ci	int lastzero_run = 0;
6562306a36Sopenharmony_ci	int to_encode;
6662306a36Sopenharmony_ci
6762306a36Sopenharmony_ci	for (y = 0; y < 8; y++) {
6862306a36Sopenharmony_ci		for (x = 0; x < 8; x++) {
6962306a36Sopenharmony_ci			*wp = in[x + y * 8];
7062306a36Sopenharmony_ci			wp++;
7162306a36Sopenharmony_ci		}
7262306a36Sopenharmony_ci	}
7362306a36Sopenharmony_ci
7462306a36Sopenharmony_ci	/* keep track of amount of trailing zeros */
7562306a36Sopenharmony_ci	for (i = 63; i >= 0 && !block[zigzag[i]]; i--)
7662306a36Sopenharmony_ci		lastzero_run++;
7762306a36Sopenharmony_ci
7862306a36Sopenharmony_ci	*output++ = (blocktype == PBLOCK ? htons(PFRAME_BIT) : 0);
7962306a36Sopenharmony_ci	ret++;
8062306a36Sopenharmony_ci
8162306a36Sopenharmony_ci	to_encode = 8 * 8 - (lastzero_run > 14 ? lastzero_run : 0);
8262306a36Sopenharmony_ci
8362306a36Sopenharmony_ci	i = 0;
8462306a36Sopenharmony_ci	while (i < to_encode) {
8562306a36Sopenharmony_ci		int cnt = 0;
8662306a36Sopenharmony_ci		int tmp;
8762306a36Sopenharmony_ci
8862306a36Sopenharmony_ci		/* count leading zeros */
8962306a36Sopenharmony_ci		while ((tmp = block[zigzag[i]]) == 0 && cnt < 14) {
9062306a36Sopenharmony_ci			cnt++;
9162306a36Sopenharmony_ci			i++;
9262306a36Sopenharmony_ci			if (i == to_encode) {
9362306a36Sopenharmony_ci				cnt--;
9462306a36Sopenharmony_ci				break;
9562306a36Sopenharmony_ci			}
9662306a36Sopenharmony_ci		}
9762306a36Sopenharmony_ci		/* 4 bits for run, 12 for coefficient (quantization by 4) */
9862306a36Sopenharmony_ci		*output++ = htons((cnt | tmp << 4));
9962306a36Sopenharmony_ci		i++;
10062306a36Sopenharmony_ci		ret++;
10162306a36Sopenharmony_ci	}
10262306a36Sopenharmony_ci	if (lastzero_run > 14) {
10362306a36Sopenharmony_ci		*output = htons(ALL_ZEROS | 0);
10462306a36Sopenharmony_ci		ret++;
10562306a36Sopenharmony_ci	}
10662306a36Sopenharmony_ci
10762306a36Sopenharmony_ci	return ret;
10862306a36Sopenharmony_ci}
10962306a36Sopenharmony_ci
11062306a36Sopenharmony_ci/*
11162306a36Sopenharmony_ci * This function will worst-case increase rlc_in by 65*2 bytes:
11262306a36Sopenharmony_ci * one s16 value for the header and 8 * 8 coefficients of type s16.
11362306a36Sopenharmony_ci */
11462306a36Sopenharmony_cistatic noinline_for_stack u16
11562306a36Sopenharmony_ciderlc(const __be16 **rlc_in, s16 *dwht_out, const __be16 *end_of_input)
11662306a36Sopenharmony_ci{
11762306a36Sopenharmony_ci	/* header */
11862306a36Sopenharmony_ci	const __be16 *input = *rlc_in;
11962306a36Sopenharmony_ci	u16 stat;
12062306a36Sopenharmony_ci	int dec_count = 0;
12162306a36Sopenharmony_ci	s16 block[8 * 8 + 16];
12262306a36Sopenharmony_ci	s16 *wp = block;
12362306a36Sopenharmony_ci	int i;
12462306a36Sopenharmony_ci
12562306a36Sopenharmony_ci	if (input > end_of_input)
12662306a36Sopenharmony_ci		return OVERFLOW_BIT;
12762306a36Sopenharmony_ci	stat = ntohs(*input++);
12862306a36Sopenharmony_ci
12962306a36Sopenharmony_ci	/*
13062306a36Sopenharmony_ci	 * Now de-compress, it expands one byte to up to 15 bytes
13162306a36Sopenharmony_ci	 * (or fills the remainder of the 64 bytes with zeroes if it
13262306a36Sopenharmony_ci	 * is the last byte to expand).
13362306a36Sopenharmony_ci	 *
13462306a36Sopenharmony_ci	 * So block has to be 8 * 8 + 16 bytes, the '+ 16' is to
13562306a36Sopenharmony_ci	 * allow for overflow if the incoming data was malformed.
13662306a36Sopenharmony_ci	 */
13762306a36Sopenharmony_ci	while (dec_count < 8 * 8) {
13862306a36Sopenharmony_ci		s16 in;
13962306a36Sopenharmony_ci		int length;
14062306a36Sopenharmony_ci		int coeff;
14162306a36Sopenharmony_ci
14262306a36Sopenharmony_ci		if (input > end_of_input)
14362306a36Sopenharmony_ci			return OVERFLOW_BIT;
14462306a36Sopenharmony_ci		in = ntohs(*input++);
14562306a36Sopenharmony_ci		length = in & 0xf;
14662306a36Sopenharmony_ci		coeff = in >> 4;
14762306a36Sopenharmony_ci
14862306a36Sopenharmony_ci		/* fill remainder with zeros */
14962306a36Sopenharmony_ci		if (length == 15) {
15062306a36Sopenharmony_ci			for (i = 0; i < 64 - dec_count; i++)
15162306a36Sopenharmony_ci				*wp++ = 0;
15262306a36Sopenharmony_ci			break;
15362306a36Sopenharmony_ci		}
15462306a36Sopenharmony_ci
15562306a36Sopenharmony_ci		for (i = 0; i < length; i++)
15662306a36Sopenharmony_ci			*wp++ = 0;
15762306a36Sopenharmony_ci		*wp++ = coeff;
15862306a36Sopenharmony_ci		dec_count += length + 1;
15962306a36Sopenharmony_ci	}
16062306a36Sopenharmony_ci
16162306a36Sopenharmony_ci	wp = block;
16262306a36Sopenharmony_ci
16362306a36Sopenharmony_ci	for (i = 0; i < 64; i++) {
16462306a36Sopenharmony_ci		int pos = zigzag[i];
16562306a36Sopenharmony_ci		int y = pos / 8;
16662306a36Sopenharmony_ci		int x = pos % 8;
16762306a36Sopenharmony_ci
16862306a36Sopenharmony_ci		dwht_out[x + y * 8] = *wp++;
16962306a36Sopenharmony_ci	}
17062306a36Sopenharmony_ci	*rlc_in = input;
17162306a36Sopenharmony_ci	return stat;
17262306a36Sopenharmony_ci}
17362306a36Sopenharmony_ci
/*
 * Per-coefficient shift counts for an 8x8 block, row-major.
 * quantize_intra() shifts each coefficient right by its entry and
 * dequantize_intra() shifts it back left by the same amount.
 */
static const int quant_table[] = {
	2, 2, 2, 2, 2, 2,  2,  2,
	2, 2, 2, 2, 2, 2,  2,  2,
	2, 2, 2, 2, 2, 2,  2,  3,
	2, 2, 2, 2, 2, 2,  3,  6,
	2, 2, 2, 2, 2, 3,  6,  6,
	2, 2, 2, 2, 3, 6,  6,  6,
	2, 2, 2, 3, 6, 6,  6,  6,
	2, 2, 3, 6, 6, 6,  6,  8,
};
18462306a36Sopenharmony_ci
/*
 * Per-coefficient shift counts for an 8x8 block, row-major.
 * quantize_inter() shifts each coefficient right by its entry and
 * dequantize_inter() shifts it back left by the same amount.
 */
static const int quant_table_p[] = {
	3, 3, 3, 3, 3, 3,  3,  3,
	3, 3, 3, 3, 3, 3,  3,  3,
	3, 3, 3, 3, 3, 3,  3,  3,
	3, 3, 3, 3, 3, 3,  3,  6,
	3, 3, 3, 3, 3, 3,  6,  6,
	3, 3, 3, 3, 3, 6,  6,  9,
	3, 3, 3, 3, 6, 6,  9,  9,
	3, 3, 3, 6, 6, 9,  9,  10,
};
19562306a36Sopenharmony_ci
19662306a36Sopenharmony_cistatic void quantize_intra(s16 *coeff, s16 *de_coeff, u16 qp)
19762306a36Sopenharmony_ci{
19862306a36Sopenharmony_ci	const int *quant = quant_table;
19962306a36Sopenharmony_ci	int i, j;
20062306a36Sopenharmony_ci
20162306a36Sopenharmony_ci	for (j = 0; j < 8; j++) {
20262306a36Sopenharmony_ci		for (i = 0; i < 8; i++, quant++, coeff++, de_coeff++) {
20362306a36Sopenharmony_ci			*coeff >>= *quant;
20462306a36Sopenharmony_ci			if (*coeff >= -qp && *coeff <= qp)
20562306a36Sopenharmony_ci				*coeff = *de_coeff = 0;
20662306a36Sopenharmony_ci			else
20762306a36Sopenharmony_ci				*de_coeff = *coeff << *quant;
20862306a36Sopenharmony_ci		}
20962306a36Sopenharmony_ci	}
21062306a36Sopenharmony_ci}
21162306a36Sopenharmony_ci
21262306a36Sopenharmony_cistatic void dequantize_intra(s16 *coeff)
21362306a36Sopenharmony_ci{
21462306a36Sopenharmony_ci	const int *quant = quant_table;
21562306a36Sopenharmony_ci	int i, j;
21662306a36Sopenharmony_ci
21762306a36Sopenharmony_ci	for (j = 0; j < 8; j++)
21862306a36Sopenharmony_ci		for (i = 0; i < 8; i++, quant++, coeff++)
21962306a36Sopenharmony_ci			*coeff <<= *quant;
22062306a36Sopenharmony_ci}
22162306a36Sopenharmony_ci
22262306a36Sopenharmony_cistatic void quantize_inter(s16 *coeff, s16 *de_coeff, u16 qp)
22362306a36Sopenharmony_ci{
22462306a36Sopenharmony_ci	const int *quant = quant_table_p;
22562306a36Sopenharmony_ci	int i, j;
22662306a36Sopenharmony_ci
22762306a36Sopenharmony_ci	for (j = 0; j < 8; j++) {
22862306a36Sopenharmony_ci		for (i = 0; i < 8; i++, quant++, coeff++, de_coeff++) {
22962306a36Sopenharmony_ci			*coeff >>= *quant;
23062306a36Sopenharmony_ci			if (*coeff >= -qp && *coeff <= qp)
23162306a36Sopenharmony_ci				*coeff = *de_coeff = 0;
23262306a36Sopenharmony_ci			else
23362306a36Sopenharmony_ci				*de_coeff = *coeff << *quant;
23462306a36Sopenharmony_ci		}
23562306a36Sopenharmony_ci	}
23662306a36Sopenharmony_ci}
23762306a36Sopenharmony_ci
23862306a36Sopenharmony_cistatic void dequantize_inter(s16 *coeff)
23962306a36Sopenharmony_ci{
24062306a36Sopenharmony_ci	const int *quant = quant_table_p;
24162306a36Sopenharmony_ci	int i, j;
24262306a36Sopenharmony_ci
24362306a36Sopenharmony_ci	for (j = 0; j < 8; j++)
24462306a36Sopenharmony_ci		for (i = 0; i < 8; i++, quant++, coeff++)
24562306a36Sopenharmony_ci			*coeff <<= *quant;
24662306a36Sopenharmony_ci}
24762306a36Sopenharmony_ci
24862306a36Sopenharmony_cistatic void noinline_for_stack fwht(const u8 *block, s16 *output_block,
24962306a36Sopenharmony_ci				    unsigned int stride,
25062306a36Sopenharmony_ci				    unsigned int input_step, bool intra)
25162306a36Sopenharmony_ci{
25262306a36Sopenharmony_ci	/* we'll need more than 8 bits for the transformed coefficients */
25362306a36Sopenharmony_ci	s32 workspace1[8], workspace2[8];
25462306a36Sopenharmony_ci	const u8 *tmp = block;
25562306a36Sopenharmony_ci	s16 *out = output_block;
25662306a36Sopenharmony_ci	int add = intra ? 256 : 0;
25762306a36Sopenharmony_ci	unsigned int i;
25862306a36Sopenharmony_ci
25962306a36Sopenharmony_ci	/* stage 1 */
26062306a36Sopenharmony_ci	for (i = 0; i < 8; i++, tmp += stride, out += 8) {
26162306a36Sopenharmony_ci		switch (input_step) {
26262306a36Sopenharmony_ci		case 1:
26362306a36Sopenharmony_ci			workspace1[0]  = tmp[0] + tmp[1] - add;
26462306a36Sopenharmony_ci			workspace1[1]  = tmp[0] - tmp[1];
26562306a36Sopenharmony_ci
26662306a36Sopenharmony_ci			workspace1[2]  = tmp[2] + tmp[3] - add;
26762306a36Sopenharmony_ci			workspace1[3]  = tmp[2] - tmp[3];
26862306a36Sopenharmony_ci
26962306a36Sopenharmony_ci			workspace1[4]  = tmp[4] + tmp[5] - add;
27062306a36Sopenharmony_ci			workspace1[5]  = tmp[4] - tmp[5];
27162306a36Sopenharmony_ci
27262306a36Sopenharmony_ci			workspace1[6]  = tmp[6] + tmp[7] - add;
27362306a36Sopenharmony_ci			workspace1[7]  = tmp[6] - tmp[7];
27462306a36Sopenharmony_ci			break;
27562306a36Sopenharmony_ci		case 2:
27662306a36Sopenharmony_ci			workspace1[0]  = tmp[0] + tmp[2] - add;
27762306a36Sopenharmony_ci			workspace1[1]  = tmp[0] - tmp[2];
27862306a36Sopenharmony_ci
27962306a36Sopenharmony_ci			workspace1[2]  = tmp[4] + tmp[6] - add;
28062306a36Sopenharmony_ci			workspace1[3]  = tmp[4] - tmp[6];
28162306a36Sopenharmony_ci
28262306a36Sopenharmony_ci			workspace1[4]  = tmp[8] + tmp[10] - add;
28362306a36Sopenharmony_ci			workspace1[5]  = tmp[8] - tmp[10];
28462306a36Sopenharmony_ci
28562306a36Sopenharmony_ci			workspace1[6]  = tmp[12] + tmp[14] - add;
28662306a36Sopenharmony_ci			workspace1[7]  = tmp[12] - tmp[14];
28762306a36Sopenharmony_ci			break;
28862306a36Sopenharmony_ci		case 3:
28962306a36Sopenharmony_ci			workspace1[0]  = tmp[0] + tmp[3] - add;
29062306a36Sopenharmony_ci			workspace1[1]  = tmp[0] - tmp[3];
29162306a36Sopenharmony_ci
29262306a36Sopenharmony_ci			workspace1[2]  = tmp[6] + tmp[9] - add;
29362306a36Sopenharmony_ci			workspace1[3]  = tmp[6] - tmp[9];
29462306a36Sopenharmony_ci
29562306a36Sopenharmony_ci			workspace1[4]  = tmp[12] + tmp[15] - add;
29662306a36Sopenharmony_ci			workspace1[5]  = tmp[12] - tmp[15];
29762306a36Sopenharmony_ci
29862306a36Sopenharmony_ci			workspace1[6]  = tmp[18] + tmp[21] - add;
29962306a36Sopenharmony_ci			workspace1[7]  = tmp[18] - tmp[21];
30062306a36Sopenharmony_ci			break;
30162306a36Sopenharmony_ci		default:
30262306a36Sopenharmony_ci			workspace1[0]  = tmp[0] + tmp[4] - add;
30362306a36Sopenharmony_ci			workspace1[1]  = tmp[0] - tmp[4];
30462306a36Sopenharmony_ci
30562306a36Sopenharmony_ci			workspace1[2]  = tmp[8] + tmp[12] - add;
30662306a36Sopenharmony_ci			workspace1[3]  = tmp[8] - tmp[12];
30762306a36Sopenharmony_ci
30862306a36Sopenharmony_ci			workspace1[4]  = tmp[16] + tmp[20] - add;
30962306a36Sopenharmony_ci			workspace1[5]  = tmp[16] - tmp[20];
31062306a36Sopenharmony_ci
31162306a36Sopenharmony_ci			workspace1[6]  = tmp[24] + tmp[28] - add;
31262306a36Sopenharmony_ci			workspace1[7]  = tmp[24] - tmp[28];
31362306a36Sopenharmony_ci			break;
31462306a36Sopenharmony_ci		}
31562306a36Sopenharmony_ci
31662306a36Sopenharmony_ci		/* stage 2 */
31762306a36Sopenharmony_ci		workspace2[0] = workspace1[0] + workspace1[2];
31862306a36Sopenharmony_ci		workspace2[1] = workspace1[0] - workspace1[2];
31962306a36Sopenharmony_ci		workspace2[2] = workspace1[1] - workspace1[3];
32062306a36Sopenharmony_ci		workspace2[3] = workspace1[1] + workspace1[3];
32162306a36Sopenharmony_ci
32262306a36Sopenharmony_ci		workspace2[4] = workspace1[4] + workspace1[6];
32362306a36Sopenharmony_ci		workspace2[5] = workspace1[4] - workspace1[6];
32462306a36Sopenharmony_ci		workspace2[6] = workspace1[5] - workspace1[7];
32562306a36Sopenharmony_ci		workspace2[7] = workspace1[5] + workspace1[7];
32662306a36Sopenharmony_ci
32762306a36Sopenharmony_ci		/* stage 3 */
32862306a36Sopenharmony_ci		out[0] = workspace2[0] + workspace2[4];
32962306a36Sopenharmony_ci		out[1] = workspace2[0] - workspace2[4];
33062306a36Sopenharmony_ci		out[2] = workspace2[1] - workspace2[5];
33162306a36Sopenharmony_ci		out[3] = workspace2[1] + workspace2[5];
33262306a36Sopenharmony_ci		out[4] = workspace2[2] + workspace2[6];
33362306a36Sopenharmony_ci		out[5] = workspace2[2] - workspace2[6];
33462306a36Sopenharmony_ci		out[6] = workspace2[3] - workspace2[7];
33562306a36Sopenharmony_ci		out[7] = workspace2[3] + workspace2[7];
33662306a36Sopenharmony_ci	}
33762306a36Sopenharmony_ci
33862306a36Sopenharmony_ci	out = output_block;
33962306a36Sopenharmony_ci
34062306a36Sopenharmony_ci	for (i = 0; i < 8; i++, out++) {
34162306a36Sopenharmony_ci		/* stage 1 */
34262306a36Sopenharmony_ci		workspace1[0]  = out[0] + out[1 * 8];
34362306a36Sopenharmony_ci		workspace1[1]  = out[0] - out[1 * 8];
34462306a36Sopenharmony_ci
34562306a36Sopenharmony_ci		workspace1[2]  = out[2 * 8] + out[3 * 8];
34662306a36Sopenharmony_ci		workspace1[3]  = out[2 * 8] - out[3 * 8];
34762306a36Sopenharmony_ci
34862306a36Sopenharmony_ci		workspace1[4]  = out[4 * 8] + out[5 * 8];
34962306a36Sopenharmony_ci		workspace1[5]  = out[4 * 8] - out[5 * 8];
35062306a36Sopenharmony_ci
35162306a36Sopenharmony_ci		workspace1[6]  = out[6 * 8] + out[7 * 8];
35262306a36Sopenharmony_ci		workspace1[7]  = out[6 * 8] - out[7 * 8];
35362306a36Sopenharmony_ci
35462306a36Sopenharmony_ci		/* stage 2 */
35562306a36Sopenharmony_ci		workspace2[0] = workspace1[0] + workspace1[2];
35662306a36Sopenharmony_ci		workspace2[1] = workspace1[0] - workspace1[2];
35762306a36Sopenharmony_ci		workspace2[2] = workspace1[1] - workspace1[3];
35862306a36Sopenharmony_ci		workspace2[3] = workspace1[1] + workspace1[3];
35962306a36Sopenharmony_ci
36062306a36Sopenharmony_ci		workspace2[4] = workspace1[4] + workspace1[6];
36162306a36Sopenharmony_ci		workspace2[5] = workspace1[4] - workspace1[6];
36262306a36Sopenharmony_ci		workspace2[6] = workspace1[5] - workspace1[7];
36362306a36Sopenharmony_ci		workspace2[7] = workspace1[5] + workspace1[7];
36462306a36Sopenharmony_ci		/* stage 3 */
36562306a36Sopenharmony_ci		out[0 * 8] = workspace2[0] + workspace2[4];
36662306a36Sopenharmony_ci		out[1 * 8] = workspace2[0] - workspace2[4];
36762306a36Sopenharmony_ci		out[2 * 8] = workspace2[1] - workspace2[5];
36862306a36Sopenharmony_ci		out[3 * 8] = workspace2[1] + workspace2[5];
36962306a36Sopenharmony_ci		out[4 * 8] = workspace2[2] + workspace2[6];
37062306a36Sopenharmony_ci		out[5 * 8] = workspace2[2] - workspace2[6];
37162306a36Sopenharmony_ci		out[6 * 8] = workspace2[3] - workspace2[7];
37262306a36Sopenharmony_ci		out[7 * 8] = workspace2[3] + workspace2[7];
37362306a36Sopenharmony_ci	}
37462306a36Sopenharmony_ci}
37562306a36Sopenharmony_ci
37662306a36Sopenharmony_ci/*
37762306a36Sopenharmony_ci * Not the nicest way of doing it, but P-blocks get twice the range of
37862306a36Sopenharmony_ci * that of the I-blocks. Therefore we need a type bigger than 8 bits.
37962306a36Sopenharmony_ci * Furthermore values can be negative... This is just a version that
38062306a36Sopenharmony_ci * works with 16 signed data
38162306a36Sopenharmony_ci */
38262306a36Sopenharmony_cistatic void noinline_for_stack
38362306a36Sopenharmony_cifwht16(const s16 *block, s16 *output_block, int stride, int intra)
38462306a36Sopenharmony_ci{
38562306a36Sopenharmony_ci	/* we'll need more than 8 bits for the transformed coefficients */
38662306a36Sopenharmony_ci	s32 workspace1[8], workspace2[8];
38762306a36Sopenharmony_ci	const s16 *tmp = block;
38862306a36Sopenharmony_ci	s16 *out = output_block;
38962306a36Sopenharmony_ci	int i;
39062306a36Sopenharmony_ci
39162306a36Sopenharmony_ci	for (i = 0; i < 8; i++, tmp += stride, out += 8) {
39262306a36Sopenharmony_ci		/* stage 1 */
39362306a36Sopenharmony_ci		workspace1[0]  = tmp[0] + tmp[1];
39462306a36Sopenharmony_ci		workspace1[1]  = tmp[0] - tmp[1];
39562306a36Sopenharmony_ci
39662306a36Sopenharmony_ci		workspace1[2]  = tmp[2] + tmp[3];
39762306a36Sopenharmony_ci		workspace1[3]  = tmp[2] - tmp[3];
39862306a36Sopenharmony_ci
39962306a36Sopenharmony_ci		workspace1[4]  = tmp[4] + tmp[5];
40062306a36Sopenharmony_ci		workspace1[5]  = tmp[4] - tmp[5];
40162306a36Sopenharmony_ci
40262306a36Sopenharmony_ci		workspace1[6]  = tmp[6] + tmp[7];
40362306a36Sopenharmony_ci		workspace1[7]  = tmp[6] - tmp[7];
40462306a36Sopenharmony_ci
40562306a36Sopenharmony_ci		/* stage 2 */
40662306a36Sopenharmony_ci		workspace2[0] = workspace1[0] + workspace1[2];
40762306a36Sopenharmony_ci		workspace2[1] = workspace1[0] - workspace1[2];
40862306a36Sopenharmony_ci		workspace2[2] = workspace1[1] - workspace1[3];
40962306a36Sopenharmony_ci		workspace2[3] = workspace1[1] + workspace1[3];
41062306a36Sopenharmony_ci
41162306a36Sopenharmony_ci		workspace2[4] = workspace1[4] + workspace1[6];
41262306a36Sopenharmony_ci		workspace2[5] = workspace1[4] - workspace1[6];
41362306a36Sopenharmony_ci		workspace2[6] = workspace1[5] - workspace1[7];
41462306a36Sopenharmony_ci		workspace2[7] = workspace1[5] + workspace1[7];
41562306a36Sopenharmony_ci
41662306a36Sopenharmony_ci		/* stage 3 */
41762306a36Sopenharmony_ci		out[0] = workspace2[0] + workspace2[4];
41862306a36Sopenharmony_ci		out[1] = workspace2[0] - workspace2[4];
41962306a36Sopenharmony_ci		out[2] = workspace2[1] - workspace2[5];
42062306a36Sopenharmony_ci		out[3] = workspace2[1] + workspace2[5];
42162306a36Sopenharmony_ci		out[4] = workspace2[2] + workspace2[6];
42262306a36Sopenharmony_ci		out[5] = workspace2[2] - workspace2[6];
42362306a36Sopenharmony_ci		out[6] = workspace2[3] - workspace2[7];
42462306a36Sopenharmony_ci		out[7] = workspace2[3] + workspace2[7];
42562306a36Sopenharmony_ci	}
42662306a36Sopenharmony_ci
42762306a36Sopenharmony_ci	out = output_block;
42862306a36Sopenharmony_ci
42962306a36Sopenharmony_ci	for (i = 0; i < 8; i++, out++) {
43062306a36Sopenharmony_ci		/* stage 1 */
43162306a36Sopenharmony_ci		workspace1[0]  = out[0] + out[1*8];
43262306a36Sopenharmony_ci		workspace1[1]  = out[0] - out[1*8];
43362306a36Sopenharmony_ci
43462306a36Sopenharmony_ci		workspace1[2]  = out[2*8] + out[3*8];
43562306a36Sopenharmony_ci		workspace1[3]  = out[2*8] - out[3*8];
43662306a36Sopenharmony_ci
43762306a36Sopenharmony_ci		workspace1[4]  = out[4*8] + out[5*8];
43862306a36Sopenharmony_ci		workspace1[5]  = out[4*8] - out[5*8];
43962306a36Sopenharmony_ci
44062306a36Sopenharmony_ci		workspace1[6]  = out[6*8] + out[7*8];
44162306a36Sopenharmony_ci		workspace1[7]  = out[6*8] - out[7*8];
44262306a36Sopenharmony_ci
44362306a36Sopenharmony_ci		/* stage 2 */
44462306a36Sopenharmony_ci		workspace2[0] = workspace1[0] + workspace1[2];
44562306a36Sopenharmony_ci		workspace2[1] = workspace1[0] - workspace1[2];
44662306a36Sopenharmony_ci		workspace2[2] = workspace1[1] - workspace1[3];
44762306a36Sopenharmony_ci		workspace2[3] = workspace1[1] + workspace1[3];
44862306a36Sopenharmony_ci
44962306a36Sopenharmony_ci		workspace2[4] = workspace1[4] + workspace1[6];
45062306a36Sopenharmony_ci		workspace2[5] = workspace1[4] - workspace1[6];
45162306a36Sopenharmony_ci		workspace2[6] = workspace1[5] - workspace1[7];
45262306a36Sopenharmony_ci		workspace2[7] = workspace1[5] + workspace1[7];
45362306a36Sopenharmony_ci
45462306a36Sopenharmony_ci		/* stage 3 */
45562306a36Sopenharmony_ci		out[0*8] = workspace2[0] + workspace2[4];
45662306a36Sopenharmony_ci		out[1*8] = workspace2[0] - workspace2[4];
45762306a36Sopenharmony_ci		out[2*8] = workspace2[1] - workspace2[5];
45862306a36Sopenharmony_ci		out[3*8] = workspace2[1] + workspace2[5];
45962306a36Sopenharmony_ci		out[4*8] = workspace2[2] + workspace2[6];
46062306a36Sopenharmony_ci		out[5*8] = workspace2[2] - workspace2[6];
46162306a36Sopenharmony_ci		out[6*8] = workspace2[3] - workspace2[7];
46262306a36Sopenharmony_ci		out[7*8] = workspace2[3] + workspace2[7];
46362306a36Sopenharmony_ci	}
46462306a36Sopenharmony_ci}
46562306a36Sopenharmony_ci
46662306a36Sopenharmony_cistatic noinline_for_stack void
46762306a36Sopenharmony_ciifwht(const s16 *block, s16 *output_block, int intra)
46862306a36Sopenharmony_ci{
46962306a36Sopenharmony_ci	/*
47062306a36Sopenharmony_ci	 * we'll need more than 8 bits for the transformed coefficients
47162306a36Sopenharmony_ci	 * use native unit of cpu
47262306a36Sopenharmony_ci	 */
47362306a36Sopenharmony_ci	int workspace1[8], workspace2[8];
47462306a36Sopenharmony_ci	int inter = intra ? 0 : 1;
47562306a36Sopenharmony_ci	const s16 *tmp = block;
47662306a36Sopenharmony_ci	s16 *out = output_block;
47762306a36Sopenharmony_ci	int i;
47862306a36Sopenharmony_ci
47962306a36Sopenharmony_ci	for (i = 0; i < 8; i++, tmp += 8, out += 8) {
48062306a36Sopenharmony_ci		/* stage 1 */
48162306a36Sopenharmony_ci		workspace1[0]  = tmp[0] + tmp[1];
48262306a36Sopenharmony_ci		workspace1[1]  = tmp[0] - tmp[1];
48362306a36Sopenharmony_ci
48462306a36Sopenharmony_ci		workspace1[2]  = tmp[2] + tmp[3];
48562306a36Sopenharmony_ci		workspace1[3]  = tmp[2] - tmp[3];
48662306a36Sopenharmony_ci
48762306a36Sopenharmony_ci		workspace1[4]  = tmp[4] + tmp[5];
48862306a36Sopenharmony_ci		workspace1[5]  = tmp[4] - tmp[5];
48962306a36Sopenharmony_ci
49062306a36Sopenharmony_ci		workspace1[6]  = tmp[6] + tmp[7];
49162306a36Sopenharmony_ci		workspace1[7]  = tmp[6] - tmp[7];
49262306a36Sopenharmony_ci
49362306a36Sopenharmony_ci		/* stage 2 */
49462306a36Sopenharmony_ci		workspace2[0] = workspace1[0] + workspace1[2];
49562306a36Sopenharmony_ci		workspace2[1] = workspace1[0] - workspace1[2];
49662306a36Sopenharmony_ci		workspace2[2] = workspace1[1] - workspace1[3];
49762306a36Sopenharmony_ci		workspace2[3] = workspace1[1] + workspace1[3];
49862306a36Sopenharmony_ci
49962306a36Sopenharmony_ci		workspace2[4] = workspace1[4] + workspace1[6];
50062306a36Sopenharmony_ci		workspace2[5] = workspace1[4] - workspace1[6];
50162306a36Sopenharmony_ci		workspace2[6] = workspace1[5] - workspace1[7];
50262306a36Sopenharmony_ci		workspace2[7] = workspace1[5] + workspace1[7];
50362306a36Sopenharmony_ci
50462306a36Sopenharmony_ci		/* stage 3 */
50562306a36Sopenharmony_ci		out[0] = workspace2[0] + workspace2[4];
50662306a36Sopenharmony_ci		out[1] = workspace2[0] - workspace2[4];
50762306a36Sopenharmony_ci		out[2] = workspace2[1] - workspace2[5];
50862306a36Sopenharmony_ci		out[3] = workspace2[1] + workspace2[5];
50962306a36Sopenharmony_ci		out[4] = workspace2[2] + workspace2[6];
51062306a36Sopenharmony_ci		out[5] = workspace2[2] - workspace2[6];
51162306a36Sopenharmony_ci		out[6] = workspace2[3] - workspace2[7];
51262306a36Sopenharmony_ci		out[7] = workspace2[3] + workspace2[7];
51362306a36Sopenharmony_ci	}
51462306a36Sopenharmony_ci
51562306a36Sopenharmony_ci	out = output_block;
51662306a36Sopenharmony_ci
51762306a36Sopenharmony_ci	for (i = 0; i < 8; i++, out++) {
51862306a36Sopenharmony_ci		/* stage 1 */
51962306a36Sopenharmony_ci		workspace1[0]  = out[0] + out[1 * 8];
52062306a36Sopenharmony_ci		workspace1[1]  = out[0] - out[1 * 8];
52162306a36Sopenharmony_ci
52262306a36Sopenharmony_ci		workspace1[2]  = out[2 * 8] + out[3 * 8];
52362306a36Sopenharmony_ci		workspace1[3]  = out[2 * 8] - out[3 * 8];
52462306a36Sopenharmony_ci
52562306a36Sopenharmony_ci		workspace1[4]  = out[4 * 8] + out[5 * 8];
52662306a36Sopenharmony_ci		workspace1[5]  = out[4 * 8] - out[5 * 8];
52762306a36Sopenharmony_ci
52862306a36Sopenharmony_ci		workspace1[6]  = out[6 * 8] + out[7 * 8];
52962306a36Sopenharmony_ci		workspace1[7]  = out[6 * 8] - out[7 * 8];
53062306a36Sopenharmony_ci
53162306a36Sopenharmony_ci		/* stage 2 */
53262306a36Sopenharmony_ci		workspace2[0] = workspace1[0] + workspace1[2];
53362306a36Sopenharmony_ci		workspace2[1] = workspace1[0] - workspace1[2];
53462306a36Sopenharmony_ci		workspace2[2] = workspace1[1] - workspace1[3];
53562306a36Sopenharmony_ci		workspace2[3] = workspace1[1] + workspace1[3];
53662306a36Sopenharmony_ci
53762306a36Sopenharmony_ci		workspace2[4] = workspace1[4] + workspace1[6];
53862306a36Sopenharmony_ci		workspace2[5] = workspace1[4] - workspace1[6];
53962306a36Sopenharmony_ci		workspace2[6] = workspace1[5] - workspace1[7];
54062306a36Sopenharmony_ci		workspace2[7] = workspace1[5] + workspace1[7];
54162306a36Sopenharmony_ci
54262306a36Sopenharmony_ci		/* stage 3 */
54362306a36Sopenharmony_ci		if (inter) {
54462306a36Sopenharmony_ci			int d;
54562306a36Sopenharmony_ci
54662306a36Sopenharmony_ci			out[0 * 8] = workspace2[0] + workspace2[4];
54762306a36Sopenharmony_ci			out[1 * 8] = workspace2[0] - workspace2[4];
54862306a36Sopenharmony_ci			out[2 * 8] = workspace2[1] - workspace2[5];
54962306a36Sopenharmony_ci			out[3 * 8] = workspace2[1] + workspace2[5];
55062306a36Sopenharmony_ci			out[4 * 8] = workspace2[2] + workspace2[6];
55162306a36Sopenharmony_ci			out[5 * 8] = workspace2[2] - workspace2[6];
55262306a36Sopenharmony_ci			out[6 * 8] = workspace2[3] - workspace2[7];
55362306a36Sopenharmony_ci			out[7 * 8] = workspace2[3] + workspace2[7];
55462306a36Sopenharmony_ci
55562306a36Sopenharmony_ci			for (d = 0; d < 8; d++)
55662306a36Sopenharmony_ci				out[8 * d] >>= 6;
55762306a36Sopenharmony_ci		} else {
55862306a36Sopenharmony_ci			int d;
55962306a36Sopenharmony_ci
56062306a36Sopenharmony_ci			out[0 * 8] = workspace2[0] + workspace2[4];
56162306a36Sopenharmony_ci			out[1 * 8] = workspace2[0] - workspace2[4];
56262306a36Sopenharmony_ci			out[2 * 8] = workspace2[1] - workspace2[5];
56362306a36Sopenharmony_ci			out[3 * 8] = workspace2[1] + workspace2[5];
56462306a36Sopenharmony_ci			out[4 * 8] = workspace2[2] + workspace2[6];
56562306a36Sopenharmony_ci			out[5 * 8] = workspace2[2] - workspace2[6];
56662306a36Sopenharmony_ci			out[6 * 8] = workspace2[3] - workspace2[7];
56762306a36Sopenharmony_ci			out[7 * 8] = workspace2[3] + workspace2[7];
56862306a36Sopenharmony_ci
56962306a36Sopenharmony_ci			for (d = 0; d < 8; d++) {
57062306a36Sopenharmony_ci				out[8 * d] >>= 6;
57162306a36Sopenharmony_ci				out[8 * d] += 128;
57262306a36Sopenharmony_ci			}
57362306a36Sopenharmony_ci		}
57462306a36Sopenharmony_ci	}
57562306a36Sopenharmony_ci}
57662306a36Sopenharmony_ci
57762306a36Sopenharmony_cistatic void fill_encoder_block(const u8 *input, s16 *dst,
57862306a36Sopenharmony_ci			       unsigned int stride, unsigned int input_step)
57962306a36Sopenharmony_ci{
58062306a36Sopenharmony_ci	int i, j;
58162306a36Sopenharmony_ci
58262306a36Sopenharmony_ci	for (i = 0; i < 8; i++) {
58362306a36Sopenharmony_ci		for (j = 0; j < 8; j++, input += input_step)
58462306a36Sopenharmony_ci			*dst++ = *input;
58562306a36Sopenharmony_ci		input += stride - 8 * input_step;
58662306a36Sopenharmony_ci	}
58762306a36Sopenharmony_ci}
58862306a36Sopenharmony_ci
58962306a36Sopenharmony_cistatic int var_intra(const s16 *input)
59062306a36Sopenharmony_ci{
59162306a36Sopenharmony_ci	int32_t mean = 0;
59262306a36Sopenharmony_ci	int32_t ret = 0;
59362306a36Sopenharmony_ci	const s16 *tmp = input;
59462306a36Sopenharmony_ci	int i;
59562306a36Sopenharmony_ci
59662306a36Sopenharmony_ci	for (i = 0; i < 8 * 8; i++, tmp++)
59762306a36Sopenharmony_ci		mean += *tmp;
59862306a36Sopenharmony_ci	mean /= 64;
59962306a36Sopenharmony_ci	tmp = input;
60062306a36Sopenharmony_ci	for (i = 0; i < 8 * 8; i++, tmp++)
60162306a36Sopenharmony_ci		ret += (*tmp - mean) < 0 ? -(*tmp - mean) : (*tmp - mean);
60262306a36Sopenharmony_ci	return ret;
60362306a36Sopenharmony_ci}
60462306a36Sopenharmony_ci
60562306a36Sopenharmony_cistatic int var_inter(const s16 *old, const s16 *new)
60662306a36Sopenharmony_ci{
60762306a36Sopenharmony_ci	int32_t ret = 0;
60862306a36Sopenharmony_ci	int i;
60962306a36Sopenharmony_ci
61062306a36Sopenharmony_ci	for (i = 0; i < 8 * 8; i++, old++, new++)
61162306a36Sopenharmony_ci		ret += (*old - *new) < 0 ? -(*old - *new) : (*old - *new);
61262306a36Sopenharmony_ci	return ret;
61362306a36Sopenharmony_ci}
61462306a36Sopenharmony_ci
61562306a36Sopenharmony_cistatic noinline_for_stack int
61662306a36Sopenharmony_cidecide_blocktype(const u8 *cur, const u8 *reference, s16 *deltablock,
61762306a36Sopenharmony_ci		 unsigned int stride, unsigned int input_step)
61862306a36Sopenharmony_ci{
61962306a36Sopenharmony_ci	s16 tmp[64];
62062306a36Sopenharmony_ci	s16 old[64];
62162306a36Sopenharmony_ci	s16 *work = tmp;
62262306a36Sopenharmony_ci	unsigned int k, l;
62362306a36Sopenharmony_ci	int vari;
62462306a36Sopenharmony_ci	int vard;
62562306a36Sopenharmony_ci
62662306a36Sopenharmony_ci	fill_encoder_block(cur, tmp, stride, input_step);
62762306a36Sopenharmony_ci	fill_encoder_block(reference, old, 8, 1);
62862306a36Sopenharmony_ci	vari = var_intra(tmp);
62962306a36Sopenharmony_ci
63062306a36Sopenharmony_ci	for (k = 0; k < 8; k++) {
63162306a36Sopenharmony_ci		for (l = 0; l < 8; l++) {
63262306a36Sopenharmony_ci			*deltablock = *work - *reference;
63362306a36Sopenharmony_ci			deltablock++;
63462306a36Sopenharmony_ci			work++;
63562306a36Sopenharmony_ci			reference++;
63662306a36Sopenharmony_ci		}
63762306a36Sopenharmony_ci	}
63862306a36Sopenharmony_ci	deltablock -= 64;
63962306a36Sopenharmony_ci	vard = var_inter(old, tmp);
64062306a36Sopenharmony_ci	return vari <= vard ? IBLOCK : PBLOCK;
64162306a36Sopenharmony_ci}
64262306a36Sopenharmony_ci
/*
 * Store a decoded 8x8 block into a picture buffer.
 *
 * The 64 samples of @input are consumed sequentially; each one is clamped
 * to 0..255 and written to @dst. Consecutive pixels of a row are @dst_step
 * bytes apart; after the 8 pixels of a row the destination is advanced by
 * (@stride - 8 * @dst_step) to reach the next row.
 */
static void fill_decoder_block(u8 *dst, const s16 *input, int stride,
			       unsigned int dst_step)
{
	const unsigned int row_gap = stride - (8 * dst_step);
	unsigned int row, col;

	for (row = 0; row < 8; row++, dst += row_gap) {
		for (col = 0; col < 8; col++, dst += dst_step) {
			s16 sample = *input++;

			*dst = sample < 0 ? 0 : (sample > 255 ? 255 : sample);
		}
	}
}
66062306a36Sopenharmony_ci
/*
 * Turn an 8x8 block of deltas into pixel values by adding the reference
 * pixels, in place.
 *
 * @deltas holds 64 consecutive values. The reference pixels of a row are
 * @ref_step bytes apart; after the 8 pixels of a row the reference pointer
 * is advanced by (@stride - 8 * @ref_step) to reach the next row.
 */
static void add_deltas(s16 *deltas, const u8 *ref, int stride,
		       unsigned int ref_step)
{
	const unsigned int row_gap = stride - (8 * ref_step);
	unsigned int row, col;

	for (row = 0; row < 8; row++, ref += row_gap) {
		for (col = 0; col < 8; col++, deltas++, ref += ref_step) {
			s16 sum = *deltas + *ref;

			/*
			 * Due to quantizing, it is possible that the decoded
			 * coefficients are slightly out of range, so clamp
			 * the result to 0..255.
			 */
			if (sum < 0)
				sum = 0;
			else if (sum > 255)
				sum = 255;
			*deltas = sum;
		}
	}
}
68362306a36Sopenharmony_ci
68462306a36Sopenharmony_cistatic u32 encode_plane(u8 *input, u8 *refp, __be16 **rlco, __be16 *rlco_max,
68562306a36Sopenharmony_ci			struct fwht_cframe *cf, u32 height, u32 width,
68662306a36Sopenharmony_ci			u32 stride, unsigned int input_step,
68762306a36Sopenharmony_ci			bool is_intra, bool next_is_intra)
68862306a36Sopenharmony_ci{
68962306a36Sopenharmony_ci	u8 *input_start = input;
69062306a36Sopenharmony_ci	__be16 *rlco_start = *rlco;
69162306a36Sopenharmony_ci	s16 deltablock[64];
69262306a36Sopenharmony_ci	__be16 pframe_bit = htons(PFRAME_BIT);
69362306a36Sopenharmony_ci	u32 encoding = 0;
69462306a36Sopenharmony_ci	unsigned int last_size = 0;
69562306a36Sopenharmony_ci	unsigned int i, j;
69662306a36Sopenharmony_ci
69762306a36Sopenharmony_ci	width = round_up(width, 8);
69862306a36Sopenharmony_ci	height = round_up(height, 8);
69962306a36Sopenharmony_ci
70062306a36Sopenharmony_ci	for (j = 0; j < height / 8; j++) {
70162306a36Sopenharmony_ci		input = input_start + j * 8 * stride;
70262306a36Sopenharmony_ci		for (i = 0; i < width / 8; i++) {
70362306a36Sopenharmony_ci			/* intra code, first frame is always intra coded. */
70462306a36Sopenharmony_ci			int blocktype = IBLOCK;
70562306a36Sopenharmony_ci			unsigned int size;
70662306a36Sopenharmony_ci
70762306a36Sopenharmony_ci			if (!is_intra)
70862306a36Sopenharmony_ci				blocktype = decide_blocktype(input, refp,
70962306a36Sopenharmony_ci					deltablock, stride, input_step);
71062306a36Sopenharmony_ci			if (blocktype == IBLOCK) {
71162306a36Sopenharmony_ci				fwht(input, cf->coeffs, stride, input_step, 1);
71262306a36Sopenharmony_ci				quantize_intra(cf->coeffs, cf->de_coeffs,
71362306a36Sopenharmony_ci					       cf->i_frame_qp);
71462306a36Sopenharmony_ci			} else {
71562306a36Sopenharmony_ci				/* inter code */
71662306a36Sopenharmony_ci				encoding |= FWHT_FRAME_PCODED;
71762306a36Sopenharmony_ci				fwht16(deltablock, cf->coeffs, 8, 0);
71862306a36Sopenharmony_ci				quantize_inter(cf->coeffs, cf->de_coeffs,
71962306a36Sopenharmony_ci					       cf->p_frame_qp);
72062306a36Sopenharmony_ci			}
72162306a36Sopenharmony_ci			if (!next_is_intra) {
72262306a36Sopenharmony_ci				ifwht(cf->de_coeffs, cf->de_fwht, blocktype);
72362306a36Sopenharmony_ci
72462306a36Sopenharmony_ci				if (blocktype == PBLOCK)
72562306a36Sopenharmony_ci					add_deltas(cf->de_fwht, refp, 8, 1);
72662306a36Sopenharmony_ci				fill_decoder_block(refp, cf->de_fwht, 8, 1);
72762306a36Sopenharmony_ci			}
72862306a36Sopenharmony_ci
72962306a36Sopenharmony_ci			input += 8 * input_step;
73062306a36Sopenharmony_ci			refp += 8 * 8;
73162306a36Sopenharmony_ci
73262306a36Sopenharmony_ci			size = rlc(cf->coeffs, *rlco, blocktype);
73362306a36Sopenharmony_ci			if (last_size == size &&
73462306a36Sopenharmony_ci			    !memcmp(*rlco + 1, *rlco - size + 1, 2 * size - 2)) {
73562306a36Sopenharmony_ci				__be16 *last_rlco = *rlco - size;
73662306a36Sopenharmony_ci				s16 hdr = ntohs(*last_rlco);
73762306a36Sopenharmony_ci
73862306a36Sopenharmony_ci				if (!((*last_rlco ^ **rlco) & pframe_bit) &&
73962306a36Sopenharmony_ci				    (hdr & DUPS_MASK) < DUPS_MASK)
74062306a36Sopenharmony_ci					*last_rlco = htons(hdr + 2);
74162306a36Sopenharmony_ci				else
74262306a36Sopenharmony_ci					*rlco += size;
74362306a36Sopenharmony_ci			} else {
74462306a36Sopenharmony_ci				*rlco += size;
74562306a36Sopenharmony_ci			}
74662306a36Sopenharmony_ci			if (*rlco >= rlco_max) {
74762306a36Sopenharmony_ci				encoding |= FWHT_FRAME_UNENCODED;
74862306a36Sopenharmony_ci				goto exit_loop;
74962306a36Sopenharmony_ci			}
75062306a36Sopenharmony_ci			last_size = size;
75162306a36Sopenharmony_ci		}
75262306a36Sopenharmony_ci	}
75362306a36Sopenharmony_ci
75462306a36Sopenharmony_ciexit_loop:
75562306a36Sopenharmony_ci	if (encoding & FWHT_FRAME_UNENCODED) {
75662306a36Sopenharmony_ci		u8 *out = (u8 *)rlco_start;
75762306a36Sopenharmony_ci		u8 *p;
75862306a36Sopenharmony_ci
75962306a36Sopenharmony_ci		input = input_start;
76062306a36Sopenharmony_ci		/*
76162306a36Sopenharmony_ci		 * The compressed stream should never contain the magic
76262306a36Sopenharmony_ci		 * header, so when we copy the YUV data we replace 0xff
76362306a36Sopenharmony_ci		 * by 0xfe. Since YUV is limited range such values
76462306a36Sopenharmony_ci		 * shouldn't appear anyway.
76562306a36Sopenharmony_ci		 */
76662306a36Sopenharmony_ci		for (j = 0; j < height; j++) {
76762306a36Sopenharmony_ci			for (i = 0, p = input; i < width; i++, p += input_step)
76862306a36Sopenharmony_ci				*out++ = (*p == 0xff) ? 0xfe : *p;
76962306a36Sopenharmony_ci			input += stride;
77062306a36Sopenharmony_ci		}
77162306a36Sopenharmony_ci		*rlco = (__be16 *)out;
77262306a36Sopenharmony_ci		encoding &= ~FWHT_FRAME_PCODED;
77362306a36Sopenharmony_ci	}
77462306a36Sopenharmony_ci	return encoding;
77562306a36Sopenharmony_ci}
77662306a36Sopenharmony_ci
77762306a36Sopenharmony_ciu32 fwht_encode_frame(struct fwht_raw_frame *frm,
77862306a36Sopenharmony_ci		      struct fwht_raw_frame *ref_frm,
77962306a36Sopenharmony_ci		      struct fwht_cframe *cf,
78062306a36Sopenharmony_ci		      bool is_intra, bool next_is_intra,
78162306a36Sopenharmony_ci		      unsigned int width, unsigned int height,
78262306a36Sopenharmony_ci		      unsigned int stride, unsigned int chroma_stride)
78362306a36Sopenharmony_ci{
78462306a36Sopenharmony_ci	unsigned int size = height * width;
78562306a36Sopenharmony_ci	__be16 *rlco = cf->rlc_data;
78662306a36Sopenharmony_ci	__be16 *rlco_max;
78762306a36Sopenharmony_ci	u32 encoding;
78862306a36Sopenharmony_ci
78962306a36Sopenharmony_ci	rlco_max = rlco + size / 2 - 256;
79062306a36Sopenharmony_ci	encoding = encode_plane(frm->luma, ref_frm->luma, &rlco, rlco_max, cf,
79162306a36Sopenharmony_ci				height, width, stride,
79262306a36Sopenharmony_ci				frm->luma_alpha_step, is_intra, next_is_intra);
79362306a36Sopenharmony_ci	if (encoding & FWHT_FRAME_UNENCODED)
79462306a36Sopenharmony_ci		encoding |= FWHT_LUMA_UNENCODED;
79562306a36Sopenharmony_ci	encoding &= ~FWHT_FRAME_UNENCODED;
79662306a36Sopenharmony_ci
79762306a36Sopenharmony_ci	if (frm->components_num >= 3) {
79862306a36Sopenharmony_ci		u32 chroma_h = height / frm->height_div;
79962306a36Sopenharmony_ci		u32 chroma_w = width / frm->width_div;
80062306a36Sopenharmony_ci		unsigned int chroma_size = chroma_h * chroma_w;
80162306a36Sopenharmony_ci
80262306a36Sopenharmony_ci		rlco_max = rlco + chroma_size / 2 - 256;
80362306a36Sopenharmony_ci		encoding |= encode_plane(frm->cb, ref_frm->cb, &rlco, rlco_max,
80462306a36Sopenharmony_ci					 cf, chroma_h, chroma_w,
80562306a36Sopenharmony_ci					 chroma_stride, frm->chroma_step,
80662306a36Sopenharmony_ci					 is_intra, next_is_intra);
80762306a36Sopenharmony_ci		if (encoding & FWHT_FRAME_UNENCODED)
80862306a36Sopenharmony_ci			encoding |= FWHT_CB_UNENCODED;
80962306a36Sopenharmony_ci		encoding &= ~FWHT_FRAME_UNENCODED;
81062306a36Sopenharmony_ci		rlco_max = rlco + chroma_size / 2 - 256;
81162306a36Sopenharmony_ci		encoding |= encode_plane(frm->cr, ref_frm->cr, &rlco, rlco_max,
81262306a36Sopenharmony_ci					 cf, chroma_h, chroma_w,
81362306a36Sopenharmony_ci					 chroma_stride, frm->chroma_step,
81462306a36Sopenharmony_ci					 is_intra, next_is_intra);
81562306a36Sopenharmony_ci		if (encoding & FWHT_FRAME_UNENCODED)
81662306a36Sopenharmony_ci			encoding |= FWHT_CR_UNENCODED;
81762306a36Sopenharmony_ci		encoding &= ~FWHT_FRAME_UNENCODED;
81862306a36Sopenharmony_ci	}
81962306a36Sopenharmony_ci
82062306a36Sopenharmony_ci	if (frm->components_num == 4) {
82162306a36Sopenharmony_ci		rlco_max = rlco + size / 2 - 256;
82262306a36Sopenharmony_ci		encoding |= encode_plane(frm->alpha, ref_frm->alpha, &rlco,
82362306a36Sopenharmony_ci					 rlco_max, cf, height, width,
82462306a36Sopenharmony_ci					 stride, frm->luma_alpha_step,
82562306a36Sopenharmony_ci					 is_intra, next_is_intra);
82662306a36Sopenharmony_ci		if (encoding & FWHT_FRAME_UNENCODED)
82762306a36Sopenharmony_ci			encoding |= FWHT_ALPHA_UNENCODED;
82862306a36Sopenharmony_ci		encoding &= ~FWHT_FRAME_UNENCODED;
82962306a36Sopenharmony_ci	}
83062306a36Sopenharmony_ci
83162306a36Sopenharmony_ci	cf->size = (rlco - cf->rlc_data) * sizeof(*rlco);
83262306a36Sopenharmony_ci	return encoding;
83362306a36Sopenharmony_ci}
83462306a36Sopenharmony_ci
83562306a36Sopenharmony_cistatic bool decode_plane(struct fwht_cframe *cf, const __be16 **rlco,
83662306a36Sopenharmony_ci			 u32 height, u32 width, const u8 *ref, u32 ref_stride,
83762306a36Sopenharmony_ci			 unsigned int ref_step, u8 *dst,
83862306a36Sopenharmony_ci			 unsigned int dst_stride, unsigned int dst_step,
83962306a36Sopenharmony_ci			 bool uncompressed, const __be16 *end_of_rlco_buf)
84062306a36Sopenharmony_ci{
84162306a36Sopenharmony_ci	unsigned int copies = 0;
84262306a36Sopenharmony_ci	s16 copy[8 * 8];
84362306a36Sopenharmony_ci	u16 stat;
84462306a36Sopenharmony_ci	unsigned int i, j;
84562306a36Sopenharmony_ci	bool is_intra = !ref;
84662306a36Sopenharmony_ci
84762306a36Sopenharmony_ci	width = round_up(width, 8);
84862306a36Sopenharmony_ci	height = round_up(height, 8);
84962306a36Sopenharmony_ci
85062306a36Sopenharmony_ci	if (uncompressed) {
85162306a36Sopenharmony_ci		int i;
85262306a36Sopenharmony_ci
85362306a36Sopenharmony_ci		if (end_of_rlco_buf + 1 < *rlco + width * height / 2)
85462306a36Sopenharmony_ci			return false;
85562306a36Sopenharmony_ci		for (i = 0; i < height; i++) {
85662306a36Sopenharmony_ci			memcpy(dst, *rlco, width);
85762306a36Sopenharmony_ci			dst += dst_stride;
85862306a36Sopenharmony_ci			*rlco += width / 2;
85962306a36Sopenharmony_ci		}
86062306a36Sopenharmony_ci		return true;
86162306a36Sopenharmony_ci	}
86262306a36Sopenharmony_ci
86362306a36Sopenharmony_ci	/*
86462306a36Sopenharmony_ci	 * When decoding each macroblock the rlco pointer will be increased
86562306a36Sopenharmony_ci	 * by 65 * 2 bytes worst-case.
86662306a36Sopenharmony_ci	 * To avoid overflow the buffer has to be 65/64th of the actual raw
86762306a36Sopenharmony_ci	 * image size, just in case someone feeds it malicious data.
86862306a36Sopenharmony_ci	 */
86962306a36Sopenharmony_ci	for (j = 0; j < height / 8; j++) {
87062306a36Sopenharmony_ci		for (i = 0; i < width / 8; i++) {
87162306a36Sopenharmony_ci			const u8 *refp = ref + j * 8 * ref_stride +
87262306a36Sopenharmony_ci				i * 8 * ref_step;
87362306a36Sopenharmony_ci			u8 *dstp = dst + j * 8 * dst_stride + i * 8 * dst_step;
87462306a36Sopenharmony_ci
87562306a36Sopenharmony_ci			if (copies) {
87662306a36Sopenharmony_ci				memcpy(cf->de_fwht, copy, sizeof(copy));
87762306a36Sopenharmony_ci				if ((stat & PFRAME_BIT) && !is_intra)
87862306a36Sopenharmony_ci					add_deltas(cf->de_fwht, refp,
87962306a36Sopenharmony_ci						   ref_stride, ref_step);
88062306a36Sopenharmony_ci				fill_decoder_block(dstp, cf->de_fwht,
88162306a36Sopenharmony_ci						   dst_stride, dst_step);
88262306a36Sopenharmony_ci				copies--;
88362306a36Sopenharmony_ci				continue;
88462306a36Sopenharmony_ci			}
88562306a36Sopenharmony_ci
88662306a36Sopenharmony_ci			stat = derlc(rlco, cf->coeffs, end_of_rlco_buf);
88762306a36Sopenharmony_ci			if (stat & OVERFLOW_BIT)
88862306a36Sopenharmony_ci				return false;
88962306a36Sopenharmony_ci			if ((stat & PFRAME_BIT) && !is_intra)
89062306a36Sopenharmony_ci				dequantize_inter(cf->coeffs);
89162306a36Sopenharmony_ci			else
89262306a36Sopenharmony_ci				dequantize_intra(cf->coeffs);
89362306a36Sopenharmony_ci
89462306a36Sopenharmony_ci			ifwht(cf->coeffs, cf->de_fwht,
89562306a36Sopenharmony_ci			      ((stat & PFRAME_BIT) && !is_intra) ? 0 : 1);
89662306a36Sopenharmony_ci
89762306a36Sopenharmony_ci			copies = (stat & DUPS_MASK) >> 1;
89862306a36Sopenharmony_ci			if (copies)
89962306a36Sopenharmony_ci				memcpy(copy, cf->de_fwht, sizeof(copy));
90062306a36Sopenharmony_ci			if ((stat & PFRAME_BIT) && !is_intra)
90162306a36Sopenharmony_ci				add_deltas(cf->de_fwht, refp,
90262306a36Sopenharmony_ci					   ref_stride, ref_step);
90362306a36Sopenharmony_ci			fill_decoder_block(dstp, cf->de_fwht, dst_stride,
90462306a36Sopenharmony_ci					   dst_step);
90562306a36Sopenharmony_ci		}
90662306a36Sopenharmony_ci	}
90762306a36Sopenharmony_ci	return true;
90862306a36Sopenharmony_ci}
90962306a36Sopenharmony_ci
91062306a36Sopenharmony_cibool fwht_decode_frame(struct fwht_cframe *cf, u32 hdr_flags,
91162306a36Sopenharmony_ci		       unsigned int components_num, unsigned int width,
91262306a36Sopenharmony_ci		       unsigned int height, const struct fwht_raw_frame *ref,
91362306a36Sopenharmony_ci		       unsigned int ref_stride, unsigned int ref_chroma_stride,
91462306a36Sopenharmony_ci		       struct fwht_raw_frame *dst, unsigned int dst_stride,
91562306a36Sopenharmony_ci		       unsigned int dst_chroma_stride)
91662306a36Sopenharmony_ci{
91762306a36Sopenharmony_ci	const __be16 *rlco = cf->rlc_data;
91862306a36Sopenharmony_ci	const __be16 *end_of_rlco_buf = cf->rlc_data +
91962306a36Sopenharmony_ci			(cf->size / sizeof(*rlco)) - 1;
92062306a36Sopenharmony_ci
92162306a36Sopenharmony_ci	if (!decode_plane(cf, &rlco, height, width, ref->luma, ref_stride,
92262306a36Sopenharmony_ci			  ref->luma_alpha_step, dst->luma, dst_stride,
92362306a36Sopenharmony_ci			  dst->luma_alpha_step,
92462306a36Sopenharmony_ci			  hdr_flags & V4L2_FWHT_FL_LUMA_IS_UNCOMPRESSED,
92562306a36Sopenharmony_ci			  end_of_rlco_buf))
92662306a36Sopenharmony_ci		return false;
92762306a36Sopenharmony_ci
92862306a36Sopenharmony_ci	if (components_num >= 3) {
92962306a36Sopenharmony_ci		u32 h = height;
93062306a36Sopenharmony_ci		u32 w = width;
93162306a36Sopenharmony_ci
93262306a36Sopenharmony_ci		if (!(hdr_flags & V4L2_FWHT_FL_CHROMA_FULL_HEIGHT))
93362306a36Sopenharmony_ci			h /= 2;
93462306a36Sopenharmony_ci		if (!(hdr_flags & V4L2_FWHT_FL_CHROMA_FULL_WIDTH))
93562306a36Sopenharmony_ci			w /= 2;
93662306a36Sopenharmony_ci
93762306a36Sopenharmony_ci		if (!decode_plane(cf, &rlco, h, w, ref->cb, ref_chroma_stride,
93862306a36Sopenharmony_ci				  ref->chroma_step, dst->cb, dst_chroma_stride,
93962306a36Sopenharmony_ci				  dst->chroma_step,
94062306a36Sopenharmony_ci				  hdr_flags & V4L2_FWHT_FL_CB_IS_UNCOMPRESSED,
94162306a36Sopenharmony_ci				  end_of_rlco_buf))
94262306a36Sopenharmony_ci			return false;
94362306a36Sopenharmony_ci		if (!decode_plane(cf, &rlco, h, w, ref->cr, ref_chroma_stride,
94462306a36Sopenharmony_ci				  ref->chroma_step, dst->cr, dst_chroma_stride,
94562306a36Sopenharmony_ci				  dst->chroma_step,
94662306a36Sopenharmony_ci				  hdr_flags & V4L2_FWHT_FL_CR_IS_UNCOMPRESSED,
94762306a36Sopenharmony_ci				  end_of_rlco_buf))
94862306a36Sopenharmony_ci			return false;
94962306a36Sopenharmony_ci	}
95062306a36Sopenharmony_ci
95162306a36Sopenharmony_ci	if (components_num == 4)
95262306a36Sopenharmony_ci		if (!decode_plane(cf, &rlco, height, width, ref->alpha, ref_stride,
95362306a36Sopenharmony_ci				  ref->luma_alpha_step, dst->alpha, dst_stride,
95462306a36Sopenharmony_ci				  dst->luma_alpha_step,
95562306a36Sopenharmony_ci				  hdr_flags & V4L2_FWHT_FL_ALPHA_IS_UNCOMPRESSED,
95662306a36Sopenharmony_ci				  end_of_rlco_buf))
95762306a36Sopenharmony_ci			return false;
95862306a36Sopenharmony_ci	return true;
95962306a36Sopenharmony_ci}
960