18c2ecf20Sopenharmony_ci// SPDX-License-Identifier: LGPL-2.1+ 28c2ecf20Sopenharmony_ci/* 38c2ecf20Sopenharmony_ci * Copyright 2016 Tom aan de Wiel 48c2ecf20Sopenharmony_ci * Copyright 2018 Cisco Systems, Inc. and/or its affiliates. All rights reserved. 58c2ecf20Sopenharmony_ci * 68c2ecf20Sopenharmony_ci * 8x8 Fast Walsh Hadamard Transform in sequency order based on the paper: 78c2ecf20Sopenharmony_ci * 88c2ecf20Sopenharmony_ci * A Recursive Algorithm for Sequency-Ordered Fast Walsh Transforms, 98c2ecf20Sopenharmony_ci * R.D. Brown, 1977 108c2ecf20Sopenharmony_ci */ 118c2ecf20Sopenharmony_ci 128c2ecf20Sopenharmony_ci#include <linux/string.h> 138c2ecf20Sopenharmony_ci#include <linux/kernel.h> 148c2ecf20Sopenharmony_ci#include "codec-fwht.h" 158c2ecf20Sopenharmony_ci 168c2ecf20Sopenharmony_ci#define OVERFLOW_BIT BIT(14) 178c2ecf20Sopenharmony_ci 188c2ecf20Sopenharmony_ci/* 198c2ecf20Sopenharmony_ci * Note: bit 0 of the header must always be 0. Otherwise it cannot 208c2ecf20Sopenharmony_ci * be guaranteed that the magic 8 byte sequence (see below) can 218c2ecf20Sopenharmony_ci * never occur in the rlc output. 228c2ecf20Sopenharmony_ci */ 238c2ecf20Sopenharmony_ci#define PFRAME_BIT BIT(15) 248c2ecf20Sopenharmony_ci#define DUPS_MASK 0x1ffe 258c2ecf20Sopenharmony_ci 268c2ecf20Sopenharmony_ci#define PBLOCK 0 278c2ecf20Sopenharmony_ci#define IBLOCK 1 288c2ecf20Sopenharmony_ci 298c2ecf20Sopenharmony_ci#define ALL_ZEROS 15 308c2ecf20Sopenharmony_ci 318c2ecf20Sopenharmony_cistatic const uint8_t zigzag[64] = { 328c2ecf20Sopenharmony_ci 0, 338c2ecf20Sopenharmony_ci 1, 8, 348c2ecf20Sopenharmony_ci 2, 9, 16, 358c2ecf20Sopenharmony_ci 3, 10, 17, 24, 368c2ecf20Sopenharmony_ci 4, 11, 18, 25, 32, 378c2ecf20Sopenharmony_ci 5, 12, 19, 26, 33, 40, 388c2ecf20Sopenharmony_ci 6, 13, 20, 27, 34, 41, 48, 398c2ecf20Sopenharmony_ci 7, 14, 21, 28, 35, 42, 49, 56, 408c2ecf20Sopenharmony_ci 15, 22, 29, 36, 43, 50, 57, 418c2ecf20Sopenharmony_ci 23, 30, 37, 44, 51, 58, 428c2ecf20Sopenharmony_ci 31, 38, 45, 52, 59, 438c2ecf20Sopenharmony_ci 39, 46, 53, 60, 448c2ecf20Sopenharmony_ci 47, 54, 61, 458c2ecf20Sopenharmony_ci 55, 62, 468c2ecf20Sopenharmony_ci 63, 478c2ecf20Sopenharmony_ci}; 488c2ecf20Sopenharmony_ci 498c2ecf20Sopenharmony_ci/* 508c2ecf20Sopenharmony_ci * noinline_for_stack to work around 518c2ecf20Sopenharmony_ci * https://bugs.llvm.org/show_bug.cgi?id=38809 528c2ecf20Sopenharmony_ci */ 538c2ecf20Sopenharmony_cistatic int noinline_for_stack 548c2ecf20Sopenharmony_cirlc(const s16 *in, __be16 *output, int blocktype) 558c2ecf20Sopenharmony_ci{ 568c2ecf20Sopenharmony_ci s16 block[8 * 8]; 578c2ecf20Sopenharmony_ci s16 *wp = block; 588c2ecf20Sopenharmony_ci int i = 0; 598c2ecf20Sopenharmony_ci int x, y; 608c2ecf20Sopenharmony_ci int ret = 0; 618c2ecf20Sopenharmony_ci 628c2ecf20Sopenharmony_ci /* read in block from framebuffer */ 638c2ecf20Sopenharmony_ci int lastzero_run = 0; 648c2ecf20Sopenharmony_ci int to_encode; 658c2ecf20Sopenharmony_ci 668c2ecf20Sopenharmony_ci for (y = 0; y < 8; y++) { 678c2ecf20Sopenharmony_ci for (x = 0; x < 8; x++) { 688c2ecf20Sopenharmony_ci *wp = in[x + y * 8]; 698c2ecf20Sopenharmony_ci wp++; 708c2ecf20Sopenharmony_ci } 718c2ecf20Sopenharmony_ci } 728c2ecf20Sopenharmony_ci 738c2ecf20Sopenharmony_ci /* keep track of amount of trailing zeros */ 748c2ecf20Sopenharmony_ci for (i = 63; i >= 0 && !block[zigzag[i]]; i--) 758c2ecf20Sopenharmony_ci lastzero_run++; 768c2ecf20Sopenharmony_ci 778c2ecf20Sopenharmony_ci *output++ = (blocktype == PBLOCK ? htons(PFRAME_BIT) : 0); 788c2ecf20Sopenharmony_ci ret++; 798c2ecf20Sopenharmony_ci 808c2ecf20Sopenharmony_ci to_encode = 8 * 8 - (lastzero_run > 14 ? lastzero_run : 0); 818c2ecf20Sopenharmony_ci 828c2ecf20Sopenharmony_ci i = 0; 838c2ecf20Sopenharmony_ci while (i < to_encode) { 848c2ecf20Sopenharmony_ci int cnt = 0; 858c2ecf20Sopenharmony_ci int tmp; 868c2ecf20Sopenharmony_ci 878c2ecf20Sopenharmony_ci /* count leading zeros */ 888c2ecf20Sopenharmony_ci while ((tmp = block[zigzag[i]]) == 0 && cnt < 14) { 898c2ecf20Sopenharmony_ci cnt++; 908c2ecf20Sopenharmony_ci i++; 918c2ecf20Sopenharmony_ci if (i == to_encode) { 928c2ecf20Sopenharmony_ci cnt--; 938c2ecf20Sopenharmony_ci break; 948c2ecf20Sopenharmony_ci } 958c2ecf20Sopenharmony_ci } 968c2ecf20Sopenharmony_ci /* 4 bits for run, 12 for coefficient (quantization by 4) */ 978c2ecf20Sopenharmony_ci *output++ = htons((cnt | tmp << 4)); 988c2ecf20Sopenharmony_ci i++; 998c2ecf20Sopenharmony_ci ret++; 1008c2ecf20Sopenharmony_ci } 1018c2ecf20Sopenharmony_ci if (lastzero_run > 14) { 1028c2ecf20Sopenharmony_ci *output = htons(ALL_ZEROS | 0); 1038c2ecf20Sopenharmony_ci ret++; 1048c2ecf20Sopenharmony_ci } 1058c2ecf20Sopenharmony_ci 1068c2ecf20Sopenharmony_ci return ret; 1078c2ecf20Sopenharmony_ci} 1088c2ecf20Sopenharmony_ci 1098c2ecf20Sopenharmony_ci/* 1108c2ecf20Sopenharmony_ci * This function will worst-case increase rlc_in by 65*2 bytes: 1118c2ecf20Sopenharmony_ci * one s16 value for the header and 8 * 8 coefficients of type s16. 1128c2ecf20Sopenharmony_ci */ 1138c2ecf20Sopenharmony_cistatic noinline_for_stack u16 1148c2ecf20Sopenharmony_ciderlc(const __be16 **rlc_in, s16 *dwht_out, const __be16 *end_of_input) 1158c2ecf20Sopenharmony_ci{ 1168c2ecf20Sopenharmony_ci /* header */ 1178c2ecf20Sopenharmony_ci const __be16 *input = *rlc_in; 1188c2ecf20Sopenharmony_ci u16 stat; 1198c2ecf20Sopenharmony_ci int dec_count = 0; 1208c2ecf20Sopenharmony_ci s16 block[8 * 8 + 16]; 1218c2ecf20Sopenharmony_ci s16 *wp = block; 1228c2ecf20Sopenharmony_ci int i; 1238c2ecf20Sopenharmony_ci 1248c2ecf20Sopenharmony_ci if (input > end_of_input) 1258c2ecf20Sopenharmony_ci return OVERFLOW_BIT; 1268c2ecf20Sopenharmony_ci stat = ntohs(*input++); 1278c2ecf20Sopenharmony_ci 1288c2ecf20Sopenharmony_ci /* 1298c2ecf20Sopenharmony_ci * Now de-compress, it expands one byte to up to 15 bytes 1308c2ecf20Sopenharmony_ci * (or fills the remainder of the 64 bytes with zeroes if it 1318c2ecf20Sopenharmony_ci * is the last byte to expand). 1328c2ecf20Sopenharmony_ci * 1338c2ecf20Sopenharmony_ci * So block has to be 8 * 8 + 16 bytes, the '+ 16' is to 1348c2ecf20Sopenharmony_ci * allow for overflow if the incoming data was malformed. 1358c2ecf20Sopenharmony_ci */ 1368c2ecf20Sopenharmony_ci while (dec_count < 8 * 8) { 1378c2ecf20Sopenharmony_ci s16 in; 1388c2ecf20Sopenharmony_ci int length; 1398c2ecf20Sopenharmony_ci int coeff; 1408c2ecf20Sopenharmony_ci 1418c2ecf20Sopenharmony_ci if (input > end_of_input) 1428c2ecf20Sopenharmony_ci return OVERFLOW_BIT; 1438c2ecf20Sopenharmony_ci in = ntohs(*input++); 1448c2ecf20Sopenharmony_ci length = in & 0xf; 1458c2ecf20Sopenharmony_ci coeff = in >> 4; 1468c2ecf20Sopenharmony_ci 1478c2ecf20Sopenharmony_ci /* fill remainder with zeros */ 1488c2ecf20Sopenharmony_ci if (length == 15) { 1498c2ecf20Sopenharmony_ci for (i = 0; i < 64 - dec_count; i++) 1508c2ecf20Sopenharmony_ci *wp++ = 0; 1518c2ecf20Sopenharmony_ci break; 1528c2ecf20Sopenharmony_ci } 1538c2ecf20Sopenharmony_ci 1548c2ecf20Sopenharmony_ci for (i = 0; i < length; i++) 1558c2ecf20Sopenharmony_ci *wp++ = 0; 1568c2ecf20Sopenharmony_ci *wp++ = coeff; 1578c2ecf20Sopenharmony_ci dec_count += length + 1; 1588c2ecf20Sopenharmony_ci } 1598c2ecf20Sopenharmony_ci 1608c2ecf20Sopenharmony_ci wp = block; 1618c2ecf20Sopenharmony_ci 1628c2ecf20Sopenharmony_ci for (i = 0; i < 64; i++) { 1638c2ecf20Sopenharmony_ci int pos = zigzag[i]; 1648c2ecf20Sopenharmony_ci int y = pos / 8; 1658c2ecf20Sopenharmony_ci int x = pos % 8; 1668c2ecf20Sopenharmony_ci 1678c2ecf20Sopenharmony_ci dwht_out[x + y * 8] = *wp++; 1688c2ecf20Sopenharmony_ci } 1698c2ecf20Sopenharmony_ci *rlc_in = input; 1708c2ecf20Sopenharmony_ci return stat; 1718c2ecf20Sopenharmony_ci} 1728c2ecf20Sopenharmony_ci 1738c2ecf20Sopenharmony_cistatic const int quant_table[] = { 1748c2ecf20Sopenharmony_ci 2, 2, 2, 2, 2, 2, 2, 2, 1758c2ecf20Sopenharmony_ci 2, 2, 2, 2, 2, 2, 2, 2, 1768c2ecf20Sopenharmony_ci 2, 2, 2, 2, 2, 2, 2, 3, 1778c2ecf20Sopenharmony_ci 2, 2, 2, 2, 2, 2, 3, 6, 1788c2ecf20Sopenharmony_ci 2, 2, 2, 2, 2, 3, 6, 6, 1798c2ecf20Sopenharmony_ci 2, 2, 2, 2, 3, 6, 6, 6, 1808c2ecf20Sopenharmony_ci 2, 2, 2, 3, 6, 6, 6, 6, 1818c2ecf20Sopenharmony_ci 2, 2, 3, 6, 6, 6, 6, 8, 1828c2ecf20Sopenharmony_ci}; 1838c2ecf20Sopenharmony_ci 1848c2ecf20Sopenharmony_cistatic const int quant_table_p[] = { 1858c2ecf20Sopenharmony_ci 3, 3, 3, 3, 3, 3, 3, 3, 1868c2ecf20Sopenharmony_ci 3, 3, 3, 3, 3, 3, 3, 3, 1878c2ecf20Sopenharmony_ci 3, 3, 3, 3, 3, 3, 3, 3, 1888c2ecf20Sopenharmony_ci 3, 3, 3, 3, 3, 3, 3, 6, 1898c2ecf20Sopenharmony_ci 3, 3, 3, 3, 3, 3, 6, 6, 1908c2ecf20Sopenharmony_ci 3, 3, 3, 3, 3, 6, 6, 9, 1918c2ecf20Sopenharmony_ci 3, 3, 3, 3, 6, 6, 9, 9, 1928c2ecf20Sopenharmony_ci 3, 3, 3, 6, 6, 9, 9, 10, 1938c2ecf20Sopenharmony_ci}; 1948c2ecf20Sopenharmony_ci 1958c2ecf20Sopenharmony_cistatic void quantize_intra(s16 *coeff, s16 *de_coeff, u16 qp) 1968c2ecf20Sopenharmony_ci{ 1978c2ecf20Sopenharmony_ci const int *quant = quant_table; 1988c2ecf20Sopenharmony_ci int i, j; 1998c2ecf20Sopenharmony_ci 2008c2ecf20Sopenharmony_ci for (j = 0; j < 8; j++) { 2018c2ecf20Sopenharmony_ci for (i = 0; i < 8; i++, quant++, coeff++, de_coeff++) { 2028c2ecf20Sopenharmony_ci *coeff >>= *quant; 2038c2ecf20Sopenharmony_ci if (*coeff >= -qp && *coeff <= qp) 2048c2ecf20Sopenharmony_ci *coeff = *de_coeff = 0; 2058c2ecf20Sopenharmony_ci else 2068c2ecf20Sopenharmony_ci *de_coeff = *coeff << *quant; 2078c2ecf20Sopenharmony_ci } 2088c2ecf20Sopenharmony_ci } 2098c2ecf20Sopenharmony_ci} 2108c2ecf20Sopenharmony_ci 2118c2ecf20Sopenharmony_cistatic void dequantize_intra(s16 *coeff) 2128c2ecf20Sopenharmony_ci{ 2138c2ecf20Sopenharmony_ci const int *quant = quant_table; 2148c2ecf20Sopenharmony_ci int i, j; 2158c2ecf20Sopenharmony_ci 2168c2ecf20Sopenharmony_ci for (j = 0; j < 8; j++) 2178c2ecf20Sopenharmony_ci for (i = 0; i < 8; i++, quant++, coeff++) 2188c2ecf20Sopenharmony_ci *coeff <<= *quant; 2198c2ecf20Sopenharmony_ci} 2208c2ecf20Sopenharmony_ci 2218c2ecf20Sopenharmony_cistatic void quantize_inter(s16 *coeff, s16 *de_coeff, u16 qp) 2228c2ecf20Sopenharmony_ci{ 2238c2ecf20Sopenharmony_ci const int *quant = quant_table_p; 2248c2ecf20Sopenharmony_ci int i, j; 2258c2ecf20Sopenharmony_ci 2268c2ecf20Sopenharmony_ci for (j = 0; j < 8; j++) { 2278c2ecf20Sopenharmony_ci for (i = 0; i < 8; i++, quant++, coeff++, de_coeff++) { 2288c2ecf20Sopenharmony_ci *coeff >>= *quant; 2298c2ecf20Sopenharmony_ci if (*coeff >= -qp && *coeff <= qp) 2308c2ecf20Sopenharmony_ci *coeff = *de_coeff = 0; 2318c2ecf20Sopenharmony_ci else 2328c2ecf20Sopenharmony_ci *de_coeff = *coeff << *quant; 2338c2ecf20Sopenharmony_ci } 2348c2ecf20Sopenharmony_ci } 2358c2ecf20Sopenharmony_ci} 2368c2ecf20Sopenharmony_ci 2378c2ecf20Sopenharmony_cistatic void dequantize_inter(s16 *coeff) 2388c2ecf20Sopenharmony_ci{ 2398c2ecf20Sopenharmony_ci const int *quant = quant_table_p; 2408c2ecf20Sopenharmony_ci int i, j; 2418c2ecf20Sopenharmony_ci 2428c2ecf20Sopenharmony_ci for (j = 0; j < 8; j++) 2438c2ecf20Sopenharmony_ci for (i = 0; i < 8; i++, quant++, coeff++) 2448c2ecf20Sopenharmony_ci *coeff <<= *quant; 2458c2ecf20Sopenharmony_ci} 2468c2ecf20Sopenharmony_ci 2478c2ecf20Sopenharmony_cistatic void noinline_for_stack fwht(const u8 *block, s16 *output_block, 2488c2ecf20Sopenharmony_ci unsigned int stride, 2498c2ecf20Sopenharmony_ci unsigned int input_step, bool intra) 2508c2ecf20Sopenharmony_ci{ 2518c2ecf20Sopenharmony_ci /* we'll need more than 8 bits for the transformed coefficients */ 2528c2ecf20Sopenharmony_ci s32 workspace1[8], workspace2[8]; 2538c2ecf20Sopenharmony_ci const u8 *tmp = block; 2548c2ecf20Sopenharmony_ci s16 *out = output_block; 2558c2ecf20Sopenharmony_ci int add = intra ? 256 : 0; 2568c2ecf20Sopenharmony_ci unsigned int i; 2578c2ecf20Sopenharmony_ci 2588c2ecf20Sopenharmony_ci /* stage 1 */ 2598c2ecf20Sopenharmony_ci for (i = 0; i < 8; i++, tmp += stride, out += 8) { 2608c2ecf20Sopenharmony_ci switch (input_step) { 2618c2ecf20Sopenharmony_ci case 1: 2628c2ecf20Sopenharmony_ci workspace1[0] = tmp[0] + tmp[1] - add; 2638c2ecf20Sopenharmony_ci workspace1[1] = tmp[0] - tmp[1]; 2648c2ecf20Sopenharmony_ci 2658c2ecf20Sopenharmony_ci workspace1[2] = tmp[2] + tmp[3] - add; 2668c2ecf20Sopenharmony_ci workspace1[3] = tmp[2] - tmp[3]; 2678c2ecf20Sopenharmony_ci 2688c2ecf20Sopenharmony_ci workspace1[4] = tmp[4] + tmp[5] - add; 2698c2ecf20Sopenharmony_ci workspace1[5] = tmp[4] - tmp[5]; 2708c2ecf20Sopenharmony_ci 2718c2ecf20Sopenharmony_ci workspace1[6] = tmp[6] + tmp[7] - add; 2728c2ecf20Sopenharmony_ci workspace1[7] = tmp[6] - tmp[7]; 2738c2ecf20Sopenharmony_ci break; 2748c2ecf20Sopenharmony_ci case 2: 2758c2ecf20Sopenharmony_ci workspace1[0] = tmp[0] + tmp[2] - add; 2768c2ecf20Sopenharmony_ci workspace1[1] = tmp[0] - tmp[2]; 2778c2ecf20Sopenharmony_ci 2788c2ecf20Sopenharmony_ci workspace1[2] = tmp[4] + tmp[6] - add; 2798c2ecf20Sopenharmony_ci workspace1[3] = tmp[4] - tmp[6]; 2808c2ecf20Sopenharmony_ci 2818c2ecf20Sopenharmony_ci workspace1[4] = tmp[8] + tmp[10] - add; 2828c2ecf20Sopenharmony_ci workspace1[5] = tmp[8] - tmp[10]; 2838c2ecf20Sopenharmony_ci 2848c2ecf20Sopenharmony_ci workspace1[6] = tmp[12] + tmp[14] - add; 2858c2ecf20Sopenharmony_ci workspace1[7] = tmp[12] - tmp[14]; 2868c2ecf20Sopenharmony_ci break; 2878c2ecf20Sopenharmony_ci case 3: 2888c2ecf20Sopenharmony_ci workspace1[0] = tmp[0] + tmp[3] - add; 2898c2ecf20Sopenharmony_ci workspace1[1] = tmp[0] - tmp[3]; 2908c2ecf20Sopenharmony_ci 2918c2ecf20Sopenharmony_ci workspace1[2] = tmp[6] + tmp[9] - add; 2928c2ecf20Sopenharmony_ci workspace1[3] = tmp[6] - tmp[9]; 2938c2ecf20Sopenharmony_ci 2948c2ecf20Sopenharmony_ci workspace1[4] = tmp[12] + tmp[15] - add; 2958c2ecf20Sopenharmony_ci workspace1[5] = tmp[12] - tmp[15]; 2968c2ecf20Sopenharmony_ci 2978c2ecf20Sopenharmony_ci workspace1[6] = tmp[18] + tmp[21] - add; 2988c2ecf20Sopenharmony_ci workspace1[7] = tmp[18] - tmp[21]; 2998c2ecf20Sopenharmony_ci break; 3008c2ecf20Sopenharmony_ci default: 3018c2ecf20Sopenharmony_ci workspace1[0] = tmp[0] + tmp[4] - add; 3028c2ecf20Sopenharmony_ci workspace1[1] = tmp[0] - tmp[4]; 3038c2ecf20Sopenharmony_ci 3048c2ecf20Sopenharmony_ci workspace1[2] = tmp[8] + tmp[12] - add; 3058c2ecf20Sopenharmony_ci workspace1[3] = tmp[8] - tmp[12]; 3068c2ecf20Sopenharmony_ci 3078c2ecf20Sopenharmony_ci workspace1[4] = tmp[16] + tmp[20] - add; 3088c2ecf20Sopenharmony_ci workspace1[5] = tmp[16] - tmp[20]; 3098c2ecf20Sopenharmony_ci 3108c2ecf20Sopenharmony_ci workspace1[6] = tmp[24] + tmp[28] - add; 3118c2ecf20Sopenharmony_ci workspace1[7] = tmp[24] - tmp[28]; 3128c2ecf20Sopenharmony_ci break; 3138c2ecf20Sopenharmony_ci } 3148c2ecf20Sopenharmony_ci 3158c2ecf20Sopenharmony_ci /* stage 2 */ 3168c2ecf20Sopenharmony_ci workspace2[0] = workspace1[0] + workspace1[2]; 3178c2ecf20Sopenharmony_ci workspace2[1] = workspace1[0] - workspace1[2]; 3188c2ecf20Sopenharmony_ci workspace2[2] = workspace1[1] - workspace1[3]; 3198c2ecf20Sopenharmony_ci workspace2[3] = workspace1[1] + workspace1[3]; 3208c2ecf20Sopenharmony_ci 3218c2ecf20Sopenharmony_ci workspace2[4] = workspace1[4] + workspace1[6]; 3228c2ecf20Sopenharmony_ci workspace2[5] = workspace1[4] - workspace1[6]; 3238c2ecf20Sopenharmony_ci workspace2[6] = workspace1[5] - workspace1[7]; 3248c2ecf20Sopenharmony_ci workspace2[7] = workspace1[5] + workspace1[7]; 3258c2ecf20Sopenharmony_ci 3268c2ecf20Sopenharmony_ci /* stage 3 */ 3278c2ecf20Sopenharmony_ci out[0] = workspace2[0] + workspace2[4]; 3288c2ecf20Sopenharmony_ci out[1] = workspace2[0] - workspace2[4]; 3298c2ecf20Sopenharmony_ci out[2] = workspace2[1] - workspace2[5]; 3308c2ecf20Sopenharmony_ci out[3] = workspace2[1] + workspace2[5]; 3318c2ecf20Sopenharmony_ci out[4] = workspace2[2] + workspace2[6]; 3328c2ecf20Sopenharmony_ci out[5] = workspace2[2] - workspace2[6]; 3338c2ecf20Sopenharmony_ci out[6] = workspace2[3] - workspace2[7]; 3348c2ecf20Sopenharmony_ci out[7] = workspace2[3] + workspace2[7]; 3358c2ecf20Sopenharmony_ci } 3368c2ecf20Sopenharmony_ci 3378c2ecf20Sopenharmony_ci out = output_block; 3388c2ecf20Sopenharmony_ci 3398c2ecf20Sopenharmony_ci for (i = 0; i < 8; i++, out++) { 3408c2ecf20Sopenharmony_ci /* stage 1 */ 3418c2ecf20Sopenharmony_ci workspace1[0] = out[0] + out[1 * 8]; 3428c2ecf20Sopenharmony_ci workspace1[1] = out[0] - out[1 * 8]; 3438c2ecf20Sopenharmony_ci 3448c2ecf20Sopenharmony_ci workspace1[2] = out[2 * 8] + out[3 * 8]; 3458c2ecf20Sopenharmony_ci workspace1[3] = out[2 * 8] - out[3 * 8]; 3468c2ecf20Sopenharmony_ci 3478c2ecf20Sopenharmony_ci workspace1[4] = out[4 * 8] + out[5 * 8]; 3488c2ecf20Sopenharmony_ci workspace1[5] = out[4 * 8] - out[5 * 8]; 3498c2ecf20Sopenharmony_ci 3508c2ecf20Sopenharmony_ci workspace1[6] = out[6 * 8] + out[7 * 8]; 3518c2ecf20Sopenharmony_ci workspace1[7] = out[6 * 8] - out[7 * 8]; 3528c2ecf20Sopenharmony_ci 3538c2ecf20Sopenharmony_ci /* stage 2 */ 3548c2ecf20Sopenharmony_ci workspace2[0] = workspace1[0] + workspace1[2]; 3558c2ecf20Sopenharmony_ci workspace2[1] = workspace1[0] - workspace1[2]; 3568c2ecf20Sopenharmony_ci workspace2[2] = workspace1[1] - workspace1[3]; 3578c2ecf20Sopenharmony_ci workspace2[3] = workspace1[1] + workspace1[3]; 3588c2ecf20Sopenharmony_ci 3598c2ecf20Sopenharmony_ci workspace2[4] = workspace1[4] + workspace1[6]; 3608c2ecf20Sopenharmony_ci workspace2[5] = workspace1[4] - workspace1[6]; 3618c2ecf20Sopenharmony_ci workspace2[6] = workspace1[5] - workspace1[7]; 3628c2ecf20Sopenharmony_ci workspace2[7] = workspace1[5] + workspace1[7]; 3638c2ecf20Sopenharmony_ci /* stage 3 */ 3648c2ecf20Sopenharmony_ci out[0 * 8] = workspace2[0] + workspace2[4]; 3658c2ecf20Sopenharmony_ci out[1 * 8] = workspace2[0] - workspace2[4]; 3668c2ecf20Sopenharmony_ci out[2 * 8] = workspace2[1] - workspace2[5]; 3678c2ecf20Sopenharmony_ci out[3 * 8] = workspace2[1] + workspace2[5]; 3688c2ecf20Sopenharmony_ci out[4 * 8] = workspace2[2] + workspace2[6]; 3698c2ecf20Sopenharmony_ci out[5 * 8] = workspace2[2] - workspace2[6]; 3708c2ecf20Sopenharmony_ci out[6 * 8] = workspace2[3] - workspace2[7]; 3718c2ecf20Sopenharmony_ci out[7 * 8] = workspace2[3] + workspace2[7]; 3728c2ecf20Sopenharmony_ci } 3738c2ecf20Sopenharmony_ci} 3748c2ecf20Sopenharmony_ci 3758c2ecf20Sopenharmony_ci/* 3768c2ecf20Sopenharmony_ci * Not the nicest way of doing it, but P-blocks get twice the range of 3778c2ecf20Sopenharmony_ci * that of the I-blocks. Therefore we need a type bigger than 8 bits. 3788c2ecf20Sopenharmony_ci * Furthermore values can be negative... This is just a version that 3798c2ecf20Sopenharmony_ci * works with 16 signed data 3808c2ecf20Sopenharmony_ci */ 3818c2ecf20Sopenharmony_cistatic void noinline_for_stack 3828c2ecf20Sopenharmony_cifwht16(const s16 *block, s16 *output_block, int stride, int intra) 3838c2ecf20Sopenharmony_ci{ 3848c2ecf20Sopenharmony_ci /* we'll need more than 8 bits for the transformed coefficients */ 3858c2ecf20Sopenharmony_ci s32 workspace1[8], workspace2[8]; 3868c2ecf20Sopenharmony_ci const s16 *tmp = block; 3878c2ecf20Sopenharmony_ci s16 *out = output_block; 3888c2ecf20Sopenharmony_ci int i; 3898c2ecf20Sopenharmony_ci 3908c2ecf20Sopenharmony_ci for (i = 0; i < 8; i++, tmp += stride, out += 8) { 3918c2ecf20Sopenharmony_ci /* stage 1 */ 3928c2ecf20Sopenharmony_ci workspace1[0] = tmp[0] + tmp[1]; 3938c2ecf20Sopenharmony_ci workspace1[1] = tmp[0] - tmp[1]; 3948c2ecf20Sopenharmony_ci 3958c2ecf20Sopenharmony_ci workspace1[2] = tmp[2] + tmp[3]; 3968c2ecf20Sopenharmony_ci workspace1[3] = tmp[2] - tmp[3]; 3978c2ecf20Sopenharmony_ci 3988c2ecf20Sopenharmony_ci workspace1[4] = tmp[4] + tmp[5]; 3998c2ecf20Sopenharmony_ci workspace1[5] = tmp[4] - tmp[5]; 4008c2ecf20Sopenharmony_ci 4018c2ecf20Sopenharmony_ci workspace1[6] = tmp[6] + tmp[7]; 4028c2ecf20Sopenharmony_ci workspace1[7] = tmp[6] - tmp[7]; 4038c2ecf20Sopenharmony_ci 4048c2ecf20Sopenharmony_ci /* stage 2 */ 4058c2ecf20Sopenharmony_ci workspace2[0] = workspace1[0] + workspace1[2]; 4068c2ecf20Sopenharmony_ci workspace2[1] = workspace1[0] - workspace1[2]; 4078c2ecf20Sopenharmony_ci workspace2[2] = workspace1[1] - workspace1[3]; 4088c2ecf20Sopenharmony_ci workspace2[3] = workspace1[1] + workspace1[3]; 4098c2ecf20Sopenharmony_ci 4108c2ecf20Sopenharmony_ci workspace2[4] = workspace1[4] + workspace1[6]; 4118c2ecf20Sopenharmony_ci workspace2[5] = workspace1[4] - workspace1[6]; 4128c2ecf20Sopenharmony_ci workspace2[6] = workspace1[5] - workspace1[7]; 4138c2ecf20Sopenharmony_ci workspace2[7] = workspace1[5] + workspace1[7]; 4148c2ecf20Sopenharmony_ci 4158c2ecf20Sopenharmony_ci /* stage 3 */ 4168c2ecf20Sopenharmony_ci out[0] = workspace2[0] + workspace2[4]; 4178c2ecf20Sopenharmony_ci out[1] = workspace2[0] - workspace2[4]; 4188c2ecf20Sopenharmony_ci out[2] = workspace2[1] - workspace2[5]; 4198c2ecf20Sopenharmony_ci out[3] = workspace2[1] + workspace2[5]; 4208c2ecf20Sopenharmony_ci out[4] = workspace2[2] + workspace2[6]; 4218c2ecf20Sopenharmony_ci out[5] = workspace2[2] - workspace2[6]; 4228c2ecf20Sopenharmony_ci out[6] = workspace2[3] - workspace2[7]; 4238c2ecf20Sopenharmony_ci out[7] = workspace2[3] + workspace2[7]; 4248c2ecf20Sopenharmony_ci } 4258c2ecf20Sopenharmony_ci 4268c2ecf20Sopenharmony_ci out = output_block; 4278c2ecf20Sopenharmony_ci 4288c2ecf20Sopenharmony_ci for (i = 0; i < 8; i++, out++) { 4298c2ecf20Sopenharmony_ci /* stage 1 */ 4308c2ecf20Sopenharmony_ci workspace1[0] = out[0] + out[1*8]; 4318c2ecf20Sopenharmony_ci workspace1[1] = out[0] - out[1*8]; 4328c2ecf20Sopenharmony_ci 4338c2ecf20Sopenharmony_ci workspace1[2] = out[2*8] + out[3*8]; 4348c2ecf20Sopenharmony_ci workspace1[3] = out[2*8] - out[3*8]; 4358c2ecf20Sopenharmony_ci 4368c2ecf20Sopenharmony_ci workspace1[4] = out[4*8] + out[5*8]; 4378c2ecf20Sopenharmony_ci workspace1[5] = out[4*8] - out[5*8]; 4388c2ecf20Sopenharmony_ci 4398c2ecf20Sopenharmony_ci workspace1[6] = out[6*8] + out[7*8]; 4408c2ecf20Sopenharmony_ci workspace1[7] = out[6*8] - out[7*8]; 4418c2ecf20Sopenharmony_ci 4428c2ecf20Sopenharmony_ci /* stage 2 */ 4438c2ecf20Sopenharmony_ci workspace2[0] = workspace1[0] + workspace1[2]; 4448c2ecf20Sopenharmony_ci workspace2[1] = workspace1[0] - workspace1[2]; 4458c2ecf20Sopenharmony_ci workspace2[2] = workspace1[1] - workspace1[3]; 4468c2ecf20Sopenharmony_ci workspace2[3] = workspace1[1] + workspace1[3]; 4478c2ecf20Sopenharmony_ci 4488c2ecf20Sopenharmony_ci workspace2[4] = workspace1[4] + workspace1[6]; 4498c2ecf20Sopenharmony_ci workspace2[5] = workspace1[4] - workspace1[6]; 4508c2ecf20Sopenharmony_ci workspace2[6] = workspace1[5] - workspace1[7]; 4518c2ecf20Sopenharmony_ci workspace2[7] = workspace1[5] + workspace1[7]; 4528c2ecf20Sopenharmony_ci 4538c2ecf20Sopenharmony_ci /* stage 3 */ 4548c2ecf20Sopenharmony_ci out[0*8] = workspace2[0] + workspace2[4]; 4558c2ecf20Sopenharmony_ci out[1*8] = workspace2[0] - workspace2[4]; 4568c2ecf20Sopenharmony_ci out[2*8] = workspace2[1] - workspace2[5]; 4578c2ecf20Sopenharmony_ci out[3*8] = workspace2[1] + workspace2[5]; 4588c2ecf20Sopenharmony_ci out[4*8] = workspace2[2] + workspace2[6]; 4598c2ecf20Sopenharmony_ci out[5*8] = workspace2[2] - workspace2[6]; 4608c2ecf20Sopenharmony_ci out[6*8] = workspace2[3] - workspace2[7]; 4618c2ecf20Sopenharmony_ci out[7*8] = workspace2[3] + workspace2[7]; 4628c2ecf20Sopenharmony_ci } 4638c2ecf20Sopenharmony_ci} 4648c2ecf20Sopenharmony_ci 4658c2ecf20Sopenharmony_cistatic noinline_for_stack void 4668c2ecf20Sopenharmony_ciifwht(const s16 *block, s16 *output_block, int intra) 4678c2ecf20Sopenharmony_ci{ 4688c2ecf20Sopenharmony_ci /* 4698c2ecf20Sopenharmony_ci * we'll need more than 8 bits for the transformed coefficients 4708c2ecf20Sopenharmony_ci * use native unit of cpu 4718c2ecf20Sopenharmony_ci */ 4728c2ecf20Sopenharmony_ci int workspace1[8], workspace2[8]; 4738c2ecf20Sopenharmony_ci int inter = intra ? 0 : 1; 4748c2ecf20Sopenharmony_ci const s16 *tmp = block; 4758c2ecf20Sopenharmony_ci s16 *out = output_block; 4768c2ecf20Sopenharmony_ci int i; 4778c2ecf20Sopenharmony_ci 4788c2ecf20Sopenharmony_ci for (i = 0; i < 8; i++, tmp += 8, out += 8) { 4798c2ecf20Sopenharmony_ci /* stage 1 */ 4808c2ecf20Sopenharmony_ci workspace1[0] = tmp[0] + tmp[1]; 4818c2ecf20Sopenharmony_ci workspace1[1] = tmp[0] - tmp[1]; 4828c2ecf20Sopenharmony_ci 4838c2ecf20Sopenharmony_ci workspace1[2] = tmp[2] + tmp[3]; 4848c2ecf20Sopenharmony_ci workspace1[3] = tmp[2] - tmp[3]; 4858c2ecf20Sopenharmony_ci 4868c2ecf20Sopenharmony_ci workspace1[4] = tmp[4] + tmp[5]; 4878c2ecf20Sopenharmony_ci workspace1[5] = tmp[4] - tmp[5]; 4888c2ecf20Sopenharmony_ci 4898c2ecf20Sopenharmony_ci workspace1[6] = tmp[6] + tmp[7]; 4908c2ecf20Sopenharmony_ci workspace1[7] = tmp[6] - tmp[7]; 4918c2ecf20Sopenharmony_ci 4928c2ecf20Sopenharmony_ci /* stage 2 */ 4938c2ecf20Sopenharmony_ci workspace2[0] = workspace1[0] + workspace1[2]; 4948c2ecf20Sopenharmony_ci workspace2[1] = workspace1[0] - workspace1[2]; 4958c2ecf20Sopenharmony_ci workspace2[2] = workspace1[1] - workspace1[3]; 4968c2ecf20Sopenharmony_ci workspace2[3] = workspace1[1] + workspace1[3]; 4978c2ecf20Sopenharmony_ci 4988c2ecf20Sopenharmony_ci workspace2[4] = workspace1[4] + workspace1[6]; 4998c2ecf20Sopenharmony_ci workspace2[5] = workspace1[4] - workspace1[6]; 5008c2ecf20Sopenharmony_ci workspace2[6] = workspace1[5] - workspace1[7]; 5018c2ecf20Sopenharmony_ci workspace2[7] = workspace1[5] + workspace1[7]; 5028c2ecf20Sopenharmony_ci 5038c2ecf20Sopenharmony_ci /* stage 3 */ 5048c2ecf20Sopenharmony_ci out[0] = workspace2[0] + workspace2[4]; 5058c2ecf20Sopenharmony_ci out[1] = workspace2[0] - workspace2[4]; 5068c2ecf20Sopenharmony_ci out[2] = workspace2[1] - workspace2[5]; 5078c2ecf20Sopenharmony_ci out[3] = workspace2[1] + workspace2[5]; 5088c2ecf20Sopenharmony_ci out[4] = workspace2[2] + workspace2[6]; 5098c2ecf20Sopenharmony_ci out[5] = workspace2[2] - workspace2[6]; 5108c2ecf20Sopenharmony_ci out[6] = workspace2[3] - workspace2[7]; 5118c2ecf20Sopenharmony_ci out[7] = workspace2[3] + workspace2[7]; 5128c2ecf20Sopenharmony_ci } 5138c2ecf20Sopenharmony_ci 5148c2ecf20Sopenharmony_ci out = output_block; 5158c2ecf20Sopenharmony_ci 5168c2ecf20Sopenharmony_ci for (i = 0; i < 8; i++, out++) { 5178c2ecf20Sopenharmony_ci /* stage 1 */ 5188c2ecf20Sopenharmony_ci workspace1[0] = out[0] + out[1 * 8]; 5198c2ecf20Sopenharmony_ci workspace1[1] = out[0] - out[1 * 8]; 5208c2ecf20Sopenharmony_ci 5218c2ecf20Sopenharmony_ci workspace1[2] = out[2 * 8] + out[3 * 8]; 5228c2ecf20Sopenharmony_ci workspace1[3] = out[2 * 8] - out[3 * 8]; 5238c2ecf20Sopenharmony_ci 5248c2ecf20Sopenharmony_ci workspace1[4] = out[4 * 8] + out[5 * 8]; 5258c2ecf20Sopenharmony_ci workspace1[5] = out[4 * 8] - out[5 * 8]; 5268c2ecf20Sopenharmony_ci 5278c2ecf20Sopenharmony_ci workspace1[6] = out[6 * 8] + out[7 * 8]; 5288c2ecf20Sopenharmony_ci workspace1[7] = out[6 * 8] - out[7 * 8]; 5298c2ecf20Sopenharmony_ci 5308c2ecf20Sopenharmony_ci /* stage 2 */ 5318c2ecf20Sopenharmony_ci workspace2[0] = workspace1[0] + workspace1[2]; 5328c2ecf20Sopenharmony_ci workspace2[1] = workspace1[0] - workspace1[2]; 5338c2ecf20Sopenharmony_ci workspace2[2] = workspace1[1] - workspace1[3]; 5348c2ecf20Sopenharmony_ci workspace2[3] = workspace1[1] + workspace1[3]; 5358c2ecf20Sopenharmony_ci 5368c2ecf20Sopenharmony_ci workspace2[4] = workspace1[4] + workspace1[6]; 5378c2ecf20Sopenharmony_ci workspace2[5] = workspace1[4] - workspace1[6]; 5388c2ecf20Sopenharmony_ci workspace2[6] = workspace1[5] - workspace1[7]; 5398c2ecf20Sopenharmony_ci workspace2[7] = workspace1[5] + workspace1[7]; 5408c2ecf20Sopenharmony_ci 5418c2ecf20Sopenharmony_ci /* stage 3 */ 5428c2ecf20Sopenharmony_ci if (inter) { 5438c2ecf20Sopenharmony_ci int d; 5448c2ecf20Sopenharmony_ci 5458c2ecf20Sopenharmony_ci out[0 * 8] = workspace2[0] + workspace2[4]; 5468c2ecf20Sopenharmony_ci out[1 * 8] = workspace2[0] - workspace2[4]; 5478c2ecf20Sopenharmony_ci out[2 * 8] = workspace2[1] - workspace2[5]; 5488c2ecf20Sopenharmony_ci out[3 * 8] = workspace2[1] + workspace2[5]; 5498c2ecf20Sopenharmony_ci out[4 * 8] = workspace2[2] + workspace2[6]; 5508c2ecf20Sopenharmony_ci out[5 * 8] = workspace2[2] - workspace2[6]; 5518c2ecf20Sopenharmony_ci out[6 * 8] = workspace2[3] - workspace2[7]; 5528c2ecf20Sopenharmony_ci out[7 * 8] = workspace2[3] + workspace2[7]; 5538c2ecf20Sopenharmony_ci 5548c2ecf20Sopenharmony_ci for (d = 0; d < 8; d++) 5558c2ecf20Sopenharmony_ci out[8 * d] >>= 6; 5568c2ecf20Sopenharmony_ci } else { 5578c2ecf20Sopenharmony_ci int d; 5588c2ecf20Sopenharmony_ci 5598c2ecf20Sopenharmony_ci out[0 * 8] = workspace2[0] + workspace2[4]; 5608c2ecf20Sopenharmony_ci out[1 * 8] = workspace2[0] - workspace2[4]; 5618c2ecf20Sopenharmony_ci out[2 * 8] = workspace2[1] - workspace2[5]; 5628c2ecf20Sopenharmony_ci out[3 * 8] = workspace2[1] + workspace2[5]; 5638c2ecf20Sopenharmony_ci out[4 * 8] = workspace2[2] + workspace2[6]; 5648c2ecf20Sopenharmony_ci out[5 * 8] = workspace2[2] - workspace2[6]; 5658c2ecf20Sopenharmony_ci out[6 * 8] = workspace2[3] - workspace2[7]; 5668c2ecf20Sopenharmony_ci out[7 * 8] = workspace2[3] + workspace2[7]; 5678c2ecf20Sopenharmony_ci 5688c2ecf20Sopenharmony_ci for (d = 0; d < 8; d++) { 5698c2ecf20Sopenharmony_ci out[8 * d] >>= 6; 5708c2ecf20Sopenharmony_ci out[8 * d] += 128; 5718c2ecf20Sopenharmony_ci } 5728c2ecf20Sopenharmony_ci } 5738c2ecf20Sopenharmony_ci } 5748c2ecf20Sopenharmony_ci} 5758c2ecf20Sopenharmony_ci 5768c2ecf20Sopenharmony_cistatic void fill_encoder_block(const u8 *input, s16 *dst, 5778c2ecf20Sopenharmony_ci unsigned int stride, unsigned int input_step) 5788c2ecf20Sopenharmony_ci{ 5798c2ecf20Sopenharmony_ci int i, j; 5808c2ecf20Sopenharmony_ci 5818c2ecf20Sopenharmony_ci for (i = 0; i < 8; i++) { 5828c2ecf20Sopenharmony_ci for (j = 0; j < 8; j++, input += input_step) 5838c2ecf20Sopenharmony_ci *dst++ = *input; 5848c2ecf20Sopenharmony_ci input += stride - 8 * input_step; 5858c2ecf20Sopenharmony_ci } 5868c2ecf20Sopenharmony_ci} 5878c2ecf20Sopenharmony_ci 5888c2ecf20Sopenharmony_cistatic int var_intra(const s16 *input) 5898c2ecf20Sopenharmony_ci{ 5908c2ecf20Sopenharmony_ci int32_t mean = 0; 5918c2ecf20Sopenharmony_ci int32_t ret = 0; 5928c2ecf20Sopenharmony_ci const s16 *tmp = input; 5938c2ecf20Sopenharmony_ci int i; 5948c2ecf20Sopenharmony_ci 5958c2ecf20Sopenharmony_ci for (i = 0; i < 8 * 8; i++, tmp++) 5968c2ecf20Sopenharmony_ci mean += *tmp; 5978c2ecf20Sopenharmony_ci mean /= 64; 5988c2ecf20Sopenharmony_ci tmp = input; 5998c2ecf20Sopenharmony_ci for (i = 0; i < 8 * 8; i++, tmp++) 6008c2ecf20Sopenharmony_ci ret += (*tmp - mean) < 0 ? -(*tmp - mean) : (*tmp - mean); 6018c2ecf20Sopenharmony_ci return ret; 6028c2ecf20Sopenharmony_ci} 6038c2ecf20Sopenharmony_ci 6048c2ecf20Sopenharmony_cistatic int var_inter(const s16 *old, const s16 *new) 6058c2ecf20Sopenharmony_ci{ 6068c2ecf20Sopenharmony_ci int32_t ret = 0; 6078c2ecf20Sopenharmony_ci int i; 6088c2ecf20Sopenharmony_ci 6098c2ecf20Sopenharmony_ci for (i = 0; i < 8 * 8; i++, old++, new++) 6108c2ecf20Sopenharmony_ci ret += (*old - *new) < 0 ? -(*old - *new) : (*old - *new); 6118c2ecf20Sopenharmony_ci return ret; 6128c2ecf20Sopenharmony_ci} 6138c2ecf20Sopenharmony_ci 6148c2ecf20Sopenharmony_cistatic noinline_for_stack int 6158c2ecf20Sopenharmony_cidecide_blocktype(const u8 *cur, const u8 *reference, s16 *deltablock, 6168c2ecf20Sopenharmony_ci unsigned int stride, unsigned int input_step) 6178c2ecf20Sopenharmony_ci{ 6188c2ecf20Sopenharmony_ci s16 tmp[64]; 6198c2ecf20Sopenharmony_ci s16 old[64]; 6208c2ecf20Sopenharmony_ci s16 *work = tmp; 6218c2ecf20Sopenharmony_ci unsigned int k, l; 6228c2ecf20Sopenharmony_ci int vari; 6238c2ecf20Sopenharmony_ci int vard; 6248c2ecf20Sopenharmony_ci 6258c2ecf20Sopenharmony_ci fill_encoder_block(cur, tmp, stride, input_step); 6268c2ecf20Sopenharmony_ci fill_encoder_block(reference, old, 8, 1); 6278c2ecf20Sopenharmony_ci vari = var_intra(tmp); 6288c2ecf20Sopenharmony_ci 6298c2ecf20Sopenharmony_ci for (k = 0; k < 8; k++) { 6308c2ecf20Sopenharmony_ci for (l = 0; l < 8; l++) { 6318c2ecf20Sopenharmony_ci *deltablock = *work - *reference; 6328c2ecf20Sopenharmony_ci deltablock++; 6338c2ecf20Sopenharmony_ci work++; 6348c2ecf20Sopenharmony_ci reference++; 6358c2ecf20Sopenharmony_ci } 6368c2ecf20Sopenharmony_ci } 6378c2ecf20Sopenharmony_ci deltablock -= 64; 6388c2ecf20Sopenharmony_ci vard = var_inter(old, tmp); 6398c2ecf20Sopenharmony_ci return vari <= vard ? IBLOCK : PBLOCK; 6408c2ecf20Sopenharmony_ci} 6418c2ecf20Sopenharmony_ci 6428c2ecf20Sopenharmony_cistatic void fill_decoder_block(u8 *dst, const s16 *input, int stride, 6438c2ecf20Sopenharmony_ci unsigned int dst_step) 6448c2ecf20Sopenharmony_ci{ 6458c2ecf20Sopenharmony_ci int i, j; 6468c2ecf20Sopenharmony_ci 6478c2ecf20Sopenharmony_ci for (i = 0; i < 8; i++) { 6488c2ecf20Sopenharmony_ci for (j = 0; j < 8; j++, input++, dst += dst_step) { 6498c2ecf20Sopenharmony_ci if (*input < 0) 6508c2ecf20Sopenharmony_ci *dst = 0; 6518c2ecf20Sopenharmony_ci else if (*input > 255) 6528c2ecf20Sopenharmony_ci *dst = 255; 6538c2ecf20Sopenharmony_ci else 6548c2ecf20Sopenharmony_ci *dst = *input; 6558c2ecf20Sopenharmony_ci } 6568c2ecf20Sopenharmony_ci dst += stride - (8 * dst_step); 6578c2ecf20Sopenharmony_ci } 6588c2ecf20Sopenharmony_ci} 6598c2ecf20Sopenharmony_ci 6608c2ecf20Sopenharmony_cistatic void add_deltas(s16 *deltas, const u8 *ref, int stride, 6618c2ecf20Sopenharmony_ci unsigned int ref_step) 6628c2ecf20Sopenharmony_ci{ 6638c2ecf20Sopenharmony_ci int k, l; 6648c2ecf20Sopenharmony_ci 6658c2ecf20Sopenharmony_ci for (k = 0; k < 8; k++) { 6668c2ecf20Sopenharmony_ci for (l = 0; l < 8; l++) { 6678c2ecf20Sopenharmony_ci *deltas += *ref; 6688c2ecf20Sopenharmony_ci ref += ref_step; 6698c2ecf20Sopenharmony_ci /* 6708c2ecf20Sopenharmony_ci * Due to quantizing, it might possible that the 6718c2ecf20Sopenharmony_ci * decoded coefficients are slightly out of range 6728c2ecf20Sopenharmony_ci */ 6738c2ecf20Sopenharmony_ci if (*deltas < 0) 6748c2ecf20Sopenharmony_ci *deltas = 0; 6758c2ecf20Sopenharmony_ci else if (*deltas > 255) 6768c2ecf20Sopenharmony_ci *deltas = 255; 6778c2ecf20Sopenharmony_ci deltas++; 6788c2ecf20Sopenharmony_ci } 6798c2ecf20Sopenharmony_ci ref += stride - (8 * ref_step); 6808c2ecf20Sopenharmony_ci } 6818c2ecf20Sopenharmony_ci} 6828c2ecf20Sopenharmony_ci 6838c2ecf20Sopenharmony_cistatic u32 encode_plane(u8 *input, u8 *refp, __be16 **rlco, __be16 *rlco_max, 6848c2ecf20Sopenharmony_ci struct fwht_cframe *cf, u32 height, u32 width, 6858c2ecf20Sopenharmony_ci u32 stride, unsigned int input_step, 6868c2ecf20Sopenharmony_ci bool is_intra, bool next_is_intra) 6878c2ecf20Sopenharmony_ci{ 6888c2ecf20Sopenharmony_ci u8 *input_start = input; 6898c2ecf20Sopenharmony_ci __be16 *rlco_start = *rlco; 6908c2ecf20Sopenharmony_ci s16 deltablock[64]; 6918c2ecf20Sopenharmony_ci __be16 pframe_bit = htons(PFRAME_BIT); 6928c2ecf20Sopenharmony_ci u32 encoding = 0; 6938c2ecf20Sopenharmony_ci unsigned int last_size = 0; 6948c2ecf20Sopenharmony_ci unsigned int i, j; 6958c2ecf20Sopenharmony_ci 6968c2ecf20Sopenharmony_ci width = round_up(width, 8); 6978c2ecf20Sopenharmony_ci height = round_up(height, 8); 6988c2ecf20Sopenharmony_ci 6998c2ecf20Sopenharmony_ci for (j = 0; j < height / 8; j++) { 7008c2ecf20Sopenharmony_ci input = input_start + j * 8 * stride; 7018c2ecf20Sopenharmony_ci for (i = 0; i < width / 8; i++) { 7028c2ecf20Sopenharmony_ci /* intra code, first frame is always intra coded. */ 7038c2ecf20Sopenharmony_ci int blocktype = IBLOCK; 7048c2ecf20Sopenharmony_ci unsigned int size; 7058c2ecf20Sopenharmony_ci 7068c2ecf20Sopenharmony_ci if (!is_intra) 7078c2ecf20Sopenharmony_ci blocktype = decide_blocktype(input, refp, 7088c2ecf20Sopenharmony_ci deltablock, stride, input_step); 7098c2ecf20Sopenharmony_ci if (blocktype == IBLOCK) { 7108c2ecf20Sopenharmony_ci fwht(input, cf->coeffs, stride, input_step, 1); 7118c2ecf20Sopenharmony_ci quantize_intra(cf->coeffs, cf->de_coeffs, 7128c2ecf20Sopenharmony_ci cf->i_frame_qp); 7138c2ecf20Sopenharmony_ci } else { 7148c2ecf20Sopenharmony_ci /* inter code */ 7158c2ecf20Sopenharmony_ci encoding |= FWHT_FRAME_PCODED; 7168c2ecf20Sopenharmony_ci fwht16(deltablock, cf->coeffs, 8, 0); 7178c2ecf20Sopenharmony_ci quantize_inter(cf->coeffs, cf->de_coeffs, 7188c2ecf20Sopenharmony_ci cf->p_frame_qp); 7198c2ecf20Sopenharmony_ci } 7208c2ecf20Sopenharmony_ci if (!next_is_intra) { 7218c2ecf20Sopenharmony_ci ifwht(cf->de_coeffs, cf->de_fwht, blocktype); 7228c2ecf20Sopenharmony_ci 7238c2ecf20Sopenharmony_ci if (blocktype == PBLOCK) 7248c2ecf20Sopenharmony_ci add_deltas(cf->de_fwht, refp, 8, 1); 7258c2ecf20Sopenharmony_ci fill_decoder_block(refp, cf->de_fwht, 8, 1); 7268c2ecf20Sopenharmony_ci } 7278c2ecf20Sopenharmony_ci 7288c2ecf20Sopenharmony_ci input += 8 * input_step; 7298c2ecf20Sopenharmony_ci refp += 8 * 8; 7308c2ecf20Sopenharmony_ci 7318c2ecf20Sopenharmony_ci size = rlc(cf->coeffs, *rlco, blocktype); 7328c2ecf20Sopenharmony_ci if (last_size == size && 7338c2ecf20Sopenharmony_ci !memcmp(*rlco + 1, *rlco - size + 1, 2 * size - 2)) { 7348c2ecf20Sopenharmony_ci __be16 *last_rlco = *rlco - size; 7358c2ecf20Sopenharmony_ci s16 hdr = ntohs(*last_rlco); 7368c2ecf20Sopenharmony_ci 7378c2ecf20Sopenharmony_ci if (!((*last_rlco ^ **rlco) & pframe_bit) && 7388c2ecf20Sopenharmony_ci (hdr & DUPS_MASK) < DUPS_MASK) 7398c2ecf20Sopenharmony_ci *last_rlco = htons(hdr + 2); 7408c2ecf20Sopenharmony_ci else 7418c2ecf20Sopenharmony_ci *rlco += size; 7428c2ecf20Sopenharmony_ci } else { 7438c2ecf20Sopenharmony_ci *rlco += size; 7448c2ecf20Sopenharmony_ci } 7458c2ecf20Sopenharmony_ci if (*rlco >= rlco_max) { 7468c2ecf20Sopenharmony_ci encoding |= FWHT_FRAME_UNENCODED; 7478c2ecf20Sopenharmony_ci goto exit_loop; 7488c2ecf20Sopenharmony_ci } 7498c2ecf20Sopenharmony_ci last_size = size; 7508c2ecf20Sopenharmony_ci } 7518c2ecf20Sopenharmony_ci } 7528c2ecf20Sopenharmony_ci 7538c2ecf20Sopenharmony_ciexit_loop: 7548c2ecf20Sopenharmony_ci if (encoding & FWHT_FRAME_UNENCODED) { 7558c2ecf20Sopenharmony_ci u8 *out = (u8 *)rlco_start; 7568c2ecf20Sopenharmony_ci u8 *p; 7578c2ecf20Sopenharmony_ci 7588c2ecf20Sopenharmony_ci input = input_start; 7598c2ecf20Sopenharmony_ci /* 7608c2ecf20Sopenharmony_ci * The compressed stream should never contain the magic 7618c2ecf20Sopenharmony_ci * header, so when we copy the YUV data we replace 0xff 7628c2ecf20Sopenharmony_ci * by 0xfe. Since YUV is limited range such values 7638c2ecf20Sopenharmony_ci * shouldn't appear anyway. 7648c2ecf20Sopenharmony_ci */ 7658c2ecf20Sopenharmony_ci for (j = 0; j < height; j++) { 7668c2ecf20Sopenharmony_ci for (i = 0, p = input; i < width; i++, p += input_step) 7678c2ecf20Sopenharmony_ci *out++ = (*p == 0xff) ? 0xfe : *p; 7688c2ecf20Sopenharmony_ci input += stride; 7698c2ecf20Sopenharmony_ci } 7708c2ecf20Sopenharmony_ci *rlco = (__be16 *)out; 7718c2ecf20Sopenharmony_ci encoding &= ~FWHT_FRAME_PCODED; 7728c2ecf20Sopenharmony_ci } 7738c2ecf20Sopenharmony_ci return encoding; 7748c2ecf20Sopenharmony_ci} 7758c2ecf20Sopenharmony_ci 7768c2ecf20Sopenharmony_ciu32 fwht_encode_frame(struct fwht_raw_frame *frm, 7778c2ecf20Sopenharmony_ci struct fwht_raw_frame *ref_frm, 7788c2ecf20Sopenharmony_ci struct fwht_cframe *cf, 7798c2ecf20Sopenharmony_ci bool is_intra, bool next_is_intra, 7808c2ecf20Sopenharmony_ci unsigned int width, unsigned int height, 7818c2ecf20Sopenharmony_ci unsigned int stride, unsigned int chroma_stride) 7828c2ecf20Sopenharmony_ci{ 7838c2ecf20Sopenharmony_ci unsigned int size = height * width; 7848c2ecf20Sopenharmony_ci __be16 *rlco = cf->rlc_data; 7858c2ecf20Sopenharmony_ci __be16 *rlco_max; 7868c2ecf20Sopenharmony_ci u32 encoding; 7878c2ecf20Sopenharmony_ci 7888c2ecf20Sopenharmony_ci rlco_max = rlco + size / 2 - 256; 7898c2ecf20Sopenharmony_ci encoding = encode_plane(frm->luma, ref_frm->luma, &rlco, rlco_max, cf, 7908c2ecf20Sopenharmony_ci height, width, stride, 7918c2ecf20Sopenharmony_ci frm->luma_alpha_step, is_intra, next_is_intra); 7928c2ecf20Sopenharmony_ci if (encoding & FWHT_FRAME_UNENCODED) 7938c2ecf20Sopenharmony_ci encoding |= FWHT_LUMA_UNENCODED; 7948c2ecf20Sopenharmony_ci encoding &= ~FWHT_FRAME_UNENCODED; 7958c2ecf20Sopenharmony_ci 7968c2ecf20Sopenharmony_ci if (frm->components_num >= 3) { 7978c2ecf20Sopenharmony_ci u32 chroma_h = height / frm->height_div; 7988c2ecf20Sopenharmony_ci u32 chroma_w = width / frm->width_div; 7998c2ecf20Sopenharmony_ci unsigned int chroma_size = chroma_h * chroma_w; 8008c2ecf20Sopenharmony_ci 8018c2ecf20Sopenharmony_ci rlco_max = rlco + chroma_size / 2 - 256; 8028c2ecf20Sopenharmony_ci encoding |= encode_plane(frm->cb, ref_frm->cb, &rlco, rlco_max, 8038c2ecf20Sopenharmony_ci cf, chroma_h, chroma_w, 8048c2ecf20Sopenharmony_ci chroma_stride, frm->chroma_step, 8058c2ecf20Sopenharmony_ci is_intra, next_is_intra); 8068c2ecf20Sopenharmony_ci if (encoding & FWHT_FRAME_UNENCODED) 8078c2ecf20Sopenharmony_ci encoding |= FWHT_CB_UNENCODED; 8088c2ecf20Sopenharmony_ci encoding &= ~FWHT_FRAME_UNENCODED; 8098c2ecf20Sopenharmony_ci rlco_max = rlco + chroma_size / 2 - 256; 8108c2ecf20Sopenharmony_ci encoding |= encode_plane(frm->cr, ref_frm->cr, &rlco, rlco_max, 8118c2ecf20Sopenharmony_ci cf, chroma_h, chroma_w, 8128c2ecf20Sopenharmony_ci chroma_stride, frm->chroma_step, 8138c2ecf20Sopenharmony_ci is_intra, next_is_intra); 8148c2ecf20Sopenharmony_ci if (encoding & FWHT_FRAME_UNENCODED) 8158c2ecf20Sopenharmony_ci encoding |= FWHT_CR_UNENCODED; 8168c2ecf20Sopenharmony_ci encoding &= ~FWHT_FRAME_UNENCODED; 8178c2ecf20Sopenharmony_ci } 8188c2ecf20Sopenharmony_ci 8198c2ecf20Sopenharmony_ci if (frm->components_num == 4) { 8208c2ecf20Sopenharmony_ci rlco_max = rlco + size / 2 - 256; 8218c2ecf20Sopenharmony_ci encoding |= encode_plane(frm->alpha, ref_frm->alpha, &rlco, 8228c2ecf20Sopenharmony_ci rlco_max, cf, height, width, 8238c2ecf20Sopenharmony_ci stride, frm->luma_alpha_step, 8248c2ecf20Sopenharmony_ci is_intra, next_is_intra); 8258c2ecf20Sopenharmony_ci if (encoding & FWHT_FRAME_UNENCODED) 8268c2ecf20Sopenharmony_ci encoding |= FWHT_ALPHA_UNENCODED; 8278c2ecf20Sopenharmony_ci encoding &= ~FWHT_FRAME_UNENCODED; 8288c2ecf20Sopenharmony_ci } 8298c2ecf20Sopenharmony_ci 8308c2ecf20Sopenharmony_ci cf->size = (rlco - cf->rlc_data) * sizeof(*rlco); 8318c2ecf20Sopenharmony_ci return encoding; 8328c2ecf20Sopenharmony_ci} 8338c2ecf20Sopenharmony_ci 8348c2ecf20Sopenharmony_cistatic bool decode_plane(struct fwht_cframe *cf, const __be16 **rlco, 8358c2ecf20Sopenharmony_ci u32 height, u32 width, const u8 *ref, u32 ref_stride, 8368c2ecf20Sopenharmony_ci unsigned int ref_step, u8 *dst, 8378c2ecf20Sopenharmony_ci unsigned int dst_stride, unsigned int dst_step, 8388c2ecf20Sopenharmony_ci bool uncompressed, const __be16 *end_of_rlco_buf) 8398c2ecf20Sopenharmony_ci{ 8408c2ecf20Sopenharmony_ci unsigned int copies = 0; 8418c2ecf20Sopenharmony_ci s16 copy[8 * 8]; 8428c2ecf20Sopenharmony_ci u16 stat; 8438c2ecf20Sopenharmony_ci unsigned int i, j; 8448c2ecf20Sopenharmony_ci bool is_intra = !ref; 8458c2ecf20Sopenharmony_ci 8468c2ecf20Sopenharmony_ci width = round_up(width, 8); 8478c2ecf20Sopenharmony_ci height = round_up(height, 8); 8488c2ecf20Sopenharmony_ci 8498c2ecf20Sopenharmony_ci if (uncompressed) { 8508c2ecf20Sopenharmony_ci int i; 8518c2ecf20Sopenharmony_ci 8528c2ecf20Sopenharmony_ci if (end_of_rlco_buf + 1 < *rlco + width * height / 2) 8538c2ecf20Sopenharmony_ci return false; 8548c2ecf20Sopenharmony_ci for (i = 0; i < height; i++) { 8558c2ecf20Sopenharmony_ci memcpy(dst, *rlco, width); 8568c2ecf20Sopenharmony_ci dst += dst_stride; 8578c2ecf20Sopenharmony_ci *rlco += width / 2; 8588c2ecf20Sopenharmony_ci } 8598c2ecf20Sopenharmony_ci return true; 8608c2ecf20Sopenharmony_ci } 8618c2ecf20Sopenharmony_ci 8628c2ecf20Sopenharmony_ci /* 8638c2ecf20Sopenharmony_ci * When decoding each macroblock the rlco pointer will be increased 8648c2ecf20Sopenharmony_ci * by 65 * 2 bytes worst-case. 8658c2ecf20Sopenharmony_ci * To avoid overflow the buffer has to be 65/64th of the actual raw 8668c2ecf20Sopenharmony_ci * image size, just in case someone feeds it malicious data. 8678c2ecf20Sopenharmony_ci */ 8688c2ecf20Sopenharmony_ci for (j = 0; j < height / 8; j++) { 8698c2ecf20Sopenharmony_ci for (i = 0; i < width / 8; i++) { 8708c2ecf20Sopenharmony_ci const u8 *refp = ref + j * 8 * ref_stride + 8718c2ecf20Sopenharmony_ci i * 8 * ref_step; 8728c2ecf20Sopenharmony_ci u8 *dstp = dst + j * 8 * dst_stride + i * 8 * dst_step; 8738c2ecf20Sopenharmony_ci 8748c2ecf20Sopenharmony_ci if (copies) { 8758c2ecf20Sopenharmony_ci memcpy(cf->de_fwht, copy, sizeof(copy)); 8768c2ecf20Sopenharmony_ci if ((stat & PFRAME_BIT) && !is_intra) 8778c2ecf20Sopenharmony_ci add_deltas(cf->de_fwht, refp, 8788c2ecf20Sopenharmony_ci ref_stride, ref_step); 8798c2ecf20Sopenharmony_ci fill_decoder_block(dstp, cf->de_fwht, 8808c2ecf20Sopenharmony_ci dst_stride, dst_step); 8818c2ecf20Sopenharmony_ci copies--; 8828c2ecf20Sopenharmony_ci continue; 8838c2ecf20Sopenharmony_ci } 8848c2ecf20Sopenharmony_ci 8858c2ecf20Sopenharmony_ci stat = derlc(rlco, cf->coeffs, end_of_rlco_buf); 8868c2ecf20Sopenharmony_ci if (stat & OVERFLOW_BIT) 8878c2ecf20Sopenharmony_ci return false; 8888c2ecf20Sopenharmony_ci if ((stat & PFRAME_BIT) && !is_intra) 8898c2ecf20Sopenharmony_ci dequantize_inter(cf->coeffs); 8908c2ecf20Sopenharmony_ci else 8918c2ecf20Sopenharmony_ci dequantize_intra(cf->coeffs); 8928c2ecf20Sopenharmony_ci 8938c2ecf20Sopenharmony_ci ifwht(cf->coeffs, cf->de_fwht, 8948c2ecf20Sopenharmony_ci ((stat & PFRAME_BIT) && !is_intra) ? 0 : 1); 8958c2ecf20Sopenharmony_ci 8968c2ecf20Sopenharmony_ci copies = (stat & DUPS_MASK) >> 1; 8978c2ecf20Sopenharmony_ci if (copies) 8988c2ecf20Sopenharmony_ci memcpy(copy, cf->de_fwht, sizeof(copy)); 8998c2ecf20Sopenharmony_ci if ((stat & PFRAME_BIT) && !is_intra) 9008c2ecf20Sopenharmony_ci add_deltas(cf->de_fwht, refp, 9018c2ecf20Sopenharmony_ci ref_stride, ref_step); 9028c2ecf20Sopenharmony_ci fill_decoder_block(dstp, cf->de_fwht, dst_stride, 9038c2ecf20Sopenharmony_ci dst_step); 9048c2ecf20Sopenharmony_ci } 9058c2ecf20Sopenharmony_ci } 9068c2ecf20Sopenharmony_ci return true; 9078c2ecf20Sopenharmony_ci} 9088c2ecf20Sopenharmony_ci 9098c2ecf20Sopenharmony_cibool fwht_decode_frame(struct fwht_cframe *cf, u32 hdr_flags, 9108c2ecf20Sopenharmony_ci unsigned int components_num, unsigned int width, 9118c2ecf20Sopenharmony_ci unsigned int height, const struct fwht_raw_frame *ref, 9128c2ecf20Sopenharmony_ci unsigned int ref_stride, unsigned int ref_chroma_stride, 9138c2ecf20Sopenharmony_ci struct fwht_raw_frame *dst, unsigned int dst_stride, 9148c2ecf20Sopenharmony_ci unsigned int dst_chroma_stride) 9158c2ecf20Sopenharmony_ci{ 9168c2ecf20Sopenharmony_ci const __be16 *rlco = cf->rlc_data; 9178c2ecf20Sopenharmony_ci const __be16 *end_of_rlco_buf = cf->rlc_data + 9188c2ecf20Sopenharmony_ci (cf->size / sizeof(*rlco)) - 1; 9198c2ecf20Sopenharmony_ci 9208c2ecf20Sopenharmony_ci if (!decode_plane(cf, &rlco, height, width, ref->luma, ref_stride, 9218c2ecf20Sopenharmony_ci ref->luma_alpha_step, dst->luma, dst_stride, 9228c2ecf20Sopenharmony_ci dst->luma_alpha_step, 9238c2ecf20Sopenharmony_ci hdr_flags & FWHT_FL_LUMA_IS_UNCOMPRESSED, 9248c2ecf20Sopenharmony_ci end_of_rlco_buf)) 9258c2ecf20Sopenharmony_ci return false; 9268c2ecf20Sopenharmony_ci 9278c2ecf20Sopenharmony_ci if (components_num >= 3) { 9288c2ecf20Sopenharmony_ci u32 h = height; 9298c2ecf20Sopenharmony_ci u32 w = width; 9308c2ecf20Sopenharmony_ci 9318c2ecf20Sopenharmony_ci if (!(hdr_flags & FWHT_FL_CHROMA_FULL_HEIGHT)) 9328c2ecf20Sopenharmony_ci h /= 2; 9338c2ecf20Sopenharmony_ci if (!(hdr_flags & FWHT_FL_CHROMA_FULL_WIDTH)) 9348c2ecf20Sopenharmony_ci w /= 2; 9358c2ecf20Sopenharmony_ci 9368c2ecf20Sopenharmony_ci if (!decode_plane(cf, &rlco, h, w, ref->cb, ref_chroma_stride, 9378c2ecf20Sopenharmony_ci ref->chroma_step, dst->cb, dst_chroma_stride, 9388c2ecf20Sopenharmony_ci dst->chroma_step, 9398c2ecf20Sopenharmony_ci hdr_flags & FWHT_FL_CB_IS_UNCOMPRESSED, 9408c2ecf20Sopenharmony_ci end_of_rlco_buf)) 9418c2ecf20Sopenharmony_ci return false; 9428c2ecf20Sopenharmony_ci if (!decode_plane(cf, &rlco, h, w, ref->cr, ref_chroma_stride, 9438c2ecf20Sopenharmony_ci ref->chroma_step, dst->cr, dst_chroma_stride, 9448c2ecf20Sopenharmony_ci dst->chroma_step, 9458c2ecf20Sopenharmony_ci hdr_flags & FWHT_FL_CR_IS_UNCOMPRESSED, 9468c2ecf20Sopenharmony_ci end_of_rlco_buf)) 9478c2ecf20Sopenharmony_ci return false; 9488c2ecf20Sopenharmony_ci } 9498c2ecf20Sopenharmony_ci 9508c2ecf20Sopenharmony_ci if (components_num == 4) 9518c2ecf20Sopenharmony_ci if (!decode_plane(cf, &rlco, height, width, ref->alpha, ref_stride, 9528c2ecf20Sopenharmony_ci ref->luma_alpha_step, dst->alpha, dst_stride, 9538c2ecf20Sopenharmony_ci dst->luma_alpha_step, 9548c2ecf20Sopenharmony_ci hdr_flags & FWHT_FL_ALPHA_IS_UNCOMPRESSED, 9558c2ecf20Sopenharmony_ci end_of_rlco_buf)) 9568c2ecf20Sopenharmony_ci return false; 9578c2ecf20Sopenharmony_ci return true; 9588c2ecf20Sopenharmony_ci} 959