1cb93a386Sopenharmony_ci/* 2cb93a386Sopenharmony_ci * jquanti-neon.c - sample data conversion and quantization (Arm Neon) 3cb93a386Sopenharmony_ci * 4cb93a386Sopenharmony_ci * Copyright (C) 2020, Arm Limited. All Rights Reserved. 5cb93a386Sopenharmony_ci * 6cb93a386Sopenharmony_ci * This software is provided 'as-is', without any express or implied 7cb93a386Sopenharmony_ci * warranty. In no event will the authors be held liable for any damages 8cb93a386Sopenharmony_ci * arising from the use of this software. 9cb93a386Sopenharmony_ci * 10cb93a386Sopenharmony_ci * Permission is granted to anyone to use this software for any purpose, 11cb93a386Sopenharmony_ci * including commercial applications, and to alter it and redistribute it 12cb93a386Sopenharmony_ci * freely, subject to the following restrictions: 13cb93a386Sopenharmony_ci * 14cb93a386Sopenharmony_ci * 1. The origin of this software must not be misrepresented; you must not 15cb93a386Sopenharmony_ci * claim that you wrote the original software. If you use this software 16cb93a386Sopenharmony_ci * in a product, an acknowledgment in the product documentation would be 17cb93a386Sopenharmony_ci * appreciated but is not required. 18cb93a386Sopenharmony_ci * 2. Altered source versions must be plainly marked as such, and must not be 19cb93a386Sopenharmony_ci * misrepresented as being the original software. 20cb93a386Sopenharmony_ci * 3. This notice may not be removed or altered from any source distribution. 21cb93a386Sopenharmony_ci */ 22cb93a386Sopenharmony_ci 23cb93a386Sopenharmony_ci#define JPEG_INTERNALS 24cb93a386Sopenharmony_ci#include "../../jinclude.h" 25cb93a386Sopenharmony_ci#include "../../jpeglib.h" 26cb93a386Sopenharmony_ci#include "../../jsimd.h" 27cb93a386Sopenharmony_ci#include "../../jdct.h" 28cb93a386Sopenharmony_ci#include "../../jsimddct.h" 29cb93a386Sopenharmony_ci#include "../jsimd.h" 30cb93a386Sopenharmony_ci 31cb93a386Sopenharmony_ci#include <arm_neon.h> 32cb93a386Sopenharmony_ci 33cb93a386Sopenharmony_ci 34cb93a386Sopenharmony_ci/* After downsampling, the resulting sample values are in the range [0, 255], 35cb93a386Sopenharmony_ci * but the Discrete Cosine Transform (DCT) operates on values centered around 36cb93a386Sopenharmony_ci * 0. 37cb93a386Sopenharmony_ci * 38cb93a386Sopenharmony_ci * To prepare sample values for the DCT, load samples into a DCT workspace, 39cb93a386Sopenharmony_ci * subtracting CENTERJSAMPLE (128). The samples, now in the range [-128, 127], 40cb93a386Sopenharmony_ci * are also widened from 8- to 16-bit. 41cb93a386Sopenharmony_ci * 42cb93a386Sopenharmony_ci * The equivalent scalar C function convsamp() can be found in jcdctmgr.c. 43cb93a386Sopenharmony_ci */ 44cb93a386Sopenharmony_ci 45cb93a386Sopenharmony_civoid jsimd_convsamp_neon(JSAMPARRAY sample_data, JDIMENSION start_col, 46cb93a386Sopenharmony_ci DCTELEM *workspace) 47cb93a386Sopenharmony_ci{ 48cb93a386Sopenharmony_ci uint8x8_t samp_row0 = vld1_u8(sample_data[0] + start_col); 49cb93a386Sopenharmony_ci uint8x8_t samp_row1 = vld1_u8(sample_data[1] + start_col); 50cb93a386Sopenharmony_ci uint8x8_t samp_row2 = vld1_u8(sample_data[2] + start_col); 51cb93a386Sopenharmony_ci uint8x8_t samp_row3 = vld1_u8(sample_data[3] + start_col); 52cb93a386Sopenharmony_ci uint8x8_t samp_row4 = vld1_u8(sample_data[4] + start_col); 53cb93a386Sopenharmony_ci uint8x8_t samp_row5 = vld1_u8(sample_data[5] + start_col); 54cb93a386Sopenharmony_ci uint8x8_t samp_row6 = vld1_u8(sample_data[6] + start_col); 55cb93a386Sopenharmony_ci uint8x8_t samp_row7 = vld1_u8(sample_data[7] + start_col); 56cb93a386Sopenharmony_ci 57cb93a386Sopenharmony_ci int16x8_t row0 = 58cb93a386Sopenharmony_ci vreinterpretq_s16_u16(vsubl_u8(samp_row0, vdup_n_u8(CENTERJSAMPLE))); 59cb93a386Sopenharmony_ci int16x8_t row1 = 60cb93a386Sopenharmony_ci vreinterpretq_s16_u16(vsubl_u8(samp_row1, vdup_n_u8(CENTERJSAMPLE))); 61cb93a386Sopenharmony_ci int16x8_t row2 = 62cb93a386Sopenharmony_ci vreinterpretq_s16_u16(vsubl_u8(samp_row2, vdup_n_u8(CENTERJSAMPLE))); 63cb93a386Sopenharmony_ci int16x8_t row3 = 64cb93a386Sopenharmony_ci vreinterpretq_s16_u16(vsubl_u8(samp_row3, vdup_n_u8(CENTERJSAMPLE))); 65cb93a386Sopenharmony_ci int16x8_t row4 = 66cb93a386Sopenharmony_ci vreinterpretq_s16_u16(vsubl_u8(samp_row4, vdup_n_u8(CENTERJSAMPLE))); 67cb93a386Sopenharmony_ci int16x8_t row5 = 68cb93a386Sopenharmony_ci vreinterpretq_s16_u16(vsubl_u8(samp_row5, vdup_n_u8(CENTERJSAMPLE))); 69cb93a386Sopenharmony_ci int16x8_t row6 = 70cb93a386Sopenharmony_ci vreinterpretq_s16_u16(vsubl_u8(samp_row6, vdup_n_u8(CENTERJSAMPLE))); 71cb93a386Sopenharmony_ci int16x8_t row7 = 72cb93a386Sopenharmony_ci vreinterpretq_s16_u16(vsubl_u8(samp_row7, vdup_n_u8(CENTERJSAMPLE))); 73cb93a386Sopenharmony_ci 74cb93a386Sopenharmony_ci vst1q_s16(workspace + 0 * DCTSIZE, row0); 75cb93a386Sopenharmony_ci vst1q_s16(workspace + 1 * DCTSIZE, row1); 76cb93a386Sopenharmony_ci vst1q_s16(workspace + 2 * DCTSIZE, row2); 77cb93a386Sopenharmony_ci vst1q_s16(workspace + 3 * DCTSIZE, row3); 78cb93a386Sopenharmony_ci vst1q_s16(workspace + 4 * DCTSIZE, row4); 79cb93a386Sopenharmony_ci vst1q_s16(workspace + 5 * DCTSIZE, row5); 80cb93a386Sopenharmony_ci vst1q_s16(workspace + 6 * DCTSIZE, row6); 81cb93a386Sopenharmony_ci vst1q_s16(workspace + 7 * DCTSIZE, row7); 82cb93a386Sopenharmony_ci} 83cb93a386Sopenharmony_ci 84cb93a386Sopenharmony_ci 85cb93a386Sopenharmony_ci/* After the DCT, the resulting array of coefficient values needs to be divided 86cb93a386Sopenharmony_ci * by an array of quantization values. 87cb93a386Sopenharmony_ci * 88cb93a386Sopenharmony_ci * To avoid a slow division operation, the DCT coefficients are multiplied by 89cb93a386Sopenharmony_ci * the (scaled) reciprocals of the quantization values and then right-shifted. 90cb93a386Sopenharmony_ci * 91cb93a386Sopenharmony_ci * The equivalent scalar C function quantize() can be found in jcdctmgr.c. 92cb93a386Sopenharmony_ci */ 93cb93a386Sopenharmony_ci 94cb93a386Sopenharmony_civoid jsimd_quantize_neon(JCOEFPTR coef_block, DCTELEM *divisors, 95cb93a386Sopenharmony_ci DCTELEM *workspace) 96cb93a386Sopenharmony_ci{ 97cb93a386Sopenharmony_ci JCOEFPTR out_ptr = coef_block; 98cb93a386Sopenharmony_ci UDCTELEM *recip_ptr = (UDCTELEM *)divisors; 99cb93a386Sopenharmony_ci UDCTELEM *corr_ptr = (UDCTELEM *)divisors + DCTSIZE2; 100cb93a386Sopenharmony_ci DCTELEM *shift_ptr = divisors + 3 * DCTSIZE2; 101cb93a386Sopenharmony_ci int i; 102cb93a386Sopenharmony_ci 103cb93a386Sopenharmony_ci for (i = 0; i < DCTSIZE; i += DCTSIZE / 2) { 104cb93a386Sopenharmony_ci /* Load DCT coefficients. */ 105cb93a386Sopenharmony_ci int16x8_t row0 = vld1q_s16(workspace + (i + 0) * DCTSIZE); 106cb93a386Sopenharmony_ci int16x8_t row1 = vld1q_s16(workspace + (i + 1) * DCTSIZE); 107cb93a386Sopenharmony_ci int16x8_t row2 = vld1q_s16(workspace + (i + 2) * DCTSIZE); 108cb93a386Sopenharmony_ci int16x8_t row3 = vld1q_s16(workspace + (i + 3) * DCTSIZE); 109cb93a386Sopenharmony_ci /* Load reciprocals of quantization values. */ 110cb93a386Sopenharmony_ci uint16x8_t recip0 = vld1q_u16(recip_ptr + (i + 0) * DCTSIZE); 111cb93a386Sopenharmony_ci uint16x8_t recip1 = vld1q_u16(recip_ptr + (i + 1) * DCTSIZE); 112cb93a386Sopenharmony_ci uint16x8_t recip2 = vld1q_u16(recip_ptr + (i + 2) * DCTSIZE); 113cb93a386Sopenharmony_ci uint16x8_t recip3 = vld1q_u16(recip_ptr + (i + 3) * DCTSIZE); 114cb93a386Sopenharmony_ci uint16x8_t corr0 = vld1q_u16(corr_ptr + (i + 0) * DCTSIZE); 115cb93a386Sopenharmony_ci uint16x8_t corr1 = vld1q_u16(corr_ptr + (i + 1) * DCTSIZE); 116cb93a386Sopenharmony_ci uint16x8_t corr2 = vld1q_u16(corr_ptr + (i + 2) * DCTSIZE); 117cb93a386Sopenharmony_ci uint16x8_t corr3 = vld1q_u16(corr_ptr + (i + 3) * DCTSIZE); 118cb93a386Sopenharmony_ci int16x8_t shift0 = vld1q_s16(shift_ptr + (i + 0) * DCTSIZE); 119cb93a386Sopenharmony_ci int16x8_t shift1 = vld1q_s16(shift_ptr + (i + 1) * DCTSIZE); 120cb93a386Sopenharmony_ci int16x8_t shift2 = vld1q_s16(shift_ptr + (i + 2) * DCTSIZE); 121cb93a386Sopenharmony_ci int16x8_t shift3 = vld1q_s16(shift_ptr + (i + 3) * DCTSIZE); 122cb93a386Sopenharmony_ci 123cb93a386Sopenharmony_ci /* Extract sign from coefficients. */ 124cb93a386Sopenharmony_ci int16x8_t sign_row0 = vshrq_n_s16(row0, 15); 125cb93a386Sopenharmony_ci int16x8_t sign_row1 = vshrq_n_s16(row1, 15); 126cb93a386Sopenharmony_ci int16x8_t sign_row2 = vshrq_n_s16(row2, 15); 127cb93a386Sopenharmony_ci int16x8_t sign_row3 = vshrq_n_s16(row3, 15); 128cb93a386Sopenharmony_ci /* Get absolute value of DCT coefficients. */ 129cb93a386Sopenharmony_ci uint16x8_t abs_row0 = vreinterpretq_u16_s16(vabsq_s16(row0)); 130cb93a386Sopenharmony_ci uint16x8_t abs_row1 = vreinterpretq_u16_s16(vabsq_s16(row1)); 131cb93a386Sopenharmony_ci uint16x8_t abs_row2 = vreinterpretq_u16_s16(vabsq_s16(row2)); 132cb93a386Sopenharmony_ci uint16x8_t abs_row3 = vreinterpretq_u16_s16(vabsq_s16(row3)); 133cb93a386Sopenharmony_ci /* Add correction. */ 134cb93a386Sopenharmony_ci abs_row0 = vaddq_u16(abs_row0, corr0); 135cb93a386Sopenharmony_ci abs_row1 = vaddq_u16(abs_row1, corr1); 136cb93a386Sopenharmony_ci abs_row2 = vaddq_u16(abs_row2, corr2); 137cb93a386Sopenharmony_ci abs_row3 = vaddq_u16(abs_row3, corr3); 138cb93a386Sopenharmony_ci 139cb93a386Sopenharmony_ci /* Multiply DCT coefficients by quantization reciprocals. */ 140cb93a386Sopenharmony_ci int32x4_t row0_l = vreinterpretq_s32_u32(vmull_u16(vget_low_u16(abs_row0), 141cb93a386Sopenharmony_ci vget_low_u16(recip0))); 142cb93a386Sopenharmony_ci int32x4_t row0_h = vreinterpretq_s32_u32(vmull_u16(vget_high_u16(abs_row0), 143cb93a386Sopenharmony_ci vget_high_u16(recip0))); 144cb93a386Sopenharmony_ci int32x4_t row1_l = vreinterpretq_s32_u32(vmull_u16(vget_low_u16(abs_row1), 145cb93a386Sopenharmony_ci vget_low_u16(recip1))); 146cb93a386Sopenharmony_ci int32x4_t row1_h = vreinterpretq_s32_u32(vmull_u16(vget_high_u16(abs_row1), 147cb93a386Sopenharmony_ci vget_high_u16(recip1))); 148cb93a386Sopenharmony_ci int32x4_t row2_l = vreinterpretq_s32_u32(vmull_u16(vget_low_u16(abs_row2), 149cb93a386Sopenharmony_ci vget_low_u16(recip2))); 150cb93a386Sopenharmony_ci int32x4_t row2_h = vreinterpretq_s32_u32(vmull_u16(vget_high_u16(abs_row2), 151cb93a386Sopenharmony_ci vget_high_u16(recip2))); 152cb93a386Sopenharmony_ci int32x4_t row3_l = vreinterpretq_s32_u32(vmull_u16(vget_low_u16(abs_row3), 153cb93a386Sopenharmony_ci vget_low_u16(recip3))); 154cb93a386Sopenharmony_ci int32x4_t row3_h = vreinterpretq_s32_u32(vmull_u16(vget_high_u16(abs_row3), 155cb93a386Sopenharmony_ci vget_high_u16(recip3))); 156cb93a386Sopenharmony_ci /* Narrow back to 16-bit. */ 157cb93a386Sopenharmony_ci row0 = vcombine_s16(vshrn_n_s32(row0_l, 16), vshrn_n_s32(row0_h, 16)); 158cb93a386Sopenharmony_ci row1 = vcombine_s16(vshrn_n_s32(row1_l, 16), vshrn_n_s32(row1_h, 16)); 159cb93a386Sopenharmony_ci row2 = vcombine_s16(vshrn_n_s32(row2_l, 16), vshrn_n_s32(row2_h, 16)); 160cb93a386Sopenharmony_ci row3 = vcombine_s16(vshrn_n_s32(row3_l, 16), vshrn_n_s32(row3_h, 16)); 161cb93a386Sopenharmony_ci 162cb93a386Sopenharmony_ci /* Since VSHR only supports an immediate as its second argument, negate the 163cb93a386Sopenharmony_ci * shift value and shift left. 164cb93a386Sopenharmony_ci */ 165cb93a386Sopenharmony_ci row0 = vreinterpretq_s16_u16(vshlq_u16(vreinterpretq_u16_s16(row0), 166cb93a386Sopenharmony_ci vnegq_s16(shift0))); 167cb93a386Sopenharmony_ci row1 = vreinterpretq_s16_u16(vshlq_u16(vreinterpretq_u16_s16(row1), 168cb93a386Sopenharmony_ci vnegq_s16(shift1))); 169cb93a386Sopenharmony_ci row2 = vreinterpretq_s16_u16(vshlq_u16(vreinterpretq_u16_s16(row2), 170cb93a386Sopenharmony_ci vnegq_s16(shift2))); 171cb93a386Sopenharmony_ci row3 = vreinterpretq_s16_u16(vshlq_u16(vreinterpretq_u16_s16(row3), 172cb93a386Sopenharmony_ci vnegq_s16(shift3))); 173cb93a386Sopenharmony_ci 174cb93a386Sopenharmony_ci /* Restore sign to original product. */ 175cb93a386Sopenharmony_ci row0 = veorq_s16(row0, sign_row0); 176cb93a386Sopenharmony_ci row0 = vsubq_s16(row0, sign_row0); 177cb93a386Sopenharmony_ci row1 = veorq_s16(row1, sign_row1); 178cb93a386Sopenharmony_ci row1 = vsubq_s16(row1, sign_row1); 179cb93a386Sopenharmony_ci row2 = veorq_s16(row2, sign_row2); 180cb93a386Sopenharmony_ci row2 = vsubq_s16(row2, sign_row2); 181cb93a386Sopenharmony_ci row3 = veorq_s16(row3, sign_row3); 182cb93a386Sopenharmony_ci row3 = vsubq_s16(row3, sign_row3); 183cb93a386Sopenharmony_ci 184cb93a386Sopenharmony_ci /* Store quantized coefficients to memory. */ 185cb93a386Sopenharmony_ci vst1q_s16(out_ptr + (i + 0) * DCTSIZE, row0); 186cb93a386Sopenharmony_ci vst1q_s16(out_ptr + (i + 1) * DCTSIZE, row1); 187cb93a386Sopenharmony_ci vst1q_s16(out_ptr + (i + 2) * DCTSIZE, row2); 188cb93a386Sopenharmony_ci vst1q_s16(out_ptr + (i + 3) * DCTSIZE, row3); 189cb93a386Sopenharmony_ci } 190cb93a386Sopenharmony_ci} 191