1cb93a386Sopenharmony_ci/*
2cb93a386Sopenharmony_ci * jquanti-neon.c - sample data conversion and quantization (Arm Neon)
3cb93a386Sopenharmony_ci *
4cb93a386Sopenharmony_ci * Copyright (C) 2020, Arm Limited.  All Rights Reserved.
5cb93a386Sopenharmony_ci *
6cb93a386Sopenharmony_ci * This software is provided 'as-is', without any express or implied
7cb93a386Sopenharmony_ci * warranty.  In no event will the authors be held liable for any damages
8cb93a386Sopenharmony_ci * arising from the use of this software.
9cb93a386Sopenharmony_ci *
10cb93a386Sopenharmony_ci * Permission is granted to anyone to use this software for any purpose,
11cb93a386Sopenharmony_ci * including commercial applications, and to alter it and redistribute it
12cb93a386Sopenharmony_ci * freely, subject to the following restrictions:
13cb93a386Sopenharmony_ci *
14cb93a386Sopenharmony_ci * 1. The origin of this software must not be misrepresented; you must not
15cb93a386Sopenharmony_ci *    claim that you wrote the original software. If you use this software
16cb93a386Sopenharmony_ci *    in a product, an acknowledgment in the product documentation would be
17cb93a386Sopenharmony_ci *    appreciated but is not required.
18cb93a386Sopenharmony_ci * 2. Altered source versions must be plainly marked as such, and must not be
19cb93a386Sopenharmony_ci *    misrepresented as being the original software.
20cb93a386Sopenharmony_ci * 3. This notice may not be removed or altered from any source distribution.
21cb93a386Sopenharmony_ci */
22cb93a386Sopenharmony_ci
23cb93a386Sopenharmony_ci#define JPEG_INTERNALS
24cb93a386Sopenharmony_ci#include "../../jinclude.h"
25cb93a386Sopenharmony_ci#include "../../jpeglib.h"
26cb93a386Sopenharmony_ci#include "../../jsimd.h"
27cb93a386Sopenharmony_ci#include "../../jdct.h"
28cb93a386Sopenharmony_ci#include "../../jsimddct.h"
29cb93a386Sopenharmony_ci#include "../jsimd.h"
30cb93a386Sopenharmony_ci
31cb93a386Sopenharmony_ci#include <arm_neon.h>
32cb93a386Sopenharmony_ci
33cb93a386Sopenharmony_ci
34cb93a386Sopenharmony_ci/* After downsampling, the resulting sample values are in the range [0, 255],
35cb93a386Sopenharmony_ci * but the Discrete Cosine Transform (DCT) operates on values centered around
36cb93a386Sopenharmony_ci * 0.
37cb93a386Sopenharmony_ci *
38cb93a386Sopenharmony_ci * To prepare sample values for the DCT, load samples into a DCT workspace,
39cb93a386Sopenharmony_ci * subtracting CENTERJSAMPLE (128).  The samples, now in the range [-128, 127],
40cb93a386Sopenharmony_ci * are also widened from 8- to 16-bit.
41cb93a386Sopenharmony_ci *
42cb93a386Sopenharmony_ci * The equivalent scalar C function convsamp() can be found in jcdctmgr.c.
43cb93a386Sopenharmony_ci */
44cb93a386Sopenharmony_ci
45cb93a386Sopenharmony_civoid jsimd_convsamp_neon(JSAMPARRAY sample_data, JDIMENSION start_col,
46cb93a386Sopenharmony_ci                         DCTELEM *workspace)
47cb93a386Sopenharmony_ci{
48cb93a386Sopenharmony_ci  uint8x8_t samp_row0 = vld1_u8(sample_data[0] + start_col);
49cb93a386Sopenharmony_ci  uint8x8_t samp_row1 = vld1_u8(sample_data[1] + start_col);
50cb93a386Sopenharmony_ci  uint8x8_t samp_row2 = vld1_u8(sample_data[2] + start_col);
51cb93a386Sopenharmony_ci  uint8x8_t samp_row3 = vld1_u8(sample_data[3] + start_col);
52cb93a386Sopenharmony_ci  uint8x8_t samp_row4 = vld1_u8(sample_data[4] + start_col);
53cb93a386Sopenharmony_ci  uint8x8_t samp_row5 = vld1_u8(sample_data[5] + start_col);
54cb93a386Sopenharmony_ci  uint8x8_t samp_row6 = vld1_u8(sample_data[6] + start_col);
55cb93a386Sopenharmony_ci  uint8x8_t samp_row7 = vld1_u8(sample_data[7] + start_col);
56cb93a386Sopenharmony_ci
57cb93a386Sopenharmony_ci  int16x8_t row0 =
58cb93a386Sopenharmony_ci    vreinterpretq_s16_u16(vsubl_u8(samp_row0, vdup_n_u8(CENTERJSAMPLE)));
59cb93a386Sopenharmony_ci  int16x8_t row1 =
60cb93a386Sopenharmony_ci    vreinterpretq_s16_u16(vsubl_u8(samp_row1, vdup_n_u8(CENTERJSAMPLE)));
61cb93a386Sopenharmony_ci  int16x8_t row2 =
62cb93a386Sopenharmony_ci    vreinterpretq_s16_u16(vsubl_u8(samp_row2, vdup_n_u8(CENTERJSAMPLE)));
63cb93a386Sopenharmony_ci  int16x8_t row3 =
64cb93a386Sopenharmony_ci    vreinterpretq_s16_u16(vsubl_u8(samp_row3, vdup_n_u8(CENTERJSAMPLE)));
65cb93a386Sopenharmony_ci  int16x8_t row4 =
66cb93a386Sopenharmony_ci    vreinterpretq_s16_u16(vsubl_u8(samp_row4, vdup_n_u8(CENTERJSAMPLE)));
67cb93a386Sopenharmony_ci  int16x8_t row5 =
68cb93a386Sopenharmony_ci    vreinterpretq_s16_u16(vsubl_u8(samp_row5, vdup_n_u8(CENTERJSAMPLE)));
69cb93a386Sopenharmony_ci  int16x8_t row6 =
70cb93a386Sopenharmony_ci    vreinterpretq_s16_u16(vsubl_u8(samp_row6, vdup_n_u8(CENTERJSAMPLE)));
71cb93a386Sopenharmony_ci  int16x8_t row7 =
72cb93a386Sopenharmony_ci    vreinterpretq_s16_u16(vsubl_u8(samp_row7, vdup_n_u8(CENTERJSAMPLE)));
73cb93a386Sopenharmony_ci
74cb93a386Sopenharmony_ci  vst1q_s16(workspace + 0 * DCTSIZE, row0);
75cb93a386Sopenharmony_ci  vst1q_s16(workspace + 1 * DCTSIZE, row1);
76cb93a386Sopenharmony_ci  vst1q_s16(workspace + 2 * DCTSIZE, row2);
77cb93a386Sopenharmony_ci  vst1q_s16(workspace + 3 * DCTSIZE, row3);
78cb93a386Sopenharmony_ci  vst1q_s16(workspace + 4 * DCTSIZE, row4);
79cb93a386Sopenharmony_ci  vst1q_s16(workspace + 5 * DCTSIZE, row5);
80cb93a386Sopenharmony_ci  vst1q_s16(workspace + 6 * DCTSIZE, row6);
81cb93a386Sopenharmony_ci  vst1q_s16(workspace + 7 * DCTSIZE, row7);
82cb93a386Sopenharmony_ci}
83cb93a386Sopenharmony_ci
84cb93a386Sopenharmony_ci
85cb93a386Sopenharmony_ci/* After the DCT, the resulting array of coefficient values needs to be divided
86cb93a386Sopenharmony_ci * by an array of quantization values.
87cb93a386Sopenharmony_ci *
88cb93a386Sopenharmony_ci * To avoid a slow division operation, the DCT coefficients are multiplied by
89cb93a386Sopenharmony_ci * the (scaled) reciprocals of the quantization values and then right-shifted.
90cb93a386Sopenharmony_ci *
91cb93a386Sopenharmony_ci * The equivalent scalar C function quantize() can be found in jcdctmgr.c.
92cb93a386Sopenharmony_ci */
93cb93a386Sopenharmony_ci
94cb93a386Sopenharmony_civoid jsimd_quantize_neon(JCOEFPTR coef_block, DCTELEM *divisors,
95cb93a386Sopenharmony_ci                         DCTELEM *workspace)
96cb93a386Sopenharmony_ci{
97cb93a386Sopenharmony_ci  JCOEFPTR out_ptr = coef_block;
98cb93a386Sopenharmony_ci  UDCTELEM *recip_ptr = (UDCTELEM *)divisors;
99cb93a386Sopenharmony_ci  UDCTELEM *corr_ptr = (UDCTELEM *)divisors + DCTSIZE2;
100cb93a386Sopenharmony_ci  DCTELEM *shift_ptr = divisors + 3 * DCTSIZE2;
101cb93a386Sopenharmony_ci  int i;
102cb93a386Sopenharmony_ci
103cb93a386Sopenharmony_ci  for (i = 0; i < DCTSIZE; i += DCTSIZE / 2) {
104cb93a386Sopenharmony_ci    /* Load DCT coefficients. */
105cb93a386Sopenharmony_ci    int16x8_t row0 = vld1q_s16(workspace + (i + 0) * DCTSIZE);
106cb93a386Sopenharmony_ci    int16x8_t row1 = vld1q_s16(workspace + (i + 1) * DCTSIZE);
107cb93a386Sopenharmony_ci    int16x8_t row2 = vld1q_s16(workspace + (i + 2) * DCTSIZE);
108cb93a386Sopenharmony_ci    int16x8_t row3 = vld1q_s16(workspace + (i + 3) * DCTSIZE);
109cb93a386Sopenharmony_ci    /* Load reciprocals of quantization values. */
110cb93a386Sopenharmony_ci    uint16x8_t recip0 = vld1q_u16(recip_ptr + (i + 0) * DCTSIZE);
111cb93a386Sopenharmony_ci    uint16x8_t recip1 = vld1q_u16(recip_ptr + (i + 1) * DCTSIZE);
112cb93a386Sopenharmony_ci    uint16x8_t recip2 = vld1q_u16(recip_ptr + (i + 2) * DCTSIZE);
113cb93a386Sopenharmony_ci    uint16x8_t recip3 = vld1q_u16(recip_ptr + (i + 3) * DCTSIZE);
114cb93a386Sopenharmony_ci    uint16x8_t corr0 = vld1q_u16(corr_ptr + (i + 0) * DCTSIZE);
115cb93a386Sopenharmony_ci    uint16x8_t corr1 = vld1q_u16(corr_ptr + (i + 1) * DCTSIZE);
116cb93a386Sopenharmony_ci    uint16x8_t corr2 = vld1q_u16(corr_ptr + (i + 2) * DCTSIZE);
117cb93a386Sopenharmony_ci    uint16x8_t corr3 = vld1q_u16(corr_ptr + (i + 3) * DCTSIZE);
118cb93a386Sopenharmony_ci    int16x8_t shift0 = vld1q_s16(shift_ptr + (i + 0) * DCTSIZE);
119cb93a386Sopenharmony_ci    int16x8_t shift1 = vld1q_s16(shift_ptr + (i + 1) * DCTSIZE);
120cb93a386Sopenharmony_ci    int16x8_t shift2 = vld1q_s16(shift_ptr + (i + 2) * DCTSIZE);
121cb93a386Sopenharmony_ci    int16x8_t shift3 = vld1q_s16(shift_ptr + (i + 3) * DCTSIZE);
122cb93a386Sopenharmony_ci
123cb93a386Sopenharmony_ci    /* Extract sign from coefficients. */
124cb93a386Sopenharmony_ci    int16x8_t sign_row0 = vshrq_n_s16(row0, 15);
125cb93a386Sopenharmony_ci    int16x8_t sign_row1 = vshrq_n_s16(row1, 15);
126cb93a386Sopenharmony_ci    int16x8_t sign_row2 = vshrq_n_s16(row2, 15);
127cb93a386Sopenharmony_ci    int16x8_t sign_row3 = vshrq_n_s16(row3, 15);
128cb93a386Sopenharmony_ci    /* Get absolute value of DCT coefficients. */
129cb93a386Sopenharmony_ci    uint16x8_t abs_row0 = vreinterpretq_u16_s16(vabsq_s16(row0));
130cb93a386Sopenharmony_ci    uint16x8_t abs_row1 = vreinterpretq_u16_s16(vabsq_s16(row1));
131cb93a386Sopenharmony_ci    uint16x8_t abs_row2 = vreinterpretq_u16_s16(vabsq_s16(row2));
132cb93a386Sopenharmony_ci    uint16x8_t abs_row3 = vreinterpretq_u16_s16(vabsq_s16(row3));
133cb93a386Sopenharmony_ci    /* Add correction. */
134cb93a386Sopenharmony_ci    abs_row0 = vaddq_u16(abs_row0, corr0);
135cb93a386Sopenharmony_ci    abs_row1 = vaddq_u16(abs_row1, corr1);
136cb93a386Sopenharmony_ci    abs_row2 = vaddq_u16(abs_row2, corr2);
137cb93a386Sopenharmony_ci    abs_row3 = vaddq_u16(abs_row3, corr3);
138cb93a386Sopenharmony_ci
139cb93a386Sopenharmony_ci    /* Multiply DCT coefficients by quantization reciprocals. */
140cb93a386Sopenharmony_ci    int32x4_t row0_l = vreinterpretq_s32_u32(vmull_u16(vget_low_u16(abs_row0),
141cb93a386Sopenharmony_ci                                                       vget_low_u16(recip0)));
142cb93a386Sopenharmony_ci    int32x4_t row0_h = vreinterpretq_s32_u32(vmull_u16(vget_high_u16(abs_row0),
143cb93a386Sopenharmony_ci                                                       vget_high_u16(recip0)));
144cb93a386Sopenharmony_ci    int32x4_t row1_l = vreinterpretq_s32_u32(vmull_u16(vget_low_u16(abs_row1),
145cb93a386Sopenharmony_ci                                                       vget_low_u16(recip1)));
146cb93a386Sopenharmony_ci    int32x4_t row1_h = vreinterpretq_s32_u32(vmull_u16(vget_high_u16(abs_row1),
147cb93a386Sopenharmony_ci                                                       vget_high_u16(recip1)));
148cb93a386Sopenharmony_ci    int32x4_t row2_l = vreinterpretq_s32_u32(vmull_u16(vget_low_u16(abs_row2),
149cb93a386Sopenharmony_ci                                                       vget_low_u16(recip2)));
150cb93a386Sopenharmony_ci    int32x4_t row2_h = vreinterpretq_s32_u32(vmull_u16(vget_high_u16(abs_row2),
151cb93a386Sopenharmony_ci                                                       vget_high_u16(recip2)));
152cb93a386Sopenharmony_ci    int32x4_t row3_l = vreinterpretq_s32_u32(vmull_u16(vget_low_u16(abs_row3),
153cb93a386Sopenharmony_ci                                                       vget_low_u16(recip3)));
154cb93a386Sopenharmony_ci    int32x4_t row3_h = vreinterpretq_s32_u32(vmull_u16(vget_high_u16(abs_row3),
155cb93a386Sopenharmony_ci                                                       vget_high_u16(recip3)));
156cb93a386Sopenharmony_ci    /* Narrow back to 16-bit. */
157cb93a386Sopenharmony_ci    row0 = vcombine_s16(vshrn_n_s32(row0_l, 16), vshrn_n_s32(row0_h, 16));
158cb93a386Sopenharmony_ci    row1 = vcombine_s16(vshrn_n_s32(row1_l, 16), vshrn_n_s32(row1_h, 16));
159cb93a386Sopenharmony_ci    row2 = vcombine_s16(vshrn_n_s32(row2_l, 16), vshrn_n_s32(row2_h, 16));
160cb93a386Sopenharmony_ci    row3 = vcombine_s16(vshrn_n_s32(row3_l, 16), vshrn_n_s32(row3_h, 16));
161cb93a386Sopenharmony_ci
162cb93a386Sopenharmony_ci    /* Since VSHR only supports an immediate as its second argument, negate the
163cb93a386Sopenharmony_ci     * shift value and shift left.
164cb93a386Sopenharmony_ci     */
165cb93a386Sopenharmony_ci    row0 = vreinterpretq_s16_u16(vshlq_u16(vreinterpretq_u16_s16(row0),
166cb93a386Sopenharmony_ci                                           vnegq_s16(shift0)));
167cb93a386Sopenharmony_ci    row1 = vreinterpretq_s16_u16(vshlq_u16(vreinterpretq_u16_s16(row1),
168cb93a386Sopenharmony_ci                                           vnegq_s16(shift1)));
169cb93a386Sopenharmony_ci    row2 = vreinterpretq_s16_u16(vshlq_u16(vreinterpretq_u16_s16(row2),
170cb93a386Sopenharmony_ci                                           vnegq_s16(shift2)));
171cb93a386Sopenharmony_ci    row3 = vreinterpretq_s16_u16(vshlq_u16(vreinterpretq_u16_s16(row3),
172cb93a386Sopenharmony_ci                                           vnegq_s16(shift3)));
173cb93a386Sopenharmony_ci
174cb93a386Sopenharmony_ci    /* Restore sign to original product. */
175cb93a386Sopenharmony_ci    row0 = veorq_s16(row0, sign_row0);
176cb93a386Sopenharmony_ci    row0 = vsubq_s16(row0, sign_row0);
177cb93a386Sopenharmony_ci    row1 = veorq_s16(row1, sign_row1);
178cb93a386Sopenharmony_ci    row1 = vsubq_s16(row1, sign_row1);
179cb93a386Sopenharmony_ci    row2 = veorq_s16(row2, sign_row2);
180cb93a386Sopenharmony_ci    row2 = vsubq_s16(row2, sign_row2);
181cb93a386Sopenharmony_ci    row3 = veorq_s16(row3, sign_row3);
182cb93a386Sopenharmony_ci    row3 = vsubq_s16(row3, sign_row3);
183cb93a386Sopenharmony_ci
184cb93a386Sopenharmony_ci    /* Store quantized coefficients to memory. */
185cb93a386Sopenharmony_ci    vst1q_s16(out_ptr + (i + 0) * DCTSIZE, row0);
186cb93a386Sopenharmony_ci    vst1q_s16(out_ptr + (i + 1) * DCTSIZE, row1);
187cb93a386Sopenharmony_ci    vst1q_s16(out_ptr + (i + 2) * DCTSIZE, row2);
188cb93a386Sopenharmony_ci    vst1q_s16(out_ptr + (i + 3) * DCTSIZE, row3);
189cb93a386Sopenharmony_ci  }
190cb93a386Sopenharmony_ci}
191