/*
 * Loongson LASX optimized h264dsp
 *
 * Copyright (c) 2021 Loongson Technology Corporation Limited
 * Contributed by Shiyou Yin <yinshiyou-hf@loongson.cn>
 *                Xiwei  Gu  <guxiwei-hf@loongson.cn>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */
24cabdff1aSopenharmony_ci
25cabdff1aSopenharmony_ci#include "libavutil/loongarch/loongson_intrinsics.h"
26cabdff1aSopenharmony_ci#include "h264dsp_lasx.h"
27cabdff1aSopenharmony_ci#include "libavcodec/bit_depth_template.c"
28cabdff1aSopenharmony_ci
/*
 * One 4-point inverse-transform stage on vectors of 16-bit elements.
 *
 * in0..in3 are the input vectors, out0..out3 receive the results.
 * The macro forms
 *     in0 + in2,  in0 - in2,  (in1 >> 1) - in3,  in1 + (in3 >> 1)
 * and hands those four terms to LASX_BUTTERFLY_4_H, which writes
 * out0..out3.
 *
 * The body is wrapped in do { } while (0) so the expansion is a single
 * statement and remains safe inside an unbraced if/else; invoke it with a
 * trailing semicolon.
 */
#define AVC_ITRANS_H(in0, in1, in2, in3, out0, out1, out2, out3)     \
do {                                                                 \
    __m256i tmp0_m, tmp1_m, tmp2_m, tmp3_m;                          \
                                                                     \
    tmp0_m = __lasx_xvadd_h(in0, in2);                               \
    tmp1_m = __lasx_xvsub_h(in0, in2);                               \
    tmp2_m = __lasx_xvsrai_h(in1, 1);                                \
    tmp2_m = __lasx_xvsub_h(tmp2_m, in3);                            \
    tmp3_m = __lasx_xvsrai_h(in3, 1);                                \
    tmp3_m = __lasx_xvadd_h(in1, tmp3_m);                            \
                                                                     \
    LASX_BUTTERFLY_4_H(tmp0_m, tmp1_m, tmp2_m, tmp3_m,               \
                       out0, out1, out2, out3);                      \
} while (0)
43cabdff1aSopenharmony_ci
44cabdff1aSopenharmony_civoid ff_h264_idct_add_lasx(uint8_t *dst, int16_t *src, int32_t dst_stride)
45cabdff1aSopenharmony_ci{
46cabdff1aSopenharmony_ci    __m256i src0_m, src1_m, src2_m, src3_m;
47cabdff1aSopenharmony_ci    __m256i dst0_m, dst1_m;
48cabdff1aSopenharmony_ci    __m256i hres0, hres1, hres2, hres3, vres0, vres1, vres2, vres3;
49cabdff1aSopenharmony_ci    __m256i inp0_m, inp1_m, res0_m, src1, src3;
50cabdff1aSopenharmony_ci    __m256i src0 = __lasx_xvld(src, 0);
51cabdff1aSopenharmony_ci    __m256i src2 = __lasx_xvld(src, 16);
52cabdff1aSopenharmony_ci    __m256i zero = __lasx_xvldi(0);
53cabdff1aSopenharmony_ci    int32_t dst_stride_2x = dst_stride << 1;
54cabdff1aSopenharmony_ci    int32_t dst_stride_3x = dst_stride_2x + dst_stride;
55cabdff1aSopenharmony_ci
56cabdff1aSopenharmony_ci    __lasx_xvst(zero, src, 0);
57cabdff1aSopenharmony_ci    DUP2_ARG2(__lasx_xvilvh_d, src0, src0, src2, src2, src1, src3);
58cabdff1aSopenharmony_ci    AVC_ITRANS_H(src0, src1, src2, src3, hres0, hres1, hres2, hres3);
59cabdff1aSopenharmony_ci    LASX_TRANSPOSE4x4_H(hres0, hres1, hres2, hres3, hres0, hres1, hres2, hres3);
60cabdff1aSopenharmony_ci    AVC_ITRANS_H(hres0, hres1, hres2, hres3, vres0, vres1, vres2, vres3);
61cabdff1aSopenharmony_ci    DUP4_ARG2(__lasx_xvldx, dst, 0, dst, dst_stride, dst, dst_stride_2x,
62cabdff1aSopenharmony_ci              dst, dst_stride_3x, src0_m, src1_m, src2_m, src3_m);
63cabdff1aSopenharmony_ci    DUP4_ARG2(__lasx_xvld, dst, 0, dst + dst_stride, 0, dst + dst_stride_2x,
64cabdff1aSopenharmony_ci              0, dst + dst_stride_3x, 0, src0_m, src1_m, src2_m, src3_m);
65cabdff1aSopenharmony_ci    DUP2_ARG2(__lasx_xvilvl_d, vres1, vres0, vres3, vres2, inp0_m, inp1_m);
66cabdff1aSopenharmony_ci    inp0_m = __lasx_xvpermi_q(inp1_m, inp0_m, 0x20);
67cabdff1aSopenharmony_ci    inp0_m = __lasx_xvsrari_h(inp0_m, 6);
68cabdff1aSopenharmony_ci    DUP2_ARG2(__lasx_xvilvl_w, src1_m, src0_m, src3_m, src2_m, dst0_m, dst1_m);
69cabdff1aSopenharmony_ci    dst0_m = __lasx_xvilvl_d(dst1_m, dst0_m);
70cabdff1aSopenharmony_ci    res0_m = __lasx_vext2xv_hu_bu(dst0_m);
71cabdff1aSopenharmony_ci    res0_m = __lasx_xvadd_h(res0_m, inp0_m);
72cabdff1aSopenharmony_ci    res0_m = __lasx_xvclip255_h(res0_m);
73cabdff1aSopenharmony_ci    dst0_m = __lasx_xvpickev_b(res0_m, res0_m);
74cabdff1aSopenharmony_ci    __lasx_xvstelm_w(dst0_m, dst, 0, 0);
75cabdff1aSopenharmony_ci    __lasx_xvstelm_w(dst0_m, dst + dst_stride, 0, 1);
76cabdff1aSopenharmony_ci    __lasx_xvstelm_w(dst0_m, dst + dst_stride_2x, 0, 4);
77cabdff1aSopenharmony_ci    __lasx_xvstelm_w(dst0_m, dst + dst_stride_3x, 0, 5);
78cabdff1aSopenharmony_ci}
79cabdff1aSopenharmony_ci
/**
 * Inverse-transform one 8x8 block of coefficients and add it to dst.
 *
 * @param dst        top-left destination byte; eight rows of eight bytes
 *                   are rewritten with the clipped sum
 * @param src        coefficient block; src[0] is biased by 32 before the
 *                   transform and 128 bytes starting at src are set to zero
 *                   once the coefficients have been loaded
 * @param dst_stride distance in bytes between consecutive rows of dst
 */
void ff_h264_idct8_addblk_lasx(uint8_t *dst, int16_t *src,
                               int32_t dst_stride)
{
    __m256i src0, src1, src2, src3, src4, src5, src6, src7;
    __m256i vec0, vec1, vec2, vec3;
    __m256i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
    __m256i res0, res1, res2, res3, res4, res5, res6, res7;
    __m256i dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
    __m256i zero = __lasx_xvldi(0);
    int32_t dst_stride_2x = dst_stride << 1;
    int32_t dst_stride_4x = dst_stride << 2;
    int32_t dst_stride_3x = dst_stride_2x + dst_stride;

    /* Rounding term for the final plain ">> 6", folded into the first
     * coefficient. */
    src[0] += 32;
    /* Eight 32-byte loads at 16-byte steps.
     * NOTE(review): the last load (offset 112) extends 16 bytes beyond the
     * 128 bytes cleared below - confirm the caller's buffer is padded. */
    DUP4_ARG2(__lasx_xvld, src, 0, src, 16, src, 32, src, 48,
              src0, src1, src2, src3);
    DUP4_ARG2(__lasx_xvld, src, 64, src, 80, src, 96, src, 112,
              src4, src5, src6, src7);
    /* The coefficients now live in registers; clear the block in memory. */
    __lasx_xvst(zero, src, 0);
    __lasx_xvst(zero, src, 32);
    __lasx_xvst(zero, src, 64);
    __lasx_xvst(zero, src, 96);

    /* ---- first pass, 16-bit arithmetic ---- */

    /* Terms built from the even-numbered inputs (0, 2, 4, 6). */
    vec0 = __lasx_xvadd_h(src0, src4);
    vec1 = __lasx_xvsub_h(src0, src4);
    vec2 = __lasx_xvsrai_h(src2, 1);
    vec2 = __lasx_xvsub_h(vec2, src6);
    vec3 = __lasx_xvsrai_h(src6, 1);
    vec3 = __lasx_xvadd_h(src2, vec3);

    LASX_BUTTERFLY_4_H(vec0, vec1, vec2, vec3, tmp0, tmp1, tmp2, tmp3);

    /* Terms built from the odd-numbered inputs (1, 3, 5, 7). */

    /* vec0 = src5 - (src7 >> 1) - src3 - src7 */
    vec0 = __lasx_xvsrai_h(src7, 1);
    vec0 = __lasx_xvsub_h(src5, vec0);
    vec0 = __lasx_xvsub_h(vec0, src3);
    vec0 = __lasx_xvsub_h(vec0, src7);

    /* vec1 = src1 - (src3 >> 1) + src7 - src3 */
    vec1 = __lasx_xvsrai_h(src3, 1);
    vec1 = __lasx_xvsub_h(src1, vec1);
    vec1 = __lasx_xvadd_h(vec1, src7);
    vec1 = __lasx_xvsub_h(vec1, src3);

    /* vec2 = (src5 >> 1) - src1 + src7 + src5 */
    vec2 = __lasx_xvsrai_h(src5, 1);
    vec2 = __lasx_xvsub_h(vec2, src1);
    vec2 = __lasx_xvadd_h(vec2, src7);
    vec2 = __lasx_xvadd_h(vec2, src5);

    /* vec3 = src3 + (src1 >> 1) + src5 + src1 */
    vec3 = __lasx_xvsrai_h(src1, 1);
    vec3 = __lasx_xvadd_h(src3, vec3);
    vec3 = __lasx_xvadd_h(vec3, src5);
    vec3 = __lasx_xvadd_h(vec3, src1);

    tmp4 = __lasx_xvsrai_h(vec3, 2);
    tmp4 = __lasx_xvadd_h(tmp4, vec0);
    tmp5 = __lasx_xvsrai_h(vec2, 2);
    tmp5 = __lasx_xvadd_h(tmp5, vec1);
    tmp6 = __lasx_xvsrai_h(vec1, 2);
    tmp6 = __lasx_xvsub_h(tmp6, vec2);
    tmp7 = __lasx_xvsrai_h(vec0, 2);
    tmp7 = __lasx_xvsub_h(vec3, tmp7);

    LASX_BUTTERFLY_8_H(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7,
                       res0, res1, res2, res3, res4, res5, res6, res7);
    LASX_TRANSPOSE8x8_H(res0, res1, res2, res3, res4, res5, res6, res7,
                        res0, res1, res2, res3, res4, res5, res6, res7);

    /* ---- second pass, repeated on 32-bit elements ---- */

    /* Widen the 16-bit intermediate values to 32 bit. */
    DUP4_ARG1(__lasx_vext2xv_w_h, res0, res1, res2, res3,
              tmp0, tmp1, tmp2, tmp3);
    DUP4_ARG1(__lasx_vext2xv_w_h, res4, res5, res6, res7,
              tmp4, tmp5, tmp6, tmp7);
    /* Even-numbered inputs; results go back into tmp0/tmp2/tmp4/tmp6. */
    vec0 = __lasx_xvadd_w(tmp0, tmp4);
    vec1 = __lasx_xvsub_w(tmp0, tmp4);

    vec2 = __lasx_xvsrai_w(tmp2, 1);
    vec2 = __lasx_xvsub_w(vec2, tmp6);
    vec3 = __lasx_xvsrai_w(tmp6, 1);
    vec3 = __lasx_xvadd_w(vec3, tmp2);

    tmp0 = __lasx_xvadd_w(vec0, vec3);
    tmp2 = __lasx_xvadd_w(vec1, vec2);
    tmp4 = __lasx_xvsub_w(vec1, vec2);
    tmp6 = __lasx_xvsub_w(vec0, vec3);

    /* Odd-numbered inputs, same formulas as in the first pass. */
    vec0 = __lasx_xvsrai_w(tmp7, 1);
    vec0 = __lasx_xvsub_w(tmp5, vec0);
    vec0 = __lasx_xvsub_w(vec0, tmp3);
    vec0 = __lasx_xvsub_w(vec0, tmp7);

    vec1 = __lasx_xvsrai_w(tmp3, 1);
    vec1 = __lasx_xvsub_w(tmp1, vec1);
    vec1 = __lasx_xvadd_w(vec1, tmp7);
    vec1 = __lasx_xvsub_w(vec1, tmp3);

    vec2 = __lasx_xvsrai_w(tmp5, 1);
    vec2 = __lasx_xvsub_w(vec2, tmp1);
    vec2 = __lasx_xvadd_w(vec2, tmp7);
    vec2 = __lasx_xvadd_w(vec2, tmp5);

    vec3 = __lasx_xvsrai_w(tmp1, 1);
    vec3 = __lasx_xvadd_w(tmp3, vec3);
    vec3 = __lasx_xvadd_w(vec3, tmp5);
    vec3 = __lasx_xvadd_w(vec3, tmp1);

    /* Results go back into tmp1/tmp3/tmp5/tmp7. */
    tmp1 = __lasx_xvsrai_w(vec3, 2);
    tmp1 = __lasx_xvadd_w(tmp1, vec0);
    tmp3 = __lasx_xvsrai_w(vec2, 2);
    tmp3 = __lasx_xvadd_w(tmp3, vec1);
    tmp5 = __lasx_xvsrai_w(vec1, 2);
    tmp5 = __lasx_xvsub_w(tmp5, vec2);
    tmp7 = __lasx_xvsrai_w(vec0, 2);
    tmp7 = __lasx_xvsub_w(vec3, tmp7);

    LASX_BUTTERFLY_4_W(tmp0, tmp2, tmp5, tmp7, res0, res1, res6, res7);
    LASX_BUTTERFLY_4_W(tmp4, tmp6, tmp1, tmp3, res2, res3, res4, res5);

    /* Final scaling; the rounding offset was added to src[0] above. */
    DUP4_ARG2(__lasx_xvsrai_w, res0, 6, res1, 6, res2, 6, res3, 6,
              res0, res1, res2, res3);
    DUP4_ARG2(__lasx_xvsrai_w, res4, 6, res5, 6, res6, 6, res7, 6,
              res4, res5, res6, res7);
    /* Narrow back to 16 bit, combining two result vectors into one. */
    DUP4_ARG2(__lasx_xvpickev_h, res1, res0, res3, res2, res5, res4, res7,
              res6, res0, res1, res2, res3);
    DUP4_ARG2(__lasx_xvpermi_d, res0, 0xd8, res1, 0xd8, res2, 0xd8, res3, 0xd8,
              res0, res1, res2, res3);

    /* Load the eight destination rows (dst is restored afterwards). */
    DUP4_ARG2(__lasx_xvldx, dst, 0, dst, dst_stride, dst, dst_stride_2x,
              dst, dst_stride_3x, dst0, dst1, dst2, dst3);
    dst += dst_stride_4x;
    DUP4_ARG2(__lasx_xvldx, dst, 0, dst, dst_stride, dst, dst_stride_2x,
              dst, dst_stride_3x, dst4, dst5, dst6, dst7);
    dst -= dst_stride_4x;
    /* Widen the destination bytes to 16 bit by interleaving with zero, then
     * pair the rows up to match the layout of res0..res3. */
    DUP4_ARG2(__lasx_xvilvl_b, zero, dst0, zero, dst1, zero, dst2, zero, dst3,
              dst0, dst1, dst2, dst3);
    DUP4_ARG2(__lasx_xvilvl_b, zero, dst4, zero, dst5, zero, dst6, zero, dst7,
              dst4, dst5, dst6, dst7);
    DUP4_ARG3(__lasx_xvpermi_q, dst1, dst0, 0x20, dst3, dst2, 0x20, dst5,
              dst4, 0x20, dst7, dst6, 0x20, dst0, dst1, dst2, dst3);
    /* Add the residual, clip to 0..255 and narrow back to bytes. */
    res0 = __lasx_xvadd_h(res0, dst0);
    res1 = __lasx_xvadd_h(res1, dst1);
    res2 = __lasx_xvadd_h(res2, dst2);
    res3 = __lasx_xvadd_h(res3, dst3);
    DUP4_ARG1(__lasx_xvclip255_h, res0, res1, res2, res3, res0, res1,
              res2, res3);
    DUP2_ARG2(__lasx_xvpickev_b, res1, res0, res3, res2, res0, res1);
    /* One 64-bit store per row; after the pack the rows sit in doubleword
     * elements 0, 2, 1, 3 of each vector. */
    __lasx_xvstelm_d(res0, dst, 0, 0);
    __lasx_xvstelm_d(res0, dst + dst_stride, 0, 2);
    __lasx_xvstelm_d(res0, dst + dst_stride_2x, 0, 1);
    __lasx_xvstelm_d(res0, dst + dst_stride_3x, 0, 3);
    dst += dst_stride_4x;
    __lasx_xvstelm_d(res1, dst, 0, 0);
    __lasx_xvstelm_d(res1, dst + dst_stride, 0, 2);
    __lasx_xvstelm_d(res1, dst + dst_stride_2x, 0, 1);
    __lasx_xvstelm_d(res1, dst + dst_stride_3x, 0, 3);
}
233cabdff1aSopenharmony_ci
234cabdff1aSopenharmony_civoid ff_h264_idct4x4_addblk_dc_lasx(uint8_t *dst, int16_t *src,
235cabdff1aSopenharmony_ci                                    int32_t dst_stride)
236cabdff1aSopenharmony_ci{
237cabdff1aSopenharmony_ci    const int16_t dc = (src[0] + 32) >> 6;
238cabdff1aSopenharmony_ci    int32_t dst_stride_2x = dst_stride << 1;
239cabdff1aSopenharmony_ci    int32_t dst_stride_3x = dst_stride_2x + dst_stride;
240cabdff1aSopenharmony_ci    __m256i pred, out;
241cabdff1aSopenharmony_ci    __m256i src0, src1, src2, src3;
242cabdff1aSopenharmony_ci    __m256i input_dc = __lasx_xvreplgr2vr_h(dc);
243cabdff1aSopenharmony_ci
244cabdff1aSopenharmony_ci    src[0] = 0;
245cabdff1aSopenharmony_ci    DUP4_ARG2(__lasx_xvldx, dst, 0, dst, dst_stride, dst, dst_stride_2x,
246cabdff1aSopenharmony_ci              dst, dst_stride_3x, src0, src1, src2, src3);
247cabdff1aSopenharmony_ci    DUP2_ARG2(__lasx_xvilvl_w, src1, src0, src3, src2, src0, src1);
248cabdff1aSopenharmony_ci
249cabdff1aSopenharmony_ci    pred = __lasx_xvpermi_q(src0, src1, 0x02);
250cabdff1aSopenharmony_ci    pred = __lasx_xvaddw_h_h_bu(input_dc, pred);
251cabdff1aSopenharmony_ci    pred = __lasx_xvclip255_h(pred);
252cabdff1aSopenharmony_ci    out = __lasx_xvpickev_b(pred, pred);
253cabdff1aSopenharmony_ci    __lasx_xvstelm_w(out, dst, 0, 0);
254cabdff1aSopenharmony_ci    __lasx_xvstelm_w(out, dst + dst_stride, 0, 1);
255cabdff1aSopenharmony_ci    __lasx_xvstelm_w(out, dst + dst_stride_2x, 0, 4);
256cabdff1aSopenharmony_ci    __lasx_xvstelm_w(out, dst + dst_stride_3x, 0, 5);
257cabdff1aSopenharmony_ci}
258cabdff1aSopenharmony_ci
259cabdff1aSopenharmony_civoid ff_h264_idct8_dc_addblk_lasx(uint8_t *dst, int16_t *src,
260cabdff1aSopenharmony_ci                                  int32_t dst_stride)
261cabdff1aSopenharmony_ci{
262cabdff1aSopenharmony_ci    int32_t dc_val;
263cabdff1aSopenharmony_ci    int32_t dst_stride_2x = dst_stride << 1;
264cabdff1aSopenharmony_ci    int32_t dst_stride_4x = dst_stride << 2;
265cabdff1aSopenharmony_ci    int32_t dst_stride_3x = dst_stride_2x + dst_stride;
266cabdff1aSopenharmony_ci    __m256i dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
267cabdff1aSopenharmony_ci    __m256i dc;
268cabdff1aSopenharmony_ci
269cabdff1aSopenharmony_ci    dc_val = (src[0] + 32) >> 6;
270cabdff1aSopenharmony_ci    dc = __lasx_xvreplgr2vr_h(dc_val);
271cabdff1aSopenharmony_ci
272cabdff1aSopenharmony_ci    src[0] = 0;
273cabdff1aSopenharmony_ci
274cabdff1aSopenharmony_ci    DUP4_ARG2(__lasx_xvldx, dst, 0, dst, dst_stride, dst, dst_stride_2x,
275cabdff1aSopenharmony_ci              dst, dst_stride_3x, dst0, dst1, dst2, dst3);
276cabdff1aSopenharmony_ci    dst += dst_stride_4x;
277cabdff1aSopenharmony_ci    DUP4_ARG2(__lasx_xvldx, dst, 0, dst, dst_stride, dst, dst_stride_2x,
278cabdff1aSopenharmony_ci              dst, dst_stride_3x, dst4, dst5, dst6, dst7);
279cabdff1aSopenharmony_ci    dst -= dst_stride_4x;
280cabdff1aSopenharmony_ci    DUP4_ARG1(__lasx_vext2xv_hu_bu, dst0, dst1, dst2, dst3,
281cabdff1aSopenharmony_ci              dst0, dst1, dst2, dst3);
282cabdff1aSopenharmony_ci    DUP4_ARG1(__lasx_vext2xv_hu_bu, dst4, dst5, dst6, dst7,
283cabdff1aSopenharmony_ci              dst4, dst5, dst6, dst7);
284cabdff1aSopenharmony_ci    DUP4_ARG3(__lasx_xvpermi_q, dst1, dst0, 0x20, dst3, dst2, 0x20, dst5,
285cabdff1aSopenharmony_ci              dst4, 0x20, dst7, dst6, 0x20, dst0, dst1, dst2, dst3);
286cabdff1aSopenharmony_ci    dst0 = __lasx_xvadd_h(dst0, dc);
287cabdff1aSopenharmony_ci    dst1 = __lasx_xvadd_h(dst1, dc);
288cabdff1aSopenharmony_ci    dst2 = __lasx_xvadd_h(dst2, dc);
289cabdff1aSopenharmony_ci    dst3 = __lasx_xvadd_h(dst3, dc);
290cabdff1aSopenharmony_ci    DUP4_ARG1(__lasx_xvclip255_h, dst0, dst1, dst2, dst3,
291cabdff1aSopenharmony_ci              dst0, dst1, dst2, dst3);
292cabdff1aSopenharmony_ci    DUP2_ARG2(__lasx_xvpickev_b, dst1, dst0, dst3, dst2, dst0, dst1);
293cabdff1aSopenharmony_ci    __lasx_xvstelm_d(dst0, dst, 0, 0);
294cabdff1aSopenharmony_ci    __lasx_xvstelm_d(dst0, dst + dst_stride, 0, 2);
295cabdff1aSopenharmony_ci    __lasx_xvstelm_d(dst0, dst + dst_stride_2x, 0, 1);
296cabdff1aSopenharmony_ci    __lasx_xvstelm_d(dst0, dst + dst_stride_3x, 0, 3);
297cabdff1aSopenharmony_ci    dst += dst_stride_4x;
298cabdff1aSopenharmony_ci    __lasx_xvstelm_d(dst1, dst, 0, 0);
299cabdff1aSopenharmony_ci    __lasx_xvstelm_d(dst1, dst + dst_stride, 0, 2);
300cabdff1aSopenharmony_ci    __lasx_xvstelm_d(dst1, dst + dst_stride_2x, 0, 1);
301cabdff1aSopenharmony_ci    __lasx_xvstelm_d(dst1, dst + dst_stride_3x, 0, 3);
302cabdff1aSopenharmony_ci}
303cabdff1aSopenharmony_ci
304cabdff1aSopenharmony_civoid ff_h264_idct_add16_lasx(uint8_t *dst,
305cabdff1aSopenharmony_ci                             const int32_t *blk_offset,
306cabdff1aSopenharmony_ci                             int16_t *block, int32_t dst_stride,
307cabdff1aSopenharmony_ci                             const uint8_t nzc[15 * 8])
308cabdff1aSopenharmony_ci{
309cabdff1aSopenharmony_ci    int32_t i;
310cabdff1aSopenharmony_ci
311cabdff1aSopenharmony_ci    for (i = 0; i < 16; i++) {
312cabdff1aSopenharmony_ci        int32_t nnz = nzc[scan8[i]];
313cabdff1aSopenharmony_ci
314cabdff1aSopenharmony_ci        if (nnz) {
315cabdff1aSopenharmony_ci            if (nnz == 1 && ((dctcoef *) block)[i * 16])
316cabdff1aSopenharmony_ci                ff_h264_idct4x4_addblk_dc_lasx(dst + blk_offset[i],
317cabdff1aSopenharmony_ci                                               block + i * 16 * sizeof(pixel),
318cabdff1aSopenharmony_ci                                               dst_stride);
319cabdff1aSopenharmony_ci            else
320cabdff1aSopenharmony_ci                ff_h264_idct_add_lasx(dst + blk_offset[i],
321cabdff1aSopenharmony_ci                                      block + i * 16 * sizeof(pixel),
322cabdff1aSopenharmony_ci                                      dst_stride);
323cabdff1aSopenharmony_ci        }
324cabdff1aSopenharmony_ci    }
325cabdff1aSopenharmony_ci}
326cabdff1aSopenharmony_ci
327cabdff1aSopenharmony_civoid ff_h264_idct8_add4_lasx(uint8_t *dst, const int32_t *blk_offset,
328cabdff1aSopenharmony_ci                             int16_t *block, int32_t dst_stride,
329cabdff1aSopenharmony_ci                             const uint8_t nzc[15 * 8])
330cabdff1aSopenharmony_ci{
331cabdff1aSopenharmony_ci    int32_t cnt;
332cabdff1aSopenharmony_ci
333cabdff1aSopenharmony_ci    for (cnt = 0; cnt < 16; cnt += 4) {
334cabdff1aSopenharmony_ci        int32_t nnz = nzc[scan8[cnt]];
335cabdff1aSopenharmony_ci
336cabdff1aSopenharmony_ci        if (nnz) {
337cabdff1aSopenharmony_ci            if (nnz == 1 && ((dctcoef *) block)[cnt * 16])
338cabdff1aSopenharmony_ci                ff_h264_idct8_dc_addblk_lasx(dst + blk_offset[cnt],
339cabdff1aSopenharmony_ci                                             block + cnt * 16 * sizeof(pixel),
340cabdff1aSopenharmony_ci                                             dst_stride);
341cabdff1aSopenharmony_ci            else
342cabdff1aSopenharmony_ci                ff_h264_idct8_addblk_lasx(dst + blk_offset[cnt],
343cabdff1aSopenharmony_ci                                          block + cnt * 16 * sizeof(pixel),
344cabdff1aSopenharmony_ci                                          dst_stride);
345cabdff1aSopenharmony_ci        }
346cabdff1aSopenharmony_ci    }
347cabdff1aSopenharmony_ci}
348cabdff1aSopenharmony_ci
349cabdff1aSopenharmony_ci
350cabdff1aSopenharmony_civoid ff_h264_idct_add8_lasx(uint8_t **dst,
351cabdff1aSopenharmony_ci                            const int32_t *blk_offset,
352cabdff1aSopenharmony_ci                            int16_t *block, int32_t dst_stride,
353cabdff1aSopenharmony_ci                            const uint8_t nzc[15 * 8])
354cabdff1aSopenharmony_ci{
355cabdff1aSopenharmony_ci    int32_t i;
356cabdff1aSopenharmony_ci
357cabdff1aSopenharmony_ci    for (i = 16; i < 20; i++) {
358cabdff1aSopenharmony_ci        if (nzc[scan8[i]])
359cabdff1aSopenharmony_ci            ff_h264_idct_add_lasx(dst[0] + blk_offset[i],
360cabdff1aSopenharmony_ci                                  block + i * 16 * sizeof(pixel),
361cabdff1aSopenharmony_ci                                  dst_stride);
362cabdff1aSopenharmony_ci        else if (((dctcoef *) block)[i * 16])
363cabdff1aSopenharmony_ci            ff_h264_idct4x4_addblk_dc_lasx(dst[0] + blk_offset[i],
364cabdff1aSopenharmony_ci                                           block + i * 16 * sizeof(pixel),
365cabdff1aSopenharmony_ci                                           dst_stride);
366cabdff1aSopenharmony_ci    }
367cabdff1aSopenharmony_ci    for (i = 32; i < 36; i++) {
368cabdff1aSopenharmony_ci        if (nzc[scan8[i]])
369cabdff1aSopenharmony_ci            ff_h264_idct_add_lasx(dst[1] + blk_offset[i],
370cabdff1aSopenharmony_ci                                  block + i * 16 * sizeof(pixel),
371cabdff1aSopenharmony_ci                                  dst_stride);
372cabdff1aSopenharmony_ci        else if (((dctcoef *) block)[i * 16])
373cabdff1aSopenharmony_ci            ff_h264_idct4x4_addblk_dc_lasx(dst[1] + blk_offset[i],
374cabdff1aSopenharmony_ci                                           block + i * 16 * sizeof(pixel),
375cabdff1aSopenharmony_ci                                           dst_stride);
376cabdff1aSopenharmony_ci    }
377cabdff1aSopenharmony_ci}
378cabdff1aSopenharmony_ci
379cabdff1aSopenharmony_civoid ff_h264_idct_add8_422_lasx(uint8_t **dst,
380cabdff1aSopenharmony_ci                                const int32_t *blk_offset,
381cabdff1aSopenharmony_ci                                int16_t *block, int32_t dst_stride,
382cabdff1aSopenharmony_ci                                const uint8_t nzc[15 * 8])
383cabdff1aSopenharmony_ci{
384cabdff1aSopenharmony_ci    int32_t i;
385cabdff1aSopenharmony_ci
386cabdff1aSopenharmony_ci    for (i = 16; i < 20; i++) {
387cabdff1aSopenharmony_ci        if (nzc[scan8[i]])
388cabdff1aSopenharmony_ci            ff_h264_idct_add_lasx(dst[0] + blk_offset[i],
389cabdff1aSopenharmony_ci                                  block + i * 16 * sizeof(pixel),
390cabdff1aSopenharmony_ci                                  dst_stride);
391cabdff1aSopenharmony_ci        else if (((dctcoef *) block)[i * 16])
392cabdff1aSopenharmony_ci            ff_h264_idct4x4_addblk_dc_lasx(dst[0] + blk_offset[i],
393cabdff1aSopenharmony_ci                                           block + i * 16 * sizeof(pixel),
394cabdff1aSopenharmony_ci                                           dst_stride);
395cabdff1aSopenharmony_ci    }
396cabdff1aSopenharmony_ci    for (i = 32; i < 36; i++) {
397cabdff1aSopenharmony_ci        if (nzc[scan8[i]])
398cabdff1aSopenharmony_ci            ff_h264_idct_add_lasx(dst[1] + blk_offset[i],
399cabdff1aSopenharmony_ci                                  block + i * 16 * sizeof(pixel),
400cabdff1aSopenharmony_ci                                  dst_stride);
401cabdff1aSopenharmony_ci        else if (((dctcoef *) block)[i * 16])
402cabdff1aSopenharmony_ci            ff_h264_idct4x4_addblk_dc_lasx(dst[1] + blk_offset[i],
403cabdff1aSopenharmony_ci                                           block + i * 16 * sizeof(pixel),
404cabdff1aSopenharmony_ci                                           dst_stride);
405cabdff1aSopenharmony_ci    }
406cabdff1aSopenharmony_ci    for (i = 20; i < 24; i++) {
407cabdff1aSopenharmony_ci        if (nzc[scan8[i + 4]])
408cabdff1aSopenharmony_ci            ff_h264_idct_add_lasx(dst[0] + blk_offset[i + 4],
409cabdff1aSopenharmony_ci                                  block + i * 16 * sizeof(pixel),
410cabdff1aSopenharmony_ci                                  dst_stride);
411cabdff1aSopenharmony_ci        else if (((dctcoef *) block)[i * 16])
412cabdff1aSopenharmony_ci            ff_h264_idct4x4_addblk_dc_lasx(dst[0] + blk_offset[i + 4],
413cabdff1aSopenharmony_ci                                           block + i * 16 * sizeof(pixel),
414cabdff1aSopenharmony_ci                                           dst_stride);
415cabdff1aSopenharmony_ci    }
416cabdff1aSopenharmony_ci    for (i = 36; i < 40; i++) {
417cabdff1aSopenharmony_ci        if (nzc[scan8[i + 4]])
418cabdff1aSopenharmony_ci            ff_h264_idct_add_lasx(dst[1] + blk_offset[i + 4],
419cabdff1aSopenharmony_ci                                  block + i * 16 * sizeof(pixel),
420cabdff1aSopenharmony_ci                                  dst_stride);
421cabdff1aSopenharmony_ci        else if (((dctcoef *) block)[i * 16])
422cabdff1aSopenharmony_ci            ff_h264_idct4x4_addblk_dc_lasx(dst[1] + blk_offset[i + 4],
423cabdff1aSopenharmony_ci                                           block + i * 16 * sizeof(pixel),
424cabdff1aSopenharmony_ci                                           dst_stride);
425cabdff1aSopenharmony_ci    }
426cabdff1aSopenharmony_ci}
427cabdff1aSopenharmony_ci
428cabdff1aSopenharmony_civoid ff_h264_idct_add16_intra_lasx(uint8_t *dst,
429cabdff1aSopenharmony_ci                                   const int32_t *blk_offset,
430cabdff1aSopenharmony_ci                                   int16_t *block,
431cabdff1aSopenharmony_ci                                   int32_t dst_stride,
432cabdff1aSopenharmony_ci                                   const uint8_t nzc[15 * 8])
433cabdff1aSopenharmony_ci{
434cabdff1aSopenharmony_ci    int32_t i;
435cabdff1aSopenharmony_ci
436cabdff1aSopenharmony_ci    for (i = 0; i < 16; i++) {
437cabdff1aSopenharmony_ci        if (nzc[scan8[i]])
438cabdff1aSopenharmony_ci            ff_h264_idct_add_lasx(dst + blk_offset[i],
439cabdff1aSopenharmony_ci                                  block + i * 16 * sizeof(pixel), dst_stride);
440cabdff1aSopenharmony_ci        else if (((dctcoef *) block)[i * 16])
441cabdff1aSopenharmony_ci            ff_h264_idct4x4_addblk_dc_lasx(dst + blk_offset[i],
442cabdff1aSopenharmony_ci                                           block + i * 16 * sizeof(pixel),
443cabdff1aSopenharmony_ci                                           dst_stride);
444cabdff1aSopenharmony_ci    }
445cabdff1aSopenharmony_ci}
446cabdff1aSopenharmony_ci
447cabdff1aSopenharmony_civoid ff_h264_deq_idct_luma_dc_lasx(int16_t *dst, int16_t *src,
448cabdff1aSopenharmony_ci                                   int32_t de_qval)
449cabdff1aSopenharmony_ci{
450cabdff1aSopenharmony_ci#define DC_DEST_STRIDE 16
451cabdff1aSopenharmony_ci
452cabdff1aSopenharmony_ci    __m256i src0, src1, src2, src3;
453cabdff1aSopenharmony_ci    __m256i vec0, vec1, vec2, vec3;
454cabdff1aSopenharmony_ci    __m256i tmp0, tmp1, tmp2, tmp3;
455cabdff1aSopenharmony_ci    __m256i hres0, hres1, hres2, hres3;
456cabdff1aSopenharmony_ci    __m256i vres0, vres1, vres2, vres3;
457cabdff1aSopenharmony_ci    __m256i de_q_vec = __lasx_xvreplgr2vr_w(de_qval);
458cabdff1aSopenharmony_ci
459cabdff1aSopenharmony_ci    DUP4_ARG2(__lasx_xvld, src, 0, src, 8, src, 16, src, 24,
460cabdff1aSopenharmony_ci              src0, src1, src2, src3);
461cabdff1aSopenharmony_ci    LASX_TRANSPOSE4x4_H(src0, src1, src2, src3, tmp0, tmp1, tmp2, tmp3);
462cabdff1aSopenharmony_ci    LASX_BUTTERFLY_4_H(tmp0, tmp2, tmp3, tmp1, vec0, vec3, vec2, vec1);
463cabdff1aSopenharmony_ci    LASX_BUTTERFLY_4_H(vec0, vec1, vec2, vec3, hres0, hres3, hres2, hres1);
464cabdff1aSopenharmony_ci    LASX_TRANSPOSE4x4_H(hres0, hres1, hres2, hres3,
465cabdff1aSopenharmony_ci                        hres0, hres1, hres2, hres3);
466cabdff1aSopenharmony_ci    LASX_BUTTERFLY_4_H(hres0, hres1, hres3, hres2, vec0, vec3, vec2, vec1);
467cabdff1aSopenharmony_ci    LASX_BUTTERFLY_4_H(vec0, vec1, vec2, vec3, vres0, vres1, vres2, vres3);
468cabdff1aSopenharmony_ci    DUP4_ARG1(__lasx_vext2xv_w_h, vres0, vres1, vres2, vres3,
469cabdff1aSopenharmony_ci              vres0, vres1, vres2, vres3);
470cabdff1aSopenharmony_ci    DUP2_ARG3(__lasx_xvpermi_q, vres1, vres0, 0x20, vres3, vres2, 0x20,
471cabdff1aSopenharmony_ci              vres0, vres1);
472cabdff1aSopenharmony_ci
473cabdff1aSopenharmony_ci    vres0 = __lasx_xvmul_w(vres0, de_q_vec);
474cabdff1aSopenharmony_ci    vres1 = __lasx_xvmul_w(vres1, de_q_vec);
475cabdff1aSopenharmony_ci
476cabdff1aSopenharmony_ci    vres0 = __lasx_xvsrari_w(vres0, 8);
477cabdff1aSopenharmony_ci    vres1 = __lasx_xvsrari_w(vres1, 8);
478cabdff1aSopenharmony_ci    vec0 = __lasx_xvpickev_h(vres1, vres0);
479cabdff1aSopenharmony_ci    vec0 = __lasx_xvpermi_d(vec0, 0xd8);
480cabdff1aSopenharmony_ci    __lasx_xvstelm_h(vec0, dst + 0  * DC_DEST_STRIDE, 0, 0);
481cabdff1aSopenharmony_ci    __lasx_xvstelm_h(vec0, dst + 2  * DC_DEST_STRIDE, 0, 1);
482cabdff1aSopenharmony_ci    __lasx_xvstelm_h(vec0, dst + 8  * DC_DEST_STRIDE, 0, 2);
483cabdff1aSopenharmony_ci    __lasx_xvstelm_h(vec0, dst + 10 * DC_DEST_STRIDE, 0, 3);
484cabdff1aSopenharmony_ci    __lasx_xvstelm_h(vec0, dst + 1  * DC_DEST_STRIDE, 0, 4);
485cabdff1aSopenharmony_ci    __lasx_xvstelm_h(vec0, dst + 3  * DC_DEST_STRIDE, 0, 5);
486cabdff1aSopenharmony_ci    __lasx_xvstelm_h(vec0, dst + 9  * DC_DEST_STRIDE, 0, 6);
487cabdff1aSopenharmony_ci    __lasx_xvstelm_h(vec0, dst + 11 * DC_DEST_STRIDE, 0, 7);
488cabdff1aSopenharmony_ci    __lasx_xvstelm_h(vec0, dst + 4  * DC_DEST_STRIDE, 0, 8);
489cabdff1aSopenharmony_ci    __lasx_xvstelm_h(vec0, dst + 6  * DC_DEST_STRIDE, 0, 9);
490cabdff1aSopenharmony_ci    __lasx_xvstelm_h(vec0, dst + 12 * DC_DEST_STRIDE, 0, 10);
491cabdff1aSopenharmony_ci    __lasx_xvstelm_h(vec0, dst + 14 * DC_DEST_STRIDE, 0, 11);
492cabdff1aSopenharmony_ci    __lasx_xvstelm_h(vec0, dst + 5  * DC_DEST_STRIDE, 0, 12);
493cabdff1aSopenharmony_ci    __lasx_xvstelm_h(vec0, dst + 7  * DC_DEST_STRIDE, 0, 13);
494cabdff1aSopenharmony_ci    __lasx_xvstelm_h(vec0, dst + 13 * DC_DEST_STRIDE, 0, 14);
495cabdff1aSopenharmony_ci    __lasx_xvstelm_h(vec0, dst + 15 * DC_DEST_STRIDE, 0, 15);
496cabdff1aSopenharmony_ci
497cabdff1aSopenharmony_ci#undef DC_DEST_STRIDE
498cabdff1aSopenharmony_ci}
499