1cabdff1aSopenharmony_ci; /*
2cabdff1aSopenharmony_ci; * Provide SSE luma and chroma mc functions for HEVC decoding
3cabdff1aSopenharmony_ci; * Copyright (c) 2013 Pierre-Edouard LEPERE
4cabdff1aSopenharmony_ci; *
5cabdff1aSopenharmony_ci; * This file is part of FFmpeg.
6cabdff1aSopenharmony_ci; *
7cabdff1aSopenharmony_ci; * FFmpeg is free software; you can redistribute it and/or
8cabdff1aSopenharmony_ci; * modify it under the terms of the GNU Lesser General Public
9cabdff1aSopenharmony_ci; * License as published by the Free Software Foundation; either
10cabdff1aSopenharmony_ci; * version 2.1 of the License, or (at your option) any later version.
11cabdff1aSopenharmony_ci; *
12cabdff1aSopenharmony_ci; * FFmpeg is distributed in the hope that it will be useful,
13cabdff1aSopenharmony_ci; * but WITHOUT ANY WARRANTY; without even the implied warranty of
14cabdff1aSopenharmony_ci; * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
15cabdff1aSopenharmony_ci; * Lesser General Public License for more details.
16cabdff1aSopenharmony_ci; *
17cabdff1aSopenharmony_ci; * You should have received a copy of the GNU Lesser General Public
18cabdff1aSopenharmony_ci; * License along with FFmpeg; if not, write to the Free Software
19cabdff1aSopenharmony_ci; * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
20cabdff1aSopenharmony_ci; */
21cabdff1aSopenharmony_ci%include "libavutil/x86/x86util.asm"
22cabdff1aSopenharmony_ci
SECTION_RODATA 32

; Constants imported from elsewhere in the project. cextern only declares
; them; no data is emitted for these lines.
cextern pw_255
cextern pw_1023
cextern pw_512
cextern pw_1024
cextern pw_2048
cextern pw_4096
cextern pw_8192
cextern pd_1
cextern pb_0

; Aliases whose names end in 8/10/12, each mapped onto one of the imported
; word constants above.
; NOTE(review): presumably the suffix is the bit depth and the names are built
; by token pasting in code outside this excerpt - confirm against the users.
%define pw_8            pw_512
%define pw_10           pw_2048
%define pw_12           pw_8192
%define pw_bi_10        pw_1024
%define pw_bi_12        pw_4096
%define max_pixels_8    pw_255
%define max_pixels_10   pw_1023

; Members of the same families that are defined locally instead of imported.
; Each is 16 words = 32 bytes.
pw_bi_8:
    times 16 dw (1 << 8)
max_pixels_12:
    times 16 dw ((1 << 12) - 1)
42cabdff1aSopenharmony_ci
; EPEL_TABLE bitdepth, count, size, isa
;   %1  number appended to the label (the bit depth the table is built for)
;   %2  `times` repeat count applied to each coefficient pair
;   %3  data directive suffix: b -> db, w -> dw
;   %4  instruction-set tag embedded in the label
;
; Emits hevc_epel_filters_%4_%1. The table has 14 rows: 7 four-tap filters,
; each stored as a (c0, c1) row followed by a (c2, c3) row. Every row is the
; pair replicated %2 times.
%macro EPEL_TABLE 4
hevc_epel_filters_%4_%1:
    times %2 d%3  -2,  58
    times %2 d%3  10,  -2
    times %2 d%3  -4,  54
    times %2 d%3  16,  -2
    times %2 d%3  -6,  46
    times %2 d%3  28,  -4
    times %2 d%3  -4,  36
    times %2 d%3  36,  -4
    times %2 d%3  -4,  28
    times %2 d%3  46,  -6
    times %2 d%3  -2,  16
    times %2 d%3  54,  -4
    times %2 d%3  -2,  10
    times %2 d%3  58,  -2
%endmacro

; 32-byte rows (16 byte pairs / 8 word pairs)
EPEL_TABLE  8, 16, b, avx2
EPEL_TABLE 10,  8, w, avx2

; 16-byte rows (8 byte pairs / 4 word pairs)
EPEL_TABLE  8,  8, b, sse4
EPEL_TABLE 10,  4, w, sse4
EPEL_TABLE 12,  4, w, sse4
67cabdff1aSopenharmony_ci
; QPEL_TABLE bitdepth, count, size, isa
;   %1  number appended to the label
;   %2  `times` repeat count applied to each coefficient pair
;   %3  data directive suffix: b -> db, w -> dw, d -> dd
;   %4  instruction-set tag embedded in the label
;
; Emits hevc_qpel_filters_%4_%1. The table has 12 rows: 3 eight-tap filters,
; each stored as four consecutive rows (c0,c1) (c2,c3) (c4,c5) (c6,c7). Every
; row is the pair replicated %2 times.
%macro QPEL_TABLE 4
hevc_qpel_filters_%4_%1:
    ; filter 1
    times %2 d%3  -1,   4
    times %2 d%3 -10,  58
    times %2 d%3  17,  -5
    times %2 d%3   1,   0
    ; filter 2
    times %2 d%3  -1,   4
    times %2 d%3 -11,  40
    times %2 d%3  40, -11
    times %2 d%3   4,  -1
    ; filter 3
    times %2 d%3   0,   1
    times %2 d%3  -5,  17
    times %2 d%3  58, -10
    times %2 d%3   4,  -1
%endmacro

; 16-byte rows
QPEL_TABLE  8,  8, b, sse4
QPEL_TABLE 10,  4, w, sse4
QPEL_TABLE 12,  4, w, sse4

; 32-byte rows
QPEL_TABLE  8, 16, b, avx2
QPEL_TABLE 10,  8, w, avx2

; One pair per row (no replication)
QPEL_TABLE  4,  1, b, avx512icl_h
QPEL_TABLE  8,  1, b, avx512icl_h
QPEL_TABLE  8,  1, d, avx512icl_v
QPEL_TABLE 16,  1, b, avx512icl_h
QPEL_TABLE 32,  1, b, avx512icl_h
QPEL_TABLE 64,  1, b, avx512icl_h
96cabdff1aSopenharmony_ci
; Byte index table made of 4-byte sliding windows (i, i+1, i+2, i+3).
; The first 64 bytes hold the windows starting at 0..15, the second 64 bytes
; hold the windows starting at 4..19.
pb_qpel_shuffle_index:
%assign qpel_shuffle_start 0
%rep 16
    db qpel_shuffle_start, qpel_shuffle_start+1, qpel_shuffle_start+2, qpel_shuffle_start+3
%assign qpel_shuffle_start qpel_shuffle_start+1
%endrep
%assign qpel_shuffle_start 4
%rep 16
    db qpel_shuffle_start, qpel_shuffle_start+1, qpel_shuffle_start+2, qpel_shuffle_start+3
%assign qpel_shuffle_start qpel_shuffle_start+1
%endrep
%undef qpel_shuffle_start
129cabdff1aSopenharmony_ci
SECTION .text

; LOOP_END advances the destination pointer by 2*MAX_PB_SIZE bytes per row.
%define MAX_PB_SIZE 64

; The "_14" qpel table names resolve to the "_10" tables.
%define hevc_qpel_filters_sse4_14 hevc_qpel_filters_sse4_10
%define hevc_qpel_filters_avx2_14 hevc_qpel_filters_avx2_10
137cabdff1aSopenharmony_ci
138cabdff1aSopenharmony_ci%if ARCH_X86_64
139cabdff1aSopenharmony_ci
; SIMPLE_BILOAD width, addr, r1, r2
;   %1  block width in samples
;   %2  address expression of the second source ("source2")
;   %3  first destination register
;   %4  second destination register (written only on the two-load paths)
;
; Apart from the narrowest case (movq) every load uses the aligned forms
; (movdqa / mova), so %2 must be suitably aligned.
%macro SIMPLE_BILOAD 4   ; width, tab, r1, r2
%if %1 <= 4
    movq              %3, [%2]                  ; 8 bytes
%elif %1 <= 8
    movdqa            %3, [%2]                  ; 16 bytes
%elif %1 <= 16
  %if cpuflag(avx2)
    mova              %3, [%2]                  ; single load
  %else
    movdqa            %3, [%2]                  ; first 16 bytes
    %if %1 <= 12
    movq              %4, [%2+16]               ; next 8 bytes
    %else
    movdqa            %4, [%2+16]               ; next 16 bytes
    %endif
  %endif
%else                                           ; wider than 16
    mova              %3, [%2]
    mova              %4, [%2+32]
%endif
%endmacro
164cabdff1aSopenharmony_ci
; SIMPLE_LOAD width, bitdepth, addr, r1
;   %1  block width in samples
;   %2  bit depth (8 means one byte per sample)
;   %3  source address expression
;   %4  destination register
;
; Picks the narrowest load for the row and emits one load instruction:
;   movd   - width 2, or 8-bit with width <= 4
;   movq   - width 4, or 8-bit with width <= 8
;   movdqu - AVX builds only, width <= 8 or 8-bit with width <= 16
;   movu   - everything else
%macro SIMPLE_LOAD 4    ; width, bitd, tab, r1
%if %1 == 2 || (%2 == 8 && %1 <= 4)
    %define %%ld movd
%elif %1 == 4 || (%2 == 8 && %1 <= 8)
    %define %%ld movq
%elif cpuflag(avx) && (%1 <= 8 || (%2 == 8 && %1 <= 16))
    %define %%ld movdqu
%else
    %define %%ld movu
%endif
    %%ld              %4, [%3]                  ; load data from source
%endmacro
178cabdff1aSopenharmony_ci
179cabdff1aSopenharmony_ci
; EPEL_FILTER bitdepth, idx, ra, rb, gprtmp
;   %1  bit depth, selects hevc_epel_filters_<isa>_%1
;   %2  register holding the 1-based filter index; it is overwritten with the
;       byte offset of the selected filter
;   %3  receives the first coefficient row  (taps 0 and 1)
;   %4  receives the second coefficient row (taps 2 and 3)
;   %5  scratch GPR, used only in PIC builds to hold the table address
;
; One filter is two rows: 2*16 bytes for sse4, 2*32 bytes for avx2.
; FILTER is left defined as the table base after the macro.
%macro EPEL_FILTER 5 ; bit depth, filter index, xmma, xmmb, gprtmp
%if cpuflag(avx2)
    %assign %%offset 32
    %assign %%shift  6                           ; 64 bytes per filter
    %define %%table  hevc_epel_filters_avx2_%1
%else
    %assign %%offset 16
    %assign %%shift  5                           ; 32 bytes per filter
    %define %%table  hevc_epel_filters_sse4_%1
%endif
%ifdef PIC
    lea              %5q, [%%table]
    %define FILTER %5q
%else
    %define FILTER %%table
%endif
    sub              %2q, 1                      ; make the index 0-based
    shl              %2q, %%shift                ; index -> byte offset
    mova              %3, [FILTER + %2q]
    mova              %4, [FILTER + %2q+%%offset]
%endmacro
207cabdff1aSopenharmony_ci
; EPEL_HV_FILTER bitdepth
;   %1  bit depth, selects the table indexed by mx
;
; Loads both 4-tap coefficient sets:
;   m14 / m15  rows (taps 0-1, taps 2-3) of filter mx from the %1 table
;   m12 / m13  rows (taps 0-1, taps 2-3) of filter my from the _10 table
; mxq and myq arrive as 1-based filter indices and are overwritten with byte
; offsets (index-1 times 32 for sse4, times 64 for avx2).
; r3srcq is used as a table pointer in PIC builds and always ends up holding
; 3*srcstride.
%macro EPEL_HV_FILTER 1
%if cpuflag(avx2)
    %assign %%offset 32
    %assign %%shift  6
    %define %%tab_mx hevc_epel_filters_avx2_%1
    %define %%tab_my hevc_epel_filters_avx2_10
%else
    %assign %%offset 16
    %assign %%shift  5
    %define %%tab_mx hevc_epel_filters_sse4_%1
    %define %%tab_my hevc_epel_filters_sse4_10
%endif

%ifdef PIC
    lea           r3srcq, [%%tab_mx]
    %define FILTER r3srcq
%else
    %define FILTER %%tab_mx
%endif
    sub              mxq, 1
    sub              myq, 1
    shl              mxq, %%shift                ; index -> byte offset
    shl              myq, %%shift                ; index -> byte offset
    mova             m14, [FILTER + mxq]
    mova             m15, [FILTER + mxq+%%offset]

%ifdef PIC
    lea           r3srcq, [%%tab_my]
    %define FILTER r3srcq
%else
    %define FILTER %%tab_my
%endif
    mova             m12, [FILTER + myq]
    mova             m13, [FILTER + myq+%%offset]
    lea           r3srcq, [srcstrideq*3]
%endmacro
247cabdff1aSopenharmony_ci
; QPEL_FILTER bitdepth, idx
;   %1  bit depth, selects hevc_qpel_filters_<isa>_%1
;   %2  register holding the 1-based filter index; it is overwritten with the
;       byte offset of the selected filter
;
; Loads the four coefficient rows of one 8-tap filter:
;   m12 = taps 0-1, m13 = taps 2-3, m14 = taps 4-5, m15 = taps 6-7
; One filter is four rows: 64 bytes for sse4, 128 bytes for avx2.
; PIC builds load the table address into rfilterq. Otherwise rfilterq is
; defined as the table symbol itself.
%macro QPEL_FILTER 2

%if cpuflag(avx2)
    %assign %%offset 32
    %assign %%shift  7                           ; 128 bytes per filter
    %define %%table  hevc_qpel_filters_avx2_%1
%else
    %assign %%offset 16
    %assign %%shift  6                           ; 64 bytes per filter
    %define %%table  hevc_qpel_filters_sse4_%1
%endif

%ifdef PIC
    lea         rfilterq, [%%table]
%else
    %define rfilterq %%table
%endif
    sub              %2q, 1                      ; make the index 0-based
    shl              %2q, %%shift                ; index -> byte offset
    mova             m12, [rfilterq + %2q]
    mova             m13, [rfilterq + %2q +   %%offset]
    mova             m14, [rfilterq + %2q + 2*%%offset]
    mova             m15, [rfilterq + %2q + 3*%%offset]
%endmacro
272cabdff1aSopenharmony_ci
; EPEL_LOAD bitdepth, src, stride, width
;   %1  bit depth (8 means byte samples)
;   %2  source pointer register name, without the size suffix
;   %3  distance between the four taps: a number, or a register name without
;       suffix. In the register case r3srcq supplies the third offset.
;   %4  block width in samples
;
; Loads m0..m3 from src + {0, 1, 2, 3} * stride and interleaves them pairwise
; (m0 with m1, m2 with m3) at sample granularity. Narrow blocks keep only the
; low interleave. Wide blocks use SBUTTERFLY with m7 as its fourth argument.
; NOTE(review): SBUTTERFLY comes from x86util.asm; presumably the low halves
; stay in m0/m2, the high halves go to m1/m3, and m7 is scratch - confirm.
%macro EPEL_LOAD 4
%if (%1 == 8 && %4 <= 4)
    %define %%load movd
%elif (%1 == 8 && %4 <= 8) || (%1 > 8 && %4 <= 4)
    %define %%load movq
%else
    %define %%load movdqu
%endif

%ifnum %3
    %define %%step1 %3
    %define %%step2 2*%3
    %define %%step3 3*%3
%else
    %define %%step1 %3q
    %define %%step2 2*%3q
    %define %%step3 r3srcq
%endif

    %%load            m0, [%2q]
    %%load            m1, [%2q+%%step1]
    %%load            m2, [%2q+%%step2]
    %%load            m3, [%2q+%%step3]

%if %1 == 8 && %4 > 8
    SBUTTERFLY        bw, 0, 1, 7
    SBUTTERFLY        bw, 2, 3, 7
%elif %1 == 8
    punpcklbw         m0, m1
    punpcklbw         m2, m3
%elif %4 > 4
    SBUTTERFLY        wd, 0, 1, 7
    SBUTTERFLY        wd, 2, 3, 7
%else
    punpcklwd         m0, m1
    punpcklwd         m2, m3
%endif
%endmacro
310cabdff1aSopenharmony_ci
311cabdff1aSopenharmony_ci
; QPEL_H_LOAD bitdepth, src, width, tmp
;   %1  bit depth; the sample size is (%1+7)/8 bytes
;   %2  source address expression (used as written inside the brackets)
;   %3  block width in samples
;   %4  fourth argument handed to SBUTTERFLY on the wide paths
;
; Loads eight registers m0..m7 from src-3 .. src+4 samples, then interleaves
; them pairwise (m0/m1, m2/m3, m4/m5, m6/m7).
; Load size: movd for 8-bit width <= 4 or >8-bit width 2; movq for 8-bit
; width 8 or >8-bit width 4; movu otherwise.
%macro QPEL_H_LOAD 4
%assign %%stride (%1+7)/8
%if (%1 == 8 && %3 <= 4) || (%1 != 8 && %3 == 2)
    %define %%load movd
%elif (%1 == 8 && %3 == 8) || (%1 != 8 && %3 == 4)
    %define %%load movq
%else
    %define %%load movu
%endif

    %%load            m0, [%2-3*%%stride]
    %%load            m1, [%2-2*%%stride]
    %%load            m2, [%2-%%stride  ]
    %%load            m3, [%2           ]
    %%load            m4, [%2+%%stride  ]
    %%load            m5, [%2+2*%%stride]
    %%load            m6, [%2+3*%%stride]
    %%load            m7, [%2+4*%%stride]

%if %1 == 8 && %3 > 8
    ; Neighbouring registers are one byte apart, so interleaving 16-bit units
    ; yields adjacent sample pairs.
    SBUTTERFLY        wd, 0, 1, %4
    SBUTTERFLY        wd, 2, 3, %4
    SBUTTERFLY        wd, 4, 5, %4
    SBUTTERFLY        wd, 6, 7, %4
%elif %1 == 8
    punpcklbw         m0, m1
    punpcklbw         m2, m3
    punpcklbw         m4, m5
    punpcklbw         m6, m7
%elif %3 > 4
    SBUTTERFLY        dq, 0, 1, %4
    SBUTTERFLY        dq, 2, 3, %4
    SBUTTERFLY        dq, 4, 5, %4
    SBUTTERFLY        dq, 6, 7, %4
%else
    punpcklwd         m0, m1
    punpcklwd         m2, m3
    punpcklwd         m4, m5
    punpcklwd         m6, m7
%endif
%endmacro
366cabdff1aSopenharmony_ci
; QPEL_V_LOAD bitdepth, src, stride, width, tmp
;   %1  bit depth (8 means byte samples)
;   %2  source address expression for the current row
;   %3  stride register name, without the size suffix
;   %4  block width in samples
;   %5  scratch GPR name, without suffix; receives src - r3srcq
;
; Loads eight rows, src-3*stride .. src+4*stride, into m0..m7 and interleaves
; them pairwise at sample granularity. r3srcq is used as 3*stride.
; The wide paths call SBUTTERFLY with 8 as its fourth argument.
%macro QPEL_V_LOAD 5
    lea              %5q, [%2]
    sub              %5q, r3srcq                 ; tmp = x - 3*stride
    movu              m0, [%5q]                  ; row x - 3*stride
    movu              m1, [%5q + %3q]            ; row x - 2*stride
    movu              m2, [%5q + 2*%3q]          ; row x - stride
    movu              m3, [%2]                   ; row x
    movu              m4, [%2 + %3q]             ; row x + stride
    movu              m5, [%2 + 2*%3q]           ; row x + 2*stride
    movu              m6, [%2 + r3srcq]          ; row x + 3*stride
    movu              m7, [%2 + 4*%3q]           ; row x + 4*stride
%if %1 == 8 && %4 > 8
    SBUTTERFLY        bw, 0, 1, 8
    SBUTTERFLY        bw, 2, 3, 8
    SBUTTERFLY        bw, 4, 5, 8
    SBUTTERFLY        bw, 6, 7, 8
%elif %1 == 8
    punpcklbw         m0, m1
    punpcklbw         m2, m3
    punpcklbw         m4, m5
    punpcklbw         m6, m7
%elif %4 > 4
    SBUTTERFLY        wd, 0, 1, 8
    SBUTTERFLY        wd, 2, 3, 8
    SBUTTERFLY        wd, 4, 5, 8
    SBUTTERFLY        wd, 6, 7, 8
%else
    punpcklwd         m0, m1
    punpcklwd         m2, m3
    punpcklwd         m4, m5
    punpcklwd         m6, m7
%endif
%endmacro
404cabdff1aSopenharmony_ci
; PEL_12STORE<width> dst, r1, r2
;   %1  destination address expression
;   %2  register holding the first part of the row
;   %3  register holding the second part of the row (widths 12 and 16 only)
; Each sample occupies two bytes. Widths 8, 12 and 16 use aligned stores.
; Width 6 shifts %2 right by 8 bytes, so %2 is modified.

%macro PEL_12STORE2 3                            ; 4 bytes
    movd            [%1], %2
%endmacro

%macro PEL_12STORE4 3                            ; 8 bytes
    movq            [%1], %2
%endmacro

%macro PEL_12STORE6 3                            ; 8 + 4 bytes
    movq            [%1], %2
    psrldq            %2, 8
    movd          [%1+8], %2
%endmacro

%macro PEL_12STORE8 3                            ; 16 bytes
    movdqa          [%1], %2
%endmacro

%macro PEL_12STORE12 3                           ; 16 + 8 bytes
    movdqa          [%1], %2
    movq         [%1+16], %3
%endmacro

%macro PEL_12STORE16 3                           ; 16 + 16 bytes
    movdqa          [%1], %2
    movdqa       [%1+16], %3
%endmacro
427cabdff1aSopenharmony_ci
; PEL_10STORE<width> dst, r1, r2
;   %1  destination address expression
;   %2  register holding the first part of the row
;   %3  register holding the second part of the row (where one is needed)
; Each sample occupies two bytes. Width 6 shifts %2 right by 8 bytes, so %2 is
; modified. On avx2 builds width 16 is a single unaligned store of %2.

%macro PEL_10STORE2 3                            ; 4 bytes
    movd            [%1], %2
%endmacro

%macro PEL_10STORE4 3                            ; 8 bytes
    movq            [%1], %2
%endmacro

%macro PEL_10STORE6 3                            ; 8 + 4 bytes
    movq            [%1], %2
    psrldq            %2, 8
    movd          [%1+8], %2
%endmacro

%macro PEL_10STORE8 3                            ; 16 bytes, aligned
    movdqa          [%1], %2
%endmacro

%macro PEL_10STORE12 3                           ; 16 + 8 bytes
    movdqa          [%1], %2
    movq         [%1+16], %3
%endmacro

%macro PEL_10STORE16 3
%if cpuflag(avx2)
    movu            [%1], %2
%else
    movdqa          [%1], %2
    movdqa       [%1+16], %3
%endif
%endmacro

%macro PEL_10STORE32 3                           ; width-16 store, then %3 at +32
    PEL_10STORE16     %1, %2, %3
    movu         [%1+32], %3
%endmacro
459cabdff1aSopenharmony_ci
; PEL_8STORE<width> dst, r1, r2
;   %1  destination address expression
;   %2  register holding the row
;   %3  accepted but not used by any member of this family
; Each sample occupies one byte. Width 12 shifts %2 right by 8 bytes, so %2 is
; modified.

%macro PEL_8STORE2 3                             ; word 0
    pextrw          [%1], %2, 0
%endmacro

%macro PEL_8STORE4 3                             ; 4 bytes
    movd            [%1], %2
%endmacro

%macro PEL_8STORE6 3                             ; 4 bytes + word 2
    movd            [%1], %2
    pextrw        [%1+4], %2, 2
%endmacro

%macro PEL_8STORE8 3                             ; 8 bytes
    movq            [%1], %2
%endmacro

%macro PEL_8STORE12 3                            ; 8 + 4 bytes
    movq            [%1], %2
    psrldq            %2, 8
    movd          [%1+8], %2
%endmacro

%macro PEL_8STORE16 3                            ; unaligned on avx2, aligned otherwise
%if cpuflag(avx2)
    movdqu          [%1], %2
%else
    mova            [%1], %2
%endif
%endmacro

%macro PEL_8STORE32 3                            ; one unaligned full-register store
    movu            [%1], %2
%endmacro
488cabdff1aSopenharmony_ci
; LOOP_END dst, src, srcstride
;   %1  destination pointer register name, without suffix
;   %2  source pointer register name, without suffix
;   %3  source stride register name, without suffix
; Tail of a per-row loop: steps both pointers to the next row, decrements
; heightd and jumps back to the invoking function's .loop label while rows
; remain. The destination row pitch is fixed at 2*MAX_PB_SIZE bytes.
%macro LOOP_END 3
    add              %1q, 2*MAX_PB_SIZE          ; next destination row
    add              %2q, %3q                    ; next source row
    dec          heightd                         ; one row done
    jnz               .loop                      ; more rows left
%endmacro
495cabdff1aSopenharmony_ci
496cabdff1aSopenharmony_ci
; MC_PIXEL_COMPUTE width, bitdepth[, flag]
;   %1  block width in samples
;   %2  bit depth
;   %3  optional; only its presence matters. On avx2 builds it selects the
;       pmovzxbw widening path.
;
; For 8-bit input the bytes in m0 are widened to words. The high half goes to
; m1 when the block is wide enough (more than 16 samples on the avx2 path,
; more than 8 otherwise). The result is then shifted left by 14 - bitdepth.
; For other bit depths only the final shift of m0 is emitted.
; NOTE(review): the punpck path interleaves with m2, so m2 presumably holds
; zero at the call site - confirm against the callers.
%macro MC_PIXEL_COMPUTE 2-3 ;width, bitdepth
%if %2 == 8
  %if cpuflag(avx2) && %0 == 3
    %if %1 > 16
    vextracti128     xm1, m0, 1                  ; upper 16 bytes
    pmovzxbw          m1, xm1
    psllw             m1, 14-%2
    %endif
    pmovzxbw          m0, xm0                    ; lower 16 bytes
  %elif %1 > 8
    punpckhbw         m1, m0, m2
    psllw             m1, 14-%2
    punpcklbw         m0, m2
  %else
    punpcklbw         m0, m2
  %endif
%endif
    psllw             m0, 14-%2
%endmacro
516cabdff1aSopenharmony_ci
; EPEL_COMPUTE bitdepth, width, filter1, filter2[, flag | a, c, b, d]
;   %1  bit depth
;   %2  block width in samples
;   %3  coefficient row for taps 0-1
;   %4  coefficient row for taps 2-3
;   With exactly 5 arguments on an avx2 build (8-bit only), the inputs
;   m0..m3 are first regrouped across 128-bit lanes.
;   With 8 arguments, %5/%6 replace m0/m2 and %7/%8 replace m1/m3.
;
; Applies the 4-tap filter to interleaved sample pairs. The result is left in
; the first register of the low pair (m0 by default):
;   8-bit:  pmaddubsw + paddw. The high pair (m1/m3) is processed only when
;           width > 8.
;   other:  pmaddwd + paddd, arithmetic shift right by bitdepth-8, then
;           packssdw of the low and high results. The high pair is processed
;           only when width > 4.
; NOTE(review): on the 5-argument avx2 path m10 is copied into m1/m3 even when
; width <= 16, where m10 is not written here - confirm no caller relies on
; m1/m3 in that case.
%macro EPEL_COMPUTE 4-8 ; bitdepth, width, filter1, filter2, HV/m0, m2, m1, m3
%if %0 == 8
    %define %%lo_a %5
    %define %%lo_b %6
    %define %%hi_a %7
    %define %%hi_b %8
%else
    %define %%lo_a m0
    %define %%lo_b m2
    %define %%hi_a m1
    %define %%hi_b m3
%endif

%if %1 == 8
  %if cpuflag(avx2) && (%0 == 5)
    %if %2 > 16
    vperm2i128       m10, m0, m1, q0301          ; m10 = upper lanes of m0, m1
    %endif
    vinserti128       m0, m0, xm1, 1             ; m0  = lower lanes of m0, m1
    mova              m1, m10
    %if %2 > 16
    vperm2i128       m10, m2, m3, q0301          ; m10 = upper lanes of m2, m3
    %endif
    vinserti128       m2, m2, xm3, 1             ; m2  = lower lanes of m2, m3
    mova              m3, m10
  %endif
    pmaddubsw     %%lo_a, %3                     ; x1*c1 + x2*c2
    pmaddubsw     %%lo_b, %4                     ; x3*c3 + x4*c4
    paddw         %%lo_a, %%lo_b
  %if %2 > 8
    pmaddubsw     %%hi_a, %3
    pmaddubsw     %%hi_b, %4
    paddw         %%hi_a, %%hi_b
  %endif
%else
    pmaddwd       %%lo_a, %3
    pmaddwd       %%lo_b, %4
    paddd         %%lo_a, %%lo_b
  %if %2 > 4
    pmaddwd       %%hi_a, %3
    pmaddwd       %%hi_b, %4
    paddd         %%hi_a, %%hi_b
    psrad         %%hi_a, %1-8
  %endif
    psrad         %%lo_a, %1-8
    packssdw      %%lo_a, %%hi_a
%endif
%endmacro
568cabdff1aSopenharmony_ci
; QPEL_HV_COMPUTE width, bitdepth, idx, pack
;   %1  block width in samples
;   %2  bit depth, selects hevc_qpel_filters_<isa>_%2
;   %3  register name (without suffix) holding the filter offset; it is
;       scaled by 8 in the addressing
;   %4  mnemonic tail of the final pack instruction, emitted as p%4
;       (used only when bitdepth != 8)
;
; Applies the 8-tap filter with coefficients read straight from memory:
;   8-bit:  m0 = sum of pmaddubsw over m0, m2, m4, m6 (words)
;   other:  m0 = (sum of pmaddwd over m0, m2, m4, m6) >> (bitdepth-8). When
;           width > 4 the same is done for m1 from m1, m3, m5, m7. Finally
;           p%4 m0, m1.
; PIC builds load the table address into rfilterq. Otherwise rfilterq is
; defined as the table symbol itself.
%macro QPEL_HV_COMPUTE 4     ; width, bitdepth, filter idx, pack suffix

%if cpuflag(avx2)
    %assign %%offset 32
    %define %%table  hevc_qpel_filters_avx2_%2
%else
    %assign %%offset 16
    %define %%table  hevc_qpel_filters_sse4_%2
%endif

%ifdef PIC
    lea         rfilterq, [%%table]
%else
    %define rfilterq %%table
%endif

%if %2 == 8
    pmaddubsw         m0, [rfilterq + %3q*8]              ; x1*c1 + x2*c2
    pmaddubsw         m2, [rfilterq + %3q*8+%%offset]     ; x3*c3 + x4*c4
    pmaddubsw         m4, [rfilterq + %3q*8+2*%%offset]   ; x5*c5 + x6*c6
    pmaddubsw         m6, [rfilterq + %3q*8+3*%%offset]   ; x7*c7 + x8*c8
    paddw             m0, m2
    paddw             m4, m6
    paddw             m0, m4
%else
    pmaddwd           m0, [rfilterq + %3q*8]
    pmaddwd           m2, [rfilterq + %3q*8+%%offset]
    pmaddwd           m4, [rfilterq + %3q*8+2*%%offset]
    pmaddwd           m6, [rfilterq + %3q*8+3*%%offset]
    paddd             m0, m2
    paddd             m4, m6
    paddd             m0, m4
    psrad             m0, %2-8
  %if %1 > 4
    pmaddwd           m1, [rfilterq + %3q*8]
    pmaddwd           m3, [rfilterq + %3q*8+%%offset]
    pmaddwd           m5, [rfilterq + %3q*8+2*%%offset]
    pmaddwd           m7, [rfilterq + %3q*8+3*%%offset]
    paddd             m1, m3
    paddd             m5, m7
    paddd             m1, m5
    psrad             m1, %2-8
  %endif
    p%4               m0, m1
%endif
%endmacro
619cabdff1aSopenharmony_ci
; QPEL_COMPUTE width, bitdepth [, flag]
;
; 8-tap filter of the vectors in m0..m7 using the four coefficient-pair
; vectors already held in m12..m15 (presumably loaded by QPEL_FILTER, which is
; defined outside this section - confirm).
;   8-bit    : word sums in m0, plus m1 when width > 8.
;   > 8-bit  : dword sums shifted right by (bitdepth - 8) in m0, plus m1 when
;              width > 4. No packing is done here; the visible callers follow
;              up with packssdw m0, m1.
;   flag     : only its presence is tested (%0 == 3). With avx2 and 8-bit it
;              enables a 128-bit lane re-arrangement of each register pair
;              before filtering; m10 is used as scratch for that.
620cabdff1aSopenharmony_ci%macro QPEL_COMPUTE 2-3     ; width, bitdepth [, flag]
621cabdff1aSopenharmony_ci%if %2 == 8
622cabdff1aSopenharmony_ci%if cpuflag(avx2) && (%0 == 3)
623cabdff1aSopenharmony_ci
; For each (even, odd) pair: the even register ends up with the low 128-bit
; lane of both inputs (vinserti128), the odd one with the lanes selected by
; q0301 (presumably the high lane of each input - confirm against x86inc).
; SWAP renames the scratch m10 into the odd slot.
624cabdff1aSopenharmony_ci    vperm2i128 m10, m0,  m1, q0301
625cabdff1aSopenharmony_ci    vinserti128 m0, m0, xm1, 1
626cabdff1aSopenharmony_ci    SWAP 1, 10
627cabdff1aSopenharmony_ci
628cabdff1aSopenharmony_ci    vperm2i128 m10, m2,  m3, q0301
629cabdff1aSopenharmony_ci    vinserti128 m2, m2, xm3, 1
630cabdff1aSopenharmony_ci    SWAP 3, 10
631cabdff1aSopenharmony_ci
632cabdff1aSopenharmony_ci
633cabdff1aSopenharmony_ci    vperm2i128 m10, m4,  m5, q0301
634cabdff1aSopenharmony_ci    vinserti128 m4, m4, xm5, 1
635cabdff1aSopenharmony_ci    SWAP 5, 10
636cabdff1aSopenharmony_ci
637cabdff1aSopenharmony_ci    vperm2i128 m10, m6,  m7, q0301
638cabdff1aSopenharmony_ci    vinserti128 m6, m6, xm7, 1
639cabdff1aSopenharmony_ci    SWAP 7, 10
640cabdff1aSopenharmony_ci%endif
641cabdff1aSopenharmony_ci
; 8-bit path: even registers -> m0.
642cabdff1aSopenharmony_ci    pmaddubsw         m0, m12   ;x1*c1+x2*c2
643cabdff1aSopenharmony_ci    pmaddubsw         m2, m13   ;x3*c3+x4*c4
644cabdff1aSopenharmony_ci    pmaddubsw         m4, m14   ;x5*c5+x6*c6
645cabdff1aSopenharmony_ci    pmaddubsw         m6, m15   ;x7*c7+x8*c8
646cabdff1aSopenharmony_ci    paddw             m0, m2
647cabdff1aSopenharmony_ci    paddw             m4, m6
648cabdff1aSopenharmony_ci    paddw             m0, m4
649cabdff1aSopenharmony_ci%if %1 > 8
; 8-bit path, second half: odd registers -> m1.
650cabdff1aSopenharmony_ci    pmaddubsw         m1, m12
651cabdff1aSopenharmony_ci    pmaddubsw         m3, m13
652cabdff1aSopenharmony_ci    pmaddubsw         m5, m14
653cabdff1aSopenharmony_ci    pmaddubsw         m7, m15
654cabdff1aSopenharmony_ci    paddw             m1, m3
655cabdff1aSopenharmony_ci    paddw             m5, m7
656cabdff1aSopenharmony_ci    paddw             m1, m5
657cabdff1aSopenharmony_ci%endif
658cabdff1aSopenharmony_ci%else
; > 8-bit path: dword accumulation, then scale back by (bitdepth - 8).
659cabdff1aSopenharmony_ci    pmaddwd           m0, m12
660cabdff1aSopenharmony_ci    pmaddwd           m2, m13
661cabdff1aSopenharmony_ci    pmaddwd           m4, m14
662cabdff1aSopenharmony_ci    pmaddwd           m6, m15
663cabdff1aSopenharmony_ci    paddd             m0, m2
664cabdff1aSopenharmony_ci    paddd             m4, m6
665cabdff1aSopenharmony_ci    paddd             m0, m4
; Always true here: this is the %else of "%if %2 == 8".
666cabdff1aSopenharmony_ci%if %2 != 8
667cabdff1aSopenharmony_ci    psrad             m0, %2-8
668cabdff1aSopenharmony_ci%endif
669cabdff1aSopenharmony_ci%if %1 > 4
670cabdff1aSopenharmony_ci    pmaddwd           m1, m12
671cabdff1aSopenharmony_ci    pmaddwd           m3, m13
672cabdff1aSopenharmony_ci    pmaddwd           m5, m14
673cabdff1aSopenharmony_ci    pmaddwd           m7, m15
674cabdff1aSopenharmony_ci    paddd             m1, m3
675cabdff1aSopenharmony_ci    paddd             m5, m7
676cabdff1aSopenharmony_ci    paddd             m1, m5
677cabdff1aSopenharmony_ci%if %2 != 8
678cabdff1aSopenharmony_ci    psrad             m1, %2-8
679cabdff1aSopenharmony_ci%endif
680cabdff1aSopenharmony_ci%endif
681cabdff1aSopenharmony_ci%endif
682cabdff1aSopenharmony_ci%endmacro
683cabdff1aSopenharmony_ci
; BI_COMPUTE width, bitdepth, src1l, src1h, src2l, src2h, pw [, flag]
;
; Bi-prediction combine: adds the second prediction (src2l/src2h) to the first
; (src1l/src1h) with signed saturation, then hands the sum to UNI_COMPUTE for
; scaling by pw and conversion to pixels. The result replaces src1l (and
; src1h where UNI_COMPUTE writes it).
;   flag - only its presence is tested (%0 == 8). With avx2 and 8-bit it adds
;          a vpermq on both result registers.
684cabdff1aSopenharmony_ci%macro BI_COMPUTE 7-8     ; width, bitd, src1l, src1h, src2l, src2h, pw [, flag]
685cabdff1aSopenharmony_ci    paddsw            %3, %5
; The high halves are only summed for widths above 8.
686cabdff1aSopenharmony_ci%if %1 > 8
687cabdff1aSopenharmony_ci    paddsw            %4, %6
688cabdff1aSopenharmony_ci%endif
689cabdff1aSopenharmony_ci    UNI_COMPUTE       %1, %2, %3, %4, %7
690cabdff1aSopenharmony_ci%if %0 == 8 && cpuflag(avx2) && (%2 == 8)
; 216 = 0xD8 selects qwords in the order 0,2,1,3; presumably this undoes the
; per-lane interleave left by packuswb inside UNI_COMPUTE - confirm.
691cabdff1aSopenharmony_ci    vpermq            %3, %3, 216
692cabdff1aSopenharmony_ci    vpermq            %4, %4, 216
693cabdff1aSopenharmony_ci%endif
694cabdff1aSopenharmony_ci%endmacro
695cabdff1aSopenharmony_ci
; UNI_COMPUTE width, bitdepth, lo, hi, pw
;
; Converts 16-bit intermediate values to output pixels:
;   1. pmulhrsw by pw (rounded high-half multiply) on lo, and on hi when
;      width > 8 or (bitdepth > 8 and width > 4);
;   2. 8-bit: packuswb lo, hi (saturating pack to unsigned bytes in lo);
;      > 8-bit: CLIPW lo against [pb_0] and [max_pixels_<bitdepth>], and hi as
;      well under the condition noted below.
696cabdff1aSopenharmony_ci%macro UNI_COMPUTE 5
697cabdff1aSopenharmony_ci    pmulhrsw          %3, %5
698cabdff1aSopenharmony_ci%if %1 > 8 || (%2 > 8 && %1 > 4)
699cabdff1aSopenharmony_ci    pmulhrsw          %4, %5
700cabdff1aSopenharmony_ci%endif
701cabdff1aSopenharmony_ci%if %2 == 8
702cabdff1aSopenharmony_ci    packuswb          %3, %4
703cabdff1aSopenharmony_ci%else
704cabdff1aSopenharmony_ci    CLIPW             %3, [pb_0], [max_pixels_%2]
; NOTE(review): hi is clipped only for non-AVX builds with width > 8, or for
; any build with width > 16. An AVX (non-AVX2) XMM instantiation with width
; 9..16 would scale hi above but not clip it - confirm no such instantiation
; exists.
705cabdff1aSopenharmony_ci%if (%1 > 8 && notcpuflag(avx)) || %1 > 16
706cabdff1aSopenharmony_ci    CLIPW             %4, [pb_0], [max_pixels_%2]
707cabdff1aSopenharmony_ci%endif
708cabdff1aSopenharmony_ci%endif
709cabdff1aSopenharmony_ci%endmacro
710cabdff1aSopenharmony_ci
711cabdff1aSopenharmony_ci
712cabdff1aSopenharmony_ci; ******************************
713cabdff1aSopenharmony_ci; void put_hevc_mc_pixels(int16_t *dst, ptrdiff_t dststride,
714cabdff1aSopenharmony_ci;                         uint8_t *_src, ptrdiff_t _srcstride,
715cabdff1aSopenharmony_ci;                         int height, int mx, int my)
716cabdff1aSopenharmony_ci; ******************************
717cabdff1aSopenharmony_ci
; HEVC_PUT_HEVC_PEL_PIXELS width, bitdepth
;
; Instantiates the three full-pel (no interpolation filter) functions for one
; width/bitdepth combination: the plain put, the uni and the bi variant.
718cabdff1aSopenharmony_ci%macro HEVC_PUT_HEVC_PEL_PIXELS 2
719cabdff1aSopenharmony_ciHEVC_PEL_PIXELS     %1, %2
720cabdff1aSopenharmony_ciHEVC_UNI_PEL_PIXELS %1, %2
721cabdff1aSopenharmony_ciHEVC_BI_PEL_PIXELS  %1, %2
722cabdff1aSopenharmony_ci%endmacro
723cabdff1aSopenharmony_ci
; HEVC_PEL_PIXELS width, bitdepth
;
; Emits hevc_put_hevc_pel_pixels<width>_<bitdepth>(dst, src, srcstride, height):
; per row, load the source pixels, run MC_PIXEL_COMPUTE on them and store the
; result with PEL_10STORE<width>. Four arguments are loaded into four GPRs and
; three vector registers are declared.
724cabdff1aSopenharmony_ci%macro HEVC_PEL_PIXELS 2
725cabdff1aSopenharmony_cicglobal hevc_put_hevc_pel_pixels%1_%2, 4, 4, 3, dst, src, srcstride,height
; m2 = 0; presumably the zero operand MC_PIXEL_COMPUTE unpacks against - confirm.
726cabdff1aSopenharmony_ci    pxor               m2, m2
727cabdff1aSopenharmony_ci.loop:
728cabdff1aSopenharmony_ci    SIMPLE_LOAD       %1, %2, srcq, m0
729cabdff1aSopenharmony_ci    MC_PIXEL_COMPUTE  %1, %2, 1
; The 10-bit store macro is used for every bit depth here.
730cabdff1aSopenharmony_ci    PEL_10STORE%1     dstq, m0, m1
; LOOP_END is defined outside this section; presumably it advances dst and
; src, decrements height and jumps back to .loop - confirm.
731cabdff1aSopenharmony_ci    LOOP_END         dst, src, srcstride
732cabdff1aSopenharmony_ci    RET
733cabdff1aSopenharmony_ci %endmacro
734cabdff1aSopenharmony_ci
; HEVC_UNI_PEL_PIXELS width, bitdepth
;
; Emits hevc_put_hevc_uni_pel_pixels<width>_<bitdepth>(dst, dststride, src,
; srcstride, height): a row-by-row copy. Each row is loaded with SIMPLE_LOAD
; and written back unchanged with the store macro matching the bit depth.
735cabdff1aSopenharmony_ci%macro HEVC_UNI_PEL_PIXELS 2
736cabdff1aSopenharmony_cicglobal hevc_put_hevc_uni_pel_pixels%1_%2, 5, 5, 2, dst, dststride, src, srcstride,height
737cabdff1aSopenharmony_ci.loop:
738cabdff1aSopenharmony_ci    SIMPLE_LOAD       %1, %2, srcq, m0
739cabdff1aSopenharmony_ci    PEL_%2STORE%1   dstq, m0, m1
740cabdff1aSopenharmony_ci    add             dstq, dststrideq             ; dst += dststride
741cabdff1aSopenharmony_ci    add             srcq, srcstrideq             ; src += srcstride
742cabdff1aSopenharmony_ci    dec          heightd                         ; --height (sets ZF for the jnz below)
743cabdff1aSopenharmony_ci    jnz               .loop                      ; height loop
744cabdff1aSopenharmony_ci    RET
745cabdff1aSopenharmony_ci%endmacro
746cabdff1aSopenharmony_ci
; HEVC_BI_PEL_PIXELS width, bitdepth
;
; Emits hevc_put_hevc_bi_pel_pixels<width>_<bitdepth>(dst, dststride, src,
; srcstride, src2, height): per row, the source pixels are run through
; MC_PIXEL_COMPUTE, combined with the row read from src2 by BI_COMPUTE and
; stored as pixels. src2 advances by a fixed 2*MAX_PB_SIZE bytes per row.
747cabdff1aSopenharmony_ci%macro HEVC_BI_PEL_PIXELS 2
748cabdff1aSopenharmony_cicglobal hevc_put_hevc_bi_pel_pixels%1_%2, 6, 6, 6, dst, dststride, src, srcstride, src2, height
; m2 = 0 (presumably consumed by MC_PIXEL_COMPUTE - confirm);
; m5 = bi-prediction multiplier passed to BI_COMPUTE as its pw argument.
749cabdff1aSopenharmony_ci    pxor              m2, m2
750cabdff1aSopenharmony_ci    movdqa            m5, [pw_bi_%2]
751cabdff1aSopenharmony_ci.loop:
752cabdff1aSopenharmony_ci    SIMPLE_LOAD       %1, %2, srcq, m0
753cabdff1aSopenharmony_ci    SIMPLE_BILOAD     %1, src2q, m3, m4
754cabdff1aSopenharmony_ci    MC_PIXEL_COMPUTE  %1, %2, 1
755cabdff1aSopenharmony_ci    BI_COMPUTE        %1, %2, m0, m1, m3, m4, m5, 1
756cabdff1aSopenharmony_ci    PEL_%2STORE%1   dstq, m0, m1
757cabdff1aSopenharmony_ci    add             dstq, dststrideq             ; dst += dststride
758cabdff1aSopenharmony_ci    add             srcq, srcstrideq             ; src += srcstride
759cabdff1aSopenharmony_ci    add            src2q, 2*MAX_PB_SIZE          ; src2 += 2*MAX_PB_SIZE
760cabdff1aSopenharmony_ci    dec          heightd                         ; --height (sets ZF for the jnz below)
761cabdff1aSopenharmony_ci    jnz               .loop                      ; height loop
762cabdff1aSopenharmony_ci    RET
763cabdff1aSopenharmony_ci%endmacro
764cabdff1aSopenharmony_ci
765cabdff1aSopenharmony_ci
766cabdff1aSopenharmony_ci; ******************************
767cabdff1aSopenharmony_ci; void put_hevc_epel_hX(int16_t *dst, ptrdiff_t dststride,
768cabdff1aSopenharmony_ci;                       uint8_t *_src, ptrdiff_t _srcstride,
769cabdff1aSopenharmony_ci;                       int height, int mx, int my, int width);
770cabdff1aSopenharmony_ci; ******************************
771cabdff1aSopenharmony_ci
772cabdff1aSopenharmony_ci
; HEVC_PUT_HEVC_EPEL width, bitdepth
;
; Emits six functions for one width/bitdepth combination: the horizontal
; (epel_h) and vertical (epel_v) filters, each in put, uni and bi form.
; Register roles shared by all six:
;   m4, m5 - filter coefficients set up by EPEL_FILTER
;   m6     - multiplier for UNI_COMPUTE / BI_COMPUTE (uni and bi forms only)
;   m0, m1 - filtered row produced by EPEL_COMPUTE
;   m2, m3 - src2 row in the bi forms
; EPEL_FILTER, EPEL_LOAD, EPEL_COMPUTE and LOOP_END are defined outside this
; section; comments about them below are assumptions to confirm.
773cabdff1aSopenharmony_ci%macro HEVC_PUT_HEVC_EPEL 2
; Vector register count declared to cglobal: 11 with avx2, 8 otherwise.
; XMM_REGS is a plain %define, so it stays defined after this macro.
774cabdff1aSopenharmony_ci%if cpuflag(avx2)
775cabdff1aSopenharmony_ci%define XMM_REGS  11
776cabdff1aSopenharmony_ci%else
777cabdff1aSopenharmony_ci%define XMM_REGS  8
778cabdff1aSopenharmony_ci%endif
779cabdff1aSopenharmony_ci
; ---- horizontal, put: intermediate output stored with PEL_10STORE ----
780cabdff1aSopenharmony_cicglobal hevc_put_hevc_epel_h%1_%2, 5, 6, XMM_REGS, dst, src, srcstride, height, mx, rfilter
; stride = bytes per sample: 1 for 8-bit, 2 for 9..16-bit. Loads start at
; src - stride, i.e. one sample to the left of the current position.
781cabdff1aSopenharmony_ci%assign %%stride ((%2 + 7)/8)
782cabdff1aSopenharmony_ci    EPEL_FILTER       %2, mx, m4, m5, rfilter
783cabdff1aSopenharmony_ci.loop:
784cabdff1aSopenharmony_ci    EPEL_LOAD         %2, srcq-%%stride, %%stride, %1
785cabdff1aSopenharmony_ci    EPEL_COMPUTE      %2, %1, m4, m5, 1
786cabdff1aSopenharmony_ci    PEL_10STORE%1      dstq, m0, m1
787cabdff1aSopenharmony_ci    LOOP_END         dst, src, srcstride
788cabdff1aSopenharmony_ci    RET
789cabdff1aSopenharmony_ci
; ---- horizontal, uni: scaled by pw_<bitdepth> and stored as pixels ----
790cabdff1aSopenharmony_cicglobal hevc_put_hevc_uni_epel_h%1_%2, 6, 7, XMM_REGS, dst, dststride, src, srcstride, height, mx, rfilter
791cabdff1aSopenharmony_ci%assign %%stride ((%2 + 7)/8)
792cabdff1aSopenharmony_ci    movdqa            m6, [pw_%2]
793cabdff1aSopenharmony_ci    EPEL_FILTER       %2, mx, m4, m5, rfilter
794cabdff1aSopenharmony_ci.loop:
795cabdff1aSopenharmony_ci    EPEL_LOAD         %2, srcq-%%stride, %%stride, %1
; Unlike the put and bi forms, no trailing argument is passed to EPEL_COMPUTE.
796cabdff1aSopenharmony_ci    EPEL_COMPUTE      %2, %1, m4, m5
797cabdff1aSopenharmony_ci    UNI_COMPUTE       %1, %2, m0, m1, m6
798cabdff1aSopenharmony_ci    PEL_%2STORE%1   dstq, m0, m1
799cabdff1aSopenharmony_ci    add             dstq, dststrideq             ; dst += dststride
800cabdff1aSopenharmony_ci    add             srcq, srcstrideq             ; src += srcstride
801cabdff1aSopenharmony_ci    dec          heightd                         ; --height (sets ZF for the jnz below)
802cabdff1aSopenharmony_ci    jnz               .loop                      ; height loop
803cabdff1aSopenharmony_ci    RET
804cabdff1aSopenharmony_ci
; ---- horizontal, bi: combined with the src2 row, stored as pixels ----
; This function has no %assign of its own for the stride; it reuses the value
; assigned earlier in the same macro expansion.
805cabdff1aSopenharmony_cicglobal hevc_put_hevc_bi_epel_h%1_%2, 7, 8, XMM_REGS, dst, dststride, src, srcstride, src2, height, mx, rfilter
806cabdff1aSopenharmony_ci    movdqa            m6, [pw_bi_%2]
807cabdff1aSopenharmony_ci    EPEL_FILTER       %2, mx, m4, m5, rfilter
808cabdff1aSopenharmony_ci.loop:
809cabdff1aSopenharmony_ci    EPEL_LOAD         %2, srcq-%%stride, %%stride, %1
810cabdff1aSopenharmony_ci    EPEL_COMPUTE      %2, %1, m4, m5, 1
811cabdff1aSopenharmony_ci    SIMPLE_BILOAD     %1, src2q, m2, m3
812cabdff1aSopenharmony_ci    BI_COMPUTE        %1, %2, m0, m1, m2, m3, m6, 1
813cabdff1aSopenharmony_ci    PEL_%2STORE%1   dstq, m0, m1
814cabdff1aSopenharmony_ci    add             dstq, dststrideq             ; dst += dststride
815cabdff1aSopenharmony_ci    add             srcq, srcstrideq             ; src += srcstride
816cabdff1aSopenharmony_ci    add            src2q, 2*MAX_PB_SIZE          ; src2 += 2*MAX_PB_SIZE
817cabdff1aSopenharmony_ci    dec          heightd                         ; --height (sets ZF for the jnz below)
818cabdff1aSopenharmony_ci    jnz               .loop                      ; height loop
819cabdff1aSopenharmony_ci    RET
820cabdff1aSopenharmony_ci
821cabdff1aSopenharmony_ci; ******************************
822cabdff1aSopenharmony_ci; void put_hevc_epel_v(int16_t *dst, ptrdiff_t dststride,
823cabdff1aSopenharmony_ci;                      uint8_t *_src, ptrdiff_t _srcstride,
824cabdff1aSopenharmony_ci;                      int height, int mx, int my, int width)
825cabdff1aSopenharmony_ci; ******************************
826cabdff1aSopenharmony_ci
; ---- vertical, put ----
; Only the first four arguments are auto-loaded; my is fetched explicitly
; with movifnidn. src is moved up one row before the loop, and EPEL_LOAD is
; given the row stride as the distance between taps.
827cabdff1aSopenharmony_cicglobal hevc_put_hevc_epel_v%1_%2, 4, 6, XMM_REGS, dst, src, srcstride, height, r3src, my
828cabdff1aSopenharmony_ci    movifnidn        myd, mym
829cabdff1aSopenharmony_ci    sub             srcq, srcstrideq
; r3src is handed to EPEL_FILTER first (presumably as scratch - confirm) and
; only afterwards set to 3*srcstride.
830cabdff1aSopenharmony_ci    EPEL_FILTER       %2, my, m4, m5, r3src
831cabdff1aSopenharmony_ci    lea           r3srcq, [srcstrideq*3]
832cabdff1aSopenharmony_ci.loop:
833cabdff1aSopenharmony_ci    EPEL_LOAD         %2, srcq, srcstride, %1
834cabdff1aSopenharmony_ci    EPEL_COMPUTE      %2, %1, m4, m5, 1
835cabdff1aSopenharmony_ci    PEL_10STORE%1     dstq, m0, m1
836cabdff1aSopenharmony_ci    LOOP_END          dst, src, srcstride
837cabdff1aSopenharmony_ci    RET
838cabdff1aSopenharmony_ci
; ---- vertical, uni ----
839cabdff1aSopenharmony_cicglobal hevc_put_hevc_uni_epel_v%1_%2, 5, 7, XMM_REGS, dst, dststride, src, srcstride, height, r3src, my
840cabdff1aSopenharmony_ci    movifnidn        myd, mym
841cabdff1aSopenharmony_ci    movdqa            m6, [pw_%2]
842cabdff1aSopenharmony_ci    sub             srcq, srcstrideq
843cabdff1aSopenharmony_ci    EPEL_FILTER       %2, my, m4, m5, r3src
844cabdff1aSopenharmony_ci    lea           r3srcq, [srcstrideq*3]
845cabdff1aSopenharmony_ci.loop:
846cabdff1aSopenharmony_ci    EPEL_LOAD         %2, srcq, srcstride, %1
847cabdff1aSopenharmony_ci    EPEL_COMPUTE      %2, %1, m4, m5
848cabdff1aSopenharmony_ci    UNI_COMPUTE       %1, %2, m0, m1, m6
849cabdff1aSopenharmony_ci    PEL_%2STORE%1   dstq, m0, m1
850cabdff1aSopenharmony_ci    add             dstq, dststrideq             ; dst += dststride
851cabdff1aSopenharmony_ci    add             srcq, srcstrideq             ; src += srcstride
852cabdff1aSopenharmony_ci    dec          heightd                         ; --height (sets ZF for the jnz below)
853cabdff1aSopenharmony_ci    jnz               .loop                      ; height loop
854cabdff1aSopenharmony_ci    RET
855cabdff1aSopenharmony_ci
856cabdff1aSopenharmony_ci
; ---- vertical, bi ----
857cabdff1aSopenharmony_cicglobal hevc_put_hevc_bi_epel_v%1_%2, 6, 8, XMM_REGS, dst, dststride, src, srcstride, src2, height, r3src, my
858cabdff1aSopenharmony_ci    movifnidn        myd, mym
859cabdff1aSopenharmony_ci    movdqa            m6, [pw_bi_%2]
860cabdff1aSopenharmony_ci    sub             srcq, srcstrideq
861cabdff1aSopenharmony_ci    EPEL_FILTER       %2, my, m4, m5, r3src
862cabdff1aSopenharmony_ci    lea           r3srcq, [srcstrideq*3]
863cabdff1aSopenharmony_ci.loop:
864cabdff1aSopenharmony_ci    EPEL_LOAD         %2, srcq, srcstride, %1
865cabdff1aSopenharmony_ci    EPEL_COMPUTE      %2, %1, m4, m5, 1
866cabdff1aSopenharmony_ci    SIMPLE_BILOAD     %1, src2q, m2, m3
867cabdff1aSopenharmony_ci    BI_COMPUTE        %1, %2, m0, m1, m2, m3, m6, 1
868cabdff1aSopenharmony_ci    PEL_%2STORE%1   dstq, m0, m1
869cabdff1aSopenharmony_ci    add             dstq, dststrideq             ; dst += dststride
870cabdff1aSopenharmony_ci    add             srcq, srcstrideq             ; src += srcstride
871cabdff1aSopenharmony_ci    add            src2q, 2*MAX_PB_SIZE          ; src2 += 2*MAX_PB_SIZE
872cabdff1aSopenharmony_ci    dec          heightd                         ; --height (sets ZF for the jnz below)
873cabdff1aSopenharmony_ci    jnz               .loop                      ; height loop
874cabdff1aSopenharmony_ci    RET
875cabdff1aSopenharmony_ci%endmacro
876cabdff1aSopenharmony_ci
877cabdff1aSopenharmony_ci
878cabdff1aSopenharmony_ci; ******************************
879cabdff1aSopenharmony_ci; void put_hevc_epel_hv(int16_t *dst, ptrdiff_t dststride,
880cabdff1aSopenharmony_ci;                       uint8_t *_src, ptrdiff_t _srcstride,
881cabdff1aSopenharmony_ci;                       int height, int mx, int my, int width)
882cabdff1aSopenharmony_ci; ******************************
883cabdff1aSopenharmony_ci
; HEVC_PUT_HEVC_EPEL_HV width, bitdepth
;
; Emits the put, uni and bi forms of the combined horizontal+vertical filter.
; All three share one structure:
;   - Prologue: three source rows are filtered horizontally (coefficients in
;     m14/m15) and kept in m4, m5, m6. For width > 8 at 8-bit, the second half
;     of each row is kept in m8, m9, m10.
;   - Loop: a fourth row is filtered into m7 (second half m11). The four rows
;     are interleaved pairwise with punpck{l,h}wd and filtered again with
;     m12/m13, using 14 as the bit-depth argument of EPEL_COMPUTE. The row
;     window then slides down by one (m4<-m5<-m6<-m7, m8<-m9<-m10<-m11).
; EPEL_HV_FILTER, EPEL_LOAD and EPEL_COMPUTE are defined outside this section;
; presumably EPEL_HV_FILTER loads m12..m15 from mx/my - confirm.
; SWAP is the x86inc register-renaming macro.
884cabdff1aSopenharmony_ci%macro HEVC_PUT_HEVC_EPEL_HV 2
; ---- put: intermediate output stored with PEL_10STORE ----
885cabdff1aSopenharmony_cicglobal hevc_put_hevc_epel_hv%1_%2, 6, 7, 16 , dst, src, srcstride, height, mx, my, r3src
; stride = bytes per sample (1 for 8-bit, 2 for 9..16-bit).
886cabdff1aSopenharmony_ci%assign %%stride ((%2 + 7)/8)
; Start one row above and (via srcq-stride) one sample left of the block.
887cabdff1aSopenharmony_ci    sub             srcq, srcstrideq
888cabdff1aSopenharmony_ci    EPEL_HV_FILTER    %2
; Row 0 -> m4 (m8)
889cabdff1aSopenharmony_ci    EPEL_LOAD         %2, srcq-%%stride, %%stride, %1
890cabdff1aSopenharmony_ci    EPEL_COMPUTE      %2, %1, m14, m15
891cabdff1aSopenharmony_ci%if (%1 > 8 && (%2 == 8))
892cabdff1aSopenharmony_ci    SWAP              m8, m1
893cabdff1aSopenharmony_ci%endif
894cabdff1aSopenharmony_ci    SWAP              m4, m0
895cabdff1aSopenharmony_ci    add             srcq, srcstrideq
; Row 1 -> m5 (m9)
896cabdff1aSopenharmony_ci    EPEL_LOAD         %2, srcq-%%stride, %%stride, %1
897cabdff1aSopenharmony_ci    EPEL_COMPUTE      %2, %1, m14, m15
898cabdff1aSopenharmony_ci%if (%1 > 8 && (%2 == 8))
899cabdff1aSopenharmony_ci    SWAP              m9, m1
900cabdff1aSopenharmony_ci%endif
901cabdff1aSopenharmony_ci    SWAP              m5, m0
902cabdff1aSopenharmony_ci    add             srcq, srcstrideq
; Row 2 -> m6 (m10)
903cabdff1aSopenharmony_ci    EPEL_LOAD         %2, srcq-%%stride, %%stride, %1
904cabdff1aSopenharmony_ci    EPEL_COMPUTE      %2, %1, m14, m15
905cabdff1aSopenharmony_ci%if (%1 > 8 && (%2 == 8))
906cabdff1aSopenharmony_ci    SWAP             m10, m1
907cabdff1aSopenharmony_ci%endif
908cabdff1aSopenharmony_ci    SWAP              m6, m0
909cabdff1aSopenharmony_ci    add             srcq, srcstrideq
910cabdff1aSopenharmony_ci.loop:
; Newest row -> m7 (m11)
911cabdff1aSopenharmony_ci    EPEL_LOAD         %2, srcq-%%stride, %%stride, %1
912cabdff1aSopenharmony_ci    EPEL_COMPUTE      %2, %1, m14, m15
913cabdff1aSopenharmony_ci%if (%1 > 8 && (%2 == 8))
914cabdff1aSopenharmony_ci    SWAP             m11, m1
915cabdff1aSopenharmony_ci%endif
916cabdff1aSopenharmony_ci    SWAP              m7, m0
; Interleave rows (0,1) and (2,3) so the second pass sees vertical neighbours
; side by side.
917cabdff1aSopenharmony_ci    punpcklwd         m0, m4, m5
918cabdff1aSopenharmony_ci    punpcklwd         m2, m6, m7
919cabdff1aSopenharmony_ci%if %1 > 4
920cabdff1aSopenharmony_ci    punpckhwd         m1, m4, m5
921cabdff1aSopenharmony_ci    punpckhwd         m3, m6, m7
922cabdff1aSopenharmony_ci%endif
923cabdff1aSopenharmony_ci    EPEL_COMPUTE      14, %1, m12, m13
924cabdff1aSopenharmony_ci%if (%1 > 8 && (%2 == 8))
; Second half of the row: same interleave, with explicit registers passed to
; EPEL_COMPUTE. m4 and m8 are overwritten here and reloaded by the window
; shift below.
925cabdff1aSopenharmony_ci    punpcklwd         m4, m8, m9
926cabdff1aSopenharmony_ci    punpcklwd         m2, m10, m11
927cabdff1aSopenharmony_ci    punpckhwd         m8, m8, m9
928cabdff1aSopenharmony_ci    punpckhwd         m3, m10, m11
929cabdff1aSopenharmony_ci    EPEL_COMPUTE      14, %1, m12, m13, m4, m2, m8, m3
930cabdff1aSopenharmony_ci%if cpuflag(avx2)
; Regroup the 128-bit lanes of m0/m4 into m2/m3 before storing.
931cabdff1aSopenharmony_ci    vinserti128       m2, m0, xm4, 1
932cabdff1aSopenharmony_ci    vperm2i128        m3, m0, m4, q0301
933cabdff1aSopenharmony_ci    PEL_10STORE%1     dstq, m2, m3
934cabdff1aSopenharmony_ci%else
935cabdff1aSopenharmony_ci    PEL_10STORE%1     dstq, m0, m4
936cabdff1aSopenharmony_ci%endif
937cabdff1aSopenharmony_ci%else
938cabdff1aSopenharmony_ci    PEL_10STORE%1     dstq, m0, m1
939cabdff1aSopenharmony_ci%endif
; Slide the row window down by one.
940cabdff1aSopenharmony_ci    movdqa            m4, m5
941cabdff1aSopenharmony_ci    movdqa            m5, m6
942cabdff1aSopenharmony_ci    movdqa            m6, m7
943cabdff1aSopenharmony_ci%if (%1 > 8 && (%2 == 8))
944cabdff1aSopenharmony_ci    mova              m8, m9
945cabdff1aSopenharmony_ci    mova              m9, m10
946cabdff1aSopenharmony_ci    mova             m10, m11
947cabdff1aSopenharmony_ci%endif
948cabdff1aSopenharmony_ci    LOOP_END         dst, src, srcstride
949cabdff1aSopenharmony_ci    RET
950cabdff1aSopenharmony_ci
; ---- uni: result scaled by pw_<bitdepth> and stored as pixels ----
951cabdff1aSopenharmony_cicglobal hevc_put_hevc_uni_epel_hv%1_%2, 7, 8, 16 , dst, dststride, src, srcstride, height, mx, my, r3src
952cabdff1aSopenharmony_ci%assign %%stride ((%2 + 7)/8)
953cabdff1aSopenharmony_ci    sub             srcq, srcstrideq
954cabdff1aSopenharmony_ci    EPEL_HV_FILTER    %2
; Row 0 -> m4 (m8)
955cabdff1aSopenharmony_ci    EPEL_LOAD         %2, srcq-%%stride, %%stride, %1
956cabdff1aSopenharmony_ci    EPEL_COMPUTE      %2, %1, m14, m15
957cabdff1aSopenharmony_ci%if (%1 > 8 && (%2 == 8))
958cabdff1aSopenharmony_ci    SWAP              m8, m1
959cabdff1aSopenharmony_ci%endif
960cabdff1aSopenharmony_ci    SWAP              m4, m0
961cabdff1aSopenharmony_ci    add             srcq, srcstrideq
; Row 1 -> m5 (m9)
962cabdff1aSopenharmony_ci    EPEL_LOAD         %2, srcq-%%stride, %%stride, %1
963cabdff1aSopenharmony_ci    EPEL_COMPUTE      %2, %1, m14, m15
964cabdff1aSopenharmony_ci%if (%1 > 8 && (%2 == 8))
965cabdff1aSopenharmony_ci    SWAP              m9, m1
966cabdff1aSopenharmony_ci%endif
967cabdff1aSopenharmony_ci    SWAP              m5, m0
968cabdff1aSopenharmony_ci    add             srcq, srcstrideq
; Row 2 -> m6 (m10)
969cabdff1aSopenharmony_ci    EPEL_LOAD         %2, srcq-%%stride, %%stride, %1
970cabdff1aSopenharmony_ci    EPEL_COMPUTE      %2, %1, m14, m15
971cabdff1aSopenharmony_ci%if (%1 > 8 && (%2 == 8))
972cabdff1aSopenharmony_ci    SWAP             m10, m1
973cabdff1aSopenharmony_ci%endif
974cabdff1aSopenharmony_ci    SWAP              m6, m0
975cabdff1aSopenharmony_ci    add             srcq, srcstrideq
976cabdff1aSopenharmony_ci.loop:
; Newest row -> m7 (m11). This variant copies m0 with mova where the put and
; bi variants use SWAP.
977cabdff1aSopenharmony_ci    EPEL_LOAD         %2, srcq-%%stride, %%stride, %1
978cabdff1aSopenharmony_ci    EPEL_COMPUTE      %2, %1, m14, m15
979cabdff1aSopenharmony_ci%if (%1 > 8 && (%2 == 8))
980cabdff1aSopenharmony_ci    SWAP             m11, m1
981cabdff1aSopenharmony_ci%endif
982cabdff1aSopenharmony_ci    mova              m7, m0
983cabdff1aSopenharmony_ci    punpcklwd         m0, m4, m5
984cabdff1aSopenharmony_ci    punpcklwd         m2, m6, m7
985cabdff1aSopenharmony_ci%if %1 > 4
986cabdff1aSopenharmony_ci    punpckhwd         m1, m4, m5
987cabdff1aSopenharmony_ci    punpckhwd         m3, m6, m7
988cabdff1aSopenharmony_ci%endif
989cabdff1aSopenharmony_ci    EPEL_COMPUTE      14, %1, m12, m13
990cabdff1aSopenharmony_ci%if (%1 > 8 && (%2 == 8))
991cabdff1aSopenharmony_ci    punpcklwd         m4, m8, m9
992cabdff1aSopenharmony_ci    punpcklwd         m2, m10, m11
993cabdff1aSopenharmony_ci    punpckhwd         m8, m8, m9
994cabdff1aSopenharmony_ci    punpckhwd         m3, m10, m11
995cabdff1aSopenharmony_ci    EPEL_COMPUTE      14, %1, m12, m13, m4, m2, m8, m3
; Both halves (m0 low, m4 high) go through UNI_COMPUTE; on this 8-bit path it
; packs them into m0.
996cabdff1aSopenharmony_ci    UNI_COMPUTE       %1, %2, m0, m4, [pw_%2]
997cabdff1aSopenharmony_ci%else
998cabdff1aSopenharmony_ci    UNI_COMPUTE       %1, %2, m0, m1, [pw_%2]
999cabdff1aSopenharmony_ci%endif
1000cabdff1aSopenharmony_ci    PEL_%2STORE%1   dstq, m0, m1
; Slide the row window down by one.
1001cabdff1aSopenharmony_ci    mova              m4, m5
1002cabdff1aSopenharmony_ci    mova              m5, m6
1003cabdff1aSopenharmony_ci    mova              m6, m7
1004cabdff1aSopenharmony_ci%if (%1 > 8 && (%2 == 8))
1005cabdff1aSopenharmony_ci    mova              m8, m9
1006cabdff1aSopenharmony_ci    mova              m9, m10
1007cabdff1aSopenharmony_ci    mova             m10, m11
1008cabdff1aSopenharmony_ci%endif
1009cabdff1aSopenharmony_ci    add             dstq, dststrideq             ; dst += dststride
1010cabdff1aSopenharmony_ci    add             srcq, srcstrideq             ; src += srcstride
1011cabdff1aSopenharmony_ci    dec          heightd                         ; --height (sets ZF for the jnz below)
1012cabdff1aSopenharmony_ci    jnz               .loop                      ; height loop
1013cabdff1aSopenharmony_ci    RET
1014cabdff1aSopenharmony_ci
; ---- bi: result combined with the src2 row and stored as pixels ----
1015cabdff1aSopenharmony_cicglobal hevc_put_hevc_bi_epel_hv%1_%2, 8, 9, 16, dst, dststride, src, srcstride, src2, height, mx, my, r3src
1016cabdff1aSopenharmony_ci%assign %%stride ((%2 + 7)/8)
1017cabdff1aSopenharmony_ci    sub             srcq, srcstrideq
1018cabdff1aSopenharmony_ci    EPEL_HV_FILTER    %2
; Row 0 -> m4 (m8)
1019cabdff1aSopenharmony_ci    EPEL_LOAD         %2, srcq-%%stride, %%stride, %1
1020cabdff1aSopenharmony_ci    EPEL_COMPUTE      %2, %1, m14, m15
1021cabdff1aSopenharmony_ci%if (%1 > 8 && (%2 == 8))
1022cabdff1aSopenharmony_ci    SWAP              m8, m1
1023cabdff1aSopenharmony_ci%endif
1024cabdff1aSopenharmony_ci    SWAP              m4, m0
1025cabdff1aSopenharmony_ci    add             srcq, srcstrideq
; Row 1 -> m5 (m9)
1026cabdff1aSopenharmony_ci    EPEL_LOAD         %2, srcq-%%stride, %%stride, %1
1027cabdff1aSopenharmony_ci    EPEL_COMPUTE      %2, %1, m14, m15
1028cabdff1aSopenharmony_ci%if (%1 > 8 && (%2 == 8))
1029cabdff1aSopenharmony_ci    SWAP              m9, m1
1030cabdff1aSopenharmony_ci%endif
1031cabdff1aSopenharmony_ci    SWAP              m5, m0
1032cabdff1aSopenharmony_ci    add             srcq, srcstrideq
; Row 2 -> m6 (m10)
1033cabdff1aSopenharmony_ci    EPEL_LOAD         %2, srcq-%%stride, %%stride, %1
1034cabdff1aSopenharmony_ci    EPEL_COMPUTE      %2, %1, m14, m15
1035cabdff1aSopenharmony_ci%if (%1 > 8 && (%2 == 8))
1036cabdff1aSopenharmony_ci    SWAP             m10, m1
1037cabdff1aSopenharmony_ci%endif
1038cabdff1aSopenharmony_ci    SWAP              m6, m0
1039cabdff1aSopenharmony_ci    add             srcq, srcstrideq
1040cabdff1aSopenharmony_ci.loop:
; Newest row -> m7 (m11)
1041cabdff1aSopenharmony_ci    EPEL_LOAD         %2, srcq-%%stride, %%stride, %1
1042cabdff1aSopenharmony_ci    EPEL_COMPUTE      %2, %1, m14, m15
1043cabdff1aSopenharmony_ci%if (%1 > 8 && (%2 == 8))
1044cabdff1aSopenharmony_ci    SWAP             m11, m1
1045cabdff1aSopenharmony_ci%endif
1046cabdff1aSopenharmony_ci    SWAP              m7, m0
1047cabdff1aSopenharmony_ci    punpcklwd         m0, m4, m5
1048cabdff1aSopenharmony_ci    punpcklwd         m2, m6, m7
1049cabdff1aSopenharmony_ci%if %1 > 4
1050cabdff1aSopenharmony_ci    punpckhwd         m1, m4, m5
1051cabdff1aSopenharmony_ci    punpckhwd         m3, m6, m7
1052cabdff1aSopenharmony_ci%endif
1053cabdff1aSopenharmony_ci    EPEL_COMPUTE      14, %1, m12, m13
1054cabdff1aSopenharmony_ci%if (%1 > 8 && (%2 == 8))
1055cabdff1aSopenharmony_ci    punpcklwd         m4, m8, m9
1056cabdff1aSopenharmony_ci    punpcklwd         m2, m10, m11
1057cabdff1aSopenharmony_ci    punpckhwd         m8, m8, m9
1058cabdff1aSopenharmony_ci    punpckhwd         m3, m10, m11
1059cabdff1aSopenharmony_ci    EPEL_COMPUTE      14, %1, m12, m13, m4, m2, m8, m3
; src2 row into m8/m3; both are free at this point and m8 is reloaded from m9
; by the window shift below.
1060cabdff1aSopenharmony_ci    SIMPLE_BILOAD     %1, src2q, m8, m3
1061cabdff1aSopenharmony_ci%if cpuflag(avx2)
; Regroup the 128-bit lanes of the src2 row into m1/m2 before combining it
; with m0/m4.
1062cabdff1aSopenharmony_ci    vinserti128       m1, m8, xm3, 1
1063cabdff1aSopenharmony_ci    vperm2i128        m2, m8, m3, q0301
1064cabdff1aSopenharmony_ci    BI_COMPUTE        %1, %2, m0, m4, m1, m2, [pw_bi_%2]
1065cabdff1aSopenharmony_ci%else
1066cabdff1aSopenharmony_ci    BI_COMPUTE        %1, %2, m0, m4, m8, m3, [pw_bi_%2]
1067cabdff1aSopenharmony_ci%endif
1068cabdff1aSopenharmony_ci%else
; Narrow / high-bit-depth case: m8 and m9 carry no row history here, so they
; serve as the src2 destination.
1069cabdff1aSopenharmony_ci    SIMPLE_BILOAD     %1, src2q, m8, m9
1070cabdff1aSopenharmony_ci    BI_COMPUTE        %1, %2, m0, m1, m8, m9, [pw_bi_%2]
1071cabdff1aSopenharmony_ci%endif
; NOTE(review): the store passes m4 as its second register in both branches,
; although the %else branch above leaves its second result in m1 - confirm
; the store macros never read the second register in those configurations.
1072cabdff1aSopenharmony_ci    PEL_%2STORE%1   dstq, m0, m4
; Slide the row window down by one.
1073cabdff1aSopenharmony_ci    mova              m4, m5
1074cabdff1aSopenharmony_ci    mova              m5, m6
1075cabdff1aSopenharmony_ci    mova              m6, m7
1076cabdff1aSopenharmony_ci%if (%1 > 8 && (%2 == 8))
1077cabdff1aSopenharmony_ci    mova              m8, m9
1078cabdff1aSopenharmony_ci    mova              m9, m10
1079cabdff1aSopenharmony_ci    mova             m10, m11
1080cabdff1aSopenharmony_ci%endif
1081cabdff1aSopenharmony_ci    add             dstq, dststrideq             ; dst += dststride
1082cabdff1aSopenharmony_ci    add             srcq, srcstrideq             ; src += srcstride
1083cabdff1aSopenharmony_ci    add            src2q, 2*MAX_PB_SIZE          ; src2 += 2*MAX_PB_SIZE
1084cabdff1aSopenharmony_ci    dec          heightd                         ; --height (sets ZF for the jnz below)
1085cabdff1aSopenharmony_ci    jnz               .loop                      ; height loop
1086cabdff1aSopenharmony_ci    RET
1087cabdff1aSopenharmony_ci%endmacro
1088cabdff1aSopenharmony_ci
1089cabdff1aSopenharmony_ci; ******************************
1090cabdff1aSopenharmony_ci; void put_hevc_qpel_hX_X_X(int16_t *dst, ptrdiff_t dststride,
1091cabdff1aSopenharmony_ci;                       uint8_t *_src, ptrdiff_t _srcstride,
1092cabdff1aSopenharmony_ci;                       int height, int mx, int my, int width)
1093cabdff1aSopenharmony_ci; ******************************
1094cabdff1aSopenharmony_ci
1095cabdff1aSopenharmony_ci%macro HEVC_PUT_HEVC_QPEL 2
; Horizontal qpel filter, put form: filters each row with QPEL_COMPUTE and
; stores the intermediate result with PEL_10STORE for every bit depth.
; QPEL_FILTER, QPEL_H_LOAD and LOOP_END are defined outside this section;
; presumably QPEL_FILTER loads the coefficients QPEL_COMPUTE reads from
; m12..m15, and the trailing 10 passed to QPEL_H_LOAD names a scratch vector
; register - confirm.
1096cabdff1aSopenharmony_cicglobal hevc_put_hevc_qpel_h%1_%2, 5, 6, 16, dst, src, srcstride, height, mx, rfilter
1097cabdff1aSopenharmony_ci    QPEL_FILTER       %2, mx
1098cabdff1aSopenharmony_ci.loop:
1099cabdff1aSopenharmony_ci    QPEL_H_LOAD       %2, srcq, %1, 10
; The third argument enables the avx2 lane re-arrangement in QPEL_COMPUTE.
1100cabdff1aSopenharmony_ci    QPEL_COMPUTE      %1, %2, 1
; Above 8-bit QPEL_COMPUTE leaves dword sums; narrow them to signed words.
1101cabdff1aSopenharmony_ci%if %2 > 8
1102cabdff1aSopenharmony_ci    packssdw          m0, m1
1103cabdff1aSopenharmony_ci%endif
1104cabdff1aSopenharmony_ci    PEL_10STORE%1     dstq, m0, m1
1105cabdff1aSopenharmony_ci    LOOP_END          dst, src, srcstride
1106cabdff1aSopenharmony_ci    RET
1107cabdff1aSopenharmony_ci
; Horizontal qpel filter, uni form: filters each row, scales the result by
; pw_<bitdepth> (kept in m9) through UNI_COMPUTE and stores output pixels.
; QPEL_FILTER and QPEL_H_LOAD are defined outside this section.
1108cabdff1aSopenharmony_cicglobal hevc_put_hevc_uni_qpel_h%1_%2, 6, 7, 16 , dst, dststride, src, srcstride, height, mx, rfilter
1109cabdff1aSopenharmony_ci    mova              m9, [pw_%2]
1110cabdff1aSopenharmony_ci    QPEL_FILTER       %2, mx
1111cabdff1aSopenharmony_ci.loop:
1112cabdff1aSopenharmony_ci    QPEL_H_LOAD       %2, srcq, %1, 10
; No third argument here, so QPEL_COMPUTE skips its avx2 lane re-arrangement.
1113cabdff1aSopenharmony_ci    QPEL_COMPUTE      %1, %2
; Above 8-bit QPEL_COMPUTE leaves dword sums; narrow them to signed words.
1114cabdff1aSopenharmony_ci%if %2 > 8
1115cabdff1aSopenharmony_ci    packssdw          m0, m1
1116cabdff1aSopenharmony_ci%endif
1117cabdff1aSopenharmony_ci    UNI_COMPUTE       %1, %2, m0, m1, m9
1118cabdff1aSopenharmony_ci    PEL_%2STORE%1   dstq, m0, m1
1119cabdff1aSopenharmony_ci    add             dstq, dststrideq             ; dst += dststride
1120cabdff1aSopenharmony_ci    add             srcq, srcstrideq             ; src += srcstride
1121cabdff1aSopenharmony_ci    dec          heightd                         ; --height (sets ZF for the jnz below)
1122cabdff1aSopenharmony_ci    jnz               .loop                      ; height loop
1123cabdff1aSopenharmony_ci    RET
1124cabdff1aSopenharmony_ci
; Horizontal qpel filter, bi form: filters each row, combines it with the row
; read from src2 (into m10/m11) using the pw_bi_<bitdepth> multiplier in m9,
; and stores output pixels. src2 advances by 2*MAX_PB_SIZE bytes per row.
; QPEL_FILTER and QPEL_H_LOAD are defined outside this section.
1125cabdff1aSopenharmony_cicglobal hevc_put_hevc_bi_qpel_h%1_%2, 7, 8, 16 , dst, dststride, src, srcstride, src2, height, mx, rfilter
1126cabdff1aSopenharmony_ci    movdqa            m9, [pw_bi_%2]
1127cabdff1aSopenharmony_ci    QPEL_FILTER       %2, mx
1128cabdff1aSopenharmony_ci.loop:
1129cabdff1aSopenharmony_ci    QPEL_H_LOAD       %2, srcq, %1, 10
1130cabdff1aSopenharmony_ci    QPEL_COMPUTE      %1, %2, 1
; Above 8-bit QPEL_COMPUTE leaves dword sums; narrow them to signed words.
1131cabdff1aSopenharmony_ci%if %2 > 8
1132cabdff1aSopenharmony_ci    packssdw          m0, m1
1133cabdff1aSopenharmony_ci%endif
1134cabdff1aSopenharmony_ci    SIMPLE_BILOAD     %1, src2q, m10, m11
1135cabdff1aSopenharmony_ci    BI_COMPUTE        %1, %2, m0, m1, m10, m11, m9, 1
1136cabdff1aSopenharmony_ci    PEL_%2STORE%1   dstq, m0, m1
1137cabdff1aSopenharmony_ci    add             dstq, dststrideq             ; dst += dststride
1138cabdff1aSopenharmony_ci    add             srcq, srcstrideq             ; src += srcstride
1139cabdff1aSopenharmony_ci    add            src2q, 2*MAX_PB_SIZE          ; src2 += 2*MAX_PB_SIZE
1140cabdff1aSopenharmony_ci    dec          heightd                         ; --height (sets ZF for the jnz below)
1141cabdff1aSopenharmony_ci    jnz               .loop                      ; height loop
1142cabdff1aSopenharmony_ci    RET
1143cabdff1aSopenharmony_ci
1144cabdff1aSopenharmony_ci
1145cabdff1aSopenharmony_ci; ******************************
1146cabdff1aSopenharmony_ci; void put_hevc_qpel_vX_X_X(int16_t *dst, ptrdiff_t dststride,
1147cabdff1aSopenharmony_ci;                       uint8_t *_src, ptrdiff_t _srcstride,
1148cabdff1aSopenharmony_ci;                       int height, int mx, int my, int width)
1149cabdff1aSopenharmony_ci; ******************************
1150cabdff1aSopenharmony_ci
; Vertical qpel kernel (plain variant), generated per width (%1) and bit depth (%2).
; cglobal operands (x86inc): 4 arguments loaded into registers, 8 GPRs and
; 16 vector registers requested; register names in order are
; dst, src, srcstride, height, r3src, my, rfilter.
; Note that this variant names no dststride register, unlike the uni/bi ones.
; QPEL_FILTER, QPEL_V_LOAD, QPEL_COMPUTE, PEL_10STORE* and LOOP_END are macros
; defined outside this section.
1151cabdff1aSopenharmony_cicglobal hevc_put_hevc_qpel_v%1_%2, 4, 8, 16, dst, src, srcstride, height, r3src, my, rfilter
1152cabdff1aSopenharmony_ci    movifnidn        myd, mym                    ; fetch my only if it is not already in its register (just 4 args are auto-loaded)
1153cabdff1aSopenharmony_ci    lea           r3srcq, [srcstrideq*3]         ; r3src = 3 * srcstride
1154cabdff1aSopenharmony_ci    QPEL_FILTER       %2, my                     ; filter setup driven by my
1155cabdff1aSopenharmony_ci.loop:
1156cabdff1aSopenharmony_ci    QPEL_V_LOAD       %2, srcq, srcstride, %1, r7 ; r7 is the one requested GPR without a name; presumably scratch for the macro
1157cabdff1aSopenharmony_ci    QPEL_COMPUTE      %1, %2, 1
1158cabdff1aSopenharmony_ci%if %2 > 8
1159cabdff1aSopenharmony_ci    packssdw          m0, m1                     ; high bit depth only: pack dwords of m0/m1 to signed-saturated words in m0
1160cabdff1aSopenharmony_ci%endif
1161cabdff1aSopenharmony_ci    PEL_10STORE%1     dstq, m0, m1               ; always the "10" store macro, independent of %2
1162cabdff1aSopenharmony_ci    LOOP_END         dst, src, srcstride         ; presumably advances dst/src and loops on height - TODO confirm against macro
1163cabdff1aSopenharmony_ci    RET
1164cabdff1aSopenharmony_ci
; Uni-prediction vertical qpel kernel, generated per width (%1) and bit depth (%2).
; cglobal operands (x86inc): 5 arguments loaded into registers, 9 GPRs and
; 16 vector registers requested; register names in order are
; dst, dststride, src, srcstride, height, r3src, my, rfilter.
; One output row is produced per loop iteration. QPEL_FILTER, QPEL_V_LOAD,
; QPEL_COMPUTE, UNI_COMPUTE and the PEL_*STORE* macros are defined outside
; this section.
1165cabdff1aSopenharmony_cicglobal hevc_put_hevc_uni_qpel_v%1_%2, 5, 9, 16, dst, dststride, src, srcstride, height, r3src, my, rfilter
1166cabdff1aSopenharmony_ci    movifnidn        myd, mym                    ; fetch my only if it is not already in its register
1167cabdff1aSopenharmony_ci    movdqa            m9, [pw_%2]                ; m9 = pw_<bitdepth> constant, handed to UNI_COMPUTE below
1168cabdff1aSopenharmony_ci    lea           r3srcq, [srcstrideq*3]         ; r3src = 3 * srcstride
1169cabdff1aSopenharmony_ci    QPEL_FILTER       %2, my                     ; filter setup driven by my
1170cabdff1aSopenharmony_ci.loop:
1171cabdff1aSopenharmony_ci    QPEL_V_LOAD       %2, srcq, srcstride, %1, r8 ; r8 is the one requested GPR without a name; presumably scratch for the macro
1172cabdff1aSopenharmony_ci    QPEL_COMPUTE      %1, %2                     ; note: no third operand here, unlike the plain and bi variants
1173cabdff1aSopenharmony_ci%if %2 > 8
1174cabdff1aSopenharmony_ci    packssdw          m0, m1                     ; high bit depth only: pack dwords of m0/m1 to signed-saturated words in m0
1175cabdff1aSopenharmony_ci%endif
1176cabdff1aSopenharmony_ci    UNI_COMPUTE       %1, %2, m0, m1, m9
1177cabdff1aSopenharmony_ci    PEL_%2STORE%1   dstq, m0, m1
1178cabdff1aSopenharmony_ci    add             dstq, dststrideq             ; dst += dststride
1179cabdff1aSopenharmony_ci    add             srcq, srcstrideq             ; src += srcstride
1180cabdff1aSopenharmony_ci    dec          heightd                         ; --height (sets ZF for the jnz below)
1181cabdff1aSopenharmony_ci    jnz               .loop                      ; height loop
1182cabdff1aSopenharmony_ci    RET
1183cabdff1aSopenharmony_ci
; Bi-prediction vertical qpel kernel, generated per width (%1) and bit depth (%2).
; cglobal operands (x86inc): 6 arguments loaded into registers, 10 GPRs and
; 16 vector registers requested; register names in order are
; dst, dststride, src, srcstride, src2, height, r3src, my, rfilter.
; One output row is produced per loop iteration. QPEL_FILTER, QPEL_V_LOAD,
; QPEL_COMPUTE, SIMPLE_BILOAD, BI_COMPUTE and the PEL_*STORE* macros are
; defined outside this section.
1184cabdff1aSopenharmony_cicglobal hevc_put_hevc_bi_qpel_v%1_%2, 6, 10, 16, dst, dststride, src, srcstride, src2, height, r3src, my, rfilter
1185cabdff1aSopenharmony_ci    movifnidn        myd, mym                    ; fetch my only if it is not already in its register
1186cabdff1aSopenharmony_ci    movdqa            m9, [pw_bi_%2]             ; m9 = pw_bi_<bitdepth> constant, handed to BI_COMPUTE below
1187cabdff1aSopenharmony_ci    lea           r3srcq, [srcstrideq*3]         ; r3src = 3 * srcstride
1188cabdff1aSopenharmony_ci    QPEL_FILTER       %2, my                     ; filter setup driven by my
1189cabdff1aSopenharmony_ci.loop:
1190cabdff1aSopenharmony_ci    QPEL_V_LOAD       %2, srcq, srcstride, %1, r9 ; r9 is the one requested GPR without a name; presumably scratch for the macro
1191cabdff1aSopenharmony_ci    QPEL_COMPUTE      %1, %2, 1
1192cabdff1aSopenharmony_ci%if %2 > 8
1193cabdff1aSopenharmony_ci    packssdw          m0, m1                     ; high bit depth only: pack dwords of m0/m1 to signed-saturated words in m0
1194cabdff1aSopenharmony_ci%endif
1195cabdff1aSopenharmony_ci    SIMPLE_BILOAD     %1, src2q, m10, m11        ; second prediction row from src2 into m10/m11
1196cabdff1aSopenharmony_ci    BI_COMPUTE        %1, %2, m0, m1, m10, m11, m9, 1
1197cabdff1aSopenharmony_ci    PEL_%2STORE%1   dstq, m0, m1
1198cabdff1aSopenharmony_ci    add             dstq, dststrideq             ; dst += dststride
1199cabdff1aSopenharmony_ci    add             srcq, srcstrideq             ; src += srcstride
1200cabdff1aSopenharmony_ci    add            src2q, 2*MAX_PB_SIZE          ; src2 += 2*MAX_PB_SIZE (constant row pitch, not srcstride)
1201cabdff1aSopenharmony_ci    dec          heightd                         ; --height (sets ZF for the jnz below)
1202cabdff1aSopenharmony_ci    jnz               .loop                      ; height loop
1203cabdff1aSopenharmony_ci    RET
1204cabdff1aSopenharmony_ci%endmacro
1205cabdff1aSopenharmony_ci
1206cabdff1aSopenharmony_ci
1207cabdff1aSopenharmony_ci; ******************************
1208cabdff1aSopenharmony_ci; void put_hevc_qpel_hvX_X(int16_t *dst, ptrdiff_t dststride,
1209cabdff1aSopenharmony_ci;                       uint8_t *_src, ptrdiff_t _srcstride,
1210cabdff1aSopenharmony_ci;                       int height, int mx, int my)
1211cabdff1aSopenharmony_ci; ******************************
1212cabdff1aSopenharmony_ci%macro HEVC_PUT_HEVC_QPEL_HV 2
; Two-pass (mx filter, then my filter) qpel kernel, plain variant, generated
; per width (%1) and bit depth (%2).
; cglobal operands (x86inc): 6 arguments loaded into registers, 8 GPRs and
; 16 vector registers requested; register names in order are
; dst, src, srcstride, height, mx, my, r3src, rfilter.
; Structure visible in this block:
;   - src is moved up by 3 rows, then seven rows are run through the mx pass
;     and parked in m8..m14;
;   - each loop iteration runs the mx pass on one more row (m15), interleaves
;     the eight rows pairwise, runs the my pass, stores one output row and
;     slides the eight-row window down by one (m8 <- m9 ... m14 <- m15).
; QPEL_H_LOAD, QPEL_HV_COMPUTE, PEL_10STORE* and LOOP_END are macros defined
; outside this section.
1213cabdff1aSopenharmony_cicglobal hevc_put_hevc_qpel_hv%1_%2, 6, 8, 16, dst, src, srcstride, height, mx, my, r3src, rfilter
1214cabdff1aSopenharmony_ci%if cpuflag(avx2)
1215cabdff1aSopenharmony_ci%assign %%shift  4
1216cabdff1aSopenharmony_ci%else
1217cabdff1aSopenharmony_ci%assign %%shift  3
1218cabdff1aSopenharmony_ci%endif
1219cabdff1aSopenharmony_ci    sub              mxq, 1                      ; mx -= 1 (zero-based filter index)
1220cabdff1aSopenharmony_ci    sub              myq, 1                      ; my -= 1
1221cabdff1aSopenharmony_ci    shl              mxq, %%shift                ; mx <<= shift: x16 on AVX2, x8 otherwise
1222cabdff1aSopenharmony_ci    shl              myq, %%shift                ; my <<= shift: x16 on AVX2, x8 otherwise
1223cabdff1aSopenharmony_ci    lea           r3srcq, [srcstrideq*3]         ; r3src = 3 * srcstride
1224cabdff1aSopenharmony_ci    sub             srcq, r3srcq                 ; start 3 rows above the current position
1225cabdff1aSopenharmony_ci    QPEL_H_LOAD       %2, srcq, %1, 15
1226cabdff1aSopenharmony_ci    QPEL_HV_COMPUTE   %1, %2, mx, ackssdw
1227cabdff1aSopenharmony_ci    SWAP              m8, m0                     ; x86inc register renaming: the row just produced as m0 is now m8
1228cabdff1aSopenharmony_ci    add             srcq, srcstrideq
1229cabdff1aSopenharmony_ci    QPEL_H_LOAD       %2, srcq, %1, 15
1230cabdff1aSopenharmony_ci    QPEL_HV_COMPUTE   %1, %2, mx, ackssdw
1231cabdff1aSopenharmony_ci    SWAP              m9, m0
1232cabdff1aSopenharmony_ci    add             srcq, srcstrideq
1233cabdff1aSopenharmony_ci    QPEL_H_LOAD       %2, srcq, %1, 15
1234cabdff1aSopenharmony_ci    QPEL_HV_COMPUTE   %1, %2, mx, ackssdw
1235cabdff1aSopenharmony_ci    SWAP             m10, m0
1236cabdff1aSopenharmony_ci    add             srcq, srcstrideq
1237cabdff1aSopenharmony_ci    QPEL_H_LOAD       %2, srcq, %1, 15
1238cabdff1aSopenharmony_ci    QPEL_HV_COMPUTE   %1, %2, mx, ackssdw
1239cabdff1aSopenharmony_ci    SWAP             m11, m0
1240cabdff1aSopenharmony_ci    add             srcq, srcstrideq
1241cabdff1aSopenharmony_ci    QPEL_H_LOAD       %2, srcq, %1, 15
1242cabdff1aSopenharmony_ci    QPEL_HV_COMPUTE   %1, %2, mx, ackssdw
1243cabdff1aSopenharmony_ci    SWAP             m12, m0
1244cabdff1aSopenharmony_ci    add             srcq, srcstrideq
1245cabdff1aSopenharmony_ci    QPEL_H_LOAD       %2, srcq, %1, 15
1246cabdff1aSopenharmony_ci    QPEL_HV_COMPUTE   %1, %2, mx, ackssdw
1247cabdff1aSopenharmony_ci    SWAP             m13, m0
1248cabdff1aSopenharmony_ci    add             srcq, srcstrideq
1249cabdff1aSopenharmony_ci    QPEL_H_LOAD       %2, srcq, %1, 15
1250cabdff1aSopenharmony_ci    QPEL_HV_COMPUTE   %1, %2, mx, ackssdw
1251cabdff1aSopenharmony_ci    SWAP             m14, m0
1252cabdff1aSopenharmony_ci    add             srcq, srcstrideq             ; src now points at the eighth row of the window
1253cabdff1aSopenharmony_ci.loop:
1254cabdff1aSopenharmony_ci    QPEL_H_LOAD       %2, srcq, %1, 15
1255cabdff1aSopenharmony_ci    QPEL_HV_COMPUTE   %1, %2, mx, ackssdw
1256cabdff1aSopenharmony_ci    SWAP             m15, m0                     ; newest row lives in m15
1257cabdff1aSopenharmony_ci    punpcklwd         m0, m8, m9                 ; interleave low words of row pairs (0,1) (2,3) (4,5) (6,7)
1258cabdff1aSopenharmony_ci    punpcklwd         m2, m10, m11
1259cabdff1aSopenharmony_ci    punpcklwd         m4, m12, m13
1260cabdff1aSopenharmony_ci    punpcklwd         m6, m14, m15
1261cabdff1aSopenharmony_ci%if %1 > 4
1262cabdff1aSopenharmony_ci    punpckhwd         m1, m8, m9                 ; widths above 4 also need the high words
1263cabdff1aSopenharmony_ci    punpckhwd         m3, m10, m11
1264cabdff1aSopenharmony_ci    punpckhwd         m5, m12, m13
1265cabdff1aSopenharmony_ci    punpckhwd         m7, m14, m15
1266cabdff1aSopenharmony_ci%endif
1267cabdff1aSopenharmony_ci    QPEL_HV_COMPUTE   %1, 14, my, ackssdw        ; second pass: my filter, bit-depth operand fixed to 14
1268cabdff1aSopenharmony_ci    PEL_10STORE%1     dstq, m0, m1               ; always the "10" store macro, independent of %2
1269cabdff1aSopenharmony_ci%if %1 <= 4
1270cabdff1aSopenharmony_ci    movq              m8, m9                     ; slide the row window; movq copies only the low 64 bits
1271cabdff1aSopenharmony_ci    movq              m9, m10
1272cabdff1aSopenharmony_ci    movq             m10, m11
1273cabdff1aSopenharmony_ci    movq             m11, m12
1274cabdff1aSopenharmony_ci    movq             m12, m13
1275cabdff1aSopenharmony_ci    movq             m13, m14
1276cabdff1aSopenharmony_ci    movq             m14, m15
1277cabdff1aSopenharmony_ci%else
1278cabdff1aSopenharmony_ci    movdqa            m8, m9                     ; slide the row window with full-register copies
1279cabdff1aSopenharmony_ci    movdqa            m9, m10
1280cabdff1aSopenharmony_ci    movdqa           m10, m11
1281cabdff1aSopenharmony_ci    movdqa           m11, m12
1282cabdff1aSopenharmony_ci    movdqa           m12, m13
1283cabdff1aSopenharmony_ci    movdqa           m13, m14
1284cabdff1aSopenharmony_ci    movdqa           m14, m15
1285cabdff1aSopenharmony_ci%endif
1286cabdff1aSopenharmony_ci    LOOP_END         dst, src, srcstride         ; presumably advances dst/src and loops on height - TODO confirm against macro
1287cabdff1aSopenharmony_ci    RET
1288cabdff1aSopenharmony_ci
; Two-pass (mx filter, then my filter) qpel kernel, uni-prediction variant,
; generated per width (%1) and bit depth (%2).
; cglobal operands (x86inc): 7 arguments loaded into registers, 9 GPRs and
; 16 vector registers requested; register names in order are
; dst, dststride, src, srcstride, height, mx, my, r3src, rfilter.
; Structure visible in this block: seven rows go through the mx pass up front
; (m8..m14); each loop iteration adds one row (m15), runs the my pass over the
; eight rows, applies UNI_COMPUTE with the pw_<bitdepth> constant, stores one
; output row and slides the row window down by one.
; QPEL_H_LOAD, QPEL_HV_COMPUTE, UNI_COMPUTE and the PEL_*STORE* macros are
; defined outside this section.
1289cabdff1aSopenharmony_cicglobal hevc_put_hevc_uni_qpel_hv%1_%2, 7, 9, 16 , dst, dststride, src, srcstride, height, mx, my, r3src, rfilter
1290cabdff1aSopenharmony_ci%if cpuflag(avx2)
1291cabdff1aSopenharmony_ci%assign %%shift  4
1292cabdff1aSopenharmony_ci%else
1293cabdff1aSopenharmony_ci%assign %%shift  3
1294cabdff1aSopenharmony_ci%endif
1295cabdff1aSopenharmony_ci    sub              mxq, 1                      ; mx -= 1 (zero-based filter index)
1296cabdff1aSopenharmony_ci    sub              myq, 1                      ; my -= 1
1297cabdff1aSopenharmony_ci    shl              mxq, %%shift                ; mx <<= shift: x16 on AVX2, x8 otherwise
1298cabdff1aSopenharmony_ci    shl              myq, %%shift                ; my <<= shift: x16 on AVX2, x8 otherwise
1299cabdff1aSopenharmony_ci    lea           r3srcq, [srcstrideq*3]         ; r3src = 3 * srcstride
1300cabdff1aSopenharmony_ci    sub             srcq, r3srcq                 ; start 3 rows above the current position
1301cabdff1aSopenharmony_ci    QPEL_H_LOAD       %2, srcq, %1, 15
1302cabdff1aSopenharmony_ci    QPEL_HV_COMPUTE   %1, %2, mx, ackssdw
1303cabdff1aSopenharmony_ci    SWAP              m8, m0                     ; x86inc register renaming: the row just produced as m0 is now m8
1304cabdff1aSopenharmony_ci    add             srcq, srcstrideq
1305cabdff1aSopenharmony_ci    QPEL_H_LOAD       %2, srcq, %1, 15
1306cabdff1aSopenharmony_ci    QPEL_HV_COMPUTE   %1, %2, mx, ackssdw
1307cabdff1aSopenharmony_ci    SWAP              m9, m0
1308cabdff1aSopenharmony_ci    add             srcq, srcstrideq
1309cabdff1aSopenharmony_ci    QPEL_H_LOAD       %2, srcq, %1, 15
1310cabdff1aSopenharmony_ci    QPEL_HV_COMPUTE   %1, %2, mx, ackssdw
1311cabdff1aSopenharmony_ci    SWAP             m10, m0
1312cabdff1aSopenharmony_ci    add             srcq, srcstrideq
1313cabdff1aSopenharmony_ci    QPEL_H_LOAD       %2, srcq, %1, 15
1314cabdff1aSopenharmony_ci    QPEL_HV_COMPUTE   %1, %2, mx, ackssdw
1315cabdff1aSopenharmony_ci    SWAP             m11, m0
1316cabdff1aSopenharmony_ci    add             srcq, srcstrideq
1317cabdff1aSopenharmony_ci    QPEL_H_LOAD       %2, srcq, %1, 15
1318cabdff1aSopenharmony_ci    QPEL_HV_COMPUTE   %1, %2, mx, ackssdw
1319cabdff1aSopenharmony_ci    SWAP             m12, m0
1320cabdff1aSopenharmony_ci    add             srcq, srcstrideq
1321cabdff1aSopenharmony_ci    QPEL_H_LOAD       %2, srcq, %1, 15
1322cabdff1aSopenharmony_ci    QPEL_HV_COMPUTE   %1, %2, mx, ackssdw
1323cabdff1aSopenharmony_ci    SWAP             m13, m0
1324cabdff1aSopenharmony_ci    add             srcq, srcstrideq
1325cabdff1aSopenharmony_ci    QPEL_H_LOAD       %2, srcq, %1, 15
1326cabdff1aSopenharmony_ci    QPEL_HV_COMPUTE   %1, %2, mx, ackssdw
1327cabdff1aSopenharmony_ci    SWAP             m14, m0
1328cabdff1aSopenharmony_ci    add             srcq, srcstrideq             ; src now points at the eighth row of the window
1329cabdff1aSopenharmony_ci.loop:
1330cabdff1aSopenharmony_ci    QPEL_H_LOAD       %2, srcq, %1, 15
1331cabdff1aSopenharmony_ci    QPEL_HV_COMPUTE   %1, %2, mx, ackssdw
1332cabdff1aSopenharmony_ci    SWAP             m15, m0                     ; newest row lives in m15
1333cabdff1aSopenharmony_ci    punpcklwd         m0, m8, m9                 ; interleave low words of row pairs (0,1) (2,3) (4,5) (6,7)
1334cabdff1aSopenharmony_ci    punpcklwd         m2, m10, m11
1335cabdff1aSopenharmony_ci    punpcklwd         m4, m12, m13
1336cabdff1aSopenharmony_ci    punpcklwd         m6, m14, m15
1337cabdff1aSopenharmony_ci%if %1 > 4
1338cabdff1aSopenharmony_ci    punpckhwd         m1, m8, m9                 ; widths above 4 also need the high words
1339cabdff1aSopenharmony_ci    punpckhwd         m3, m10, m11
1340cabdff1aSopenharmony_ci    punpckhwd         m5, m12, m13
1341cabdff1aSopenharmony_ci    punpckhwd         m7, m14, m15
1342cabdff1aSopenharmony_ci%endif
1343cabdff1aSopenharmony_ci    QPEL_HV_COMPUTE   %1, 14, my, ackusdw        ; second pass; NOTE(review): last operand differs from the other variants (ackusdw vs ackssdw), looks like a pack-instruction suffix - confirm against macro
1344cabdff1aSopenharmony_ci    UNI_COMPUTE       %1, %2, m0, m1, [pw_%2]    ; constant passed as a memory operand here
1345cabdff1aSopenharmony_ci    PEL_%2STORE%1   dstq, m0, m1
1346cabdff1aSopenharmony_ci
1347cabdff1aSopenharmony_ci%if %1 <= 4
1348cabdff1aSopenharmony_ci    movq              m8, m9                     ; slide the row window; movq copies only the low 64 bits
1349cabdff1aSopenharmony_ci    movq              m9, m10
1350cabdff1aSopenharmony_ci    movq             m10, m11
1351cabdff1aSopenharmony_ci    movq             m11, m12
1352cabdff1aSopenharmony_ci    movq             m12, m13
1353cabdff1aSopenharmony_ci    movq             m13, m14
1354cabdff1aSopenharmony_ci    movq             m14, m15
1355cabdff1aSopenharmony_ci%else
1356cabdff1aSopenharmony_ci    mova            m8, m9                       ; slide the row window with full-register copies
1357cabdff1aSopenharmony_ci    mova            m9, m10
1358cabdff1aSopenharmony_ci    mova           m10, m11
1359cabdff1aSopenharmony_ci    mova           m11, m12
1360cabdff1aSopenharmony_ci    mova           m12, m13
1361cabdff1aSopenharmony_ci    mova           m13, m14
1362cabdff1aSopenharmony_ci    mova           m14, m15
1363cabdff1aSopenharmony_ci%endif
1364cabdff1aSopenharmony_ci    add             dstq, dststrideq             ; dst += dststride
1365cabdff1aSopenharmony_ci    add             srcq, srcstrideq             ; src += srcstride
1366cabdff1aSopenharmony_ci    dec          heightd                         ; --height (sets ZF for the jnz below)
1367cabdff1aSopenharmony_ci    jnz               .loop                      ; height loop
1368cabdff1aSopenharmony_ci    RET
1369cabdff1aSopenharmony_ci
; Two-pass (mx filter, then my filter) qpel kernel, bi-prediction variant,
; generated per width (%1) and bit depth (%2).
; cglobal operands (x86inc): 8 arguments loaded into registers, 10 GPRs and
; 16 vector registers requested; register names in order are
; dst, dststride, src, srcstride, src2, height, mx, my, r3src, rfilter.
; Structure visible in this block: seven rows go through the mx pass up front
; (m8..m14); each loop iteration adds one row (m15), runs the my pass over the
; eight rows, combines the result with a row loaded from src2 via BI_COMPUTE,
; stores one output row and slides the row window down by one.
; QPEL_H_LOAD, QPEL_HV_COMPUTE, SIMPLE_BILOAD, BI_COMPUTE and the PEL_*STORE*
; macros are defined outside this section.
1370cabdff1aSopenharmony_cicglobal hevc_put_hevc_bi_qpel_hv%1_%2, 8, 10, 16, dst, dststride, src, srcstride, src2, height, mx, my, r3src, rfilter
1371cabdff1aSopenharmony_ci%if cpuflag(avx2)
1372cabdff1aSopenharmony_ci%assign %%shift  4
1373cabdff1aSopenharmony_ci%else
1374cabdff1aSopenharmony_ci%assign %%shift  3
1375cabdff1aSopenharmony_ci%endif
1376cabdff1aSopenharmony_ci    sub              mxq, 1                      ; mx -= 1 (zero-based filter index)
1377cabdff1aSopenharmony_ci    sub              myq, 1                      ; my -= 1
1378cabdff1aSopenharmony_ci    shl              mxq, %%shift                ; mx <<= shift: x16 on AVX2, x8 otherwise
1379cabdff1aSopenharmony_ci    shl              myq, %%shift                ; my <<= shift: x16 on AVX2, x8 otherwise
1380cabdff1aSopenharmony_ci    lea           r3srcq, [srcstrideq*3]         ; r3src = 3 * srcstride
1381cabdff1aSopenharmony_ci    sub             srcq, r3srcq                 ; start 3 rows above the current position
1382cabdff1aSopenharmony_ci    QPEL_H_LOAD       %2, srcq, %1, 15
1383cabdff1aSopenharmony_ci    QPEL_HV_COMPUTE   %1, %2, mx, ackssdw
1384cabdff1aSopenharmony_ci    SWAP              m8, m0                     ; x86inc register renaming: the row just produced as m0 is now m8
1385cabdff1aSopenharmony_ci    add             srcq, srcstrideq
1386cabdff1aSopenharmony_ci    QPEL_H_LOAD       %2, srcq, %1, 15
1387cabdff1aSopenharmony_ci    QPEL_HV_COMPUTE   %1, %2, mx, ackssdw
1388cabdff1aSopenharmony_ci    SWAP              m9, m0
1389cabdff1aSopenharmony_ci    add             srcq, srcstrideq
1390cabdff1aSopenharmony_ci    QPEL_H_LOAD       %2, srcq, %1, 15
1391cabdff1aSopenharmony_ci    QPEL_HV_COMPUTE   %1, %2, mx, ackssdw
1392cabdff1aSopenharmony_ci    SWAP             m10, m0
1393cabdff1aSopenharmony_ci    add             srcq, srcstrideq
1394cabdff1aSopenharmony_ci    QPEL_H_LOAD       %2, srcq, %1, 15
1395cabdff1aSopenharmony_ci    QPEL_HV_COMPUTE   %1, %2, mx, ackssdw
1396cabdff1aSopenharmony_ci    SWAP             m11, m0
1397cabdff1aSopenharmony_ci    add             srcq, srcstrideq
1398cabdff1aSopenharmony_ci    QPEL_H_LOAD       %2, srcq, %1, 15
1399cabdff1aSopenharmony_ci    QPEL_HV_COMPUTE   %1, %2, mx, ackssdw
1400cabdff1aSopenharmony_ci    SWAP             m12, m0
1401cabdff1aSopenharmony_ci    add             srcq, srcstrideq
1402cabdff1aSopenharmony_ci    QPEL_H_LOAD       %2, srcq, %1, 15
1403cabdff1aSopenharmony_ci    QPEL_HV_COMPUTE   %1, %2, mx, ackssdw
1404cabdff1aSopenharmony_ci    SWAP             m13, m0
1405cabdff1aSopenharmony_ci    add             srcq, srcstrideq
1406cabdff1aSopenharmony_ci    QPEL_H_LOAD       %2, srcq, %1, 15
1407cabdff1aSopenharmony_ci    QPEL_HV_COMPUTE   %1, %2, mx, ackssdw
1408cabdff1aSopenharmony_ci    SWAP             m14, m0
1409cabdff1aSopenharmony_ci    add             srcq, srcstrideq             ; src now points at the eighth row of the window
1410cabdff1aSopenharmony_ci.loop:
1411cabdff1aSopenharmony_ci    QPEL_H_LOAD       %2, srcq, %1, 15
1412cabdff1aSopenharmony_ci    QPEL_HV_COMPUTE   %1, %2, mx, ackssdw
1413cabdff1aSopenharmony_ci    SWAP             m15, m0                     ; newest row lives in m15
1414cabdff1aSopenharmony_ci    punpcklwd         m0, m8, m9                 ; interleave low words of row pairs (0,1) (2,3) (4,5) (6,7)
1415cabdff1aSopenharmony_ci    punpcklwd         m2, m10, m11
1416cabdff1aSopenharmony_ci    punpcklwd         m4, m12, m13
1417cabdff1aSopenharmony_ci    punpcklwd         m6, m14, m15
1418cabdff1aSopenharmony_ci%if %1 > 4
1419cabdff1aSopenharmony_ci    punpckhwd         m1, m8, m9                 ; widths above 4 also need the high words
1420cabdff1aSopenharmony_ci    punpckhwd         m3, m10, m11
1421cabdff1aSopenharmony_ci    punpckhwd         m5, m12, m13
1422cabdff1aSopenharmony_ci    punpckhwd         m7, m14, m15
1423cabdff1aSopenharmony_ci%endif
1424cabdff1aSopenharmony_ci    QPEL_HV_COMPUTE   %1, 14, my, ackssdw        ; second pass: my filter, bit-depth operand fixed to 14
1425cabdff1aSopenharmony_ci    SIMPLE_BILOAD     %1, src2q, m8, m9 ;m9 not used in this case; m8 is reusable because the window shift below overwrites it
1426cabdff1aSopenharmony_ci    BI_COMPUTE        %1, %2, m0, m1, m8, m9, [pw_bi_%2]
1427cabdff1aSopenharmony_ci    PEL_%2STORE%1   dstq, m0, m1
1428cabdff1aSopenharmony_ci
1429cabdff1aSopenharmony_ci%if %1 <= 4
1430cabdff1aSopenharmony_ci    movq              m8, m9                     ; slide the row window; movq copies only the low 64 bits
1431cabdff1aSopenharmony_ci    movq              m9, m10
1432cabdff1aSopenharmony_ci    movq             m10, m11
1433cabdff1aSopenharmony_ci    movq             m11, m12
1434cabdff1aSopenharmony_ci    movq             m12, m13
1435cabdff1aSopenharmony_ci    movq             m13, m14
1436cabdff1aSopenharmony_ci    movq             m14, m15
1437cabdff1aSopenharmony_ci%else
1438cabdff1aSopenharmony_ci    movdqa            m8, m9                     ; slide the row window with full-register copies
1439cabdff1aSopenharmony_ci    movdqa            m9, m10
1440cabdff1aSopenharmony_ci    movdqa           m10, m11
1441cabdff1aSopenharmony_ci    movdqa           m11, m12
1442cabdff1aSopenharmony_ci    movdqa           m12, m13
1443cabdff1aSopenharmony_ci    movdqa           m13, m14
1444cabdff1aSopenharmony_ci    movdqa           m14, m15
1445cabdff1aSopenharmony_ci%endif
1446cabdff1aSopenharmony_ci    add             dstq, dststrideq             ; dst += dststride
1447cabdff1aSopenharmony_ci    add             srcq, srcstrideq             ; src += srcstride
1448cabdff1aSopenharmony_ci    add            src2q, 2*MAX_PB_SIZE          ; src2 += 2*MAX_PB_SIZE (constant row pitch, not srcstride)
1449cabdff1aSopenharmony_ci    dec          heightd                         ; --height (sets ZF for the jnz below)
1450cabdff1aSopenharmony_ci    jnz               .loop                      ; height loop
1451cabdff1aSopenharmony_ci    RET
1452cabdff1aSopenharmony_ci%endmacro
1453cabdff1aSopenharmony_ci
1454cabdff1aSopenharmony_ci%macro WEIGHTING_FUNCS 2
; Uni-directional weighted prediction, generated per width (%1) and bit depth (%2).
; Per sample the loop computes  ((src * wx + round) >> shift) + ox  and clamps
; the result to the pixel range, with
;   shift = denom + 14 - bitdepth
;   round = pd_1 << (shift - 1)   (pd_1 presumably holds dword 1s; defined elsewhere)
;   ox    = ox << (bitdepth - 8)
; Vector register roles after setup:
;   m1 = zero (widths <= 4 only)   m2 = wx broadcast   m3 = ox broadcast
;   m4 = shift count               m5 = round          m6 = temporary
; SHIFT is a GPR alias: r4d on WIN64/x86-32 (denom is loaded from memory there),
; denomd otherwise. SIMPLE_LOAD, CLIPW and the PEL_*STORE* macros are defined
; outside this section.
1455cabdff1aSopenharmony_ci%if WIN64 || ARCH_X86_32
1456cabdff1aSopenharmony_cicglobal hevc_put_hevc_uni_w%1_%2, 4, 5, 7, dst, dststride, src, height, denom, wx, ox
1457cabdff1aSopenharmony_ci    mov             r4d, denomm
1458cabdff1aSopenharmony_ci%define SHIFT  r4d
1459cabdff1aSopenharmony_ci%else
1460cabdff1aSopenharmony_cicglobal hevc_put_hevc_uni_w%1_%2, 6, 6, 7, dst, dststride, src, height, denom, wx, ox
1461cabdff1aSopenharmony_ci%define SHIFT  denomd
1462cabdff1aSopenharmony_ci%endif
1463cabdff1aSopenharmony_ci    lea           SHIFT, [SHIFT+14-%2]          ; shift = 14 - bitd + denom
1464cabdff1aSopenharmony_ci%if %1 <= 4
1465cabdff1aSopenharmony_ci    pxor             m1, m1         ; m1 = 0, used to widen words to dwords
1466cabdff1aSopenharmony_ci%endif
1467cabdff1aSopenharmony_ci    movd             m2, wxm        ; WX
1468cabdff1aSopenharmony_ci    movd             m4, SHIFT      ; shift
1469cabdff1aSopenharmony_ci%if %1 <= 4
1470cabdff1aSopenharmony_ci    punpcklwd        m2, m1         ; (wx, 0) word pair, multiplier for pmaddwd
1471cabdff1aSopenharmony_ci%else
1472cabdff1aSopenharmony_ci    punpcklwd        m2, m2         ; duplicate wx into both words, multiplier for pmullw/pmulhw
1473cabdff1aSopenharmony_ci%endif
1474cabdff1aSopenharmony_ci    dec           SHIFT             ; shift - 1, exponent of the rounding term
1475cabdff1aSopenharmony_ci    movdqu           m5, [pd_1]
1476cabdff1aSopenharmony_ci    movd             m6, SHIFT
1477cabdff1aSopenharmony_ci    pshufd           m2, m2, 0      ; broadcast the low dword of m2
1478cabdff1aSopenharmony_ci    mov           SHIFT, oxm        ; SHIFT register is reused to hold ox from here on
1479cabdff1aSopenharmony_ci    pslld            m5, m6         ; round = pd_1 << (shift - 1)
1480cabdff1aSopenharmony_ci%if %2 != 8
1481cabdff1aSopenharmony_ci    shl           SHIFT, %2-8       ; ox << (bitd - 8)
1482cabdff1aSopenharmony_ci%endif
1483cabdff1aSopenharmony_ci    movd             m3, SHIFT      ; OX
1484cabdff1aSopenharmony_ci    pshufd           m3, m3, 0      ; broadcast ox to every dword
1485cabdff1aSopenharmony_ci%if WIN64 || ARCH_X86_32
1486cabdff1aSopenharmony_ci    mov           SHIFT, heightm    ; NOTE(review): the loop below counts in heightd and never reads SHIFT again - confirm this reload is needed
1487cabdff1aSopenharmony_ci%endif
1488cabdff1aSopenharmony_ci.loop:
1489cabdff1aSopenharmony_ci   SIMPLE_LOAD        %1, 10, srcq, m0
1490cabdff1aSopenharmony_ci%if %1 <= 4
1491cabdff1aSopenharmony_ci    punpcklwd         m0, m1                     ; (src, 0) word pairs
1492cabdff1aSopenharmony_ci    pmaddwd           m0, m2                     ; src*wx + 0*0 as dwords
1493cabdff1aSopenharmony_ci    paddd             m0, m5                     ; + round
1494cabdff1aSopenharmony_ci    psrad             m0, m4                     ; >> shift (arithmetic)
1495cabdff1aSopenharmony_ci    paddd             m0, m3                     ; + ox
1496cabdff1aSopenharmony_ci%else
1497cabdff1aSopenharmony_ci    pmulhw            m6, m0, m2                 ; high 16 bits of the signed products
1498cabdff1aSopenharmony_ci    pmullw            m0, m2                     ; low 16 bits of the products
1499cabdff1aSopenharmony_ci    punpckhwd         m1, m0, m6                 ; reassemble 32-bit products, upper four
1500cabdff1aSopenharmony_ci    punpcklwd         m0, m6                     ; reassemble 32-bit products, lower four
1501cabdff1aSopenharmony_ci    paddd             m0, m5                     ; + round
1502cabdff1aSopenharmony_ci    paddd             m1, m5
1503cabdff1aSopenharmony_ci    psrad             m0, m4                     ; >> shift (arithmetic)
1504cabdff1aSopenharmony_ci    psrad             m1, m4
1505cabdff1aSopenharmony_ci    paddd             m0, m3                     ; + ox
1506cabdff1aSopenharmony_ci    paddd             m1, m3
1507cabdff1aSopenharmony_ci%endif
1508cabdff1aSopenharmony_ci    packssdw          m0, m1                     ; dwords -> signed-saturated words
1509cabdff1aSopenharmony_ci%if %2 == 8
1510cabdff1aSopenharmony_ci    packuswb          m0, m0                     ; words -> unsigned-saturated bytes (clamps to 0..255)
1511cabdff1aSopenharmony_ci%else
1512cabdff1aSopenharmony_ci    CLIPW             m0, [pb_0], [max_pixels_%2] ; clamp words between pb_0 and max_pixels_<bitdepth>
1513cabdff1aSopenharmony_ci%endif
1514cabdff1aSopenharmony_ci    PEL_%2STORE%1   dstq, m0, m1
1515cabdff1aSopenharmony_ci    add             dstq, dststrideq             ; dst += dststride
1516cabdff1aSopenharmony_ci    add             srcq, 2*MAX_PB_SIZE          ; src += 2*MAX_PB_SIZE (constant row pitch)
1517cabdff1aSopenharmony_ci    dec          heightd                         ; --height (sets ZF for the jnz below)
1518cabdff1aSopenharmony_ci    jnz               .loop                      ; height loop
1519cabdff1aSopenharmony_ci    RET
1520cabdff1aSopenharmony_ci
; Bi-directional weighted prediction, generated per width (%1) and bit depth (%2).
; Per sample the loop computes
;   (src * wx1 + src2 * wx0 + offset) >> (shift + 1)
; and clamps the result to the pixel range, with
;   shift  = denom + 14 - bitdepth
;   offset = (((ox0 + ox1) << (bitdepth - 8)) + 1) << shift
; cglobal operands (x86inc): 4 arguments loaded into registers, 6 GPRs and
; 10 vector registers requested. r5d is the working GPR for denom, the offset
; and (outside UNIX64) the row counter.
; Vector register roles after setup:
;   m1 = zero (widths <= 4 only)   m2 = wx0 broadcast   m3 = wx1 broadcast
;   m4 = offset                    m5 = shift + 1       m0 = shift (setup only)
; SIMPLE_LOAD, CLIPW and the PEL_*STORE* macros are defined outside this section.
1521cabdff1aSopenharmony_cicglobal hevc_put_hevc_bi_w%1_%2, 4, 6, 10, dst, dststride, src, src2, height, denom, wx0, wx1, ox0, ox1
1522cabdff1aSopenharmony_ci    movifnidn        r5d, denomm       ; load denom unless it already lives in r5d
1523cabdff1aSopenharmony_ci%if %1 <= 4
1524cabdff1aSopenharmony_ci    pxor              m1, m1           ; m1 = 0, used to widen words to dwords
1525cabdff1aSopenharmony_ci%endif
1526cabdff1aSopenharmony_ci    movd              m2, wx0m         ; WX0
1527cabdff1aSopenharmony_ci    lea              r5d, [r5d+14-%2]  ; shift = 14 - bitd + denom
1528cabdff1aSopenharmony_ci    movd              m3, wx1m         ; WX1
1529cabdff1aSopenharmony_ci    movd              m0, r5d          ; shift
1530cabdff1aSopenharmony_ci%if %1 <= 4
1531cabdff1aSopenharmony_ci    punpcklwd         m2, m1           ; (wx0, 0) word pair, multiplier for pmaddwd
1532cabdff1aSopenharmony_ci    punpcklwd         m3, m1           ; (wx1, 0) word pair
1533cabdff1aSopenharmony_ci%else
1534cabdff1aSopenharmony_ci    punpcklwd         m2, m2           ; duplicate wx0 into both words
1535cabdff1aSopenharmony_ci    punpcklwd         m3, m3           ; duplicate wx1 into both words
1536cabdff1aSopenharmony_ci%endif
1537cabdff1aSopenharmony_ci    inc              r5d
1538cabdff1aSopenharmony_ci    movd              m5, r5d          ; shift+1
1539cabdff1aSopenharmony_ci    pshufd            m2, m2, 0        ; broadcast wx0
1540cabdff1aSopenharmony_ci    mov              r5d, ox0m
1541cabdff1aSopenharmony_ci    pshufd            m3, m3, 0        ; broadcast wx1
1542cabdff1aSopenharmony_ci    add              r5d, ox1m         ; r5d = ox0 + ox1
1543cabdff1aSopenharmony_ci%if %2 != 8
1544cabdff1aSopenharmony_ci    shl              r5d, %2-8         ; ox << (bitd - 8)
1545cabdff1aSopenharmony_ci%endif
1546cabdff1aSopenharmony_ci    inc              r5d               ; + 1, becomes the rounding bit once shifted below
1547cabdff1aSopenharmony_ci    movd              m4, r5d          ; offset
1548cabdff1aSopenharmony_ci    pshufd            m4, m4, 0        ; broadcast offset to every dword
1549cabdff1aSopenharmony_ci%if UNIX64
1550cabdff1aSopenharmony_ci%define h heightd
1551cabdff1aSopenharmony_ci%else
1552cabdff1aSopenharmony_ci    mov              r5d, heightm      ; height is not among the 4 auto-loaded args; keep the row counter in r5d
1553cabdff1aSopenharmony_ci%define h r5d
1554cabdff1aSopenharmony_ci%endif
1555cabdff1aSopenharmony_ci    pslld             m4, m0           ; offset <<= shift; m0 is free for sample data afterwards
1556cabdff1aSopenharmony_ci
1557cabdff1aSopenharmony_ci.loop:
1558cabdff1aSopenharmony_ci   SIMPLE_LOAD        %1, 10, srcq,  m0
1559cabdff1aSopenharmony_ci   SIMPLE_LOAD        %1, 10, src2q, m8
1560cabdff1aSopenharmony_ci%if %1 <= 4
1561cabdff1aSopenharmony_ci    punpcklwd         m0, m1                     ; (src, 0) word pairs
1562cabdff1aSopenharmony_ci    punpcklwd         m8, m1                     ; (src2, 0) word pairs
1563cabdff1aSopenharmony_ci    pmaddwd           m0, m3                     ; src  * wx1
1564cabdff1aSopenharmony_ci    pmaddwd           m8, m2                     ; src2 * wx0
1565cabdff1aSopenharmony_ci    paddd             m0, m4                     ; + offset
1566cabdff1aSopenharmony_ci    paddd             m0, m8                     ; sum of both weighted terms
1567cabdff1aSopenharmony_ci    psrad             m0, m5                     ; >> (shift + 1)
1568cabdff1aSopenharmony_ci%else
1569cabdff1aSopenharmony_ci    pmulhw            m6, m0, m3                 ; src  * wx1, high 16 bits
1570cabdff1aSopenharmony_ci    pmullw            m0, m3                     ; src  * wx1, low 16 bits
1571cabdff1aSopenharmony_ci    pmulhw            m7, m8, m2                 ; src2 * wx0, high 16 bits
1572cabdff1aSopenharmony_ci    pmullw            m8, m2                     ; src2 * wx0, low 16 bits
1573cabdff1aSopenharmony_ci    punpckhwd         m1, m0, m6                 ; reassemble 32-bit products
1574cabdff1aSopenharmony_ci    punpcklwd         m0, m6
1575cabdff1aSopenharmony_ci    punpckhwd         m9, m8, m7
1576cabdff1aSopenharmony_ci    punpcklwd         m8, m7
1577cabdff1aSopenharmony_ci    paddd             m0, m8                     ; sum of both weighted terms
1578cabdff1aSopenharmony_ci    paddd             m1, m9
1579cabdff1aSopenharmony_ci    paddd             m0, m4                     ; + offset
1580cabdff1aSopenharmony_ci    paddd             m1, m4
1581cabdff1aSopenharmony_ci    psrad             m0, m5                     ; >> (shift + 1)
1582cabdff1aSopenharmony_ci    psrad             m1, m5
1583cabdff1aSopenharmony_ci%endif
1584cabdff1aSopenharmony_ci    packssdw          m0, m1                     ; dwords -> signed-saturated words
1585cabdff1aSopenharmony_ci%if %2 == 8
1586cabdff1aSopenharmony_ci    packuswb          m0, m0                     ; words -> unsigned-saturated bytes (clamps to 0..255)
1587cabdff1aSopenharmony_ci%else
1588cabdff1aSopenharmony_ci     CLIPW            m0, [pb_0], [max_pixels_%2] ; clamp words between pb_0 and max_pixels_<bitdepth>
1589cabdff1aSopenharmony_ci%endif
1590cabdff1aSopenharmony_ci    PEL_%2STORE%1   dstq, m0, m1
1591cabdff1aSopenharmony_ci    add             dstq, dststrideq             ; dst += dststride
1592cabdff1aSopenharmony_ci    add             srcq, 2*MAX_PB_SIZE          ; src += 2*MAX_PB_SIZE (constant row pitch)
1593cabdff1aSopenharmony_ci    add            src2q, 2*MAX_PB_SIZE          ; src2 += 2*MAX_PB_SIZE
1594cabdff1aSopenharmony_ci    dec                h                         ; --row counter (heightd on UNIX64, r5d otherwise)
1595cabdff1aSopenharmony_ci    jnz               .loop                      ; height loop
1596cabdff1aSopenharmony_ci    RET
1597cabdff1aSopenharmony_ci%endmacro
1598cabdff1aSopenharmony_ci
; SSE4 instantiations. Every macro below is invoked with two operands that end
; up in the generated function names; in the macros visible in this section the
; second operand is the bit depth and the first selects the width-specific code
; paths and store macros (presumably the block width in pixels).
1599cabdff1aSopenharmony_ciINIT_XMM sse4                                    ; adds ff_ and _sse4 to function name
1600cabdff1aSopenharmony_ci
; weighted prediction (uni_w / bi_w)
1601cabdff1aSopenharmony_ciWEIGHTING_FUNCS 2, 8
1602cabdff1aSopenharmony_ciWEIGHTING_FUNCS 4, 8
1603cabdff1aSopenharmony_ciWEIGHTING_FUNCS 6, 8
1604cabdff1aSopenharmony_ciWEIGHTING_FUNCS 8, 8
1605cabdff1aSopenharmony_ci
1606cabdff1aSopenharmony_ciWEIGHTING_FUNCS 2, 10
1607cabdff1aSopenharmony_ciWEIGHTING_FUNCS 4, 10
1608cabdff1aSopenharmony_ciWEIGHTING_FUNCS 6, 10
1609cabdff1aSopenharmony_ciWEIGHTING_FUNCS 8, 10
1610cabdff1aSopenharmony_ci
1611cabdff1aSopenharmony_ciWEIGHTING_FUNCS 2, 12
1612cabdff1aSopenharmony_ciWEIGHTING_FUNCS 4, 12
1613cabdff1aSopenharmony_ciWEIGHTING_FUNCS 6, 12
1614cabdff1aSopenharmony_ciWEIGHTING_FUNCS 8, 12
1615cabdff1aSopenharmony_ci
; pel_pixels kernels (macro defined earlier in the file); 8-bit also gets widths 12 and 16
1616cabdff1aSopenharmony_ciHEVC_PUT_HEVC_PEL_PIXELS  2, 8
1617cabdff1aSopenharmony_ciHEVC_PUT_HEVC_PEL_PIXELS  4, 8
1618cabdff1aSopenharmony_ciHEVC_PUT_HEVC_PEL_PIXELS  6, 8
1619cabdff1aSopenharmony_ciHEVC_PUT_HEVC_PEL_PIXELS  8, 8
1620cabdff1aSopenharmony_ciHEVC_PUT_HEVC_PEL_PIXELS 12, 8
1621cabdff1aSopenharmony_ciHEVC_PUT_HEVC_PEL_PIXELS 16, 8
1622cabdff1aSopenharmony_ci
1623cabdff1aSopenharmony_ciHEVC_PUT_HEVC_PEL_PIXELS 2, 10
1624cabdff1aSopenharmony_ciHEVC_PUT_HEVC_PEL_PIXELS 4, 10
1625cabdff1aSopenharmony_ciHEVC_PUT_HEVC_PEL_PIXELS 6, 10
1626cabdff1aSopenharmony_ciHEVC_PUT_HEVC_PEL_PIXELS 8, 10
1627cabdff1aSopenharmony_ci
1628cabdff1aSopenharmony_ciHEVC_PUT_HEVC_PEL_PIXELS 2, 12
1629cabdff1aSopenharmony_ciHEVC_PUT_HEVC_PEL_PIXELS 4, 12
1630cabdff1aSopenharmony_ciHEVC_PUT_HEVC_PEL_PIXELS 6, 12
1631cabdff1aSopenharmony_ciHEVC_PUT_HEVC_PEL_PIXELS 8, 12
1632cabdff1aSopenharmony_ci
; epel kernels (macro defined earlier in the file)
1633cabdff1aSopenharmony_ciHEVC_PUT_HEVC_EPEL 2,  8
1634cabdff1aSopenharmony_ciHEVC_PUT_HEVC_EPEL 4,  8
1635cabdff1aSopenharmony_ciHEVC_PUT_HEVC_EPEL 6,  8
1636cabdff1aSopenharmony_ciHEVC_PUT_HEVC_EPEL 8,  8
1637cabdff1aSopenharmony_ciHEVC_PUT_HEVC_EPEL 12, 8
1638cabdff1aSopenharmony_ciHEVC_PUT_HEVC_EPEL 16, 8
1639cabdff1aSopenharmony_ci
1640cabdff1aSopenharmony_ci
1641cabdff1aSopenharmony_ciHEVC_PUT_HEVC_EPEL 2, 10
1642cabdff1aSopenharmony_ciHEVC_PUT_HEVC_EPEL 4, 10
1643cabdff1aSopenharmony_ciHEVC_PUT_HEVC_EPEL 6, 10
1644cabdff1aSopenharmony_ciHEVC_PUT_HEVC_EPEL 8, 10
1645cabdff1aSopenharmony_ci
1646cabdff1aSopenharmony_ciHEVC_PUT_HEVC_EPEL 2, 12
1647cabdff1aSopenharmony_ciHEVC_PUT_HEVC_EPEL 4, 12
1648cabdff1aSopenharmony_ciHEVC_PUT_HEVC_EPEL 6, 12
1649cabdff1aSopenharmony_ciHEVC_PUT_HEVC_EPEL 8, 12
1650cabdff1aSopenharmony_ci
; epel hv kernels; note that 8-bit has a width-16 instance but no width-12 one
1651cabdff1aSopenharmony_ciHEVC_PUT_HEVC_EPEL_HV 2,  8
1652cabdff1aSopenharmony_ciHEVC_PUT_HEVC_EPEL_HV 4,  8
1653cabdff1aSopenharmony_ciHEVC_PUT_HEVC_EPEL_HV 6,  8
1654cabdff1aSopenharmony_ciHEVC_PUT_HEVC_EPEL_HV 8,  8
1655cabdff1aSopenharmony_ciHEVC_PUT_HEVC_EPEL_HV 16, 8
1656cabdff1aSopenharmony_ci
1657cabdff1aSopenharmony_ciHEVC_PUT_HEVC_EPEL_HV 2, 10
1658cabdff1aSopenharmony_ciHEVC_PUT_HEVC_EPEL_HV 4, 10
1659cabdff1aSopenharmony_ciHEVC_PUT_HEVC_EPEL_HV 6, 10
1660cabdff1aSopenharmony_ciHEVC_PUT_HEVC_EPEL_HV 8, 10
1661cabdff1aSopenharmony_ci
1662cabdff1aSopenharmony_ciHEVC_PUT_HEVC_EPEL_HV 2, 12
1663cabdff1aSopenharmony_ciHEVC_PUT_HEVC_EPEL_HV 4, 12
1664cabdff1aSopenharmony_ciHEVC_PUT_HEVC_EPEL_HV 6, 12
1665cabdff1aSopenharmony_ciHEVC_PUT_HEVC_EPEL_HV 8, 12
1666cabdff1aSopenharmony_ci
; qpel h/v kernels; no width 2 or 6 instances in this group
1667cabdff1aSopenharmony_ciHEVC_PUT_HEVC_QPEL 4,  8
1668cabdff1aSopenharmony_ciHEVC_PUT_HEVC_QPEL 8,  8
1669cabdff1aSopenharmony_ciHEVC_PUT_HEVC_QPEL 12, 8
1670cabdff1aSopenharmony_ciHEVC_PUT_HEVC_QPEL 16, 8
1671cabdff1aSopenharmony_ci
1672cabdff1aSopenharmony_ciHEVC_PUT_HEVC_QPEL 4, 10
1673cabdff1aSopenharmony_ciHEVC_PUT_HEVC_QPEL 8, 10
1674cabdff1aSopenharmony_ci
1675cabdff1aSopenharmony_ciHEVC_PUT_HEVC_QPEL 4, 12
1676cabdff1aSopenharmony_ciHEVC_PUT_HEVC_QPEL 8, 12
1677cabdff1aSopenharmony_ci
; qpel hv kernels
1678cabdff1aSopenharmony_ciHEVC_PUT_HEVC_QPEL_HV 2, 8
1679cabdff1aSopenharmony_ciHEVC_PUT_HEVC_QPEL_HV 4, 8
1680cabdff1aSopenharmony_ciHEVC_PUT_HEVC_QPEL_HV 6, 8
1681cabdff1aSopenharmony_ciHEVC_PUT_HEVC_QPEL_HV 8, 8
1682cabdff1aSopenharmony_ci
1683cabdff1aSopenharmony_ciHEVC_PUT_HEVC_QPEL_HV 2, 10
1684cabdff1aSopenharmony_ciHEVC_PUT_HEVC_QPEL_HV 4, 10
1685cabdff1aSopenharmony_ciHEVC_PUT_HEVC_QPEL_HV 6, 10
1686cabdff1aSopenharmony_ciHEVC_PUT_HEVC_QPEL_HV 8, 10
1687cabdff1aSopenharmony_ci
1688cabdff1aSopenharmony_ciHEVC_PUT_HEVC_QPEL_HV 2, 12
1689cabdff1aSopenharmony_ciHEVC_PUT_HEVC_QPEL_HV 4, 12
1690cabdff1aSopenharmony_ciHEVC_PUT_HEVC_QPEL_HV 6, 12
1691cabdff1aSopenharmony_ciHEVC_PUT_HEVC_QPEL_HV 8, 12
1692cabdff1aSopenharmony_ci
; AVX2 instantiations, assembled only when HAVE_AVX2_EXTERNAL is set.
; Only the widest variants are generated here: 32 for 8-bit and 16 for 10-bit.
; There are no 12-bit instances and no 8-bit QPEL_HV instance in this group.
1693cabdff1aSopenharmony_ci%if HAVE_AVX2_EXTERNAL
1694cabdff1aSopenharmony_ciINIT_YMM avx2  ; adds ff_ and _avx2 to function name & enables 256b registers : m0 for 256b, xm0 for 128b. cpuflag(avx2) = 1 / notcpuflag(avx) = 0
1695cabdff1aSopenharmony_ci
1696cabdff1aSopenharmony_ciHEVC_PUT_HEVC_PEL_PIXELS 32, 8
1697cabdff1aSopenharmony_ciHEVC_PUT_HEVC_PEL_PIXELS 16, 10
1698cabdff1aSopenharmony_ci
1699cabdff1aSopenharmony_ciHEVC_PUT_HEVC_EPEL 32, 8
1700cabdff1aSopenharmony_ciHEVC_PUT_HEVC_EPEL 16, 10
1701cabdff1aSopenharmony_ci
1702cabdff1aSopenharmony_ciHEVC_PUT_HEVC_EPEL_HV 16, 10
1703cabdff1aSopenharmony_ciHEVC_PUT_HEVC_EPEL_HV 32, 8
1704cabdff1aSopenharmony_ci
1705cabdff1aSopenharmony_ciHEVC_PUT_HEVC_QPEL 32, 8
1706cabdff1aSopenharmony_ci
1707cabdff1aSopenharmony_ciHEVC_PUT_HEVC_QPEL 16, 10
1708cabdff1aSopenharmony_ci
1709cabdff1aSopenharmony_ciHEVC_PUT_HEVC_QPEL_HV 16, 10
1710cabdff1aSopenharmony_ci
1711cabdff1aSopenharmony_ci%endif ;AVX2
1712cabdff1aSopenharmony_ci%endif ; ARCH_X86_64
1713cabdff1aSopenharmony_ci
; Broadcast the two dword halves of one horizontal filter entry.
; %1: suffix selecting the table hevc_qpel_filters_avx512icl_h_%1
; %2: name of the register holding the filter selector; it is clobbered and
;     turned into a byte offset, (selector - 1) * 8
; %3: index of the vector register receiving the dword at entry offset 0
; %4: index of the vector register receiving the dword at entry offset 4
; %5: name of a scratch register; written only when PIC is defined
; Side effect: (re)defines the single-line macro FILTER as the table base.
%macro QPEL_FILTER_H 5
%define %%coeffs hevc_qpel_filters_avx512icl_h_%1
%assign %%second_half 4
%ifdef PIC
    ; an indexed access cannot be RIP-relative, so materialise the base first
    lea             %5q, [%%coeffs]
    %define FILTER %5q
%else
    %define FILTER %%coeffs
%endif
    ; each table entry is 8 bytes: selector 1 maps to offset 0
    dec             %2q
    shl             %2q, 3
    vpbroadcastd    m%3, [FILTER + %2q]
    vpbroadcastd    m%4, [FILTER + %2q + %%second_half]
%endmacro
1728cabdff1aSopenharmony_ci
; Broadcast one dword of a vertical filter entry into a vector register.
; %1: unused by the body (kept for symmetry with QPEL_FILTER_H)
; %2: name of the register holding the byte offset of the filter entry
; %3: index of the destination vector register
; %4: numeric index of the dword inside the entry
; %5: table base (a register name or a symbol)
%macro QPEL_FILTER_V 5
%assign %%tap_disp 4*%4
    vpbroadcastd    m%3, [%5 + %2q + %%tap_disp]
%endmacro
1732cabdff1aSopenharmony_ci
; Load the two byte-shuffle index vectors used by QPEL_H_LOAD_COMPUTE.
; %1: index of the register loaded from pb_qpel_shuffle_index + 0
; %2: index of the register loaded from pb_qpel_shuffle_index + 64
; Each load is mmsize bytes wide and uses the unaligned move.
%macro QPEL_LOAD_SHUF 2
%assign %%second_table 64
    movu            m%1, [pb_qpel_shuffle_index]
    movu            m%2, [pb_qpel_shuffle_index + %%second_table]
%endmacro
1737cabdff1aSopenharmony_ci
; Horizontally filter one run of source bytes into dword accumulators.
; Expects: m0, m1 = filter taps (QPEL_FILTER_H)
;          m2, m3 = byte shuffle indices (QPEL_LOAD_SHUF)
; Clobbers: m4, m5
; %1: index of the destination register (zeroed, then accumulated into)
; %2: name of the source pointer register
; %3: optional byte offset added to the source pointer (defaults to 0)
%macro QPEL_H_LOAD_COMPUTE 2-3 0
    ; the load starts 3 bytes before the addressed position
%assign %%disp %3 - 3
    ; the raw load is half a register wide for zmm code, 16 bytes otherwise
%if mmsize == 64
    %define %%raw ym4
%else
    %define %%raw xm4
%endif
    pxor            m%1, m%1
    movu          %%raw, [%2q + %%disp]
    ; gather the source bytes for each half of the filter; m4 is overwritten
    ; last because it is also the data operand of the first permute
    vpermb           m5, m2, m4
    vpermb           m4, m3, m4
    ; m%1 += dot4(unsigned source bytes, signed taps) for both halves
    vpdpbusd        m%1, m5, m0
    vpdpbusd        m%1, m4, m1
%endmacro
1758cabdff1aSopenharmony_ci
; Emit hevc_put_hevc_qpel_h%1_%2: horizontal-only QPEL filter.
; %1: block width; picks the filter table and how many vectors make up a row
; %2: bit depth (only used in the function name)
; Each QPEL_H_LOAD_COMPUTE yields one register of dword sums, which vpmovdw
; narrows to words on store. Rows wider than 16 take further steps at source
; offsets 16/32/48 and destination offsets 32/64/96 bytes.
; LOOP_END is defined earlier in this file; presumably it steps dst/src to
; the next row, counts down height and branches back to .loop -- confirm.
%macro HEVC_PUT_HEVC_QPEL_AVX512ICL 2
cglobal hevc_put_hevc_qpel_h%1_%2, 5, 6, 8, dst, src, srcstride, height, mx, tmp
    ; m2/m3 = shuffle indices, m0/m1 = taps selected by mx
    QPEL_LOAD_SHUF        2, 3
    QPEL_FILTER_H         %1, mx, 0, 1, tmp
.loop:
    QPEL_H_LOAD_COMPUTE   6, src
%if %1 != 4
    vpmovdw          [dstq], m6
%else
    ; width 4: narrow in-register, then store only the low 8 bytes
    vpmovdw             xm6, m6
    movq             [dstq], xm6
%endif
%if %1 > 16
    QPEL_H_LOAD_COMPUTE   7, src, 16
    vpmovdw     [dstq + 32], m7
  %if %1 > 32
    QPEL_H_LOAD_COMPUTE   6, src, 32
    QPEL_H_LOAD_COMPUTE   7, src, 48
    vpmovdw     [dstq + 64], m6
    vpmovdw     [dstq + 96], m7
  %endif
%endif
    LOOP_END            dst, src, srcstride
    RET
%endmacro
1784cabdff1aSopenharmony_ci
; Emit hevc_put_hevc_qpel_hv%1_%2: horizontal filter followed by an 8-tap
; vertical filter over a sliding window of rows.
; %1: block width (function name and coefficient table selection)
; %2: bit depth (only used in the function name)
;
; Register layout:
;   m0-m1    horizontal taps (QPEL_FILTER_H)
;   m2-m3    byte shuffle indices (QPEL_LOAD_SHUF)
;   m4-m5    scratch of QPEL_H_LOAD_COMPUTE
;   m6-m13   the eight vertical taps, one broadcast dword each
;   m14-m21  eight consecutive horizontally filtered rows, oldest in m14
;   m22-m26  products and partial sums
;
; The number of vertical taps is a property of the filter, not of the block
; width: the loop below multiplies exactly eight rows by m6-m13 and one table
; entry is 32 bytes (eight dwords). It is therefore a named constant rather
; than %1, which only matched by coincidence for width 8.
; LOOP_END is defined earlier in this file; presumably it steps dst/src to
; the next row, counts down height and branches back to .loop -- confirm.
%macro HEVC_PUT_HEVC_QPEL_HV_AVX512ICL 2
cglobal hevc_put_hevc_qpel_hv%1_%2, 6, 7, 27, dst, src, srcstride, height, mx, my, tmp
%assign %%taps 8
%assign %%shift 6
    ; rows that must be filtered before the first output row is available
%assign %%extra %%taps - 1
    QPEL_FILTER_H    %1, mx, 0, 1, tmp
    QPEL_LOAD_SHUF    2, 3
    ; start three rows above the current position
    lea            tmpq, [srcstrideq*3]
    sub            srcq, tmpq
    ; my -> byte offset of the vertical filter entry: (my - 1) * 32
    sub             myq, 1
    shl             myq, 5
%define %%table hevc_qpel_filters_avx512icl_v_%1
%ifdef PIC
    lea tmpq, [%%table]
    %define FILTER tmpq
%else
    %define FILTER %%table
%endif
    ; vertical taps -> m6 .. m13
%assign %%i 6
%assign %%j 0
%rep %%taps
    QPEL_FILTER_V %1, my, %%i, %%j, FILTER
    %assign %%i %%i+1
    %assign %%j %%j+1
%endrep
    ; prime the window: the first seven filtered rows -> m14 .. m20
%rep %%extra
    QPEL_H_LOAD_COMPUTE %%i, src
    add srcq, srcstrideq
%assign %%i %%i+1
%endrep
.loop:
    ; newest filtered row -> m21
    QPEL_H_LOAD_COMPUTE %%i, src
    ; m26 = sum of rows 0-3 times taps 0-3
    vpmulld           m22, m14, m6
    vpmulld           m23, m15, m7
    vpmulld           m24, m16, m8
    vpmulld           m25, m17, m9
    vpaddd            m26, m22, m23
    vpaddd            m24, m25
    vpaddd            m26, m24
    ; add rows 4-7 times taps 4-7; the full sum ends up in m22
    vpmulld           m22, m18, m10
    vpmulld           m23, m19, m11
    vpmulld           m24, m20, m12
    vpmulld           m25, m21, m13
    vpaddd            m22, m22, m23
    vpaddd            m24, m25
    vpaddd            m26, m24
    vpaddd            m22, m26
    ; slide the row window down by one for the next iteration
    mova              m14, m15
    mova              m15, m16
    mova              m16, m17
    mova              m17, m18
    mova              m18, m19
    mova              m19, m20
    mova              m20, m21
    ; scale, narrow the dword sums to words and store
    vpsrad            m22, %%shift
    vpmovdw        [dstq], m22
    LOOP_END          dst, src, srcstride

    RET
%endmacro
1844cabdff1aSopenharmony_ci
; AVX-512 (avx512icl) instantiations: assembled only on x86-64 builds where
; HAVE_AVX512ICL_EXTERNAL is non-zero.
; The generator macros accumulate one dword per output sample, so the vector
; width chosen by INIT_* fixes how many samples one register covers:
; 4 for XMM, 8 for YMM and 16 for ZMM.
1845cabdff1aSopenharmony_ci%if ARCH_X86_64
1846cabdff1aSopenharmony_ci%if HAVE_AVX512ICL_EXTERNAL
1847cabdff1aSopenharmony_ci
1848cabdff1aSopenharmony_ciINIT_XMM avx512icl
1849cabdff1aSopenharmony_ciHEVC_PUT_HEVC_QPEL_AVX512ICL 4, 8
1850cabdff1aSopenharmony_ci
1851cabdff1aSopenharmony_ciINIT_YMM avx512icl
1852cabdff1aSopenharmony_ciHEVC_PUT_HEVC_QPEL_AVX512ICL 8, 8
; The HV macro hard-codes its register layout, so width 8 under YMM is the
; only HV variant instantiated here.
1853cabdff1aSopenharmony_ciHEVC_PUT_HEVC_QPEL_HV_AVX512ICL 8, 8
1854cabdff1aSopenharmony_ci
; Widths 32 and 64 reuse the ZMM path with 2 and 4 filter steps per row.
1855cabdff1aSopenharmony_ciINIT_ZMM avx512icl
1856cabdff1aSopenharmony_ciHEVC_PUT_HEVC_QPEL_AVX512ICL 16, 8
1857cabdff1aSopenharmony_ciHEVC_PUT_HEVC_QPEL_AVX512ICL 32, 8
1858cabdff1aSopenharmony_ciHEVC_PUT_HEVC_QPEL_AVX512ICL 64, 8
1859cabdff1aSopenharmony_ci
1860cabdff1aSopenharmony_ci%endif
1861cabdff1aSopenharmony_ci%endif
1862