1; /*
2; * Provide SSE luma and chroma mc functions for HEVC decoding
3; * Copyright (c) 2013 Pierre-Edouard LEPERE
4; *
5; * This file is part of FFmpeg.
6; *
7; * FFmpeg is free software; you can redistribute it and/or
8; * modify it under the terms of the GNU Lesser General Public
9; * License as published by the Free Software Foundation; either
10; * version 2.1 of the License, or (at your option) any later version.
11; *
12; * FFmpeg is distributed in the hope that it will be useful,
13; * but WITHOUT ANY WARRANTY; without even the implied warranty of
14; * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
15; * Lesser General Public License for more details.
16; *
17; * You should have received a copy of the GNU Lesser General Public
18; * License along with FFmpeg; if not, write to the Free Software
19; * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
20; */
21%include "libavutil/x86/x86util.asm"
22
23SECTION_RODATA 32
24cextern pw_255
25cextern pw_512
26cextern pw_2048
27cextern pw_8192
28cextern pw_1023
29cextern pw_1024
30cextern pw_4096
31%define pw_8 pw_512
32%define pw_10 pw_2048
33%define pw_12 pw_8192
34%define pw_bi_10 pw_1024
35%define pw_bi_12 pw_4096
36%define max_pixels_8 pw_255
37%define max_pixels_10 pw_1023
38pw_bi_8:                times 16 dw  (1 <<  8)
39max_pixels_12:          times 16 dw ((1 << 12)-1)
40cextern pd_1
41cextern pb_0
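; Note on the pw_* scaling constants above: intermediate predictions are kept
; at 14-bit precision, so pmulhrsw by pw_<bitd> implements a rounded right
; shift by (14 - bitd) for uni output and pw_bi_<bitd> a rounded right shift
; by (15 - bitd) for bi output, e.g. pw_8 = 512 = 1 << (15 - 6) for 8 bit.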
42
43%macro EPEL_TABLE 4
44hevc_epel_filters_%4_%1 times %2 d%3 -2, 58
45                        times %2 d%3 10, -2
46                        times %2 d%3 -4, 54
47                        times %2 d%3 16, -2
48                        times %2 d%3 -6, 46
49                        times %2 d%3 28, -4
50                        times %2 d%3 -4, 36
51                        times %2 d%3 36, -4
52                        times %2 d%3 -4, 28
53                        times %2 d%3 46, -6
54                        times %2 d%3 -2, 16
55                        times %2 d%3 54, -4
56                        times %2 d%3 -2, 10
57                        times %2 d%3 58, -2
58%endmacro
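; The tables generated above store each 4-tap EPEL (chroma) filter as two
; interleaved tap pairs, (c0,c1) then (c2,c3), each pair repeated %2 times to
; fill a register, so EPEL_COMPUTE can apply two taps per pmaddubsw/pmaddwd on
; pixels interleaved the same way.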
59
60
61EPEL_TABLE  8,16, b, avx2
62EPEL_TABLE 10, 8, w, avx2
63
64EPEL_TABLE  8, 8, b, sse4
65EPEL_TABLE 10, 4, w, sse4
66EPEL_TABLE 12, 4, w, sse4
67
68%macro QPEL_TABLE 4
69hevc_qpel_filters_%4_%1 times %2 d%3  -1,  4
70                        times %2 d%3 -10, 58
71                        times %2 d%3  17, -5
72                        times %2 d%3   1,  0
73                        times %2 d%3  -1,  4
74                        times %2 d%3 -11, 40
75                        times %2 d%3  40,-11
76                        times %2 d%3   4, -1
77                        times %2 d%3   0,  1
78                        times %2 d%3  -5, 17
79                        times %2 d%3  58,-10
80                        times %2 d%3   4, -1
81%endmacro
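; These are the standard HEVC luma 8-tap filters ({-1,4,-10,58,17,-5,1,0} etc.),
; stored as four interleaved tap pairs per fractional position so that
; QPEL_COMPUTE needs one pmaddubsw/pmaddwd per pair of taps.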
82
83QPEL_TABLE  8, 8, b, sse4
84QPEL_TABLE 10, 4, w, sse4
85QPEL_TABLE 12, 4, w, sse4
86
87QPEL_TABLE  8,16, b, avx2
88QPEL_TABLE 10, 8, w, avx2
89
90QPEL_TABLE  4, 1, b, avx512icl_h
91QPEL_TABLE  8, 1, b, avx512icl_h
92QPEL_TABLE  8, 1, d, avx512icl_v
93QPEL_TABLE 16, 1, b, avx512icl_h
94QPEL_TABLE 32, 1, b, avx512icl_h
95QPEL_TABLE 64, 1, b, avx512icl_h
96
97pb_qpel_shuffle_index: db  0,  1,  2,  3
98                       db  1,  2,  3,  4
99                       db  2,  3,  4,  5
100                       db  3,  4,  5,  6
101                       db  4,  5,  6,  7
102                       db  5,  6,  7,  8
103                       db  6,  7,  8,  9
104                       db  7,  8,  9, 10
105                       db  8,  9, 10, 11
106                       db  9, 10, 11, 12
107                       db 10, 11, 12, 13
108                       db 11, 12, 13, 14
109                       db 12, 13, 14, 15
110                       db 13, 14, 15, 16
111                       db 14, 15, 16, 17
112                       db 15, 16, 17, 18
113                       db  4,  5,  6,  7
114                       db  5,  6,  7,  8
115                       db  6,  7,  8,  9
116                       db  7,  8,  9, 10
117                       db  8,  9, 10, 11
118                       db  9, 10, 11, 12
119                       db 10, 11, 12, 13
120                       db 11, 12, 13, 14
121                       db 12, 13, 14, 15
122                       db 13, 14, 15, 16
123                       db 14, 15, 16, 17
124                       db 15, 16, 17, 18
125                       db 16, 17, 18, 19
126                       db 17, 18, 19, 20
127                       db 18, 19, 20, 21
128                       db 19, 20, 21, 22
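; Byte-permute indices for the AVX-512 horizontal path: each 4-byte row selects
; a window of 4 neighbouring source pixels, advancing by one pixel per row; the
; second half of the table (offset 64) holds the same windows shifted by 4
; pixels and is used for taps 4..7.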
129
130SECTION .text
131
132%define MAX_PB_SIZE  64
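; The 16-bit intermediate buffers (the dst of the plain put functions and the
; src2 of the bi functions) use a fixed row stride of MAX_PB_SIZE samples,
; hence the 2*MAX_PB_SIZE byte increments in the loops below.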
133
134%define hevc_qpel_filters_sse4_14 hevc_qpel_filters_sse4_10
135
136%define hevc_qpel_filters_avx2_14 hevc_qpel_filters_avx2_10
137
138%if ARCH_X86_64
139
140%macro SIMPLE_BILOAD 4   ;width, tab, r1, r2
141%if %1 <= 4
142    movq              %3, [%2]                                              ; load data from source2
143%elif %1 <= 8
144    movdqa            %3, [%2]                                              ; load data from source2
145%elif %1 <= 12
146%if cpuflag(avx2)
147    mova              %3, [%2]
148%else
149    movdqa            %3, [%2]                                              ; load data from source2
150    movq              %4, [%2+16]                                           ; load data from source2
%endif ; avx2
152%elif %1 <= 16
153%if cpuflag(avx2)
154    mova              %3, [%2]
155%else
156    movdqa            %3, [%2]                                              ; load data from source2
157    movdqa            %4, [%2+16]                                           ; load data from source2
%endif ; avx2
159%else ; %1 = 32
160    mova              %3, [%2]
161    mova              %4, [%2+32]
162%endif
163%endmacro
164
165%macro SIMPLE_LOAD 4    ;width, bitd, tab, r1
166%if %1 == 2 || (%2 == 8 && %1 <= 4)
167    movd              %4, [%3]                                               ; load data from source
168%elif %1 == 4 || (%2 == 8 && %1 <= 8)
169    movq              %4, [%3]                                               ; load data from source
170%elif notcpuflag(avx)
171    movu              %4, [%3]                                               ; load data from source
172%elif %1 <= 8 || (%2 == 8 && %1 <= 16)
173    movdqu           %4, [%3]
174%else
175    movu              %4, [%3]
176%endif
177%endmacro
178
179
180%macro EPEL_FILTER 5 ; bit depth, filter index, xmma, xmmb, gprtmp
181%if cpuflag(avx2)
182%assign %%offset 32
183%ifdef PIC
184    lea              %5q, [hevc_epel_filters_avx2_%1]
185    %define FILTER %5q
186%else
187    %define FILTER hevc_epel_filters_avx2_%1
188%endif
189%else
190%assign %%offset 16
191%ifdef PIC
192    lea              %5q, [hevc_epel_filters_sse4_%1]
193    %define FILTER %5q
194%else
195    %define FILTER hevc_epel_filters_sse4_%1
196%endif
197%endif ;cpuflag(avx2)
    sub              %2q, 1
%if cpuflag(avx2)
    shl              %2q, 6                      ; multiply by 64
%else
    shl              %2q, 5                      ; multiply by 32
%endif
    mova              %3, [FILTER + %2q]              ; first two taps of the filter
    mova              %4, [FILTER + %2q+%%offset]     ; last two taps of the filter
206%endmacro
207
208%macro EPEL_HV_FILTER 1
209%if cpuflag(avx2)
210%assign %%offset 32
211%assign %%shift  6
212%define %%table  hevc_epel_filters_avx2_%1
213%else
214%assign %%offset 16
215%assign %%shift  5
216%define %%table  hevc_epel_filters_sse4_%1
217%endif
218
219%ifdef PIC
220    lea           r3srcq, [%%table]
221    %define FILTER r3srcq
222%else
223    %define FILTER %%table
224%endif
225    sub              mxq, 1
226    sub              myq, 1
    shl              mxq, %%shift                ; multiply by 32 (sse4) / 64 (avx2)
    shl              myq, %%shift                ; multiply by 32 (sse4) / 64 (avx2)
    mova             m14, [FILTER + mxq]              ; first two taps of the horizontal filter
    mova             m15, [FILTER + mxq+%%offset]     ; last two taps of the horizontal filter
231
232%if cpuflag(avx2)
233%define %%table  hevc_epel_filters_avx2_10
234%else
235%define %%table  hevc_epel_filters_sse4_10
236%endif
237%ifdef PIC
238    lea           r3srcq, [%%table]
239    %define FILTER r3srcq
240%else
241    %define FILTER %%table
242%endif
    mova             m12, [FILTER + myq]              ; first two taps of the vertical filter
    mova             m13, [FILTER + myq+%%offset]     ; last two taps of the vertical filter
245    lea           r3srcq, [srcstrideq*3]
246%endmacro
247
248%macro QPEL_FILTER 2
249
250%if cpuflag(avx2)
251%assign %%offset 32
252%assign %%shift  7
253%define %%table  hevc_qpel_filters_avx2_%1
254%else
255%assign %%offset 16
256%assign %%shift  6
257%define %%table  hevc_qpel_filters_sse4_%1
258%endif
259
260%ifdef PIC
261    lea         rfilterq, [%%table]
262%else
263    %define rfilterq %%table
264%endif
265    sub              %2q, 1
    shl              %2q, %%shift                        ; multiply by 64 (sse4) / 128 (avx2)
    mova             m12, [rfilterq + %2q]               ; taps 1-2 of the filter
    mova             m13, [rfilterq + %2q +   %%offset]  ; taps 3-4 of the filter
    mova             m14, [rfilterq + %2q + 2*%%offset]  ; taps 5-6 of the filter
    mova             m15, [rfilterq + %2q + 3*%%offset]  ; taps 7-8 of the filter
271%endmacro
272
273%macro EPEL_LOAD 4
274%if (%1 == 8 && %4 <= 4)
275%define %%load movd
276%elif (%1 == 8 && %4 <= 8) || (%1 > 8 && %4 <= 4)
277%define %%load movq
278%else
279%define %%load movdqu
280%endif
281
282    %%load            m0, [%2q ]
283%ifnum %3
284    %%load            m1, [%2q+  %3]
285    %%load            m2, [%2q+2*%3]
286    %%load            m3, [%2q+3*%3]
287%else
288    %%load            m1, [%2q+  %3q]
289    %%load            m2, [%2q+2*%3q]
290    %%load            m3, [%2q+r3srcq]
291%endif
292%if %1 == 8
293%if %4 > 8
294    SBUTTERFLY        bw, 0, 1, 7
295    SBUTTERFLY        bw, 2, 3, 7
296%else
297    punpcklbw         m0, m1
298    punpcklbw         m2, m3
299%endif
300%else
301%if %4 > 4
302    SBUTTERFLY        wd, 0, 1, 7
303    SBUTTERFLY        wd, 2, 3, 7
304%else
305    punpcklwd         m0, m1
306    punpcklwd         m2, m3
307%endif
308%endif
309%endmacro
310
311
312%macro QPEL_H_LOAD 4
313%assign %%stride (%1+7)/8
314%if %1 == 8
315%if %3 <= 4
316%define %%load movd
317%elif %3 == 8
318%define %%load movq
319%else
320%define %%load movu
321%endif
322%else
323%if %3 == 2
324%define %%load movd
325%elif %3 == 4
326%define %%load movq
327%else
328%define %%load movu
329%endif
330%endif
331    %%load            m0, [%2-3*%%stride]        ;load data from source
332    %%load            m1, [%2-2*%%stride]
333    %%load            m2, [%2-%%stride  ]
334    %%load            m3, [%2           ]
335    %%load            m4, [%2+%%stride  ]
336    %%load            m5, [%2+2*%%stride]
337    %%load            m6, [%2+3*%%stride]
338    %%load            m7, [%2+4*%%stride]
339
340%if %1 == 8
341%if %3 > 8
342    SBUTTERFLY        wd, 0, 1, %4
343    SBUTTERFLY        wd, 2, 3, %4
344    SBUTTERFLY        wd, 4, 5, %4
345    SBUTTERFLY        wd, 6, 7, %4
346%else
347    punpcklbw         m0, m1
348    punpcklbw         m2, m3
349    punpcklbw         m4, m5
350    punpcklbw         m6, m7
351%endif
352%else
353%if %3 > 4
354    SBUTTERFLY        dq, 0, 1, %4
355    SBUTTERFLY        dq, 2, 3, %4
356    SBUTTERFLY        dq, 4, 5, %4
357    SBUTTERFLY        dq, 6, 7, %4
358%else
359    punpcklwd         m0, m1
360    punpcklwd         m2, m3
361    punpcklwd         m4, m5
362    punpcklwd         m6, m7
363%endif
364%endif
365%endmacro
366
367%macro QPEL_V_LOAD 5
368    lea              %5q, [%2]
369    sub              %5q, r3srcq
370    movu              m0, [%5q            ]      ;load x- 3*srcstride
371    movu              m1, [%5q+   %3q     ]      ;load x- 2*srcstride
372    movu              m2, [%5q+ 2*%3q     ]      ;load x-srcstride
373    movu              m3, [%2       ]      ;load x
374    movu              m4, [%2+   %3q]      ;load x+stride
375    movu              m5, [%2+ 2*%3q]      ;load x+2*stride
376    movu              m6, [%2+r3srcq]      ;load x+3*stride
377    movu              m7, [%2+ 4*%3q]      ;load x+4*stride
378%if %1 == 8
379%if %4 > 8
380    SBUTTERFLY        bw, 0, 1, 8
381    SBUTTERFLY        bw, 2, 3, 8
382    SBUTTERFLY        bw, 4, 5, 8
383    SBUTTERFLY        bw, 6, 7, 8
384%else
385    punpcklbw         m0, m1
386    punpcklbw         m2, m3
387    punpcklbw         m4, m5
388    punpcklbw         m6, m7
389%endif
390%else
391%if %4 > 4
392    SBUTTERFLY        wd, 0, 1, 8
393    SBUTTERFLY        wd, 2, 3, 8
394    SBUTTERFLY        wd, 4, 5, 8
395    SBUTTERFLY        wd, 6, 7, 8
396%else
397    punpcklwd         m0, m1
398    punpcklwd         m2, m3
399    punpcklwd         m4, m5
400    punpcklwd         m6, m7
401%endif
402%endif
403%endmacro
404
405%macro PEL_12STORE2 3
406    movd           [%1], %2
407%endmacro
408%macro PEL_12STORE4 3
409    movq           [%1], %2
410%endmacro
411%macro PEL_12STORE6 3
412    movq           [%1], %2
413    psrldq            %2, 8
414    movd         [%1+8], %2
415%endmacro
416%macro PEL_12STORE8 3
417    movdqa         [%1], %2
418%endmacro
419%macro PEL_12STORE12 3
420    movdqa         [%1], %2
421    movq        [%1+16], %3
422%endmacro
423%macro PEL_12STORE16 3
424    PEL_12STORE8      %1, %2, %3
425    movdqa       [%1+16], %3
426%endmacro
427
428%macro PEL_10STORE2 3
429    movd           [%1], %2
430%endmacro
431%macro PEL_10STORE4 3
432    movq           [%1], %2
433%endmacro
434%macro PEL_10STORE6 3
435    movq           [%1], %2
436    psrldq            %2, 8
437    movd         [%1+8], %2
438%endmacro
439%macro PEL_10STORE8 3
440    movdqa         [%1], %2
441%endmacro
442%macro PEL_10STORE12 3
443    movdqa         [%1], %2
444    movq        [%1+16], %3
445%endmacro
446%macro PEL_10STORE16 3
447%if cpuflag(avx2)
448    movu            [%1], %2
449%else
450    PEL_10STORE8      %1, %2, %3
451    movdqa       [%1+16], %3
452%endif
453%endmacro
454
455%macro PEL_10STORE32 3
456    PEL_10STORE16     %1, %2, %3
457    movu         [%1+32], %3
458%endmacro
459
460%macro PEL_8STORE2 3
461    pextrw          [%1], %2, 0
462%endmacro
463%macro PEL_8STORE4 3
464    movd            [%1], %2
465%endmacro
466%macro PEL_8STORE6 3
467    movd            [%1], %2
468    pextrw        [%1+4], %2, 2
469%endmacro
470%macro PEL_8STORE8 3
471    movq           [%1], %2
472%endmacro
473%macro PEL_8STORE12 3
474    movq            [%1], %2
475    psrldq            %2, 8
476    movd          [%1+8], %2
477%endmacro
478%macro PEL_8STORE16 3
479%if cpuflag(avx2)
480    movdqu        [%1], %2
481%else
482    mova          [%1], %2
%endif ; avx2
484%endmacro
485%macro PEL_8STORE32 3
486    movu          [%1], %2
487%endmacro
488
489%macro LOOP_END 3
490    add              %1q, 2*MAX_PB_SIZE          ; dst += dststride
491    add              %2q, %3q                    ; src += srcstride
492    dec          heightd                         ; cmp height
493    jnz               .loop                      ; height loop
494%endmacro
495
496
497%macro MC_PIXEL_COMPUTE 2-3 ;width, bitdepth
498%if %2 == 8
499%if cpuflag(avx2) && %0 ==3
500%if %1 > 16
501    vextracti128 xm1, m0, 1
502    pmovzxbw      m1, xm1
503    psllw         m1, 14-%2
504%endif
505    pmovzxbw      m0, xm0
%else ; not avx2
507%if %1 > 8
508    punpckhbw     m1, m0, m2
509    psllw         m1, 14-%2
510%endif
511    punpcklbw     m0, m2
512%endif
%endif ; avx2
514    psllw         m0, 14-%2
515%endmacro
516
517%macro EPEL_COMPUTE 4-8 ; bitdepth, width, filter1, filter2, HV/m0, m2, m1, m3
518%if %0 == 8
519%define %%reg0 %5
520%define %%reg2 %6
521%define %%reg1 %7
522%define %%reg3 %8
523%else
524%define %%reg0 m0
525%define %%reg2 m2
526%define %%reg1 m1
527%define %%reg3 m3
528%endif
529%if %1 == 8
530%if cpuflag(avx2) && (%0 == 5)
531%if %2 > 16
532    vperm2i128    m10, m0, m1, q0301
533%endif
534    vinserti128    m0, m0, xm1, 1
535    mova           m1, m10
536%if %2 > 16
537    vperm2i128    m10, m2, m3, q0301
538%endif
539    vinserti128    m2, m2, xm3, 1
540    mova           m3, m10
541%endif
542    pmaddubsw      %%reg0, %3   ;x1*c1+x2*c2
543    pmaddubsw      %%reg2, %4   ;x3*c3+x4*c4
544    paddw          %%reg0, %%reg2
545%if %2 > 8
546    pmaddubsw      %%reg1, %3
547    pmaddubsw      %%reg3, %4
548    paddw          %%reg1, %%reg3
549%endif
550%else
551    pmaddwd        %%reg0, %3
552    pmaddwd        %%reg2, %4
553    paddd          %%reg0, %%reg2
554%if %2 > 4
555    pmaddwd        %%reg1, %3
556    pmaddwd        %%reg3, %4
557    paddd          %%reg1, %%reg3
558%if %1 != 8
559    psrad          %%reg1, %1-8
560%endif
561%endif
562%if %1 != 8
563    psrad          %%reg0, %1-8
564%endif
565    packssdw       %%reg0, %%reg1
566%endif
567%endmacro
568
569%macro QPEL_HV_COMPUTE 4     ; width, bitdepth, filter idx
570
571%if cpuflag(avx2)
572%assign %%offset 32
573%define %%table  hevc_qpel_filters_avx2_%2
574%else
575%assign %%offset 16
576%define %%table  hevc_qpel_filters_sse4_%2
577%endif
578
579%ifdef PIC
580    lea         rfilterq, [%%table]
581%else
582    %define rfilterq %%table
583%endif
584
585%if %2 == 8
586    pmaddubsw         m0, [rfilterq + %3q*8   ]   ;x1*c1+x2*c2
587    pmaddubsw         m2, [rfilterq + %3q*8+%%offset]   ;x3*c3+x4*c4
588    pmaddubsw         m4, [rfilterq + %3q*8+2*%%offset]   ;x5*c5+x6*c6
589    pmaddubsw         m6, [rfilterq + %3q*8+3*%%offset]   ;x7*c7+x8*c8
590    paddw             m0, m2
591    paddw             m4, m6
592    paddw             m0, m4
593%else
594    pmaddwd           m0, [rfilterq + %3q*8   ]
595    pmaddwd           m2, [rfilterq + %3q*8+%%offset]
596    pmaddwd           m4, [rfilterq + %3q*8+2*%%offset]
597    pmaddwd           m6, [rfilterq + %3q*8+3*%%offset]
598    paddd             m0, m2
599    paddd             m4, m6
600    paddd             m0, m4
601%if %2 != 8
602    psrad             m0, %2-8
603%endif
604%if %1 > 4
605    pmaddwd           m1, [rfilterq + %3q*8   ]
606    pmaddwd           m3, [rfilterq + %3q*8+%%offset]
607    pmaddwd           m5, [rfilterq + %3q*8+2*%%offset]
608    pmaddwd           m7, [rfilterq + %3q*8+3*%%offset]
609    paddd             m1, m3
610    paddd             m5, m7
611    paddd             m1, m5
612%if %2 != 8
613    psrad             m1, %2-8
614%endif
615%endif
616    p%4               m0, m1
617%endif
618%endmacro
619
620%macro QPEL_COMPUTE 2-3     ; width, bitdepth
621%if %2 == 8
622%if cpuflag(avx2) && (%0 == 3)
623
624    vperm2i128 m10, m0,  m1, q0301
625    vinserti128 m0, m0, xm1, 1
626    SWAP 1, 10
627
628    vperm2i128 m10, m2,  m3, q0301
629    vinserti128 m2, m2, xm3, 1
630    SWAP 3, 10
631
632
633    vperm2i128 m10, m4,  m5, q0301
634    vinserti128 m4, m4, xm5, 1
635    SWAP 5, 10
636
637    vperm2i128 m10, m6,  m7, q0301
638    vinserti128 m6, m6, xm7, 1
639    SWAP 7, 10
640%endif
641
642    pmaddubsw         m0, m12   ;x1*c1+x2*c2
643    pmaddubsw         m2, m13   ;x3*c3+x4*c4
644    pmaddubsw         m4, m14   ;x5*c5+x6*c6
645    pmaddubsw         m6, m15   ;x7*c7+x8*c8
646    paddw             m0, m2
647    paddw             m4, m6
648    paddw             m0, m4
649%if %1 > 8
650    pmaddubsw         m1, m12
651    pmaddubsw         m3, m13
652    pmaddubsw         m5, m14
653    pmaddubsw         m7, m15
654    paddw             m1, m3
655    paddw             m5, m7
656    paddw             m1, m5
657%endif
658%else
659    pmaddwd           m0, m12
660    pmaddwd           m2, m13
661    pmaddwd           m4, m14
662    pmaddwd           m6, m15
663    paddd             m0, m2
664    paddd             m4, m6
665    paddd             m0, m4
666%if %2 != 8
667    psrad             m0, %2-8
668%endif
669%if %1 > 4
670    pmaddwd           m1, m12
671    pmaddwd           m3, m13
672    pmaddwd           m5, m14
673    pmaddwd           m7, m15
674    paddd             m1, m3
675    paddd             m5, m7
676    paddd             m1, m5
677%if %2 != 8
678    psrad             m1, %2-8
679%endif
680%endif
681%endif
682%endmacro
683
684%macro BI_COMPUTE 7-8     ; width, bitd, src1l, src1h, scr2l, scr2h, pw
685    paddsw            %3, %5
686%if %1 > 8
687    paddsw            %4, %6
688%endif
689    UNI_COMPUTE       %1, %2, %3, %4, %7
690%if %0 == 8 && cpuflag(avx2) && (%2 == 8)
691    vpermq            %3, %3, 216
692    vpermq            %4, %4, 216
693%endif
694%endmacro
695
696%macro UNI_COMPUTE 5
697    pmulhrsw          %3, %5
698%if %1 > 8 || (%2 > 8 && %1 > 4)
699    pmulhrsw          %4, %5
700%endif
701%if %2 == 8
702    packuswb          %3, %4
703%else
704    CLIPW             %3, [pb_0], [max_pixels_%2]
705%if (%1 > 8 && notcpuflag(avx)) || %1 > 16
706    CLIPW             %4, [pb_0], [max_pixels_%2]
707%endif
708%endif
709%endmacro
710
711
712; ******************************
; void put_hevc_pel_pixels(int16_t *dst, ptrdiff_t dststride,
714;                         uint8_t *_src, ptrdiff_t _srcstride,
715;                         int height, int mx, int my)
716; ******************************
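; pel_pixels simply converts samples to the 14-bit intermediate format,
; roughly dst[x] = src[x] << (14 - bitdepth); the uni variant copies pixels
; through unchanged and the bi variant combines with the 14-bit src2 buffer
; and rounds back to pixel range.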
717
718%macro HEVC_PUT_HEVC_PEL_PIXELS 2
719HEVC_PEL_PIXELS     %1, %2
720HEVC_UNI_PEL_PIXELS %1, %2
721HEVC_BI_PEL_PIXELS  %1, %2
722%endmacro
723
724%macro HEVC_PEL_PIXELS 2
cglobal hevc_put_hevc_pel_pixels%1_%2, 4, 4, 3, dst, src, srcstride, height
726    pxor               m2, m2
727.loop:
728    SIMPLE_LOAD       %1, %2, srcq, m0
729    MC_PIXEL_COMPUTE  %1, %2, 1
730    PEL_10STORE%1     dstq, m0, m1
731    LOOP_END         dst, src, srcstride
732    RET
%endmacro
734
735%macro HEVC_UNI_PEL_PIXELS 2
cglobal hevc_put_hevc_uni_pel_pixels%1_%2, 5, 5, 2, dst, dststride, src, srcstride, height
737.loop:
738    SIMPLE_LOAD       %1, %2, srcq, m0
739    PEL_%2STORE%1   dstq, m0, m1
740    add             dstq, dststrideq             ; dst += dststride
741    add             srcq, srcstrideq             ; src += srcstride
742    dec          heightd                         ; cmp height
743    jnz               .loop                      ; height loop
744    RET
745%endmacro
746
747%macro HEVC_BI_PEL_PIXELS 2
748cglobal hevc_put_hevc_bi_pel_pixels%1_%2, 6, 6, 6, dst, dststride, src, srcstride, src2, height
749    pxor              m2, m2
750    movdqa            m5, [pw_bi_%2]
751.loop:
752    SIMPLE_LOAD       %1, %2, srcq, m0
753    SIMPLE_BILOAD     %1, src2q, m3, m4
754    MC_PIXEL_COMPUTE  %1, %2, 1
755    BI_COMPUTE        %1, %2, m0, m1, m3, m4, m5, 1
756    PEL_%2STORE%1   dstq, m0, m1
757    add             dstq, dststrideq             ; dst += dststride
758    add             srcq, srcstrideq             ; src += srcstride
    add            src2q, 2*MAX_PB_SIZE          ; src2 += src2stride
760    dec          heightd                         ; cmp height
761    jnz               .loop                      ; height loop
762    RET
763%endmacro
764
765
766; ******************************
767; void put_hevc_epel_hX(int16_t *dst, ptrdiff_t dststride,
768;                       uint8_t *_src, ptrdiff_t _srcstride,
769;                       int height, int mx, int my, int width);
770; ******************************
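; The h variants apply the 4-tap chroma filter horizontally: pixels are loaded
; starting one sample before src, interleaved in pairs and reduced with two
; pmadd instructions; results stay in the 14-bit intermediate format, while the
; uni/bi variants additionally round and clip back to pixel range.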
771
772
773%macro HEVC_PUT_HEVC_EPEL 2
774%if cpuflag(avx2)
775%define XMM_REGS  11
776%else
777%define XMM_REGS  8
778%endif
779
780cglobal hevc_put_hevc_epel_h%1_%2, 5, 6, XMM_REGS, dst, src, srcstride, height, mx, rfilter
781%assign %%stride ((%2 + 7)/8)
782    EPEL_FILTER       %2, mx, m4, m5, rfilter
783.loop:
784    EPEL_LOAD         %2, srcq-%%stride, %%stride, %1
785    EPEL_COMPUTE      %2, %1, m4, m5, 1
786    PEL_10STORE%1      dstq, m0, m1
787    LOOP_END         dst, src, srcstride
788    RET
789
790cglobal hevc_put_hevc_uni_epel_h%1_%2, 6, 7, XMM_REGS, dst, dststride, src, srcstride, height, mx, rfilter
791%assign %%stride ((%2 + 7)/8)
792    movdqa            m6, [pw_%2]
793    EPEL_FILTER       %2, mx, m4, m5, rfilter
794.loop:
795    EPEL_LOAD         %2, srcq-%%stride, %%stride, %1
796    EPEL_COMPUTE      %2, %1, m4, m5
797    UNI_COMPUTE       %1, %2, m0, m1, m6
798    PEL_%2STORE%1   dstq, m0, m1
799    add             dstq, dststrideq             ; dst += dststride
800    add             srcq, srcstrideq             ; src += srcstride
801    dec          heightd                         ; cmp height
802    jnz               .loop                      ; height loop
803    RET
804
805cglobal hevc_put_hevc_bi_epel_h%1_%2, 7, 8, XMM_REGS, dst, dststride, src, srcstride, src2, height, mx, rfilter
806    movdqa            m6, [pw_bi_%2]
807    EPEL_FILTER       %2, mx, m4, m5, rfilter
808.loop:
809    EPEL_LOAD         %2, srcq-%%stride, %%stride, %1
810    EPEL_COMPUTE      %2, %1, m4, m5, 1
811    SIMPLE_BILOAD     %1, src2q, m2, m3
812    BI_COMPUTE        %1, %2, m0, m1, m2, m3, m6, 1
813    PEL_%2STORE%1   dstq, m0, m1
814    add             dstq, dststrideq             ; dst += dststride
815    add             srcq, srcstrideq             ; src += srcstride
    add            src2q, 2*MAX_PB_SIZE          ; src2 += src2stride
817    dec          heightd                         ; cmp height
818    jnz               .loop                      ; height loop
819    RET
820
821; ******************************
822; void put_hevc_epel_v(int16_t *dst, ptrdiff_t dststride,
823;                      uint8_t *_src, ptrdiff_t _srcstride,
824;                      int height, int mx, int my, int width)
825; ******************************
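; The v variants do the same 4-tap filtering vertically, loading the four rows
; from src - srcstride to src + 2*srcstride (r3src holds 3*srcstride).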
826
827cglobal hevc_put_hevc_epel_v%1_%2, 4, 6, XMM_REGS, dst, src, srcstride, height, r3src, my
828    movifnidn        myd, mym
829    sub             srcq, srcstrideq
830    EPEL_FILTER       %2, my, m4, m5, r3src
831    lea           r3srcq, [srcstrideq*3]
832.loop:
833    EPEL_LOAD         %2, srcq, srcstride, %1
834    EPEL_COMPUTE      %2, %1, m4, m5, 1
835    PEL_10STORE%1     dstq, m0, m1
836    LOOP_END          dst, src, srcstride
837    RET
838
839cglobal hevc_put_hevc_uni_epel_v%1_%2, 5, 7, XMM_REGS, dst, dststride, src, srcstride, height, r3src, my
840    movifnidn        myd, mym
841    movdqa            m6, [pw_%2]
842    sub             srcq, srcstrideq
843    EPEL_FILTER       %2, my, m4, m5, r3src
844    lea           r3srcq, [srcstrideq*3]
845.loop:
846    EPEL_LOAD         %2, srcq, srcstride, %1
847    EPEL_COMPUTE      %2, %1, m4, m5
848    UNI_COMPUTE       %1, %2, m0, m1, m6
849    PEL_%2STORE%1   dstq, m0, m1
850    add             dstq, dststrideq             ; dst += dststride
851    add             srcq, srcstrideq             ; src += srcstride
852    dec          heightd                         ; cmp height
853    jnz               .loop                      ; height loop
854    RET
855
856
857cglobal hevc_put_hevc_bi_epel_v%1_%2, 6, 8, XMM_REGS, dst, dststride, src, srcstride, src2, height, r3src, my
858    movifnidn        myd, mym
859    movdqa            m6, [pw_bi_%2]
860    sub             srcq, srcstrideq
861    EPEL_FILTER       %2, my, m4, m5, r3src
862    lea           r3srcq, [srcstrideq*3]
863.loop:
864    EPEL_LOAD         %2, srcq, srcstride, %1
865    EPEL_COMPUTE      %2, %1, m4, m5, 1
866    SIMPLE_BILOAD     %1, src2q, m2, m3
867    BI_COMPUTE        %1, %2, m0, m1, m2, m3, m6, 1
868    PEL_%2STORE%1   dstq, m0, m1
869    add             dstq, dststrideq             ; dst += dststride
870    add             srcq, srcstrideq             ; src += srcstride
    add            src2q, 2*MAX_PB_SIZE          ; src2 += src2stride
872    dec          heightd                         ; cmp height
873    jnz               .loop                      ; height loop
874    RET
875%endmacro
876
877
878; ******************************
879; void put_hevc_epel_hv(int16_t *dst, ptrdiff_t dststride,
880;                       uint8_t *_src, ptrdiff_t _srcstride,
881;                       int height, int mx, int my, int width)
882; ******************************
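; The hv variants run the horizontal 4-tap pass first, keep the last four
; filtered rows live in m4..m7 (with m8..m11 holding the high halves for the
; wide 8-bit cases), then apply the vertical 4-tap pass on those rows via
; EPEL_COMPUTE at 14-bit precision.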
883
884%macro HEVC_PUT_HEVC_EPEL_HV 2
885cglobal hevc_put_hevc_epel_hv%1_%2, 6, 7, 16 , dst, src, srcstride, height, mx, my, r3src
886%assign %%stride ((%2 + 7)/8)
887    sub             srcq, srcstrideq
888    EPEL_HV_FILTER    %2
889    EPEL_LOAD         %2, srcq-%%stride, %%stride, %1
890    EPEL_COMPUTE      %2, %1, m14, m15
891%if (%1 > 8 && (%2 == 8))
892    SWAP              m8, m1
893%endif
894    SWAP              m4, m0
895    add             srcq, srcstrideq
896    EPEL_LOAD         %2, srcq-%%stride, %%stride, %1
897    EPEL_COMPUTE      %2, %1, m14, m15
898%if (%1 > 8 && (%2 == 8))
899    SWAP              m9, m1
900%endif
901    SWAP              m5, m0
902    add             srcq, srcstrideq
903    EPEL_LOAD         %2, srcq-%%stride, %%stride, %1
904    EPEL_COMPUTE      %2, %1, m14, m15
905%if (%1 > 8 && (%2 == 8))
906    SWAP             m10, m1
907%endif
908    SWAP              m6, m0
909    add             srcq, srcstrideq
910.loop:
911    EPEL_LOAD         %2, srcq-%%stride, %%stride, %1
912    EPEL_COMPUTE      %2, %1, m14, m15
913%if (%1 > 8 && (%2 == 8))
914    SWAP             m11, m1
915%endif
916    SWAP              m7, m0
917    punpcklwd         m0, m4, m5
918    punpcklwd         m2, m6, m7
919%if %1 > 4
920    punpckhwd         m1, m4, m5
921    punpckhwd         m3, m6, m7
922%endif
923    EPEL_COMPUTE      14, %1, m12, m13
924%if (%1 > 8 && (%2 == 8))
925    punpcklwd         m4, m8, m9
926    punpcklwd         m2, m10, m11
927    punpckhwd         m8, m8, m9
928    punpckhwd         m3, m10, m11
929    EPEL_COMPUTE      14, %1, m12, m13, m4, m2, m8, m3
930%if cpuflag(avx2)
931    vinserti128       m2, m0, xm4, 1
932    vperm2i128        m3, m0, m4, q0301
933    PEL_10STORE%1     dstq, m2, m3
934%else
935    PEL_10STORE%1     dstq, m0, m4
936%endif
937%else
938    PEL_10STORE%1     dstq, m0, m1
939%endif
940    movdqa            m4, m5
941    movdqa            m5, m6
942    movdqa            m6, m7
943%if (%1 > 8 && (%2 == 8))
944    mova              m8, m9
945    mova              m9, m10
946    mova             m10, m11
947%endif
948    LOOP_END         dst, src, srcstride
949    RET
950
951cglobal hevc_put_hevc_uni_epel_hv%1_%2, 7, 8, 16 , dst, dststride, src, srcstride, height, mx, my, r3src
952%assign %%stride ((%2 + 7)/8)
953    sub             srcq, srcstrideq
954    EPEL_HV_FILTER    %2
955    EPEL_LOAD         %2, srcq-%%stride, %%stride, %1
956    EPEL_COMPUTE      %2, %1, m14, m15
957%if (%1 > 8 && (%2 == 8))
958    SWAP              m8, m1
959%endif
960    SWAP              m4, m0
961    add             srcq, srcstrideq
962    EPEL_LOAD         %2, srcq-%%stride, %%stride, %1
963    EPEL_COMPUTE      %2, %1, m14, m15
964%if (%1 > 8 && (%2 == 8))
965    SWAP              m9, m1
966%endif
967    SWAP              m5, m0
968    add             srcq, srcstrideq
969    EPEL_LOAD         %2, srcq-%%stride, %%stride, %1
970    EPEL_COMPUTE      %2, %1, m14, m15
971%if (%1 > 8 && (%2 == 8))
972    SWAP             m10, m1
973%endif
974    SWAP              m6, m0
975    add             srcq, srcstrideq
976.loop:
977    EPEL_LOAD         %2, srcq-%%stride, %%stride, %1
978    EPEL_COMPUTE      %2, %1, m14, m15
979%if (%1 > 8 && (%2 == 8))
980    SWAP             m11, m1
981%endif
982    mova              m7, m0
983    punpcklwd         m0, m4, m5
984    punpcklwd         m2, m6, m7
985%if %1 > 4
986    punpckhwd         m1, m4, m5
987    punpckhwd         m3, m6, m7
988%endif
989    EPEL_COMPUTE      14, %1, m12, m13
990%if (%1 > 8 && (%2 == 8))
991    punpcklwd         m4, m8, m9
992    punpcklwd         m2, m10, m11
993    punpckhwd         m8, m8, m9
994    punpckhwd         m3, m10, m11
995    EPEL_COMPUTE      14, %1, m12, m13, m4, m2, m8, m3
996    UNI_COMPUTE       %1, %2, m0, m4, [pw_%2]
997%else
998    UNI_COMPUTE       %1, %2, m0, m1, [pw_%2]
999%endif
1000    PEL_%2STORE%1   dstq, m0, m1
1001    mova              m4, m5
1002    mova              m5, m6
1003    mova              m6, m7
1004%if (%1 > 8 && (%2 == 8))
1005    mova              m8, m9
1006    mova              m9, m10
1007    mova             m10, m11
1008%endif
1009    add             dstq, dststrideq             ; dst += dststride
1010    add             srcq, srcstrideq             ; src += srcstride
1011    dec          heightd                         ; cmp height
1012    jnz               .loop                      ; height loop
1013    RET
1014
1015cglobal hevc_put_hevc_bi_epel_hv%1_%2, 8, 9, 16, dst, dststride, src, srcstride, src2, height, mx, my, r3src
1016%assign %%stride ((%2 + 7)/8)
1017    sub             srcq, srcstrideq
1018    EPEL_HV_FILTER    %2
1019    EPEL_LOAD         %2, srcq-%%stride, %%stride, %1
1020    EPEL_COMPUTE      %2, %1, m14, m15
1021%if (%1 > 8 && (%2 == 8))
1022    SWAP              m8, m1
1023%endif
1024    SWAP              m4, m0
1025    add             srcq, srcstrideq
1026    EPEL_LOAD         %2, srcq-%%stride, %%stride, %1
1027    EPEL_COMPUTE      %2, %1, m14, m15
1028%if (%1 > 8 && (%2 == 8))
1029    SWAP              m9, m1
1030%endif
1031    SWAP              m5, m0
1032    add             srcq, srcstrideq
1033    EPEL_LOAD         %2, srcq-%%stride, %%stride, %1
1034    EPEL_COMPUTE      %2, %1, m14, m15
1035%if (%1 > 8 && (%2 == 8))
1036    SWAP             m10, m1
1037%endif
1038    SWAP              m6, m0
1039    add             srcq, srcstrideq
1040.loop:
1041    EPEL_LOAD         %2, srcq-%%stride, %%stride, %1
1042    EPEL_COMPUTE      %2, %1, m14, m15
1043%if (%1 > 8 && (%2 == 8))
1044    SWAP             m11, m1
1045%endif
1046    SWAP              m7, m0
1047    punpcklwd         m0, m4, m5
1048    punpcklwd         m2, m6, m7
1049%if %1 > 4
1050    punpckhwd         m1, m4, m5
1051    punpckhwd         m3, m6, m7
1052%endif
1053    EPEL_COMPUTE      14, %1, m12, m13
1054%if (%1 > 8 && (%2 == 8))
1055    punpcklwd         m4, m8, m9
1056    punpcklwd         m2, m10, m11
1057    punpckhwd         m8, m8, m9
1058    punpckhwd         m3, m10, m11
1059    EPEL_COMPUTE      14, %1, m12, m13, m4, m2, m8, m3
1060    SIMPLE_BILOAD     %1, src2q, m8, m3
1061%if cpuflag(avx2)
1062    vinserti128       m1, m8, xm3, 1
1063    vperm2i128        m2, m8, m3, q0301
1064    BI_COMPUTE        %1, %2, m0, m4, m1, m2, [pw_bi_%2]
1065%else
1066    BI_COMPUTE        %1, %2, m0, m4, m8, m3, [pw_bi_%2]
1067%endif
1068%else
1069    SIMPLE_BILOAD     %1, src2q, m8, m9
1070    BI_COMPUTE        %1, %2, m0, m1, m8, m9, [pw_bi_%2]
1071%endif
1072    PEL_%2STORE%1   dstq, m0, m4
1073    mova              m4, m5
1074    mova              m5, m6
1075    mova              m6, m7
1076%if (%1 > 8 && (%2 == 8))
1077    mova              m8, m9
1078    mova              m9, m10
1079    mova             m10, m11
1080%endif
1081    add             dstq, dststrideq             ; dst += dststride
1082    add             srcq, srcstrideq             ; src += srcstride
    add            src2q, 2*MAX_PB_SIZE          ; src2 += src2stride
1084    dec          heightd                         ; cmp height
1085    jnz               .loop                      ; height loop
1086    RET
1087%endmacro
1088
1089; ******************************
1090; void put_hevc_qpel_hX_X_X(int16_t *dst, ptrdiff_t dststride,
1091;                       uint8_t *_src, ptrdiff_t _srcstride,
1092;                       int height, int mx, int my, int width)
1093; ******************************
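; The qpel h variants apply the 8-tap luma filter horizontally; for bit depths
; above 8 the pmaddwd results are dwords, so the two halves are packed back to
; words with packssdw before storing.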
1094
1095%macro HEVC_PUT_HEVC_QPEL 2
1096cglobal hevc_put_hevc_qpel_h%1_%2, 5, 6, 16, dst, src, srcstride, height, mx, rfilter
1097    QPEL_FILTER       %2, mx
1098.loop:
1099    QPEL_H_LOAD       %2, srcq, %1, 10
1100    QPEL_COMPUTE      %1, %2, 1
1101%if %2 > 8
1102    packssdw          m0, m1
1103%endif
1104    PEL_10STORE%1     dstq, m0, m1
1105    LOOP_END          dst, src, srcstride
1106    RET
1107
1108cglobal hevc_put_hevc_uni_qpel_h%1_%2, 6, 7, 16 , dst, dststride, src, srcstride, height, mx, rfilter
1109    mova              m9, [pw_%2]
1110    QPEL_FILTER       %2, mx
1111.loop:
1112    QPEL_H_LOAD       %2, srcq, %1, 10
1113    QPEL_COMPUTE      %1, %2
1114%if %2 > 8
1115    packssdw          m0, m1
1116%endif
1117    UNI_COMPUTE       %1, %2, m0, m1, m9
1118    PEL_%2STORE%1   dstq, m0, m1
1119    add             dstq, dststrideq             ; dst += dststride
1120    add             srcq, srcstrideq             ; src += srcstride
1121    dec          heightd                         ; cmp height
1122    jnz               .loop                      ; height loop
1123    RET
1124
1125cglobal hevc_put_hevc_bi_qpel_h%1_%2, 7, 8, 16 , dst, dststride, src, srcstride, src2, height, mx, rfilter
1126    movdqa            m9, [pw_bi_%2]
1127    QPEL_FILTER       %2, mx
1128.loop:
1129    QPEL_H_LOAD       %2, srcq, %1, 10
1130    QPEL_COMPUTE      %1, %2, 1
1131%if %2 > 8
1132    packssdw          m0, m1
1133%endif
1134    SIMPLE_BILOAD     %1, src2q, m10, m11
1135    BI_COMPUTE        %1, %2, m0, m1, m10, m11, m9, 1
1136    PEL_%2STORE%1   dstq, m0, m1
1137    add             dstq, dststrideq             ; dst += dststride
1138    add             srcq, srcstrideq             ; src += srcstride
    add            src2q, 2*MAX_PB_SIZE          ; src2 += src2stride
1140    dec          heightd                         ; cmp height
1141    jnz               .loop                      ; height loop
1142    RET
1143
1144
1145; ******************************
1146; void put_hevc_qpel_vX_X_X(int16_t *dst, ptrdiff_t dststride,
1147;                       uint8_t *_src, ptrdiff_t _srcstride,
1148;                       int height, int mx, int my, int width)
1149; ******************************
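; Same 8-tap filtering applied vertically; QPEL_V_LOAD fetches the eight rows
; from src - 3*srcstride to src + 4*srcstride using srcstride and
; r3src = 3*srcstride.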
1150
1151cglobal hevc_put_hevc_qpel_v%1_%2, 4, 8, 16, dst, src, srcstride, height, r3src, my, rfilter
1152    movifnidn        myd, mym
1153    lea           r3srcq, [srcstrideq*3]
1154    QPEL_FILTER       %2, my
1155.loop:
1156    QPEL_V_LOAD       %2, srcq, srcstride, %1, r7
1157    QPEL_COMPUTE      %1, %2, 1
1158%if %2 > 8
1159    packssdw          m0, m1
1160%endif
1161    PEL_10STORE%1     dstq, m0, m1
1162    LOOP_END         dst, src, srcstride
1163    RET
1164
1165cglobal hevc_put_hevc_uni_qpel_v%1_%2, 5, 9, 16, dst, dststride, src, srcstride, height, r3src, my, rfilter
1166    movifnidn        myd, mym
1167    movdqa            m9, [pw_%2]
1168    lea           r3srcq, [srcstrideq*3]
1169    QPEL_FILTER       %2, my
1170.loop:
1171    QPEL_V_LOAD       %2, srcq, srcstride, %1, r8
1172    QPEL_COMPUTE      %1, %2
1173%if %2 > 8
1174    packssdw          m0, m1
1175%endif
1176    UNI_COMPUTE       %1, %2, m0, m1, m9
1177    PEL_%2STORE%1   dstq, m0, m1
1178    add             dstq, dststrideq             ; dst += dststride
1179    add             srcq, srcstrideq             ; src += srcstride
1180    dec          heightd                         ; cmp height
1181    jnz               .loop                      ; height loop
1182    RET
1183
1184cglobal hevc_put_hevc_bi_qpel_v%1_%2, 6, 10, 16, dst, dststride, src, srcstride, src2, height, r3src, my, rfilter
1185    movifnidn        myd, mym
1186    movdqa            m9, [pw_bi_%2]
1187    lea           r3srcq, [srcstrideq*3]
1188    QPEL_FILTER       %2, my
1189.loop:
1190    QPEL_V_LOAD       %2, srcq, srcstride, %1, r9
1191    QPEL_COMPUTE      %1, %2, 1
1192%if %2 > 8
1193    packssdw          m0, m1
1194%endif
1195    SIMPLE_BILOAD     %1, src2q, m10, m11
1196    BI_COMPUTE        %1, %2, m0, m1, m10, m11, m9, 1
1197    PEL_%2STORE%1   dstq, m0, m1
1198    add             dstq, dststrideq             ; dst += dststride
1199    add             srcq, srcstrideq             ; src += srcstride
    add            src2q, 2*MAX_PB_SIZE          ; src2 += src2stride
1201    dec          heightd                         ; cmp height
1202    jnz               .loop                      ; height loop
1203    RET
1204%endmacro
1205
1206
1207; ******************************
1208; void put_hevc_qpel_hvX_X(int16_t *dst, ptrdiff_t dststride,
1209;                       uint8_t *_src, ptrdiff_t _srcstride,
1210;                       int height, int mx, int my)
1211; ******************************
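; The qpel hv variants keep the previous seven horizontally filtered rows in
; m8..m14, compute the eighth into m15 each iteration, and run the vertical
; 8-tap pass on them via QPEL_HV_COMPUTE at 14-bit precision.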
1212%macro HEVC_PUT_HEVC_QPEL_HV 2
1213cglobal hevc_put_hevc_qpel_hv%1_%2, 6, 8, 16, dst, src, srcstride, height, mx, my, r3src, rfilter
1214%if cpuflag(avx2)
1215%assign %%shift  4
1216%else
1217%assign %%shift  3
1218%endif
1219    sub              mxq, 1
1220    sub              myq, 1
    shl              mxq, %%shift                ; multiply by 8 (sse4) / 16 (avx2)
    shl              myq, %%shift                ; multiply by 8 (sse4) / 16 (avx2)
1223    lea           r3srcq, [srcstrideq*3]
1224    sub             srcq, r3srcq
1225    QPEL_H_LOAD       %2, srcq, %1, 15
1226    QPEL_HV_COMPUTE   %1, %2, mx, ackssdw
1227    SWAP              m8, m0
1228    add             srcq, srcstrideq
1229    QPEL_H_LOAD       %2, srcq, %1, 15
1230    QPEL_HV_COMPUTE   %1, %2, mx, ackssdw
1231    SWAP              m9, m0
1232    add             srcq, srcstrideq
1233    QPEL_H_LOAD       %2, srcq, %1, 15
1234    QPEL_HV_COMPUTE   %1, %2, mx, ackssdw
1235    SWAP             m10, m0
1236    add             srcq, srcstrideq
1237    QPEL_H_LOAD       %2, srcq, %1, 15
1238    QPEL_HV_COMPUTE   %1, %2, mx, ackssdw
1239    SWAP             m11, m0
1240    add             srcq, srcstrideq
1241    QPEL_H_LOAD       %2, srcq, %1, 15
1242    QPEL_HV_COMPUTE   %1, %2, mx, ackssdw
1243    SWAP             m12, m0
1244    add             srcq, srcstrideq
1245    QPEL_H_LOAD       %2, srcq, %1, 15
1246    QPEL_HV_COMPUTE   %1, %2, mx, ackssdw
1247    SWAP             m13, m0
1248    add             srcq, srcstrideq
1249    QPEL_H_LOAD       %2, srcq, %1, 15
1250    QPEL_HV_COMPUTE   %1, %2, mx, ackssdw
1251    SWAP             m14, m0
1252    add             srcq, srcstrideq
1253.loop:
1254    QPEL_H_LOAD       %2, srcq, %1, 15
1255    QPEL_HV_COMPUTE   %1, %2, mx, ackssdw
1256    SWAP             m15, m0
1257    punpcklwd         m0, m8, m9
1258    punpcklwd         m2, m10, m11
1259    punpcklwd         m4, m12, m13
1260    punpcklwd         m6, m14, m15
1261%if %1 > 4
1262    punpckhwd         m1, m8, m9
1263    punpckhwd         m3, m10, m11
1264    punpckhwd         m5, m12, m13
1265    punpckhwd         m7, m14, m15
1266%endif
1267    QPEL_HV_COMPUTE   %1, 14, my, ackssdw
1268    PEL_10STORE%1     dstq, m0, m1
1269%if %1 <= 4
1270    movq              m8, m9
1271    movq              m9, m10
1272    movq             m10, m11
1273    movq             m11, m12
1274    movq             m12, m13
1275    movq             m13, m14
1276    movq             m14, m15
1277%else
1278    movdqa            m8, m9
1279    movdqa            m9, m10
1280    movdqa           m10, m11
1281    movdqa           m11, m12
1282    movdqa           m12, m13
1283    movdqa           m13, m14
1284    movdqa           m14, m15
1285%endif
1286    LOOP_END         dst, src, srcstride
1287    RET
1288
1289cglobal hevc_put_hevc_uni_qpel_hv%1_%2, 7, 9, 16 , dst, dststride, src, srcstride, height, mx, my, r3src, rfilter
1290%if cpuflag(avx2)
1291%assign %%shift  4
1292%else
1293%assign %%shift  3
1294%endif
1295    sub              mxq, 1
1296    sub              myq, 1
    shl              mxq, %%shift                ; multiply by 8 (sse4) / 16 (avx2)
    shl              myq, %%shift                ; multiply by 8 (sse4) / 16 (avx2)
1299    lea           r3srcq, [srcstrideq*3]
1300    sub             srcq, r3srcq
1301    QPEL_H_LOAD       %2, srcq, %1, 15
1302    QPEL_HV_COMPUTE   %1, %2, mx, ackssdw
1303    SWAP              m8, m0
1304    add             srcq, srcstrideq
1305    QPEL_H_LOAD       %2, srcq, %1, 15
1306    QPEL_HV_COMPUTE   %1, %2, mx, ackssdw
1307    SWAP              m9, m0
1308    add             srcq, srcstrideq
1309    QPEL_H_LOAD       %2, srcq, %1, 15
1310    QPEL_HV_COMPUTE   %1, %2, mx, ackssdw
1311    SWAP             m10, m0
1312    add             srcq, srcstrideq
1313    QPEL_H_LOAD       %2, srcq, %1, 15
1314    QPEL_HV_COMPUTE   %1, %2, mx, ackssdw
1315    SWAP             m11, m0
1316    add             srcq, srcstrideq
1317    QPEL_H_LOAD       %2, srcq, %1, 15
1318    QPEL_HV_COMPUTE   %1, %2, mx, ackssdw
1319    SWAP             m12, m0
1320    add             srcq, srcstrideq
1321    QPEL_H_LOAD       %2, srcq, %1, 15
1322    QPEL_HV_COMPUTE   %1, %2, mx, ackssdw
1323    SWAP             m13, m0
1324    add             srcq, srcstrideq
1325    QPEL_H_LOAD       %2, srcq, %1, 15
1326    QPEL_HV_COMPUTE   %1, %2, mx, ackssdw
1327    SWAP             m14, m0
1328    add             srcq, srcstrideq
1329.loop:
1330    QPEL_H_LOAD       %2, srcq, %1, 15
1331    QPEL_HV_COMPUTE   %1, %2, mx, ackssdw
1332    SWAP             m15, m0
1333    punpcklwd         m0, m8, m9
1334    punpcklwd         m2, m10, m11
1335    punpcklwd         m4, m12, m13
1336    punpcklwd         m6, m14, m15
1337%if %1 > 4
1338    punpckhwd         m1, m8, m9
1339    punpckhwd         m3, m10, m11
1340    punpckhwd         m5, m12, m13
1341    punpckhwd         m7, m14, m15
1342%endif
1343    QPEL_HV_COMPUTE   %1, 14, my, ackusdw
1344    UNI_COMPUTE       %1, %2, m0, m1, [pw_%2]
1345    PEL_%2STORE%1   dstq, m0, m1
1346
1347%if %1 <= 4
1348    movq              m8, m9
1349    movq              m9, m10
1350    movq             m10, m11
1351    movq             m11, m12
1352    movq             m12, m13
1353    movq             m13, m14
1354    movq             m14, m15
1355%else
1356    mova            m8, m9
1357    mova            m9, m10
1358    mova           m10, m11
1359    mova           m11, m12
1360    mova           m12, m13
1361    mova           m13, m14
1362    mova           m14, m15
1363%endif
1364    add             dstq, dststrideq             ; dst += dststride
1365    add             srcq, srcstrideq             ; src += srcstride
1366    dec          heightd                         ; cmp height
1367    jnz               .loop                      ; height loop
1368    RET
1369
1370cglobal hevc_put_hevc_bi_qpel_hv%1_%2, 8, 10, 16, dst, dststride, src, srcstride, src2, height, mx, my, r3src, rfilter
1371%if cpuflag(avx2)
1372%assign %%shift  4
1373%else
1374%assign %%shift  3
1375%endif
1376    sub              mxq, 1
1377    sub              myq, 1
    shl              mxq, %%shift                ; multiply by 8 (sse4) / 16 (avx2)
    shl              myq, %%shift                ; multiply by 8 (sse4) / 16 (avx2)
1380    lea           r3srcq, [srcstrideq*3]
1381    sub             srcq, r3srcq
1382    QPEL_H_LOAD       %2, srcq, %1, 15
1383    QPEL_HV_COMPUTE   %1, %2, mx, ackssdw
1384    SWAP              m8, m0
1385    add             srcq, srcstrideq
1386    QPEL_H_LOAD       %2, srcq, %1, 15
1387    QPEL_HV_COMPUTE   %1, %2, mx, ackssdw
1388    SWAP              m9, m0
1389    add             srcq, srcstrideq
1390    QPEL_H_LOAD       %2, srcq, %1, 15
1391    QPEL_HV_COMPUTE   %1, %2, mx, ackssdw
1392    SWAP             m10, m0
1393    add             srcq, srcstrideq
1394    QPEL_H_LOAD       %2, srcq, %1, 15
1395    QPEL_HV_COMPUTE   %1, %2, mx, ackssdw
1396    SWAP             m11, m0
1397    add             srcq, srcstrideq
1398    QPEL_H_LOAD       %2, srcq, %1, 15
1399    QPEL_HV_COMPUTE   %1, %2, mx, ackssdw
1400    SWAP             m12, m0
1401    add             srcq, srcstrideq
1402    QPEL_H_LOAD       %2, srcq, %1, 15
1403    QPEL_HV_COMPUTE   %1, %2, mx, ackssdw
1404    SWAP             m13, m0
1405    add             srcq, srcstrideq
1406    QPEL_H_LOAD       %2, srcq, %1, 15
1407    QPEL_HV_COMPUTE   %1, %2, mx, ackssdw
1408    SWAP             m14, m0
1409    add             srcq, srcstrideq
1410.loop:
1411    QPEL_H_LOAD       %2, srcq, %1, 15
1412    QPEL_HV_COMPUTE   %1, %2, mx, ackssdw
1413    SWAP             m15, m0
1414    punpcklwd         m0, m8, m9
1415    punpcklwd         m2, m10, m11
1416    punpcklwd         m4, m12, m13
1417    punpcklwd         m6, m14, m15
1418%if %1 > 4
1419    punpckhwd         m1, m8, m9
1420    punpckhwd         m3, m10, m11
1421    punpckhwd         m5, m12, m13
1422    punpckhwd         m7, m14, m15
1423%endif
1424    QPEL_HV_COMPUTE   %1, 14, my, ackssdw
1425    SIMPLE_BILOAD     %1, src2q, m8, m9 ;m9 not used in this case
1426    BI_COMPUTE        %1, %2, m0, m1, m8, m9, [pw_bi_%2]
1427    PEL_%2STORE%1   dstq, m0, m1
1428
1429%if %1 <= 4
1430    movq              m8, m9
1431    movq              m9, m10
1432    movq             m10, m11
1433    movq             m11, m12
1434    movq             m12, m13
1435    movq             m13, m14
1436    movq             m14, m15
1437%else
1438    movdqa            m8, m9
1439    movdqa            m9, m10
1440    movdqa           m10, m11
1441    movdqa           m11, m12
1442    movdqa           m12, m13
1443    movdqa           m13, m14
1444    movdqa           m14, m15
1445%endif
1446    add             dstq, dststrideq             ; dst += dststride
1447    add             srcq, srcstrideq             ; src += srcstride
    add            src2q, 2*MAX_PB_SIZE          ; src2 += src2stride
1449    dec          heightd                         ; cmp height
1450    jnz               .loop                      ; height loop
1451    RET
1452%endmacro
1453
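; Weighted prediction, with shift = denom + 14 - bitd (as set up below):
;   uni: dst = ((src * wx + (1 << (shift - 1))) >> shift) + (ox << (bitd - 8))
;   bi:  dst = (src * wx1 + src2 * wx0
;               + ((((ox0 + ox1) << (bitd - 8)) + 1) << shift)) >> (shift + 1)
; both results are clipped to the valid pixel range.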
1454%macro WEIGHTING_FUNCS 2
1455%if WIN64 || ARCH_X86_32
1456cglobal hevc_put_hevc_uni_w%1_%2, 4, 5, 7, dst, dststride, src, height, denom, wx, ox
1457    mov             r4d, denomm
1458%define SHIFT  r4d
1459%else
1460cglobal hevc_put_hevc_uni_w%1_%2, 6, 6, 7, dst, dststride, src, height, denom, wx, ox
1461%define SHIFT  denomd
1462%endif
1463    lea           SHIFT, [SHIFT+14-%2]          ; shift = 14 - bitd + denom
1464%if %1 <= 4
1465    pxor             m1, m1
1466%endif
1467    movd             m2, wxm        ; WX
1468    movd             m4, SHIFT      ; shift
1469%if %1 <= 4
1470    punpcklwd        m2, m1
1471%else
1472    punpcklwd        m2, m2
1473%endif
1474    dec           SHIFT
1475    movdqu           m5, [pd_1]
1476    movd             m6, SHIFT
1477    pshufd           m2, m2, 0
1478    mov           SHIFT, oxm
1479    pslld            m5, m6
1480%if %2 != 8
1481    shl           SHIFT, %2-8       ; ox << (bitd - 8)
1482%endif
1483    movd             m3, SHIFT      ; OX
1484    pshufd           m3, m3, 0
1485%if WIN64 || ARCH_X86_32
1486    mov           SHIFT, heightm
1487%endif
1488.loop:
1489   SIMPLE_LOAD        %1, 10, srcq, m0
1490%if %1 <= 4
1491    punpcklwd         m0, m1
1492    pmaddwd           m0, m2
1493    paddd             m0, m5
1494    psrad             m0, m4
1495    paddd             m0, m3
1496%else
1497    pmulhw            m6, m0, m2
1498    pmullw            m0, m2
1499    punpckhwd         m1, m0, m6
1500    punpcklwd         m0, m6
1501    paddd             m0, m5
1502    paddd             m1, m5
1503    psrad             m0, m4
1504    psrad             m1, m4
1505    paddd             m0, m3
1506    paddd             m1, m3
1507%endif
1508    packssdw          m0, m1
1509%if %2 == 8
1510    packuswb          m0, m0
1511%else
1512    CLIPW             m0, [pb_0], [max_pixels_%2]
1513%endif
1514    PEL_%2STORE%1   dstq, m0, m1
1515    add             dstq, dststrideq             ; dst += dststride
1516    add             srcq, 2*MAX_PB_SIZE          ; src += srcstride
1517    dec          heightd                         ; cmp height
1518    jnz               .loop                      ; height loop
1519    RET
1520
1521cglobal hevc_put_hevc_bi_w%1_%2, 4, 6, 10, dst, dststride, src, src2, height, denom, wx0, wx1, ox0, ox1
1522    movifnidn        r5d, denomm
1523%if %1 <= 4
1524    pxor              m1, m1
1525%endif
1526    movd              m2, wx0m         ; WX0
1527    lea              r5d, [r5d+14-%2]  ; shift = 14 - bitd + denom
1528    movd              m3, wx1m         ; WX1
1529    movd              m0, r5d          ; shift
1530%if %1 <= 4
1531    punpcklwd         m2, m1
1532    punpcklwd         m3, m1
1533%else
1534    punpcklwd         m2, m2
1535    punpcklwd         m3, m3
1536%endif
1537    inc              r5d
1538    movd              m5, r5d          ; shift+1
1539    pshufd            m2, m2, 0
1540    mov              r5d, ox0m
1541    pshufd            m3, m3, 0
1542    add              r5d, ox1m
1543%if %2 != 8
    shl              r5d, %2-8         ; (ox0 + ox1) << (bitd - 8)
1545%endif
1546    inc              r5d
1547    movd              m4, r5d          ; offset
1548    pshufd            m4, m4, 0
1549%if UNIX64
1550%define h heightd
1551%else
1552    mov              r5d, heightm
1553%define h r5d
1554%endif
1555    pslld             m4, m0
1556
1557.loop:
1558   SIMPLE_LOAD        %1, 10, srcq,  m0
1559   SIMPLE_LOAD        %1, 10, src2q, m8
1560%if %1 <= 4
1561    punpcklwd         m0, m1
1562    punpcklwd         m8, m1
1563    pmaddwd           m0, m3
1564    pmaddwd           m8, m2
1565    paddd             m0, m4
1566    paddd             m0, m8
1567    psrad             m0, m5
1568%else
1569    pmulhw            m6, m0, m3
1570    pmullw            m0, m3
1571    pmulhw            m7, m8, m2
1572    pmullw            m8, m2
1573    punpckhwd         m1, m0, m6
1574    punpcklwd         m0, m6
1575    punpckhwd         m9, m8, m7
1576    punpcklwd         m8, m7
1577    paddd             m0, m8
1578    paddd             m1, m9
1579    paddd             m0, m4
1580    paddd             m1, m4
1581    psrad             m0, m5
1582    psrad             m1, m5
1583%endif
1584    packssdw          m0, m1
1585%if %2 == 8
1586    packuswb          m0, m0
1587%else
1588     CLIPW            m0, [pb_0], [max_pixels_%2]
1589%endif
1590    PEL_%2STORE%1   dstq, m0, m1
1591    add             dstq, dststrideq             ; dst += dststride
1592    add             srcq, 2*MAX_PB_SIZE          ; src += srcstride
1593    add            src2q, 2*MAX_PB_SIZE          ; src2 += srcstride
1594    dec                h                         ; cmp height
1595    jnz               .loop                      ; height loop
1596    RET
1597%endmacro
1598
1599INIT_XMM sse4                                    ; adds ff_ and _sse4 to function name
1600
1601WEIGHTING_FUNCS 2, 8
1602WEIGHTING_FUNCS 4, 8
1603WEIGHTING_FUNCS 6, 8
1604WEIGHTING_FUNCS 8, 8
1605
1606WEIGHTING_FUNCS 2, 10
1607WEIGHTING_FUNCS 4, 10
1608WEIGHTING_FUNCS 6, 10
1609WEIGHTING_FUNCS 8, 10
1610
1611WEIGHTING_FUNCS 2, 12
1612WEIGHTING_FUNCS 4, 12
1613WEIGHTING_FUNCS 6, 12
1614WEIGHTING_FUNCS 8, 12
1615
1616HEVC_PUT_HEVC_PEL_PIXELS  2, 8
1617HEVC_PUT_HEVC_PEL_PIXELS  4, 8
1618HEVC_PUT_HEVC_PEL_PIXELS  6, 8
1619HEVC_PUT_HEVC_PEL_PIXELS  8, 8
1620HEVC_PUT_HEVC_PEL_PIXELS 12, 8
1621HEVC_PUT_HEVC_PEL_PIXELS 16, 8
1622
1623HEVC_PUT_HEVC_PEL_PIXELS 2, 10
1624HEVC_PUT_HEVC_PEL_PIXELS 4, 10
1625HEVC_PUT_HEVC_PEL_PIXELS 6, 10
1626HEVC_PUT_HEVC_PEL_PIXELS 8, 10
1627
1628HEVC_PUT_HEVC_PEL_PIXELS 2, 12
1629HEVC_PUT_HEVC_PEL_PIXELS 4, 12
1630HEVC_PUT_HEVC_PEL_PIXELS 6, 12
1631HEVC_PUT_HEVC_PEL_PIXELS 8, 12
1632
1633HEVC_PUT_HEVC_EPEL 2,  8
1634HEVC_PUT_HEVC_EPEL 4,  8
1635HEVC_PUT_HEVC_EPEL 6,  8
1636HEVC_PUT_HEVC_EPEL 8,  8
1637HEVC_PUT_HEVC_EPEL 12, 8
1638HEVC_PUT_HEVC_EPEL 16, 8
1639
1640
1641HEVC_PUT_HEVC_EPEL 2, 10
1642HEVC_PUT_HEVC_EPEL 4, 10
1643HEVC_PUT_HEVC_EPEL 6, 10
1644HEVC_PUT_HEVC_EPEL 8, 10
1645
1646HEVC_PUT_HEVC_EPEL 2, 12
1647HEVC_PUT_HEVC_EPEL 4, 12
1648HEVC_PUT_HEVC_EPEL 6, 12
1649HEVC_PUT_HEVC_EPEL 8, 12
1650
1651HEVC_PUT_HEVC_EPEL_HV 2,  8
1652HEVC_PUT_HEVC_EPEL_HV 4,  8
1653HEVC_PUT_HEVC_EPEL_HV 6,  8
1654HEVC_PUT_HEVC_EPEL_HV 8,  8
1655HEVC_PUT_HEVC_EPEL_HV 16, 8
1656
1657HEVC_PUT_HEVC_EPEL_HV 2, 10
1658HEVC_PUT_HEVC_EPEL_HV 4, 10
1659HEVC_PUT_HEVC_EPEL_HV 6, 10
1660HEVC_PUT_HEVC_EPEL_HV 8, 10
1661
1662HEVC_PUT_HEVC_EPEL_HV 2, 12
1663HEVC_PUT_HEVC_EPEL_HV 4, 12
1664HEVC_PUT_HEVC_EPEL_HV 6, 12
1665HEVC_PUT_HEVC_EPEL_HV 8, 12
1666
1667HEVC_PUT_HEVC_QPEL 4,  8
1668HEVC_PUT_HEVC_QPEL 8,  8
1669HEVC_PUT_HEVC_QPEL 12, 8
1670HEVC_PUT_HEVC_QPEL 16, 8
1671
1672HEVC_PUT_HEVC_QPEL 4, 10
1673HEVC_PUT_HEVC_QPEL 8, 10
1674
1675HEVC_PUT_HEVC_QPEL 4, 12
1676HEVC_PUT_HEVC_QPEL 8, 12
1677
1678HEVC_PUT_HEVC_QPEL_HV 2, 8
1679HEVC_PUT_HEVC_QPEL_HV 4, 8
1680HEVC_PUT_HEVC_QPEL_HV 6, 8
1681HEVC_PUT_HEVC_QPEL_HV 8, 8
1682
1683HEVC_PUT_HEVC_QPEL_HV 2, 10
1684HEVC_PUT_HEVC_QPEL_HV 4, 10
1685HEVC_PUT_HEVC_QPEL_HV 6, 10
1686HEVC_PUT_HEVC_QPEL_HV 8, 10
1687
1688HEVC_PUT_HEVC_QPEL_HV 2, 12
1689HEVC_PUT_HEVC_QPEL_HV 4, 12
1690HEVC_PUT_HEVC_QPEL_HV 6, 12
1691HEVC_PUT_HEVC_QPEL_HV 8, 12
1692
1693%if HAVE_AVX2_EXTERNAL
1694INIT_YMM avx2  ; adds ff_ and _avx2 to function name & enables 256b registers : m0 for 256b, xm0 for 128b. cpuflag(avx2) = 1 / notcpuflag(avx) = 0
1695
1696HEVC_PUT_HEVC_PEL_PIXELS 32, 8
1697HEVC_PUT_HEVC_PEL_PIXELS 16, 10
1698
1699HEVC_PUT_HEVC_EPEL 32, 8
1700HEVC_PUT_HEVC_EPEL 16, 10
1701
1702HEVC_PUT_HEVC_EPEL_HV 16, 10
1703HEVC_PUT_HEVC_EPEL_HV 32, 8
1704
1705HEVC_PUT_HEVC_QPEL 32, 8
1706
1707HEVC_PUT_HEVC_QPEL 16, 10
1708
1709HEVC_PUT_HEVC_QPEL_HV 16, 10
1710
1711%endif ;AVX2
1712%endif ; ARCH_X86_64
1713
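; AVX-512ICL horizontal path: vpermb with the pb_qpel_shuffle_index windows
; gathers the eight neighbouring pixels per output sample as two 4-byte groups,
; vpdpbusd accumulates four taps at a time into dwords, and vpmovdw narrows the
; results back to the 16-bit intermediate format.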
1714%macro QPEL_FILTER_H 5
1715%define %%table hevc_qpel_filters_avx512icl_h_%1
1716%assign %%offset 4
1717    dec %2q
1718    shl %2q, 3
1719%ifdef PIC
1720    lea %5q, [%%table]
1721    %define FILTER %5q
1722%else
1723    %define FILTER %%table
1724%endif
1725    vpbroadcastd m%3, [FILTER + %2q + 0*%%offset]
1726    vpbroadcastd m%4, [FILTER + %2q + 1*%%offset]
1727%endmacro
1728
1729%macro QPEL_FILTER_V 5
1730    vpbroadcastd m%3, [%5 + %2q + 4*%4]
1731%endmacro
1732
1733%macro QPEL_LOAD_SHUF 2
1734    movu m%1, [pb_qpel_shuffle_index +  0]
1735    movu m%2, [pb_qpel_shuffle_index + 64]
1736%endmacro
1737
1738; required: m0-m5
1739; %1: dst register index
1740; %2: name for src
1741; %3: optional offset
1742%macro QPEL_H_LOAD_COMPUTE 2-3
1743%assign %%offset 0
1744%if %0 == 3
1745%assign %%offset %3
1746%endif
1747    pxor            m%1, m%1
1748%if mmsize == 64
1749    movu            ym4, [%2q + %%offset - 3]
1750%else
1751    movu            xm4, [%2q + %%offset - 3]
1752%endif
1753    vpermb           m5, m2, m4
1754    vpermb           m4, m3, m4
1755    vpdpbusd        m%1, m5, m0
1756    vpdpbusd        m%1, m4, m1
1757%endmacro
1758
1759%macro HEVC_PUT_HEVC_QPEL_AVX512ICL 2
1760cglobal hevc_put_hevc_qpel_h%1_%2, 5, 6, 8, dst, src, srcstride, height, mx, tmp
1761    QPEL_FILTER_H   %1, mx, 0, 1, tmp
1762    QPEL_LOAD_SHUF   2, 3
1763.loop:
1764    QPEL_H_LOAD_COMPUTE   6, src
1765%if %1 == 4
1766    vpmovdw             xm6, m6
1767    movq             [dstq], xm6
1768%else
1769    vpmovdw          [dstq], m6
1770%endif
1771%if %1 > 16
1772    QPEL_H_LOAD_COMPUTE   7, src, 16
1773    vpmovdw     [dstq + 32], m7
1774%endif
1775%if %1 > 32
1776    QPEL_H_LOAD_COMPUTE   6, src, 32
1777    QPEL_H_LOAD_COMPUTE   7, src, 48
1778    vpmovdw     [dstq + 64], m6
1779    vpmovdw     [dstq + 96], m7
1780%endif
1781    LOOP_END            dst, src, srcstride
1782    RET
1783%endmacro
1784
1785%macro HEVC_PUT_HEVC_QPEL_HV_AVX512ICL 2
1786cglobal hevc_put_hevc_qpel_hv%1_%2, 6, 7, 27, dst, src, srcstride, height, mx, my, tmp
1787%assign %%shift 6
1788%assign %%extra 7
1789    QPEL_FILTER_H    %1, mx, 0, 1, tmp
1790    QPEL_LOAD_SHUF    2, 3
1791    lea            tmpq, [srcstrideq*3]
1792    sub            srcq, tmpq
1793    sub             myq, 1
1794    shl             myq, 5
1795%define %%table hevc_qpel_filters_avx512icl_v_%1
1796%ifdef PIC
1797    lea tmpq, [%%table]
1798    %define FILTER tmpq
1799%else
1800    %define FILTER %%table
1801%endif
1802%assign %%i 6
1803%assign %%j 0
1804%rep %1
1805    QPEL_FILTER_V %1, my, %%i, %%j, FILTER
1806    %assign %%i %%i+1
1807    %assign %%j %%j+1
1808%endrep
1809%rep %%extra
1810    QPEL_H_LOAD_COMPUTE %%i, src
1811    add srcq, srcstrideq
1812%assign %%i %%i+1
1813%endrep
1814.loop:
1815    QPEL_H_LOAD_COMPUTE %%i, src
1816    vpmulld           m22, m14, m6
1817    vpmulld           m23, m15, m7
1818    vpmulld           m24, m16, m8
1819    vpmulld           m25, m17, m9
1820    vpaddd            m26, m22, m23
1821    vpaddd            m24, m25
1822    vpaddd            m26, m24
1823    vpmulld           m22, m18, m10
1824    vpmulld           m23, m19, m11
1825    vpmulld           m24, m20, m12
1826    vpmulld           m25, m21, m13
1827    vpaddd            m22, m22, m23
1828    vpaddd            m24, m25
1829    vpaddd            m26, m24
1830    vpaddd            m22, m26
1831    mova              m14, m15
1832    mova              m15, m16
1833    mova              m16, m17
1834    mova              m17, m18
1835    mova              m18, m19
1836    mova              m19, m20
1837    mova              m20, m21
1838    vpsrad            m22, %%shift
1839    vpmovdw        [dstq], m22
1840    LOOP_END          dst, src, srcstride
1841
1842    RET
1843%endmacro
1844
1845%if ARCH_X86_64
1846%if HAVE_AVX512ICL_EXTERNAL
1847
1848INIT_XMM avx512icl
1849HEVC_PUT_HEVC_QPEL_AVX512ICL 4, 8
1850
1851INIT_YMM avx512icl
1852HEVC_PUT_HEVC_QPEL_AVX512ICL 8, 8
1853HEVC_PUT_HEVC_QPEL_HV_AVX512ICL 8, 8
1854
1855INIT_ZMM avx512icl
1856HEVC_PUT_HEVC_QPEL_AVX512ICL 16, 8
1857HEVC_PUT_HEVC_QPEL_AVX512ICL 32, 8
1858HEVC_PUT_HEVC_QPEL_AVX512ICL 64, 8
1859
1860%endif
1861%endif
1862