; /*
; * Provide SSE luma and chroma mc functions for HEVC decoding
; * Copyright (c) 2013 Pierre-Edouard LEPERE
; *
; * This file is part of FFmpeg.
; *
; * FFmpeg is free software; you can redistribute it and/or
; * modify it under the terms of the GNU Lesser General Public
; * License as published by the Free Software Foundation; either
; * version 2.1 of the License, or (at your option) any later version.
; *
; * FFmpeg is distributed in the hope that it will be useful,
; * but WITHOUT ANY WARRANTY; without even the implied warranty of
; * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
; * Lesser General Public License for more details.
; *
; * You should have received a copy of the GNU Lesser General Public
; * License along with FFmpeg; if not, write to the Free Software
; * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
; */
%include "libavutil/x86/x86util.asm"

SECTION_RODATA 32
cextern pw_255
cextern pw_512
cextern pw_2048
cextern pw_8192
cextern pw_1023
cextern pw_1024
cextern pw_4096

; Per-bit-depth aliases, selected by token pasting (pw_%2, pw_bi_%2,
; max_pixels_%2) in the macros below.
;
; pw_<bitd> / pw_bi_<bitd> are fed to pmulhrsw, which computes
; (a*b + 0x4000) >> 15 per word; with b = 2^k this is a rounded arithmetic
; right shift by (15-k):
;   pw_8     =  512 -> >> 6        pw_bi_8  =  256 -> >> 7
;   pw_10    = 2048 -> >> 4        pw_bi_10 = 1024 -> >> 5
;   pw_12    = 8192 -> >> 2        pw_bi_12 = 4096 -> >> 3
; max_pixels_<bitd> is the upper clip bound, (1 << bitd) - 1.
%define pw_8            pw_512
%define pw_10           pw_2048
%define pw_12           pw_8192
%define pw_bi_10        pw_1024
%define pw_bi_12        pw_4096
%define max_pixels_8    pw_255
%define max_pixels_10   pw_1023
pw_bi_8:                times 16 dw  (1 <<  8)
max_pixels_12:          times 16 dw ((1 << 12)-1)
cextern pd_1
cextern pb_0

; EPEL_TABLE name_tag, repeat, size_letter, isa_tag
;   Emits the table hevc_epel_filters_<isa_tag>_<name_tag>.
;   %1  tag pasted into the label (the bit depth in the instantiations below)
;   %2  how many times each coefficient pair is repeated within a row
;   %3  data-size letter appended to 'd' (b -> db, w -> dw)
;   %4  instruction-set tag pasted into the label
;   The table holds 7 four-tap filters, each stored as two consecutive rows:
;   the first row is the pair (c0, c1), the second the pair (c2, c3).
;   One row is 2 * %2 * sizeof(d%3) bytes, i.e. 16 bytes for the sse4
;   instantiations and 32 bytes for the avx2 ones below.
%macro EPEL_TABLE 4
hevc_epel_filters_%4_%1 times %2 d%3 -2, 58
                        times %2 d%3 10, -2
                        times %2 d%3 -4, 54
                        times %2 d%3 16, -2
                        times %2 d%3 -6, 46
                        times %2 d%3 28, -4
                        times %2 d%3 -4, 36
                        times %2 d%3 36, -4
                        times %2 d%3 -4, 28
                        times %2 d%3 46, -6
                        times %2 d%3 -2, 16
                        times %2 d%3 54, -4
                        times %2 d%3 -2, 10
                        times %2 d%3 58, -2
%endmacro


EPEL_TABLE  8,16, b, avx2
EPEL_TABLE 10, 8, w, avx2

EPEL_TABLE  8, 8, b, sse4
EPEL_TABLE 10, 4, w, sse4
EPEL_TABLE 12, 4, w, sse4

; QPEL_TABLE name_tag, repeat, size_letter, isa_tag
;   Emits the table hevc_qpel_filters_<isa_tag>_<name_tag>; parameters have
;   the same meaning as for EPEL_TABLE.
;   The table holds 3 eight-tap filters, each stored as four consecutive rows
;   of coefficient pairs: (c0,c1), (c2,c3), (c4,c5), (c6,c7).
%macro QPEL_TABLE 4
hevc_qpel_filters_%4_%1 times %2 d%3  -1,  4
                        times %2 d%3 -10, 58
                        times %2 d%3  17, -5
                        times %2 d%3   1,  0
                        times %2 d%3  -1,  4
                        times %2 d%3 -11, 40
                        times %2 d%3  40,-11
                        times %2 d%3   4, -1
                        times %2 d%3   0,  1
                        times %2 d%3  -5, 17
                        times %2 d%3  58,-10
                        times %2 d%3   4, -1
%endmacro

QPEL_TABLE  8, 8, b, sse4
QPEL_TABLE 10, 4, w, sse4
QPEL_TABLE 12, 4, w, sse4

QPEL_TABLE  8,16, b, avx2
QPEL_TABLE 10, 8, w, avx2

; Unreplicated (repeat = 1) copies for the avx512icl code, which is not part
; of the section shown here.
QPEL_TABLE  4, 1, b, avx512icl_h
QPEL_TABLE  8, 1, b, avx512icl_h
QPEL_TABLE  8, 1, d, avx512icl_v
QPEL_TABLE 16, 1, b, avx512icl_h
QPEL_TABLE 32, 1, b, avx512icl_h
QPEL_TABLE 64, 1, b, avx512icl_h

; Byte-index table: 32 rows of 4 consecutive indices. Rows 0-15 start at
; index 0..15, rows 16-31 start at index 4..19.
; NOTE(review): presumably a byte-permute control for the avx512icl qpel code;
; its user is outside the section shown here -- confirm.
pb_qpel_shuffle_index: db  0,  1,  2,  3
                       db  1,  2,  3,  4
                       db  2,  3,  4,  5
                       db  3,  4,  5,  6
                       db  4,  5,  6,  7
                       db  5,  6,  7,  8
                       db  6,  7,  8,  9
                       db  7,  8,  9, 10
                       db  8,  9, 10, 11
                       db  9, 10, 11, 12
                       db 10, 11, 12, 13
                       db 11, 12, 13, 14
                       db 12, 13, 14, 15
                       db 13, 14, 15, 16
                       db 14, 15, 16, 17
                       db 15, 16, 17, 18
                       db  4,  5,  6,  7
                       db  5,  6,  7,  8
                       db  6,  7,  8,  9
                       db  7,  8,  9, 10
                       db  8,  9, 10, 11
                       db  9, 10, 11, 12
                       db 10, 11, 12, 13
                       db 11, 12, 13, 14
                       db 12, 13, 14, 15
                       db 13, 14, 15, 16
                       db 14, 15, 16, 17
                       db 15, 16, 17, 18
                       db 16, 17, 18, 19
                       db 17, 18, 19, 20
                       db 18, 19, 20, 21
                       db 19, 20, 21, 22

SECTION .text

%define MAX_PB_SIZE 64

; There are no dedicated 14-bit qpel tables: the _14 names alias the
; word-sized _10 tables.
%define hevc_qpel_filters_sse4_14 hevc_qpel_filters_sse4_10

%define hevc_qpel_filters_avx2_14 hevc_qpel_filters_avx2_10

%if ARCH_X86_64

; SIMPLE_BILOAD width, addr, r1, r2
;   Loads one row of 16-bit samples of the second (bi-prediction) source.
;   %1  width in samples
;   %2  address expression
;   %3  destination register for the first part of the row
;   %4  destination register for the second part (only written when the row
;       does not fit in %3)
;   All loads wider than 8 bytes are aligned loads (movdqa/mova), so %2 must
;   be aligned to the vector size being loaded.
%macro SIMPLE_BILOAD 4   ;width, tab, r1, r2
%if %1 <= 4
    movq              %3, [%2]                  ; 4 samples (8 bytes)
%elif %1 <= 8
    movdqa            %3, [%2]                  ; 8 samples (16 bytes)
%elif %1 <= 12
%if cpuflag(avx2)
    mova              %3, [%2]                  ; whole row in one vector
%else
    movdqa            %3, [%2]                  ; samples 0-7
    movq              %4, [%2+16]               ; samples 8-11
%endif ;avx2
%elif %1 <= 16
%if cpuflag(avx2)
    mova              %3, [%2]                  ; whole row in one vector
%else
    movdqa            %3, [%2]                  ; samples 0-7
    movdqa            %4, [%2+16]               ; samples 8-15
%endif ;avx2
%else ; %1 = 32
    mova              %3, [%2]                  ; samples 0-15
    mova              %4, [%2+32]               ; samples 16-31
%endif
%endmacro

; SIMPLE_LOAD width, bitd, addr, r1
;   Loads one row of source pixels into %4 using the narrowest load that
;   covers width * bytes-per-pixel (movd = 4 bytes, movq = 8 bytes, otherwise
;   a full unaligned vector load).
;   %1  width in pixels
;   %2  bit depth (8 -> one byte per pixel, otherwise two)
;   %3  address expression
;   %4  destination register
%macro SIMPLE_LOAD 4    ;width, bitd, tab, r1
%if %1 == 2 || (%2 == 8 && %1 <= 4)
    movd              %4, [%3]                  ; 4 bytes
%elif %1 == 4 || (%2 == 8 && %1 <= 8)
    movq              %4, [%3]                  ; 8 bytes
%elif notcpuflag(avx)
    movu              %4, [%3]                  ; full vector
%elif %1 <= 8 || (%2 == 8 && %1 <= 16)
    movdqu            %4, [%3]
%else
    movu              %4, [%3]
%endif
%endmacro


; EPEL_FILTER bitd, idx, ra, rb, gprtmp
;   Loads the two coefficient rows of one 4-tap filter.
;   %1  bit depth, pasted into the table name
;   %2  name of the register holding the filter index (modified: it is
;       decremented and scaled into a byte offset)
;   %3  receives the first row  (c0, c1 pairs)
;   %4  receives the second row (c2, c3 pairs)
;   %5  name of a scratch GPR; only written when PIC is defined, to hold the
;       table base address
;   Leaves the single-line macro FILTER defined as the table base.
%macro EPEL_FILTER 5 ; bit depth, filter index, xmma, xmmb, gprtmp
%if cpuflag(avx2)
%assign %%offset 32
%ifdef PIC
    lea              %5q, [hevc_epel_filters_avx2_%1]
    %define FILTER %5q
%else
    %define FILTER hevc_epel_filters_avx2_%1
%endif
%else
%assign %%offset 16
%ifdef PIC
    lea              %5q, [hevc_epel_filters_sse4_%1]
    %define FILTER %5q
%else
    %define FILTER hevc_epel_filters_sse4_%1
%endif
%endif ;cpuflag(avx2)
    sub              %2q, 1                     ; table entry 0 is filter index 1
%if cpuflag(avx2)
    shl              %2q, 6                     ; multiply by 64 (2 rows of 32 bytes)
%else
    shl              %2q, 5                     ; multiply by 32 (2 rows of 16 bytes)
%endif
    mova              %3, [FILTER + %2q]            ; get 2 first values of filters
    mova              %4, [FILTER + %2q+%%offset]   ; get 2 last values of filters
%endmacro

; EPEL_HV_FILTER bitd
;   Loads both 4-tap filters for the separable (horizontal + vertical) case.
;   %1  bit depth, pasted into the name of the horizontal table
;   Reads and modifies mxq and myq (each decremented and scaled to a byte
;   offset), uses r3srcq as the table base when PIC is defined, and reads
;   srcstrideq.
;   Results:
;     m14, m15  horizontal filter rows, selected by mx, from the _<bitd> table
;     m12, m13  vertical filter rows, selected by my, always from the
;               word-sized _10 table
;     r3srcq    3 * srcstride
%macro EPEL_HV_FILTER 1
%if cpuflag(avx2)
%assign %%offset 32
%assign %%shift  6
%define %%table  hevc_epel_filters_avx2_%1
%else
%assign %%offset 16
%assign %%shift  5
%define %%table  hevc_epel_filters_sse4_%1
%endif

%ifdef PIC
    lea           r3srcq, [%%table]
    %define FILTER r3srcq
%else
    %define FILTER %%table
%endif
    sub              mxq, 1
    sub              myq, 1
    shl              mxq, %%shift               ; multiply by the size of one filter (32 bytes, 64 with avx2)
    shl              myq, %%shift               ; multiply by the size of one filter (32 bytes, 64 with avx2)
    mova             m14, [FILTER + mxq]            ; get 2 first values of filters
    mova             m15, [FILTER + mxq+%%offset]   ; get 2 last values of filters

%if cpuflag(avx2)
%define %%table  hevc_epel_filters_avx2_10
%else
%define %%table  hevc_epel_filters_sse4_10
%endif
%ifdef PIC
    lea           r3srcq, [%%table]
    %define FILTER r3srcq
%else
    %define FILTER %%table
%endif
    mova             m12, [FILTER + myq]            ; get 2 first values of filters
    mova             m13, [FILTER + myq+%%offset]   ; get 2 last values of filters
    lea           r3srcq, [srcstrideq*3]        ; r3srcq is free again: make it 3 * srcstride
%endmacro

; QPEL_FILTER bitd, idx
;   Loads the four coefficient rows of one 8-tap filter into m12..m15.
;   %1  bit depth, pasted into the table name
;   %2  name of the register holding the filter index (modified: it is
;       decremented and scaled into a byte offset)
;   With PIC defined the table base is loaded into rfilterq; otherwise
;   rfilterq is %define'd to the table symbol.
%macro QPEL_FILTER 2

%if cpuflag(avx2)
%assign %%offset 32
%assign %%shift  7
%define %%table  hevc_qpel_filters_avx2_%1
%else
%assign %%offset 16
%assign %%shift  6
%define %%table  hevc_qpel_filters_sse4_%1
%endif

%ifdef PIC
    lea         rfilterq, [%%table]
%else
    %define rfilterq %%table
%endif
    sub              %2q, 1                     ; table entry 0 is filter index 1
    shl              %2q, %%shift               ; multiply by the size of one filter (64 bytes, 128 with avx2)
    mova             m12, [rfilterq + %2q]                ; coefficient pair (c0, c1)
    mova             m13, [rfilterq + %2q +   %%offset]   ; coefficient pair (c2, c3)
    mova             m14, [rfilterq + %2q + 2*%%offset]   ; coefficient pair (c4, c5)
    mova             m15, [rfilterq + %2q + 3*%%offset]   ; coefficient pair (c6, c7)
%endmacro

; EPEL_LOAD bitd, addr, stride, width
;   Loads the four input vectors of a 4-tap filter and interleaves them in
;   pairs, ready for pmaddubsw / pmaddwd against the coefficient-pair rows.
;   %1  bit depth
;   %2  base address; the token 'q' is appended to it
;   %3  distance between taps: either a number (used as is) or a register
;       name ('q' appended). In the register case the fourth tap is read
;       from [%2q+r3srcq], so r3srcq is expected to hold 3 * stride.
;   %4  width in pixels (selects the load size and the interleave variant)
;   Results: m0 = taps 0/1 interleaved, m2 = taps 2/3 interleaved. In the wide
;   case SBUTTERFLY is used instead, which also writes m1 and m3 and uses m7
;   as its temporary.
%macro EPEL_LOAD 4
%if (%1 == 8 && %4 <= 4)
%define %%load movd
%elif (%1 == 8 && %4 <= 8) || (%1 > 8 && %4 <= 4)
%define %%load movq
%else
%define %%load movdqu
%endif

    %%load            m0, [%2q ]
%ifnum %3
    %%load            m1, [%2q+  %3]
    %%load            m2, [%2q+2*%3]
    %%load            m3, [%2q+3*%3]
%else
    %%load            m1, [%2q+  %3q]
    %%load            m2, [%2q+2*%3q]
    %%load            m3, [%2q+r3srcq]
%endif
%if %1 == 8
%if %4 > 8
    SBUTTERFLY        bw, 0, 1, 7
    SBUTTERFLY        bw, 2, 3, 7
%else
    punpcklbw         m0, m1
    punpcklbw         m2, m3
%endif
%else
%if %4 > 4
    SBUTTERFLY        wd, 0, 1, 7
    SBUTTERFLY        wd, 2, 3, 7
%else
    punpcklwd         m0, m1
    punpcklwd         m2, m3
%endif
%endif
%endmacro


; QPEL_H_LOAD bitd, addr, width, tmp
;   Loads the eight horizontally shifted input vectors of an 8-tap filter
;   (pixel offsets -3 .. +4 relative to %2) into m0..m7 and interleaves them
;   in pairs.
;   %1  bit depth; bytes per pixel is (%1+7)/8
;   %2  address expression of the centre pixel
;   %3  width in pixels (selects the load size and the interleave variant)
;   %4  register index passed to SBUTTERFLY as its temporary (wide case only)
;   In the narrow case the interleaved pairs are left in m0, m2, m4 and m6.
%macro QPEL_H_LOAD 4
%assign %%stride (%1+7)/8
%if %1 == 8
%if %3 <= 4
%define %%load movd
%elif %3 == 8
%define %%load movq
%else
%define %%load movu
%endif
%else
%if %3 == 2
%define %%load movd
%elif %3 == 4
%define %%load movq
%else
%define %%load movu
%endif
%endif
    %%load            m0, [%2-3*%%stride]       ;load data from source
    %%load            m1, [%2-2*%%stride]
    %%load            m2, [%2-%%stride  ]
    %%load            m3, [%2           ]
    %%load            m4, [%2+%%stride  ]
    %%load            m5, [%2+2*%%stride]
    %%load            m6, [%2+3*%%stride]
    %%load            m7, [%2+4*%%stride]

%if %1 == 8
%if %3 > 8
    SBUTTERFLY        wd, 0, 1, %4
    SBUTTERFLY        wd, 2, 3, %4
    SBUTTERFLY        wd, 4, 5, %4
    SBUTTERFLY        wd, 6, 7, %4
%else
    punpcklbw         m0, m1
    punpcklbw         m2, m3
    punpcklbw         m4, m5
    punpcklbw         m6, m7
%endif
%else
%if %3 > 4
    SBUTTERFLY        dq, 0, 1, %4
    SBUTTERFLY        dq, 2, 3, %4
    SBUTTERFLY        dq, 4, 5, %4
    SBUTTERFLY        dq, 6, 7, %4
%else
    punpcklwd         m0, m1
    punpcklwd         m2, m3
    punpcklwd         m4, m5
    punpcklwd         m6, m7
%endif
%endif
%endmacro

; QPEL_V_LOAD bitd, addr, stride, width, tmp
;   Loads the eight input rows of a vertical 8-tap filter (rows -3 .. +4
;   relative to %2) into m0..m7 and interleaves them in pairs.
;   %1  bit depth
;   %2  address expression of the centre row
;   %3  name of the register holding the row stride ('q' appended)
;   %4  width in pixels (selects the interleave variant)
;   %5  name of a scratch GPR ('q' appended); receives %2 - r3srcq
;   r3srcq is expected to hold 3 * stride. The wide case uses m8 as the
;   SBUTTERFLY temporary. In the narrow case the interleaved pairs are left
;   in m0, m2, m4 and m6.
%macro QPEL_V_LOAD 5
    lea              %5q, [%2]
    sub              %5q, r3srcq
    movu              m0, [%5q            ]     ;load x- 3*srcstride
    movu              m1, [%5q+   %3q     ]     ;load x- 2*srcstride
    movu              m2, [%5q+ 2*%3q     ]     ;load x-srcstride
    movu              m3, [%2       ]           ;load x
    movu              m4, [%2+   %3q]           ;load x+stride
    movu              m5, [%2+ 2*%3q]           ;load x+2*stride
    movu              m6, [%2+r3srcq]           ;load x+3*stride
    movu              m7, [%2+ 4*%3q]           ;load x+4*stride
%if %1 == 8
%if %4 > 8
    SBUTTERFLY        bw, 0, 1, 8
    SBUTTERFLY        bw, 2, 3, 8
    SBUTTERFLY        bw, 4, 5, 8
    SBUTTERFLY        bw, 6, 7, 8
%else
    punpcklbw         m0, m1
    punpcklbw         m2, m3
    punpcklbw         m4, m5
    punpcklbw         m6, m7
%endif
%else
%if %4 > 4
    SBUTTERFLY        wd, 0, 1, 8
    SBUTTERFLY        wd, 2, 3, 8
    SBUTTERFLY        wd, 4, 5, 8
    SBUTTERFLY        wd, 6, 7, 8
%else
    punpcklwd         m0, m1
    punpcklwd         m2, m3
    punpcklwd         m4, m5
    punpcklwd         m6, m7
%endif
%endif
%endmacro

; ---------------------------------------------------------------------------
; Row store helpers: PEL_<bitd>STORE<width> addr, lo, hi
;   %1  destination address expression
;   %2  register holding the first part of the row
;   %3  register holding the second part (used only by the variants that
;       store more than one vector)
; The _12 and _10 families store two bytes per sample, the _8 family one byte
; per sample. The 6- and 12-wide variants shift %2 right by 8 bytes to reach
; the tail, so %2 is modified. movdqa/mova stores require an aligned
; destination.
; ---------------------------------------------------------------------------
%macro PEL_12STORE2 3
    movd           [%1], %2
%endmacro
%macro PEL_12STORE4 3
    movq           [%1], %2
%endmacro
%macro PEL_12STORE6 3
    movq           [%1], %2
    psrldq            %2, 8                     ; bring samples 4-5 down (modifies %2)
    movd         [%1+8], %2
%endmacro
%macro PEL_12STORE8 3
    movdqa         [%1], %2
%endmacro
%macro PEL_12STORE12 3
    movdqa         [%1], %2
    movq        [%1+16], %3
%endmacro
%macro PEL_12STORE16 3
    PEL_12STORE8      %1, %2, %3
    movdqa       [%1+16], %3
%endmacro

%macro PEL_10STORE2 3
    movd           [%1], %2
%endmacro
%macro PEL_10STORE4 3
    movq           [%1], %2
%endmacro
%macro PEL_10STORE6 3
    movq           [%1], %2
    psrldq            %2, 8                     ; bring samples 4-5 down (modifies %2)
    movd         [%1+8], %2
%endmacro
%macro PEL_10STORE8 3
    movdqa         [%1], %2
%endmacro
%macro PEL_10STORE12 3
    movdqa         [%1], %2
    movq        [%1+16], %3
%endmacro
%macro PEL_10STORE16 3
%if cpuflag(avx2)
    movu            [%1], %2                    ; all 16 samples in one vector
%else
    PEL_10STORE8      %1, %2, %3
    movdqa       [%1+16], %3
%endif
%endmacro

%macro PEL_10STORE32 3
    PEL_10STORE16     %1, %2, %3
    movu         [%1+32], %3
%endmacro

%macro PEL_8STORE2 3
    pextrw          [%1], %2, 0                 ; word 0 = 2 pixels
%endmacro
%macro PEL_8STORE4 3
    movd            [%1], %2
%endmacro
%macro PEL_8STORE6 3
    movd            [%1], %2
    pextrw        [%1+4], %2, 2                 ; word 2 = pixels 4-5
%endmacro
%macro PEL_8STORE8 3
    movq           [%1], %2
%endmacro
%macro PEL_8STORE12 3
    movq            [%1], %2
    psrldq            %2, 8                     ; bring pixels 8-11 down (modifies %2)
    movd          [%1+8], %2
%endmacro
%macro PEL_8STORE16 3
%if cpuflag(avx2)
    movdqu        [%1], %2
%else
    mova          [%1], %2
%endif ; avx2
%endmacro
%macro PEL_8STORE32 3
    movu          [%1], %2
%endmacro

; LOOP_END dst, src, srcstride
;   Row-loop tail for the functions that write the intermediate buffer.
;   %1  name of the destination pointer register ('q' appended)
;   %2  name of the source pointer register ('q' appended)
;   %3  name of the source stride register ('q' appended)
;   The destination advances by a fixed 2*MAX_PB_SIZE bytes, not by a stride
;   argument. Decrements heightd and jumps to the enclosing function's .loop
;   label while it is non-zero.
%macro LOOP_END 3
    add              %1q, 2*MAX_PB_SIZE         ; dst += 2*MAX_PB_SIZE bytes
    add              %2q, %3q                   ; src += srcstride
    dec          heightd                        ; one row done
    jnz            .loop                        ; height loop
%endmacro


; MC_PIXEL_COMPUTE width, bitd [, flag]
;   Converts the freshly loaded row in m0 to the 14-bit intermediate scale
;   (left shift by 14 - bitd).
;   %1  width in pixels
;   %2  bit depth
;   %3  optional; its presence (3 arguments) enables the AVX2 zero-extension
;       path when avx2 is the active instruction set
;   For bitd == 8 the bytes are first widened to words: m0 receives the low
;   part and, for wide rows, m1 the high part. The non-AVX2 path unpacks
;   against m2, which must be zero on entry.
;   For other bit depths only m0 is shifted.
%macro MC_PIXEL_COMPUTE 2-3 ;width, bitdepth
%if %2 == 8
%if cpuflag(avx2) && %0 ==3
%if %1 > 16
    vextracti128 xm1, m0, 1                     ; upper 16 bytes
    pmovzxbw      m1, xm1
    psllw         m1, 14-%2
%endif
    pmovzxbw      m0, xm0                       ; lower 16 bytes
%else ; not avx2
%if %1 > 8
    punpckhbw     m1, m0, m2                    ; upper 8 bytes -> words (m2 = 0)
    psllw         m1, 14-%2
%endif
    punpcklbw     m0, m2                        ; lower 8 bytes -> words (m2 = 0)
%endif
%endif ; %2 == 8
    psllw         m0, 14-%2
%endmacro

; EPEL_COMPUTE bitd, width, filter1, filter2 [, flag | r0, r2, r1, r3]
;   Applies a 4-tap filter to interleaved tap pairs (as produced by
;   EPEL_LOAD).
;   %1  bit depth
;   %2  width in pixels
;   %3  coefficient row for taps 0/1
;   %4  coefficient row for taps 2/3
;   %5  with exactly 5 arguments on avx2 and bitd == 8: enables the 128-bit
;       lane fix-up below
;   %5-%8  with 8 arguments: the registers to use instead of m0, m2, m1, m3
;       (in that order)
;   Result is left in reg0 (and reg1 for the second half of wide 8-bit rows).
;   For bitd > 8 the 32-bit sums are shifted right by (bitd - 8) and packed
;   with signed saturation into reg0.
;   The AVX2 fix-up uses m10 as a temporary.
%macro EPEL_COMPUTE 4-8 ; bitdepth, width, filter1, filter2, HV/m0, m2, m1, m3
%if %0 == 8
%define %%reg0 %5
%define %%reg2 %6
%define %%reg1 %7
%define %%reg3 %8
%else
%define %%reg0 m0
%define %%reg2 m2
%define %%reg1 m1
%define %%reg3 m3
%endif
%if %1 == 8
%if cpuflag(avx2) && (%0 == 5)
%if %2 > 16
    vperm2i128    m10, m0, m1, q0301            ; m10 = high lanes of m0 and m1
%endif
    vinserti128    m0, m0, xm1, 1               ; m0  = low lanes of m0 and m1
    mova           m1, m10
%if %2 > 16
    vperm2i128    m10, m2, m3, q0301            ; m10 = high lanes of m2 and m3
%endif
    vinserti128    m2, m2, xm3, 1               ; m2  = low lanes of m2 and m3
    mova           m3, m10
%endif
    pmaddubsw      %%reg0, %3   ;x1*c1+x2*c2
    pmaddubsw      %%reg2, %4   ;x3*c3+x4*c4
    paddw          %%reg0, %%reg2
%if %2 > 8
    pmaddubsw      %%reg1, %3
    pmaddubsw      %%reg3, %4
    paddw          %%reg1, %%reg3
%endif
%else
    pmaddwd        %%reg0, %3
    pmaddwd        %%reg2, %4
    paddd          %%reg0, %%reg2
%if %2 > 4
    pmaddwd        %%reg1, %3
    pmaddwd        %%reg3, %4
    paddd          %%reg1, %%reg3
%if %1 != 8
    psrad          %%reg1, %1-8
%endif
%endif
%if %1 != 8
    psrad          %%reg0, %1-8
%endif
    packssdw       %%reg0, %%reg1
%endif
%endmacro

; QPEL_HV_COMPUTE width, bitd, idx, pack
;   Applies an 8-tap filter to the interleaved tap pairs in m0..m7, reading
;   the coefficient rows directly from memory.
;   %1  width in pixels
;   %2  bit depth, pasted into the table name
;   %3  name of the register used to address the table: rows are read from
;       [rfilterq + %3q*8 + n*row_size]
;   %4  suffix of the pack instruction; for bitd > 8 the final instruction
;       is p%4 m0, m1
;   With PIC defined the table base is loaded into rfilterq; otherwise
;   rfilterq is %define'd to the table symbol.
;   Result is left in m0. For bitd > 8 the 32-bit sums are shifted right by
;   (bitd - 8) before packing.
%macro QPEL_HV_COMPUTE 4     ; width, bitdepth, filter idx, pack suffix

%if cpuflag(avx2)
%assign %%offset 32
%define %%table  hevc_qpel_filters_avx2_%2
%else
%assign %%offset 16
%define %%table  hevc_qpel_filters_sse4_%2
%endif

%ifdef PIC
    lea         rfilterq, [%%table]
%else
    %define rfilterq %%table
%endif

%if %2 == 8
    pmaddubsw         m0, [rfilterq + %3q*8   ]             ;x1*c1+x2*c2
    pmaddubsw         m2, [rfilterq + %3q*8+%%offset]       ;x3*c3+x4*c4
    pmaddubsw         m4, [rfilterq + %3q*8+2*%%offset]     ;x5*c5+x6*c6
    pmaddubsw         m6, [rfilterq + %3q*8+3*%%offset]     ;x7*c7+x8*c8
    paddw             m0, m2
    paddw             m4, m6
    paddw             m0, m4
%else
    pmaddwd           m0, [rfilterq + %3q*8   ]
    pmaddwd           m2, [rfilterq + %3q*8+%%offset]
    pmaddwd           m4, [rfilterq + %3q*8+2*%%offset]
    pmaddwd           m6, [rfilterq + %3q*8+3*%%offset]
    paddd             m0, m2
    paddd             m4, m6
    paddd             m0, m4
%if %2 != 8
    psrad             m0, %2-8
%endif
%if %1 > 4
    pmaddwd           m1, [rfilterq + %3q*8   ]
    pmaddwd           m3, [rfilterq + %3q*8+%%offset]
    pmaddwd           m5, [rfilterq + %3q*8+2*%%offset]
    pmaddwd           m7, [rfilterq + %3q*8+3*%%offset]
    paddd             m1, m3
    paddd             m5, m7
    paddd             m1, m5
%if %2 != 8
    psrad             m1, %2-8
%endif
%endif
    p%4               m0, m1
%endif
%endmacro

; QPEL_COMPUTE width, bitd [, flag]
;   Applies an 8-tap filter to the interleaved tap pairs in m0..m7 using the
;   coefficient rows in m12..m15 (as loaded by QPEL_FILTER).
;   %1  width in pixels
;   %2  bit depth
;   %3  optional; with 3 arguments on avx2 and bitd == 8 the 128-bit lanes of
;       each register pair are regrouped first (m10 is the temporary; SWAP
;       renames registers instead of copying)
;   Results: m0 (and m1 for the second half of wide rows). For bitd > 8 the
;   results are 32-bit sums shifted right by (bitd - 8); they are not packed
;   here.
%macro QPEL_COMPUTE 2-3     ; width, bitdepth
%if %2 == 8
%if cpuflag(avx2) && (%0 == 3)

    vperm2i128 m10, m0,  m1, q0301
    vinserti128 m0, m0, xm1, 1
    SWAP 1, 10

    vperm2i128 m10, m2,  m3, q0301
    vinserti128 m2, m2, xm3, 1
    SWAP 3, 10


    vperm2i128 m10, m4,  m5, q0301
    vinserti128 m4, m4, xm5, 1
    SWAP 5, 10

    vperm2i128 m10, m6,  m7, q0301
    vinserti128 m6, m6, xm7, 1
    SWAP 7, 10
%endif

    pmaddubsw         m0, m12   ;x1*c1+x2*c2
    pmaddubsw         m2, m13   ;x3*c3+x4*c4
    pmaddubsw         m4, m14   ;x5*c5+x6*c6
    pmaddubsw         m6, m15   ;x7*c7+x8*c8
    paddw             m0, m2
    paddw             m4, m6
    paddw             m0, m4
%if %1 > 8
    pmaddubsw         m1, m12
    pmaddubsw         m3, m13
    pmaddubsw         m5, m14
    pmaddubsw         m7, m15
    paddw             m1, m3
    paddw             m5, m7
    paddw             m1, m5
%endif
%else
    pmaddwd           m0, m12
    pmaddwd           m2, m13
    pmaddwd           m4, m14
    pmaddwd           m6, m15
    paddd             m0, m2
    paddd             m4, m6
    paddd             m0, m4
%if %2 != 8
    psrad             m0, %2-8
%endif
%if %1 > 4
    pmaddwd           m1, m12
    pmaddwd           m3, m13
    pmaddwd           m5, m14
    pmaddwd           m7, m15
    paddd             m1, m3
    paddd             m5, m7
    paddd             m1, m5
%if %2 != 8
    psrad             m1, %2-8
%endif
%endif
%endif
%endmacro

; BI_COMPUTE width, bitd, src1l, src1h, src2l, src2h, pw [, flag]
;   Bi-prediction: adds the second source to the first with signed
;   saturation, then rounds, shifts and clips through UNI_COMPUTE.
;   %1  width in pixels
;   %2  bit depth
;   %3  first source, low part; also receives the result
;   %4  first source, high part (added only when width > 8)
;   %5  second source, low part
;   %6  second source, high part
;   %7  register holding the pmulhrsw rounding multiplier
;   %8  optional; with 8 arguments on avx2 and bitd == 8 the quadwords of %3
;       and %4 are reordered to 0,2,1,3 (vpermq immediate 216) after packing
%macro BI_COMPUTE 7-8     ; width, bitd, src1l, src1h, scr2l, scr2h, pw
    paddsw            %3, %5
%if %1 > 8
    paddsw            %4, %6
%endif
    UNI_COMPUTE       %1, %2, %3, %4, %7
%if %0 == 8 && cpuflag(avx2) && (%2 == 8)
    vpermq            %3, %3, 216
    vpermq            %4, %4, 216
%endif
%endmacro

; UNI_COMPUTE width, bitd, lo, hi, pw
;   Converts filtered samples to output pixels.
;   %1  width in pixels
;   %2  bit depth
;   %3  low part; receives the result
;   %4  high part
;   %5  register holding the pmulhrsw multiplier (rounded right shift)
;   For bitd == 8 both parts are packed into bytes in %3 with unsigned
;   saturation; otherwise each part is clipped to [0, max_pixels_<bitd>]
;   (pb_0 supplies the zero vector).
%macro UNI_COMPUTE 5
    pmulhrsw          %3, %5
%if %1 > 8 || (%2 > 8 && %1 > 4)
    pmulhrsw          %4, %5
%endif
%if %2 == 8
    packuswb          %3, %4
%else
    CLIPW             %3, [pb_0], [max_pixels_%2]
%if (%1 > 8 && notcpuflag(avx)) || %1 > 16
    CLIPW             %4, [pb_0], [max_pixels_%2]
%endif
%endif
%endmacro


; ******************************
; void put_hevc_mc_pixels(int16_t *dst, ptrdiff_t dststride,
;                         uint8_t *_src, ptrdiff_t _srcstride,
;                         int height, int mx, int my)
; ******************************
; NOTE: the prototype above is indicative only; the argument lists on the
; cglobal lines of the individual functions are authoritative (the plain put
; variant, for instance, names only dst, src, srcstride and height).

; HEVC_PUT_HEVC_PEL_PIXELS width, bitd
;   Instantiates the three full-pel copy functions (plain, uni and bi) for
;   one width / bit-depth combination.
%macro HEVC_PUT_HEVC_PEL_PIXELS 2
HEVC_PEL_PIXELS     %1, %2
HEVC_UNI_PEL_PIXELS %1, %2
HEVC_BI_PEL_PIXELS  %1, %2
%endmacro
723cabdff1aSopenharmony_ci 724cabdff1aSopenharmony_ci%macro HEVC_PEL_PIXELS 2 725cabdff1aSopenharmony_cicglobal hevc_put_hevc_pel_pixels%1_%2, 4, 4, 3, dst, src, srcstride,height 726cabdff1aSopenharmony_ci pxor m2, m2 727cabdff1aSopenharmony_ci.loop: 728cabdff1aSopenharmony_ci SIMPLE_LOAD %1, %2, srcq, m0 729cabdff1aSopenharmony_ci MC_PIXEL_COMPUTE %1, %2, 1 730cabdff1aSopenharmony_ci PEL_10STORE%1 dstq, m0, m1 731cabdff1aSopenharmony_ci LOOP_END dst, src, srcstride 732cabdff1aSopenharmony_ci RET 733cabdff1aSopenharmony_ci %endmacro 734cabdff1aSopenharmony_ci 735cabdff1aSopenharmony_ci%macro HEVC_UNI_PEL_PIXELS 2 736cabdff1aSopenharmony_cicglobal hevc_put_hevc_uni_pel_pixels%1_%2, 5, 5, 2, dst, dststride, src, srcstride,height 737cabdff1aSopenharmony_ci.loop: 738cabdff1aSopenharmony_ci SIMPLE_LOAD %1, %2, srcq, m0 739cabdff1aSopenharmony_ci PEL_%2STORE%1 dstq, m0, m1 740cabdff1aSopenharmony_ci add dstq, dststrideq ; dst += dststride 741cabdff1aSopenharmony_ci add srcq, srcstrideq ; src += srcstride 742cabdff1aSopenharmony_ci dec heightd ; cmp height 743cabdff1aSopenharmony_ci jnz .loop ; height loop 744cabdff1aSopenharmony_ci RET 745cabdff1aSopenharmony_ci%endmacro 746cabdff1aSopenharmony_ci 747cabdff1aSopenharmony_ci%macro HEVC_BI_PEL_PIXELS 2 748cabdff1aSopenharmony_cicglobal hevc_put_hevc_bi_pel_pixels%1_%2, 6, 6, 6, dst, dststride, src, srcstride, src2, height 749cabdff1aSopenharmony_ci pxor m2, m2 750cabdff1aSopenharmony_ci movdqa m5, [pw_bi_%2] 751cabdff1aSopenharmony_ci.loop: 752cabdff1aSopenharmony_ci SIMPLE_LOAD %1, %2, srcq, m0 753cabdff1aSopenharmony_ci SIMPLE_BILOAD %1, src2q, m3, m4 754cabdff1aSopenharmony_ci MC_PIXEL_COMPUTE %1, %2, 1 755cabdff1aSopenharmony_ci BI_COMPUTE %1, %2, m0, m1, m3, m4, m5, 1 756cabdff1aSopenharmony_ci PEL_%2STORE%1 dstq, m0, m1 757cabdff1aSopenharmony_ci add dstq, dststrideq ; dst += dststride 758cabdff1aSopenharmony_ci add srcq, srcstrideq ; src += srcstride 759cabdff1aSopenharmony_ci add src2q, 2*MAX_PB_SIZE ; src += 
srcstride 760cabdff1aSopenharmony_ci dec heightd ; cmp height 761cabdff1aSopenharmony_ci jnz .loop ; height loop 762cabdff1aSopenharmony_ci RET 763cabdff1aSopenharmony_ci%endmacro 764cabdff1aSopenharmony_ci 765cabdff1aSopenharmony_ci 766cabdff1aSopenharmony_ci; ****************************** 767cabdff1aSopenharmony_ci; void put_hevc_epel_hX(int16_t *dst, ptrdiff_t dststride, 768cabdff1aSopenharmony_ci; uint8_t *_src, ptrdiff_t _srcstride, 769cabdff1aSopenharmony_ci; int height, int mx, int my, int width); 770cabdff1aSopenharmony_ci; ****************************** 771cabdff1aSopenharmony_ci 772cabdff1aSopenharmony_ci 773cabdff1aSopenharmony_ci%macro HEVC_PUT_HEVC_EPEL 2 774cabdff1aSopenharmony_ci%if cpuflag(avx2) 775cabdff1aSopenharmony_ci%define XMM_REGS 11 776cabdff1aSopenharmony_ci%else 777cabdff1aSopenharmony_ci%define XMM_REGS 8 778cabdff1aSopenharmony_ci%endif 779cabdff1aSopenharmony_ci 780cabdff1aSopenharmony_cicglobal hevc_put_hevc_epel_h%1_%2, 5, 6, XMM_REGS, dst, src, srcstride, height, mx, rfilter 781cabdff1aSopenharmony_ci%assign %%stride ((%2 + 7)/8) 782cabdff1aSopenharmony_ci EPEL_FILTER %2, mx, m4, m5, rfilter 783cabdff1aSopenharmony_ci.loop: 784cabdff1aSopenharmony_ci EPEL_LOAD %2, srcq-%%stride, %%stride, %1 785cabdff1aSopenharmony_ci EPEL_COMPUTE %2, %1, m4, m5, 1 786cabdff1aSopenharmony_ci PEL_10STORE%1 dstq, m0, m1 787cabdff1aSopenharmony_ci LOOP_END dst, src, srcstride 788cabdff1aSopenharmony_ci RET 789cabdff1aSopenharmony_ci 790cabdff1aSopenharmony_cicglobal hevc_put_hevc_uni_epel_h%1_%2, 6, 7, XMM_REGS, dst, dststride, src, srcstride, height, mx, rfilter 791cabdff1aSopenharmony_ci%assign %%stride ((%2 + 7)/8) 792cabdff1aSopenharmony_ci movdqa m6, [pw_%2] 793cabdff1aSopenharmony_ci EPEL_FILTER %2, mx, m4, m5, rfilter 794cabdff1aSopenharmony_ci.loop: 795cabdff1aSopenharmony_ci EPEL_LOAD %2, srcq-%%stride, %%stride, %1 796cabdff1aSopenharmony_ci EPEL_COMPUTE %2, %1, m4, m5 797cabdff1aSopenharmony_ci UNI_COMPUTE %1, %2, m0, m1, m6 
798cabdff1aSopenharmony_ci PEL_%2STORE%1 dstq, m0, m1 799cabdff1aSopenharmony_ci add dstq, dststrideq ; dst += dststride 800cabdff1aSopenharmony_ci add srcq, srcstrideq ; src += srcstride 801cabdff1aSopenharmony_ci dec heightd ; cmp height 802cabdff1aSopenharmony_ci jnz .loop ; height loop 803cabdff1aSopenharmony_ci RET 804cabdff1aSopenharmony_ci 805cabdff1aSopenharmony_cicglobal hevc_put_hevc_bi_epel_h%1_%2, 7, 8, XMM_REGS, dst, dststride, src, srcstride, src2, height, mx, rfilter 806cabdff1aSopenharmony_ci movdqa m6, [pw_bi_%2] 807cabdff1aSopenharmony_ci EPEL_FILTER %2, mx, m4, m5, rfilter 808cabdff1aSopenharmony_ci.loop: 809cabdff1aSopenharmony_ci EPEL_LOAD %2, srcq-%%stride, %%stride, %1 810cabdff1aSopenharmony_ci EPEL_COMPUTE %2, %1, m4, m5, 1 811cabdff1aSopenharmony_ci SIMPLE_BILOAD %1, src2q, m2, m3 812cabdff1aSopenharmony_ci BI_COMPUTE %1, %2, m0, m1, m2, m3, m6, 1 813cabdff1aSopenharmony_ci PEL_%2STORE%1 dstq, m0, m1 814cabdff1aSopenharmony_ci add dstq, dststrideq ; dst += dststride 815cabdff1aSopenharmony_ci add srcq, srcstrideq ; src += srcstride 816cabdff1aSopenharmony_ci add src2q, 2*MAX_PB_SIZE ; src += srcstride 817cabdff1aSopenharmony_ci dec heightd ; cmp height 818cabdff1aSopenharmony_ci jnz .loop ; height loop 819cabdff1aSopenharmony_ci RET 820cabdff1aSopenharmony_ci 821cabdff1aSopenharmony_ci; ****************************** 822cabdff1aSopenharmony_ci; void put_hevc_epel_v(int16_t *dst, ptrdiff_t dststride, 823cabdff1aSopenharmony_ci; uint8_t *_src, ptrdiff_t _srcstride, 824cabdff1aSopenharmony_ci; int height, int mx, int my, int width) 825cabdff1aSopenharmony_ci; ****************************** 826cabdff1aSopenharmony_ci 827cabdff1aSopenharmony_cicglobal hevc_put_hevc_epel_v%1_%2, 4, 6, XMM_REGS, dst, src, srcstride, height, r3src, my 828cabdff1aSopenharmony_ci movifnidn myd, mym 829cabdff1aSopenharmony_ci sub srcq, srcstrideq 830cabdff1aSopenharmony_ci EPEL_FILTER %2, my, m4, m5, r3src 831cabdff1aSopenharmony_ci lea r3srcq, [srcstrideq*3] 
832cabdff1aSopenharmony_ci.loop: 833cabdff1aSopenharmony_ci EPEL_LOAD %2, srcq, srcstride, %1 834cabdff1aSopenharmony_ci EPEL_COMPUTE %2, %1, m4, m5, 1 835cabdff1aSopenharmony_ci PEL_10STORE%1 dstq, m0, m1 836cabdff1aSopenharmony_ci LOOP_END dst, src, srcstride 837cabdff1aSopenharmony_ci RET 838cabdff1aSopenharmony_ci 839cabdff1aSopenharmony_cicglobal hevc_put_hevc_uni_epel_v%1_%2, 5, 7, XMM_REGS, dst, dststride, src, srcstride, height, r3src, my 840cabdff1aSopenharmony_ci movifnidn myd, mym 841cabdff1aSopenharmony_ci movdqa m6, [pw_%2] 842cabdff1aSopenharmony_ci sub srcq, srcstrideq 843cabdff1aSopenharmony_ci EPEL_FILTER %2, my, m4, m5, r3src 844cabdff1aSopenharmony_ci lea r3srcq, [srcstrideq*3] 845cabdff1aSopenharmony_ci.loop: 846cabdff1aSopenharmony_ci EPEL_LOAD %2, srcq, srcstride, %1 847cabdff1aSopenharmony_ci EPEL_COMPUTE %2, %1, m4, m5 848cabdff1aSopenharmony_ci UNI_COMPUTE %1, %2, m0, m1, m6 849cabdff1aSopenharmony_ci PEL_%2STORE%1 dstq, m0, m1 850cabdff1aSopenharmony_ci add dstq, dststrideq ; dst += dststride 851cabdff1aSopenharmony_ci add srcq, srcstrideq ; src += srcstride 852cabdff1aSopenharmony_ci dec heightd ; cmp height 853cabdff1aSopenharmony_ci jnz .loop ; height loop 854cabdff1aSopenharmony_ci RET 855cabdff1aSopenharmony_ci 856cabdff1aSopenharmony_ci 857cabdff1aSopenharmony_cicglobal hevc_put_hevc_bi_epel_v%1_%2, 6, 8, XMM_REGS, dst, dststride, src, srcstride, src2, height, r3src, my 858cabdff1aSopenharmony_ci movifnidn myd, mym 859cabdff1aSopenharmony_ci movdqa m6, [pw_bi_%2] 860cabdff1aSopenharmony_ci sub srcq, srcstrideq 861cabdff1aSopenharmony_ci EPEL_FILTER %2, my, m4, m5, r3src 862cabdff1aSopenharmony_ci lea r3srcq, [srcstrideq*3] 863cabdff1aSopenharmony_ci.loop: 864cabdff1aSopenharmony_ci EPEL_LOAD %2, srcq, srcstride, %1 865cabdff1aSopenharmony_ci EPEL_COMPUTE %2, %1, m4, m5, 1 866cabdff1aSopenharmony_ci SIMPLE_BILOAD %1, src2q, m2, m3 867cabdff1aSopenharmony_ci BI_COMPUTE %1, %2, m0, m1, m2, m3, m6, 1 868cabdff1aSopenharmony_ci 
PEL_%2STORE%1 dstq, m0, m1 869cabdff1aSopenharmony_ci add dstq, dststrideq ; dst += dststride 870cabdff1aSopenharmony_ci add srcq, srcstrideq ; src += srcstride 871cabdff1aSopenharmony_ci add src2q, 2*MAX_PB_SIZE ; src += srcstride 872cabdff1aSopenharmony_ci dec heightd ; cmp height 873cabdff1aSopenharmony_ci jnz .loop ; height loop 874cabdff1aSopenharmony_ci RET 875cabdff1aSopenharmony_ci%endmacro 876cabdff1aSopenharmony_ci 877cabdff1aSopenharmony_ci 878cabdff1aSopenharmony_ci; ****************************** 879cabdff1aSopenharmony_ci; void put_hevc_epel_hv(int16_t *dst, ptrdiff_t dststride, 880cabdff1aSopenharmony_ci; uint8_t *_src, ptrdiff_t _srcstride, 881cabdff1aSopenharmony_ci; int height, int mx, int my, int width) 882cabdff1aSopenharmony_ci; ****************************** 883cabdff1aSopenharmony_ci 884cabdff1aSopenharmony_ci%macro HEVC_PUT_HEVC_EPEL_HV 2 885cabdff1aSopenharmony_cicglobal hevc_put_hevc_epel_hv%1_%2, 6, 7, 16 , dst, src, srcstride, height, mx, my, r3src 886cabdff1aSopenharmony_ci%assign %%stride ((%2 + 7)/8) 887cabdff1aSopenharmony_ci sub srcq, srcstrideq 888cabdff1aSopenharmony_ci EPEL_HV_FILTER %2 889cabdff1aSopenharmony_ci EPEL_LOAD %2, srcq-%%stride, %%stride, %1 890cabdff1aSopenharmony_ci EPEL_COMPUTE %2, %1, m14, m15 891cabdff1aSopenharmony_ci%if (%1 > 8 && (%2 == 8)) 892cabdff1aSopenharmony_ci SWAP m8, m1 893cabdff1aSopenharmony_ci%endif 894cabdff1aSopenharmony_ci SWAP m4, m0 895cabdff1aSopenharmony_ci add srcq, srcstrideq 896cabdff1aSopenharmony_ci EPEL_LOAD %2, srcq-%%stride, %%stride, %1 897cabdff1aSopenharmony_ci EPEL_COMPUTE %2, %1, m14, m15 898cabdff1aSopenharmony_ci%if (%1 > 8 && (%2 == 8)) 899cabdff1aSopenharmony_ci SWAP m9, m1 900cabdff1aSopenharmony_ci%endif 901cabdff1aSopenharmony_ci SWAP m5, m0 902cabdff1aSopenharmony_ci add srcq, srcstrideq 903cabdff1aSopenharmony_ci EPEL_LOAD %2, srcq-%%stride, %%stride, %1 904cabdff1aSopenharmony_ci EPEL_COMPUTE %2, %1, m14, m15 905cabdff1aSopenharmony_ci%if (%1 > 8 && (%2 == 8)) 
906cabdff1aSopenharmony_ci SWAP m10, m1 907cabdff1aSopenharmony_ci%endif 908cabdff1aSopenharmony_ci SWAP m6, m0 909cabdff1aSopenharmony_ci add srcq, srcstrideq 910cabdff1aSopenharmony_ci.loop: 911cabdff1aSopenharmony_ci EPEL_LOAD %2, srcq-%%stride, %%stride, %1 912cabdff1aSopenharmony_ci EPEL_COMPUTE %2, %1, m14, m15 913cabdff1aSopenharmony_ci%if (%1 > 8 && (%2 == 8)) 914cabdff1aSopenharmony_ci SWAP m11, m1 915cabdff1aSopenharmony_ci%endif 916cabdff1aSopenharmony_ci SWAP m7, m0 917cabdff1aSopenharmony_ci punpcklwd m0, m4, m5 918cabdff1aSopenharmony_ci punpcklwd m2, m6, m7 919cabdff1aSopenharmony_ci%if %1 > 4 920cabdff1aSopenharmony_ci punpckhwd m1, m4, m5 921cabdff1aSopenharmony_ci punpckhwd m3, m6, m7 922cabdff1aSopenharmony_ci%endif 923cabdff1aSopenharmony_ci EPEL_COMPUTE 14, %1, m12, m13 924cabdff1aSopenharmony_ci%if (%1 > 8 && (%2 == 8)) 925cabdff1aSopenharmony_ci punpcklwd m4, m8, m9 926cabdff1aSopenharmony_ci punpcklwd m2, m10, m11 927cabdff1aSopenharmony_ci punpckhwd m8, m8, m9 928cabdff1aSopenharmony_ci punpckhwd m3, m10, m11 929cabdff1aSopenharmony_ci EPEL_COMPUTE 14, %1, m12, m13, m4, m2, m8, m3 930cabdff1aSopenharmony_ci%if cpuflag(avx2) 931cabdff1aSopenharmony_ci vinserti128 m2, m0, xm4, 1 932cabdff1aSopenharmony_ci vperm2i128 m3, m0, m4, q0301 933cabdff1aSopenharmony_ci PEL_10STORE%1 dstq, m2, m3 934cabdff1aSopenharmony_ci%else 935cabdff1aSopenharmony_ci PEL_10STORE%1 dstq, m0, m4 936cabdff1aSopenharmony_ci%endif 937cabdff1aSopenharmony_ci%else 938cabdff1aSopenharmony_ci PEL_10STORE%1 dstq, m0, m1 939cabdff1aSopenharmony_ci%endif 940cabdff1aSopenharmony_ci movdqa m4, m5 941cabdff1aSopenharmony_ci movdqa m5, m6 942cabdff1aSopenharmony_ci movdqa m6, m7 943cabdff1aSopenharmony_ci%if (%1 > 8 && (%2 == 8)) 944cabdff1aSopenharmony_ci mova m8, m9 945cabdff1aSopenharmony_ci mova m9, m10 946cabdff1aSopenharmony_ci mova m10, m11 947cabdff1aSopenharmony_ci%endif 948cabdff1aSopenharmony_ci LOOP_END dst, src, srcstride 949cabdff1aSopenharmony_ci RET 
950cabdff1aSopenharmony_ci 951cabdff1aSopenharmony_cicglobal hevc_put_hevc_uni_epel_hv%1_%2, 7, 8, 16 , dst, dststride, src, srcstride, height, mx, my, r3src 952cabdff1aSopenharmony_ci%assign %%stride ((%2 + 7)/8) 953cabdff1aSopenharmony_ci sub srcq, srcstrideq 954cabdff1aSopenharmony_ci EPEL_HV_FILTER %2 955cabdff1aSopenharmony_ci EPEL_LOAD %2, srcq-%%stride, %%stride, %1 956cabdff1aSopenharmony_ci EPEL_COMPUTE %2, %1, m14, m15 957cabdff1aSopenharmony_ci%if (%1 > 8 && (%2 == 8)) 958cabdff1aSopenharmony_ci SWAP m8, m1 959cabdff1aSopenharmony_ci%endif 960cabdff1aSopenharmony_ci SWAP m4, m0 961cabdff1aSopenharmony_ci add srcq, srcstrideq 962cabdff1aSopenharmony_ci EPEL_LOAD %2, srcq-%%stride, %%stride, %1 963cabdff1aSopenharmony_ci EPEL_COMPUTE %2, %1, m14, m15 964cabdff1aSopenharmony_ci%if (%1 > 8 && (%2 == 8)) 965cabdff1aSopenharmony_ci SWAP m9, m1 966cabdff1aSopenharmony_ci%endif 967cabdff1aSopenharmony_ci SWAP m5, m0 968cabdff1aSopenharmony_ci add srcq, srcstrideq 969cabdff1aSopenharmony_ci EPEL_LOAD %2, srcq-%%stride, %%stride, %1 970cabdff1aSopenharmony_ci EPEL_COMPUTE %2, %1, m14, m15 971cabdff1aSopenharmony_ci%if (%1 > 8 && (%2 == 8)) 972cabdff1aSopenharmony_ci SWAP m10, m1 973cabdff1aSopenharmony_ci%endif 974cabdff1aSopenharmony_ci SWAP m6, m0 975cabdff1aSopenharmony_ci add srcq, srcstrideq 976cabdff1aSopenharmony_ci.loop: 977cabdff1aSopenharmony_ci EPEL_LOAD %2, srcq-%%stride, %%stride, %1 978cabdff1aSopenharmony_ci EPEL_COMPUTE %2, %1, m14, m15 979cabdff1aSopenharmony_ci%if (%1 > 8 && (%2 == 8)) 980cabdff1aSopenharmony_ci SWAP m11, m1 981cabdff1aSopenharmony_ci%endif 982cabdff1aSopenharmony_ci mova m7, m0 983cabdff1aSopenharmony_ci punpcklwd m0, m4, m5 984cabdff1aSopenharmony_ci punpcklwd m2, m6, m7 985cabdff1aSopenharmony_ci%if %1 > 4 986cabdff1aSopenharmony_ci punpckhwd m1, m4, m5 987cabdff1aSopenharmony_ci punpckhwd m3, m6, m7 988cabdff1aSopenharmony_ci%endif 989cabdff1aSopenharmony_ci EPEL_COMPUTE 14, %1, m12, m13 990cabdff1aSopenharmony_ci%if (%1 > 8 
&& (%2 == 8)) 991cabdff1aSopenharmony_ci punpcklwd m4, m8, m9 992cabdff1aSopenharmony_ci punpcklwd m2, m10, m11 993cabdff1aSopenharmony_ci punpckhwd m8, m8, m9 994cabdff1aSopenharmony_ci punpckhwd m3, m10, m11 995cabdff1aSopenharmony_ci EPEL_COMPUTE 14, %1, m12, m13, m4, m2, m8, m3 996cabdff1aSopenharmony_ci UNI_COMPUTE %1, %2, m0, m4, [pw_%2] 997cabdff1aSopenharmony_ci%else 998cabdff1aSopenharmony_ci UNI_COMPUTE %1, %2, m0, m1, [pw_%2] 999cabdff1aSopenharmony_ci%endif 1000cabdff1aSopenharmony_ci PEL_%2STORE%1 dstq, m0, m1 1001cabdff1aSopenharmony_ci mova m4, m5 1002cabdff1aSopenharmony_ci mova m5, m6 1003cabdff1aSopenharmony_ci mova m6, m7 1004cabdff1aSopenharmony_ci%if (%1 > 8 && (%2 == 8)) 1005cabdff1aSopenharmony_ci mova m8, m9 1006cabdff1aSopenharmony_ci mova m9, m10 1007cabdff1aSopenharmony_ci mova m10, m11 1008cabdff1aSopenharmony_ci%endif 1009cabdff1aSopenharmony_ci add dstq, dststrideq ; dst += dststride 1010cabdff1aSopenharmony_ci add srcq, srcstrideq ; src += srcstride 1011cabdff1aSopenharmony_ci dec heightd ; cmp height 1012cabdff1aSopenharmony_ci jnz .loop ; height loop 1013cabdff1aSopenharmony_ci RET 1014cabdff1aSopenharmony_ci 1015cabdff1aSopenharmony_cicglobal hevc_put_hevc_bi_epel_hv%1_%2, 8, 9, 16, dst, dststride, src, srcstride, src2, height, mx, my, r3src 1016cabdff1aSopenharmony_ci%assign %%stride ((%2 + 7)/8) 1017cabdff1aSopenharmony_ci sub srcq, srcstrideq 1018cabdff1aSopenharmony_ci EPEL_HV_FILTER %2 1019cabdff1aSopenharmony_ci EPEL_LOAD %2, srcq-%%stride, %%stride, %1 1020cabdff1aSopenharmony_ci EPEL_COMPUTE %2, %1, m14, m15 1021cabdff1aSopenharmony_ci%if (%1 > 8 && (%2 == 8)) 1022cabdff1aSopenharmony_ci SWAP m8, m1 1023cabdff1aSopenharmony_ci%endif 1024cabdff1aSopenharmony_ci SWAP m4, m0 1025cabdff1aSopenharmony_ci add srcq, srcstrideq 1026cabdff1aSopenharmony_ci EPEL_LOAD %2, srcq-%%stride, %%stride, %1 1027cabdff1aSopenharmony_ci EPEL_COMPUTE %2, %1, m14, m15 1028cabdff1aSopenharmony_ci%if (%1 > 8 && (%2 == 8)) 
1029cabdff1aSopenharmony_ci SWAP m9, m1 1030cabdff1aSopenharmony_ci%endif 1031cabdff1aSopenharmony_ci SWAP m5, m0 1032cabdff1aSopenharmony_ci add srcq, srcstrideq 1033cabdff1aSopenharmony_ci EPEL_LOAD %2, srcq-%%stride, %%stride, %1 1034cabdff1aSopenharmony_ci EPEL_COMPUTE %2, %1, m14, m15 1035cabdff1aSopenharmony_ci%if (%1 > 8 && (%2 == 8)) 1036cabdff1aSopenharmony_ci SWAP m10, m1 1037cabdff1aSopenharmony_ci%endif 1038cabdff1aSopenharmony_ci SWAP m6, m0 1039cabdff1aSopenharmony_ci add srcq, srcstrideq 1040cabdff1aSopenharmony_ci.loop: 1041cabdff1aSopenharmony_ci EPEL_LOAD %2, srcq-%%stride, %%stride, %1 1042cabdff1aSopenharmony_ci EPEL_COMPUTE %2, %1, m14, m15 1043cabdff1aSopenharmony_ci%if (%1 > 8 && (%2 == 8)) 1044cabdff1aSopenharmony_ci SWAP m11, m1 1045cabdff1aSopenharmony_ci%endif 1046cabdff1aSopenharmony_ci SWAP m7, m0 1047cabdff1aSopenharmony_ci punpcklwd m0, m4, m5 1048cabdff1aSopenharmony_ci punpcklwd m2, m6, m7 1049cabdff1aSopenharmony_ci%if %1 > 4 1050cabdff1aSopenharmony_ci punpckhwd m1, m4, m5 1051cabdff1aSopenharmony_ci punpckhwd m3, m6, m7 1052cabdff1aSopenharmony_ci%endif 1053cabdff1aSopenharmony_ci EPEL_COMPUTE 14, %1, m12, m13 1054cabdff1aSopenharmony_ci%if (%1 > 8 && (%2 == 8)) 1055cabdff1aSopenharmony_ci punpcklwd m4, m8, m9 1056cabdff1aSopenharmony_ci punpcklwd m2, m10, m11 1057cabdff1aSopenharmony_ci punpckhwd m8, m8, m9 1058cabdff1aSopenharmony_ci punpckhwd m3, m10, m11 1059cabdff1aSopenharmony_ci EPEL_COMPUTE 14, %1, m12, m13, m4, m2, m8, m3 1060cabdff1aSopenharmony_ci SIMPLE_BILOAD %1, src2q, m8, m3 1061cabdff1aSopenharmony_ci%if cpuflag(avx2) 1062cabdff1aSopenharmony_ci vinserti128 m1, m8, xm3, 1 1063cabdff1aSopenharmony_ci vperm2i128 m2, m8, m3, q0301 1064cabdff1aSopenharmony_ci BI_COMPUTE %1, %2, m0, m4, m1, m2, [pw_bi_%2] 1065cabdff1aSopenharmony_ci%else 1066cabdff1aSopenharmony_ci BI_COMPUTE %1, %2, m0, m4, m8, m3, [pw_bi_%2] 1067cabdff1aSopenharmony_ci%endif 1068cabdff1aSopenharmony_ci%else 1069cabdff1aSopenharmony_ci SIMPLE_BILOAD 
%1, src2q, m8, m9 1070cabdff1aSopenharmony_ci BI_COMPUTE %1, %2, m0, m1, m8, m9, [pw_bi_%2] 1071cabdff1aSopenharmony_ci%endif 1072cabdff1aSopenharmony_ci PEL_%2STORE%1 dstq, m0, m4 1073cabdff1aSopenharmony_ci mova m4, m5 1074cabdff1aSopenharmony_ci mova m5, m6 1075cabdff1aSopenharmony_ci mova m6, m7 1076cabdff1aSopenharmony_ci%if (%1 > 8 && (%2 == 8)) 1077cabdff1aSopenharmony_ci mova m8, m9 1078cabdff1aSopenharmony_ci mova m9, m10 1079cabdff1aSopenharmony_ci mova m10, m11 1080cabdff1aSopenharmony_ci%endif 1081cabdff1aSopenharmony_ci add dstq, dststrideq ; dst += dststride 1082cabdff1aSopenharmony_ci add srcq, srcstrideq ; src += srcstride 1083cabdff1aSopenharmony_ci add src2q, 2*MAX_PB_SIZE ; src += srcstride 1084cabdff1aSopenharmony_ci dec heightd ; cmp height 1085cabdff1aSopenharmony_ci jnz .loop ; height loop 1086cabdff1aSopenharmony_ci RET 1087cabdff1aSopenharmony_ci%endmacro 1088cabdff1aSopenharmony_ci 1089cabdff1aSopenharmony_ci; ****************************** 1090cabdff1aSopenharmony_ci; void put_hevc_qpel_hX_X_X(int16_t *dst, ptrdiff_t dststride, 1091cabdff1aSopenharmony_ci; uint8_t *_src, ptrdiff_t _srcstride, 1092cabdff1aSopenharmony_ci; int height, int mx, int my, int width) 1093cabdff1aSopenharmony_ci; ****************************** 1094cabdff1aSopenharmony_ci 1095cabdff1aSopenharmony_ci%macro HEVC_PUT_HEVC_QPEL 2 1096cabdff1aSopenharmony_cicglobal hevc_put_hevc_qpel_h%1_%2, 5, 6, 16, dst, src, srcstride, height, mx, rfilter 1097cabdff1aSopenharmony_ci QPEL_FILTER %2, mx 1098cabdff1aSopenharmony_ci.loop: 1099cabdff1aSopenharmony_ci QPEL_H_LOAD %2, srcq, %1, 10 1100cabdff1aSopenharmony_ci QPEL_COMPUTE %1, %2, 1 1101cabdff1aSopenharmony_ci%if %2 > 8 1102cabdff1aSopenharmony_ci packssdw m0, m1 1103cabdff1aSopenharmony_ci%endif 1104cabdff1aSopenharmony_ci PEL_10STORE%1 dstq, m0, m1 1105cabdff1aSopenharmony_ci LOOP_END dst, src, srcstride 1106cabdff1aSopenharmony_ci RET 1107cabdff1aSopenharmony_ci 1108cabdff1aSopenharmony_cicglobal 
hevc_put_hevc_uni_qpel_h%1_%2, 6, 7, 16 , dst, dststride, src, srcstride, height, mx, rfilter 1109cabdff1aSopenharmony_ci mova m9, [pw_%2] 1110cabdff1aSopenharmony_ci QPEL_FILTER %2, mx 1111cabdff1aSopenharmony_ci.loop: 1112cabdff1aSopenharmony_ci QPEL_H_LOAD %2, srcq, %1, 10 1113cabdff1aSopenharmony_ci QPEL_COMPUTE %1, %2 1114cabdff1aSopenharmony_ci%if %2 > 8 1115cabdff1aSopenharmony_ci packssdw m0, m1 1116cabdff1aSopenharmony_ci%endif 1117cabdff1aSopenharmony_ci UNI_COMPUTE %1, %2, m0, m1, m9 1118cabdff1aSopenharmony_ci PEL_%2STORE%1 dstq, m0, m1 1119cabdff1aSopenharmony_ci add dstq, dststrideq ; dst += dststride 1120cabdff1aSopenharmony_ci add srcq, srcstrideq ; src += srcstride 1121cabdff1aSopenharmony_ci dec heightd ; cmp height 1122cabdff1aSopenharmony_ci jnz .loop ; height loop 1123cabdff1aSopenharmony_ci RET 1124cabdff1aSopenharmony_ci 1125cabdff1aSopenharmony_cicglobal hevc_put_hevc_bi_qpel_h%1_%2, 7, 8, 16 , dst, dststride, src, srcstride, src2, height, mx, rfilter 1126cabdff1aSopenharmony_ci movdqa m9, [pw_bi_%2] 1127cabdff1aSopenharmony_ci QPEL_FILTER %2, mx 1128cabdff1aSopenharmony_ci.loop: 1129cabdff1aSopenharmony_ci QPEL_H_LOAD %2, srcq, %1, 10 1130cabdff1aSopenharmony_ci QPEL_COMPUTE %1, %2, 1 1131cabdff1aSopenharmony_ci%if %2 > 8 1132cabdff1aSopenharmony_ci packssdw m0, m1 1133cabdff1aSopenharmony_ci%endif 1134cabdff1aSopenharmony_ci SIMPLE_BILOAD %1, src2q, m10, m11 1135cabdff1aSopenharmony_ci BI_COMPUTE %1, %2, m0, m1, m10, m11, m9, 1 1136cabdff1aSopenharmony_ci PEL_%2STORE%1 dstq, m0, m1 1137cabdff1aSopenharmony_ci add dstq, dststrideq ; dst += dststride 1138cabdff1aSopenharmony_ci add srcq, srcstrideq ; src += srcstride 1139cabdff1aSopenharmony_ci add src2q, 2*MAX_PB_SIZE ; src += srcstride 1140cabdff1aSopenharmony_ci dec heightd ; cmp height 1141cabdff1aSopenharmony_ci jnz .loop ; height loop 1142cabdff1aSopenharmony_ci RET 1143cabdff1aSopenharmony_ci 1144cabdff1aSopenharmony_ci 1145cabdff1aSopenharmony_ci; ****************************** 
1146cabdff1aSopenharmony_ci; void put_hevc_qpel_vX_X_X(int16_t *dst, ptrdiff_t dststride, 1147cabdff1aSopenharmony_ci; uint8_t *_src, ptrdiff_t _srcstride, 1148cabdff1aSopenharmony_ci; int height, int mx, int my, int width) 1149cabdff1aSopenharmony_ci; ****************************** 1150cabdff1aSopenharmony_ci 1151cabdff1aSopenharmony_cicglobal hevc_put_hevc_qpel_v%1_%2, 4, 8, 16, dst, src, srcstride, height, r3src, my, rfilter 1152cabdff1aSopenharmony_ci movifnidn myd, mym 1153cabdff1aSopenharmony_ci lea r3srcq, [srcstrideq*3] 1154cabdff1aSopenharmony_ci QPEL_FILTER %2, my 1155cabdff1aSopenharmony_ci.loop: 1156cabdff1aSopenharmony_ci QPEL_V_LOAD %2, srcq, srcstride, %1, r7 1157cabdff1aSopenharmony_ci QPEL_COMPUTE %1, %2, 1 1158cabdff1aSopenharmony_ci%if %2 > 8 1159cabdff1aSopenharmony_ci packssdw m0, m1 1160cabdff1aSopenharmony_ci%endif 1161cabdff1aSopenharmony_ci PEL_10STORE%1 dstq, m0, m1 1162cabdff1aSopenharmony_ci LOOP_END dst, src, srcstride 1163cabdff1aSopenharmony_ci RET 1164cabdff1aSopenharmony_ci 1165cabdff1aSopenharmony_cicglobal hevc_put_hevc_uni_qpel_v%1_%2, 5, 9, 16, dst, dststride, src, srcstride, height, r3src, my, rfilter 1166cabdff1aSopenharmony_ci movifnidn myd, mym 1167cabdff1aSopenharmony_ci movdqa m9, [pw_%2] 1168cabdff1aSopenharmony_ci lea r3srcq, [srcstrideq*3] 1169cabdff1aSopenharmony_ci QPEL_FILTER %2, my 1170cabdff1aSopenharmony_ci.loop: 1171cabdff1aSopenharmony_ci QPEL_V_LOAD %2, srcq, srcstride, %1, r8 1172cabdff1aSopenharmony_ci QPEL_COMPUTE %1, %2 1173cabdff1aSopenharmony_ci%if %2 > 8 1174cabdff1aSopenharmony_ci packssdw m0, m1 1175cabdff1aSopenharmony_ci%endif 1176cabdff1aSopenharmony_ci UNI_COMPUTE %1, %2, m0, m1, m9 1177cabdff1aSopenharmony_ci PEL_%2STORE%1 dstq, m0, m1 1178cabdff1aSopenharmony_ci add dstq, dststrideq ; dst += dststride 1179cabdff1aSopenharmony_ci add srcq, srcstrideq ; src += srcstride 1180cabdff1aSopenharmony_ci dec heightd ; cmp height 1181cabdff1aSopenharmony_ci jnz .loop ; height loop 
1182cabdff1aSopenharmony_ci RET 1183cabdff1aSopenharmony_ci 1184cabdff1aSopenharmony_cicglobal hevc_put_hevc_bi_qpel_v%1_%2, 6, 10, 16, dst, dststride, src, srcstride, src2, height, r3src, my, rfilter 1185cabdff1aSopenharmony_ci movifnidn myd, mym 1186cabdff1aSopenharmony_ci movdqa m9, [pw_bi_%2] 1187cabdff1aSopenharmony_ci lea r3srcq, [srcstrideq*3] 1188cabdff1aSopenharmony_ci QPEL_FILTER %2, my 1189cabdff1aSopenharmony_ci.loop: 1190cabdff1aSopenharmony_ci QPEL_V_LOAD %2, srcq, srcstride, %1, r9 1191cabdff1aSopenharmony_ci QPEL_COMPUTE %1, %2, 1 1192cabdff1aSopenharmony_ci%if %2 > 8 1193cabdff1aSopenharmony_ci packssdw m0, m1 1194cabdff1aSopenharmony_ci%endif 1195cabdff1aSopenharmony_ci SIMPLE_BILOAD %1, src2q, m10, m11 1196cabdff1aSopenharmony_ci BI_COMPUTE %1, %2, m0, m1, m10, m11, m9, 1 1197cabdff1aSopenharmony_ci PEL_%2STORE%1 dstq, m0, m1 1198cabdff1aSopenharmony_ci add dstq, dststrideq ; dst += dststride 1199cabdff1aSopenharmony_ci add srcq, srcstrideq ; src += srcstride 1200cabdff1aSopenharmony_ci add src2q, 2*MAX_PB_SIZE ; src += srcstride 1201cabdff1aSopenharmony_ci dec heightd ; cmp height 1202cabdff1aSopenharmony_ci jnz .loop ; height loop 1203cabdff1aSopenharmony_ci RET 1204cabdff1aSopenharmony_ci%endmacro 1205cabdff1aSopenharmony_ci 1206cabdff1aSopenharmony_ci 1207cabdff1aSopenharmony_ci; ****************************** 1208cabdff1aSopenharmony_ci; void put_hevc_qpel_hvX_X(int16_t *dst, ptrdiff_t dststride, 1209cabdff1aSopenharmony_ci; uint8_t *_src, ptrdiff_t _srcstride, 1210cabdff1aSopenharmony_ci; int height, int mx, int my) 1211cabdff1aSopenharmony_ci; ****************************** 1212cabdff1aSopenharmony_ci%macro HEVC_PUT_HEVC_QPEL_HV 2 1213cabdff1aSopenharmony_cicglobal hevc_put_hevc_qpel_hv%1_%2, 6, 8, 16, dst, src, srcstride, height, mx, my, r3src, rfilter 1214cabdff1aSopenharmony_ci%if cpuflag(avx2) 1215cabdff1aSopenharmony_ci%assign %%shift 4 1216cabdff1aSopenharmony_ci%else 1217cabdff1aSopenharmony_ci%assign %%shift 3 
1218cabdff1aSopenharmony_ci%endif 1219cabdff1aSopenharmony_ci sub mxq, 1 1220cabdff1aSopenharmony_ci sub myq, 1 1221cabdff1aSopenharmony_ci shl mxq, %%shift ; multiply by 32 1222cabdff1aSopenharmony_ci shl myq, %%shift ; multiply by 32 1223cabdff1aSopenharmony_ci lea r3srcq, [srcstrideq*3] 1224cabdff1aSopenharmony_ci sub srcq, r3srcq 1225cabdff1aSopenharmony_ci QPEL_H_LOAD %2, srcq, %1, 15 1226cabdff1aSopenharmony_ci QPEL_HV_COMPUTE %1, %2, mx, ackssdw 1227cabdff1aSopenharmony_ci SWAP m8, m0 1228cabdff1aSopenharmony_ci add srcq, srcstrideq 1229cabdff1aSopenharmony_ci QPEL_H_LOAD %2, srcq, %1, 15 1230cabdff1aSopenharmony_ci QPEL_HV_COMPUTE %1, %2, mx, ackssdw 1231cabdff1aSopenharmony_ci SWAP m9, m0 1232cabdff1aSopenharmony_ci add srcq, srcstrideq 1233cabdff1aSopenharmony_ci QPEL_H_LOAD %2, srcq, %1, 15 1234cabdff1aSopenharmony_ci QPEL_HV_COMPUTE %1, %2, mx, ackssdw 1235cabdff1aSopenharmony_ci SWAP m10, m0 1236cabdff1aSopenharmony_ci add srcq, srcstrideq 1237cabdff1aSopenharmony_ci QPEL_H_LOAD %2, srcq, %1, 15 1238cabdff1aSopenharmony_ci QPEL_HV_COMPUTE %1, %2, mx, ackssdw 1239cabdff1aSopenharmony_ci SWAP m11, m0 1240cabdff1aSopenharmony_ci add srcq, srcstrideq 1241cabdff1aSopenharmony_ci QPEL_H_LOAD %2, srcq, %1, 15 1242cabdff1aSopenharmony_ci QPEL_HV_COMPUTE %1, %2, mx, ackssdw 1243cabdff1aSopenharmony_ci SWAP m12, m0 1244cabdff1aSopenharmony_ci add srcq, srcstrideq 1245cabdff1aSopenharmony_ci QPEL_H_LOAD %2, srcq, %1, 15 1246cabdff1aSopenharmony_ci QPEL_HV_COMPUTE %1, %2, mx, ackssdw 1247cabdff1aSopenharmony_ci SWAP m13, m0 1248cabdff1aSopenharmony_ci add srcq, srcstrideq 1249cabdff1aSopenharmony_ci QPEL_H_LOAD %2, srcq, %1, 15 1250cabdff1aSopenharmony_ci QPEL_HV_COMPUTE %1, %2, mx, ackssdw 1251cabdff1aSopenharmony_ci SWAP m14, m0 1252cabdff1aSopenharmony_ci add srcq, srcstrideq 1253cabdff1aSopenharmony_ci.loop: 1254cabdff1aSopenharmony_ci QPEL_H_LOAD %2, srcq, %1, 15 1255cabdff1aSopenharmony_ci QPEL_HV_COMPUTE %1, %2, mx, ackssdw 1256cabdff1aSopenharmony_ci SWAP 
m15, m0 1257cabdff1aSopenharmony_ci punpcklwd m0, m8, m9 1258cabdff1aSopenharmony_ci punpcklwd m2, m10, m11 1259cabdff1aSopenharmony_ci punpcklwd m4, m12, m13 1260cabdff1aSopenharmony_ci punpcklwd m6, m14, m15 1261cabdff1aSopenharmony_ci%if %1 > 4 1262cabdff1aSopenharmony_ci punpckhwd m1, m8, m9 1263cabdff1aSopenharmony_ci punpckhwd m3, m10, m11 1264cabdff1aSopenharmony_ci punpckhwd m5, m12, m13 1265cabdff1aSopenharmony_ci punpckhwd m7, m14, m15 1266cabdff1aSopenharmony_ci%endif 1267cabdff1aSopenharmony_ci QPEL_HV_COMPUTE %1, 14, my, ackssdw 1268cabdff1aSopenharmony_ci PEL_10STORE%1 dstq, m0, m1 1269cabdff1aSopenharmony_ci%if %1 <= 4 1270cabdff1aSopenharmony_ci movq m8, m9 1271cabdff1aSopenharmony_ci movq m9, m10 1272cabdff1aSopenharmony_ci movq m10, m11 1273cabdff1aSopenharmony_ci movq m11, m12 1274cabdff1aSopenharmony_ci movq m12, m13 1275cabdff1aSopenharmony_ci movq m13, m14 1276cabdff1aSopenharmony_ci movq m14, m15 1277cabdff1aSopenharmony_ci%else 1278cabdff1aSopenharmony_ci movdqa m8, m9 1279cabdff1aSopenharmony_ci movdqa m9, m10 1280cabdff1aSopenharmony_ci movdqa m10, m11 1281cabdff1aSopenharmony_ci movdqa m11, m12 1282cabdff1aSopenharmony_ci movdqa m12, m13 1283cabdff1aSopenharmony_ci movdqa m13, m14 1284cabdff1aSopenharmony_ci movdqa m14, m15 1285cabdff1aSopenharmony_ci%endif 1286cabdff1aSopenharmony_ci LOOP_END dst, src, srcstride 1287cabdff1aSopenharmony_ci RET 1288cabdff1aSopenharmony_ci 1289cabdff1aSopenharmony_cicglobal hevc_put_hevc_uni_qpel_hv%1_%2, 7, 9, 16 , dst, dststride, src, srcstride, height, mx, my, r3src, rfilter 1290cabdff1aSopenharmony_ci%if cpuflag(avx2) 1291cabdff1aSopenharmony_ci%assign %%shift 4 1292cabdff1aSopenharmony_ci%else 1293cabdff1aSopenharmony_ci%assign %%shift 3 1294cabdff1aSopenharmony_ci%endif 1295cabdff1aSopenharmony_ci sub mxq, 1 1296cabdff1aSopenharmony_ci sub myq, 1 1297cabdff1aSopenharmony_ci shl mxq, %%shift ; multiply by 32 1298cabdff1aSopenharmony_ci shl myq, %%shift ; multiply by 32 1299cabdff1aSopenharmony_ci lea 
r3srcq, [srcstrideq*3] 1300cabdff1aSopenharmony_ci sub srcq, r3srcq 1301cabdff1aSopenharmony_ci QPEL_H_LOAD %2, srcq, %1, 15 1302cabdff1aSopenharmony_ci QPEL_HV_COMPUTE %1, %2, mx, ackssdw 1303cabdff1aSopenharmony_ci SWAP m8, m0 1304cabdff1aSopenharmony_ci add srcq, srcstrideq 1305cabdff1aSopenharmony_ci QPEL_H_LOAD %2, srcq, %1, 15 1306cabdff1aSopenharmony_ci QPEL_HV_COMPUTE %1, %2, mx, ackssdw 1307cabdff1aSopenharmony_ci SWAP m9, m0 1308cabdff1aSopenharmony_ci add srcq, srcstrideq 1309cabdff1aSopenharmony_ci QPEL_H_LOAD %2, srcq, %1, 15 1310cabdff1aSopenharmony_ci QPEL_HV_COMPUTE %1, %2, mx, ackssdw 1311cabdff1aSopenharmony_ci SWAP m10, m0 1312cabdff1aSopenharmony_ci add srcq, srcstrideq 1313cabdff1aSopenharmony_ci QPEL_H_LOAD %2, srcq, %1, 15 1314cabdff1aSopenharmony_ci QPEL_HV_COMPUTE %1, %2, mx, ackssdw 1315cabdff1aSopenharmony_ci SWAP m11, m0 1316cabdff1aSopenharmony_ci add srcq, srcstrideq 1317cabdff1aSopenharmony_ci QPEL_H_LOAD %2, srcq, %1, 15 1318cabdff1aSopenharmony_ci QPEL_HV_COMPUTE %1, %2, mx, ackssdw 1319cabdff1aSopenharmony_ci SWAP m12, m0 1320cabdff1aSopenharmony_ci add srcq, srcstrideq 1321cabdff1aSopenharmony_ci QPEL_H_LOAD %2, srcq, %1, 15 1322cabdff1aSopenharmony_ci QPEL_HV_COMPUTE %1, %2, mx, ackssdw 1323cabdff1aSopenharmony_ci SWAP m13, m0 1324cabdff1aSopenharmony_ci add srcq, srcstrideq 1325cabdff1aSopenharmony_ci QPEL_H_LOAD %2, srcq, %1, 15 1326cabdff1aSopenharmony_ci QPEL_HV_COMPUTE %1, %2, mx, ackssdw 1327cabdff1aSopenharmony_ci SWAP m14, m0 1328cabdff1aSopenharmony_ci add srcq, srcstrideq 1329cabdff1aSopenharmony_ci.loop: 1330cabdff1aSopenharmony_ci QPEL_H_LOAD %2, srcq, %1, 15 1331cabdff1aSopenharmony_ci QPEL_HV_COMPUTE %1, %2, mx, ackssdw 1332cabdff1aSopenharmony_ci SWAP m15, m0 1333cabdff1aSopenharmony_ci punpcklwd m0, m8, m9 1334cabdff1aSopenharmony_ci punpcklwd m2, m10, m11 1335cabdff1aSopenharmony_ci punpcklwd m4, m12, m13 1336cabdff1aSopenharmony_ci punpcklwd m6, m14, m15 1337cabdff1aSopenharmony_ci%if %1 > 4 
1338cabdff1aSopenharmony_ci punpckhwd m1, m8, m9 1339cabdff1aSopenharmony_ci punpckhwd m3, m10, m11 1340cabdff1aSopenharmony_ci punpckhwd m5, m12, m13 1341cabdff1aSopenharmony_ci punpckhwd m7, m14, m15 1342cabdff1aSopenharmony_ci%endif 1343cabdff1aSopenharmony_ci QPEL_HV_COMPUTE %1, 14, my, ackusdw 1344cabdff1aSopenharmony_ci UNI_COMPUTE %1, %2, m0, m1, [pw_%2] 1345cabdff1aSopenharmony_ci PEL_%2STORE%1 dstq, m0, m1 1346cabdff1aSopenharmony_ci 1347cabdff1aSopenharmony_ci%if %1 <= 4 1348cabdff1aSopenharmony_ci movq m8, m9 1349cabdff1aSopenharmony_ci movq m9, m10 1350cabdff1aSopenharmony_ci movq m10, m11 1351cabdff1aSopenharmony_ci movq m11, m12 1352cabdff1aSopenharmony_ci movq m12, m13 1353cabdff1aSopenharmony_ci movq m13, m14 1354cabdff1aSopenharmony_ci movq m14, m15 1355cabdff1aSopenharmony_ci%else 1356cabdff1aSopenharmony_ci mova m8, m9 1357cabdff1aSopenharmony_ci mova m9, m10 1358cabdff1aSopenharmony_ci mova m10, m11 1359cabdff1aSopenharmony_ci mova m11, m12 1360cabdff1aSopenharmony_ci mova m12, m13 1361cabdff1aSopenharmony_ci mova m13, m14 1362cabdff1aSopenharmony_ci mova m14, m15 1363cabdff1aSopenharmony_ci%endif 1364cabdff1aSopenharmony_ci add dstq, dststrideq ; dst += dststride 1365cabdff1aSopenharmony_ci add srcq, srcstrideq ; src += srcstride 1366cabdff1aSopenharmony_ci dec heightd ; cmp height 1367cabdff1aSopenharmony_ci jnz .loop ; height loop 1368cabdff1aSopenharmony_ci RET 1369cabdff1aSopenharmony_ci 1370cabdff1aSopenharmony_cicglobal hevc_put_hevc_bi_qpel_hv%1_%2, 8, 10, 16, dst, dststride, src, srcstride, src2, height, mx, my, r3src, rfilter 1371cabdff1aSopenharmony_ci%if cpuflag(avx2) 1372cabdff1aSopenharmony_ci%assign %%shift 4 1373cabdff1aSopenharmony_ci%else 1374cabdff1aSopenharmony_ci%assign %%shift 3 1375cabdff1aSopenharmony_ci%endif 1376cabdff1aSopenharmony_ci sub mxq, 1 1377cabdff1aSopenharmony_ci sub myq, 1 1378cabdff1aSopenharmony_ci shl mxq, %%shift ; multiply by 32 1379cabdff1aSopenharmony_ci shl myq, %%shift ; multiply by 32 
1380cabdff1aSopenharmony_ci lea r3srcq, [srcstrideq*3] 1381cabdff1aSopenharmony_ci sub srcq, r3srcq 1382cabdff1aSopenharmony_ci QPEL_H_LOAD %2, srcq, %1, 15 1383cabdff1aSopenharmony_ci QPEL_HV_COMPUTE %1, %2, mx, ackssdw 1384cabdff1aSopenharmony_ci SWAP m8, m0 1385cabdff1aSopenharmony_ci add srcq, srcstrideq 1386cabdff1aSopenharmony_ci QPEL_H_LOAD %2, srcq, %1, 15 1387cabdff1aSopenharmony_ci QPEL_HV_COMPUTE %1, %2, mx, ackssdw 1388cabdff1aSopenharmony_ci SWAP m9, m0 1389cabdff1aSopenharmony_ci add srcq, srcstrideq 1390cabdff1aSopenharmony_ci QPEL_H_LOAD %2, srcq, %1, 15 1391cabdff1aSopenharmony_ci QPEL_HV_COMPUTE %1, %2, mx, ackssdw 1392cabdff1aSopenharmony_ci SWAP m10, m0 1393cabdff1aSopenharmony_ci add srcq, srcstrideq 1394cabdff1aSopenharmony_ci QPEL_H_LOAD %2, srcq, %1, 15 1395cabdff1aSopenharmony_ci QPEL_HV_COMPUTE %1, %2, mx, ackssdw 1396cabdff1aSopenharmony_ci SWAP m11, m0 1397cabdff1aSopenharmony_ci add srcq, srcstrideq 1398cabdff1aSopenharmony_ci QPEL_H_LOAD %2, srcq, %1, 15 1399cabdff1aSopenharmony_ci QPEL_HV_COMPUTE %1, %2, mx, ackssdw 1400cabdff1aSopenharmony_ci SWAP m12, m0 1401cabdff1aSopenharmony_ci add srcq, srcstrideq 1402cabdff1aSopenharmony_ci QPEL_H_LOAD %2, srcq, %1, 15 1403cabdff1aSopenharmony_ci QPEL_HV_COMPUTE %1, %2, mx, ackssdw 1404cabdff1aSopenharmony_ci SWAP m13, m0 1405cabdff1aSopenharmony_ci add srcq, srcstrideq 1406cabdff1aSopenharmony_ci QPEL_H_LOAD %2, srcq, %1, 15 1407cabdff1aSopenharmony_ci QPEL_HV_COMPUTE %1, %2, mx, ackssdw 1408cabdff1aSopenharmony_ci SWAP m14, m0 1409cabdff1aSopenharmony_ci add srcq, srcstrideq 1410cabdff1aSopenharmony_ci.loop: 1411cabdff1aSopenharmony_ci QPEL_H_LOAD %2, srcq, %1, 15 1412cabdff1aSopenharmony_ci QPEL_HV_COMPUTE %1, %2, mx, ackssdw 1413cabdff1aSopenharmony_ci SWAP m15, m0 1414cabdff1aSopenharmony_ci punpcklwd m0, m8, m9 1415cabdff1aSopenharmony_ci punpcklwd m2, m10, m11 1416cabdff1aSopenharmony_ci punpcklwd m4, m12, m13 1417cabdff1aSopenharmony_ci punpcklwd m6, m14, m15 
%if %1 > 4
    punpckhwd        m1, m8, m9
    punpckhwd        m3, m10, m11
    punpckhwd        m5, m12, m13
    punpckhwd        m7, m14, m15
%endif
    QPEL_HV_COMPUTE  %1, 14, my, ackssdw
    SIMPLE_BILOAD    %1, src2q, m8, m9           ; m9 not used in this case
    BI_COMPUTE       %1, %2, m0, m1, m8, m9, [pw_bi_%2]
    PEL_%2STORE%1    dstq, m0, m1

    ; slide the window of filtered rows down by one (m9..m15 -> m8..m14)
%if %1 <= 4
    movq             m8, m9
    movq             m9, m10
    movq             m10, m11
    movq             m11, m12
    movq             m12, m13
    movq             m13, m14
    movq             m14, m15
%else
    movdqa           m8, m9
    movdqa           m9, m10
    movdqa           m10, m11
    movdqa           m11, m12
    movdqa           m12, m13
    movdqa           m13, m14
    movdqa           m14, m15
%endif
    add              dstq, dststrideq            ; dst  += dststride
    add              srcq, srcstrideq            ; src  += srcstride
    add              src2q, 2*MAX_PB_SIZE        ; src2 += 2*MAX_PB_SIZE bytes
    dec              heightd                     ; --height
    jnz              .loop                       ; height loop
    RET
%endmacro

; WEIGHTING_FUNCS %1, %2
;   %1: selects the load/store helpers (SIMPLE_LOAD %1, PEL_%2STORE%1) and, via
;       "%1 <= 4", the narrow (pmaddwd) or wide (pmulhw/pmullw) multiply path
;   %2: bit depth ("bitd" in the comments below)
;
; Emits two functions per invocation:
;   hevc_put_hevc_uni_w%1_%2:
;       shift = denom + 14 - bitd
;       dst   = clip(((src * wx + (1 << (shift - 1))) >> shift) + (ox << (bitd - 8)))
;   hevc_put_hevc_bi_w%1_%2:
;       shift = denom + 14 - bitd
;       off   = (((ox0 + ox1) << (bitd - 8)) + 1) << shift
;       dst   = clip((src * wx1 + src2 * wx0 + off) >> (shift + 1))
; Both step their 16-bit source pointer(s) by 2*MAX_PB_SIZE bytes per row.
; For bitd == 8 the clip is the unsigned saturation of packuswb, otherwise
; CLIPW against [pb_0] and [max_pixels_%2].
%macro WEIGHTING_FUNCS 2
%if WIN64 || ARCH_X86_32
cglobal hevc_put_hevc_uni_w%1_%2, 4, 5, 7, dst, dststride, src, height, denom, wx, ox
    mov              r4d, denomm
%define SHIFT  r4d
%else
cglobal hevc_put_hevc_uni_w%1_%2, 6, 6, 7, dst, dststride, src, height, denom, wx, ox
%define SHIFT  denomd
%endif
    lea              SHIFT, [SHIFT+14-%2]        ; shift = 14 - bitd + denom
%if %1 <= 4
    pxor             m1, m1                      ; zero, interleaved with src/wx for pmaddwd
%endif
    movd             m2, wxm                     ; WX
    movd             m4, SHIFT                   ; shift (psrad count)
%if %1 <= 4
    punpcklwd        m2, m1                      ; (wx, 0) word pairs
%else
    punpcklwd        m2, m2                      ; (wx, wx) word pairs
%endif
    dec              SHIFT
    movdqu           m5, [pd_1]
    movd             m6, SHIFT                   ; shift - 1
    pshufd           m2, m2, 0                   ; broadcast WX
    mov              SHIFT, oxm
    pslld            m5, m6                      ; rounding term: 1 << (shift - 1)
%if %2 != 8
    shl              SHIFT, %2-8                 ; ox << (bitd - 8)
%endif
    movd             m3, SHIFT                   ; OX
    pshufd           m3, m3, 0                   ; broadcast OX
%if WIN64 || ARCH_X86_32
    ; NOTE(review): the loop below counts with heightd, not SHIFT, so this
    ; reload looks unused here - confirm before removing.
    mov              SHIFT, heightm
%endif
.loop:
    SIMPLE_LOAD      %1, 10, srcq, m0
%if %1 <= 4
    punpcklwd        m0, m1                      ; (src, 0) pairs -> pmaddwd yields src * wx
    pmaddwd          m0, m2
    paddd            m0, m5
    psrad            m0, m4
    paddd            m0, m3
%else
    pmulhw           m6, m0, m2                  ; high 16 bits of src * wx
    pmullw           m0, m2                      ; low 16 bits of src * wx
    punpckhwd        m1, m0, m6                  ; 32-bit products, upper half
    punpcklwd        m0, m6                      ; 32-bit products, lower half
    paddd            m0, m5
    paddd            m1, m5
    psrad            m0, m4
    psrad            m1, m4
    paddd            m0, m3
    paddd            m1, m3
%endif
    packssdw         m0, m1
%if %2 == 8
    packuswb         m0, m0
%else
    CLIPW            m0, [pb_0], [max_pixels_%2]
%endif
    PEL_%2STORE%1    dstq, m0, m1
    add              dstq, dststrideq            ; dst += dststride
    add              srcq, 2*MAX_PB_SIZE         ; src += 2*MAX_PB_SIZE bytes
    dec              heightd                     ; --height
    jnz              .loop                       ; height loop
    RET

cglobal hevc_put_hevc_bi_w%1_%2, 4, 6, 10, dst, dststride, src, src2, height, denom, wx0, wx1, ox0, ox1
    movifnidn        r5d, denomm
%if %1 <= 4
    pxor             m1, m1                      ; zero, interleaved with src/wx for pmaddwd
%endif
    movd             m2, wx0m                    ; WX0
    lea              r5d, [r5d+14-%2]            ; shift = 14 - bitd + denom
    movd             m3, wx1m                    ; WX1
    movd             m0, r5d                     ; shift
%if %1 <= 4
    punpcklwd        m2, m1
    punpcklwd        m3, m1
%else
    punpcklwd        m2, m2
    punpcklwd        m3, m3
%endif
    inc              r5d
    movd             m5, r5d                     ; shift+1
    pshufd           m2, m2, 0                   ; broadcast WX0
    mov              r5d, ox0m
    pshufd           m3, m3, 0                   ; broadcast WX1
    add              r5d, ox1m
%if %2 != 8
    shl              r5d, %2-8                   ; ox << (bitd - 8)
%endif
    inc              r5d
    movd             m4, r5d                     ; offset
    pshufd           m4, m4, 0
%if UNIX64
%define h heightd
%else
    mov              r5d, heightm
%define h r5d
%endif
    pslld            m4, m0                      ; offset <<= shift (m0 is reused as a data register below)

.loop:
    SIMPLE_LOAD      %1, 10, srcq,  m0
    SIMPLE_LOAD      %1, 10, src2q, m8
%if %1 <= 4
    punpcklwd        m0, m1
    punpcklwd        m8, m1
    pmaddwd          m0, m3                      ; src  * wx1
    pmaddwd          m8, m2                      ; src2 * wx0
    paddd            m0, m4
    paddd            m0, m8
    psrad            m0, m5
%else
    pmulhw           m6, m0, m3                  ; src  * wx1, high words
    pmullw           m0, m3                      ; src  * wx1, low words
    pmulhw           m7, m8, m2                  ; src2 * wx0, high words
    pmullw           m8, m2                      ; src2 * wx0, low words
    punpckhwd        m1, m0, m6
    punpcklwd        m0, m6
    punpckhwd        m9, m8, m7
    punpcklwd        m8, m7
    paddd            m0, m8
    paddd            m1, m9
    paddd            m0, m4
    paddd            m1, m4
    psrad            m0, m5
    psrad            m1, m5
%endif
    packssdw         m0, m1
%if %2 == 8
    packuswb         m0, m0
%else
    CLIPW            m0, [pb_0], [max_pixels_%2]
%endif
    PEL_%2STORE%1    dstq, m0, m1
    add              dstq, dststrideq            ; dst  += dststride
    add              srcq, 2*MAX_PB_SIZE         ; src  += 2*MAX_PB_SIZE bytes
    add              src2q, 2*MAX_PB_SIZE        ; src2 += 2*MAX_PB_SIZE bytes
    dec              h                           ; --height
    jnz              .loop                       ; height loop
    RET
%endmacro

INIT_XMM sse4                                    ; adds ff_ and _sse4 to function name

WEIGHTING_FUNCS 2, 8
WEIGHTING_FUNCS 4, 8
WEIGHTING_FUNCS 6, 8
WEIGHTING_FUNCS 8, 8

WEIGHTING_FUNCS 2, 10
WEIGHTING_FUNCS 4, 10
WEIGHTING_FUNCS 6, 10
WEIGHTING_FUNCS 8, 10

WEIGHTING_FUNCS 2, 12
WEIGHTING_FUNCS 4, 12
WEIGHTING_FUNCS 6, 12
WEIGHTING_FUNCS 8, 12

HEVC_PUT_HEVC_PEL_PIXELS  2, 8
HEVC_PUT_HEVC_PEL_PIXELS  4, 8
HEVC_PUT_HEVC_PEL_PIXELS  6, 8
HEVC_PUT_HEVC_PEL_PIXELS  8, 8
HEVC_PUT_HEVC_PEL_PIXELS 12, 8
HEVC_PUT_HEVC_PEL_PIXELS 16, 8

HEVC_PUT_HEVC_PEL_PIXELS 2, 10
HEVC_PUT_HEVC_PEL_PIXELS 4, 10
HEVC_PUT_HEVC_PEL_PIXELS 6, 10
HEVC_PUT_HEVC_PEL_PIXELS 8, 10

HEVC_PUT_HEVC_PEL_PIXELS 2, 12
HEVC_PUT_HEVC_PEL_PIXELS 4, 12
HEVC_PUT_HEVC_PEL_PIXELS 6, 12
HEVC_PUT_HEVC_PEL_PIXELS 8, 12

; SSE4 instantiations (still under INIT_XMM sse4): EPEL, EPEL_HV and the
; 8/10-bit QPEL variants. The second argument is the bit depth (8, 10 or 12).
HEVC_PUT_HEVC_EPEL  2, 8
HEVC_PUT_HEVC_EPEL  4, 8
HEVC_PUT_HEVC_EPEL  6, 8
HEVC_PUT_HEVC_EPEL  8, 8
HEVC_PUT_HEVC_EPEL 12, 8
HEVC_PUT_HEVC_EPEL 16, 8


HEVC_PUT_HEVC_EPEL 2, 10
HEVC_PUT_HEVC_EPEL 4, 10
HEVC_PUT_HEVC_EPEL 6, 10
HEVC_PUT_HEVC_EPEL 8, 10

HEVC_PUT_HEVC_EPEL 2, 12
HEVC_PUT_HEVC_EPEL 4, 12
HEVC_PUT_HEVC_EPEL 6, 12
HEVC_PUT_HEVC_EPEL 8, 12

HEVC_PUT_HEVC_EPEL_HV  2, 8
HEVC_PUT_HEVC_EPEL_HV  4, 8
HEVC_PUT_HEVC_EPEL_HV  6, 8
HEVC_PUT_HEVC_EPEL_HV  8, 8
HEVC_PUT_HEVC_EPEL_HV 16, 8

HEVC_PUT_HEVC_EPEL_HV 2, 10
HEVC_PUT_HEVC_EPEL_HV 4, 10
HEVC_PUT_HEVC_EPEL_HV 6, 10
HEVC_PUT_HEVC_EPEL_HV 8, 10

HEVC_PUT_HEVC_EPEL_HV 2, 12
HEVC_PUT_HEVC_EPEL_HV 4, 12
HEVC_PUT_HEVC_EPEL_HV 6, 12
HEVC_PUT_HEVC_EPEL_HV 8, 12

HEVC_PUT_HEVC_QPEL  4, 8
HEVC_PUT_HEVC_QPEL  8, 8
HEVC_PUT_HEVC_QPEL 12, 8
HEVC_PUT_HEVC_QPEL 16, 8

HEVC_PUT_HEVC_QPEL 4, 10
HEVC_PUT_HEVC_QPEL 8, 10

HEVC_PUT_HEVC_QPEL 4, 12
HEVC_PUT_HEVC_QPEL 8, 12

HEVC_PUT_HEVC_QPEL_HV 2, 8
HEVC_PUT_HEVC_QPEL_HV 4, 8
HEVC_PUT_HEVC_QPEL_HV 6, 8
HEVC_PUT_HEVC_QPEL_HV 8, 8

HEVC_PUT_HEVC_QPEL_HV 2, 10
HEVC_PUT_HEVC_QPEL_HV 4, 10
HEVC_PUT_HEVC_QPEL_HV 6, 10
HEVC_PUT_HEVC_QPEL_HV 8, 10

HEVC_PUT_HEVC_QPEL_HV 2, 12
HEVC_PUT_HEVC_QPEL_HV 4, 12
HEVC_PUT_HEVC_QPEL_HV 6, 12
HEVC_PUT_HEVC_QPEL_HV 8, 12

%if HAVE_AVX2_EXTERNAL
; adds ff_ and _avx2 to function name & enables 256b registers :
; m0 for 256b, xm0 for 128b. cpuflag(avx2) = 1 / notcpuflag(avx) = 0
INIT_YMM avx2

HEVC_PUT_HEVC_PEL_PIXELS 32, 8
HEVC_PUT_HEVC_PEL_PIXELS 16, 10

HEVC_PUT_HEVC_EPEL 32, 8
HEVC_PUT_HEVC_EPEL 16, 10

HEVC_PUT_HEVC_EPEL_HV 16, 10
HEVC_PUT_HEVC_EPEL_HV 32, 8

HEVC_PUT_HEVC_QPEL 32, 8

HEVC_PUT_HEVC_QPEL 16, 10

HEVC_PUT_HEVC_QPEL_HV 16, 10

%endif ;AVX2
%endif ; ARCH_X86_64

; QPEL_FILTER_H %1, %2, %3, %4, %5
; Broadcast the two dwords of one horizontal filter entry.
;   %1: suffix of the coefficient table (hevc_qpel_filters_avx512icl_h_%1)
;   %2: name of the register holding the filter index; it is overwritten
;       with (index - 1) * 8, the byte offset of the 8-byte table entry
;   %3: register index receiving the dword at entry + 0
;   %4: register index receiving the dword at entry + 4
;   %5: name of a scratch register, used as table base only when PIC is defined
; Side effect: (re)defines the single-line macro FILTER.
%macro QPEL_FILTER_H 5
%define %%table hevc_qpel_filters_avx512icl_h_%1
%assign %%offset 4
    dec              %2q
    shl              %2q, 3
%ifdef PIC
    lea              %5q, [%%table]
    %define FILTER %5q
%else
    %define FILTER %%table
%endif
    vpbroadcastd     m%3, [FILTER + %2q + 0*%%offset]
    vpbroadcastd     m%4, [FILTER + %2q + 1*%%offset]
%endmacro

; QPEL_FILTER_V %1, %2, %3, %4, %5
; Broadcast one dword coefficient of a vertical filter entry.
;   %1: not referenced by the macro body
;   %2: name of the register holding the byte offset of the filter entry
;   %3: destination register index
;   %4: dword index inside the entry (byte offset 4*%4)
;   %5: table base (a register or a symbol)
%macro QPEL_FILTER_V 5
    vpbroadcastd     m%3, [%5 + %2q + 4*%4]
%endmacro

; QPEL_LOAD_SHUF %1, %2
; Load two vectors from pb_qpel_shuffle_index, at byte offsets 0 and 64,
; into m%1 and m%2.
%macro QPEL_LOAD_SHUF 2
    movu             m%1, [pb_qpel_shuffle_index +  0]
    movu             m%2, [pb_qpel_shuffle_index + 64]
%endmacro

; QPEL_H_LOAD_COMPUTE dst, src [, offset]
; required: m0-m5
;   m0, m1: multiplier operands of the two vpdpbusd steps
;   m2, m3: vpermb byte-index vectors
;   m4, m5: scratch (overwritten)
; %1: dst register index (zeroed, then accumulated into)
; %2: name for src
; %3: optional offset, in bytes, added to the src address (default 0)
%macro QPEL_H_LOAD_COMPUTE 2-3
%assign %%offset 0
%if %0 == 3
%assign %%offset %3
%endif
    pxor             m%1, m%1
    ; the load starts 3 bytes before src + offset
%if mmsize == 64
    movu             ym4, [%2q + %%offset - 3]
%else
    movu             xm4, [%2q + %%offset - 3]
%endif
    vpermb           m5, m2, m4
    vpermb           m4, m3, m4
    vpdpbusd         m%1, m5, m0
    vpdpbusd         m%1, m4, m1
%endmacro

; HEVC_PUT_HEVC_QPEL_AVX512ICL %1, %2
; Emits hevc_put_hevc_qpel_h%1_%2: per row, one QPEL_H_LOAD_COMPUTE per group
; of dword results, narrowed to words with vpmovdw and stored to dst.
;   %1: block width; 4 stores 8 bytes via movq, more than 16 adds a second
;       group at src + 16, more than 32 adds groups at src + 32 and src + 48
;   %2: only used in the function name
%macro HEVC_PUT_HEVC_QPEL_AVX512ICL 2
cglobal hevc_put_hevc_qpel_h%1_%2, 5, 6, 8, dst, src, srcstride, height, mx, tmp
    QPEL_FILTER_H    %1, mx, 0, 1, tmp
    QPEL_LOAD_SHUF   2, 3
.loop:
    QPEL_H_LOAD_COMPUTE 6, src
%if %1 == 4
    vpmovdw          xm6, m6
    movq             [dstq], xm6
%else
    vpmovdw          [dstq], m6
%endif
%if %1 > 16
    QPEL_H_LOAD_COMPUTE 7, src, 16
    vpmovdw          [dstq + 32], m7
%endif
%if %1 > 32
    QPEL_H_LOAD_COMPUTE 6, src, 32
    QPEL_H_LOAD_COMPUTE 7, src, 48
    vpmovdw          [dstq + 64], m6
    vpmovdw          [dstq + 96], m7
%endif
    LOOP_END         dst, src, srcstride
    RET
%endmacro

; HEVC_PUT_HEVC_QPEL_HV_AVX512ICL %1, %2
; Emits hevc_put_hevc_qpel_hv%1_%2: horizontal pass per row through
; QPEL_H_LOAD_COMPUTE, then an 8-tap vertical sum over the last 8 rows.
; Register layout:
;   m0-m5   : used by QPEL_H_LOAD_COMPUTE
;   m6-m13  : the 8 vertical coefficients, broadcast
;   m14-m21 : the 8 most recent horizontally filtered rows (m21 = newest)
;   m22-m26 : scratch / accumulator
%macro HEVC_PUT_HEVC_QPEL_HV_AVX512ICL 2
cglobal hevc_put_hevc_qpel_hv%1_%2, 6, 7, 27, dst, src, srcstride, height, mx, my, tmp
%assign %%shift 6                                ; final arithmetic right shift
; Number of vertical taps. The loop body hard-codes m6-m13 and m14-m21, so
; this must stay 8 regardless of the block width %1.
%assign %%taps  8
%assign %%extra %%taps-1                         ; rows filtered before the loop starts
    QPEL_FILTER_H    %1, mx, 0, 1, tmp
    QPEL_LOAD_SHUF   2, 3
    lea              tmpq, [srcstrideq*3]
    sub              srcq, tmpq                  ; src -= 3 * srcstride
    sub              myq, 1
    shl              myq, 5                      ; (my - 1) * 32: byte offset of the vertical entry
%define %%table hevc_qpel_filters_avx512icl_v_%1
%ifdef PIC
    lea              tmpq, [%%table]
    %define FILTER tmpq
%else
    %define FILTER %%table
%endif
%assign %%i 6
%assign %%j 0
%rep %%taps
    QPEL_FILTER_V    %1, my, %%i, %%j, FILTER
    %assign %%i %%i+1
    %assign %%j %%j+1
%endrep
%rep %%extra
    QPEL_H_LOAD_COMPUTE %%i, src
    add              srcq, srcstrideq
%assign %%i %%i+1
%endrep
.loop:
    QPEL_H_LOAD_COMPUTE %%i, src
    ; m26 = products of rows 0..3
    vpmulld          m22, m14, m6
    vpmulld          m23, m15, m7
    vpmulld          m24, m16, m8
    vpmulld          m25, m17, m9
    vpaddd           m26, m22, m23
    vpaddd           m24, m25
    vpaddd           m26, m24
    ; m22 = m26 + products of rows 4..7
    vpmulld          m22, m18, m10
    vpmulld          m23, m19, m11
    vpmulld          m24, m20, m12
    vpmulld          m25, m21, m13
    vpaddd           m22, m22, m23
    vpaddd           m24, m25
    vpaddd           m26, m24
    vpaddd           m22, m26
    ; slide the row window down by one (m15..m21 -> m14..m20)
    mova             m14, m15
    mova             m15, m16
    mova             m16, m17
    mova             m17, m18
    mova             m18, m19
    mova             m19, m20
    mova             m20, m21
    vpsrad           m22, %%shift
    vpmovdw          [dstq], m22
    LOOP_END         dst, src, srcstride

    RET
%endmacro

%if ARCH_X86_64
%if HAVE_AVX512ICL_EXTERNAL

INIT_XMM avx512icl
HEVC_PUT_HEVC_QPEL_AVX512ICL 4, 8

INIT_YMM avx512icl
HEVC_PUT_HEVC_QPEL_AVX512ICL 8, 8
HEVC_PUT_HEVC_QPEL_HV_AVX512ICL 8, 8

INIT_ZMM avx512icl
HEVC_PUT_HEVC_QPEL_AVX512ICL 16, 8
HEVC_PUT_HEVC_QPEL_AVX512ICL 32, 8
HEVC_PUT_HEVC_QPEL_AVX512ICL 64, 8

%endif
%endif