1cabdff1aSopenharmony_ci;****************************************************************************** 2cabdff1aSopenharmony_ci;* VP9 Intra prediction SIMD optimizations 3cabdff1aSopenharmony_ci;* 4cabdff1aSopenharmony_ci;* Copyright (c) 2015 Ronald S. Bultje <rsbultje gmail com> 5cabdff1aSopenharmony_ci;* Copyright (c) 2015 Henrik Gramner <henrik gramner com> 6cabdff1aSopenharmony_ci;* 7cabdff1aSopenharmony_ci;* This file is part of FFmpeg. 8cabdff1aSopenharmony_ci;* 9cabdff1aSopenharmony_ci;* FFmpeg is free software; you can redistribute it and/or 10cabdff1aSopenharmony_ci;* modify it under the terms of the GNU Lesser General Public 11cabdff1aSopenharmony_ci;* License as published by the Free Software Foundation; either 12cabdff1aSopenharmony_ci;* version 2.1 of the License, or (at your option) any later version. 13cabdff1aSopenharmony_ci;* 14cabdff1aSopenharmony_ci;* FFmpeg is distributed in the hope that it will be useful, 15cabdff1aSopenharmony_ci;* but WITHOUT ANY WARRANTY; without even the implied warranty of 16cabdff1aSopenharmony_ci;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 17cabdff1aSopenharmony_ci;* Lesser General Public License for more details. 
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "libavutil/x86/x86util.asm"

SECTION_RODATA 32

; Dword constants, 8 copies each (32 bytes per table). In this file they are
; the rounding terms added before the psrad in the DC predictors.
pd_2: times 8 dd 2
pd_4: times 8 dd 4
pd_8: times 8 dd 8

; Byte-index tables (16 entries each); the names spell out the index pattern.
; pb_2to15_14_15 is the default shuffle operand of the SHIFT_RIGHT and
; SHIFT_RIGHTx2 macros (bytes 2..15, then bytes 14,15 repeated).
pb_2to15_14_15: db 2,3,4,5,6,7,8,9,10,11,12,13,14,15,14,15
pb_4_5_8to13_8x0: db 4,5,8,9,10,11,12,13,0,0,0,0,0,0,0,0
pb_0to7_67x4: db 0,1,2,3,4,5,6,7,6,7,6,7,6,7,6,7

; Constants defined in another object file.
cextern pw_1
cextern pw_1023
cextern pw_4095
cextern pd_16
cextern pd_32
cextern pd_65535;

; FIXME most top-only functions (ddl, vl, v, dc_top) can be modified to take
; only 3 registers on x86-32, which would make it one cycle faster, but that
; would make the code quite a bit uglier...

SECTION .text

; SCRATCH src, dst, mem [, name]
; Parks the value held in m<src>.
;   x86-64: exchanges register numbers <src> and <dst> through SWAP.
;   x86-32: stores m<src> to [mem].
; With a 4th argument, reg_<name> is defined as the place the value now
; lives: m<dst> on x86-64, [mem] on x86-32.
%macro SCRATCH 3-4
%if ARCH_X86_64
    SWAP        %1, %2
%if %0 == 4
%define reg_%4 m%2
%endif
%else
    mova        [%3], m%1
%if %0 == 4
%define reg_%4 [%3]
%endif
%endif
%endmacro

; UNSCRATCH dst, src, mem [, name]
; Inverse of SCRATCH: brings the parked value back into m<dst>.
;   x86-64: exchanges register numbers <dst> and <src> through SWAP.
;   x86-32: reloads m<dst> from [mem].
; With a 4th argument, the reg_<name> alias is removed.
%macro UNSCRATCH 3-4
%if ARCH_X86_64
    SWAP        %1, %2
%else
    mova        m%1, [%3]
%endif
%if %0 == 4
%undef reg_%4
%endif
%endmacro

; PRELOAD reg, mem [, name]
;   x86-64: loads [mem] into m<reg>; with a 3rd argument, reg_<name> = m<reg>.
;   x86-32: emits no load; with a 3rd argument, reg_<name> = [mem], so users
;           of reg_<name> read the value straight from memory.
%macro PRELOAD 2-3
%if ARCH_X86_64
    mova        m%1, [%2]
%if %0 == 3
%define reg_%3 m%1
%endif
%elif %0 == 3
%define reg_%3 [%2]
%endif
%endmacro

; vp9_ipred_v_4x4_16(dst, stride, l, a)
; Vertical prediction: loads one mmsize-wide vector from [a] and stores it to
; four consecutive rows of dst (rows are stride bytes apart). l is not read.
INIT_MMX mmx
cglobal vp9_ipred_v_4x4_16, 2, 4, 1, dst, stride, l, a
    movifnidn   aq, amp
    mova        m0, [aq]
    DEFINE_ARGS dst, stride, stride3    ; l/a names are no longer needed
    lea         stride3q, [strideq*3]
    mova        [dstq+strideq*0], m0
    mova        [dstq+strideq*1], m0
    mova        [dstq+strideq*2], m0
    mova        [dstq+stride3q ], m0
    RET

; vp9_ipred_v_8x8_16(dst, stride, l, a)
; Vertical prediction: loads one mmsize-wide vector from [a] and stores it to
; eight consecutive rows of dst. l is not read.
INIT_XMM sse
cglobal vp9_ipred_v_8x8_16, 2, 4, 1, dst, stride, l, a
    movifnidn   aq, amp
    mova        m0, [aq]
    DEFINE_ARGS dst, stride, stride3
    lea         stride3q, [strideq*3]
    ; rows 0-3
    mova        [dstq+strideq*0], m0
    mova        [dstq+strideq*1], m0
    mova        [dstq+strideq*2], m0
    mova        [dstq+stride3q ], m0
    lea         dstq, [dstq+strideq*4]
    ; rows 4-7
    mova        [dstq+strideq*0], m0
    mova        [dstq+strideq*1], m0
    mova        [dstq+strideq*2], m0
    mova        [dstq+stride3q ], m0
    RET

; vp9_ipred_v_16x16_16(dst, stride, l, a)
; Vertical prediction: loads two vectors (2*mmsize bytes) from [a] and stores
; that pair to 16 rows of dst, four rows per loop iteration. l is not read.
INIT_XMM sse
cglobal vp9_ipred_v_16x16_16, 2, 4, 2, dst, stride, l, a
    movifnidn   aq, amp
    mova        m0, [aq]
    mova        m1, [aq+mmsize]
    DEFINE_ARGS dst, stride, stride3, cnt
    lea         stride3q, [strideq*3]
    mov         cntd, 4                 ; 4 iterations x 4 rows
.loop:
    mova        [dstq+strideq*0+ 0], m0
    mova        [dstq+strideq*0+16], m1
    mova        [dstq+strideq*1+ 0], m0
    mova        [dstq+strideq*1+16], m1
    mova        [dstq+strideq*2+ 0], m0
    mova        [dstq+strideq*2+16], m1
    mova        [dstq+stride3q + 0], m0
    mova        [dstq+stride3q +16], m1
    lea         dstq, [dstq+strideq*4]
    dec         cntd
    jg .loop
    RET

; Instruction-set selection for vp9_ipred_v_32x32_16, which follows directly.
INIT_XMM sse
; vp9_ipred_v_32x32_16(dst, stride, l, a)
; Vertical prediction: loads four vectors (4*mmsize bytes) from [a] and stores
; them to 32 rows of dst, two rows per loop iteration. l is not read.
; (The INIT_XMM sse for this function is the statement immediately preceding.)
cglobal vp9_ipred_v_32x32_16, 2, 4, 4, dst, stride, l, a
    movifnidn   aq, amp
    mova        m0, [aq+mmsize*0]
    mova        m1, [aq+mmsize*1]
    mova        m2, [aq+mmsize*2]
    mova        m3, [aq+mmsize*3]
    DEFINE_ARGS dst, stride, cnt
    mov         cntd, 16                ; 16 iterations x 2 rows
.loop:
    mova        [dstq+strideq*0+ 0], m0
    mova        [dstq+strideq*0+16], m1
    mova        [dstq+strideq*0+32], m2
    mova        [dstq+strideq*0+48], m3
    mova        [dstq+strideq*1+ 0], m0
    mova        [dstq+strideq*1+16], m1
    mova        [dstq+strideq*1+32], m2
    mova        [dstq+strideq*1+48], m3
    lea         dstq, [dstq+strideq*2]
    dec         cntd
    jg .loop
    RET

; vp9_ipred_h_4x4_16(dst, stride, l, a)
; Horizontal prediction: each of the four rows is one word of [l] broadcast
; across the row. Row 0 uses word 3 of [l], row 3 uses word 0. a is not read.
INIT_MMX mmxext
cglobal vp9_ipred_h_4x4_16, 3, 3, 4, dst, stride, l, a
    mova        m3, [lq]
    DEFINE_ARGS dst, stride, stride3
    lea         stride3q, [strideq*3]
    pshufw      m0, m3, q3333
    pshufw      m1, m3, q2222
    pshufw      m2, m3, q1111
    pshufw      m3, m3, q0000
    mova        [dstq+strideq*0], m0
    mova        [dstq+strideq*1], m1
    mova        [dstq+strideq*2], m2
    mova        [dstq+stride3q ], m3
    RET

; vp9_ipred_h_8x8_16(dst, stride, l, a)
; Horizontal prediction: each of the eight rows is one word of [l] broadcast
; across the row. Row 0 uses word 7 of [l], row 7 uses word 0. a is not read.
INIT_XMM sse2
cglobal vp9_ipred_h_8x8_16, 3, 3, 4, dst, stride, l, a
    mova        m2, [lq]
    DEFINE_ARGS dst, stride, stride3
    lea         stride3q, [strideq*3]
    ; rows 0-3: words 7..4, each doubled into a dword so pshufd can broadcast
    punpckhwd   m3, m2, m2
    pshufd      m0, m3, q3333
    pshufd      m1, m3, q2222
    mova        [dstq+strideq*0], m0
    mova        [dstq+strideq*1], m1
    pshufd      m0, m3, q1111
    pshufd      m1, m3, q0000
    mova        [dstq+strideq*2], m0
    mova        [dstq+stride3q ], m1
    lea         dstq, [dstq+strideq*4]
    ; rows 4-7: words 3..0
    punpcklwd   m2, m2
    pshufd      m0, m2, q3333
    pshufd      m1, m2, q2222
    mova        [dstq+strideq*0], m0
    mova        [dstq+strideq*1], m1
    pshufd      m0, m2, q1111
    pshufd      m1, m2, q0000
    mova        [dstq+strideq*2], m0
    mova        [dstq+stride3q ], m1
    RET

; vp9_ipred_h_16x16_16(dst, stride, l)
; Horizontal prediction, four rows per iteration. cnt runs 3..0 and selects
; the 8-byte group [l + cnt*8]; within a group the highest word feeds the
; first row written. Each row is 32 bytes of the broadcast word.
INIT_XMM sse2
cglobal vp9_ipred_h_16x16_16, 3, 5, 4, dst, stride, l, stride3, cnt
    mov         cntd, 3
    lea         stride3q, [strideq*3]
.loop:
    movh        m3, [lq+cntq*8]
    punpcklwd   m3, m3                  ; double each word into a dword
    pshufd      m0, m3, q3333
    pshufd      m1, m3, q2222
    pshufd      m2, m3, q1111
    pshufd      m3, m3, q0000
    mova        [dstq+strideq*0+ 0], m0
    mova        [dstq+strideq*0+16], m0
    mova        [dstq+strideq*1+ 0], m1
    mova        [dstq+strideq*1+16], m1
    mova        [dstq+strideq*2+ 0], m2
    mova        [dstq+strideq*2+16], m2
    mova        [dstq+stride3q + 0], m3
    mova        [dstq+stride3q +16], m3
    lea         dstq, [dstq+strideq*4]
    dec         cntd
    jge .loop                           ; cnt == 0 is still a valid group
    RET

; vp9_ipred_h_32x32_16(dst, stride, l)
; Same scheme as the 16x16 variant with cnt running 7..0 and 64-byte rows.
INIT_XMM sse2
cglobal vp9_ipred_h_32x32_16, 3, 5, 4, dst, stride, l, stride3, cnt
    mov         cntd, 7
    lea         stride3q, [strideq*3]
.loop:
    movh        m3, [lq+cntq*8]
    punpcklwd   m3, m3
    pshufd      m0, m3, q3333
    pshufd      m1, m3, q2222
    pshufd      m2, m3, q1111
    pshufd      m3, m3, q0000
    mova        [dstq+strideq*0+ 0], m0
    mova        [dstq+strideq*0+16], m0
    mova        [dstq+strideq*0+32], m0
    mova        [dstq+strideq*0+48], m0
    mova        [dstq+strideq*1+ 0], m1
    mova        [dstq+strideq*1+16], m1
    mova        [dstq+strideq*1+32], m1
    mova        [dstq+strideq*1+48], m1
    mova        [dstq+strideq*2+ 0], m2
    mova        [dstq+strideq*2+16], m2
    mova        [dstq+strideq*2+32], m2
    mova        [dstq+strideq*2+48], m2
    mova        [dstq+stride3q + 0], m3
    mova        [dstq+stride3q +16], m3
    mova        [dstq+stride3q +32], m3
    mova        [dstq+stride3q +48], m3
    lea         dstq, [dstq+strideq*4]
    dec         cntd
    jge .loop
    RET

; Instruction-set selection for vp9_ipred_dc_4x4_16, which follows directly.
INIT_MMX mmxext
; ---------------------------------------------------------------------------
; DC prediction (left + above).
; Common scheme: add the word vectors of [l] and [a], reduce them to a single
; dword sum (pmaddwd against pw_1 -- external constant, presumably all-ones
; words -- followed by shuffles/adds), add a rounding term, shift right, then
; broadcast the low word of the result to every pixel of the block.
; ---------------------------------------------------------------------------

; vp9_ipred_dc_4x4_16(dst, stride, l, a): (sum + 4) >> 3, 4 rows.
; (The INIT_MMX mmxext for this function is the statement immediately
; preceding.)
cglobal vp9_ipred_dc_4x4_16, 4, 4, 2, dst, stride, l, a
    mova        m0, [lq]
    paddw       m0, [aq]
    DEFINE_ARGS dst, stride, stride3
    lea         stride3q, [strideq*3]
    pmaddwd     m0, [pw_1]
    pshufw      m1, m0, q3232           ; high dword -> low dword
    paddd       m0, [pd_4]
    paddd       m0, m1
    psrad       m0, 3
    pshufw      m0, m0, q0000           ; broadcast low word
    mova        [dstq+strideq*0], m0
    mova        [dstq+strideq*1], m0
    mova        [dstq+strideq*2], m0
    mova        [dstq+stride3q ], m0
    RET

; vp9_ipred_dc_8x8_16(dst, stride, l, a): (sum + 8) >> 4, 8 rows.
INIT_XMM sse2
cglobal vp9_ipred_dc_8x8_16, 4, 4, 2, dst, stride, l, a
    mova        m0, [lq]
    paddw       m0, [aq]
    DEFINE_ARGS dst, stride, stride3
    lea         stride3q, [strideq*3]
    pmaddwd     m0, [pw_1]
    pshufd      m1, m0, q3232
    paddd       m0, m1
    pshufd      m1, m0, q1111
    paddd       m0, [pd_8]
    paddd       m0, m1
    psrad       m0, 4
    pshuflw     m0, m0, q0000
    punpcklqdq  m0, m0
    mova        [dstq+strideq*0], m0
    mova        [dstq+strideq*1], m0
    mova        [dstq+strideq*2], m0
    mova        [dstq+stride3q ], m0
    lea         dstq, [dstq+strideq*4]
    mova        [dstq+strideq*0], m0
    mova        [dstq+strideq*1], m0
    mova        [dstq+strideq*2], m0
    mova        [dstq+stride3q ], m0
    RET

; vp9_ipred_dc_16x16_16(dst, stride, l, a): (sum + 16) >> 5,
; 4 iterations x 4 rows of 32 bytes.
INIT_XMM sse2
cglobal vp9_ipred_dc_16x16_16, 4, 4, 2, dst, stride, l, a
    mova        m0, [lq]
    paddw       m0, [lq+mmsize]
    paddw       m0, [aq]
    paddw       m0, [aq+mmsize]
    DEFINE_ARGS dst, stride, stride3, cnt
    lea         stride3q, [strideq*3]
    mov         cntd, 4
    pmaddwd     m0, [pw_1]
    pshufd      m1, m0, q3232
    paddd       m0, m1
    pshufd      m1, m0, q1111
    paddd       m0, [pd_16]
    paddd       m0, m1
    psrad       m0, 5
    pshuflw     m0, m0, q0000
    punpcklqdq  m0, m0
.loop:
    mova        [dstq+strideq*0+ 0], m0
    mova        [dstq+strideq*0+16], m0
    mova        [dstq+strideq*1+ 0], m0
    mova        [dstq+strideq*1+16], m0
    mova        [dstq+strideq*2+ 0], m0
    mova        [dstq+strideq*2+16], m0
    mova        [dstq+stride3q + 0], m0
    mova        [dstq+stride3q +16], m0
    lea         dstq, [dstq+strideq*4]
    dec         cntd
    jg .loop
    RET

; vp9_ipred_dc_32x32_16(dst, stride, l, a): (sum + 32) >> 6,
; 16 iterations x 2 rows of 64 bytes.
; The store loop only addresses rows 0 and 1, so no stride*3 is computed.
INIT_XMM sse2
cglobal vp9_ipred_dc_32x32_16, 4, 4, 2, dst, stride, l, a
    mova        m0, [lq+mmsize*0]
    paddw       m0, [lq+mmsize*1]
    paddw       m0, [lq+mmsize*2]
    paddw       m0, [lq+mmsize*3]
    paddw       m0, [aq+mmsize*0]
    paddw       m0, [aq+mmsize*1]
    paddw       m0, [aq+mmsize*2]
    paddw       m0, [aq+mmsize*3]
    DEFINE_ARGS dst, stride, cnt
    mov         cntd, 16
    pmaddwd     m0, [pw_1]
    pshufd      m1, m0, q3232
    paddd       m0, m1
    pshufd      m1, m0, q1111
    paddd       m0, [pd_32]
    paddd       m0, m1
    psrad       m0, 6
    pshuflw     m0, m0, q0000
    punpcklqdq  m0, m0
.loop:
    mova        [dstq+strideq*0+ 0], m0
    mova        [dstq+strideq*0+16], m0
    mova        [dstq+strideq*0+32], m0
    mova        [dstq+strideq*0+48], m0
    mova        [dstq+strideq*1+ 0], m0
    mova        [dstq+strideq*1+16], m0
    mova        [dstq+strideq*1+32], m0
    mova        [dstq+strideq*1+48], m0
    lea         dstq, [dstq+strideq*2]
    dec         cntd
    jg .loop
    RET

; DC_1D_FNS name, ptr
; Emits vp9_ipred_dc_<name>_{4x4,8x8,16x16,32x32}_16: DC prediction from a
; single edge. <ptr> is the register (aq or lq) the edge is read from. Same
; reduction as the two-edge versions, with half the sample count and
; therefore a rounding term and shift one step smaller:
;   4x4: (+2)>>2   8x8: (+4)>>3   16x16: (+8)>>4   32x32: (+16)>>5
%macro DC_1D_FNS 2
INIT_MMX mmxext
cglobal vp9_ipred_dc_%1_4x4_16, 4, 4, 2, dst, stride, l, a
    mova        m0, [%2]
    DEFINE_ARGS dst, stride, stride3
    lea         stride3q, [strideq*3]
    pmaddwd     m0, [pw_1]
    pshufw      m1, m0, q3232
    paddd       m0, [pd_2]
    paddd       m0, m1
    psrad       m0, 2
    pshufw      m0, m0, q0000
    mova        [dstq+strideq*0], m0
    mova        [dstq+strideq*1], m0
    mova        [dstq+strideq*2], m0
    mova        [dstq+stride3q ], m0
    RET

INIT_XMM sse2
cglobal vp9_ipred_dc_%1_8x8_16, 4, 4, 2, dst, stride, l, a
    mova        m0, [%2]
    DEFINE_ARGS dst, stride, stride3
    lea         stride3q, [strideq*3]
    pmaddwd     m0, [pw_1]
    pshufd      m1, m0, q3232
    paddd       m0, m1
    pshufd      m1, m0, q1111
    paddd       m0, [pd_4]
    paddd       m0, m1
    psrad       m0, 3
    pshuflw     m0, m0, q0000
    punpcklqdq  m0, m0
    mova        [dstq+strideq*0], m0
    mova        [dstq+strideq*1], m0
    mova        [dstq+strideq*2], m0
    mova        [dstq+stride3q ], m0
    lea         dstq, [dstq+strideq*4]
    mova        [dstq+strideq*0], m0
    mova        [dstq+strideq*1], m0
    mova        [dstq+strideq*2], m0
    mova        [dstq+stride3q ], m0
    RET

INIT_XMM sse2
cglobal vp9_ipred_dc_%1_16x16_16, 4, 4, 2, dst, stride, l, a
    mova        m0, [%2]
    paddw       m0, [%2+mmsize]
    DEFINE_ARGS dst, stride, stride3, cnt
    lea         stride3q, [strideq*3]
    mov         cntd, 4
    pmaddwd     m0, [pw_1]
    pshufd      m1, m0, q3232
    paddd       m0, m1
    pshufd      m1, m0, q1111
    paddd       m0, [pd_8]
    paddd       m0, m1
    psrad       m0, 4
    pshuflw     m0, m0, q0000
    punpcklqdq  m0, m0
.loop:
    mova        [dstq+strideq*0+ 0], m0
    mova        [dstq+strideq*0+16], m0
    mova        [dstq+strideq*1+ 0], m0
    mova        [dstq+strideq*1+16], m0
    mova        [dstq+strideq*2+ 0], m0
    mova        [dstq+strideq*2+16], m0
    mova        [dstq+stride3q + 0], m0
    mova        [dstq+stride3q +16], m0
    lea         dstq, [dstq+strideq*4]
    dec         cntd
    jg .loop
    RET

INIT_XMM sse2
cglobal vp9_ipred_dc_%1_32x32_16, 4, 4, 2, dst, stride, l, a
    mova        m0, [%2+mmsize*0]
    paddw       m0, [%2+mmsize*1]
    paddw       m0, [%2+mmsize*2]
    paddw       m0, [%2+mmsize*3]
    DEFINE_ARGS dst, stride, cnt
    mov         cntd, 16
    pmaddwd     m0, [pw_1]
    pshufd      m1, m0, q3232
    paddd       m0, m1
    pshufd      m1, m0, q1111
    paddd       m0, [pd_16]
    paddd       m0, m1
    psrad       m0, 5
    pshuflw     m0, m0, q0000
    punpcklqdq  m0, m0
.loop:
    mova        [dstq+strideq*0+ 0], m0
    mova        [dstq+strideq*0+16], m0
    mova        [dstq+strideq*0+32], m0
    mova        [dstq+strideq*0+48], m0
    mova        [dstq+strideq*1+ 0], m0
    mova        [dstq+strideq*1+16], m0
    mova        [dstq+strideq*1+32], m0
    mova        [dstq+strideq*1+48], m0
    lea         dstq, [dstq+strideq*2]
    dec         cntd
    jg .loop
    RET
%endmacro

DC_1D_FNS top,  aq
DC_1D_FNS left, lq

; ---------------------------------------------------------------------------
; TM prediction.
; Common scheme: pixel = clamp(left_word + above_word - corner, 0, max), where
; corner is the word at [a-2] (loaded via movd [a-4], then word 1 broadcast)
; and max is pw_1023 for the _10 entry points and pw_4095 for the _12 ones.
; Each _12 entry point only loads its max and jumps into the .body of the
; matching _10 function, so the two share one implementation.
; ---------------------------------------------------------------------------

; vp9_ipred_tm_4x4_10(dst, stride, l, a). Row 0 uses word 3 of [l].
INIT_MMX mmxext
cglobal vp9_ipred_tm_4x4_10, 4, 4, 6, dst, stride, l, a
    mova        m5, [pw_1023]
.body:
    mova        m4, [aq]
    mova        m3, [lq]
    movd        m0, [aq-4]
    pshufw      m0, m0, q1111           ; broadcast the word at [a-2]
    psubw       m4, m0                  ; m4 = above - corner
    DEFINE_ARGS dst, stride, stride3
    lea         stride3q, [strideq*3]
    pshufw      m0, m3, q3333
    pshufw      m1, m3, q2222
    pshufw      m2, m3, q1111
    pshufw      m3, m3, q0000
    paddw       m0, m4
    paddw       m1, m4
    paddw       m2, m4
    paddw       m3, m4
    pxor        m4, m4                  ; m4 is free now; reuse as the 0 bound
    pmaxsw      m0, m4
    pmaxsw      m1, m4
    pmaxsw      m2, m4
    pmaxsw      m3, m4
    pminsw      m0, m5
    pminsw      m1, m5
    pminsw      m2, m5
    pminsw      m3, m5
    mova        [dstq+strideq*0], m0
    mova        [dstq+strideq*1], m1
    mova        [dstq+strideq*2], m2
    mova        [dstq+stride3q ], m3
    RET

cglobal vp9_ipred_tm_4x4_12, 4, 4, 6, dst, stride, l, a
    mova        m5, [pw_4095]
    jmp mangle(private_prefix %+ _ %+ vp9_ipred_tm_4x4_10 %+ SUFFIX).body

; vp9_ipred_tm_8x8_10(dst, stride, l, a). Two iterations (cnt = 1, 0) of four
; rows each, reading the 8-byte group [l + cnt*8].
; m4 = max, m5 = above - corner, m6 = 0.
INIT_XMM sse2
cglobal vp9_ipred_tm_8x8_10, 4, 5, 7, dst, stride, l, a
    mova        m4, [pw_1023]
.body:
    pxor        m6, m6
    mova        m5, [aq]
    movd        m0, [aq-4]
    pshuflw     m0, m0, q1111
    punpcklqdq  m0, m0
    psubw       m5, m0
    DEFINE_ARGS dst, stride, l, stride3, cnt
    lea         stride3q, [strideq*3]
    mov         cntd, 1
.loop:
    movh        m3, [lq+cntq*8]
    punpcklwd   m3, m3
    pshufd      m0, m3, q3333
    pshufd      m1, m3, q2222
    pshufd      m2, m3, q1111
    pshufd      m3, m3, q0000
    paddw       m0, m5
    paddw       m1, m5
    paddw       m2, m5
    paddw       m3, m5
    pmaxsw      m0, m6
    pmaxsw      m1, m6
    pmaxsw      m2, m6
    pmaxsw      m3, m6
    pminsw      m0, m4
    pminsw      m1, m4
    pminsw      m2, m4
    pminsw      m3, m4
    mova        [dstq+strideq*0], m0
    mova        [dstq+strideq*1], m1
    mova        [dstq+strideq*2], m2
    mova        [dstq+stride3q ], m3
    lea         dstq, [dstq+strideq*4]
    dec         cntd
    jge .loop
    RET

cglobal vp9_ipred_tm_8x8_12, 4, 5, 7, dst, stride, l, a
    mova        m4, [pw_4095]
    jmp mangle(private_prefix %+ _ %+ vp9_ipred_tm_8x8_10 %+ SUFFIX).body

; vp9_ipred_tm_16x16_10(dst, stride, l, a). Eight iterations (cnt = 7..0) of
; two rows each, reading the 4-byte pair [l + cnt*4].
; m4/m5 = above - corner (two vectors), m6 = 0, m7 = max.
INIT_XMM sse2
cglobal vp9_ipred_tm_16x16_10, 4, 4, 8, dst, stride, l, a
    mova        m7, [pw_1023]
.body:
    pxor        m6, m6
    mova        m4, [aq]
    mova        m5, [aq+mmsize]
    movd        m0, [aq-4]
    pshuflw     m0, m0, q1111
    punpcklqdq  m0, m0
    psubw       m4, m0
    psubw       m5, m0
    DEFINE_ARGS dst, stride, l, cnt
    mov         cntd, 7
.loop:
    movd        m3, [lq+cntq*4]
    punpcklwd   m3, m3
    pshufd      m2, m3, q1111           ; left word for the first row
    pshufd      m3, m3, q0000           ; left word for the second row
    paddw       m0, m2, m4
    paddw       m2, m5
    paddw       m1, m3, m4
    paddw       m3, m5
    pmaxsw      m0, m6
    pmaxsw      m2, m6
    pmaxsw      m1, m6
    pmaxsw      m3, m6
    pminsw      m0, m7
    pminsw      m2, m7
    pminsw      m1, m7
    pminsw      m3, m7
    mova        [dstq+strideq*0+ 0], m0
    mova        [dstq+strideq*0+16], m2
    mova        [dstq+strideq*1+ 0], m1
    mova        [dstq+strideq*1+16], m3
    lea         dstq, [dstq+strideq*2]
    dec         cntd
    jge .loop
    RET

cglobal vp9_ipred_tm_16x16_12, 4, 4, 8, dst, stride, l, a
    mova        m7, [pw_4095]
    jmp mangle(private_prefix %+ _ %+ vp9_ipred_tm_16x16_10 %+ SUFFIX).body

; vp9_ipred_tm_32x32_10(dst, stride, l, a). 32 iterations (cnt = 31..0) of one
; row each, reading the word [l + cnt*2].
; m4-m7 = above - corner (four vectors). The clamp bounds do not fit in
; m0-m7 alongside them: on x86-64 they are moved to m8/m9, on x86-32 they are
; spilled to the 32 bytes of stack requested in the cglobal line.
INIT_XMM sse2
cglobal vp9_ipred_tm_32x32_10, 4, 4, 10, 32 * -ARCH_X86_32, dst, stride, l, a
    mova        m0, [pw_1023]
.body:
    pxor        m1, m1
%if ARCH_X86_64
    SWAP        0, 8
    SWAP        1, 9
%define reg_min m9
%define reg_max m8
%else
    mova        [rsp+ 0], m0
    mova        [rsp+16], m1
%define reg_min [rsp+16]
%define reg_max [rsp+ 0]
%endif

    mova        m4, [aq+mmsize*0]
    mova        m5, [aq+mmsize*1]
    mova        m6, [aq+mmsize*2]
    mova        m7, [aq+mmsize*3]
    movd        m0, [aq-4]
    pshuflw     m0, m0, q1111
    punpcklqdq  m0, m0
    psubw       m4, m0
    psubw       m5, m0
    psubw       m6, m0
    psubw       m7, m0
    DEFINE_ARGS dst, stride, l, cnt
    mov         cntd, 31
.loop:
    pinsrw      m3, [lq+cntq*2], 0
    punpcklwd   m3, m3
    pshufd      m3, m3, q0000           ; broadcast the left word
    paddw       m0, m3, m4
    paddw       m1, m3, m5
    paddw       m2, m3, m6
    paddw       m3, m7
    pmaxsw      m0, reg_min
    pmaxsw      m1, reg_min
    pmaxsw      m2, reg_min
    pmaxsw      m3, reg_min
    pminsw      m0, reg_max
    pminsw      m1, reg_max
    pminsw      m2, reg_max
    pminsw      m3, reg_max
    mova        [dstq+strideq*0+ 0], m0
    mova        [dstq+strideq*0+16], m1
    mova        [dstq+strideq*0+32], m2
    mova        [dstq+strideq*0+48], m3
    add         dstq, strideq
    dec         cntd
    jge .loop
    RET

cglobal vp9_ipred_tm_32x32_12, 4, 4, 10, 32 * -ARCH_X86_32, dst, stride, l, a
    mova        m0, [pw_4095]
    jmp mangle(private_prefix %+ _ %+ vp9_ipred_tm_32x32_10 %+ SUFFIX).body

; Directional intra prediction functions
;
; in the functions below, 'abcdefgh' refers to above data (sometimes simply
; abbreviated as a[N-M]). 'stuvwxyz' refers to left data (sometimes simply
; abbreviated as l[N-M]). * is top-left data. ABCDEFG or A[N-M] is filtered
; above data, STUVWXYZ or L[N-M] is filtered left data, and # is filtered
; top-left data.
; left=(left+2*center+right+2)>>2
; Implemented as pavgw((left + right) >> 1, center). For non-negative inputs
; whose sum fits in a signed word this equals the formula above exactly.
; The result replaces m<left>; m<center> and m<right> are not modified.
%macro LOWPASS 3 ; left [dst], center, right
    paddw           m%1, m%3
    psraw           m%1, 1
    pavgw           m%1, m%2
%endmacro

; abcdefgh (src) -> bcdefghh (dst)
; dst/src can be the same register
; The optional third argument (default: the pb_2to15_14_15 memory operand) is
; only used by the ssse3+ pshufb path.
%macro SHIFT_RIGHT 2-3 [pb_2to15_14_15] ; dst, src, [ssse3_shift_reg]
%if cpuflag(ssse3)
    pshufb          %1, %2, %3                  ; abcdefgh -> bcdefghh
%else
    psrldq          %1, %2, 2                   ; abcdefgh -> bcdefgh.
    pshufhw         %1, %1, q2210               ; bcdefgh. -> bcdefghh
%endif
%endmacro

; abcdefgh (src) -> bcdefghh (dst1) and cdefghhh (dst2)
%macro SHIFT_RIGHTx2 3-4 [pb_2to15_14_15] ; dst1, dst2, src, [ssse3_shift_reg]
%if cpuflag(ssse3)
    pshufb          %1, %3, %4                  ; abcdefgh -> bcdefghh
    pshufb          %2, %1, %4                  ; bcdefghh -> cdefghhh
%else
    psrldq          %1, %3, 2                   ; abcdefgh -> bcdefgh.
    psrldq          %2, %3, 4                   ; abcdefgh -> cdefgh..
    pshufhw         %1, %1, q2210               ; bcdefgh. -> bcdefghh
    pshufhw         %2, %2, q1110               ; cdefgh.. -> cdefghhh
%endif
%endmacro

; Emits the dl 4x4, 8x8, 16x16 and 32x32 functions for the instruction set
; selected by the preceding INIT_XMM. All of them read only the above row
; (cglobal loads two arguments; aq is fetched with movifnidn).
%macro DL_FUNCS 0
; 4x4: each row is the filtered above row advanced by one more pixel.
cglobal vp9_ipred_dl_4x4_16, 2, 4, 3, dst, stride, l, a
    movifnidn       aq, amp
    movu            m1, [aq]                    ; abcdefgh
    pshufhw         m0, m1, q3310               ; abcdefhh
    SHIFT_RIGHT     m1, m1                      ; bcdefghh
    psrldq          m2, m1, 2                   ; cdefghh.
    LOWPASS         0, 1, 2                     ; BCDEFGh.
    pshufd          m1, m0, q3321               ; DEFGh...
    movh            [dstq+strideq*0], m0
    movh            [dstq+strideq*2], m1
    add             dstq, strideq
    psrldq          m0, 2                       ; CDEFGh..
    psrldq          m1, 2                       ; EFGh....
    movh            [dstq+strideq*0], m0
    movh            [dstq+strideq*2], m1
    RET

; 8x8: rows 0-7 are the filtered row advanced by 0-7 pixels, padded with h.
cglobal vp9_ipred_dl_8x8_16, 2, 4, 5, dst, stride, l, a
    movifnidn       aq, amp
    mova            m0, [aq]                    ; abcdefgh
%if cpuflag(ssse3)
    mova            m4, [pb_2to15_14_15]
%endif
    SHIFT_RIGHTx2   m1, m2, m0, m4              ; bcdefghh/cdefghhh
    LOWPASS         0, 1, 2                     ; BCDEFGHh
    shufps          m1, m0, m2, q3332           ; FGHhhhhh
    shufps          m3, m0, m1, q2121           ; DEFGHhhh
    DEFINE_ARGS dst, stride, stride5
    lea             stride5q, [strideq*5]

    mova            [dstq+strideq*0], m0
    mova            [dstq+strideq*4], m1
    SHIFT_RIGHT     m0, m0, m4                  ; CDEFGHhh
    pshuflw         m1, m1, q3321               ; GHhhhhhh
    pshufd          m2, m0, q3321               ; EFGHhhhh
    mova            [dstq+strideq*1], m0
    mova            [dstq+stride5q ], m1
    lea             dstq, [dstq+strideq*2]
    pshuflw         m1, m1, q3321               ; Hhhhhhhh
    mova            [dstq+strideq*0], m3
    mova            [dstq+strideq*4], m1
    pshuflw         m1, m1, q3321               ; hhhhhhhh
    mova            [dstq+strideq*1], m2
    mova            [dstq+stride5q ], m1
    RET

; 16x16: eight iterations; each stores row r (m0, m1) and row r+8 (m1, m2),
; then advances the m0:m1 pair by one pixel.
cglobal vp9_ipred_dl_16x16_16, 2, 4, 5, dst, stride, l, a
    movifnidn       aq, amp
    mova            m0, [aq]                    ; abcdefgh
    mova            m3, [aq+mmsize]             ; ijklmnop
    PALIGNR         m1, m3, m0, 2, m4           ; bcdefghi
    PALIGNR         m2, m3, m0, 4, m4           ; cdefghij
    LOWPASS         0, 1, 2                     ; BCDEFGHI
%if cpuflag(ssse3)
    mova            m4, [pb_2to15_14_15]
%endif
    SHIFT_RIGHTx2   m2, m1, m3, m4              ; jklmnopp/klmnoppp
    LOWPASS         1, 2, 3                     ; JKLMNOPp
    pshufd          m2, m2, q3333               ; pppppppp
    DEFINE_ARGS dst, stride, cnt
    mov             cntd, 8

.loop:
    mova            [dstq+strideq*0+ 0], m0
    mova            [dstq+strideq*0+16], m1
    mova            [dstq+strideq*8+ 0], m1
    mova            [dstq+strideq*8+16], m2
    add             dstq, strideq
%if cpuflag(avx)
    vpalignr        m0, m1, m0, 2
%else
    PALIGNR         m3, m1, m0, 2, m4
    mova            m0, m3
%endif
    SHIFT_RIGHT     m1, m1, m4
    dec             cntd
    jg .loop
    RET

; 32x32: eight iterations; each stores rows r, r+8, r+16 and r+24, then
; advances the m0..m3 chain by one pixel. m4 holds the replicated last pixel.
cglobal vp9_ipred_dl_32x32_16, 2, 5, 7, dst, stride, l, a
    movifnidn       aq, amp
    mova            m0, [aq+mmsize*0]           ; abcdefgh
    mova            m1, [aq+mmsize*1]           ; ijklmnop
    mova            m2, [aq+mmsize*2]           ; qrstuvwx
    mova            m3, [aq+mmsize*3]           ; yz012345
    PALIGNR         m4, m1, m0, 2, m6
    PALIGNR         m5, m1, m0, 4, m6
    LOWPASS         0, 4, 5                     ; BCDEFGHI
    PALIGNR         m4, m2, m1, 2, m6
    PALIGNR         m5, m2, m1, 4, m6
    LOWPASS         1, 4, 5                     ; JKLMNOPQ
    PALIGNR         m4, m3, m2, 2, m6
    PALIGNR         m5, m3, m2, 4, m6
    LOWPASS         2, 4, 5                     ; RSTUVWXY
%if cpuflag(ssse3)
    mova            m6, [pb_2to15_14_15]
%endif
    SHIFT_RIGHTx2   m4, m5, m3, m6
    LOWPASS         3, 4, 5                     ; Z0123455
    pshufd          m4, m4, q3333               ; 55555555
    DEFINE_ARGS dst, stride, stride8, stride24, cnt
    mov             cntd, 8
    lea             stride8q, [strideq*8]
    lea             stride24q, [stride8q*3]

.loop:
    mova            [dstq+stride8q*0+ 0], m0
    mova            [dstq+stride8q*0+16], m1
    mova            [dstq+stride8q*0+32], m2
    mova            [dstq+stride8q*0+48], m3
    mova            [dstq+stride8q*1+ 0], m1
    mova            [dstq+stride8q*1+16], m2
    mova            [dstq+stride8q*1+32], m3
    mova            [dstq+stride8q*1+48], m4
    mova            [dstq+stride8q*2+ 0], m2
    mova            [dstq+stride8q*2+16], m3
    mova            [dstq+stride8q*2+32], m4
    mova            [dstq+stride8q*2+48], m4
    mova            [dstq+stride24q + 0], m3
    mova            [dstq+stride24q +16], m4
    mova            [dstq+stride24q +32], m4
    mova            [dstq+stride24q +48], m4
    add             dstq, strideq
%if cpuflag(avx)
    vpalignr        m0, m1, m0, 2
    vpalignr        m1, m2, m1, 2
    vpalignr        m2, m3, m2, 2
%else
    PALIGNR         m5, m1, m0, 2, m6
    mova            m0, m5
    PALIGNR         m5, m2, m1, 2, m6
    mova            m1, m5
    PALIGNR         m5, m3, m2, 2, m6
    mova            m2, m5
%endif
    SHIFT_RIGHT     m3, m3, m6
    dec             cntd
    jg .loop
    RET
%endmacro

INIT_XMM sse2
DL_FUNCS
INIT_XMM ssse3
DL_FUNCS
INIT_XMM avx
DL_FUNCS

%if HAVE_AVX2_EXTERNAL
INIT_YMM avx2
; AVX2 dl 16x16: one ymm register per row; two iterations of eight rows each.
cglobal vp9_ipred_dl_16x16_16, 2, 4, 5, dst, stride, l, a
    movifnidn       aq, amp
    mova            m0, [aq]                    ; abcdefghijklmnop
    vpbroadcastw    xm1, [aq+30]                ; pppppppp
    vperm2i128      m2, m0, m1, q0201           ; ijklmnoppppppppp
    vpalignr        m3, m2, m0, 2               ; bcdefghijklmnopp
    vpalignr        m4, m2, m0, 4               ; cdefghijklmnoppp
    LOWPASS         0, 3, 4                     ; BCDEFGHIJKLMNOPp
    vperm2i128      m2, m0, m1, q0201           ; JKLMNOPppppppppp
    DEFINE_ARGS dst, stride, stride3, cnt
    mov             cntd, 2
    lea             stride3q, [strideq*3]

.loop:
    mova            [dstq+strideq*0], m0
    vpalignr        m3, m2, m0, 2
    vpalignr        m4, m2, m0, 4
    mova            [dstq+strideq*1], m3
    mova            [dstq+strideq*2], m4
    vpalignr        m3, m2, m0, 6
    vpalignr        m4, m2, m0, 8
    mova            [dstq+stride3q ], m3
    lea             dstq, [dstq+strideq*4]
    mova            [dstq+strideq*0], m4
    vpalignr        m3, m2, m0, 10
    vpalignr        m4, m2, m0, 12
    mova            [dstq+strideq*1], m3
    mova            [dstq+strideq*2], m4
    vpalignr        m3, m2, m0, 14
    mova            [dstq+stride3q ], m3
    lea             dstq, [dstq+strideq*4]
    mova            m0, m2
    vperm2i128      m2, m2, m2, q0101           ; pppppppppppppppp
    dec             cntd
    jg .loop
    RET

; AVX2 dl 32x32: two ymm registers per row; four iterations of eight rows each.
cglobal vp9_ipred_dl_32x32_16, 2, 6, 7, dst, stride, l, a
    movifnidn       aq, amp
    mova            m0, [aq+mmsize*0+ 0]        ; abcdefghijklmnop
    mova            m1, [aq+mmsize*1+ 0]        ; qrstuvwxyz012345
    vpbroadcastw    xm4, [aq+mmsize*1+30]       ; 55555555
    vperm2i128      m5, m0, m1, q0201           ; ijklmnopqrstuvwx
    vpalignr        m2, m5, m0, 2               ; bcdefghijklmnopq
    vpalignr        m3, m5, m0, 4               ; cdefghijklmnopqr
    LOWPASS         0, 2, 3                     ; BCDEFGHIJKLMNOPQ
    vperm2i128      m5, m1, m4, q0201           ; yz01234555555555
    vpalignr        m2, m5, m1, 2               ; rstuvwxyz0123455
    vpalignr        m3, m5, m1, 4               ; stuvwxyz01234555
    LOWPASS         1, 2, 3                     ; RSTUVWXYZ......5
    vperm2i128      m2, m1, m4, q0201           ; Z......555555555
    vperm2i128      m5, m0, m1, q0201           ; JKLMNOPQRSTUVWXY
    DEFINE_ARGS dst, stride, stride3, cnt
    lea             stride3q, [strideq*3]
    mov             cntd, 4

.loop:
    mova            [dstq+strideq*0 + 0], m0
    mova            [dstq+strideq*0 +32], m1
    vpalignr        m3, m5, m0, 2
    vpalignr        m4, m2, m1, 2
    mova            [dstq+strideq*1 + 0], m3
    mova            [dstq+strideq*1 +32], m4
    vpalignr        m3, m5, m0, 4
    vpalignr        m4, m2, m1, 4
    mova            [dstq+strideq*2 + 0], m3
    mova            [dstq+strideq*2 +32], m4
    vpalignr        m3, m5, m0, 6
    vpalignr        m4, m2, m1, 6
    mova            [dstq+stride3q*1+ 0], m3
    mova            [dstq+stride3q*1+32], m4
    lea             dstq, [dstq+strideq*4]
    vpalignr        m3, m5, m0, 8
    vpalignr        m4, m2, m1, 8
    mova            [dstq+strideq*0 + 0], m3
    mova            [dstq+strideq*0 +32], m4
    vpalignr        m3, m5, m0, 10
    vpalignr        m4, m2, m1, 10
    mova            [dstq+strideq*1 + 0], m3
    mova            [dstq+strideq*1 +32], m4
    vpalignr        m3, m5, m0, 12
    vpalignr        m4, m2, m1, 12
    mova            [dstq+strideq*2+ 0], m3
    mova            [dstq+strideq*2+32], m4
    vpalignr        m3, m5, m0, 14
    vpalignr        m4, m2, m1, 14
    mova            [dstq+stride3q+ 0], m3
    mova            [dstq+stride3q+ 32], m4
    vpalignr        m3, m5, m0, 16
    vpalignr        m4, m2, m1, 16
    vperm2i128      m5, m3, m4, q0201
    vperm2i128      m2, m4, m4, q0101
    mova            m0, m3
    mova            m1, m4
    lea             dstq, [dstq+strideq*4]
    dec             cntd
    jg .loop
    RET
%endif

; Emits the dr 4x4, 8x8, 16x16 and 32x32 functions for the instruction set
; selected by the preceding INIT_XMM. %1 is the number of mmsize-sized stack
; slots the 32x32 function reserves on x86-32 (see its cglobal line).
%macro DR_FUNCS 1 ; stack_mem_for_32x32_32bit_function
; 4x4: filter left + top-left + above as one vector, store bottom row first.
cglobal vp9_ipred_dr_4x4_16, 4, 4, 3, dst, stride, l, a
    movh            m0, [lq]                    ; wxyz....
    movhps          m0, [aq-2]                  ; wxyz*abc
    movd            m1, [aq+6]                  ; d.......
    PALIGNR         m1, m0, 2, m2               ; xyz*abcd
    psrldq          m2, m1, 2                   ; yz*abcd.
    LOWPASS         0, 1, 2                     ; XYZ#ABC.
    DEFINE_ARGS dst, stride, stride3
    lea             stride3q, [strideq*3]

    movh            [dstq+stride3q ], m0
    psrldq          m0, 2                       ; YZ#ABC..
    movh            [dstq+strideq*2], m0
    psrldq          m0, 2                       ; Z#ABC...
    movh            [dstq+strideq*1], m0
    psrldq          m0, 2                       ; #ABC....
    movh            [dstq+strideq*0], m0
    RET

; 8x8: rows 3..0 are stored as two halves (movhps + movh); rows 7..4 (via
; dst4) take the full register.
cglobal vp9_ipred_dr_8x8_16, 4, 4, 5, dst, stride, l, a
    mova            m0, [lq]                    ; stuvwxyz
    movu            m1, [aq-2]                  ; *abcdefg
    mova            m2, [aq]                    ; abcdefgh
    psrldq          m3, m2, 2                   ; bcdefgh.
    LOWPASS         3, 2, 1                     ; ABCDEFG.
    PALIGNR         m1, m0, 2, m4               ; tuvwxyz*
    PALIGNR         m2, m1, 2, m4               ; uvwxyz*a
    LOWPASS         2, 1, 0                     ; TUVWXYZ#
    DEFINE_ARGS dst, stride, dst4, stride3
    lea             stride3q, [strideq*3]
    lea             dst4q, [dstq+strideq*4]

    movhps          [dstq +stride3q +0], m2
    movh            [dstq+ stride3q +8], m3
    mova            [dst4q+stride3q +0], m2
    PALIGNR         m1, m3, m2, 2, m0
    psrldq          m3, 2
    movhps          [dstq +strideq*2+0], m1
    movh            [dstq+ strideq*2+8], m3
    mova            [dst4q+strideq*2+0], m1
    PALIGNR         m2, m3, m1, 2, m0
    psrldq          m3, 2
    movhps          [dstq +strideq*1+0], m2
    movh            [dstq+ strideq*1+8], m3
    mova            [dst4q+strideq*1+0], m2
    PALIGNR         m1, m3, m2, 2, m0
    psrldq          m3, 2
    movhps          [dstq +strideq*0+0], m1
    movh            [dstq+ strideq*0+8], m3
    mova            [dst4q+strideq*0+0], m1
    RET

; 16x16: eight iterations walking upwards from row 7; each stores row r
; (m4, m5) and row r+8 (m2, m4), then advances the m2:m4:m5:m6 chain.
cglobal vp9_ipred_dr_16x16_16, 4, 4, 7, dst, stride, l, a
    mova            m0, [lq]                    ; klmnopqr
    mova            m1, [lq+mmsize]             ; stuvwxyz
    movu            m2, [aq-2]                  ; *abcdefg
    movu            m3, [aq+mmsize-2]           ; hijklmno
    mova            m4, [aq]                    ; abcdefgh
    mova            m5, [aq+mmsize]             ; ijklmnop
    psrldq          m6, m5, 2                   ; jklmnop.
    LOWPASS         6, 5, 3                     ; IJKLMNO.
    PALIGNR         m5, m4, 2, m3               ; bcdefghi
    LOWPASS         5, 4, 2                     ; ABCDEFGH
    PALIGNR         m2, m1, 2, m3               ; tuvwxyz*
    PALIGNR         m4, m2, 2, m3               ; uvwxyz*a
    LOWPASS         4, 2, 1                     ; TUVWXYZ#
    PALIGNR         m1, m0, 2, m3               ; lmnopqrs
    PALIGNR         m2, m1, 2, m3               ; mnopqrst
    LOWPASS         2, 1, 0                     ; LMNOPQRS
    DEFINE_ARGS dst, stride, dst8, cnt
    lea             dst8q, [dstq+strideq*8]
    mov             cntd, 8

.loop:
    sub             dst8q, strideq
    mova            [dst8q+strideq*0+ 0], m4
    mova            [dst8q+strideq*0+16], m5
    mova            [dst8q+strideq*8+ 0], m2
    mova            [dst8q+strideq*8+16], m4
%if cpuflag(avx)
    vpalignr        m2, m4, m2, 2
    vpalignr        m4, m5, m4, 2
    vpalignr        m5, m6, m5, 2
%else
    PALIGNR         m0, m4, m2, 2, m1
    mova            m2, m0
    PALIGNR         m0, m5, m4, 2, m1
    mova            m4, m0
    PALIGNR         m0, m6, m5, 2, m1
    mova            m5, m0
%endif
    psrldq          m6, 2
    dec             cntd
    jg .loop
    RET

; 32x32: eight iterations walking upwards from row 7; each stores rows r,
; r+8, r+16 and r+24. Values that do not fit in the available registers are
; parked with SCRATCH/UNSCRATCH (m8-m10 or the rsp slots named in each call).
cglobal vp9_ipred_dr_32x32_16, 4, 5, 10 + notcpuflag(ssse3), \
                               %1 * ARCH_X86_32 * -mmsize, dst, stride, l, a
    mova            m0, [aq+mmsize*3]           ; a[24-31]
    movu            m1, [aq+mmsize*3-2]         ; a[23-30]
    psrldq          m2, m0, 2                   ; a[25-31].
    LOWPASS         2, 0, 1                     ; A[24-30].
    mova            m1, [aq+mmsize*2]           ; a[16-23]
    movu            m3, [aq+mmsize*2-2]         ; a[15-22]
    PALIGNR         m0, m1, 2, m4               ; a[17-24]
    LOWPASS         0, 1, 3                     ; A[16-23]
    mova            m3, [aq+mmsize*1]           ; a[8-15]
    movu            m4, [aq+mmsize*1-2]         ; a[7-14]
    PALIGNR         m1, m3, 2, m5               ; a[9-16]
    LOWPASS         1, 3, 4                     ; A[8-15]
    mova            m4, [aq+mmsize*0]           ; a[0-7]
    movu            m5, [aq+mmsize*0-2]         ; *a[0-6]
    PALIGNR         m3, m4, 2, m6               ; a[1-8]
    LOWPASS         3, 4, 5                     ; A[0-7]
    SCRATCH         1, 8, rsp+0*mmsize
    SCRATCH         3, 9, rsp+1*mmsize
%if notcpuflag(ssse3)
    SCRATCH         0, 10, rsp+2*mmsize
%endif
    mova            m6, [lq+mmsize*3]           ; l[24-31]
    PALIGNR         m5, m6, 2, m0               ; l[25-31]*
    PALIGNR         m4, m5, 2, m0               ; l[26-31]*a
    LOWPASS         4, 5, 6                     ; L[25-31]#
    mova            m7, [lq+mmsize*2]           ; l[16-23]
    PALIGNR         m6, m7, 2, m0               ; l[17-24]
    PALIGNR         m5, m6, 2, m0               ; l[18-25]
    LOWPASS         5, 6, 7                     ; L[17-24]
    mova            m1, [lq+mmsize*1]           ; l[8-15]
    PALIGNR         m7, m1, 2, m0               ; l[9-16]
    PALIGNR         m6, m7, 2, m0               ; l[10-17]
    LOWPASS         6, 7, 1                     ; L[9-16]
    mova            m3, [lq+mmsize*0]           ; l[0-7]
    PALIGNR         m1, m3, 2, m0               ; l[1-8]
    PALIGNR         m7, m1, 2, m0               ; l[2-9]
    LOWPASS         7, 1, 3                     ; L[1-8]
%if cpuflag(ssse3)
%if cpuflag(avx)
    UNSCRATCH       1, 8, rsp+0*mmsize
%endif
    UNSCRATCH       3, 9, rsp+1*mmsize
%else
    UNSCRATCH       0, 10, rsp+2*mmsize
%endif
    DEFINE_ARGS dst8, stride, stride8, stride24, cnt
    lea             stride8q, [strideq*8]
    lea             stride24q, [stride8q*3]
    lea             dst8q, [dst8q+strideq*8]
    mov             cntd, 8

.loop:
    sub             dst8q, strideq
%if notcpuflag(avx)
    UNSCRATCH       1, 8, rsp+0*mmsize
%if notcpuflag(ssse3)
    UNSCRATCH       3, 9, rsp+1*mmsize
%endif
%endif
    mova            [dst8q+stride8q*0+ 0], m4
    mova            [dst8q+stride8q*0+16], m3
    mova            [dst8q+stride8q*0+32], m1
    mova            [dst8q+stride8q*0+48], m0
    mova            [dst8q+stride8q*1+ 0], m5
    mova            [dst8q+stride8q*1+16], m4
    mova            [dst8q+stride8q*1+32], m3
    mova            [dst8q+stride8q*1+48], m1
    mova            [dst8q+stride8q*2+ 0], m6
    mova            [dst8q+stride8q*2+16], m5
    mova            [dst8q+stride8q*2+32], m4
    mova            [dst8q+stride8q*2+48], m3
    mova            [dst8q+stride24q + 0], m7
    mova            [dst8q+stride24q +16], m6
    mova            [dst8q+stride24q +32], m5
    mova            [dst8q+stride24q +48], m4
%if cpuflag(avx)
    vpalignr        m7, m6, m7, 2
    vpalignr        m6, m5, m6, 2
    vpalignr        m5, m4, m5, 2
    vpalignr        m4, m3, m4, 2
    vpalignr        m3, m1, m3, 2
    vpalignr        m1, m0, m1, 2
    vpalignr        m0, m2, m0, 2
%else
    SCRATCH         2, 8, rsp+0*mmsize
%if notcpuflag(ssse3)
    SCRATCH         0, 9, rsp+1*mmsize
%endif
    PALIGNR         m2, m6, m7, 2, m0
    mova            m7, m2
    PALIGNR         m2, m5, m6, 2, m0
    mova            m6, m2
    PALIGNR         m2, m4, m5, 2, m0
    mova            m5, m2
    PALIGNR         m2, m3, m4, 2, m0
    mova            m4, m2
    PALIGNR         m2, m1, m3, 2, m0
    mova            m3, m2
%if notcpuflag(ssse3)
    UNSCRATCH       0, 9, rsp+1*mmsize
    SCRATCH         3, 9, rsp+1*mmsize
%endif
    PALIGNR         m2, m0, m1, 2, m3
    mova            m1, m2
    UNSCRATCH       2, 8, rsp+0*mmsize
    SCRATCH         1, 8, rsp+0*mmsize
    PALIGNR         m1, m2, m0, 2, m3
    mova            m0, m1
%endif
    psrldq          m2, 2
    dec             cntd
    jg .loop
    RET
%endmacro

INIT_XMM sse2
DR_FUNCS 3
INIT_XMM ssse3
DR_FUNCS 2
INIT_XMM avx
DR_FUNCS 2

%if HAVE_AVX2_EXTERNAL
INIT_YMM avx2
; AVX2 dr 16x16, fully unrolled. The number in each store comment is the
; destination row; dst3 is dst+4*stride at first and dst+3*stride after the sub.
cglobal vp9_ipred_dr_16x16_16, 4, 5, 6, dst, stride, l, a
    mova            m0, [lq]                    ; klmnopqrstuvwxyz
    movu            m1, [aq-2]                  ; *abcdefghijklmno
    mova            m2, [aq]                    ; abcdefghijklmnop
    vperm2i128      m4, m2, m2, q2001           ; ijklmnop........
    vpalignr        m5, m4, m2, 2               ; bcdefghijklmnop.
    vperm2i128      m3, m0, m1, q0201           ; stuvwxyz*abcdefg
    LOWPASS         1, 2, 5                     ; ABCDEFGHIJKLMNO.
    vpalignr        m4, m3, m0, 2               ; lmnopqrstuvwxyz*
    vpalignr        m5, m3, m0, 4               ; mnopqrstuvwxyz*a
    LOWPASS         0, 4, 5                     ; LMNOPQRSTUVWXYZ#
    vperm2i128      m5, m0, m1, q0201           ; TUVWXYZ#ABCDEFGH
    DEFINE_ARGS dst, stride, stride3, stride5, dst3
    lea             dst3q, [dstq+strideq*4]
    lea             stride3q, [strideq*3]
    lea             stride5q, [stride3q+strideq*2]

    vpalignr        m3, m5, m0, 2
    vpalignr        m4, m1, m5, 2
    mova            [dst3q+stride5q*2], m3      ; 14
    mova            [ dstq+stride3q*2], m4      ; 6
    vpalignr        m3, m5, m0, 4
    vpalignr        m4, m1, m5, 4
    sub             dst3q, strideq
    mova            [dst3q+stride5q*2], m3      ; 13
    mova            [dst3q+strideq*2 ], m4      ; 5
    mova            [dst3q+stride3q*4], m0      ; 15
    vpalignr        m3, m5, m0, 6
    vpalignr        m4, m1, m5, 6
    mova            [dstq+stride3q*4], m3       ; 12
    mova            [dst3q+strideq*1], m4       ; 4
    vpalignr        m3, m5, m0, 8
    vpalignr        m4, m1, m5, 8
    mova            [dst3q+strideq*8], m3       ; 11
    mova            [dst3q+strideq*0], m4       ; 3
    vpalignr        m3, m5, m0, 10
    vpalignr        m4, m1, m5, 10
    mova            [dstq+stride5q*2], m3       ; 10
    mova            [dstq+strideq*2 ], m4       ; 2
    vpalignr        m3, m5, m0, 12
    vpalignr        m4, m1, m5, 12
    mova            [dst3q+stride3q*2], m3      ; 9
    mova            [dstq+strideq*1 ], m4       ; 1
    vpalignr        m3, m5, m0, 14
    vpalignr        m4, m1, m5, 14
    mova            [dstq+strideq*8], m3        ; 8
    mova            [dstq+strideq*0], m4        ; 0
    mova            [dst3q+strideq*4], m5       ; 7
    RET

; AVX2 vl 16x16, fully unrolled. Even rows come from the pavgw pair (m3/m4),
; odd rows from the LOWPASS pair (m5/m6); each row pair advances one pixel.
cglobal vp9_ipred_vl_16x16_16, 4, 5, 7, dst, stride, l, a
    movifnidn       aq, amp
    mova            m0, [aq]                    ; abcdefghijklmnop
    vpbroadcastw    xm1, [aq+30]                ; pppppppp
    vperm2i128      m2, m0, m1, q0201           ; ijklmnoppppppppp
    vpalignr        m3, m2, m0, 2               ; bcdefghijklmnopp
    vperm2i128      m4, m3, m1, q0201           ; jklmnopppppppppp
    vpalignr        m5, m2, m0, 4               ; cdefghijklmnoppp
    vperm2i128      m6, m5, m1, q0201           ; klmnoppppppppppp
    LOWPASS         5, 3, 0                     ; BCDEFGHIJKLMNOPP
    LOWPASS         6, 4, 2                     ; JKLMNOPPPPPPPPPP
    pavgw           m3, m0                      ; abcdefghijklmnop
    pavgw           m4, m2                      ; ijklmnoppppppppp
    DEFINE_ARGS dst, stride, stride3, stride5, dst4
    lea             dst4q, [dstq+strideq*4]
    lea             stride3q, [strideq*3]
    lea             stride5q, [stride3q+strideq*2]

    mova            [dstq+strideq*0], m3        ; 0  abcdefghijklmnop
    mova            [dstq+strideq*1], m5        ; 1  BCDEFGHIJKLMNOPP
    vpalignr        m0, m4, m3, 2
    vpalignr        m1, m6, m5, 2
    mova            [dstq+strideq*2 ], m0       ; 2  bcdefghijklmnopp
    mova            [dstq+stride3q*1], m1       ; 3  CDEFGHIJKLMNOPPP
    vpalignr        m0, m4, m3, 4
    vpalignr        m1, m6, m5, 4
    mova            [dst4q+strideq*0], m0       ; 4  cdefghijklmnoppp
    mova            [dstq+stride5q*1], m1       ; 5  DEFGHIJKLMNOPPPP
    vpalignr        m0, m4, m3, 6
    vpalignr        m1, m6, m5, 6
    mova            [ dstq+stride3q*2], m0      ; 6  defghijklmnopppp
    mova            [dst4q+stride3q*1], m1      ; 7  EFGHIJKLMNOPPPPP
    vpalignr        m0, m4, m3, 8
    vpalignr        m1, m6, m5, 8
    mova            [ dstq+strideq*8], m0       ; 8  efghijklmnoppppp
    mova            [dst4q+stride5q*1], m1      ; 9  FGHIJKLMNOPPPPPP
    vpalignr        m0, m4, m3, 10
    mova            [dstq+stride5q*2], m0       ; 10 fghijklmnopppppp
    vpalignr        m0, m4, m3, 12
    mova            [dst4q+strideq*8], m0       ; 12 ghijklmnoppppppp
    vpalignr        m0, m4, m3, 14
    mova            [dst4q+stride5q*2], m0      ; 14 hijklmnopppppppp
    sub             dst4q, strideq
    vpalignr        m1, m6, m5, 10
    mova            [dst4q+strideq*8], m1       ; 11 GHIJKLMNOPPPPPPP
    vpalignr        m1, m6, m5, 12
    mova            [dst4q+stride5q*2], m1      ; 13 HIJKLMNOPPPPPPPP
    vpalignr        m1, m6, m5, 14
    mova            [dst4q+stride3q*4], m1      ; 15 IJKLMNOPPPPPPPPP
    RET

; AVX2 hd 16x16, fully unrolled. The left column is interleaved as
; (average, lowpass) word pairs and followed by the filtered above row; every
; row further up starts two words (4 bytes) later in that sequence.
cglobal vp9_ipred_hd_16x16_16, 4, 5, 7, dst, stride, l, a
    movu            m0, [aq-2]                  ; *abcdefghijklmno
    mova            m1, [lq]                    ; klmnopqrstuvwxyz
    vperm2i128      m2, m1, m0, q0201           ; stuvwxyz*abcdefg
    vpalignr        m3, m2, m1, 2               ; lmnopqrstuvwxyz*
    vpalignr        m4, m2, m1, 4               ; mnopqrstuvwxyz*a
    LOWPASS         4, 3, 1                     ; LMNOPQRSTUVWXYZ#
    pavgw           m3, m1                      ; klmnopqrstuvwxyz
    mova            m1, [aq]                    ; abcdefghijklmnop
    movu            m2, [aq+2]                  ; bcdefghijklmnop.
    LOWPASS         2, 1, 0                     ; ABCDEFGHIJKLMNO.
    vpunpcklwd      m0, m3, m4                  ; kLlMmNnOsTtUuVvW
    vpunpckhwd      m1, m3, m4                  ; oPpQqRrSwXxYyZz#
    vperm2i128      m3, m1, m0, q0002           ; kLlMmNnOoPpQqRrS
    vperm2i128      m4, m0, m1, q0301           ; sTtUuVvWwXxYyZz#
    vperm2i128      m0, m4, m2, q0201           ; wXxYyZz#ABCDEFGH
    vperm2i128      m1, m3, m4, q0201           ; oPpQqRrSsTtUuVvW
    DEFINE_ARGS dst, stride, stride3, stride5, dst5
    lea             stride3q, [strideq*3]
    lea             stride5q, [stride3q+strideq*2]
    lea             dst5q, [dstq+stride5q]

    mova            [dst5q+stride5q*2], m3      ; 15 kLlMmNnOoPpQqRrS
    mova            [dst5q+stride3q*2], m1      ; 11 oPpQqRrSsTtUuVvW
    mova            [dst5q+strideq*2], m4       ; 7  sTtUuVvWwXxYyZz#
    mova            [dstq+stride3q*1], m0       ; 3  wXxYyZz#ABCDEFGH
    vpalignr        m5, m4, m1, 4
    mova            [dstq+stride5q*2], m5       ; 10 pQqRrSsTtUuVvWwX
    vpalignr        m5, m0, m4, 4
    vpalignr        m6, m2, m0, 4
    mova            [dstq+stride3q*2], m5       ; 6  tUuVvWwXxYyZz#AB
    mova            [dstq+strideq*2], m6        ; 2  xYyZz#ABCDEFGHIJ
    vpalignr        m5, m4, m1, 8
    mova            [dst5q+strideq*4], m5       ; 9  qRrSsTtUuVvWwXxY
    vpalignr        m5, m0, m4, 8
    vpalignr        m6, m2, m0, 8
    mova            [dstq+stride5q*1], m5       ; 5  uVvWwXxYyZz#ABCD
    mova            [dstq+strideq*1], m6        ; 1  yZz#ABCDEFGHIJKL
    vpalignr        m5, m1, m3, 12
    vpalignr        m6, m4, m1, 12
    mova            [dstq+stride3q*4], m5       ; 12 nOoPpQqRrSsTtUuV
    mova            [dst5q+stride3q], m6        ; 8  rSsTtUuVvWwXxYyZ
    vpalignr        m5, m0, m4, 12
    vpalignr        m6, m2, m0, 12
    mova            [dstq+strideq*4], m5        ; 4  vWwXxYyZz#ABCDEF
    mova            [dstq+strideq*0], m6        ; 0  z#ABCDEFGHIJKLMN
    sub             dst5q, strideq
    vpalignr        m5, m1, m3, 4
    mova            [dst5q+stride5q*2], m5      ; 14 lMmNnOoPpQqRrSsT
    sub             dst5q, strideq
    vpalignr        m5, m1, m3, 8
    mova            [dst5q+stride5q*2], m5      ; 13 mNnOoPpQqRrSsTtU
    RET

%if ARCH_X86_64
cglobal vp9_ipred_dr_32x32_16, 4, 7, 10, dst, stride, l, a
    mova            m0, [lq+mmsize*0+0]         ; l[0-15]
    mova            m1, [lq+mmsize*1+0]         ; l[16-31]
    movu            m2, [aq+mmsize*0-2]         ; *abcdefghijklmno
    mova            m3, [aq+mmsize*0+0]         ; abcdefghijklmnop
    mova            m4, [aq+mmsize*1+0]         ; qrstuvwxyz012345
    vperm2i128      m5, m0, m1, q0201           ; lmnopqrstuvwxyz0
    vpalignr        m6, m5, m0, 2               ; mnopqrstuvwxyz01
    vpalignr        m7, m5, m0, 4               ; nopqrstuvwxyz012
    LOWPASS         0, 6, 7                     ; L[0-15]
    vperm2i128      m7, m1, m2, q0201           ; stuvwxyz*abcdefg
    vpalignr        m5, m7, m1, 2               ; lmnopqrstuvwxyz*
    vpalignr        m6, m7, m1, 4               ; mnopqrstuvwxyz*a
    LOWPASS         1, 5, 6
; L[16-31]# 1345cabdff1aSopenharmony_ci vperm2i128 m5, m3, m4, q0201 ; ijklmnopqrstuvwx 1346cabdff1aSopenharmony_ci vpalignr m6, m5, m3, 2 ; bcdefghijklmnopq 1347cabdff1aSopenharmony_ci LOWPASS 2, 3, 6 ; A[0-15] 1348cabdff1aSopenharmony_ci movu m3, [aq+mmsize*1-2] ; pqrstuvwxyz01234 1349cabdff1aSopenharmony_ci vperm2i128 m6, m4, m4, q2001 ; yz012345........ 1350cabdff1aSopenharmony_ci vpalignr m7, m6, m4, 2 ; rstuvwxyz012345. 1351cabdff1aSopenharmony_ci LOWPASS 3, 4, 7 ; A[16-31]. 1352cabdff1aSopenharmony_ci vperm2i128 m4, m1, m2, q0201 ; TUVWXYZ#ABCDEFGH 1353cabdff1aSopenharmony_ci vperm2i128 m5, m0, m1, q0201 ; L[7-15]L[16-23] 1354cabdff1aSopenharmony_ci vperm2i128 m8, m2, m3, q0201 ; IJKLMNOPQRSTUVWX 1355cabdff1aSopenharmony_ci DEFINE_ARGS dst8, stride, stride3, stride7, stride5, dst24, cnt 1356cabdff1aSopenharmony_ci lea stride3q, [strideq*3] 1357cabdff1aSopenharmony_ci lea stride5q, [stride3q+strideq*2] 1358cabdff1aSopenharmony_ci lea stride7q, [strideq*4+stride3q] 1359cabdff1aSopenharmony_ci lea dst24q, [dst8q+stride3q*8] 1360cabdff1aSopenharmony_ci lea dst8q, [dst8q+strideq*8] 1361cabdff1aSopenharmony_ci mov cntd, 2 1362cabdff1aSopenharmony_ci 1363cabdff1aSopenharmony_ci.loop: 1364cabdff1aSopenharmony_ci mova [dst24q+stride7q+0 ], m0 ; 31 23 15 7 1365cabdff1aSopenharmony_ci mova [dst24q+stride7q+32], m1 1366cabdff1aSopenharmony_ci mova [dst8q+stride7q+0], m1 1367cabdff1aSopenharmony_ci mova [dst8q+stride7q+32], m2 1368cabdff1aSopenharmony_ci vpalignr m6, m4, m1, 2 1369cabdff1aSopenharmony_ci vpalignr m7, m5, m0, 2 1370cabdff1aSopenharmony_ci vpalignr m9, m8, m2, 2 1371cabdff1aSopenharmony_ci mova [dst24q+stride3q*2+0], m7 ; 30 22 14 6 1372cabdff1aSopenharmony_ci mova [dst24q+stride3q*2+32], m6 1373cabdff1aSopenharmony_ci mova [dst8q+stride3q*2+0], m6 1374cabdff1aSopenharmony_ci mova [dst8q+stride3q*2+32], m9 1375cabdff1aSopenharmony_ci vpalignr m6, m4, m1, 4 1376cabdff1aSopenharmony_ci vpalignr m7, m5, m0, 4 1377cabdff1aSopenharmony_ci vpalignr m9, m8, m2, 4 
1378cabdff1aSopenharmony_ci mova [dst24q+stride5q+0], m7 ; 29 21 13 5 1379cabdff1aSopenharmony_ci mova [dst24q+stride5q+32], m6 1380cabdff1aSopenharmony_ci mova [dst8q+stride5q+0], m6 1381cabdff1aSopenharmony_ci mova [dst8q+stride5q+32], m9 1382cabdff1aSopenharmony_ci vpalignr m6, m4, m1, 6 1383cabdff1aSopenharmony_ci vpalignr m7, m5, m0, 6 1384cabdff1aSopenharmony_ci vpalignr m9, m8, m2, 6 1385cabdff1aSopenharmony_ci mova [dst24q+strideq*4+0 ], m7 ; 28 20 12 4 1386cabdff1aSopenharmony_ci mova [dst24q+strideq*4+32], m6 1387cabdff1aSopenharmony_ci mova [dst8q+strideq*4+0], m6 1388cabdff1aSopenharmony_ci mova [dst8q+strideq*4+32], m9 1389cabdff1aSopenharmony_ci vpalignr m6, m4, m1, 8 1390cabdff1aSopenharmony_ci vpalignr m7, m5, m0, 8 1391cabdff1aSopenharmony_ci vpalignr m9, m8, m2, 8 1392cabdff1aSopenharmony_ci mova [dst24q+stride3q+0 ], m7 ; 27 19 11 3 1393cabdff1aSopenharmony_ci mova [dst24q+stride3q+32], m6 1394cabdff1aSopenharmony_ci mova [dst8q+stride3q+0], m6 1395cabdff1aSopenharmony_ci mova [dst8q+stride3q+32], m9 1396cabdff1aSopenharmony_ci vpalignr m6, m4, m1, 10 1397cabdff1aSopenharmony_ci vpalignr m7, m5, m0, 10 1398cabdff1aSopenharmony_ci vpalignr m9, m8, m2, 10 1399cabdff1aSopenharmony_ci mova [dst24q+strideq*2+0 ], m7 ; 26 18 10 2 1400cabdff1aSopenharmony_ci mova [dst24q+strideq*2+32], m6 1401cabdff1aSopenharmony_ci mova [dst8q+strideq*2+0], m6 1402cabdff1aSopenharmony_ci mova [dst8q+strideq*2+32], m9 1403cabdff1aSopenharmony_ci vpalignr m6, m4, m1, 12 1404cabdff1aSopenharmony_ci vpalignr m7, m5, m0, 12 1405cabdff1aSopenharmony_ci vpalignr m9, m8, m2, 12 1406cabdff1aSopenharmony_ci mova [dst24q+strideq+0 ], m7 ; 25 17 9 1 1407cabdff1aSopenharmony_ci mova [dst24q+strideq+32], m6 1408cabdff1aSopenharmony_ci mova [dst8q+strideq+0], m6 1409cabdff1aSopenharmony_ci mova [dst8q+strideq+32], m9 1410cabdff1aSopenharmony_ci vpalignr m6, m4, m1, 14 1411cabdff1aSopenharmony_ci vpalignr m7, m5, m0, 14 1412cabdff1aSopenharmony_ci vpalignr m9, m8, m2, 14 
1413cabdff1aSopenharmony_ci mova [dst24q+strideq*0+0 ], m7 ; 24 16 8 0 1414cabdff1aSopenharmony_ci mova [dst24q+strideq*0+32], m6 1415cabdff1aSopenharmony_ci mova [dst8q+strideq*0+0], m6 1416cabdff1aSopenharmony_ci mova [dst8q+strideq*0+32], m9 1417cabdff1aSopenharmony_ci mova m0, m5 1418cabdff1aSopenharmony_ci mova m5, m1 1419cabdff1aSopenharmony_ci mova m1, m4 1420cabdff1aSopenharmony_ci mova m4, m2 1421cabdff1aSopenharmony_ci mova m2, m8 1422cabdff1aSopenharmony_ci mova m8, m3 1423cabdff1aSopenharmony_ci sub dst24q, stride7q 1424cabdff1aSopenharmony_ci sub dst24q, strideq 1425cabdff1aSopenharmony_ci sub dst8q, stride7q 1426cabdff1aSopenharmony_ci sub dst8q, strideq 1427cabdff1aSopenharmony_ci dec cntd 1428cabdff1aSopenharmony_ci jg .loop 1429cabdff1aSopenharmony_ci RET 1430cabdff1aSopenharmony_ci%endif 1431cabdff1aSopenharmony_ci%endif 1432cabdff1aSopenharmony_ci 1433cabdff1aSopenharmony_ci%macro VL_FUNCS 1 ; stack_mem_for_32x32_32bit_function 1434cabdff1aSopenharmony_cicglobal vp9_ipred_vl_4x4_16, 2, 4, 3, dst, stride, l, a 1435cabdff1aSopenharmony_ci movifnidn aq, amp 1436cabdff1aSopenharmony_ci movu m0, [aq] ; abcdefgh 1437cabdff1aSopenharmony_ci psrldq m1, m0, 2 ; bcdefgh. 1438cabdff1aSopenharmony_ci psrldq m2, m0, 4 ; cdefgh.. 1439cabdff1aSopenharmony_ci LOWPASS 2, 1, 0 ; BCDEFGH. 1440cabdff1aSopenharmony_ci pavgw m1, m0 ; ABCDEFG. 
1441cabdff1aSopenharmony_ci DEFINE_ARGS dst, stride, stride3 1442cabdff1aSopenharmony_ci lea stride3q, [strideq*3] 1443cabdff1aSopenharmony_ci 1444cabdff1aSopenharmony_ci movh [dstq+strideq*0], m1 1445cabdff1aSopenharmony_ci movh [dstq+strideq*1], m2 1446cabdff1aSopenharmony_ci psrldq m1, 2 1447cabdff1aSopenharmony_ci psrldq m2, 2 1448cabdff1aSopenharmony_ci movh [dstq+strideq*2], m1 1449cabdff1aSopenharmony_ci movh [dstq+stride3q ], m2 1450cabdff1aSopenharmony_ci RET 1451cabdff1aSopenharmony_ci 1452cabdff1aSopenharmony_cicglobal vp9_ipred_vl_8x8_16, 2, 4, 4, dst, stride, l, a 1453cabdff1aSopenharmony_ci movifnidn aq, amp 1454cabdff1aSopenharmony_ci mova m0, [aq] ; abcdefgh 1455cabdff1aSopenharmony_ci%if cpuflag(ssse3) 1456cabdff1aSopenharmony_ci mova m3, [pb_2to15_14_15] 1457cabdff1aSopenharmony_ci%endif 1458cabdff1aSopenharmony_ci SHIFT_RIGHTx2 m1, m2, m0, m3 ; bcdefghh/cdefghhh 1459cabdff1aSopenharmony_ci LOWPASS 2, 1, 0 ; BCDEFGHh 1460cabdff1aSopenharmony_ci pavgw m1, m0 ; ABCDEFGh 1461cabdff1aSopenharmony_ci DEFINE_ARGS dst, stride, stride3 1462cabdff1aSopenharmony_ci lea stride3q, [strideq*3] 1463cabdff1aSopenharmony_ci 1464cabdff1aSopenharmony_ci mova [dstq+strideq*0], m1 1465cabdff1aSopenharmony_ci mova [dstq+strideq*1], m2 1466cabdff1aSopenharmony_ci SHIFT_RIGHT m1, m1, m3 1467cabdff1aSopenharmony_ci SHIFT_RIGHT m2, m2, m3 1468cabdff1aSopenharmony_ci mova [dstq+strideq*2], m1 1469cabdff1aSopenharmony_ci mova [dstq+stride3q ], m2 1470cabdff1aSopenharmony_ci lea dstq, [dstq+strideq*4] 1471cabdff1aSopenharmony_ci SHIFT_RIGHT m1, m1, m3 1472cabdff1aSopenharmony_ci SHIFT_RIGHT m2, m2, m3 1473cabdff1aSopenharmony_ci mova [dstq+strideq*0], m1 1474cabdff1aSopenharmony_ci mova [dstq+strideq*1], m2 1475cabdff1aSopenharmony_ci SHIFT_RIGHT m1, m1, m3 1476cabdff1aSopenharmony_ci SHIFT_RIGHT m2, m2, m3 1477cabdff1aSopenharmony_ci mova [dstq+strideq*2], m1 1478cabdff1aSopenharmony_ci mova [dstq+stride3q ], m2 1479cabdff1aSopenharmony_ci RET 1480cabdff1aSopenharmony_ci 
1481cabdff1aSopenharmony_cicglobal vp9_ipred_vl_16x16_16, 2, 4, 6, dst, stride, l, a 1482cabdff1aSopenharmony_ci movifnidn aq, amp 1483cabdff1aSopenharmony_ci mova m0, [aq] 1484cabdff1aSopenharmony_ci mova m1, [aq+mmsize] 1485cabdff1aSopenharmony_ci PALIGNR m2, m1, m0, 2, m3 1486cabdff1aSopenharmony_ci PALIGNR m3, m1, m0, 4, m4 1487cabdff1aSopenharmony_ci LOWPASS 3, 2, 0 1488cabdff1aSopenharmony_ci pavgw m2, m0 1489cabdff1aSopenharmony_ci%if cpuflag(ssse3) 1490cabdff1aSopenharmony_ci mova m4, [pb_2to15_14_15] 1491cabdff1aSopenharmony_ci%endif 1492cabdff1aSopenharmony_ci SHIFT_RIGHTx2 m5, m0, m1, m4 1493cabdff1aSopenharmony_ci LOWPASS 0, 5, 1 1494cabdff1aSopenharmony_ci pavgw m1, m5 1495cabdff1aSopenharmony_ci DEFINE_ARGS dst, stride, cnt 1496cabdff1aSopenharmony_ci mov cntd, 8 1497cabdff1aSopenharmony_ci 1498cabdff1aSopenharmony_ci.loop: 1499cabdff1aSopenharmony_ci mova [dstq+strideq*0+ 0], m2 1500cabdff1aSopenharmony_ci mova [dstq+strideq*0+16], m1 1501cabdff1aSopenharmony_ci mova [dstq+strideq*1+ 0], m3 1502cabdff1aSopenharmony_ci mova [dstq+strideq*1+16], m0 1503cabdff1aSopenharmony_ci lea dstq, [dstq+strideq*2] 1504cabdff1aSopenharmony_ci%if cpuflag(avx) 1505cabdff1aSopenharmony_ci vpalignr m2, m1, m2, 2 1506cabdff1aSopenharmony_ci vpalignr m3, m0, m3, 2 1507cabdff1aSopenharmony_ci%else 1508cabdff1aSopenharmony_ci PALIGNR m5, m1, m2, 2, m4 1509cabdff1aSopenharmony_ci mova m2, m5 1510cabdff1aSopenharmony_ci PALIGNR m5, m0, m3, 2, m4 1511cabdff1aSopenharmony_ci mova m3, m5 1512cabdff1aSopenharmony_ci%endif 1513cabdff1aSopenharmony_ci SHIFT_RIGHT m1, m1, m4 1514cabdff1aSopenharmony_ci SHIFT_RIGHT m0, m0, m4 1515cabdff1aSopenharmony_ci dec cntd 1516cabdff1aSopenharmony_ci jg .loop 1517cabdff1aSopenharmony_ci RET 1518cabdff1aSopenharmony_ci 1519cabdff1aSopenharmony_cicglobal vp9_ipred_vl_32x32_16, 2, 5, 11, %1 * mmsize * ARCH_X86_32, dst, stride, l, a 1520cabdff1aSopenharmony_ci movifnidn aq, amp 1521cabdff1aSopenharmony_ci mova m0, [aq+mmsize*0] 
1522cabdff1aSopenharmony_ci mova m1, [aq+mmsize*1] 1523cabdff1aSopenharmony_ci mova m2, [aq+mmsize*2] 1524cabdff1aSopenharmony_ci PALIGNR m6, m1, m0, 2, m5 1525cabdff1aSopenharmony_ci PALIGNR m7, m1, m0, 4, m5 1526cabdff1aSopenharmony_ci LOWPASS 7, 6, 0 1527cabdff1aSopenharmony_ci pavgw m6, m0 1528cabdff1aSopenharmony_ci SCRATCH 6, 8, rsp+0*mmsize 1529cabdff1aSopenharmony_ci PALIGNR m4, m2, m1, 2, m0 1530cabdff1aSopenharmony_ci PALIGNR m5, m2, m1, 4, m0 1531cabdff1aSopenharmony_ci LOWPASS 5, 4, 1 1532cabdff1aSopenharmony_ci pavgw m4, m1 1533cabdff1aSopenharmony_ci mova m0, [aq+mmsize*3] 1534cabdff1aSopenharmony_ci PALIGNR m1, m0, m2, 2, m6 1535cabdff1aSopenharmony_ci PALIGNR m3, m0, m2, 4, m6 1536cabdff1aSopenharmony_ci LOWPASS 3, 1, 2 1537cabdff1aSopenharmony_ci pavgw m2, m1 1538cabdff1aSopenharmony_ci%if cpuflag(ssse3) 1539cabdff1aSopenharmony_ci PRELOAD 10, pb_2to15_14_15, shuf 1540cabdff1aSopenharmony_ci%endif 1541cabdff1aSopenharmony_ci SHIFT_RIGHTx2 m6, m1, m0, reg_shuf 1542cabdff1aSopenharmony_ci LOWPASS 1, 6, 0 1543cabdff1aSopenharmony_ci pavgw m0, m6 1544cabdff1aSopenharmony_ci%if ARCH_X86_64 1545cabdff1aSopenharmony_ci pshufd m9, m6, q3333 1546cabdff1aSopenharmony_ci%endif 1547cabdff1aSopenharmony_ci%if cpuflag(avx) 1548cabdff1aSopenharmony_ci UNSCRATCH 6, 8, rsp+0*mmsize 1549cabdff1aSopenharmony_ci%endif 1550cabdff1aSopenharmony_ci DEFINE_ARGS dst, stride, cnt, stride16, stride17 1551cabdff1aSopenharmony_ci mov stride16q, strideq 1552cabdff1aSopenharmony_ci mov cntd, 8 1553cabdff1aSopenharmony_ci shl stride16q, 4 1554cabdff1aSopenharmony_ci lea stride17q, [stride16q+strideq] 1555cabdff1aSopenharmony_ci 1556cabdff1aSopenharmony_ci ; FIXME m8 is unused for avx, so we could save one register here for win64 1557cabdff1aSopenharmony_ci.loop: 1558cabdff1aSopenharmony_ci%if notcpuflag(avx) 1559cabdff1aSopenharmony_ci UNSCRATCH 6, 8, rsp+0*mmsize 1560cabdff1aSopenharmony_ci%endif 1561cabdff1aSopenharmony_ci mova [dstq+strideq*0+ 0], m6 
1562cabdff1aSopenharmony_ci mova [dstq+strideq*0+16], m4 1563cabdff1aSopenharmony_ci mova [dstq+strideq*0+32], m2 1564cabdff1aSopenharmony_ci mova [dstq+strideq*0+48], m0 1565cabdff1aSopenharmony_ci mova [dstq+strideq*1+ 0], m7 1566cabdff1aSopenharmony_ci mova [dstq+strideq*1+16], m5 1567cabdff1aSopenharmony_ci mova [dstq+strideq*1+32], m3 1568cabdff1aSopenharmony_ci mova [dstq+strideq*1+48], m1 1569cabdff1aSopenharmony_ci mova [dstq+stride16q+ 0], m4 1570cabdff1aSopenharmony_ci mova [dstq+stride16q+16], m2 1571cabdff1aSopenharmony_ci mova [dstq+stride16q+32], m0 1572cabdff1aSopenharmony_ci%if ARCH_X86_64 1573cabdff1aSopenharmony_ci mova [dstq+stride16q+48], m9 1574cabdff1aSopenharmony_ci%endif 1575cabdff1aSopenharmony_ci mova [dstq+stride17q+ 0], m5 1576cabdff1aSopenharmony_ci mova [dstq+stride17q+16], m3 1577cabdff1aSopenharmony_ci mova [dstq+stride17q+32], m1 1578cabdff1aSopenharmony_ci%if ARCH_X86_64 1579cabdff1aSopenharmony_ci mova [dstq+stride17q+48], m9 1580cabdff1aSopenharmony_ci%endif 1581cabdff1aSopenharmony_ci lea dstq, [dstq+strideq*2] 1582cabdff1aSopenharmony_ci%if cpuflag(avx) 1583cabdff1aSopenharmony_ci vpalignr m6, m4, m6, 2 1584cabdff1aSopenharmony_ci vpalignr m4, m2, m4, 2 1585cabdff1aSopenharmony_ci vpalignr m2, m0, m2, 2 1586cabdff1aSopenharmony_ci vpalignr m7, m5, m7, 2 1587cabdff1aSopenharmony_ci vpalignr m5, m3, m5, 2 1588cabdff1aSopenharmony_ci vpalignr m3, m1, m3, 2 1589cabdff1aSopenharmony_ci%else 1590cabdff1aSopenharmony_ci SCRATCH 3, 8, rsp+0*mmsize 1591cabdff1aSopenharmony_ci%if notcpuflag(ssse3) 1592cabdff1aSopenharmony_ci SCRATCH 1, 10, rsp+1*mmsize 1593cabdff1aSopenharmony_ci%endif 1594cabdff1aSopenharmony_ci PALIGNR m3, m4, m6, 2, m1 1595cabdff1aSopenharmony_ci mova m6, m3 1596cabdff1aSopenharmony_ci PALIGNR m3, m2, m4, 2, m1 1597cabdff1aSopenharmony_ci mova m4, m3 1598cabdff1aSopenharmony_ci PALIGNR m3, m0, m2, 2, m1 1599cabdff1aSopenharmony_ci mova m2, m3 1600cabdff1aSopenharmony_ci PALIGNR m3, m5, m7, 2, m1 
1601cabdff1aSopenharmony_ci mova m7, m3 1602cabdff1aSopenharmony_ci UNSCRATCH 3, 8, rsp+0*mmsize 1603cabdff1aSopenharmony_ci SCRATCH 6, 8, rsp+0*mmsize 1604cabdff1aSopenharmony_ci%if notcpuflag(ssse3) 1605cabdff1aSopenharmony_ci UNSCRATCH 1, 10, rsp+1*mmsize 1606cabdff1aSopenharmony_ci SCRATCH 7, 10, rsp+1*mmsize 1607cabdff1aSopenharmony_ci%endif 1608cabdff1aSopenharmony_ci PALIGNR m6, m3, m5, 2, m7 1609cabdff1aSopenharmony_ci mova m5, m6 1610cabdff1aSopenharmony_ci PALIGNR m6, m1, m3, 2, m7 1611cabdff1aSopenharmony_ci mova m3, m6 1612cabdff1aSopenharmony_ci%if notcpuflag(ssse3) 1613cabdff1aSopenharmony_ci UNSCRATCH 7, 10, rsp+1*mmsize 1614cabdff1aSopenharmony_ci%endif 1615cabdff1aSopenharmony_ci%endif 1616cabdff1aSopenharmony_ci SHIFT_RIGHT m1, m1, reg_shuf 1617cabdff1aSopenharmony_ci SHIFT_RIGHT m0, m0, reg_shuf 1618cabdff1aSopenharmony_ci dec cntd 1619cabdff1aSopenharmony_ci jg .loop 1620cabdff1aSopenharmony_ci 1621cabdff1aSopenharmony_ci%if ARCH_X86_32 1622cabdff1aSopenharmony_ci DEFINE_ARGS dst, stride, stride3 1623cabdff1aSopenharmony_ci lea stride3q, [strideq*3] 1624cabdff1aSopenharmony_ci%assign %%n 0 1625cabdff1aSopenharmony_ci%rep 4 1626cabdff1aSopenharmony_ci mova [dstq+strideq*0+48], m0 1627cabdff1aSopenharmony_ci mova [dstq+strideq*1+48], m0 1628cabdff1aSopenharmony_ci mova [dstq+strideq*2+48], m0 1629cabdff1aSopenharmony_ci mova [dstq+stride3q +48], m0 1630cabdff1aSopenharmony_ci%if %%n < 3 1631cabdff1aSopenharmony_ci lea dstq, [dstq+strideq*4] 1632cabdff1aSopenharmony_ci%endif 1633cabdff1aSopenharmony_ci%assign %%n (%%n+1) 1634cabdff1aSopenharmony_ci%endrep 1635cabdff1aSopenharmony_ci%endif 1636cabdff1aSopenharmony_ci RET 1637cabdff1aSopenharmony_ci%endmacro 1638cabdff1aSopenharmony_ci 1639cabdff1aSopenharmony_ciINIT_XMM sse2 1640cabdff1aSopenharmony_ciVL_FUNCS 2 1641cabdff1aSopenharmony_ciINIT_XMM ssse3 1642cabdff1aSopenharmony_ciVL_FUNCS 1 1643cabdff1aSopenharmony_ciINIT_XMM avx 1644cabdff1aSopenharmony_ciVL_FUNCS 1 1645cabdff1aSopenharmony_ci 
;------------------------------------------------------------------------------
; VR_FUNCS
;
; Emits vp9_ipred_vr_{4x4,8x8,16x16,32x32}_16 for the instruction set selected
; by the preceding INIT_XMM.  All four declare four loaded arguments and read
; both edges: a from a[-1] onwards (movu [aq-2]) and the words of l.
;
; Row layout, common to all sizes: row 0 is the pavgw of the a words, row 1
; their LOWPASS.  Each later pair of rows is the pair above moved right by one
; word, with the next filtered l word entering at index 0.  That step is
; "PALIGNR row, left, 14" (the top word of the left register enters) followed
; by "pslldq left, 2" (the next word moves to the top).
; In the 16x16 and 32x32 versions the filtered l words are first split into
; odd- and even-indexed words (psrld 16 / pand pd_65535 / packssdw), one
; register feeding the even rows and the other the odd rows.
; LOWPASS, SCRATCH and UNSCRATCH are macros defined outside this part of the
; file; LOWPASS leaves its result in the register named by its first argument.
;------------------------------------------------------------------------------
%macro VR_FUNCS 0
cglobal vp9_ipred_vr_4x4_16, 4, 4, 3, dst, stride, l, a
    movu            m0, [aq-2]
    movhps          m1, [lq]
    PALIGNR         m0, m1, 10, m2              ; xyz*abcd
    pslldq          m1, m0, 2                   ; .xyz*abc
    pslldq          m2, m0, 4                   ; ..xyz*ab
    LOWPASS         2, 1, 0                     ; ..YZ#ABC
    pavgw           m1, m0                      ; ....#ABC
    DEFINE_ARGS     dst, stride, stride3
    lea             stride3q, [strideq*3]

    ; rows 0/1 are the high halves; rows 2/3 are assembled in the low halves
    movhps          [dstq+strideq*0], m1
    movhps          [dstq+strideq*1], m2
    shufps          m0, m2, m1, q3210
%if cpuflag(ssse3)
    pshufb          m2, [pb_4_5_8to13_8x0]
%else
    pshuflw         m2, m2, q2222
    psrldq          m2, 6
%endif
    psrldq          m0, 6
    movh            [dstq+strideq*2], m0
    movh            [dstq+stride3q ], m2
    RET

cglobal vp9_ipred_vr_8x8_16, 4, 4, 5, dst, stride, l, a
    movu            m1, [aq-2]                  ; *abcdefg
    movu            m2, [lq]                    ; stuvwxyz
    mova            m0, [aq]                    ; abcdefgh
    PALIGNR         m3, m1, m2, 14, m4          ; z*abcdef
    LOWPASS         3, 1, 0                     ; m3: row 1
    pavgw           m0, m1                      ; m0: row 0
    PALIGNR         m1, m2, 2, m4               ; tuvwxyz*
    pslldq          m4, m2, 2                   ; .stuvwxy
    LOWPASS         4, 2, 1                     ; m4: filtered l words
    DEFINE_ARGS     dst, stride, stride3
    lea             stride3q, [strideq*3]

    mova            [dstq+strideq*0], m0
    mova            [dstq+strideq*1], m3
    PALIGNR         m0, m4, 14, m1
    pslldq          m4, 2
    PALIGNR         m3, m4, 14, m1
    pslldq          m4, 2
    mova            [dstq+strideq*2], m0
    mova            [dstq+stride3q ], m3
    lea             dstq, [dstq+strideq*4]      ; rows 4-7
    PALIGNR         m0, m4, 14, m1
    pslldq          m4, 2
    PALIGNR         m3, m4, 14, m1
    pslldq          m4, 2
    mova            [dstq+strideq*0], m0
    mova            [dstq+strideq*1], m3
    PALIGNR         m0, m4, 14, m1
    pslldq          m4, 2
    PALIGNR         m3, m4, 14, m4              ; last use of m4: it doubles as the temporary
    mova            [dstq+strideq*2], m0
    mova            [dstq+stride3q ], m3
    RET

cglobal vp9_ipred_vr_16x16_16, 4, 4, 8, dst, stride, l, a
    movu            m1, [aq-2]                  ; *abcdefg
    movu            m2, [aq+mmsize-2]           ; hijklmno
    mova            m3, [aq]                    ; abcdefgh
    mova            m4, [aq+mmsize]             ; ijklmnop
    mova            m5, [lq+mmsize]             ; stuvwxyz
    PALIGNR         m0, m1, m5, 14, m6          ; z*abcdef
    movu            m6, [aq+mmsize-4]           ; ghijklmn
    LOWPASS         6, 2, 4                     ; m6: odd rows, right half
    pavgw           m2, m4                      ; m2: even rows, right half
    LOWPASS         0, 1, 3                     ; m0: odd rows, left half
    pavgw           m3, m1                      ; m3: even rows, left half
    PALIGNR         m1, m5, 2, m7               ; tuvwxyz*
    movu            m7, [lq+mmsize-2]           ; rstuvwxy
    LOWPASS         1, 5, 7
    movu            m5, [lq+2]                  ; lmnopqrs
    pslldq          m4, m5, 2                   ; .lmnopqr
    pslldq          m7, m5, 4                   ; ..lmnopq
    LOWPASS         5, 4, 7
    ; split the two filtered l registers: m7 = odd words, m5 = even words
    psrld           m4, m1, 16
    psrld           m7, m5, 16
    pand            m1, [pd_65535]
    pand            m5, [pd_65535]
    packssdw        m7, m4
    packssdw        m5, m1
    DEFINE_ARGS     dst, stride, cnt
    mov             cntd, 8                     ; 8 passes x 2 rows

.loop:
    mova            [dstq+strideq*0+ 0], m3
    mova            [dstq+strideq*0+16], m2
    mova            [dstq+strideq*1+ 0], m0
    mova            [dstq+strideq*1+16], m6
    lea             dstq, [dstq+strideq*2]
    PALIGNR         m2, m3, 14, m4
    PALIGNR         m3, m7, 14, m4
    pslldq          m7, 2
    PALIGNR         m6, m0, 14, m4
    PALIGNR         m0, m5, 14, m4
    pslldq          m5, 2
    dec             cntd
    jg .loop
    RET

; 32x32: each row is four mmsize stores (+0/+16/+32/+48).  On x86-64, .loop
; writes rows r, r+1, r+16 and r+17 per pass.  On x86-32, .loop writes only
; rows r and r+16; the odd rows are produced afterwards by .loop2, from the
; registers parked with SCRATCH in the six stack slots reserved in the cglobal
; line.
cglobal vp9_ipred_vr_32x32_16, 4, 5, 14, 6 * mmsize * ARCH_X86_32, dst, stride, l, a
    movu            m0, [aq+mmsize*0-2]         ; *a[0-6]
    movu            m1, [aq+mmsize*1-2]         ; a[7-14]
    movu            m2, [aq+mmsize*2-2]         ; a[15-22]
    movu            m3, [aq+mmsize*3-2]         ; a[23-30]
    mova            m4, [aq+mmsize*3+0]         ; a[24-31]
    movu            m5, [aq+mmsize*3-4]         ; a[22-29]
    LOWPASS         5, 3, 4                     ; A[23-30]
    SCRATCH         5, 8, rsp+0*mmsize
    pavgw           m3, m4
    mova            m4, [aq+mmsize*2+0]         ; a[16-23]
    movu            m6, [aq+mmsize*2-4]         ; a[14-21]
    LOWPASS         6, 2, 4                     ; A[15-22]
    SCRATCH         6, 9, rsp+1*mmsize
    pavgw           m2, m4
    mova            m4, [aq+mmsize*1+0]         ; a[8-15]
    movu            m7, [aq+mmsize*1-4]         ; a[6-13]
    LOWPASS         7, 1, 4                     ; A[7-14]
    SCRATCH         7, 10, rsp+2*mmsize
    pavgw           m1, m4
    mova            m4, [aq+mmsize*0+0]         ; a[0-7]
    mova            m5, [lq+mmsize*3+0]         ; l[24-31]
    PALIGNR         m6, m0, m5, 14, m7          ; l[31]*a[0-5]
    LOWPASS         6, 0, 4                     ; #A[0-6]
    SCRATCH         6, 11, rsp+3*mmsize
    pavgw           m4, m0
    PALIGNR         m0, m5, 2, m7               ; l[25-31]*
    movu            m7, [lq+mmsize*3-2]         ; l[23-30]
    LOWPASS         0, 5, 7                     ; L[24-31]
    movu            m5, [lq+mmsize*2-2]         ; l[15-22]
    mova            m7, [lq+mmsize*2+0]         ; l[16-23]
    movu            m6, [lq+mmsize*2+2]         ; l[17-24]
    LOWPASS         5, 7, 6                     ; L[16-23]
    ; split L[16-31]: m6 = odd words, m5 = even words
    psrld           m7, m0, 16
    psrld           m6, m5, 16
    pand            m0, [pd_65535]
    pand            m5, [pd_65535]
    packssdw        m6, m7
    packssdw        m5, m0
    SCRATCH         5, 12, rsp+4*mmsize
    SCRATCH         6, 13, rsp+5*mmsize
    movu            m6, [lq+mmsize*1-2]         ; l[7-14]
    mova            m0, [lq+mmsize*1+0]         ; l[8-15]
    movu            m5, [lq+mmsize*1+2]         ; l[9-16]
    LOWPASS         6, 0, 5                     ; L[8-15]
    movu            m0, [lq+mmsize*0+2]         ; l[1-8]
    pslldq          m5, m0, 2                   ; .l[1-7]
    pslldq          m7, m0, 4                   ; ..l[1-6]
    LOWPASS         0, 5, 7
    ; split the lower filtered l words: m7 = odd words, m0 = even words
    psrld           m5, m6, 16
    psrld           m7, m0, 16
    pand            m6, [pd_65535]
    pand            m0, [pd_65535]
    packssdw        m7, m5
    packssdw        m0, m6
    UNSCRATCH       6, 13, rsp+5*mmsize
    DEFINE_ARGS     dst, stride, stride16, cnt, stride17
    mov             stride16q, strideq
    mov             cntd, 8
    shl             stride16q, 4                ; 16 rows
%if ARCH_X86_64
    lea             stride17q, [stride16q+strideq]
%endif

.loop:
    mova            [dstq+strideq*0+ 0], m4
    mova            [dstq+strideq*0+16], m1
    mova            [dstq+strideq*0+32], m2
    mova            [dstq+strideq*0+48], m3
%if ARCH_X86_64
    mova            [dstq+strideq*1+ 0], m11
    mova            [dstq+strideq*1+16], m10
    mova            [dstq+strideq*1+32], m9
    mova            [dstq+strideq*1+48], m8
%endif
    mova            [dstq+stride16q+ 0], m6
    mova            [dstq+stride16q+16], m4
    mova            [dstq+stride16q+32], m1
    mova            [dstq+stride16q+48], m2
%if ARCH_X86_64
    mova            [dstq+stride17q+ 0], m12
    mova            [dstq+stride17q+16], m11
    mova            [dstq+stride17q+32], m10
    mova            [dstq+stride17q+48], m9
%endif
    lea             dstq, [dstq+strideq*2]
    ; even-row chain: m3 <- m2 <- m1 <- m4 <- m6 <- m7
    PALIGNR         m3, m2, 14, m5
    PALIGNR         m2, m1, 14, m5
    PALIGNR         m1, m4, 14, m5
    PALIGNR         m4, m6, 14, m5
    PALIGNR         m6, m7, 14, m5
    pslldq          m7, 2
%if ARCH_X86_64
    ; odd-row chain: m8 <- m9 <- m10 <- m11 <- m12 <- m0
    PALIGNR         m8, m9, 14, m5
    PALIGNR         m9, m10, 14, m5
    PALIGNR         m10, m11, 14, m5
    PALIGNR         m11, m12, 14, m5
    PALIGNR         m12, m0, 14, m5
    pslldq          m0, 2
%endif
    dec             cntd
    jg .loop

%if ARCH_X86_32
    ; Second pass for the odd rows: reload the parked registers (the values
    ; x86-64 keeps in m12..m8), restart from the dst argument (dstm) plus one
    ; row, and run the same chain with m0 as the word source.
    UNSCRATCH       5, 12, rsp+4*mmsize
    UNSCRATCH       4, 11, rsp+3*mmsize
    UNSCRATCH       3, 10, rsp+2*mmsize
    UNSCRATCH       2, 9, rsp+1*mmsize
    UNSCRATCH       1, 8, rsp+0*mmsize
    mov             dstq, dstm
    mov             cntd, 8
    add             dstq, strideq
.loop2:
    mova            [dstq+strideq*0+ 0], m4
    mova            [dstq+strideq*0+16], m3
    mova            [dstq+strideq*0+32], m2
    mova            [dstq+strideq*0+48], m1
    mova            [dstq+stride16q+ 0], m5
    mova            [dstq+stride16q+16], m4
    mova            [dstq+stride16q+32], m3
    mova            [dstq+stride16q+48], m2
    lea             dstq, [dstq+strideq*2]
    PALIGNR         m1, m2, 14, m6
    PALIGNR         m2, m3, 14, m6
    PALIGNR         m3, m4, 14, m6
    PALIGNR         m4, m5, 14, m6
    PALIGNR         m5, m0, 14, m6
    pslldq          m0, 2
    dec             cntd
    jg .loop2
%endif
    RET
%endmacro

INIT_XMM sse2
VR_FUNCS
INIT_XMM ssse3
VR_FUNCS
INIT_XMM avx
VR_FUNCS

%macro HU_FUNCS 1 ; stack_mem_for_32x32_32bit_function
cglobal vp9_ipred_hu_4x4_16, 3, 3, 3, dst, stride, l, a
    movh            m0, [lq]                    ; abcd
%if cpuflag(ssse3)
    pshufb          m0, [pb_0to7_67x4]          ; abcddddd
%else
    punpcklqdq      m0, m0
    pshufhw         m0, m0, q3333               ; abcddddd
%endif
    psrldq          m1, m0, 2                   ; bcddddd.
    psrldq          m2, m0, 4                   ; cddddd..
    LOWPASS         2, 1, 0                     ; BCDddd..
1904cabdff1aSopenharmony_ci pavgw m1, m0 ; abcddddd 1905cabdff1aSopenharmony_ci SBUTTERFLY wd, 1, 2, 0 ; aBbCcDdd, dddddddd 1906cabdff1aSopenharmony_ci PALIGNR m2, m1, 4, m0 ; bCcDdddd 1907cabdff1aSopenharmony_ci DEFINE_ARGS dst, stride, stride3 1908cabdff1aSopenharmony_ci lea stride3q, [strideq*3] 1909cabdff1aSopenharmony_ci 1910cabdff1aSopenharmony_ci movh [dstq+strideq*0], m1 ; aBbC 1911cabdff1aSopenharmony_ci movh [dstq+strideq*1], m2 ; bCcD 1912cabdff1aSopenharmony_ci movhps [dstq+strideq*2], m1 ; cDdd 1913cabdff1aSopenharmony_ci movhps [dstq+stride3q ], m2 ; dddd 1914cabdff1aSopenharmony_ci RET 1915cabdff1aSopenharmony_ci 1916cabdff1aSopenharmony_cicglobal vp9_ipred_hu_8x8_16, 3, 3, 4, dst, stride, l, a 1917cabdff1aSopenharmony_ci mova m0, [lq] 1918cabdff1aSopenharmony_ci%if cpuflag(ssse3) 1919cabdff1aSopenharmony_ci mova m3, [pb_2to15_14_15] 1920cabdff1aSopenharmony_ci%endif 1921cabdff1aSopenharmony_ci SHIFT_RIGHTx2 m1, m2, m0, m3 1922cabdff1aSopenharmony_ci LOWPASS 2, 1, 0 1923cabdff1aSopenharmony_ci pavgw m1, m0 1924cabdff1aSopenharmony_ci SBUTTERFLY wd, 1, 2, 0 1925cabdff1aSopenharmony_ci shufps m0, m1, m2, q1032 1926cabdff1aSopenharmony_ci pshufd m3, m2, q3332 1927cabdff1aSopenharmony_ci DEFINE_ARGS dst, stride, stride3 1928cabdff1aSopenharmony_ci lea stride3q, [strideq*3] 1929cabdff1aSopenharmony_ci 1930cabdff1aSopenharmony_ci mova [dstq+strideq *0], m1 1931cabdff1aSopenharmony_ci mova [dstq+strideq *2], m0 1932cabdff1aSopenharmony_ci mova [dstq+strideq *4], m2 1933cabdff1aSopenharmony_ci mova [dstq+stride3q*2], m3 1934cabdff1aSopenharmony_ci add dstq, strideq 1935cabdff1aSopenharmony_ci%if cpuflag(avx) 1936cabdff1aSopenharmony_ci vpalignr m1, m2, m1, 4 1937cabdff1aSopenharmony_ci%else 1938cabdff1aSopenharmony_ci PALIGNR m0, m2, m1, 4, m3 1939cabdff1aSopenharmony_ci mova m1, m0 1940cabdff1aSopenharmony_ci%endif 1941cabdff1aSopenharmony_ci pshufd m2, m2, q3321 1942cabdff1aSopenharmony_ci shufps m0, m1, m2, q1032 1943cabdff1aSopenharmony_ci pshufd m3, m2, 
q3332 1944cabdff1aSopenharmony_ci mova [dstq+strideq *0], m1 1945cabdff1aSopenharmony_ci mova [dstq+strideq *2], m0 1946cabdff1aSopenharmony_ci mova [dstq+strideq *4], m2 1947cabdff1aSopenharmony_ci mova [dstq+stride3q*2], m3 1948cabdff1aSopenharmony_ci RET 1949cabdff1aSopenharmony_ci 1950cabdff1aSopenharmony_cicglobal vp9_ipred_hu_16x16_16, 3, 4, 6 + notcpuflag(ssse3), dst, stride, l, a 1951cabdff1aSopenharmony_ci mova m0, [lq] 1952cabdff1aSopenharmony_ci mova m3, [lq+mmsize] 1953cabdff1aSopenharmony_ci movu m1, [lq+2] 1954cabdff1aSopenharmony_ci movu m2, [lq+4] 1955cabdff1aSopenharmony_ci LOWPASS 2, 1, 0 1956cabdff1aSopenharmony_ci pavgw m1, m0 1957cabdff1aSopenharmony_ci SBUTTERFLY wd, 1, 2, 0 1958cabdff1aSopenharmony_ci%if cpuflag(ssse3) 1959cabdff1aSopenharmony_ci mova m5, [pb_2to15_14_15] 1960cabdff1aSopenharmony_ci%endif 1961cabdff1aSopenharmony_ci SHIFT_RIGHTx2 m0, m4, m3, m5 1962cabdff1aSopenharmony_ci LOWPASS 4, 0, 3 1963cabdff1aSopenharmony_ci pavgw m3, m0 1964cabdff1aSopenharmony_ci SBUTTERFLY wd, 3, 4, 5 1965cabdff1aSopenharmony_ci pshufd m0, m0, q3333 1966cabdff1aSopenharmony_ci DEFINE_ARGS dst, stride, stride3, cnt 1967cabdff1aSopenharmony_ci lea stride3q, [strideq*3] 1968cabdff1aSopenharmony_ci mov cntd, 4 1969cabdff1aSopenharmony_ci 1970cabdff1aSopenharmony_ci.loop: 1971cabdff1aSopenharmony_ci mova [dstq+strideq *0+ 0], m1 1972cabdff1aSopenharmony_ci mova [dstq+strideq *0+16], m2 1973cabdff1aSopenharmony_ci mova [dstq+strideq *4+ 0], m2 1974cabdff1aSopenharmony_ci mova [dstq+strideq *4+16], m3 1975cabdff1aSopenharmony_ci mova [dstq+strideq *8+ 0], m3 1976cabdff1aSopenharmony_ci mova [dstq+strideq *8+16], m4 1977cabdff1aSopenharmony_ci mova [dstq+stride3q*4+ 0], m4 1978cabdff1aSopenharmony_ci mova [dstq+stride3q*4+16], m0 1979cabdff1aSopenharmony_ci add dstq, strideq 1980cabdff1aSopenharmony_ci%if cpuflag(avx) 1981cabdff1aSopenharmony_ci vpalignr m1, m2, m1, 4 1982cabdff1aSopenharmony_ci vpalignr m2, m3, m2, 4 1983cabdff1aSopenharmony_ci vpalignr 
m3, m4, m3, 4 1984cabdff1aSopenharmony_ci vpalignr m4, m0, m4, 4 1985cabdff1aSopenharmony_ci%else 1986cabdff1aSopenharmony_ci PALIGNR m5, m2, m1, 4, m6 1987cabdff1aSopenharmony_ci mova m1, m5 1988cabdff1aSopenharmony_ci PALIGNR m5, m3, m2, 4, m6 1989cabdff1aSopenharmony_ci mova m2, m5 1990cabdff1aSopenharmony_ci PALIGNR m5, m4, m3, 4, m6 1991cabdff1aSopenharmony_ci mova m3, m5 1992cabdff1aSopenharmony_ci PALIGNR m5, m0, m4, 4, m6 1993cabdff1aSopenharmony_ci mova m4, m5 1994cabdff1aSopenharmony_ci%endif 1995cabdff1aSopenharmony_ci dec cntd 1996cabdff1aSopenharmony_ci jg .loop 1997cabdff1aSopenharmony_ci RET 1998cabdff1aSopenharmony_ci 1999cabdff1aSopenharmony_cicglobal vp9_ipred_hu_32x32_16, 3, 7, 10 + notcpuflag(ssse3), \ 2000cabdff1aSopenharmony_ci %1 * -mmsize * ARCH_X86_32, dst, stride, l, a 2001cabdff1aSopenharmony_ci mova m2, [lq+mmsize*0+0] 2002cabdff1aSopenharmony_ci movu m1, [lq+mmsize*0+2] 2003cabdff1aSopenharmony_ci movu m0, [lq+mmsize*0+4] 2004cabdff1aSopenharmony_ci LOWPASS 0, 1, 2 2005cabdff1aSopenharmony_ci pavgw m1, m2 2006cabdff1aSopenharmony_ci SBUTTERFLY wd, 1, 0, 2 2007cabdff1aSopenharmony_ci SCRATCH 1, 8, rsp+0*mmsize 2008cabdff1aSopenharmony_ci mova m4, [lq+mmsize*1+0] 2009cabdff1aSopenharmony_ci movu m3, [lq+mmsize*1+2] 2010cabdff1aSopenharmony_ci movu m2, [lq+mmsize*1+4] 2011cabdff1aSopenharmony_ci LOWPASS 2, 3, 4 2012cabdff1aSopenharmony_ci pavgw m3, m4 2013cabdff1aSopenharmony_ci SBUTTERFLY wd, 3, 2, 4 2014cabdff1aSopenharmony_ci mova m6, [lq+mmsize*2+0] 2015cabdff1aSopenharmony_ci movu m5, [lq+mmsize*2+2] 2016cabdff1aSopenharmony_ci movu m4, [lq+mmsize*2+4] 2017cabdff1aSopenharmony_ci LOWPASS 4, 5, 6 2018cabdff1aSopenharmony_ci pavgw m5, m6 2019cabdff1aSopenharmony_ci SBUTTERFLY wd, 5, 4, 6 2020cabdff1aSopenharmony_ci mova m7, [lq+mmsize*3+0] 2021cabdff1aSopenharmony_ci SCRATCH 0, 9, rsp+1*mmsize 2022cabdff1aSopenharmony_ci%if cpuflag(ssse3) 2023cabdff1aSopenharmony_ci mova m0, [pb_2to15_14_15] 2024cabdff1aSopenharmony_ci%endif 
2025cabdff1aSopenharmony_ci SHIFT_RIGHTx2 m1, m6, m7, m0 2026cabdff1aSopenharmony_ci LOWPASS 6, 1, 7 2027cabdff1aSopenharmony_ci pavgw m7, m1 2028cabdff1aSopenharmony_ci SBUTTERFLY wd, 7, 6, 0 2029cabdff1aSopenharmony_ci pshufd m1, m1, q3333 2030cabdff1aSopenharmony_ci UNSCRATCH 0, 9, rsp+1*mmsize 2031cabdff1aSopenharmony_ci DEFINE_ARGS dst, stride, cnt, stride3, stride4, stride20, stride28 2032cabdff1aSopenharmony_ci lea stride3q, [strideq*3] 2033cabdff1aSopenharmony_ci lea stride4q, [strideq*4] 2034cabdff1aSopenharmony_ci lea stride28q, [stride4q*8] 2035cabdff1aSopenharmony_ci lea stride20q, [stride4q*5] 2036cabdff1aSopenharmony_ci sub stride28q, stride4q 2037cabdff1aSopenharmony_ci mov cntd, 4 2038cabdff1aSopenharmony_ci 2039cabdff1aSopenharmony_ci.loop: 2040cabdff1aSopenharmony_ci%if ARCH_X86_64 2041cabdff1aSopenharmony_ci SWAP 1, 8 2042cabdff1aSopenharmony_ci%else 2043cabdff1aSopenharmony_ci mova [rsp+1*mmsize], m1 2044cabdff1aSopenharmony_ci mova m1, [rsp+0*mmsize] 2045cabdff1aSopenharmony_ci%endif 2046cabdff1aSopenharmony_ci mova [dstq+strideq *0+ 0], m1 2047cabdff1aSopenharmony_ci mova [dstq+strideq *0+16], m0 2048cabdff1aSopenharmony_ci mova [dstq+strideq *0+32], m3 2049cabdff1aSopenharmony_ci mova [dstq+strideq *0+48], m2 2050cabdff1aSopenharmony_ci mova [dstq+stride4q*1+ 0], m0 2051cabdff1aSopenharmony_ci mova [dstq+stride4q*1+16], m3 2052cabdff1aSopenharmony_ci mova [dstq+stride4q*1+32], m2 2053cabdff1aSopenharmony_ci mova [dstq+stride4q*1+48], m5 2054cabdff1aSopenharmony_ci mova [dstq+stride4q*2+ 0], m3 2055cabdff1aSopenharmony_ci mova [dstq+stride4q*2+16], m2 2056cabdff1aSopenharmony_ci mova [dstq+stride4q*2+32], m5 2057cabdff1aSopenharmony_ci mova [dstq+stride4q*2+48], m4 2058cabdff1aSopenharmony_ci%if cpuflag(avx) 2059cabdff1aSopenharmony_ci vpalignr m1, m0, m1, 4 2060cabdff1aSopenharmony_ci vpalignr m0, m3, m0, 4 2061cabdff1aSopenharmony_ci vpalignr m3, m2, m3, 4 2062cabdff1aSopenharmony_ci%else 2063cabdff1aSopenharmony_ci SCRATCH 6, 9, 
rsp+2*mmsize 2064cabdff1aSopenharmony_ci%if notcpuflag(ssse3) 2065cabdff1aSopenharmony_ci SCRATCH 7, 10, rsp+3*mmsize 2066cabdff1aSopenharmony_ci%endif 2067cabdff1aSopenharmony_ci PALIGNR m6, m0, m1, 4, m7 2068cabdff1aSopenharmony_ci mova m1, m6 2069cabdff1aSopenharmony_ci PALIGNR m6, m3, m0, 4, m7 2070cabdff1aSopenharmony_ci mova m0, m6 2071cabdff1aSopenharmony_ci PALIGNR m6, m2, m3, 4, m7 2072cabdff1aSopenharmony_ci mova m3, m6 2073cabdff1aSopenharmony_ci UNSCRATCH 6, 9, rsp+2*mmsize 2074cabdff1aSopenharmony_ci SCRATCH 0, 9, rsp+2*mmsize 2075cabdff1aSopenharmony_ci%if notcpuflag(ssse3) 2076cabdff1aSopenharmony_ci UNSCRATCH 7, 10, rsp+3*mmsize 2077cabdff1aSopenharmony_ci SCRATCH 3, 10, rsp+3*mmsize 2078cabdff1aSopenharmony_ci%endif 2079cabdff1aSopenharmony_ci%endif 2080cabdff1aSopenharmony_ci%if ARCH_X86_64 2081cabdff1aSopenharmony_ci SWAP 1, 8 2082cabdff1aSopenharmony_ci%else 2083cabdff1aSopenharmony_ci mova [rsp+0*mmsize], m1 2084cabdff1aSopenharmony_ci mova m1, [rsp+1*mmsize] 2085cabdff1aSopenharmony_ci%endif 2086cabdff1aSopenharmony_ci mova [dstq+stride3q*4+ 0], m2 2087cabdff1aSopenharmony_ci mova [dstq+stride3q*4+16], m5 2088cabdff1aSopenharmony_ci mova [dstq+stride3q*4+32], m4 2089cabdff1aSopenharmony_ci mova [dstq+stride3q*4+48], m7 2090cabdff1aSopenharmony_ci mova [dstq+stride4q*4+ 0], m5 2091cabdff1aSopenharmony_ci mova [dstq+stride4q*4+16], m4 2092cabdff1aSopenharmony_ci mova [dstq+stride4q*4+32], m7 2093cabdff1aSopenharmony_ci mova [dstq+stride4q*4+48], m6 2094cabdff1aSopenharmony_ci mova [dstq+stride20q + 0], m4 2095cabdff1aSopenharmony_ci mova [dstq+stride20q +16], m7 2096cabdff1aSopenharmony_ci mova [dstq+stride20q +32], m6 2097cabdff1aSopenharmony_ci mova [dstq+stride20q +48], m1 2098cabdff1aSopenharmony_ci mova [dstq+stride3q*8+ 0], m7 2099cabdff1aSopenharmony_ci mova [dstq+stride3q*8+16], m6 2100cabdff1aSopenharmony_ci mova [dstq+stride3q*8+32], m1 2101cabdff1aSopenharmony_ci mova [dstq+stride3q*8+48], m1 2102cabdff1aSopenharmony_ci mova 
[dstq+stride28q + 0], m6 2103cabdff1aSopenharmony_ci mova [dstq+stride28q +16], m1 2104cabdff1aSopenharmony_ci mova [dstq+stride28q +32], m1 2105cabdff1aSopenharmony_ci mova [dstq+stride28q +48], m1 2106cabdff1aSopenharmony_ci%if cpuflag(avx) 2107cabdff1aSopenharmony_ci vpalignr m2, m5, m2, 4 2108cabdff1aSopenharmony_ci vpalignr m5, m4, m5, 4 2109cabdff1aSopenharmony_ci vpalignr m4, m7, m4, 4 2110cabdff1aSopenharmony_ci vpalignr m7, m6, m7, 4 2111cabdff1aSopenharmony_ci vpalignr m6, m1, m6, 4 2112cabdff1aSopenharmony_ci%else 2113cabdff1aSopenharmony_ci PALIGNR m0, m5, m2, 4, m3 2114cabdff1aSopenharmony_ci mova m2, m0 2115cabdff1aSopenharmony_ci PALIGNR m0, m4, m5, 4, m3 2116cabdff1aSopenharmony_ci mova m5, m0 2117cabdff1aSopenharmony_ci PALIGNR m0, m7, m4, 4, m3 2118cabdff1aSopenharmony_ci mova m4, m0 2119cabdff1aSopenharmony_ci PALIGNR m0, m6, m7, 4, m3 2120cabdff1aSopenharmony_ci mova m7, m0 2121cabdff1aSopenharmony_ci PALIGNR m0, m1, m6, 4, m3 2122cabdff1aSopenharmony_ci mova m6, m0 2123cabdff1aSopenharmony_ci UNSCRATCH 0, 9, rsp+2*mmsize 2124cabdff1aSopenharmony_ci%if notcpuflag(ssse3) 2125cabdff1aSopenharmony_ci UNSCRATCH 3, 10, rsp+3*mmsize 2126cabdff1aSopenharmony_ci%endif 2127cabdff1aSopenharmony_ci%endif 2128cabdff1aSopenharmony_ci add dstq, strideq 2129cabdff1aSopenharmony_ci dec cntd 2130cabdff1aSopenharmony_ci jg .loop 2131cabdff1aSopenharmony_ci RET 2132cabdff1aSopenharmony_ci%endmacro 2133cabdff1aSopenharmony_ci 2134cabdff1aSopenharmony_ciINIT_XMM sse2 2135cabdff1aSopenharmony_ciHU_FUNCS 4 2136cabdff1aSopenharmony_ciINIT_XMM ssse3 2137cabdff1aSopenharmony_ciHU_FUNCS 3 2138cabdff1aSopenharmony_ciINIT_XMM avx 2139cabdff1aSopenharmony_ciHU_FUNCS 2 2140cabdff1aSopenharmony_ci 2141cabdff1aSopenharmony_ci%macro HD_FUNCS 0 2142cabdff1aSopenharmony_cicglobal vp9_ipred_hd_4x4_16, 4, 4, 4, dst, stride, l, a 2143cabdff1aSopenharmony_ci movh m0, [lq] 2144cabdff1aSopenharmony_ci movhps m0, [aq-2] 2145cabdff1aSopenharmony_ci psrldq m1, m0, 2 
2146cabdff1aSopenharmony_ci psrldq m2, m0, 4 2147cabdff1aSopenharmony_ci LOWPASS 2, 1, 0 2148cabdff1aSopenharmony_ci pavgw m1, m0 2149cabdff1aSopenharmony_ci punpcklwd m1, m2 2150cabdff1aSopenharmony_ci DEFINE_ARGS dst, stride, stride3 2151cabdff1aSopenharmony_ci lea stride3q, [strideq*3] 2152cabdff1aSopenharmony_ci 2153cabdff1aSopenharmony_ci movh [dstq+stride3q ], m1 2154cabdff1aSopenharmony_ci movhps [dstq+strideq*1], m1 2155cabdff1aSopenharmony_ci movhlps m2, m2 2156cabdff1aSopenharmony_ci PALIGNR m2, m1, 4, m0 2157cabdff1aSopenharmony_ci movh [dstq+strideq*2], m2 2158cabdff1aSopenharmony_ci movhps [dstq+strideq*0], m2 2159cabdff1aSopenharmony_ci RET 2160cabdff1aSopenharmony_ci 2161cabdff1aSopenharmony_cicglobal vp9_ipred_hd_8x8_16, 4, 4, 5, dst, stride, l, a 2162cabdff1aSopenharmony_ci mova m0, [lq] 2163cabdff1aSopenharmony_ci movu m1, [aq-2] 2164cabdff1aSopenharmony_ci PALIGNR m2, m1, m0, 2, m3 2165cabdff1aSopenharmony_ci PALIGNR m3, m1, m0, 4, m4 2166cabdff1aSopenharmony_ci LOWPASS 3, 2, 0 2167cabdff1aSopenharmony_ci pavgw m2, m0 2168cabdff1aSopenharmony_ci SBUTTERFLY wd, 2, 3, 0 2169cabdff1aSopenharmony_ci psrldq m0, m1, 2 2170cabdff1aSopenharmony_ci psrldq m4, m1, 4 2171cabdff1aSopenharmony_ci LOWPASS 1, 0, 4 2172cabdff1aSopenharmony_ci DEFINE_ARGS dst8, mstride, cnt 2173cabdff1aSopenharmony_ci lea dst8q, [dst8q+mstrideq*8] 2174cabdff1aSopenharmony_ci neg mstrideq 2175cabdff1aSopenharmony_ci mov cntd, 4 2176cabdff1aSopenharmony_ci 2177cabdff1aSopenharmony_ci.loop: 2178cabdff1aSopenharmony_ci add dst8q, mstrideq 2179cabdff1aSopenharmony_ci mova [dst8q+mstrideq*0], m2 2180cabdff1aSopenharmony_ci mova [dst8q+mstrideq*4], m3 2181cabdff1aSopenharmony_ci%if cpuflag(avx) 2182cabdff1aSopenharmony_ci vpalignr m2, m3, m2, 4 2183cabdff1aSopenharmony_ci vpalignr m3, m1, m3, 4 2184cabdff1aSopenharmony_ci%else 2185cabdff1aSopenharmony_ci PALIGNR m0, m3, m2, 4, m4 2186cabdff1aSopenharmony_ci mova m2, m0 2187cabdff1aSopenharmony_ci PALIGNR m0, m1, m3, 4, m4 
2188cabdff1aSopenharmony_ci mova m3, m0 2189cabdff1aSopenharmony_ci%endif 2190cabdff1aSopenharmony_ci psrldq m1, 4 2191cabdff1aSopenharmony_ci dec cntd 2192cabdff1aSopenharmony_ci jg .loop 2193cabdff1aSopenharmony_ci RET 2194cabdff1aSopenharmony_ci 2195cabdff1aSopenharmony_cicglobal vp9_ipred_hd_16x16_16, 4, 4, 8, dst, stride, l, a 2196cabdff1aSopenharmony_ci mova m2, [lq] 2197cabdff1aSopenharmony_ci movu m1, [lq+2] 2198cabdff1aSopenharmony_ci movu m0, [lq+4] 2199cabdff1aSopenharmony_ci LOWPASS 0, 1, 2 2200cabdff1aSopenharmony_ci pavgw m1, m2 2201cabdff1aSopenharmony_ci mova m4, [lq+mmsize] 2202cabdff1aSopenharmony_ci movu m5, [aq-2] 2203cabdff1aSopenharmony_ci PALIGNR m3, m5, m4, 2, m6 2204cabdff1aSopenharmony_ci PALIGNR m2, m5, m4, 4, m6 2205cabdff1aSopenharmony_ci LOWPASS 2, 3, 4 2206cabdff1aSopenharmony_ci pavgw m3, m4 2207cabdff1aSopenharmony_ci SBUTTERFLY wd, 1, 0, 4 2208cabdff1aSopenharmony_ci SBUTTERFLY wd, 3, 2, 4 2209cabdff1aSopenharmony_ci mova m6, [aq] 2210cabdff1aSopenharmony_ci movu m4, [aq+2] 2211cabdff1aSopenharmony_ci LOWPASS 4, 6, 5 2212cabdff1aSopenharmony_ci movu m5, [aq+mmsize-2] 2213cabdff1aSopenharmony_ci psrldq m6, m5, 2 2214cabdff1aSopenharmony_ci psrldq m7, m5, 4 2215cabdff1aSopenharmony_ci LOWPASS 5, 6, 7 2216cabdff1aSopenharmony_ci DEFINE_ARGS dst, mstride, mstride3, cnt 2217cabdff1aSopenharmony_ci lea dstq, [dstq+mstrideq*8] 2218cabdff1aSopenharmony_ci lea dstq, [dstq+mstrideq*8] 2219cabdff1aSopenharmony_ci neg mstrideq 2220cabdff1aSopenharmony_ci lea mstride3q, [mstrideq*3] 2221cabdff1aSopenharmony_ci mov cntd, 4 2222cabdff1aSopenharmony_ci 2223cabdff1aSopenharmony_ci.loop: 2224cabdff1aSopenharmony_ci add dstq, mstrideq 2225cabdff1aSopenharmony_ci mova [dstq+mstride3q*4+ 0], m2 2226cabdff1aSopenharmony_ci mova [dstq+mstride3q*4+16], m4 2227cabdff1aSopenharmony_ci mova [dstq+mstrideq *8+ 0], m3 2228cabdff1aSopenharmony_ci mova [dstq+mstrideq *8+16], m2 2229cabdff1aSopenharmony_ci mova [dstq+mstrideq *4+ 0], m0 
2230cabdff1aSopenharmony_ci mova [dstq+mstrideq *4+16], m3 2231cabdff1aSopenharmony_ci mova [dstq+mstrideq *0+ 0], m1 2232cabdff1aSopenharmony_ci mova [dstq+mstrideq *0+16], m0 2233cabdff1aSopenharmony_ci%if cpuflag(avx) 2234cabdff1aSopenharmony_ci vpalignr m1, m0, m1, 4 2235cabdff1aSopenharmony_ci vpalignr m0, m3, m0, 4 2236cabdff1aSopenharmony_ci vpalignr m3, m2, m3, 4 2237cabdff1aSopenharmony_ci vpalignr m2, m4, m2, 4 2238cabdff1aSopenharmony_ci vpalignr m4, m5, m4, 4 2239cabdff1aSopenharmony_ci%else 2240cabdff1aSopenharmony_ci PALIGNR m6, m0, m1, 4, m7 2241cabdff1aSopenharmony_ci mova m1, m6 2242cabdff1aSopenharmony_ci PALIGNR m6, m3, m0, 4, m7 2243cabdff1aSopenharmony_ci mova m0, m6 2244cabdff1aSopenharmony_ci PALIGNR m6, m2, m3, 4, m7 2245cabdff1aSopenharmony_ci mova m3, m6 2246cabdff1aSopenharmony_ci PALIGNR m6, m4, m2, 4, m7 2247cabdff1aSopenharmony_ci mova m2, m6 2248cabdff1aSopenharmony_ci PALIGNR m6, m5, m4, 4, m7 2249cabdff1aSopenharmony_ci mova m4, m6 2250cabdff1aSopenharmony_ci%endif 2251cabdff1aSopenharmony_ci psrldq m5, 4 2252cabdff1aSopenharmony_ci dec cntd 2253cabdff1aSopenharmony_ci jg .loop 2254cabdff1aSopenharmony_ci RET 2255cabdff1aSopenharmony_ci 2256cabdff1aSopenharmony_cicglobal vp9_ipred_hd_32x32_16, 4, 4 + 3 * ARCH_X86_64, 14, \ 2257cabdff1aSopenharmony_ci 10 * -mmsize * ARCH_X86_32, dst, stride, l, a 2258cabdff1aSopenharmony_ci mova m2, [lq+mmsize*0+0] 2259cabdff1aSopenharmony_ci movu m1, [lq+mmsize*0+2] 2260cabdff1aSopenharmony_ci movu m0, [lq+mmsize*0+4] 2261cabdff1aSopenharmony_ci LOWPASS 0, 1, 2 2262cabdff1aSopenharmony_ci pavgw m1, m2 2263cabdff1aSopenharmony_ci SBUTTERFLY wd, 1, 0, 2 2264cabdff1aSopenharmony_ci mova m4, [lq+mmsize*1+0] 2265cabdff1aSopenharmony_ci movu m3, [lq+mmsize*1+2] 2266cabdff1aSopenharmony_ci movu m2, [lq+mmsize*1+4] 2267cabdff1aSopenharmony_ci LOWPASS 2, 3, 4 2268cabdff1aSopenharmony_ci pavgw m3, m4 2269cabdff1aSopenharmony_ci SBUTTERFLY wd, 3, 2, 4 2270cabdff1aSopenharmony_ci SCRATCH 0, 8, rsp+0*mmsize 
2271cabdff1aSopenharmony_ci SCRATCH 1, 9, rsp+1*mmsize 2272cabdff1aSopenharmony_ci SCRATCH 2, 10, rsp+2*mmsize 2273cabdff1aSopenharmony_ci SCRATCH 3, 11, rsp+3*mmsize 2274cabdff1aSopenharmony_ci mova m6, [lq+mmsize*2+0] 2275cabdff1aSopenharmony_ci movu m5, [lq+mmsize*2+2] 2276cabdff1aSopenharmony_ci movu m4, [lq+mmsize*2+4] 2277cabdff1aSopenharmony_ci LOWPASS 4, 5, 6 2278cabdff1aSopenharmony_ci pavgw m5, m6 2279cabdff1aSopenharmony_ci SBUTTERFLY wd, 5, 4, 6 2280cabdff1aSopenharmony_ci mova m0, [lq+mmsize*3+0] 2281cabdff1aSopenharmony_ci movu m1, [aq+mmsize*0-2] 2282cabdff1aSopenharmony_ci PALIGNR m7, m1, m0, 2, m2 2283cabdff1aSopenharmony_ci PALIGNR m6, m1, m0, 4, m2 2284cabdff1aSopenharmony_ci LOWPASS 6, 7, 0 2285cabdff1aSopenharmony_ci pavgw m7, m0 2286cabdff1aSopenharmony_ci SBUTTERFLY wd, 7, 6, 0 2287cabdff1aSopenharmony_ci mova m2, [aq+mmsize*0+0] 2288cabdff1aSopenharmony_ci movu m0, [aq+mmsize*0+2] 2289cabdff1aSopenharmony_ci LOWPASS 0, 2, 1 2290cabdff1aSopenharmony_ci movu m1, [aq+mmsize*1-2] 2291cabdff1aSopenharmony_ci mova m2, [aq+mmsize*1+0] 2292cabdff1aSopenharmony_ci movu m3, [aq+mmsize*1+2] 2293cabdff1aSopenharmony_ci LOWPASS 1, 2, 3 2294cabdff1aSopenharmony_ci SCRATCH 6, 12, rsp+6*mmsize 2295cabdff1aSopenharmony_ci SCRATCH 7, 13, rsp+7*mmsize 2296cabdff1aSopenharmony_ci movu m2, [aq+mmsize*2-2] 2297cabdff1aSopenharmony_ci mova m3, [aq+mmsize*2+0] 2298cabdff1aSopenharmony_ci movu m6, [aq+mmsize*2+2] 2299cabdff1aSopenharmony_ci LOWPASS 2, 3, 6 2300cabdff1aSopenharmony_ci movu m3, [aq+mmsize*3-2] 2301cabdff1aSopenharmony_ci psrldq m6, m3, 2 2302cabdff1aSopenharmony_ci psrldq m7, m3, 4 2303cabdff1aSopenharmony_ci LOWPASS 3, 6, 7 2304cabdff1aSopenharmony_ci UNSCRATCH 6, 12, rsp+6*mmsize 2305cabdff1aSopenharmony_ci UNSCRATCH 7, 13, rsp+7*mmsize 2306cabdff1aSopenharmony_ci%if ARCH_X86_32 2307cabdff1aSopenharmony_ci mova [rsp+4*mmsize], m4 2308cabdff1aSopenharmony_ci mova [rsp+5*mmsize], m5 2309cabdff1aSopenharmony_ci ; we already backed up m6/m7 earlier on 
x86-32 in SCRATCH, so we don't need 2310cabdff1aSopenharmony_ci ; to do it again here 2311cabdff1aSopenharmony_ci%endif 2312cabdff1aSopenharmony_ci DEFINE_ARGS dst, stride, cnt, stride3, stride4, stride20, stride28 2313cabdff1aSopenharmony_ci mov cntd, 4 2314cabdff1aSopenharmony_ci lea stride3q, [strideq*3] 2315cabdff1aSopenharmony_ci%if ARCH_X86_64 2316cabdff1aSopenharmony_ci lea stride4q, [strideq*4] 2317cabdff1aSopenharmony_ci lea stride28q, [stride4q*8] 2318cabdff1aSopenharmony_ci lea stride20q, [stride4q*5] 2319cabdff1aSopenharmony_ci sub stride28q, stride4q 2320cabdff1aSopenharmony_ci%endif 2321cabdff1aSopenharmony_ci add dstq, stride3q 2322cabdff1aSopenharmony_ci 2323cabdff1aSopenharmony_ci ; x86-32 doesn't have enough registers, so on that platform, we split 2324cabdff1aSopenharmony_ci ; the loop in 2... Otherwise you spend most of the loop (un)scratching 2325cabdff1aSopenharmony_ci.loop: 2326cabdff1aSopenharmony_ci%if ARCH_X86_64 2327cabdff1aSopenharmony_ci mova [dstq+stride28q + 0], m9 2328cabdff1aSopenharmony_ci mova [dstq+stride28q +16], m8 2329cabdff1aSopenharmony_ci mova [dstq+stride28q +32], m11 2330cabdff1aSopenharmony_ci mova [dstq+stride28q +48], m10 2331cabdff1aSopenharmony_ci mova [dstq+stride3q*8+ 0], m8 2332cabdff1aSopenharmony_ci mova [dstq+stride3q*8+16], m11 2333cabdff1aSopenharmony_ci mova [dstq+stride3q*8+32], m10 2334cabdff1aSopenharmony_ci mova [dstq+stride3q*8+48], m5 2335cabdff1aSopenharmony_ci mova [dstq+stride20q + 0], m11 2336cabdff1aSopenharmony_ci mova [dstq+stride20q +16], m10 2337cabdff1aSopenharmony_ci mova [dstq+stride20q +32], m5 2338cabdff1aSopenharmony_ci mova [dstq+stride20q +48], m4 2339cabdff1aSopenharmony_ci mova [dstq+stride4q*4+ 0], m10 2340cabdff1aSopenharmony_ci mova [dstq+stride4q*4+16], m5 2341cabdff1aSopenharmony_ci mova [dstq+stride4q*4+32], m4 2342cabdff1aSopenharmony_ci mova [dstq+stride4q*4+48], m7 2343cabdff1aSopenharmony_ci%endif 2344cabdff1aSopenharmony_ci mova [dstq+stride3q*4+ 0], m5 
2345cabdff1aSopenharmony_ci mova [dstq+stride3q*4+16], m4 2346cabdff1aSopenharmony_ci mova [dstq+stride3q*4+32], m7 2347cabdff1aSopenharmony_ci mova [dstq+stride3q*4+48], m6 2348cabdff1aSopenharmony_ci mova [dstq+strideq* 8+ 0], m4 2349cabdff1aSopenharmony_ci mova [dstq+strideq* 8+16], m7 2350cabdff1aSopenharmony_ci mova [dstq+strideq* 8+32], m6 2351cabdff1aSopenharmony_ci mova [dstq+strideq* 8+48], m0 2352cabdff1aSopenharmony_ci mova [dstq+strideq* 4+ 0], m7 2353cabdff1aSopenharmony_ci mova [dstq+strideq* 4+16], m6 2354cabdff1aSopenharmony_ci mova [dstq+strideq* 4+32], m0 2355cabdff1aSopenharmony_ci mova [dstq+strideq* 4+48], m1 2356cabdff1aSopenharmony_ci mova [dstq+strideq* 0+ 0], m6 2357cabdff1aSopenharmony_ci mova [dstq+strideq* 0+16], m0 2358cabdff1aSopenharmony_ci mova [dstq+strideq* 0+32], m1 2359cabdff1aSopenharmony_ci mova [dstq+strideq* 0+48], m2 2360cabdff1aSopenharmony_ci sub dstq, strideq 2361cabdff1aSopenharmony_ci%if cpuflag(avx) 2362cabdff1aSopenharmony_ci%if ARCH_X86_64 2363cabdff1aSopenharmony_ci vpalignr m9, m8, m9, 4 2364cabdff1aSopenharmony_ci vpalignr m8, m11, m8, 4 2365cabdff1aSopenharmony_ci vpalignr m11, m10, m11, 4 2366cabdff1aSopenharmony_ci vpalignr m10, m5, m10, 4 2367cabdff1aSopenharmony_ci%endif 2368cabdff1aSopenharmony_ci vpalignr m5, m4, m5, 4 2369cabdff1aSopenharmony_ci vpalignr m4, m7, m4, 4 2370cabdff1aSopenharmony_ci vpalignr m7, m6, m7, 4 2371cabdff1aSopenharmony_ci vpalignr m6, m0, m6, 4 2372cabdff1aSopenharmony_ci vpalignr m0, m1, m0, 4 2373cabdff1aSopenharmony_ci vpalignr m1, m2, m1, 4 2374cabdff1aSopenharmony_ci vpalignr m2, m3, m2, 4 2375cabdff1aSopenharmony_ci%else 2376cabdff1aSopenharmony_ci%if ARCH_X86_64 2377cabdff1aSopenharmony_ci PALIGNR m12, m8, m9, 4, m13 2378cabdff1aSopenharmony_ci mova m9, m12 2379cabdff1aSopenharmony_ci PALIGNR m12, m11, m8, 4, m13 2380cabdff1aSopenharmony_ci mova m8, m12 2381cabdff1aSopenharmony_ci PALIGNR m12, m10, m11, 4, m13 2382cabdff1aSopenharmony_ci mova m11, m12 
2383cabdff1aSopenharmony_ci PALIGNR m12, m5, m10, 4, m13 2384cabdff1aSopenharmony_ci mova m10, m12 2385cabdff1aSopenharmony_ci%endif 2386cabdff1aSopenharmony_ci SCRATCH 3, 12, rsp+8*mmsize, sh 2387cabdff1aSopenharmony_ci%if notcpuflag(ssse3) 2388cabdff1aSopenharmony_ci SCRATCH 2, 13, rsp+9*mmsize 2389cabdff1aSopenharmony_ci%endif 2390cabdff1aSopenharmony_ci PALIGNR m3, m4, m5, 4, m2 2391cabdff1aSopenharmony_ci mova m5, m3 2392cabdff1aSopenharmony_ci PALIGNR m3, m7, m4, 4, m2 2393cabdff1aSopenharmony_ci mova m4, m3 2394cabdff1aSopenharmony_ci PALIGNR m3, m6, m7, 4, m2 2395cabdff1aSopenharmony_ci mova m7, m3 2396cabdff1aSopenharmony_ci PALIGNR m3, m0, m6, 4, m2 2397cabdff1aSopenharmony_ci mova m6, m3 2398cabdff1aSopenharmony_ci PALIGNR m3, m1, m0, 4, m2 2399cabdff1aSopenharmony_ci mova m0, m3 2400cabdff1aSopenharmony_ci%if notcpuflag(ssse3) 2401cabdff1aSopenharmony_ci UNSCRATCH 2, 13, rsp+9*mmsize 2402cabdff1aSopenharmony_ci SCRATCH 0, 13, rsp+9*mmsize 2403cabdff1aSopenharmony_ci%endif 2404cabdff1aSopenharmony_ci PALIGNR m3, m2, m1, 4, m0 2405cabdff1aSopenharmony_ci mova m1, m3 2406cabdff1aSopenharmony_ci PALIGNR m3, reg_sh, m2, 4, m0 2407cabdff1aSopenharmony_ci mova m2, m3 2408cabdff1aSopenharmony_ci%if notcpuflag(ssse3) 2409cabdff1aSopenharmony_ci UNSCRATCH 0, 13, rsp+9*mmsize 2410cabdff1aSopenharmony_ci%endif 2411cabdff1aSopenharmony_ci UNSCRATCH 3, 12, rsp+8*mmsize, sh 2412cabdff1aSopenharmony_ci%endif 2413cabdff1aSopenharmony_ci psrldq m3, 4 2414cabdff1aSopenharmony_ci dec cntd 2415cabdff1aSopenharmony_ci jg .loop 2416cabdff1aSopenharmony_ci 2417cabdff1aSopenharmony_ci%if ARCH_X86_32 2418cabdff1aSopenharmony_ci UNSCRATCH 0, 8, rsp+0*mmsize 2419cabdff1aSopenharmony_ci UNSCRATCH 1, 9, rsp+1*mmsize 2420cabdff1aSopenharmony_ci UNSCRATCH 2, 10, rsp+2*mmsize 2421cabdff1aSopenharmony_ci UNSCRATCH 3, 11, rsp+3*mmsize 2422cabdff1aSopenharmony_ci mova m4, [rsp+4*mmsize] 2423cabdff1aSopenharmony_ci mova m5, [rsp+5*mmsize] 2424cabdff1aSopenharmony_ci mova m6, [rsp+6*mmsize] 
2425cabdff1aSopenharmony_ci mova m7, [rsp+7*mmsize] 2426cabdff1aSopenharmony_ci DEFINE_ARGS dst, stride, stride5, stride3 2427cabdff1aSopenharmony_ci lea stride5q, [strideq*5] 2428cabdff1aSopenharmony_ci lea dstq, [dstq+stride5q*4] 2429cabdff1aSopenharmony_ci DEFINE_ARGS dst, stride, cnt, stride3 2430cabdff1aSopenharmony_ci mov cntd, 4 2431cabdff1aSopenharmony_ci.loop_2: 2432cabdff1aSopenharmony_ci mova [dstq+stride3q*4+ 0], m1 2433cabdff1aSopenharmony_ci mova [dstq+stride3q*4+16], m0 2434cabdff1aSopenharmony_ci mova [dstq+stride3q*4+32], m3 2435cabdff1aSopenharmony_ci mova [dstq+stride3q*4+48], m2 2436cabdff1aSopenharmony_ci mova [dstq+strideq* 8+ 0], m0 2437cabdff1aSopenharmony_ci mova [dstq+strideq* 8+16], m3 2438cabdff1aSopenharmony_ci mova [dstq+strideq* 8+32], m2 2439cabdff1aSopenharmony_ci mova [dstq+strideq* 8+48], m5 2440cabdff1aSopenharmony_ci mova [dstq+strideq* 4+ 0], m3 2441cabdff1aSopenharmony_ci mova [dstq+strideq* 4+16], m2 2442cabdff1aSopenharmony_ci mova [dstq+strideq* 4+32], m5 2443cabdff1aSopenharmony_ci mova [dstq+strideq* 4+48], m4 2444cabdff1aSopenharmony_ci mova [dstq+strideq* 0+ 0], m2 2445cabdff1aSopenharmony_ci mova [dstq+strideq* 0+16], m5 2446cabdff1aSopenharmony_ci mova [dstq+strideq* 0+32], m4 2447cabdff1aSopenharmony_ci mova [dstq+strideq* 0+48], m7 2448cabdff1aSopenharmony_ci sub dstq, strideq 2449cabdff1aSopenharmony_ci%if cpuflag(avx) 2450cabdff1aSopenharmony_ci vpalignr m1, m0, m1, 4 2451cabdff1aSopenharmony_ci vpalignr m0, m3, m0, 4 2452cabdff1aSopenharmony_ci vpalignr m3, m2, m3, 4 2453cabdff1aSopenharmony_ci vpalignr m2, m5, m2, 4 2454cabdff1aSopenharmony_ci vpalignr m5, m4, m5, 4 2455cabdff1aSopenharmony_ci vpalignr m4, m7, m4, 4 2456cabdff1aSopenharmony_ci vpalignr m7, m6, m7, 4 2457cabdff1aSopenharmony_ci%else 2458cabdff1aSopenharmony_ci SCRATCH 6, 12, rsp+8*mmsize, sh 2459cabdff1aSopenharmony_ci%if notcpuflag(ssse3) 2460cabdff1aSopenharmony_ci SCRATCH 7, 13, rsp+9*mmsize 2461cabdff1aSopenharmony_ci%endif 
2462cabdff1aSopenharmony_ci PALIGNR m6, m0, m1, 4, m7 2463cabdff1aSopenharmony_ci mova m1, m6 2464cabdff1aSopenharmony_ci PALIGNR m6, m3, m0, 4, m7 2465cabdff1aSopenharmony_ci mova m0, m6 2466cabdff1aSopenharmony_ci PALIGNR m6, m2, m3, 4, m7 2467cabdff1aSopenharmony_ci mova m3, m6 2468cabdff1aSopenharmony_ci PALIGNR m6, m5, m2, 4, m7 2469cabdff1aSopenharmony_ci mova m2, m6 2470cabdff1aSopenharmony_ci PALIGNR m6, m4, m5, 4, m7 2471cabdff1aSopenharmony_ci mova m5, m6 2472cabdff1aSopenharmony_ci%if notcpuflag(ssse3) 2473cabdff1aSopenharmony_ci UNSCRATCH 7, 13, rsp+9*mmsize 2474cabdff1aSopenharmony_ci SCRATCH 5, 13, rsp+9*mmsize 2475cabdff1aSopenharmony_ci%endif 2476cabdff1aSopenharmony_ci PALIGNR m6, m7, m4, 4, m5 2477cabdff1aSopenharmony_ci mova m4, m6 2478cabdff1aSopenharmony_ci PALIGNR m6, reg_sh, m7, 4, m5 2479cabdff1aSopenharmony_ci mova m7, m6 2480cabdff1aSopenharmony_ci%if notcpuflag(ssse3) 2481cabdff1aSopenharmony_ci UNSCRATCH 5, 13, rsp+9*mmsize 2482cabdff1aSopenharmony_ci%endif 2483cabdff1aSopenharmony_ci UNSCRATCH 6, 12, rsp+8*mmsize, sh 2484cabdff1aSopenharmony_ci%endif 2485cabdff1aSopenharmony_ci psrldq m6, 4 2486cabdff1aSopenharmony_ci dec cntd 2487cabdff1aSopenharmony_ci jg .loop_2 2488cabdff1aSopenharmony_ci%endif 2489cabdff1aSopenharmony_ci RET 2490cabdff1aSopenharmony_ci%endmacro 2491cabdff1aSopenharmony_ci 2492cabdff1aSopenharmony_ciINIT_XMM sse2 2493cabdff1aSopenharmony_ciHD_FUNCS 2494cabdff1aSopenharmony_ciINIT_XMM ssse3 2495cabdff1aSopenharmony_ciHD_FUNCS 2496cabdff1aSopenharmony_ciINIT_XMM avx 2497cabdff1aSopenharmony_ciHD_FUNCS 2498