;******************************************************************************
;* x86-optimized functions for the CFHD decoder
;* Copyright (c) 2020 Paul B Mahol
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "libavutil/x86/x86util.asm"

SECTION_RODATA

; Interleaved word multipliers for pmaddwd. Each table is 8 words (16 bytes),
; so every table below stays 16-byte aligned for mova / memory operands.
factor_p1_n1:  dw  1, -1,  1, -1,  1, -1,  1, -1    ; a - b
factor_n1_p1:  dw -1,  1, -1,  1, -1,  1, -1,  1    ; b - a
factor_p11_n4: dw 11, -4, 11, -4, 11, -4, 11, -4    ; 11*a - 4*b
factor_p5_p4:  dw  5,  4,  5,  4,  5,  4,  5,  4    ; 5*a + 4*b
pd_4:          times 4 dd 4                         ; rounding bias before >> 3
pw_1:          times 8 dw 1                         ; a + b
pw_0:          times 8 dw 0                         ; lower clip bound
pw_1023:       times 8 dw 1023                      ; upper clip bound, clip10
pw_4095:       times 8 dw 4095                      ; upper clip bound, clip12

SECTION .text

;------------------------------------------------------------------------------
; CFHD_HORIZ_FILTER clip_max
;
; Emits one horizontal filter function, chosen by the macro argument:
;   0    -> cfhd_horiz_filter        (loops over height rows, no clipping)
;   1023 -> cfhd_horiz_filter_clip10 (one row, results clipped to [0, 1023])
;   4095 -> cfhd_horiz_filter_clip12 (one row, results clipped to [0, 4095])
;
; low, high and output are accessed as signed 16-bit words. With n = width,
; one row is computed as:
;   out[0]    = (((11*low[0]   - 4*low[1]   + low[2]   + 4) >> 3) + high[0])   >> 1
;   out[1]    = ((( 5*low[0]   + 4*low[1]   - low[2]   + 4) >> 3) - high[0])   >> 1
;   out[2i]   = (((low[i-1] - low[i+1] + 4) >> 3) + low[i] + high[i]) >> 1
;   out[2i+1] = (((low[i+1] - low[i-1] + 4) >> 3) + low[i] - high[i]) >> 1
;   out[2n-2] = ((( 5*low[n-1] + 4*low[n-2] - low[n-3] + 4) >> 3) + high[n-1]) >> 1
;   out[2n-1] = (((11*low[n-1] - 4*low[n-2] + low[n-3] + 4) >> 3) - high[n-1]) >> 1
;
; The vector loop handles 8 input words per iteration starting at i = 1 and is
; not bounded to the interior: it reads and writes some words past the end of
; the row. The two tail outputs are stored after the loop, so they overwrite
; whatever the loop left in those slots.
; NOTE(review): callers presumably provide padded buffers - confirm.
;
; The strides and width are doubled on entry; since all samples are words this
; presumably converts element counts to byte offsets - confirm against the C
; prototype. In the multi-row variant the output pointer advances by twice the
; doubled ostride per row.
;
; x86-32 has too few GPRs, so ostride/lwidth/hwidth are written back to their
; stack slots and the registers are reused as x / y / temp.
;------------------------------------------------------------------------------
%macro CFHD_HORIZ_FILTER 1
%if %1 == 1023
cglobal cfhd_horiz_filter_clip10, 5, 6, 8 + 4 * ARCH_X86_64, output, low, high, width, x, temp
    shl       widthd, 1
%define ostrideq widthq
%define lwidthq  widthq
%define hwidthq  widthq
%elif %1 == 4095
cglobal cfhd_horiz_filter_clip12, 5, 6, 8 + 4 * ARCH_X86_64, output, low, high, width, x, temp
    shl       widthd, 1
%define ostrideq widthq
%define lwidthq  widthq
%define hwidthq  widthq
%else
%if ARCH_X86_64
cglobal cfhd_horiz_filter, 8, 11, 12, output, ostride, low, lwidth, high, hwidth, width, height, x, y, temp
    shl       ostrided, 1
    shl       lwidthd, 1
    shl       hwidthd, 1
    shl       widthd, 1

    ; y counts rows from -height up to 0
    mov       yd, heightd
    neg       yq
%else
cglobal cfhd_horiz_filter, 7, 7, 8, output, x, low, y, high, temp, width, height
    shl       xd, 1
    shl       yd, 1
    shl       tempd, 1
    shl       widthd, 1

    ; spill the doubled strides back to their argument slots
    mov       xmp, xq
    mov       ymp, yq
    mov       tempmp, tempq

    ; y counts rows from -height up to 0
    mov       yd, r7m
    neg       yq

%define ostrideq xm
%define lwidthq  ym
%define hwidthq  tempm
%endif
%endif

%if ARCH_X86_64
    ; keep the loop constants in registers where enough are available
    mova      m8, [factor_p1_n1]
    mova      m9, [factor_n1_p1]
    mova      m10, [pw_1]
    mova      m11, [pd_4]
%endif

%if %1 == 0
.looph:
%endif
    ; ---- head, scalar: out[0] ----
    movsx     xq, word [lowq]
    imul      xq, 11

    movsx     tempq, word [lowq + 2]
    imul      tempq, -4
    add       tempq, xq

    movsx     xq, word [lowq + 4]
    add       tempq, xq
    add       tempq, 4
    sar       tempq, 3

    movsx     xq, word [highq]
    add       tempq, xq
    sar       tempq, 1

%if %1
    ; clip the low word to the allowed range
    movd      xm0, tempd
    CLIPW     m0, [pw_0], [pw_%1]
    pextrw    tempd, xm0, 0
%endif
    mov       word [outputq], tempw

    ; ---- head, scalar: out[1] ----
    movsx     xq, word [lowq]
    imul      xq, 5

    movsx     tempq, word [lowq + 2]
    imul      tempq, 4
    add       tempq, xq

    movsx     xq, word [lowq + 4]
    sub       tempq, xq
    add       tempq, 4
    sar       tempq, 3

    movsx     xq, word [highq]
    sub       tempq, xq
    sar       tempq, 1

%if %1
    movd      xm0, tempd
    CLIPW     m0, [pw_0], [pw_%1]
    pextrw    tempd, xm0, 0
%endif
    mov       word [outputq + 2], tempw

    ; x = byte offset into low/high for the vector loop (flags are dead here)
    xor       xd, xd

.loop:
    ; pair up low[i-1] with low[i+1]
    movu      m4, [lowq + xq]
    movu      m1, [lowq + xq + 4]

    mova      m5, m4
    punpcklwd m4, m1
    punpckhwd m5, m1

    mova      m6, m4
    mova      m7, m5

    ; m4/m5 = low[i-1] - low[i+1] + 4, m6/m7 = low[i+1] - low[i-1] + 4
%if ARCH_X86_64
    pmaddwd   m4, m8
    pmaddwd   m5, m8
    pmaddwd   m6, m9
    pmaddwd   m7, m9

    paddd     m4, m11
    paddd     m5, m11
    paddd     m6, m11
    paddd     m7, m11
%else
    pmaddwd   m4, [factor_p1_n1]
    pmaddwd   m5, [factor_p1_n1]
    pmaddwd   m6, [factor_n1_p1]
    pmaddwd   m7, [factor_n1_p1]

    paddd     m4, [pd_4]
    paddd     m5, [pd_4]
    paddd     m6, [pd_4]
    paddd     m7, [pd_4]
%endif

    psrad     m4, 3
    psrad     m5, 3
    psrad     m6, 3
    psrad     m7, 3

    ; pair up low[i] with high[i]
    movu      m2, [lowq + xq + 2]
    movu      m3, [highq + xq + 2]

    mova      m0, m2
    punpcklwd m2, m3
    punpckhwd m0, m3

    mova      m1, m2
    mova      m3, m0

    ; m2/m0 = low[i] + high[i], m1/m3 = low[i] - high[i]
%if ARCH_X86_64
    pmaddwd   m2, m10
    pmaddwd   m0, m10
    pmaddwd   m1, m8
    pmaddwd   m3, m8
%else
    pmaddwd   m2, [pw_1]
    pmaddwd   m0, [pw_1]
    pmaddwd   m1, [factor_p1_n1]
    pmaddwd   m3, [factor_p1_n1]
%endif

    paddd     m2, m4
    paddd     m0, m5
    paddd     m1, m6
    paddd     m3, m7

    psrad     m2, 1
    psrad     m0, 1
    psrad     m1, 1
    psrad     m3, 1

    ; back to words (signed saturation): m2 = even outputs, m1 = odd outputs
    packssdw  m2, m0
    packssdw  m1, m3

    ; interleave even/odd outputs into memory order
    mova      m0, m2
    punpcklwd m2, m1
    punpckhwd m0, m1

%if %1
    CLIPW     m2, [pw_0], [pw_%1]
    CLIPW     m0, [pw_0], [pw_%1]
%endif

    ; two outputs per input word, skipping the two scalar head outputs
    movu      [outputq + xq * 2 + 4], m2
    movu      [outputq + xq * 2 + mmsize + 4], m0

    add       xq, mmsize
    cmp       xq, widthq
    jl .loop

    ; point low/high/output one past the end of the row for the tail
    add       lowq, widthq
    add       highq, widthq
    add       outputq, widthq
    add       outputq, widthq

    ; ---- tail, scalar: out[2n-2] ----
    movsx     xq, word [lowq - 2]
    imul      xq, 5

    movsx     tempq, word [lowq - 4]
    imul      tempq, 4
    add       tempq, xq

    movsx     xq, word [lowq - 6]
    sub       tempq, xq
    add       tempq, 4
    sar       tempq, 3

    movsx     xq, word [highq - 2]
    add       tempq, xq
    sar       tempq, 1

%if %1
    movd      xm0, tempd
    CLIPW     m0, [pw_0], [pw_%1]
    pextrw    tempd, xm0, 0
%endif
    mov       word [outputq - 4], tempw

    ; ---- tail, scalar: out[2n-1] ----
    movsx     xq, word [lowq - 2]
    imul      xq, 11

    movsx     tempq, word [lowq - 4]
    imul      tempq, -4
    add       tempq, xq

    movsx     xq, word [lowq - 6]
    add       tempq, xq
    add       tempq, 4
    sar       tempq, 3

    movsx     xq, word [highq - 2]
    sub       tempq, xq
    sar       tempq, 1

%if %1
    movd      xm0, tempd
    CLIPW     m0, [pw_0], [pw_%1]
    pextrw    tempd, xm0, 0
%endif
    mov       word [outputq - 2], tempw

%if %1 == 0
    ; rewind to the start of this row ...
    sub       lowq, widthq
    sub       highq, widthq
    sub       outputq, widthq
    sub       outputq, widthq

    ; ... and step to the next one
    add       lowq, lwidthq
    add       highq, hwidthq
    add       outputq, ostrideq
    add       outputq, ostrideq
    add       yq, 1
    jl .looph
%endif

    RET
%endmacro

; multi-row variant without clipping
INIT_XMM sse2
CFHD_HORIZ_FILTER 0

; single-row variant, results clipped to [0, 1023]
INIT_XMM sse2
CFHD_HORIZ_FILTER 1023

; single-row variant, results clipped to [0, 4095]
INIT_XMM sse2
CFHD_HORIZ_FILTER 4095

;------------------------------------------------------------------------------
; cfhd_vert_filter(output, ostride, low, lwidth, high, hwidth, width, height)
;
; Vertical counterpart of the horizontal filter: the same taps are applied
; down each column, 8 columns (one XMM register of words) per .loopw pass.
; With low[y] / high[y] / out[y] denoting rows and h = height:
;   out[0]    = (((11*low[0]   - 4*low[1]   + low[2]   + 4) >> 3) + high[0])   >> 1
;   out[1]    = ((( 5*low[0]   + 4*low[1]   - low[2]   + 4) >> 3) - high[0])   >> 1
;   out[2y]   = (((low[y-1] - low[y+1] + 4) >> 3) + low[y] + high[y]) >> 1
;   out[2y+1] = (((low[y+1] - low[y-1] + 4) >> 3) + low[y] - high[y]) >> 1
;   out[2h-2] = ((( 5*low[h-1] + 4*low[h-2] - low[h-3] + 4) >> 3) + high[h-1]) >> 1
;   out[2h-1] = (((11*low[h-1] - 4*low[h-2] + low[h-3] + 4) >> 3) - high[h-1]) >> 1
;
; Results are packed back to words with signed saturation (packssdw).
;
; The strides and width are doubled on entry; since all samples are words this
; presumably converts element counts to byte offsets - confirm against the C
; prototype. height is decremented once so that heightq holds the index of the
; last row.
;
; .looph is a do-while starting at y = 1, so its body always runs at least
; once and the last-row code reads rows y-1 and y-2.
; NOTE(review): this looks like it requires height >= 3 - confirm with callers.
;
; Each .loopw pass loads and stores full 16-byte vectors, so the final pass may
; touch columns beyond width.
; NOTE(review): buffers are presumably padded accordingly - confirm.
;
; x86-32 has too few GPRs, so ostride/lwidth/hwidth are written back to their
; stack slots, and height-1 is stored into the width argument slot (the width
; register itself keeps the doubled width).
;------------------------------------------------------------------------------
INIT_XMM sse2
%if ARCH_X86_64
cglobal cfhd_vert_filter, 8, 11, 14, output, ostride, low, lwidth, high, hwidth, width, height, x, y, pos
    shl       ostrided, 1
    shl       lwidthd, 1
    shl       hwidthd, 1
    shl       widthd, 1

    dec       heightd

    ; keep all constants in registers where enough are available
    mova      m8, [factor_p1_n1]
    mova      m9, [factor_n1_p1]
    mova      m10, [pw_1]
    mova      m11, [pd_4]
    mova      m12, [factor_p11_n4]
    mova      m13, [factor_p5_p4]
%else
cglobal cfhd_vert_filter, 7, 7, 8, output, x, low, y, high, pos, width, height
    shl       xd, 1
    shl       yd, 1
    shl       posd, 1
    shl       widthd, 1

    ; spill the doubled strides back to their argument slots
    mov       xmp, xq
    mov       ymp, yq
    mov       posmp, posq

    ; height - 1 goes into the width stack slot
    mov       xq, r7m
    dec       xq
    mov       widthmp, xq

%define ostrideq xm
%define lwidthq  ym
%define hwidthq  posm
%define heightq  widthm

%endif

    xor       xq, xq
.loopw:
    xor       yq, yq

    ; ---- first row pair, even output: out[0] ----
    mov       posq, xq
    movu      m0, [lowq + posq]
    add       posq, lwidthq
    movu      m1, [lowq + posq]
    mova      m2, m0
    punpcklwd m0, m1
    punpckhwd m2, m1

    ; 11*low[0] - 4*low[1]
%if ARCH_X86_64
    pmaddwd   m0, m12
    pmaddwd   m2, m12
%else
    pmaddwd   m0, [factor_p11_n4]
    pmaddwd   m2, [factor_p11_n4]
%endif

    ; sign-extend low[2] to dwords: word into the high half, then >> 16
    pxor      m4, m4
    add       posq, lwidthq
    movu      m1, [lowq + posq]
    mova      m3, m4
    punpcklwd m4, m1
    punpckhwd m3, m1

    psrad     m4, 16
    psrad     m3, 16

    paddd     m0, m4
    paddd     m2, m3

    ; rounding bias; same register/memory split as the last-row code below
%if ARCH_X86_64
    paddd     m0, m11
    paddd     m2, m11
%else
    paddd     m0, [pd_4]
    paddd     m2, [pd_4]
%endif

    psrad     m0, 3
    psrad     m2, 3

    ; + high[0], sign-extended the same way
    mov       posq, xq
    pxor      m4, m4
    movu      m1, [highq + posq]
    mova      m3, m4
    punpcklwd m4, m1
    punpckhwd m3, m1

    psrad     m4, 16
    psrad     m3, 16

    paddd     m0, m4
    paddd     m2, m3

    psrad     m0, 1
    psrad     m2, 1

    packssdw  m0, m2

    movu      [outputq + posq], m0

    ; ---- first row pair, odd output: out[1] ----
    movu      m0, [lowq + posq]
    add       posq, lwidthq
    movu      m1, [lowq + posq]
    mova      m2, m0
    punpcklwd m0, m1
    punpckhwd m2, m1

    ; 5*low[0] + 4*low[1]
%if ARCH_X86_64
    pmaddwd   m0, m13
    pmaddwd   m2, m13
%else
    pmaddwd   m0, [factor_p5_p4]
    pmaddwd   m2, [factor_p5_p4]
%endif

    ; - low[2]
    pxor      m4, m4
    add       posq, lwidthq
    movu      m1, [lowq + posq]
    mova      m3, m4
    punpcklwd m4, m1
    punpckhwd m3, m1

    psrad     m4, 16
    psrad     m3, 16

    psubd     m0, m4
    psubd     m2, m3

%if ARCH_X86_64
    paddd     m0, m11
    paddd     m2, m11
%else
    paddd     m0, [pd_4]
    paddd     m2, [pd_4]
%endif

    psrad     m0, 3
    psrad     m2, 3

    ; - high[0]
    mov       posq, xq
    pxor      m4, m4
    movu      m1, [highq + posq]
    mova      m3, m4
    punpcklwd m4, m1
    punpckhwd m3, m1

    psrad     m4, 16
    psrad     m3, 16

    psubd     m0, m4
    psubd     m2, m3

    psrad     m0, 1
    psrad     m2, 1

    packssdw  m0, m2

    add       posq, ostrideq
    movu      [outputq + posq], m0

    add       yq, 1
.looph:
    ; ---- interior rows: out[2y] and out[2y+1] ----
    ; pos = (y - 1) * lwidth + x
    mov       posq, lwidthq
    imul      posq, yq
    sub       posq, lwidthq
    add       posq, xq

    movu      m4, [lowq + posq]             ; low[y-1]

    add       posq, lwidthq
    add       posq, lwidthq
    movu      m1, [lowq + posq]             ; low[y+1]

    mova      m5, m4
    punpcklwd m4, m1
    punpckhwd m5, m1

    mova      m6, m4
    mova      m7, m5

    ; m4/m5 = low[y-1] - low[y+1] + 4, m6/m7 = low[y+1] - low[y-1] + 4
%if ARCH_X86_64
    pmaddwd   m4, m8
    pmaddwd   m5, m8
    pmaddwd   m6, m9
    pmaddwd   m7, m9

    paddd     m4, m11
    paddd     m5, m11
    paddd     m6, m11
    paddd     m7, m11
%else
    pmaddwd   m4, [factor_p1_n1]
    pmaddwd   m5, [factor_p1_n1]
    pmaddwd   m6, [factor_n1_p1]
    pmaddwd   m7, [factor_n1_p1]

    paddd     m4, [pd_4]
    paddd     m5, [pd_4]
    paddd     m6, [pd_4]
    paddd     m7, [pd_4]
%endif

    psrad     m4, 3
    psrad     m5, 3
    psrad     m6, 3
    psrad     m7, 3

    sub       posq, lwidthq
    movu      m0, [lowq + posq]             ; low[y]

    ; pos = y * hwidth + x
    mov       posq, hwidthq
    imul      posq, yq
    add       posq, xq
    movu      m1, [highq + posq]            ; high[y]

    mova      m2, m0
    punpcklwd m0, m1
    punpckhwd m2, m1

    mova      m1, m0
    mova      m3, m2

    ; m0/m2 = low[y] + high[y], m1/m3 = low[y] - high[y]
%if ARCH_X86_64
    pmaddwd   m0, m10
    pmaddwd   m2, m10
    pmaddwd   m1, m8
    pmaddwd   m3, m8
%else
    pmaddwd   m0, [pw_1]
    pmaddwd   m2, [pw_1]
    pmaddwd   m1, [factor_p1_n1]
    pmaddwd   m3, [factor_p1_n1]
%endif

    paddd     m0, m4
    paddd     m2, m5
    paddd     m1, m6
    paddd     m3, m7

    psrad     m0, 1
    psrad     m2, 1
    psrad     m1, 1
    psrad     m3, 1

    packssdw  m0, m2
    packssdw  m1, m3

    ; pos = 2 * y * ostride + x
    mov       posq, ostrideq
    imul      posq, 2
    imul      posq, yq
    add       posq, xq

    movu      [outputq + posq], m0
    add       posq, ostrideq
    movu      [outputq + posq], m1

    add       yq, 1
    cmp       yq, heightq
    jl .looph

    ; ---- last row pair, even output (y is now height - 1) ----
    mov       posq, lwidthq
    imul      posq, yq
    add       posq, xq
    movu      m0, [lowq + posq]             ; low[y]
    sub       posq, lwidthq
    movu      m1, [lowq + posq]             ; low[y-1]
    mova      m2, m0
    punpcklwd m0, m1
    punpckhwd m2, m1

    ; 5*low[y] + 4*low[y-1]
%if ARCH_X86_64
    pmaddwd   m0, m13
    pmaddwd   m2, m13
%else
    pmaddwd   m0, [factor_p5_p4]
    pmaddwd   m2, [factor_p5_p4]
%endif

    ; - low[y-2]
    pxor      m4, m4
    sub       posq, lwidthq
    movu      m1, [lowq + posq]
    mova      m3, m4
    punpcklwd m4, m1
    punpckhwd m3, m1

    psrad     m4, 16
    psrad     m3, 16

    psubd     m0, m4
    psubd     m2, m3

%if ARCH_X86_64
    paddd     m0, m11
    paddd     m2, m11
%else
    paddd     m0, [pd_4]
    paddd     m2, [pd_4]
%endif

    psrad     m0, 3
    psrad     m2, 3

    ; + high[y]
    mov       posq, hwidthq
    imul      posq, yq
    add       posq, xq
    pxor      m4, m4
    movu      m1, [highq + posq]
    mova      m3, m4
    punpcklwd m4, m1
    punpckhwd m3, m1

    psrad     m4, 16
    psrad     m3, 16

    paddd     m0, m4
    paddd     m2, m3

    psrad     m0, 1
    psrad     m2, 1

    packssdw  m0, m2

    ; pos = 2 * y * ostride + x
    mov       posq, ostrideq
    imul      posq, 2
    imul      posq, yq
    add       posq, xq
    movu      [outputq + posq], m0

    ; ---- last row pair, odd output ----
    mov       posq, lwidthq
    imul      posq, yq
    add       posq, xq
    movu      m0, [lowq + posq]             ; low[y]
    sub       posq, lwidthq
    movu      m1, [lowq + posq]             ; low[y-1]
    mova      m2, m0
    punpcklwd m0, m1
    punpckhwd m2, m1

    ; 11*low[y] - 4*low[y-1]
%if ARCH_X86_64
    pmaddwd   m0, m12
    pmaddwd   m2, m12
%else
    pmaddwd   m0, [factor_p11_n4]
    pmaddwd   m2, [factor_p11_n4]
%endif

    ; + low[y-2]
    pxor      m4, m4
    sub       posq, lwidthq
    movu      m1, [lowq + posq]
    mova      m3, m4
    punpcklwd m4, m1
    punpckhwd m3, m1

    psrad     m4, 16
    psrad     m3, 16

    paddd     m0, m4
    paddd     m2, m3

%if ARCH_X86_64
    paddd     m0, m11
    paddd     m2, m11
%else
    paddd     m0, [pd_4]
    paddd     m2, [pd_4]
%endif

    psrad     m0, 3
    psrad     m2, 3

    ; - high[y]
    mov       posq, hwidthq
    imul      posq, yq
    add       posq, xq
    pxor      m4, m4
    movu      m1, [highq + posq]
    mova      m3, m4
    punpcklwd m4, m1
    punpckhwd m3, m1

    psrad     m4, 16
    psrad     m3, 16

    psubd     m0, m4
    psubd     m2, m3

    psrad     m0, 1
    psrad     m2, 1

    packssdw  m0, m2

    ; pos = (2 * y + 1) * ostride + x
    mov       posq, ostrideq
    imul      posq, 2
    imul      posq, yq
    add       posq, ostrideq
    add       posq, xq
    movu      [outputq + posq], m0

    ; next group of 8 columns
    add       xq, mmsize
    cmp       xq, widthq
    jl .loopw
    RET