;******************************************************************************
;* x86-optimized functions for the CFHD encoder
;* Copyright (c) 2021 Paul B Mahol
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "libavutil/x86/x86util.asm"

SECTION_RODATA

; Coefficient pairs for pmaddwd. Each dword lane multiplies two adjacent
; 16-bit samples by the two listed coefficients and adds the products.
pw_p1_n1:  dw  1, -1,  1, -1,  1, -1,  1, -1
pw_n1_p1:  dw -1,  1, -1,  1, -1,  1, -1,  1
pw_p5_n11: dw  5, -11, 5, -11, 5, -11, 5, -11
pw_n5_p11: dw -5, 11, -5, 11, -5, 11, -5, 11
pw_p11_n5: dw 11, -5, 11, -5, 11, -5, 11, -5
pw_n11_p5: dw -11, 5, -11, 5, -11, 5, -11, 5
pd_4:  times 4 dd  4                    ; rounding bias added before ">> 3"
pw_n4: times 8 dw -4
cextern pw_m1
cextern pw_1
cextern pw_4

SECTION .text

;------------------------------------------------------------------------------
; cfhdenc_horiz_filter(input, low, high, istride, lwidth, hwidth, width, y)
;
; Splits each row of 16-bit samples into a low band and a high band.
; For one row "in" of `width` samples (i = output index):
;
;   low[i]    = in[2i] + in[2i+1]                              (all i)
;   high[0]   = (5*in[0] - 11*in[1] + 4*in[2] + 4*in[3]
;                - in[4] - in[5] + 4) >> 3
;   high[i]   = in[2i] - in[2i+1]
;               + ((in[2i+2] + in[2i+3] - in[2i-2] - in[2i-1] + 4) >> 3)
;   high[end] = (11*in[w-2] - 5*in[w-1] - 4*in[w-3] - 4*in[w-4]
;                + in[w-5] + in[w-6] + 4) >> 3
;
; Every stored value is saturated to signed 16 bits (packssdw).
;
; Registers (names from the cglobal line):
;   inputq/lowq/highq  row pointers, advanced once per row
;   istrideq/lwidthq/hwidthq  per-row pointer steps; doubled on entry, so the
;                      incoming values are presumably in int16 elements
;   widthq             number of input samples per row
;   yq                 incoming row count; negated and counted up to zero
;   xq, tempq          scratch / column offset
;   m7 = pd_4, m8 = pw_1, m9 = pw_m1, m10 = pw_p1_n1
;
; NOTE(review): the vector loop always produces 8 outputs per iteration and
; loads input samples x-2 .. x+17, so on the last iteration it can read and
; write past the end of the row. Presumably callers provide padded buffers;
; confirm against the C caller.
;------------------------------------------------------------------------------
%if ARCH_X86_64
INIT_XMM sse2
cglobal cfhdenc_horiz_filter, 8, 10, 11, input, low, high, istride, lwidth, hwidth, width, y, x, temp
    shl  istrideq, 1                    ; strides: elements -> bytes
    shl   lwidthq, 1
    shl   hwidthq, 1
    mova       m7, [pd_4]
    mova       m8, [pw_1]
    mova       m9, [pw_m1]
    mova       m10,[pw_p1_n1]
    movsxdifnidn yq, yd                 ; sign-extend the 32-bit int arguments
    movsxdifnidn widthq, widthd
    neg        yq                       ; count rows from -y up to 0
.looph:
    ; ---- first column, low band: low[0] = in[0] + in[1] ----
    movsx          xq, word [inputq]

    movsx       tempq, word [inputq + 2]
    add         tempq, xq

    ; movd/packssdw/movd saturates the 32-bit result to signed 16 bits
    movd          xm0, tempd
    packssdw       m0, m0
    movd        tempd, m0
    mov   word [lowq], tempw

    ; ---- first column, high band: asymmetric 6-tap edge filter ----
    movsx          xq, word [inputq]
    imul           xq, 5
    movsx       tempq, word [inputq + 2]
    imul        tempq, -11
    add         tempq, xq

    movsx          xq, word [inputq + 4]
    imul           xq, 4
    add         tempq, xq

    movsx          xq, word [inputq + 6]
    imul           xq, 4
    add         tempq, xq

    movsx          xq, word [inputq + 8]
    imul           xq, -1
    add         tempq, xq

    movsx          xq, word [inputq + 10]
    imul           xq, -1
    add         tempq, xq

    add         tempq, 4                ; round
    sar         tempq, 3

    movd          xm0, tempd
    packssdw       m0, m0
    movd        tempd, m0
    mov  word [highq], tempw

    ; xq is a byte offset into low/high; the matching input byte offset is
    ; xq * 2. Start at output index 1.
    mov            xq, 2

.loopw:
    ; ---- low band: 8 pairwise sums of 16 input samples ----
    movu           m0, [inputq + xq * 2]
    movu           m1, [inputq + xq * 2 + mmsize]

    pmaddwd        m0, m8
    pmaddwd        m1, m8

    packssdw       m0, m1
    movu    [lowq+xq], m0

    ; ---- high band ----
    ; m2/m3 = -(previous pair sum), loaded 2 samples (4 bytes) back
    movu           m2, [inputq + xq * 2 - 4]
    movu           m3, [inputq + xq * 2 - 4 + mmsize]

    pmaddwd        m2, m9
    pmaddwd        m3, m9

    ; m0/m1 = next pair sum, loaded 2 samples (4 bytes) ahead
    movu           m0, [inputq + xq * 2 + 4]
    movu           m1, [inputq + xq * 2 + 4 + mmsize]

    pmaddwd        m0, m8
    pmaddwd        m1, m8

    paddd          m0, m2
    paddd          m1, m3

    paddd          m0, m7               ; + 4
    paddd          m1, m7

    psrad          m0, 3
    psrad          m1, 3

    ; m5/m6 = in[2i] - in[2i+1] for the current pair
    movu           m5, [inputq + xq * 2 + 0]
    movu           m6, [inputq + xq * 2 + mmsize]

    pmaddwd        m5, m10
    pmaddwd        m6, m10

    paddd          m0, m5
    paddd          m1, m6

    packssdw       m0, m1
    movu   [highq+xq], m0

    add            xq, mmsize
    cmp            xq, widthq
    jl .loopw

    ; Move the pointers to the end of the row so the last column can be
    ; addressed with negative offsets.
    add          lowq, widthq
    add         highq, widthq
    lea        inputq, [inputq + widthq * 2]

    ; ---- last column, low band: in[w-2] + in[w-1] ----
    movsx          xq, word [inputq - 4]
    movsx       tempq, word [inputq - 2]
    add         tempq, xq

    movd          xm0, tempd
    packssdw       m0, m0
    movd        tempd, m0
    mov word [lowq-2], tempw

    ; ---- last column, high band: mirrored asymmetric edge filter ----
    movsx       tempq, word [inputq - 4]
    imul        tempq, 11
    movsx          xq, word [inputq - 2]
    imul           xq, -5
    add         tempq, xq

    movsx          xq, word [inputq - 6]
    imul           xq, -4
    add         tempq, xq

    movsx          xq, word [inputq - 8]
    imul           xq, -4
    add         tempq, xq

    movsx          xq, word [inputq - 10]
    add         tempq, xq

    movsx          xq, word [inputq - 12]
    add         tempq, xq

    add         tempq, 4                ; round
    sar         tempq, 3

    movd          xm0, tempd
    packssdw       m0, m0
    movd        tempd, m0
    mov word [highq-2], tempw

    ; Rewind to the start of the row (input advanced by 2 * width bytes).
    sub        inputq, widthq
    sub        inputq, widthq
    sub         highq, widthq
    sub          lowq, widthq

    ; Step to the next row.
    add          lowq, lwidthq
    add         highq, hwidthq
    add        inputq, istrideq
    add            yq, 1
    jl .looph

    RET
%endif

;------------------------------------------------------------------------------
; cfhdenc_vert_filter(input, low, high, istride, lwidth, hwidth, width, height)
;
; Vertical counterpart of the horizontal filter: the same taps are applied
; down the rows, 8 columns (one XMM register) at a time. With r[k] denoting
; input row k of the current 8-column strip:
;
;   low at y  = sat16(r[y] + r[y+1])                       (paddsw)
;   high at 0 = (5*r[0] - 11*r[1] + 4*r[2] + 4*r[3] - r[4] - r[5] + 4) >> 3
;   high at y = r[y] - r[y+1]
;               + ((r[y+2] + r[y+3] - r[y-2] - r[y-1] + 4) >> 3)
;   high last = (11*r[y] - 5*r[y+1] - 4*r[y-1] - 4*r[y-2]
;                + r[y-3] + r[y-4] + 4) >> 3
;
; y advances in steps of 2 over the input rows. High-band results are
; saturated to signed 16 bits by packssdw.
;
; Registers (names from the cglobal line):
;   istrideq  input row step, doubled on entry (bytes)
;   lwidthq / hwidthq  used unscaled and multiplied by the even input row
;             index y, which equals output row y/2 times a byte pitch of
;             2 * lwidth -- presumably lwidth/hwidth are in int16 elements;
;             TODO confirm against the C caller
;   widthd    doubled on entry: column loop bound in bytes
;   heightd   reduced by 2 on entry: the last row pair is handled separately
;   xq        column byte offset, yq = input row index, posq = scratch offset
;   m7 = pd_4, m8 = pw_1, m9 = pw_m1, m10 = pw_p1_n1, m11 = pw_n1_p1,
;   m12 = pw_4, m13 = pw_n4
;
; In every 6-row block, punpcklwd builds (rowA,rowB) word pairs for the low
; four columns, while punpckhwd with the saved copy in m6 builds (rowB,rowA)
; pairs for the high four columns. This is why asymmetric coefficients come
; in two mirrored constants (e.g. pw_p5_n11 / pw_n11_p5).
;------------------------------------------------------------------------------
%if ARCH_X86_64
INIT_XMM sse2
cglobal cfhdenc_vert_filter, 8, 11, 14, input, low, high, istride, lwidth, hwidth, width, height, x, y, pos
    shl  istrideq, 1                    ; input stride: elements -> bytes

    shl    widthd, 1                    ; column bound in bytes (32-bit op clears the upper half)
    sub   heightd, 2                    ; interior loop stops before the last row pair

    xor        xq, xq

    mova       m7, [pd_4]
    mova       m8, [pw_1]
    mova       m9, [pw_m1]
    mova       m10,[pw_p1_n1]
    mova       m11,[pw_n1_p1]
    mova       m12,[pw_4]
    mova       m13,[pw_n4]
.loopw:
    mov        yq, 2

    ; ---- top edge, low band: r[0] + r[1] ----
    mov      posq, xq
    movu       m0, [inputq + posq]
    add      posq, istrideq
    movu       m1, [inputq + posq]

    paddsw     m0, m1

    movu    [lowq + xq], m0

    ; ---- top edge, high band: load rows 0..5 into m0..m5 ----
    mov      posq, xq

    movu       m0, [inputq + posq]
    add      posq, istrideq
    movu       m1, [inputq + posq]
    add      posq, istrideq
    movu       m2, [inputq + posq]
    add      posq, istrideq
    movu       m3, [inputq + posq]
    add      posq, istrideq
    movu       m4, [inputq + posq]
    add      posq, istrideq
    movu       m5, [inputq + posq]

    mova       m6, m0
    punpcklwd  m0, m1                   ; (r0,r1) pairs, low columns
    punpckhwd  m1, m6                   ; (r1,r0) pairs, high columns

    mova       m6, m2
    punpcklwd  m2, m3
    punpckhwd  m3, m6

    mova       m6, m4
    punpcklwd  m4, m5
    punpckhwd  m5, m6

    pmaddwd    m0, [pw_p5_n11]          ;  5*r0 - 11*r1
    pmaddwd    m1, [pw_n11_p5]          ; same, operands swapped
    pmaddwd    m2, m12                  ;  4*(r2 + r3)
    pmaddwd    m3, m12
    pmaddwd    m4, m9                   ; -(r4 + r5)
    pmaddwd    m5, m9

    paddd      m0, m2
    paddd      m1, m3
    paddd      m0, m4
    paddd      m1, m5

    paddd      m0, m7                   ; + 4
    paddd      m1, m7

    psrad      m0, 3
    psrad      m1, 3
    packssdw   m0, m1

    movu   [highq + xq], m0

.looph:

    ; ---- interior, low band: r[y] + r[y+1] ----
    mov      posq, istrideq
    imul     posq, yq
    add      posq, xq

    movu       m0, [inputq + posq]

    add      posq, istrideq
    movu       m1, [inputq + posq]

    paddsw     m0, m1

    mov      posq, lwidthq
    imul     posq, yq
    add      posq, xq

    movu    [lowq + posq], m0

    ; ---- interior, high band: load rows y-2 .. y+3 into m0..m5 ----
    add        yq, -2                   ; temporarily step back two rows

    mov      posq, istrideq
    imul     posq, yq
    add      posq, xq

    movu       m0, [inputq + posq]
    add      posq, istrideq
    movu       m1, [inputq + posq]
    add      posq, istrideq
    movu       m2, [inputq + posq]
    add      posq, istrideq
    movu       m3, [inputq + posq]
    add      posq, istrideq
    movu       m4, [inputq + posq]
    add      posq, istrideq
    movu       m5, [inputq + posq]

    add        yq, 2                    ; restore y

    mova       m6, m0
    punpcklwd  m0, m1
    punpckhwd  m1, m6

    mova       m6, m2
    punpcklwd  m2, m3
    punpckhwd  m3, m6

    mova       m6, m4
    punpcklwd  m4, m5
    punpckhwd  m5, m6

    pmaddwd    m0, m9                   ; -(r[y-2] + r[y-1])
    pmaddwd    m1, m9
    pmaddwd    m2, m10                  ;   r[y] - r[y+1]   (r[y], r[y+1] order)
    pmaddwd    m3, m11                  ;   r[y] - r[y+1]   (r[y+1], r[y] order)
    pmaddwd    m4, m8                   ;   r[y+2] + r[y+3]
    pmaddwd    m5, m8

    paddd      m0, m4
    paddd      m1, m5

    paddd      m0, m7                   ; + 4
    paddd      m1, m7

    psrad      m0, 3
    psrad      m1, 3
    paddd      m0, m2                   ; difference term is added after the shift
    paddd      m1, m3
    packssdw   m0, m1

    mov      posq, hwidthq
    imul     posq, yq
    add      posq, xq

    movu   [highq + posq], m0

    add        yq, 2
    cmp        yq, heightq
    jl .looph

    ; ---- bottom edge, low band: r[y] + r[y+1] at the exit value of y ----
    mov      posq, istrideq
    imul     posq, yq
    add      posq, xq

    movu       m0, [inputq + posq]
    add      posq, istrideq
    movu       m1, [inputq + posq]

    paddsw     m0, m1

    mov      posq, lwidthq
    imul     posq, yq
    add      posq, xq

    movu    [lowq + posq], m0

    ; ---- bottom edge, high band: load rows y-4 .. y+1 into m0..m5 ----
    sub        yq, 4                    ; temporarily step back four rows

    mov      posq, istrideq
    imul     posq, yq
    add      posq, xq

    movu       m0, [inputq + posq]
    add      posq, istrideq
    movu       m1, [inputq + posq]
    add      posq, istrideq
    movu       m2, [inputq + posq]
    add      posq, istrideq
    movu       m3, [inputq + posq]
    add      posq, istrideq
    movu       m4, [inputq + posq]
    add      posq, istrideq
    movu       m5, [inputq + posq]

    add        yq, 4                    ; restore y

    mova       m6, m0
    punpcklwd  m0, m1
    punpckhwd  m1, m6

    mova       m6, m2
    punpcklwd  m2, m3
    punpckhwd  m3, m6

    mova       m6, m4
    punpcklwd  m4, m5
    punpckhwd  m5, m6

    pmaddwd    m0, m8                   ;   r[y-4] + r[y-3]
    pmaddwd    m1, m8
    pmaddwd    m2, m13                  ; -4*(r[y-2] + r[y-1])
    pmaddwd    m3, m13
    pmaddwd    m4, [pw_p11_n5]          ;  11*r[y] - 5*r[y+1]
    pmaddwd    m5, [pw_n5_p11]          ; same, operands swapped

    paddd      m4, m2
    paddd      m5, m3

    paddd      m4, m0
    paddd      m5, m1

    paddd      m4, m7                   ; + 4
    paddd      m5, m7

    psrad      m4, 3
    psrad      m5, 3
    packssdw   m4, m5

    mov      posq, hwidthq
    imul     posq, yq
    add      posq, xq

    movu   [highq + posq], m4

    ; Next strip of 8 columns.
    add        xq, mmsize
    cmp        xq, widthq
    jl .loopw
    RET
%endif