; *****************************************************************************
; * Provide SIMD optimizations for add_residual functions for HEVC decoding
; * Copyright (c) 2014 Pierre-Edouard LEPERE
; *
; * This file is part of FFmpeg.
; *
; * FFmpeg is free software; you can redistribute it and/or
; * modify it under the terms of the GNU Lesser General Public
; * License as published by the Free Software Foundation; either
; * version 2.1 of the License, or (at your option) any later version.
; *
; * FFmpeg is distributed in the hope that it will be useful,
; * but WITHOUT ANY WARRANTY; without even the implied warranty of
; * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
; * Lesser General Public License for more details.
; *
; * You should have received a copy of the GNU Lesser General Public
; * License along with FFmpeg; if not, write to the Free Software
; * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
; ******************************************************************************

%include "libavutil/x86/x86util.asm"

SECTION .text

cextern pw_1023
%define max_pixels_10 pw_1023

; Register roles shared by every function in this file (taken from the C
; prototypes quoted above each entry point):
;   r0 = dst    (pixels, updated in place)
;   r1 = res    (int16_t residuals, read sequentially)
;   r2 = stride (byte distance between two dst rows)
; r3, where used, holds 3*stride; r4d, where used, is the loop counter.
; dst and res are accessed with aligned moves (mova) in the 16/32 wide and
; all 10-bit paths.

; the add_res macros and functions were largely inspired by h264_idct.asm from the x264 project

; Add residuals to two rows of 4 8-bit pixels.
; In:    r0 = dst, r1 = res, r2 = stride, m4 = 0
; Reads: 2 x 4 int16 at [r1] and [r1+8]
; Clobb: m0-m3
%macro ADD_RES_MMX_4_8 0
    mova              m0, [r1]
    mova              m2, [r1+8]

    movd              m1, [r0]
    movd              m3, [r0+r2]
    punpcklbw         m1, m4            ; zero-extend pixels to 16 bit
    punpcklbw         m3, m4

    paddsw            m0, m1
    paddsw            m2, m3
    packuswb          m0, m4            ; unsigned saturation clips to [0,255]
    packuswb          m2, m4

    movd            [r0], m0
    movd         [r0+r2], m2
%endmacro


INIT_MMX mmxext
; void ff_hevc_add_residual_4_8_mmxext(uint8_t *dst, int16_t *res, ptrdiff_t stride)
cglobal hevc_add_residual_4_8, 3, 3, 6
    pxor              m4, m4
    ADD_RES_MMX_4_8                     ; rows 0-1
    add               r1, 16
    lea               r0, [r0+r2*2]
    ADD_RES_MMX_4_8                     ; rows 2-3
    RET

; Add residuals to four rows of 8 8-bit pixels.
; In:    r0 = dst, r1 = res, r2 = stride, r3 = 3*stride, m4 = 0
; Reads: 4 x 8 int16 at [r1] .. [r1+63]
; Clobb: m0-m3, m6, m7
%macro ADD_RES_SSE_8_8 0
    movq              m0, [r0]
    movq              m1, [r0+r2]
    punpcklbw         m0, m4
    punpcklbw         m1, m4
    mova              m2, [r1]
    mova              m3, [r1+16]
    paddsw            m0, m2
    paddsw            m1, m3
    packuswb          m0, m1            ; low half = row 0, high half = row 1

    movq              m2, [r0+r2*2]
    movq              m3, [r0+r3]
    punpcklbw         m2, m4
    punpcklbw         m3, m4
    mova              m6, [r1+32]
    mova              m7, [r1+48]
    paddsw            m2, m6
    paddsw            m3, m7
    packuswb          m2, m3            ; low half = row 2, high half = row 3

    movq            [r0], m0
    movhps       [r0+r2], m0
    movq       [r0+r2*2], m2
    movhps       [r0+r3], m2
%endmacro

; Add residuals to two mmsize-byte vectors of 8-bit pixels.
; Args:  %1 = byte offset of the residuals relative to r1
;        %2 = address of the first  dst vector
;        %3 = address of the second dst vector
; In:    r1 = res, m0 = 0
; Reads: 2*mmsize int16 per vector, the second vector's starting at
;        r1+%1+mmsize*2
; Clobb: m1-m6
; With avx2 the residuals are loaded 128 bits at a time so that each register
; half lines up with the in-lane result of punpck{l,h}bw / packuswb.
%macro ADD_RES_SSE_16_32_8 3
    mova              m1, [%2]
    mova              m2, m1
    punpcklbw         m1, m0
    punpckhbw         m2, m0
    mova             xm5, [r1+%1]
    mova             xm6, [r1+%1+16]
%if cpuflag(avx2)
    vinserti128       m5, m5, [r1+%1+32], 1
    vinserti128       m6, m6, [r1+%1+48], 1
%endif
    paddsw            m1, m5
    paddsw            m2, m6

    mova              m3, [%3]
    mova              m4, m3
    punpcklbw         m3, m0
    punpckhbw         m4, m0
    mova             xm5, [r1+%1+mmsize*2]
    mova             xm6, [r1+%1+mmsize*2+16]
%if cpuflag(avx2)
    vinserti128       m5, m5, [r1+%1+96], 1
    vinserti128       m6, m6, [r1+%1+112], 1
%endif
    paddsw            m3, m5
    paddsw            m4, m6

    packuswb          m1, m2
    packuswb          m3, m4
    mova            [%2], m1
    mova            [%3], m3
%endmacro


; Instantiates the 8x8, 16x16 and 32x32 8-bit functions for the instruction
; set selected by the preceding INIT_XMM.
%macro TRANSFORM_ADD_8 0
; void ff_hevc_add_residual_8_8_<opt>(uint8_t *dst, int16_t *res, ptrdiff_t stride)
cglobal hevc_add_residual_8_8, 3, 4, 8
    pxor              m4, m4
    lea               r3, [r2*3]
    ADD_RES_SSE_8_8                     ; rows 0-3
    add               r1, 64
    lea               r0, [r0+r2*4]
    ADD_RES_SSE_8_8                     ; rows 4-7
    RET

; void ff_hevc_add_residual_16_8_<opt>(uint8_t *dst, int16_t *res, ptrdiff_t stride)
cglobal hevc_add_residual_16_8, 3, 5, 7
    pxor                m0, m0
    lea                 r3, [r2*3]
    mov                r4d, 4           ; 4 iterations x 4 rows
.loop:
    ADD_RES_SSE_16_32_8  0, r0,      r0+r2
    ADD_RES_SSE_16_32_8 64, r0+r2*2, r0+r3
    add                 r1, 128
    lea                 r0, [r0+r2*4]
    dec                r4d
    jg .loop
    RET

; void ff_hevc_add_residual_32_8_<opt>(uint8_t *dst, int16_t *res, ptrdiff_t stride)
cglobal hevc_add_residual_32_8, 3, 5, 7
    pxor                m0, m0
    mov                r4d, 16          ; 16 iterations x 2 rows
.loop:
    ADD_RES_SSE_16_32_8  0, r0,    r0+16
    ADD_RES_SSE_16_32_8 64, r0+r2, r0+r2+16
    add                 r1, 128
    lea                 r0, [r0+r2*2]
    dec                r4d
    jg .loop
    RET
%endmacro

INIT_XMM sse2
TRANSFORM_ADD_8
INIT_XMM avx
TRANSFORM_ADD_8

%if HAVE_AVX2_EXTERNAL
INIT_YMM avx2
; void ff_hevc_add_residual_32_8_avx2(uint8_t *dst, int16_t *res, ptrdiff_t stride)
cglobal hevc_add_residual_32_8, 3, 5, 7
    pxor                 m0, m0
    lea                  r3, [r2*3]
    mov                 r4d, 8          ; 8 iterations x 4 rows
.loop:
    ADD_RES_SSE_16_32_8   0, r0,      r0+r2
    ADD_RES_SSE_16_32_8 128, r0+r2*2, r0+r3
    add                  r1, 256
    lea                  r0, [r0+r2*4]
    dec                 r4d
    jg .loop
    RET
%endif ;HAVE_AVX2_EXTERNAL

; ---------------------------------------------------------------------------
; 10-bit helpers
;
; All of them compute clip(dst + res, lo, hi) on 16-bit lanes, where the
; caller provides lo (zero) and hi (max_pixels_10) in registers.
; The addition is saturating (paddsw), like in the 8-bit paths: a wrapping
; paddw would turn a sum above 32767 into a negative value, which the signed
; clip would then map to 0 instead of the maximum pixel value.
; ---------------------------------------------------------------------------

; Four rows of 8 pixels.
; Args:  %1 = dst, %2 = stride, %3 = 3*stride, %4 = res
; In:    m4 = lower clip bound, m5 = upper clip bound
; Clobb: m0-m3
%macro ADD_RES_SSE_8_10 4
    mova              m0, [%4]
    mova              m1, [%4+16]
    mova              m2, [%4+32]
    mova              m3, [%4+48]
    paddsw            m0, [%1+0]
    paddsw            m1, [%1+%2]
    paddsw            m2, [%1+%2*2]
    paddsw            m3, [%1+%3]
    CLIPW             m0, m4, m5
    CLIPW             m1, m4, m5
    CLIPW             m2, m4, m5
    CLIPW             m3, m4, m5
    mova          [%1+0], m0
    mova         [%1+%2], m1
    mova       [%1+%2*2], m2
    mova         [%1+%3], m3
%endmacro

; Two rows of 4 pixels.
; Args:  %1 = dst, %2 = stride, %3 = res
; In:    m2 = lower clip bound, m3 = upper clip bound
; Clobb: m0, m1
%macro ADD_RES_MMX_4_10 3
    mova              m0, [%1+0]
    mova              m1, [%1+%2]
    paddsw            m0, [%3]
    paddsw            m1, [%3+8]
    CLIPW             m0, m2, m3
    CLIPW             m1, m2, m3
    mova          [%1+0], m0
    mova         [%1+%2], m1
%endmacro

; Two rows of 16 pixels (two 16-byte vectors per row).
; Args:  %1 = dst, %2 = stride, %3 = res
; In:    m4 = lower clip bound, m5 = upper clip bound
; Clobb: m0-m3
%macro ADD_RES_SSE_16_10 3
    mova              m0, [%3]
    mova              m1, [%3+16]
    mova              m2, [%3+32]
    mova              m3, [%3+48]
    paddsw            m0, [%1]
    paddsw            m1, [%1+16]
    paddsw            m2, [%1+%2]
    paddsw            m3, [%1+%2+16]
    CLIPW             m0, m4, m5
    CLIPW             m1, m4, m5
    CLIPW             m2, m4, m5
    CLIPW             m3, m4, m5
    mova            [%1], m0
    mova         [%1+16], m1
    mova         [%1+%2], m2
    mova      [%1+%2+16], m3
%endmacro

; One row of 32 pixels (four 16-byte vectors).
; Args:  %1 = dst, %2 = res
; In:    m4 = lower clip bound, m5 = upper clip bound
; Clobb: m0-m3
%macro ADD_RES_SSE_32_10 2
    mova              m0, [%2]
    mova              m1, [%2+16]
    mova              m2, [%2+32]
    mova              m3, [%2+48]

    paddsw            m0, [%1]
    paddsw            m1, [%1+16]
    paddsw            m2, [%1+32]
    paddsw            m3, [%1+48]
    CLIPW             m0, m4, m5
    CLIPW             m1, m4, m5
    CLIPW             m2, m4, m5
    CLIPW             m3, m4, m5
    mova            [%1], m0
    mova         [%1+16], m1
    mova         [%1+32], m2
    mova         [%1+48], m3
%endmacro

; Four rows of 16 pixels (one 32-byte vector per row).
; Args:  %1 = dst, %2 = stride, %3 = 3*stride, %4 = res
; In:    m4 = lower clip bound, m5 = upper clip bound
; Clobb: m0-m3
%macro ADD_RES_AVX2_16_10 4
    mova              m0, [%4]
    mova              m1, [%4+32]
    mova              m2, [%4+64]
    mova              m3, [%4+96]

    paddsw            m0, [%1+0]
    paddsw            m1, [%1+%2]
    paddsw            m2, [%1+%2*2]
    paddsw            m3, [%1+%3]

    CLIPW             m0, m4, m5
    CLIPW             m1, m4, m5
    CLIPW             m2, m4, m5
    CLIPW             m3, m4, m5
    mova          [%1+0], m0
    mova         [%1+%2], m1
    mova       [%1+%2*2], m2
    mova         [%1+%3], m3
%endmacro

; Two rows of 32 pixels (two 32-byte vectors per row).
; Args:  %1 = dst, %2 = stride, %3 = res
; In:    m4 = lower clip bound, m5 = upper clip bound
; Clobb: m0-m3
%macro ADD_RES_AVX2_32_10 3
    mova              m0, [%3]
    mova              m1, [%3+32]
    mova              m2, [%3+64]
    mova              m3, [%3+96]

    paddsw            m0, [%1]
    paddsw            m1, [%1+32]
    paddsw            m2, [%1+%2]
    paddsw            m3, [%1+%2+32]

    CLIPW             m0, m4, m5
    CLIPW             m1, m4, m5
    CLIPW             m2, m4, m5
    CLIPW             m3, m4, m5

    mova            [%1], m0
    mova         [%1+32], m1
    mova         [%1+%2], m2
    mova      [%1+%2+32], m3
%endmacro

; void ff_hevc_add_residual_<4|8|16|32>_10(pixel *dst, int16_t *block, ptrdiff_t stride)
INIT_MMX mmxext
cglobal hevc_add_residual_4_10, 3, 3, 6
    pxor              m2, m2
    mova              m3, [max_pixels_10]
    ADD_RES_MMX_4_10  r0, r2, r1        ; rows 0-1
    add               r1, 16
    lea               r0, [r0+2*r2]
    ADD_RES_MMX_4_10  r0, r2, r1        ; rows 2-3
    RET

INIT_XMM sse2
cglobal hevc_add_residual_8_10, 3, 4, 6
    pxor              m4, m4
    mova              m5, [max_pixels_10]
    lea               r3, [r2*3]

    ADD_RES_SSE_8_10  r0, r2, r3, r1    ; rows 0-3
    lea               r0, [r0+r2*4]
    add               r1, 64
    ADD_RES_SSE_8_10  r0, r2, r3, r1    ; rows 4-7
    RET

cglobal hevc_add_residual_16_10, 3, 5, 6
    pxor              m4, m4
    mova              m5, [max_pixels_10]

    mov              r4d, 8             ; 8 iterations x 2 rows
.loop:
    ADD_RES_SSE_16_10 r0, r2, r1
    lea               r0, [r0+r2*2]
    add               r1, 64
    dec              r4d
    jg .loop
    RET

cglobal hevc_add_residual_32_10, 3, 5, 6
    pxor              m4, m4
    mova              m5, [max_pixels_10]

    mov              r4d, 32            ; 32 iterations x 1 row
.loop:
    ADD_RES_SSE_32_10 r0, r1
    lea               r0, [r0+r2]
    add               r1, 64
    dec              r4d
    jg .loop
    RET

%if HAVE_AVX2_EXTERNAL
INIT_YMM avx2
cglobal hevc_add_residual_16_10, 3, 5, 6
    pxor               m4, m4
    mova               m5, [max_pixels_10]
    lea                r3, [r2*3]

    mov               r4d, 4            ; 4 iterations x 4 rows
.loop:
    ADD_RES_AVX2_16_10 r0, r2, r3, r1
    lea                r0, [r0+r2*4]
    add                r1, 128
    dec               r4d
    jg .loop
    RET

cglobal hevc_add_residual_32_10, 3, 5, 6
    pxor               m4, m4
    mova               m5, [max_pixels_10]

    mov               r4d, 16           ; 16 iterations x 2 rows
.loop:
    ADD_RES_AVX2_32_10 r0, r2, r1
    lea                r0, [r0+r2*2]
    add                r1, 128
    dec               r4d
    jg .loop
    RET
%endif ;HAVE_AVX2_EXTERNAL