/*
 * ARM NEON optimised IDCT functions for HEVC decoding
 * Copyright (c) 2014 Seppo Tomperi <seppo.tomperi@vtt.fi>
 * Copyright (c) 2017 Alexandra Hájková
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/arm/asm.S"

@ Signed 16-bit multiplier constants for the inverse transforms.
@ Rows 0-1 (64, 83, 64, 36 / 89, 75, 50, 18) are fetched through r1 by
@ tr_4x4_8, idct_4x4 loads row 0 into d4 for tr_4x4, and tr_16x4 loads
@ rows 2-3 from trans + 16.  The last four rows are not referenced in the
@ part of the file reviewed here.
const trans, align=4
        .short 64, 83, 64, 36
        .short 89, 75, 50, 18
        .short 90, 87, 80, 70
        .short 57, 43, 25, 9
        .short 90, 90, 88, 85
        .short 82, 78, 73, 67
        .short 61, 54, 46, 38
        .short 31, 22, 13, 4
endconst

@ Clamp every signed 16-bit lane of in1 and in2 to [c1, c2]: vmax against
@ c1 first, then vmin against c2.  All callers below pass c1 = 0 and
@ c2 = 0x3FF.
.macro clip10 in1, in2, c1, c2
        vmax.s16        \in1, \in1, \c1
        vmax.s16        \in2, \in2, \c1
        vmin.s16        \in1, \in1, \c2
        vmin.s16        \in2, \in2, \c2
.endm

@ Add a 4x4 block of 16-bit residuals to 8-bit pixels, in place.
@   r0 = pixel block, read and written back (4 rows of 4 bytes)
@   r1 = 16 residuals of 16 bits each (loaded with a 128-bit alignment hint)
@   r2 = distance in bytes between pixel rows
@ NOTE(review): presumably called from C as (dst, residuals, stride);
@ confirm against the C prototype.
function ff_hevc_add_residual_4x4_8_neon, export=1
        vld1.16         {q0-q1}, [r1, :128]
        vld1.32         d4[0], [r0, :32], r2
        vld1.32         d4[1], [r0, :32], r2
        vld1.32         d5[0], [r0, :32], r2
        vld1.32         d5[1], [r0, :32], r2
        sub             r0, r0, r2, lsl #2      @ rewind r0 to the first row
        vmovl.u8        q8, d4                  @ widen pixels to 16 bits
        vmovl.u8        q9, d5
        vqadd.s16       q0, q0, q8
        vqadd.s16       q1, q1, q9
        vqmovun.s16     d0, q0                  @ saturate to unsigned 8 bits
        vqmovun.s16     d1, q1
        vst1.32         d0[0], [r0, :32], r2
        vst1.32         d0[1], [r0, :32], r2
        vst1.32         d1[0], [r0, :32], r2
        vst1.32         d1[1], [r0, :32], r2
        bx              lr
endfunc

@ Add a 4x4 block of 16-bit residuals to 16-bit samples, in place, and clamp
@ each sum to [0, 0x3FF].
@   r0 = sample block (4 rows of 4 halfwords), r1 = residuals,
@   r2 = distance in bytes between rows.
@ r12 is a copy of r0 used for the loads; r0 itself is used for the stores.
function ff_hevc_add_residual_4x4_10_neon, export=1
        mov             r12, r0
        vld1.16         {q0-q1}, [r1, :128]
        vld1.16         d4, [r12, :64], r2
        vld1.16         d5, [r12, :64], r2
        vld1.16         d6, [r12, :64], r2
        vqadd.s16       q0, q2
        vld1.16         d7, [r12, :64], r2
        vmov.s16        q12, #0
        vqadd.s16       q1, q3
        vmvn.s16        q13, #0xFC00 @ vmov.s16 #0x3FF
        clip10          q0, q1, q12, q13
        vst1.16         d0, [r0, :64], r2
        vst1.16         d1, [r0, :64], r2
        vst1.16         d2, [r0, :64], r2
        vst1.16         d3, [r0, :64], r2
        bx              lr
endfunc

@ 8x8 variant for 8-bit pixels (same register roles as the 4x4 version).
@ Two rows per iteration: r0 walks the even rows, r12 = r0 + r2 the odd
@ rows, and r2 is doubled so each pointer skips the row owned by the other.
@ r3 counts the rows still to do.
function ff_hevc_add_residual_8x8_8_neon, export=1
        add             r12, r0, r2
        add             r2,  r2, r2
        mov             r3,   #8
1:      subs            r3,   #2
        vld1.8          {d16},   [r0,  :64]
        vld1.8          {d17},   [r12, :64]
        vmovl.u8        q9,   d16
        vld1.16         {q0-q1}, [r1,  :128]!
        vmovl.u8        q8,   d17
        vqadd.s16       q0,   q9
        vqadd.s16       q1,   q8
        vqmovun.s16     d0,   q0
        vqmovun.s16     d1,   q1
        vst1.8          d0,   [r0,  :64], r2
        vst1.8          d1,   [r12, :64], r2
        bne             1b
        bx              lr
endfunc

@ 8x8 variant for 16-bit samples, clamped to [0, 0x3FF].  Two rows per
@ iteration, using the same r0 / r12 / doubled-r2 scheme as the 8-bit 8x8
@ function; q12 and q13 hold the clamp bounds for the whole loop.
function ff_hevc_add_residual_8x8_10_neon, export=1
        add             r12, r0, r2
        add             r2,  r2, r2
        mov             r3,  #8
        vmov.s16        q12, #0
        vmvn.s16        q13, #0xFC00 @ vmov.s16 #0x3FF
1:      subs            r3,  #2
        vld1.16         {q0-q1}, [r1, :128]!
        vld1.16         {q8},    [r0, :128]
        vqadd.s16       q0, q8
        vld1.16         {q9},    [r12, :128]
        vqadd.s16       q1, q9
        clip10          q0, q1, q12, q13
        vst1.16         {q0}, [r0, :128], r2
        vst1.16         {q1}, [r12, :128], r2
        bne             1b
        bx              lr
endfunc

@ 16x16 variant for 8-bit pixels.  Two rows of 16 pixels per iteration
@ (r0 = even rows, r12 = odd rows, r2 doubled); each row consumes 16
@ residuals from r1, which is post-incremented.
function ff_hevc_add_residual_16x16_8_neon, export=1
        mov             r3,  #16
        add             r12, r0, r2
        add             r2,  r2, r2
1:      subs            r3,  #2
        vld1.8          {q8},     [r0, :128]
        vld1.16         {q0, q1}, [r1, :128]!
        vld1.8          {q11},    [r12, :128]
        vld1.16         {q2, q3}, [r1, :128]!
        vmovl.u8        q9,  d16
        vmovl.u8        q10, d17
        vmovl.u8        q12, d22
        vmovl.u8        q13, d23
        vqadd.s16       q0,  q9
        vqadd.s16       q1,  q10
        vqadd.s16       q2,  q12
        vqadd.s16       q3,  q13
        vqmovun.s16     d0,  q0
        vqmovun.s16     d1,  q1
        vqmovun.s16     d2,  q2
        vqmovun.s16     d3,  q3
        vst1.8          {q0},     [r0, :128], r2
        vst1.8          {q1},     [r12, :128], r2
        bne             1b
        bx              lr
endfunc

@ 16x16 variant for 16-bit samples, clamped to [0, 0x3FF].  Two rows of 16
@ samples (two q registers each) per iteration.
function ff_hevc_add_residual_16x16_10_neon, export=1
        mov             r3,  #16
        vmov.s16        q12, #0
        vmvn.s16        q13, #0xFC00 @ vmov.s16 #0x3FF
        add             r12, r0, r2
        add             r2,  r2, r2
1:      subs            r3,  #2
        vld1.16         {q8-q9},   [r0, :128]
        vld1.16         {q0, q1},  [r1, :128]!
        vqadd.s16       q0, q8
        vld1.16         {q10-q11}, [r12, :128]
        vqadd.s16       q1, q9
        vld1.16         {q2, q3},  [r1, :128]!
        vqadd.s16       q2, q10
        vqadd.s16       q3, q11
        clip10          q0, q1, q12, q13
        clip10          q2, q3, q12, q13
        vst1.16         {q0-q1},   [r0, :128], r2
        vst1.16         {q2-q3},   [r12, :128], r2
        bne             1b
        bx              lr
endfunc

@ 32x32 variant for 8-bit pixels.  Two rows of 32 pixels per iteration.
@ The 64 residuals needed per iteration occupy q0-q7, so q4-q7 are saved
@ on entry and restored before returning.
function ff_hevc_add_residual_32x32_8_neon, export=1
        vpush           {q4-q7}
        add             r12, r0, r2
        add             r2,  r2, r2
        mov             r3,  #32
1:      subs            r3,  #2
        vld1.8          {q12, q13}, [r0,  :128]
        vmovl.u8        q8,  d24
        vmovl.u8        q9,  d25
        vld1.8          {q14, q15}, [r12, :128]
        vmovl.u8        q10, d26
        vmovl.u8        q11, d27
        vmovl.u8        q12, d28
        vldm            r1!, {q0-q7}
        vmovl.u8        q13, d29
        vmovl.u8        q14, d30
        vmovl.u8        q15, d31
        vqadd.s16       q0,  q8
        vqadd.s16       q1,  q9
        vqadd.s16       q2,  q10
        vqadd.s16       q3,  q11
        vqadd.s16       q4,  q12
        vqadd.s16       q5,  q13
        vqadd.s16       q6,  q14
        vqadd.s16       q7,  q15
        vqmovun.s16     d0,  q0
        vqmovun.s16     d1,  q1
        vqmovun.s16     d2,  q2
        vqmovun.s16     d3,  q3
        vqmovun.s16     d4,  q4
        vqmovun.s16     d5,  q5
        vst1.8          {q0, q1}, [r0, :128], r2
        vqmovun.s16     d6,  q6
        vqmovun.s16     d7,  q7
        vst1.8          {q2, q3}, [r12, :128], r2
        bne             1b
        vpop            {q4-q7}
        bx              lr
endfunc

@ 32x32 variant for 16-bit samples, clamped to [0, 0x3FF].  One row per
@ iteration: r0 addresses the first 16 samples of the row and r12 = r0 + 32
@ the second 16; both advance by the (undoubled) row distance in r2.
function ff_hevc_add_residual_32x32_10_neon, export=1
        mov             r3,  #32
        add             r12, r0, #32
        vmov.s16        q12, #0
        vmvn.s16        q13, #0xFC00 @ vmov.s16 #0x3FF
1:      subs            r3,  #1
        vldm            r1!, {q0-q3}
        vld1.16         {q8, q9},   [r0, :128]
        vld1.16         {q10, q11}, [r12, :128]
        vqadd.s16       q0, q8
        vqadd.s16       q1, q9
        vqadd.s16       q2, q10
        vqadd.s16       q3, q11
        clip10          q0, q1, q12, q13
        clip10          q2, q3, q12, q13
        vst1.16         {q0-q1},   [r0, :128], r2
        vst1.16         {q2-q3},   [r12, :128], r2
        bne             1b
        bx              lr
endfunc

@ DC-only inverse transforms.  Each macro emits one function per bit depth.
@ The function reads the signed 16-bit value c at [r0], computes
@     (((c + 1) >> 1) + (1 << (13 - bitdepth))) >> (14 - bitdepth)
@ and overwrites the coefficient block at r0 with that value: 16 halfwords
@ for 4x4, 64 for 8x8, 256 for 16x16 and 1024 for 32x32.
@ Clobbers r1, r2 (and r3 in the 32x32 version) plus the q registers used.
.macro idct_4x4_dc bitdepth
function ff_hevc_idct_4x4_dc_\bitdepth\()_neon, export=1
        ldrsh           r1, [r0]
        ldr             r2, =(1 << (13 - \bitdepth))
        add             r1, #1
        asr             r1, #1
        add             r1, r2
        asr             r1, #(14 - \bitdepth)
        vdup.16         q0, r1
        vdup.16         q1, r1
        vst1.16         {q0, q1}, [r0, :128]
        bx              lr
endfunc
.endm

.macro idct_8x8_dc bitdepth
function ff_hevc_idct_8x8_dc_\bitdepth\()_neon, export=1
        ldrsh           r1, [r0]
        ldr             r2, =(1 << (13 - \bitdepth))
        add             r1, #1
        asr             r1, #1
        add             r1, r2
        asr             r1, #(14 - \bitdepth)
        vdup.16         q8, r1
        vdup.16         q9, r1
        vmov.16         q10, q8
        vmov.16         q11, q8
        vmov.16         q12, q8
        vmov.16         q13, q8
        vmov.16         q14, q8
        vmov.16         q15, q8
        vstm            r0, {q8-q15}
        bx              lr
endfunc
.endm

.macro idct_16x16_dc bitdepth
function ff_hevc_idct_16x16_dc_\bitdepth\()_neon, export=1
        ldrsh           r1, [r0]
        ldr             r2, =(1 << (13 - \bitdepth))
        add             r1, #1
        asr             r1, #1
        add             r1, r2
        asr             r1, #(14 - \bitdepth)
        vdup.16         q8, r1
        vdup.16         q9, r1
        vmov.16         q10, q8
        vmov.16         q11, q8
        vmov.16         q12, q8
        vmov.16         q13, q8
        vmov.16         q14, q8
        vmov.16         q15, q8
        vstm            r0!, {q8-q15}
        vstm            r0!, {q8-q15}
        vstm            r0!, {q8-q15}
        vstm            r0, {q8-q15}
        bx              lr
endfunc
.endm

.macro idct_32x32_dc bitdepth
function ff_hevc_idct_32x32_dc_\bitdepth\()_neon, export=1
        ldrsh           r1, [r0]
        ldr             r2, =(1 << (13 - \bitdepth))
        add             r1, #1
        asr             r1, #1
        add             r1, r2
        asr             r1, #(14 - \bitdepth)
        mov             r3, #16
        vdup.16         q8, r1
        vdup.16         q9, r1
        vmov.16         q10, q8
        vmov.16         q11, q8
        vmov.16         q12, q8
        vmov.16         q13, q8
        vmov.16         q14, q8
        vmov.16         q15, q8
1:      subs            r3, #1
        vstm            r0!, {q8-q15}
        bne             1b
        bx              lr
endfunc
.endm

@ Widening multiply-accumulate with a selectable sign:
@ out += in * c when op is "+", out -= in * c for any other op.
.macro sum_sub out, in, c, op
  .ifc \op, +
        vmlal.s16       \out, \in, \c
  .else
        vmlsl.s16       \out, \in, \c
  .endif
.endm

@ 4-point transform of four d-register inputs using the constants in d4
@ (idct_4x4 loads the first row of trans there).
@   e0 = (in0 << 6) + in2 * d4[0]      e1 = (in0 << 6) - in2 * d4[0]
@   o0 = in1 * d4[1] + in3 * d4[3]     o1 = in1 * d4[3] - in3 * d4[1]
@   out0 = e0 + o0, out1 = e1 + o1, out2 = e1 - o1, out3 = e0 - o0
@ Each output is narrowed to 16 bits with a rounding, saturating right
@ shift by the shift argument.  tmp0-tmp4 are q-register scratch.
.macro tr_4x4 in0, in1, in2, in3, out0, out1, out2, out3, shift, tmp0, tmp1, tmp2, tmp3, tmp4
         vshll.s16      \tmp0, \in0, #6
         vmull.s16      \tmp2, \in1, d4[1]
         vmov           \tmp1, \tmp0
         vmull.s16      \tmp3, \in1, d4[3]
         vmlal.s16      \tmp0, \in2, d4[0] @e0
         vmlsl.s16      \tmp1, \in2, d4[0] @e1
         vmlal.s16      \tmp2, \in3, d4[3] @o0
         vmlsl.s16      \tmp3, \in3, d4[1] @o1

         vadd.s32       \tmp4, \tmp0, \tmp2
         vsub.s32       \tmp0, \tmp0, \tmp2
         vadd.s32       \tmp2, \tmp1, \tmp3
         vsub.s32       \tmp1, \tmp1, \tmp3
         vqrshrn.s32    \out0, \tmp4, #\shift
         vqrshrn.s32    \out3, \tmp0, #\shift
         vqrshrn.s32    \out1, \tmp2, #\shift
         vqrshrn.s32    \out2, \tmp1, #\shift
.endm

@ Even part shared by the 8-point and 16-point transforms: the same
@ arithmetic as tr_4x4, but the four 32-bit results are left unnarrowed in
@ the q registers out0-out3.
@ The constants come from memory at r1.  Once in0 has been consumed by the
@ vshll, in0 is overwritten with the four constants at r1 and used as the
@ lane-indexed multiplier.  On exit in0 holds the NEXT four constants (the
@ halfwords at r1 + 8) for the caller, and r1 is back at its entry value.
.macro tr_4x4_8 in0, in1, in2, in3, out0, out1, out2, out3, tmp0, tmp1, tmp2, tmp3
         vshll.s16      \tmp0, \in0, #6
         vld1.s16       {\in0}, [r1, :64]!
         vmov           \tmp1, \tmp0
         vmull.s16      \tmp2, \in1, \in0[1]
         vmull.s16      \tmp3, \in1, \in0[3]
         vmlal.s16      \tmp0, \in2, \in0[0] @e0
         vmlsl.s16      \tmp1, \in2, \in0[0] @e1
         vmlal.s16      \tmp2, \in3, \in0[3] @o0
         vmlsl.s16      \tmp3, \in3, \in0[1] @o1

         vld1.s16       {\in0}, [r1, :64]

         vadd.s32       \out0, \tmp0, \tmp2
         vadd.s32       \out1, \tmp1, \tmp3
         vsub.s32       \out2, \tmp1, \tmp3
         vsub.s32       \out3, \tmp0, \tmp2

         sub            r1,  r1,  #8
.endm

@ Do a 4x4 transpose, using q registers for the subtransposes that don't
@ need to address the individual d registers.
@ r0,r1 == rq0, r2,r3 == rq1
.macro transpose_4x4 rq0, rq1, r0, r1, r2, r3
        vtrn.32         \rq0, \rq1
        vtrn.16         \r0,  \r1
        vtrn.16         \r2,  \r3
.endm

@ Emits ff_hevc_idct_4x4_<bitdepth>_neon: in-place 4x4 inverse transform of
@ the 16 coefficients at r0.  Two tr_4x4 passes, each followed by a
@ transpose; the first pass shifts by 7, the second by 20 - bitdepth.
@ Clobbers r1, q0-q2 and q8-q13.
.macro idct_4x4 bitdepth
function ff_hevc_idct_4x4_\bitdepth\()_neon, export=1
@r0 - coeffs
        vld1.s16        {q0-q1}, [r0, :128]

        movrel          r1, trans
        vld1.s16        {d4}, [r1, :64]

        tr_4x4          d0, d1, d2, d3, d16, d17, d18, d19, 7, q10, q11, q12, q13, q0
        transpose_4x4   q8, q9, d16, d17, d18, d19

        tr_4x4          d16, d17, d18, d19, d0, d1, d2, d3, 20 - \bitdepth, q10, q11, q12, q13, q0
        transpose_4x4   q0, q1, d0, d1, d2, d3
        vst1.s16        {d0-d3}, [r0, :128]
        bx              lr
endfunc
.endm

@ Transpose a 4x4 block of 16-bit elements held in four d registers.
.macro transpose8_4x4 r0, r1, r2, r3
        vtrn.16         \r0,  \r1
        vtrn.16         \r2,  \r3
        vtrn.32         \r0,  \r2
        vtrn.32         \r1,  \r3
.endm

@ Transpose each of the four 4x4 blocks of an 8x8 block independently
@ (r0-r3, r4-r7, l0-l3, l4-l7).  No data moves between the blocks here; the
@ caller accounts for the quadrant swap by how it names the registers
@ afterwards.
.macro transpose_8x8 r0, r1, r2, r3, r4, r5, r6, r7, l0, l1, l2, l3, l4, l5, l6, l7
        transpose8_4x4  \r0, \r1, \r2, \r3
        transpose8_4x4  \r4, \r5, \r6, \r7

        transpose8_4x4  \l0, \l1, \l2, \l3
        transpose8_4x4  \l4, \l5, \l6, \l7
.endm

@ 8-point transform of four columns held in the d registers in0-in7, with
@ the results written back over in0-in7 as 16-bit values (rounding,
@ saturating right shift by the shift argument).
@ Expects r1 to point at trans.  tr_4x4_8 computes the even part from in0,
@ in2, in4, in6 into q8-q11 and leaves the second row of constants in in0;
@ the odd part is then accumulated from in1, in3, in5, in7 into q12-q15.
@ Clobbers q8-q15; r1 is unchanged on exit.
.macro tr_8x4 shift, in0, in1, in2, in3, in4, in5, in6, in7
        tr_4x4_8        \in0, \in2, \in4, \in6, q8, q9, q10, q11, q12, q13, q14, q15

        vmull.s16       q14, \in1, \in0[2]
        vmull.s16       q12, \in1, \in0[0]
        vmull.s16       q13, \in1, \in0[1]
        sum_sub         q14, \in3, \in0[0], -
        sum_sub         q12, \in3, \in0[1], +
        sum_sub         q13, \in3, \in0[3], -

        sum_sub         q14, \in5, \in0[3], +
        sum_sub         q12, \in5, \in0[2], +
        sum_sub         q13, \in5, \in0[0], -

        sum_sub         q14, \in7, \in0[1], +
        sum_sub         q12, \in7, \in0[3], +
        sum_sub         q13, \in7, \in0[2], -

        vadd.s32        q15, q10, q14
        vsub.s32        q10, q10, q14
        vqrshrn.s32     \in2, q15, \shift

        vmull.s16       q15, \in1, \in0[3]
        sum_sub         q15, \in3, \in0[2], -
        sum_sub         q15, \in5, \in0[1], +
        sum_sub         q15, \in7, \in0[0], -

        vqrshrn.s32     \in5, q10,  \shift

        vadd.s32        q10, q8, q12
        vsub.s32        q8,  q8, q12
        vadd.s32        q12, q9, q13
        vsub.s32        q9,  q9, q13
        vadd.s32        q14, q11, q15
        vsub.s32        q11, q11, q15

        vqrshrn.s32     \in0, q10, \shift
        vqrshrn.s32     \in7, q8,  \shift
        vqrshrn.s32     \in1, q12, \shift
        vqrshrn.s32     \in6, q9,  \shift
        vqrshrn.s32     \in3, q14, \shift
        vqrshrn.s32     \in4, q11, \shift
.endm

@ Emits ff_hevc_idct_8x8_<bitdepth>_neon: in-place 8x8 inverse transform of
@ the 64 coefficients at r0.  Each pass runs tr_8x4 on the two 4-column
@ halves and is followed by a transpose; the first pass shifts by 7, the
@ second by 20 - bitdepth.  q4-q7 are saved and restored around the body.
@ Clobbers r1-r3, q0-q3 and q8-q15.
.macro idct_8x8 bitdepth
function ff_hevc_idct_8x8_\bitdepth\()_neon, export=1
@r0 - coeffs
        vpush           {q4-q7}

        mov             r1,  r0
        mov             r2,  #64
        add             r3,  r0, #32
        vld1.s16        {q0-q1}, [r1,:128], r2
        vld1.s16        {q2-q3}, [r3,:128], r2
        vld1.s16        {q4-q5}, [r1,:128], r2
        vld1.s16        {q6-q7}, [r3,:128], r2

        movrel          r1, trans

        tr_8x4          7, d0, d2, d4, d6, d8, d10, d12, d14
        tr_8x4          7, d1, d3, d5, d7, d9, d11, d13, d15

        @ Transpose each 4x4 block, and swap how d4-d7 and d8-d11 are used.
        @ Layout before:
        @ d0  d1
        @ d2  d3
        @ d4  d5
        @ d6  d7
        @ d8  d9
        @ d10 d11
        @ d12 d13
        @ d14 d15
        transpose_8x8   d0, d2, d4, d6, d8, d10, d12, d14, d1, d3, d5, d7, d9, d11, d13, d15
        @ Now the layout is:
        @ d0  d8
        @ d2  d10
        @ d4  d12
        @ d6  d14
        @ d1  d9
        @ d3  d11
        @ d5  d13
        @ d7  d15

        tr_8x4          20 - \bitdepth, d0, d2, d4, d6, d1, d3, d5, d7
        @ The first tr_8x4 operand doubles as the lane-indexed constant
        @ register.  NOTE(review): 16-bit by-scalar multiplies appear to be
        @ limited to d0-d7, which would explain why d8 is swapped into d0
        @ for the second half and swapped back afterwards - confirm.
        vswp            d0, d8
        tr_8x4          20 - \bitdepth, d0, d10, d12, d14, d9, d11, d13, d15
        vswp            d0, d8

        transpose_8x8   d0, d2, d4, d6, d8, d10, d12, d14, d1, d3, d5, d7, d9, d11, d13, d15

        mov             r1,  r0
        mov             r2,  #64
        add             r3,  r0, #32
        vst1.s16        {q0-q1}, [r1,:128], r2
        vst1.s16        {q2-q3}, [r3,:128], r2
        vst1.s16        {q4-q5}, [r1,:128], r2
        vst1.s16        {q6-q7}, [r3,:128], r2

        vpop            {q4-q7}
        bx              lr
endfunc
.endm

@ 32-bit butterfly: tmp_p = e + o, tmp_m = e - o.
.macro butterfly e, o, tmp_p, tmp_m
        vadd.s32        \tmp_p, \e, \o
        vsub.s32        \tmp_m, \e, \o
.endm

@ 8-point stage of the 16-point transform.  Computes the same even/odd
@ sums as tr_8x4 from in0-in7 (r1 must point at trans), but keeps the eight
@ butterfly results as 32-bit vectors in q0-q7 and stores them to the stack
@ at sp + offset (128 bytes) instead of narrowing them.
@ Clobbers r4 and q0-q15.
.macro tr16_8x4 in0, in1, in2, in3, in4, in5, in6, in7, offset
        tr_4x4_8        \in0, \in2, \in4, \in6, q8, q9, q10, q11, q12, q13, q14, q15

        vmull.s16       q12, \in1, \in0[0]
        vmull.s16       q13, \in1, \in0[1]
        vmull.s16       q14, \in1, \in0[2]
        vmull.s16       q15, \in1, \in0[3]
        sum_sub         q12, \in3, \in0[1], +
        sum_sub         q13, \in3, \in0[3], -
        sum_sub         q14, \in3, \in0[0], -
        sum_sub         q15, \in3, \in0[2], -

        sum_sub         q12, \in5, \in0[2], +
        sum_sub         q13, \in5, \in0[0], -
        sum_sub         q14, \in5, \in0[3], +
        sum_sub         q15, \in5, \in0[1], +

        sum_sub         q12, \in7, \in0[3], +
        sum_sub         q13, \in7, \in0[2], -
        sum_sub         q14, \in7, \in0[1], +
        sum_sub         q15, \in7, \in0[0], -

        butterfly       q8, q12, q0, q7
        butterfly       q9, q13, q1, q6
        butterfly       q10, q14, q2, q5
        butterfly       q11, q15, q3, q4
        add             r4, sp, #\offset
        vst1.s32        {q0-q1}, [r4, :128]!
        vst1.s32        {q2-q3}, [r4, :128]!
        vst1.s32        {q4-q5}, [r4, :128]!
        vst1.s32        {q6-q7}, [r4, :128]
.endm

@ Load eight d registers, alternating between the pointers r1 and r3; both
@ pointers are post-incremented by r2 after every load.
.macro load16 in0, in1, in2, in3, in4, in5, in6, in7
        vld1.s16        {\in0}, [r1, :64], r2
        vld1.s16        {\in1}, [r3, :64], r2
        vld1.s16        {\in2}, [r1, :64], r2
        vld1.s16        {\in3}, [r3, :64], r2
        vld1.s16        {\in4}, [r1, :64], r2
        vld1.s16        {\in5}, [r3, :64], r2
        vld1.s16        {\in6}, [r1, :64], r2
        vld1.s16        {\in7}, [r3, :64], r2
.endm

@ Accumulate one input row into the eight sums held in q5-q12: for k = 0..7
@ the k-th accumulator gets + or - (per op<k>) in * t<k>.
.macro add_member in, t0, t1, t2, t3, t4, t5, t6, t7, op0, op1, op2, op3, op4, op5, op6, op7
        sum_sub         q5,     \in, \t0, \op0
        sum_sub         q6,     \in, \t1, \op1
        sum_sub         q7,     \in, \t2, \op2
        sum_sub         q8,     \in, \t3, \op3
        sum_sub         q9,     \in, \t4, \op4
        sum_sub         q10,    \in, \t5, \op5
        sum_sub         q11,    \in, \t6, \op6
        sum_sub         q12,    \in, \t7, \op7
.endm

@ Four 32-bit butterflies on the pairs (in0,in1), (in2,in3), (in4,in5) and
@ (in6,in7), reusing the argument registers for the results.  On exit the
@ sums are in q4, in1, in3, in5 and the differences in in0, in2, in4, in6.
@ q4 is always clobbered.
.macro butterfly16 in0, in1, in2, in3, in4, in5, in6, in7
        vadd.s32        q4, \in0, \in1
        vsub.s32        \in0, \in0, \in1
        vadd.s32        \in1, \in2, \in3
        vsub.s32        \in2, \in2, \in3
        vadd.s32        \in3, \in4, \in5
        vsub.s32        \in4, \in4, \in5
        vadd.s32        \in5, \in6, \in7
        vsub.s32        \in6, \in6, \in7
.endm

@ Store eight d registers, alternating between [r1] (post-incremented by
@ r2) and [r3] (post-incremented by the register passed as rx).
.macro store16 in0, in1, in2, in3, in4, in5, in6, in7, rx
        vst1.s16        \in0, [r1, :64], r2
        vst1.s16        \in1, [r3, :64], \rx
        vst1.s16        \in2, [r1, :64], r2
        vst1.s16        \in3, [r3, :64], \rx
        vst1.s16        \in4, [r1, :64], r2
        vst1.s16        \in5, [r3, :64], \rx
        vst1.s16        \in6, [r1, :64], r2
        vst1.s16        \in7, [r3, :64], \rx
.endm

@ Narrow eight 32-bit q registers (in0-in7) to eight 16-bit d registers
@ (out0-out7) with a rounding, saturating right shift by the shift argument.
.macro scale out0, out1, out2, out3, out4, out5, out6, out7, in0, in1, in2, in3, in4, in5, in6, in7, shift
        vqrshrn.s32     \out0, \in0, \shift
        vqrshrn.s32     \out1, \in1, \shift
        vqrshrn.s32     \out2, \in2, \shift
        vqrshrn.s32     \out3, \in3, \shift
        vqrshrn.s32     \out4, \in4, \shift
        vqrshrn.s32     \out5, \in5, \shift
        vqrshrn.s32     \out6, \in6, \shift
        vqrshrn.s32     \out7, \in7, \shift
.endm

@stores in0, in2, in4, in6 ascending from off1 and
@stores in1, in3, in5, in7 descending from off2
@ (both offsets are relative to sp; clobbers r1, r2 and r3)
.macro store_to_stack off1, off2, in0, in2, in4, in6, in7, in5, in3, in1
        add             r1, sp, #\off1
        add             r3, sp, #\off2
        mov             r2, #-16
        vst1.s32        {\in0}, [r1, :128]!
        vst1.s32        {\in1}, [r3, :128], r2
        vst1.s32        {\in2}, [r1, :128]!
        vst1.s32        {\in3}, [r3, :128], r2
        vst1.s32        {\in4}, [r1, :128]!
        vst1.s32        {\in5}, [r3, :128], r2
        vst1.s32        {\in6}, [r1, :128]
        vst1.s32        {\in7}, [r3, :128]
.endm

@ Emits the local (non-exported) function func_tr_16x4_<name>: a 16-point
@ transform of four columns.
@   r5 = input.  The even-numbered inputs are read at multiples of
@        step * 64 bytes and the odd-numbered ones at odd multiples of
@        step * 32 bytes, four halfwords each.
@   r6 = output pointer, used only when shift > 0.
@   sp + offset = scratch area for the 32-bit results of the 8-point stage.
@ With shift > 0 the 16 results are narrowed by that shift, transposed as
@ 4x4 blocks and stored through r6 with a 32-byte pitch.  Otherwise the
@ 32-bit results are written to the stack area at sp + offset.
@ Clobbers r1-r4 and q0-q15.  Returns with bx lr and does not touch lr.
.macro tr_16x4 name, shift, offset, step
function func_tr_16x4_\name
        mov             r1,  r5
        add             r3,  r5, #(\step * 64)
        mov             r2,  #(\step * 128)
        load16          d0, d1, d2, d3, d4, d5, d6, d7
        movrel          r1, trans

        tr16_8x4        d0, d1, d2, d3, d4, d5, d6, d7, \offset

        add             r1,  r5, #(\step * 32)
        add             r3,  r5, #(\step * 3 *32)
        mov             r2,  #(\step * 128)
        load16          d8, d9, d2, d3, d4, d5, d6, d7
        movrel          r1, trans + 16
        vld1.s16        {q0}, [r1, :128]
        vmull.s16       q5, d8, d0[0]
        vmull.s16       q6, d8, d0[1]
        vmull.s16       q7, d8, d0[2]
        vmull.s16       q8, d8, d0[3]
        vmull.s16       q9, d8, d1[0]
        vmull.s16       q10, d8, d1[1]
        vmull.s16       q11, d8, d1[2]
        vmull.s16       q12, d8, d1[3]

        add_member      d9, d0[1], d1[0], d1[3], d1[1], d0[2], d0[0], d0[3], d1[2], +, +, +, -, -, -, -, -
        add_member      d2, d0[2], d1[3], d0[3], d0[1], d1[2], d1[0], d0[0], d1[1], +, +, -, -, -, +, +, +
        add_member      d3, d0[3], d1[1], d0[1], d1[3], d0[0], d1[2], d0[2], d1[0], +, -, -, +, +, +, -, -
        add_member      d4, d1[0], d0[2], d1[2], d0[0], d1[3], d0[1], d1[1], d0[3], +, -, -, +, -, -, +, +
        add_member      d5, d1[1], d0[0], d1[0], d1[2], d0[1], d0[3], d1[3], d0[2], +, -, +, +, -, +, +, -
        add_member      d6, d1[2], d0[3], d0[0], d0[2], d1[1], d1[3], d1[0], d0[1], +, -, +, -, +, +, -, +
        add_member      d7, d1[3], d1[2], d1[1], d1[0], d0[3], d0[2], d0[1], d0[0], +, -, +, -, +, -, +, -

        add             r4, sp, #\offset
        vld1.s32        {q0-q1}, [r4, :128]!
        vld1.s32        {q2-q3}, [r4, :128]!

        butterfly16     q0, q5, q1, q6, q2, q7, q3, q8
    .if \shift > 0
        scale           d26, d27, d28, d29, d30, d31, d16, d17, q4, q0, q5, q1, q6, q2, q7, q3, \shift
        transpose8_4x4  d26, d28, d30, d16
        transpose8_4x4  d17, d31, d29, d27
        mov             r1, r6
        add             r3, r6, #(24 +3*32)
        mov             r2, #32
        mov             r4, #-32
        store16         d26, d27, d28, d29, d30, d31, d16, d17, r4
    .else
        store_to_stack  \offset, (\offset + 240), q4, q5, q6, q7, q3, q2, q1, q0
    .endif

        add             r4, sp, #(\offset + 64)
        vld1.s32        {q0-q1}, [r4, :128]!
        vld1.s32        {q2-q3}, [r4, :128]
        butterfly16     q0, q9, q1, q10, q2, q11, q3, q12
    .if \shift > 0
        scale           d26, d27, d28, d29, d30, d31, d8, d9, q4, q0, q9, q1, q10, q2, q11, q3, \shift
        transpose8_4x4  d26, d28, d30, d8
        transpose8_4x4  d9, d31, d29, d27

        add             r1, r6, #8
        add             r3, r6, #(16 + 3 * 32)
        mov             r2, #32
        mov             r4, #-32
        store16         d26, d27, d28, d29, d30, d31, d8, d9, r4
    .else
        store_to_stack (\offset + 64), (\offset + 176), q4, q9, q10, q11, q3, q2, q1, q0
    .endif

        bx              lr
endfunc
.endm

.macro idct_16x16 bitdepth
function ff_hevc_idct_16x16_\bitdepth\()_neon, export=1
@r0 - coeffs
        push            {r4-r7, lr}
        vpush           {q4-q7}

        @ Align the stack, allocate a temp buffer
T       mov             r7,  sp
T       and             r7,  r7, #15
A       and             r7,  sp, #15
        add             r7,  r7, #640
        sub             sp,  sp, r7

.irp i, 0, 1, 2, 3
        add             r5, r0, #(8 * \i)
        add             r6, sp, #(8 * \i * 16)
        bl              func_tr_16x4_firstpass
.endr

.irp i, 0, 1, 2, 3
        add             r5, sp, #(8 * \i)
        add             r6, r0, 
#(8 * \i * 16) 714cabdff1aSopenharmony_ci bl func_tr_16x4_secondpass_\bitdepth 715cabdff1aSopenharmony_ci.endr 716cabdff1aSopenharmony_ci 717cabdff1aSopenharmony_ci add sp, sp, r7 718cabdff1aSopenharmony_ci 719cabdff1aSopenharmony_ci vpop {q4-q7} 720cabdff1aSopenharmony_ci pop {r4-r7, pc} 721cabdff1aSopenharmony_ciendfunc 722cabdff1aSopenharmony_ci.endm 723cabdff1aSopenharmony_ci 724cabdff1aSopenharmony_ci.macro load32 725cabdff1aSopenharmony_ci add r1, r5, #64 726cabdff1aSopenharmony_ci add r3, r1, #128 727cabdff1aSopenharmony_ci mov r2, #256 728cabdff1aSopenharmony_ci vld1.s16 {d4}, [r1, :64], r2 729cabdff1aSopenharmony_ci vld1.s16 {d5}, [r3, :64], r2 730cabdff1aSopenharmony_ci vld1.s16 {d6}, [r1, :64], r2 731cabdff1aSopenharmony_ci vld1.s16 {d7}, [r3, :64], r2 732cabdff1aSopenharmony_ci vld1.s16 {d8}, [r1, :64], r2 733cabdff1aSopenharmony_ci vld1.s16 {d9}, [r3, :64], r2 734cabdff1aSopenharmony_ci vld1.s16 {d10}, [r1, :64], r2 735cabdff1aSopenharmony_ci vld1.s16 {d11}, [r3, :64], r2 736cabdff1aSopenharmony_ci vld1.s16 {d12}, [r1, :64], r2 737cabdff1aSopenharmony_ci vld1.s16 {d13}, [r3, :64], r2 738cabdff1aSopenharmony_ci vld1.s16 {d14}, [r1, :64], r2 739cabdff1aSopenharmony_ci vld1.s16 {d15}, [r3, :64], r2 740cabdff1aSopenharmony_ci vld1.s16 {d16}, [r1, :64], r2 741cabdff1aSopenharmony_ci vld1.s16 {d17}, [r3, :64], r2 742cabdff1aSopenharmony_ci vld1.s16 {d18}, [r1, :64], r2 743cabdff1aSopenharmony_ci vld1.s16 {d19}, [r3, :64], r2 744cabdff1aSopenharmony_ci.endm 745cabdff1aSopenharmony_ci 746cabdff1aSopenharmony_ci.macro add_member32 in, t0, t1, t2, t3, op0, op1, op2, op3 747cabdff1aSopenharmony_ci sum_sub q10, \in, \t0, \op0 748cabdff1aSopenharmony_ci sum_sub q11, \in, \t1, \op1 749cabdff1aSopenharmony_ci sum_sub q12, \in, \t2, \op2 750cabdff1aSopenharmony_ci sum_sub q13, \in, \t3, \op3 751cabdff1aSopenharmony_ci.endm 752cabdff1aSopenharmony_ci 753cabdff1aSopenharmony_ci.macro butterfly32 in0, in1, in2, in3 754cabdff1aSopenharmony_ci vadd.s32 q1, \in0, \in1 
755cabdff1aSopenharmony_ci vsub.s32 \in0, \in0, \in1 756cabdff1aSopenharmony_ci vadd.s32 \in1, \in2, \in3 757cabdff1aSopenharmony_ci vsub.s32 \in2, \in2, \in3 758cabdff1aSopenharmony_ci.endm 759cabdff1aSopenharmony_ci 760cabdff1aSopenharmony_ci.macro scale32 out0, out1, out2, out3, in0, in1, in2, in3, shift 761cabdff1aSopenharmony_ci vqrshrn.s32 \out0, \in0, \shift 762cabdff1aSopenharmony_ci vqrshrn.s32 \out1, \in1, \shift 763cabdff1aSopenharmony_ci vqrshrn.s32 \out2, \in2, \shift 764cabdff1aSopenharmony_ci vqrshrn.s32 \out3, \in3, \shift 765cabdff1aSopenharmony_ci.endm 766cabdff1aSopenharmony_ci 767cabdff1aSopenharmony_ci.macro multiply in 768cabdff1aSopenharmony_ci vmull.s16 q10, d4, \in[0] 769cabdff1aSopenharmony_ci vmull.s16 q11, d4, \in[1] 770cabdff1aSopenharmony_ci vmull.s16 q12, d4, \in[2] 771cabdff1aSopenharmony_ci vmull.s16 q13, d4, \in[3] 772cabdff1aSopenharmony_ci.endm 773cabdff1aSopenharmony_ci 774cabdff1aSopenharmony_ci.macro scale_store shift 775cabdff1aSopenharmony_ci vld1.s16 {q14-q15}, [r4, :128]! 776cabdff1aSopenharmony_ci butterfly32 q14, q10, q15, q11 777cabdff1aSopenharmony_ci scale32 d22, d23, d20, d21, q1, q14, q10, q15, \shift 778cabdff1aSopenharmony_ci 779cabdff1aSopenharmony_ci vld1.s16 {q14-q15}, [r4, :128]! 
780cabdff1aSopenharmony_ci butterfly32 q14, q12, q15, q13 781cabdff1aSopenharmony_ci scale32 d2, d3, d28, d29, q1, q14, q12, q15, \shift 782cabdff1aSopenharmony_ci transpose8_4x4 d22, d20, d2, d28 783cabdff1aSopenharmony_ci transpose8_4x4 d29, d3, d21, d23 784cabdff1aSopenharmony_ci store16 d22, d23, d20, d21, d2, d3, d28, d29, r8 785cabdff1aSopenharmony_ci 786cabdff1aSopenharmony_ci @ reload multiplication coefficiens to q1 787cabdff1aSopenharmony_ci vld1.s16 {q1}, [r9, :128] 788cabdff1aSopenharmony_ci.endm 789cabdff1aSopenharmony_ci 790cabdff1aSopenharmony_cifunction tr_block1 791cabdff1aSopenharmony_ci multiply d0 792cabdff1aSopenharmony_ci add_member32 d5, d0[1], d1[0], d1[3], d2[2], +, +, +, + 793cabdff1aSopenharmony_ci add_member32 d6, d0[2], d1[3], d3[0], d3[2], +, +, +, - 794cabdff1aSopenharmony_ci add_member32 d7, d0[3], d2[2], d3[2], d1[3], +, +, -, - 795cabdff1aSopenharmony_ci add_member32 d8, d1[0], d3[1], d2[1], d0[0], +, +, -, - 796cabdff1aSopenharmony_ci add_member32 d9, d1[1], d3[3], d1[0], d1[2], +, -, -, - 797cabdff1aSopenharmony_ci add_member32 d10, d1[2], d3[0], d0[0], d3[1], +, -, -, - 798cabdff1aSopenharmony_ci add_member32 d11, d1[3], d2[1], d1[1], d2[3], +, -, -, + 799cabdff1aSopenharmony_ci add_member32 d12, d2[0], d1[2], d2[2], d1[0], +, -, -, + 800cabdff1aSopenharmony_ci add_member32 d13, d2[1], d0[3], d3[3], d0[2], +, -, -, + 801cabdff1aSopenharmony_ci add_member32 d14, d2[2], d0[1], d2[3], d2[1], +, -, +, + 802cabdff1aSopenharmony_ci add_member32 d15, d2[3], d0[2], d1[2], d3[3], +, -, +, - 803cabdff1aSopenharmony_ci add_member32 d16, d3[0], d1[1], d0[1], d2[0], +, -, +, - 804cabdff1aSopenharmony_ci add_member32 d17, d3[1], d2[0], d0[3], d0[1], +, -, +, - 805cabdff1aSopenharmony_ci add_member32 d18, d3[2], d2[3], d2[0], d1[1], +, -, +, - 806cabdff1aSopenharmony_ci add_member32 d19, d3[3], d3[2], d3[1], d3[0], +, -, +, - 807cabdff1aSopenharmony_ci bx lr 808cabdff1aSopenharmony_ciendfunc 809cabdff1aSopenharmony_ci 
@ Accumulates the second group of four outputs into q10-q13.
@ In:  d0-d3 = sixteen coefficients, d4-d19 = inputs
@ Out: q10-q13
function tr_block2
        multiply        d1
        add_member32    d5,  d3[1], d3[3], d3[0], d2[1], +, -, -, -
        add_member32    d6,  d2[1], d1[0], d0[0], d1[1], -, -, -, -
        add_member32    d7,  d0[0], d1[2], d3[1], d2[3], -, -, -, +
        add_member32    d8,  d2[0], d3[2], d1[1], d0[3], -, +, +, +
        add_member32    d9,  d3[2], d0[3], d1[3], d3[1], +, +, +, -
        add_member32    d10, d1[1], d1[3], d2[3], d0[0], +, +, -, -
        add_member32    d11, d0[3], d3[1], d0[1], d3[3], +, -, -, +
        add_member32    d12, d3[0], d0[2], d3[2], d0[1], +, -, -, +
        add_member32    d13, d2[2], d2[0], d1[0], d3[2], -, -, +, +
        add_member32    d14, d0[1], d3[0], d2[0], d0[2], -, +, +, -
        add_member32    d15, d1[3], d0[1], d2[2], d3[0], -, +, -, -
        add_member32    d16, d3[3], d2[1], d0[2], d1[0], +, +, -, +
        add_member32    d17, d1[2], d2[3], d3[3], d2[2], +, -, -, +
        add_member32    d18, d0[2], d0[1], d0[3], d1[2], +, -, +, -
        add_member32    d19, d2[3], d2[2], d2[1], d2[0], +, -, +, -
        bx              lr
endfunc

@ Accumulates the third group of four outputs into q10-q13.
@ In:  d0-d3 = sixteen coefficients, d4-d19 = inputs
@ Out: q10-q13
function tr_block3
        multiply        d2
        add_member32    d5,  d1[2], d0[3], d0[0], d0[2], -, -, -, -
        add_member32    d6,  d2[2], d3[3], d2[3], d1[2], -, -, +, +
        add_member32    d7,  d1[0], d0[2], d2[1], d3[3], +, +, +, -
        add_member32    d8,  d3[0], d2[2], d0[1], d1[3], +, -, -, -
        add_member32    d9,  d0[2], d2[0], d3[0], d0[0], -, -, +, +
        add_member32    d10, d3[2], d1[0], d2[0], d2[2], -, +, +, -
        add_member32    d11, d0[0], d3[2], d0[2], d3[0], +, +, -, -
        add_member32    d12, d3[3], d0[1], d3[1], d0[3], -, -, +, +
        add_member32    d13, d0[1], d2[3], d1[3], d1[1], -, +, +, -
        add_member32    d14, d3[1], d1[3], d0[3], d3[2], +, +, -, +
        add_member32    d15, d0[3], d1[1], d3[2], d2[0], +, -, +, +
        add_member32    d16, d2[3], d3[1], d1[2], d0[1], -, -, +, -
        add_member32    d17, d1[1], d0[0], d1[0], d2[1], -, +, -, +
        add_member32    d18, d2[1], d3[0], d3[3], d3[1], +, -, +, +
        add_member32    d19, d1[3], d1[2], d1[1], d1[0], +, -, +, -
        bx              lr
endfunc

@ Accumulates the fourth group of four outputs into q10-q13.
@ In:  d0-d3 = sixteen coefficients, d4-d19 = inputs
@ Out: q10-q13
function tr_block4
        multiply        d3
        add_member32    d5,  d1[1], d2[0], d2[3], d3[2], -, -, -, -
        add_member32    d6,  d0[0], d0[3], d2[0], d3[1], +, +, +, +
        add_member32    d7,  d2[0], d0[0], d1[1], d3[0], -, -, -, -
        add_member32    d8,  d3[3], d1[2], d0[2], d2[3], +, +, +, +
        add_member32    d9,  d2[1], d2[3], d0[0], d2[2], +, -, -, -
        add_member32    d10, d0[2], d3[3], d0[3], d2[1], -, -, +, +
        add_member32    d11, d1[0], d2[2], d1[2], d2[0], +, +, -, -
        add_member32    d12, d2[3], d1[1], d2[1], d1[3], -, -, +, +
        add_member32    d13, d3[1], d0[1], d3[0], d1[2], -, +, -, -
        add_member32    d14, d1[2], d1[0], d3[3], d1[1], +, -, +, +
        add_member32    d15, d0[1], d2[1], d3[1], d1[0], -, +, +, -
        add_member32    d16, d1[3], d3[2], d2[2], d0[3], +, -, -, +
        add_member32    d17, d3[2], d3[0], d1[3], d0[2], -, -, +, -
        add_member32    d18, d2[2], d1[3], d1[0], d0[1], -, +, -, +
        add_member32    d19, d0[3], d0[2], d0[1], d0[0], +, -, +, -
        bx              lr
endfunc

@ Instantiates func_tr_32x4_<name>, one 32-point pass over four columns.
@   shift - rounding shift applied when narrowing the results to 16 bits
@ In:  r5 = source pointer, r11 = destination pointer
@ func_tr_16x4_noscale runs first and leaves 32-bit partial results in the
@ stack scratch area at sp + 2048 (its instantiation offset); the four
@ tr_block calls then each produce a group of sums in q10-q13 that is
@ combined with the next 64 bytes of that area (r4 keeps advancing).
@ lr is parked in r10 across the nested calls and r9 is left pointing at
@ trans + 48 so that q1 can be reloaded after butterfly32 overwrites it.
@ Clobbers r1-r4, r8-r10 and NEON registers including q4-q7.
.macro tr_32x4 name, shift
function func_tr_32x4_\name
        mov             r10, lr
        bl              func_tr_16x4_noscale

        load32
        @ q0-q1 = trans[16..31]
        movrel          r9,  trans + 32
        vld1.s16        {q0}, [r9, :128]!
        vld1.s16        {q1}, [r9, :128]

        bl              tr_block1

        add             r4,  sp,  #2048
        vld1.s16        {q14-q15}, [r4, :128]!
        butterfly32     q14, q10, q15, q11
        scale32         d22, d23, d20, d21, q1, q14, q10, q15, \shift

        vld1.s16        {q14-q15}, [r4, :128]!
        butterfly32     q14, q12, q15, q13
        scale32         d2, d3, d28, d29, q1, q14, q12, q15, \shift

        transpose8_4x4  d22, d20, d2, d28
        transpose8_4x4  d29, d3, d21, d23
        mov             r1,  r11
        mov             r2,  #64
        mov             r8,  #-64
        add             r3,  r11, #(56 + 3 * 64)
        store16         d22, d23, d20, d21, d2, d3, d28, d29, r8

        @ reload multiplication coefficients to q1
        vld1.s16        {q1}, [r9, :128]

        bl              tr_block2
        add             r1,  r11, #8
        add             r3,  r11, #(48 + 3 * 64)
        mov             r2,  #64
        mov             r8,  #-64
        scale_store     \shift

        bl              tr_block3
        add             r1,  r11, #16
        add             r3,  r11, #(40 + 3 * 64)
        mov             r2,  #64
        mov             r8,  #-64
        scale_store     \shift

        bl              tr_block4
        add             r1,  r11, #24
        add             r3,  r11, #(32 + 3 * 64)
        mov             r2,  #64
        mov             r8,  #-64
        scale_store     \shift

        bx              r10
endfunc
.endm

@ Instantiates the exported ff_hevc_idct_32x32_<bitdepth>_neon.
@ The transform runs in two passes of eight func_tr_32x4 calls each: the
@ first pass reads from r0 and writes to a temporary buffer on the stack, the
@ second pass reads that buffer and writes the result back to r0.
.macro idct_32x32 bitdepth
function ff_hevc_idct_32x32_\bitdepth\()_neon, export=1
@r0 - coeffs
        push            {r4-r11, lr}
        vpush           {q4-q7}

        @ Align the stack, allocate a temp buffer
        @ r7 = (sp & 15) + 2432, so sp - r7 is a multiple of 16; r7 is kept
        @ untouched until the end to undo the adjustment.
T       mov             r7,  sp
T       and             r7,  r7,  #15
A       and             r7,  sp,  #15
        add             r7,  r7,  #2432
        sub             sp,  sp,  r7

        @ first pass: source advances 8 bytes per call, destination 256 bytes
.irp i, 0, 1, 2, 3, 4, 5, 6, 7
        add             r5,  r0,  #(8 * \i)
        add             r11, sp,  #(8 * \i * 32)
        bl              func_tr_32x4_firstpass
.endr

        @ second pass: stack buffer back into the coefficient array
.irp i, 0, 1, 2, 3, 4, 5, 6, 7
        add             r5,  sp,  #(8 * \i)
        add             r11, r0,  #(8 * \i * 32)
        bl              func_tr_32x4_secondpass_\bitdepth
.endr

        add             sp,  sp,  r7
        vpop            {q4-q7}
        pop             {r4-r11, pc}
endfunc
.endm

@ Pass functions: the first pass shifts by 7, the second by 20 - bitdepth.
@ The noscale variant (shift 0, scratch offset 2048, step 4) is the one
@ called from func_tr_32x4.
tr_16x4 firstpass, 7, 512, 1
tr_16x4 secondpass_8, 20 - 8, 512, 1
tr_16x4 secondpass_10, 20 - 10, 512, 1
tr_16x4 noscale, 0, 2048, 4
.ltorg
tr_32x4 firstpass, 7
tr_32x4 secondpass_8, 20 - 8
tr_32x4 secondpass_10, 20 - 10
.ltorg

idct_4x4 8
idct_4x4_dc 8
idct_4x4 10
idct_4x4_dc 10
idct_8x8 8
idct_8x8_dc 8
idct_8x8 10
idct_8x8_dc 10
idct_16x16 8
idct_16x16_dc 8
idct_16x16 10
idct_16x16_dc 10
idct_32x32 8
idct_32x32_dc 8
idct_32x32 10
idct_32x32_dc 10

/* One pass of the 4x4 luma transform on four 64-bit rows, done in place.
 * r0-r3 here are macro parameters naming d registers (src0-src3 on entry,
 * dst0-dst3 on exit), not the core registers.
 * Expects the constants d0[0] = 74, d0[1] = 29, d1[0] = 55 as 32-bit lanes;
 * d0[0] is also read as a 16-bit lane by vmull.s16, which yields 74 as well
 * because the upper half of that 32-bit lane is zero.
 * uses registers q2 and q4 - q9 for temp values */
/* TODO: reorder */
.macro tr4_luma_shift r0, r1, r2, r3, shift
        vaddl.s16       q5,  \r0, \r2           // c0 = src0 + src2
        vaddl.s16       q2,  \r2, \r3           // c1 = src2 + src3
        vsubl.s16       q4,  \r0, \r3           // c2 = src0 - src3
        vmull.s16       q6,  \r1, d0[0]         // c3 = 74 * src1

        vaddl.s16       q7,  \r0, \r3           // src0 + src3
        vsubw.s16       q7,  q7,  \r2           // src0 - src2 + src3
        vmul.s32        q7,  q7,  d0[0]         // dst2 = 74 * (src0 - src2 + src3)

        vmul.s32        q8,  q5,  d0[1]         // 29 * c0
        vmul.s32        q9,  q2,  d1[0]         // 55 * c1
        vadd.s32        q8,  q9                 // 29 * c0 + 55 * c1
        vadd.s32        q8,  q6                 // dst0 = 29 * c0 + 55 * c1 + c3

        vmul.s32        q2,  q2,  d0[1]         // 29 * c1
        vmul.s32        q9,  q4,  d1[0]         // 55 * c2
        vsub.s32        q9,  q2                 // 55 * c2 - 29 * c1
        vadd.s32        q9,  q6                 // dst1 = 55 * c2 - 29 * c1 + c3

        vmul.s32        q5,  q5,  d1[0]         // 55 * c0
        vmul.s32        q4,  q4,  d0[1]         // 29 * c2
        vadd.s32        q5,  q4                 // 55 * c0 + 29 * c2
        vsub.s32        q5,  q6                 // dst3 = 55 * c0 + 29 * c2 - c3

        vqrshrn.s32     \r0, q8,  \shift        // narrow with rounding and saturation
        vqrshrn.s32     \r1, q9,  \shift
        vqrshrn.s32     \r2, q7,  \shift
        vqrshrn.s32     \r3, q5,  \shift
.endm

.ltorg
/* In-place 4x4 luma transform.
 * In: r0 = pointer to sixteen 16-bit coefficients, overwritten with the result.
 * Two tr4_luma_shift passes (shift 7, then shift 12), each followed by a
 * 4x4 transpose of the 16-bit elements.
 * d8-d15 are saved because the macro uses q4-q7 as temporaries; r3 is scratch. */
function ff_hevc_transform_luma_4x4_neon_8, export=1
        vpush           {d8-d15}
        vld1.16         {q14, q15}, [r0]        // coeffs
        ldr             r3,  =0x4a              // 74
        vmov.32         d0[0], r3
        ldr             r3,  =0x1d              // 29
        vmov.32         d0[1], r3
        ldr             r3,  =0x37              // 55
        vmov.32         d1[0], r3

        tr4_luma_shift  d28, d29, d30, d31, #7

        // transpose the 4x4 block held in d28-d31
        vtrn.16         d28, d29
        vtrn.16         d30, d31
        vtrn.32         q14, q15

        tr4_luma_shift  d28, d29, d30, d31, #12

        // transpose back before storing
        vtrn.16         d28, d29
        vtrn.16         d30, d31
        vtrn.32         q14, q15
        vst1.16         {q14, q15}, [r0]
        vpop            {d8-d15}
        bx              lr
endfunc