/*
 * Copyright (c) 2017 Google Inc.
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/arm/asm.S"
#include "neon.S"

@ Transform coefficients, stored as 16 bit values. The code in this file
@ loads them with vld1.16 and widens them to 32 bit with vmovl.s16 before
@ use.
@ iadst4_coeffs is a label into the same 16 byte table as itxfm4_coeffs,
@ so one 128 bit load from itxfm4_coeffs fetches both sets.
const itxfm4_coeffs, align=4
        .short          11585, 0, 6270, 15137
iadst4_coeffs:
        .short          5283, 15212, 9929, 13377
endconst

@ idct_coeffs is a label inside this table, directly after the 8
@ iadst8 coefficients.
const iadst8_coeffs, align=4
        .short          16305, 1606, 14449, 7723, 10394, 12665, 4756, 15679
idct_coeffs:
        .short          11585, 0, 6270, 15137, 3196, 16069, 13623, 9102
        .short          1606, 16305, 12665, 10394, 7723, 14449, 15679, 4756
        .short          804, 16364, 12140, 11003, 7005, 14811, 15426, 5520
        .short          3981, 15893, 14053, 8423, 9760, 13160, 16207, 2404
endconst

const iadst16_coeffs, align=4
        .short          16364, 804, 15893, 3981, 11003, 12140, 8423, 14053
        .short          14811, 7005, 13160, 9760, 5520, 15426, 2404, 16207
endconst

@ Do two 4x4 transposes, using q registers for the subtransposes that don't
@ need to address the individual d registers.
@ r0,r1 == rq0, r2,r3 == rq1, etc
@ The 64 bit halves are exchanged with vswp on d registers (equivalent to
@ the vtrn.64 named in the trailing comments), followed by vtrn.32 on the
@ q registers. Only \r1, \r3, \r4, \r6, \r9, \r11, \r12 and \r14 of the d
@ register arguments are referenced.
.macro transpose32_q_2x_4x4 rq0, rq1, rq2, rq3, rq4, rq5, rq6, rq7, r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, r10, r11, r12, r13, r14, r15
        vswp            \r1, \r4  @ vtrn.64 \rq0, \rq2
        vswp            \r3, \r6  @ vtrn.64 \rq1, \rq3
        vswp            \r9, \r12 @ vtrn.64 \rq4, \rq6
        vswp            \r11, \r14 @ vtrn.64 \rq5, \rq7
        vtrn.32         \rq0, \rq1
        vtrn.32         \rq2, \rq3
        vtrn.32         \rq4, \rq5
        vtrn.32         \rq6, \rq7
.endm

@ Do eight 2x2 transposes.
@ Each consecutive pair of arguments (r0/r1, r2/r3, ...) is passed to one
@ vtrn.32, which modifies both registers of the pair in place.
.macro transpose32_8x_2x2 r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, r10, r11, r12, r13, r14, r15
        vtrn.32         \r0, \r1
        vtrn.32         \r2, \r3
        vtrn.32         \r4, \r5
        vtrn.32         \r6, \r7
        vtrn.32         \r8, \r9
        vtrn.32         \r10, \r11
        vtrn.32         \r12, \r13
        vtrn.32         \r14, \r15
.endm

@ out1 = ((in1 + in2) * d0[0] + (1 << 13)) >> 14
@ out2 = ((in1 - in2) * d0[0] + (1 << 13)) >> 14
@ in/out are d registers
@ tmpd1/tmpd2 are d register temporaries, tmpq3/tmpq4 are q register
@ temporaries holding the 64 bit products.
@ If neg > 0, the sum (in1 + in2) is negated before the multiplication,
@ i.e. out1 is produced with the opposite sign; out2 is unaffected.
.macro mbutterfly0 out1, out2, in1, in2, tmpd1, tmpd2, tmpq3, tmpq4, neg=0
        vadd.s32        \tmpd1, \in1, \in2
        vsub.s32        \tmpd2, \in1, \in2
.if \neg > 0
        vneg.s32        \tmpd1, \tmpd1
.endif
        vmull.s32       \tmpq3, \tmpd1, d0[0]
        vmull.s32       \tmpq4, \tmpd2, d0[0]
        vrshrn.s64      \out1, \tmpq3, #14
        vrshrn.s64      \out2, \tmpq4, #14
.endm

@ Same as mbutterfly0 above, but treating the input in in2 as zero,
@ writing the same output into both out1 and out2.
@ The parameter list mirrors mbutterfly0 so both can be invoked the same
@ way; in2, tmpd1, tmpd2 and tmpq4 are not referenced by this variant.
.macro mbutterfly0_h out1, out2, in1, in2, tmpd1, tmpd2, tmpq3, tmpq4
        vmull.s32       \tmpq3, \in1, d0[0]
        vrshrn.s64      \out1, \tmpq3, #14
        vrshrn.s64      \out2, \tmpq3, #14
.endm

@ out1,out2 = ((in1 + in2) * d0[0] + (1 << 13)) >> 14
@ out3,out4 = ((in1 - in2) * d0[0] + (1 << 13)) >> 14
@ Same as mbutterfly0, but with input being 2 q registers, output
@ being 4 d registers.
@ This can do with either 4 or 6 temporary q registers.
@ tmpd11,tmpd12 must be the d register halves of tmpq1, and tmpd21,tmpd22
@ the halves of tmpq2, since the sum/difference is written to the q names
@ and read back through the d names.
@ If tmpq5 is left blank, tmpq3/tmpq4 are reused for the second pair of
@ products (4 temporaries); otherwise tmpq5/tmpq6 hold them, so that all
@ four multiplications are issued before the narrowing shifts.
.macro dmbutterfly0 out1, out2, out3, out4, in1, in2, tmpq1, tmpq2, tmpd11, tmpd12, tmpd21, tmpd22, tmpq3, tmpq4, tmpq5, tmpq6
        vadd.s32        \tmpq1, \in1, \in2
        vsub.s32        \tmpq2, \in1, \in2
        vmull.s32       \tmpq3, \tmpd11, d0[0]
        vmull.s32       \tmpq4, \tmpd12, d0[0]
.ifb \tmpq5
        vrshrn.s64      \out1, \tmpq3, #14
        vrshrn.s64      \out2, \tmpq4, #14
        vmull.s32       \tmpq3, \tmpd21, d0[0]
        vmull.s32       \tmpq4, \tmpd22, d0[0]
        vrshrn.s64      \out3, \tmpq3, #14
        vrshrn.s64      \out4, \tmpq4, #14
.else
        vmull.s32       \tmpq5, \tmpd21, d0[0]
        vmull.s32       \tmpq6, \tmpd22, d0[0]
        vrshrn.s64      \out1, \tmpq3, #14
        vrshrn.s64      \out2, \tmpq4, #14
        vrshrn.s64      \out3, \tmpq5, #14
        vrshrn.s64      \out4, \tmpq6, #14
.endif
.endm

@ out1 = in1 * coef1 - in2 * coef2
@ out2 = in1 * coef2 + in2 * coef1
@ out are 2 q registers, in are 2 d registers
@ The outputs are the unrounded 64 bit sums of products.
@ If neg is nonzero, out2 is computed as -(in1 * coef2 + in2 * coef1)
@ instead, by accumulating both products negatively onto zero.
.macro mbutterfly_l out1, out2, in1, in2, coef1, coef2, neg=0
        vmull.s32       \out1, \in1, \coef1
        vmlsl.s32       \out1, \in2, \coef2
.if \neg
        vmov.s64        \out2, #0
        vmlsl.s32       \out2, \in1, \coef2
        vmlsl.s32       \out2, \in2, \coef1
.else
        vmull.s32       \out2, \in1, \coef2
        vmlal.s32       \out2, \in2, \coef1
.endif
.endm

@ out1,out2 = in1,in2 * coef1 - in3,in4 * coef2
@ out3,out4 = in1,in2 * coef2 + in3,in4 * coef1
@ out are 4 q registers, in are 4 d registers
.macro dmbutterfly_l out1, out2, out3, out4, in1, in2, in3, in4, coef1, coef2
        vmull.s32       \out1, \in1, \coef1
        vmull.s32       \out2, \in2, \coef1
        vmull.s32       \out3, \in1, \coef2
        vmull.s32       \out4, \in2, \coef2
        vmlsl.s32       \out1, \in3, \coef2
        vmlsl.s32       \out2, \in4, \coef2
        vmlal.s32       \out3, \in3, \coef1
        vmlal.s32       \out4, \in4, \coef1
.endm

@ inout1 = (inout1 * coef1 - inout2 * coef2 + (1 << 13)) >> 14
@ inout2 = (inout1 * coef2 + inout2 * coef1 + (1 << 13)) >> 14
@ inout are 2 d registers, tmp are 2 q registers
@ neg is forwarded to mbutterfly_l (negates the inout2 result).
.macro mbutterfly inout1, inout2, coef1, coef2, tmp1, tmp2, neg=0
        mbutterfly_l    \tmp1, \tmp2, \inout1, \inout2, \coef1, \coef2, \neg
        vrshrn.s64      \inout1, \tmp1, #14
        vrshrn.s64      \inout2, \tmp2, #14
.endm

@ Same as mbutterfly above, but treating the input in inout2 as zero
.macro mbutterfly_h1 inout1, inout2, coef1, coef2, tmp1, tmp2
        vmull.s32       \tmp1, \inout1, \coef1
        vmull.s32       \tmp2, \inout1, \coef2
        vrshrn.s64      \inout1, \tmp1, #14
        vrshrn.s64      \inout2, \tmp2, #14
.endm

@ Same as mbutterfly above, but treating the input in inout1 as zero
@ (tmp1 starts at zero so that vmlsl yields -inout2 * coef2).
.macro mbutterfly_h2 inout1, inout2, coef1, coef2, tmp1, tmp2
        vmov.s64        \tmp1, #0
        vmull.s32       \tmp2, \inout2, \coef1
        vmlsl.s32       \tmp1, \inout2, \coef2
        vrshrn.s64      \inout2, \tmp2, #14
        vrshrn.s64      \inout1, \tmp1, #14
.endm

@ inout1,inout2 = (inout1,inout2 * coef1 - inout3,inout4 * coef2 + (1 << 13)) >> 14
@ inout3,inout4 = (inout1,inout2 * coef2 + inout3,inout4 * coef1 + (1 << 13)) >> 14
@ inout are 4 d registers, tmp are 4 q registers
.macro dmbutterfly inout1, inout2, inout3, inout4, coef1, coef2, tmp1, tmp2, tmp3, tmp4
        dmbutterfly_l   \tmp1, \tmp2, \tmp3, \tmp4, \inout1, \inout2, \inout3, \inout4, \coef1, \coef2
        vrshrn.s64      \inout1, \tmp1, #14
        vrshrn.s64      \inout2, \tmp2, #14
        vrshrn.s64      \inout3, \tmp3, #14
        vrshrn.s64      \inout4, \tmp4, #14
.endm

@ out1 = in1 + in2
@ out2 = in1 - in2
.macro butterfly out1, out2, in1, in2
        vadd.s32        \out1, \in1, \in2
        vsub.s32        \out2, \in1, \in2
.endm

@ out1 = in1 - in2
@ out2 = in1 + in2
.macro butterfly_r out1, out2, in1, in2
        vsub.s32        \out1, \in1, \in2
        vadd.s32        \out2, \in1, \in2
.endm

@ out1 = (in1 + in2 + (1 << 13)) >> 14
@ out2 = (in1 - in2 + (1 << 13)) >> 14
@ out are 2 d registers, in are 2 q registers, tmp are 2 q registers
.macro butterfly_n out1, out2, in1, in2, tmp1, tmp2
        vadd.s64        \tmp1, \in1, \in2
        vsub.s64        \tmp2, \in1, \in2
        vrshrn.s64      \out1, \tmp1, #14
        vrshrn.s64      \out2, \tmp2, #14
.endm

@ out1,out2 = (in1,in2 + in3,in4 + (1 << 13)) >> 14
@ out3,out4 = (in1,in2 - in3,in4 + (1 << 13)) >> 14
@ out are 4 d registers, in are 4 q registers, tmp are 4 q registers
.macro dbutterfly_n out1, out2, out3, out4, in1, in2, in3, in4, tmp1, tmp2, tmp3, tmp4
        vadd.s64        \tmp1, \in1, \in3
        vadd.s64        \tmp2, \in2, \in4
        vsub.s64        \tmp3, \in1, \in3
        vsub.s64        \tmp4, \in2, \in4
        vrshrn.s64      \out1, \tmp1, #14
        vrshrn.s64      \out2, \tmp2, #14
        vrshrn.s64      \out3, \tmp3, #14
        vrshrn.s64      \out4, \tmp4, #14
.endm


@ The 4-point transform macros below all take the same parameter list so
@ that itxfm_func4x4 can invoke them as \txfm()4_\bpp: c0-c3 are the four
@ q registers transformed in place, cd0-cd7 are the d register halves of
@ c0-c3 (c0 == cd0,cd1, c1 == cd2,cd3, c2 == cd4,cd5, c3 == cd6,cd7).

@ Uses only the q register arguments; q10 and q11 are used as scratch.
.macro iwht4_10 c0, c1, c2, c3, cd0, cd1, cd2, cd3, cd4, cd5, cd6, cd7
        vadd.i32        \c0, \c0, \c1
        vsub.i32        q11, \c2, \c3
        vsub.i32        q10, \c0, q11
        vshr.s32        q10, q10, #1
        vsub.i32        \c2, q10, \c1
        vsub.i32        \c1, q10, \c3
        vadd.i32        \c3, q11, \c2
        vsub.i32        \c0, \c0, \c1
.endm

@ Identical to iwht4_10; all arguments are forwarded unchanged.
.macro iwht4_12 c0, c1, c2, c3, cd0, cd1, cd2, cd3, cd4, cd5, cd6, cd7
        iwht4_10        \c0, \c1, \c2, \c3, \cd0, \cd1, \cd2, \cd3, \cd4, \cd5, \cd6, \cd7
.endm

@ c0 == cd0,cd1, c1 == cd2,cd3
@ Works with 32 bit products (vmul/vmla/vmls.s32) and uses only the q
@ register arguments. Reads the coefficients d0[0], d1[0] and d1[1];
@ q10-q15 are used as scratch.
.macro idct4_10 c0, c1, c2, c3, cd0, cd1, cd2, cd3, cd4, cd5, cd6, cd7
        vmul.s32        q13, \c1, d1[1]
        vmul.s32        q11, \c1, d1[0]
        vadd.i32        q14, \c0, \c2
        vsub.i32        q15, \c0, \c2
        vmla.s32        q13, \c3, d1[0]
        vmul.s32        q12, q14, d0[0]
        vmul.s32        q10, q15, d0[0]
        vmls.s32        q11, \c3, d1[1]
        vrshr.s32       q13, q13, #14
        vrshr.s32       q12, q12, #14
        vrshr.s32       q10, q10, #14
        vrshr.s32       q11, q11, #14
        vadd.i32        \c0, q12, q13
        vsub.i32        \c3, q12, q13
        vadd.i32        \c1, q10, q11
        vsub.i32        \c2, q10, q11
.endm

@ Same data flow as idct4_10, but with 64 bit intermediate products
@ (vmull/vmlal/vmlsl.s32 followed by vrshrn.s64).
@ Besides q10-q15, this writes q2, q3 and q8 by name as scratch (and reads
@ d4/d5 by name), so it relies on being invoked with c0 == q2, c1 == q3,
@ c2 == q8, as the invocation in itxfm_func4x4 does.
.macro idct4_12 c0, c1, c2, c3, cd0, cd1, cd2, cd3, cd4, cd5, cd6, cd7
        vmull.s32       q13, \cd2, d1[1]
        vmull.s32       q15, \cd3, d1[1]
        vmull.s32       q11, \cd2, d1[0]
        vmull.s32       q3, \cd3, d1[0]
        vadd.i32        q14, \c0, \c2
        vsub.i32        q2, \c0, \c2
        vmlal.s32       q13, \cd6, d1[0]
        vmlal.s32       q15, \cd7, d1[0]
        vmull.s32       q12, d28, d0[0]
        vmull.s32       q14, d29, d0[0]
        vmull.s32       q10, d4, d0[0]
        vmull.s32       q8, d5, d0[0]
        vmlsl.s32       q11, \cd6, d1[1]
        vmlsl.s32       q3, \cd7, d1[1]
        vrshrn.s64      d26, q13, #14
        vrshrn.s64      d27, q15, #14
        vrshrn.s64      d24, q12, #14
        vrshrn.s64      d25, q14, #14
        vrshrn.s64      d20, q10, #14
        vrshrn.s64      d21, q8, #14
        vrshrn.s64      d22, q11, #14
        vrshrn.s64      d23, q3, #14
        vadd.i32        \c0, q12, q13
        vsub.i32        \c3, q12, q13
        vadd.i32        \c1, q10, q11
        vsub.i32        \c2, q10, q11
.endm

@ Works with 32 bit products and uses only the q register arguments.
@ Reads the coefficients d2[0], d2[1], d3[0] and d3[1]; q10-q15 are used
@ as scratch.
.macro iadst4_10 c0, c1, c2, c3, cd0, cd1, cd2, cd3, cd4, cd5, cd6, cd7
        vmul.s32        q10, \c0, d2[0]
        vmla.s32        q10, \c2, d2[1]
        vmla.s32        q10, \c3, d3[0]
        vmul.s32        q11, \c0, d3[0]
        vmls.s32        q11, \c2, d2[0]
        vsub.s32        \c0, \c0, \c2
        vmls.s32        q11, \c3, d2[1]
        vadd.s32        \c0, \c0, \c3
        vmul.s32        q13, \c1, d3[1]
        vmul.s32        q12, \c0, d3[1]
        vadd.s32        q14, q10, q13
        vadd.s32        q15, q11, q13
        vrshr.s32       \c0, q14, #14
        vadd.s32        q10, q10, q11
        vrshr.s32       \c1, q15, #14
        vsub.s32        q10, q10, q13
        vrshr.s32       \c2, q12, #14
        vrshr.s32       \c3, q10, #14
.endm

@ Same data flow as iadst4_10, but with 64 bit intermediate products.
@ Uses q4-q7 in addition to q2, q3 and q10-q15 as scratch; itxfm_func4x4
@ saves and restores q4-q7 around it with vpush/vpop.
.macro iadst4_12 c0, c1, c2, c3, cd0, cd1, cd2, cd3, cd4, cd5, cd6, cd7
        vmull.s32       q10, \cd0, d2[0]
        vmull.s32       q4, \cd1, d2[0]
        vmlal.s32       q10, \cd4, d2[1]
        vmlal.s32       q4, \cd5, d2[1]
        vmlal.s32       q10, \cd6, d3[0]
        vmlal.s32       q4, \cd7, d3[0]
        vmull.s32       q11, \cd0, d3[0]
        vmull.s32       q5, \cd1, d3[0]
        vmlsl.s32       q11, \cd4, d2[0]
        vmlsl.s32       q5, \cd5, d2[0]
        vsub.s32        \c0, \c0, \c2
        vmlsl.s32       q11, \cd6, d2[1]
        vmlsl.s32       q5, \cd7, d2[1]
        vadd.s32        \c0, \c0, \c3
        vmull.s32       q13, \cd2, d3[1]
        vmull.s32       q6, \cd3, d3[1]
        vmull.s32       q12, \cd0, d3[1]
        vmull.s32       q7, \cd1, d3[1]
        vadd.s64        q14, q10, q13
        vadd.s64        q2, q4, q6
        vadd.s64        q15, q11, q13
        vadd.s64        q3, q5, q6
        vrshrn.s64      \cd1, q2, #14
        vrshrn.s64      \cd0, q14, #14
        vadd.s64        q10, q10, q11
        vadd.s64        q4, q4, q5
        vrshrn.s64      \cd3, q3, #14
        vrshrn.s64      \cd2, q15, #14
        vsub.s64        q10, q10, q13
        vsub.s64        q4, q4, q6
        vrshrn.s64      \cd4, q12, #14
        vrshrn.s64      \cd5, q7, #14
        vrshrn.s64      \cd6, q10, #14
        vrshrn.s64      \cd7, q4, #14
.endm

@ The public functions in this file have got the following signature:
@ void itxfm_add(uint8_t *dst, ptrdiff_t stride, int16_t *block, int eob);

@ Emit ff_vp9_<txfm1>_<txfm2>_4x4_add_<bpp>_neon.
@ r0 = dst, r1 = stride, r2 = block, r3 = eob.
@ The 16 coefficients are read from the block as 32 bit elements and the
@ block is overwritten with zeros. \txfm1 is applied first, then a 4x4
@ transpose, then \txfm2; the result is added to four rows of four 16 bit
@ pixels at dst and clamped to (1 << bpp) - 1.
.macro itxfm_func4x4 txfm1, txfm2, bpp
function ff_vp9_\txfm1\()_\txfm2\()_4x4_add_\bpp\()_neon, export=1
        @ Load only the coefficients the selected transforms need:
        @ idct coefficients end up in q0, iadst coefficients in q1.
        @ (iwht needs none.)
.ifc \txfm1,\txfm2
.ifc \txfm1,idct
        movrel          r12, itxfm4_coeffs
        vld1.16         {d0}, [r12,:64]
        vmovl.s16       q0, d0
.endif
.ifc \txfm1,iadst
        movrel          r12, iadst4_coeffs
        vld1.16         {d1}, [r12,:64]
        vmovl.s16       q1, d1
.endif
.else
        movrel          r12, itxfm4_coeffs
        vld1.16         {q0}, [r12,:128]
        vmovl.s16       q1, d1
        vmovl.s16       q0, d0
.endif
.if \bpp > 10
.ifnc \txfm1\()_\txfm2,idct_idct
        @ iadst4_12 needs q4-q7
        vpush           {q4-q7}
.endif
.endif

        vmov.i32        q14, #0
        vmov.i32        q15, #0
.ifc \txfm1\()_\txfm2,idct_idct
        cmp             r3, #1
        bne             1f
        @ DC-only for idct/idct
        @ The single coefficient is scaled by d0[0] twice (once per
        @ pass), cleared in the block and broadcast to all 16 outputs.
        vld1.32         {d4[]}, [r2,:32]
        vmull.s32       q2, d4, d0[0]
        vrshrn.s64      d4, q2, #14
        vmull.s32       q2, d4, d0[0]
        vrshrn.s64      d4, q2, #14
        vst1.32         {d30[0]}, [r2,:32]
        vdup.32         q2, d4[0]
        vmov            q3, q2
        vmov            q8, q2
        vmov            q9, q2
        b               2f
.endif

1:
        @ Load the four rows into q2, q3, q8, q9, zeroing the block
        @ behind the loads.
        vld1.32         {q2-q3}, [r2,:128]
        vst1.32         {q14-q15}, [r2,:128]!
        vld1.32         {q8-q9}, [r2,:128]

.ifc \txfm1,iwht
        vshr.s32        q2, q2, #2
        vshr.s32        q3, q3, #2
        vshr.s32        q8, q8, #2
        vshr.s32        q9, q9, #2
.endif

        vst1.16         {q14-q15}, [r2,:128]!
        \txfm1\()4_\bpp q2, q3, q8, q9, d4, d5, d6, d7, d16, d17, d18, d19

        @ Transpose 4x4 with 32 bit elements
        vtrn.32         q2, q3
        vtrn.32         q8, q9
        vswp            d5, d16
        vswp            d7, d18

        \txfm2\()4_\bpp q2, q3, q8, q9, d4, d5, d6, d7, d16, d17, d18, d19
2:
        @ q15 = (1 << bpp) - 1 in every 16 bit lane, the upper clamp
        vmvn.u16        q15, #((0xffff << \bpp) & 0xffff)
        vld1.16         {d0}, [r0,:64], r1
        vld1.16         {d1}, [r0,:64], r1
.ifnc \txfm1,iwht
        vrshr.s32       q2, q2, #4
        vrshr.s32       q3, q3, #4
        vrshr.s32       q8, q8, #4
        vrshr.s32       q9, q9, #4
.endif
        vaddw.u16       q2, q2, d0
        vaddw.u16       q3, q3, d1
        vld1.16         {d2}, [r0,:64], r1
        vld1.16         {d3}, [r0,:64], r1
        vqmovun.s32     d0, q2
        vqmovun.s32     d1, q3
        @ Rewind dst by the four rows just loaded
        sub             r0, r0, r1, lsl #2

        vaddw.u16       q8, q8, d2
        vmin.u16        q0, q0, q15
        vaddw.u16       q9, q9, d3
        vst1.16         {d0}, [r0,:64], r1
        vqmovun.s32     d2, q8
        vqmovun.s32     d3, q9
        vmin.u16        q1, q1, q15

        vst1.16         {d1}, [r0,:64], r1
        vst1.16         {d2}, [r0,:64], r1
        vst1.16         {d3}, [r0,:64], r1

.if \bpp > 10
.ifnc \txfm1\()_\txfm2,idct_idct
        vpop            {q4-q7}
.endif
.endif
        bx              lr
endfunc
.endm

@ Instantiate all five 4x4 transform combinations for one bit depth.
.macro itxfm_funcs4x4 bpp
itxfm_func4x4 idct,  idct,  \bpp
itxfm_func4x4 iadst, idct,  \bpp
itxfm_func4x4 idct,  iadst, \bpp
itxfm_func4x4 iadst, iadst, \bpp
itxfm_func4x4 iwht,  iwht,  \bpp
.endm

itxfm_funcs4x4 10
itxfm_funcs4x4 12

@ 8-point idct on q8-q15, in place (q8 = out[0] ... q15 = out[7] on exit,
@ per the trailing comments). Reads coefficients from q0-q1 and uses
@ q2-q5 as scratch.
.macro idct8
        dmbutterfly0    d16, d17, d24, d25, q8, q12, q2, q4, d4, d5, d8, d9, q3, q2, q5, q4 @ q8 = t0a, q12 = t1a
        dmbutterfly     d20, d21, d28, d29, d1[0], d1[1], q2, q3, q4, q5 @ q10 = t2a, q14 = t3a
        dmbutterfly     d18, d19, d30, d31, d2[0], d2[1], q2, q3, q4, q5 @ q9 = t4a, q15 = t7a
        dmbutterfly     d26, d27, d22, d23, d3[0], d3[1], q2, q3, q4, q5 @ q13 = t5a, q11 = t6a

        butterfly       q2, q14, q8, q14 @ q2 = t0, q14 = t3
        butterfly       q3, q10, q12, q10 @ q3 = t1, q10 = t2
        butterfly       q4, q13, q9, q13 @ q4 = t4, q13 = t5a
        butterfly       q5, q11, q15, q11 @ q5 = t7, q11 = t6a

        butterfly       q8, q15, q2, q5 @ q8 = out[0], q15 = out[7]

        dmbutterfly0    d4, d5, d10, d11, q11, q13, q9, q13, d18, d19, d26, d27, q2, q5, q11, q12 @ q2 = t6, q5 = t5

        butterfly       q11, q12, q14, q4 @ q11 = out[3], q12 = out[4]
        butterfly       q9, q14, q3, q2 @ q9 = out[1], q14 = out[6]
        butterfly_r     q13, q10, q10, q5 @ q13 = out[5], q10 = out[2]
.endm

@ 8-point iadst on q8-q15, in place. Loads its own coefficients: first
@ the iadst8 ones into q0-q1, later the first 8 idct ones, so q0-q1 hold
@ idct coefficients on exit. Uses q2-q7 as scratch and clobbers r12.
.macro iadst8
        movrel          r12, iadst8_coeffs
        vld1.16         {q1}, [r12,:128]!
        vmovl.s16       q0, d2
        vmovl.s16       q1, d3

        dmbutterfly_l   q4, q5, q2, q3, d30, d31, d16, d17, d0[1], d0[0] @ q4,q5 = t1a, q2,q3 = t0a
        dmbutterfly_l   q8, q15, q6, q7, d22, d23, d24, d25, d2[1], d2[0] @ q8,q15 = t5a, q6,q7 = t4a

        dbutterfly_n    d22, d23, d4, d5, q2, q3, q6, q7, q11, q12, q2, q3 @ q11 = t0, q2 = t4

        dbutterfly_n    d24, d25, d6, d7, q4, q5, q8, q15, q12, q3, q6, q7 @ q12 = t1, q3 = t5

        dmbutterfly_l   q6, q7, q4, q5, d26, d27, d20, d21, d1[1], d1[0] @ q6,q7 = t3a, q4,q5 = t2a
        dmbutterfly_l   q10, q13, q8, q15, d18, d19, d28, d29, d3[1], d3[0] @ q10,q13 = t7a, q8,q15 = t6a

        dbutterfly_n    d18, d19, d8, d9, q4, q5, q8, q15, q9, q14, q4, q5 @ q9 = t2, q4 = t6
        dbutterfly_n    d16, d17, d12, d13, q6, q7, q10, q13, q8, q15, q6, q7 @ q8 = t3, q6 = t7

        movrel          r12, idct_coeffs
        vld1.16         {q0}, [r12,:128]
        vmovl.s16       q1, d1
        vmovl.s16       q0, d0

        butterfly       q15, q12, q12, q8 @ q15 = -out[7], q12 = t3
        vneg.s32        q15, q15 @ q15 = out[7]
        butterfly       q8, q9, q11, q9 @ q8 = out[0], q9 = t2

        dmbutterfly_l   q10, q11, q5, q7, d4, d5, d6, d7, d1[0], d1[1] @ q10,q11 = t5a, q5,q7 = t4a
        dmbutterfly_l   q2, q3, q13, q14, d12, d13, d8, d9, d1[1], d1[0] @ q2,q3 = t6a, q13,q14 = t7a

        dbutterfly_n    d28, d29, d8, d9, q10, q11, q13, q14, q4, q6, q10, q11 @ q14 = out[6], q4 = t7

        dmbutterfly0    d22, d23, d24, d25, q9, q12, q6, q13, d12, d13, d26, d27, q9, q10 @ q11 = -out[3], q12 = out[4]
        vneg.s32        q11, q11 @ q11 = out[3]

        dbutterfly_n    d18, d19, d4, d5, q5, q7, q2, q3, q9, q10, q2, q3 @ q9 = -out[1], q2 = t6
        vneg.s32        q9, q9 @ q9 = out[1]

        dmbutterfly0    d20, d21, d26, d27, q2, q4, q3, q5, d6, d7, d10, d11, q6, q7 @ q10 = out[2], q13 = -out[5]
        vneg.s32        q13, q13 @ q13 = out[5]
.endm

@ DC-only 8x8 idct + add: scale the single coefficient at [r2] by d0[0]
@ twice, clear it in the block, and add the rounded result to 8 rows of
@ 8 pixels (16 bit each) at r0 with stride r1.
@ r8 is broadcast into q15 and used as the upper clamp; it is not set in
@ this function - presumably the caller loads it with the maximum pixel
@ value (TODO confirm against the caller).
@ The function returns with pop {r4-r8,pc} without a matching push here,
@ so it presumably must only be entered by a branch from code that has
@ already pushed {r4-r8,lr} (TODO confirm against the caller).
function idct8x8_dc_add_neon
        movrel          r12, idct_coeffs
        vld1.16         {d0}, [r12,:64]

        vmov.i32        q2, #0
        vmovl.s16       q0, d0

        vld1.32         {d16[]}, [r2,:32]
        vmull.s32       q8, d16, d0[0]
        vrshrn.s64      d16, q8, #14
        vmull.s32       q8, d16, d0[0]
        vrshrn.s64      d16, q8, #14
        vdup.32         q8, d16[0]
        vst1.32         {d4[0]}, [r2,:32]

        vrshr.s32       q8, q8, #5
        vdup.s16        q15, r8

        @ r0 is the load pointer, r3 the store pointer; two rows are
        @ handled per iteration.
        mov             r3, r0
        mov             r12, #8
1:
        @ Loop to add the constant from q8 into all 8x8 outputs
        subs            r12, r12, #2
        vld1.16         {q2}, [r0,:128], r1
        vaddw.u16       q10, q8, d4
        vld1.16         {q3}, [r0,:128], r1
        vaddw.u16       q11, q8, d5
        vaddw.u16       q12, q8, d6
        vaddw.u16       q13, q8, d7
        vqmovun.s32     d4, q10
        vqmovun.s32     d5, q11
        vqmovun.s32     d6, q12
        vqmovun.s32     d7, q13
        vmin.u16        q2, q2, q15
        vst1.16         {q2}, [r3,:128], r1
        vmin.u16        q3, q3, q15
        vst1.16         {q3}, [r3,:128], r1
        bne             1b

        pop             {r4-r8,pc}
endfunc
.ltorg

.macro itxfm8_1d_funcs txfm
@ Read a vertical 4x8 slice out of a 8x8 matrix, do a transform on it,
@ transpose into a horizontal 8x4 slice and store.
@ r0 = dst (temp buffer)
@ r1 = slice offset
@ r2 = src
function \txfm\()8_1d_4x8_pass1_neon
        mov             r12, #32
        vmov.s32        q2, #0
.irp i, 8, 9, 10, 11, 12, 13, 14, 15
        vld1.32         {q\i}, [r2,:128]
        vst1.32         {q2}, [r2,:128], r12
.endr

        \txfm\()8

        @ Do two 4x4 transposes. Originally, q8-q15 contain the
        @ 8 rows. Afterwards, q8-q11, q12-q15 contain the transposed
        @ 4x4 blocks.
        transpose32_q_2x_4x4 q8, q9, q10, q11, q12, q13, q14, q15, d16, d17, d18, d19, d20, d21, d22, d23, d24, d25, d26, d27, d28, d29, d30, d31

        @ Store the transposed 4x4 blocks horizontally.
        cmp             r1, #4
        beq             1f
.irp i, 8, 12, 9, 13, 10, 14, 11, 15
        vst1.32         {q\i}, [r0,:128]!
.endr
        bx              lr
1:
        @ Special case: For the last input column (r1 == 4),
        @ which would be stored as the last row in the temp buffer,
        @ don't store the first 4x4 block, but keep it in registers
        @ for the first slice of the second pass (where it is the
        @ last 4x4 block).
.irp i, 12, 13, 14, 15
        add             r0, r0, #16
        vst1.32         {q\i}, [r0,:128]!
.endr
        vmov            q12, q8
        vmov            q13, q9
        vmov            q14, q10
        vmov            q15, q11
        bx              lr
endfunc

@ Read a vertical 4x8 slice out of a 8x8 matrix, do a transform on it,
@ load the destination pixels (from a similar 4x8 slice), add and store back.
@ r0 = dst
@ r1 = dst stride
@ r2 = src (temp buffer)
@ r3 = slice offset
function \txfm\()8_1d_4x8_pass2_neon
        mov             r12, #32
.irp i, 8, 9, 10, 11
        vld1.32         {q\i}, [r2,:128], r12
.endr
        cmp             r3, #0
        beq             1f
.irp i, 12, 13, 14, 15
        vld1.32         {q\i}, [r2,:128], r12
.endr
1:

        add             r3, r0, r1
        lsl             r1, r1, #1
        \txfm\()8

        vdup.s16        q4, r8
.macro load_add_store coef0, coef1, coef2, coef3
        vld1.16         {d4}, [r0,:64], r1
        vld1.16         {d5}, [r3,:64], r1
        vld1.16         {d6}, [r0,:64], r1
        vld1.16         {d7}, [r3,:64], r1

        vrshr.s32       \coef0, \coef0, #5
        vrshr.s32       \coef1, \coef1, #5
        vrshr.s32       \coef2, \coef2, #5
        vrshr.s32       \coef3, \coef3, #5

        vaddw.u16       \coef0, \coef0, d4
        vaddw.u16       \coef1, \coef1, d5
        vaddw.u16       \coef2, \coef2, d6
        vaddw.u16       \coef3, \coef3, d7

        sub             r0, r0, r1, lsl #1
        sub             r3, r3, r1, lsl #1

        vqmovun.s32     d4, \coef0
        vqmovun.s32     d5, \coef1
        vqmovun.s32     d6, \coef2
vqmovun.s32 d7, \coef3 665cabdff1aSopenharmony_ci 666cabdff1aSopenharmony_ci vmin.u16 q2, q2, q4 667cabdff1aSopenharmony_ci vmin.u16 q3, q3, q4 668cabdff1aSopenharmony_ci 669cabdff1aSopenharmony_ci vst1.16 {d4}, [r0,:64], r1 670cabdff1aSopenharmony_ci vst1.16 {d5}, [r3,:64], r1 671cabdff1aSopenharmony_ci vst1.16 {d6}, [r0,:64], r1 672cabdff1aSopenharmony_ci vst1.16 {d7}, [r3,:64], r1 673cabdff1aSopenharmony_ci.endm 674cabdff1aSopenharmony_ci load_add_store q8, q9, q10, q11 675cabdff1aSopenharmony_ci load_add_store q12, q13, q14, q15 676cabdff1aSopenharmony_ci.purgem load_add_store 677cabdff1aSopenharmony_ci 678cabdff1aSopenharmony_ci bx lr 679cabdff1aSopenharmony_ciendfunc 680cabdff1aSopenharmony_ci.endm 681cabdff1aSopenharmony_ci 682cabdff1aSopenharmony_ciitxfm8_1d_funcs idct 683cabdff1aSopenharmony_ciitxfm8_1d_funcs iadst 684cabdff1aSopenharmony_ci 685cabdff1aSopenharmony_ci.macro itxfm_func8x8 txfm1, txfm2 686cabdff1aSopenharmony_cifunction vp9_\txfm1\()_\txfm2\()_8x8_add_16_neon 687cabdff1aSopenharmony_ci.ifc \txfm1\()_\txfm2,idct_idct 688cabdff1aSopenharmony_ci cmp r3, #1 689cabdff1aSopenharmony_ci beq idct8x8_dc_add_neon 690cabdff1aSopenharmony_ci.endif 691cabdff1aSopenharmony_ci.ifnc \txfm1\()_\txfm2,idct_idct 692cabdff1aSopenharmony_ci vpush {q4-q7} 693cabdff1aSopenharmony_ci.else 694cabdff1aSopenharmony_ci vpush {q4-q5} 695cabdff1aSopenharmony_ci.endif 696cabdff1aSopenharmony_ci 697cabdff1aSopenharmony_ci @ Align the stack, allocate a temp buffer 698cabdff1aSopenharmony_ciT mov r7, sp 699cabdff1aSopenharmony_ciT and r7, r7, #15 700cabdff1aSopenharmony_ciA and r7, sp, #15 701cabdff1aSopenharmony_ci add r7, r7, #256 702cabdff1aSopenharmony_ci sub sp, sp, r7 703cabdff1aSopenharmony_ci 704cabdff1aSopenharmony_ci mov r4, r0 705cabdff1aSopenharmony_ci mov r5, r1 706cabdff1aSopenharmony_ci mov r6, r2 707cabdff1aSopenharmony_ci 708cabdff1aSopenharmony_ci.ifc \txfm1,idct 709cabdff1aSopenharmony_ci movrel r12, idct_coeffs 710cabdff1aSopenharmony_ci vld1.16 {q0}, 
[r12,:128] 711cabdff1aSopenharmony_ci vmovl.s16 q1, d1 712cabdff1aSopenharmony_ci vmovl.s16 q0, d0 713cabdff1aSopenharmony_ci.endif 714cabdff1aSopenharmony_ci 715cabdff1aSopenharmony_ci.irp i, 0, 4 716cabdff1aSopenharmony_ci add r0, sp, #(\i*32) 717cabdff1aSopenharmony_ci.ifc \txfm1\()_\txfm2,idct_idct 718cabdff1aSopenharmony_ci.if \i == 4 719cabdff1aSopenharmony_ci cmp r3, #12 720cabdff1aSopenharmony_ci ble 1f 721cabdff1aSopenharmony_ci.endif 722cabdff1aSopenharmony_ci.endif 723cabdff1aSopenharmony_ci mov r1, #\i 724cabdff1aSopenharmony_ci add r2, r6, #(\i*4) 725cabdff1aSopenharmony_ci bl \txfm1\()8_1d_4x8_pass1_neon 726cabdff1aSopenharmony_ci.endr 727cabdff1aSopenharmony_ci.ifc \txfm1\()_\txfm2,idct_idct 728cabdff1aSopenharmony_ci b 3f 729cabdff1aSopenharmony_ci1: 730cabdff1aSopenharmony_ci @ For all-zero slices in pass 1, set q12-q15 to zero, for the in-register 731cabdff1aSopenharmony_ci @ passthrough of coefficients to pass 2 and clear the end of the temp buffer 732cabdff1aSopenharmony_ci vmov.i32 q12, #0 733cabdff1aSopenharmony_ci vmov.i32 q13, #0 734cabdff1aSopenharmony_ci vmov.i32 q14, #0 735cabdff1aSopenharmony_ci vmov.i32 q15, #0 736cabdff1aSopenharmony_ci.rept 4 737cabdff1aSopenharmony_ci vst1.32 {q12-q13}, [r0,:128]! 
738cabdff1aSopenharmony_ci.endr 739cabdff1aSopenharmony_ci3: 740cabdff1aSopenharmony_ci.endif 741cabdff1aSopenharmony_ci.ifc \txfm1\()_\txfm2,iadst_idct 742cabdff1aSopenharmony_ci movrel r12, idct_coeffs 743cabdff1aSopenharmony_ci vld1.16 {q0}, [r12,:128] 744cabdff1aSopenharmony_ci vmovl.s16 q1, d1 745cabdff1aSopenharmony_ci vmovl.s16 q0, d0 746cabdff1aSopenharmony_ci.endif 747cabdff1aSopenharmony_ci.irp i, 0, 4 748cabdff1aSopenharmony_ci add r0, r4, #(\i*2) 749cabdff1aSopenharmony_ci mov r1, r5 750cabdff1aSopenharmony_ci add r2, sp, #(\i*4) 751cabdff1aSopenharmony_ci mov r3, #\i 752cabdff1aSopenharmony_ci bl \txfm2\()8_1d_4x8_pass2_neon 753cabdff1aSopenharmony_ci.endr 754cabdff1aSopenharmony_ci 755cabdff1aSopenharmony_ci add sp, sp, r7 756cabdff1aSopenharmony_ci.ifnc \txfm1\()_\txfm2,idct_idct 757cabdff1aSopenharmony_ci vpop {q4-q7} 758cabdff1aSopenharmony_ci.else 759cabdff1aSopenharmony_ci vpop {q4-q5} 760cabdff1aSopenharmony_ci.endif 761cabdff1aSopenharmony_ci pop {r4-r8,pc} 762cabdff1aSopenharmony_ciendfunc 763cabdff1aSopenharmony_ci 764cabdff1aSopenharmony_cifunction ff_vp9_\txfm1\()_\txfm2\()_8x8_add_10_neon, export=1 765cabdff1aSopenharmony_ci push {r4-r8,lr} 766cabdff1aSopenharmony_ci movw r8, #0x03ff 767cabdff1aSopenharmony_ci b vp9_\txfm1\()_\txfm2\()_8x8_add_16_neon 768cabdff1aSopenharmony_ciendfunc 769cabdff1aSopenharmony_ci 770cabdff1aSopenharmony_cifunction ff_vp9_\txfm1\()_\txfm2\()_8x8_add_12_neon, export=1 771cabdff1aSopenharmony_ci push {r4-r8,lr} 772cabdff1aSopenharmony_ci movw r8, #0x0fff 773cabdff1aSopenharmony_ci b vp9_\txfm1\()_\txfm2\()_8x8_add_16_neon 774cabdff1aSopenharmony_ciendfunc 775cabdff1aSopenharmony_ci.endm 776cabdff1aSopenharmony_ci 777cabdff1aSopenharmony_ciitxfm_func8x8 idct, idct 778cabdff1aSopenharmony_ciitxfm_func8x8 iadst, idct 779cabdff1aSopenharmony_ciitxfm_func8x8 idct, iadst 780cabdff1aSopenharmony_ciitxfm_func8x8 iadst, iadst 781cabdff1aSopenharmony_ci 782cabdff1aSopenharmony_cifunction idct16x16_dc_add_neon 
783cabdff1aSopenharmony_ci movrel r12, idct_coeffs 784cabdff1aSopenharmony_ci vld1.16 {d0}, [r12,:64] 785cabdff1aSopenharmony_ci 786cabdff1aSopenharmony_ci vmov.i32 q2, #0 787cabdff1aSopenharmony_ci vmovl.s16 q0, d0 788cabdff1aSopenharmony_ci 789cabdff1aSopenharmony_ci vld1.32 {d16[]}, [r2,:32] 790cabdff1aSopenharmony_ci vmull.s32 q8, d16, d0[0] 791cabdff1aSopenharmony_ci vrshrn.s64 d16, q8, #14 792cabdff1aSopenharmony_ci vmull.s32 q8, d16, d0[0] 793cabdff1aSopenharmony_ci vrshrn.s64 d16, q8, #14 794cabdff1aSopenharmony_ci vdup.32 q8, d16[0] 795cabdff1aSopenharmony_ci vst1.32 {d4[0]}, [r2,:32] 796cabdff1aSopenharmony_ci 797cabdff1aSopenharmony_ci vrshr.s32 q8, q8, #6 798cabdff1aSopenharmony_ci vdup.s16 q15, r9 799cabdff1aSopenharmony_ci 800cabdff1aSopenharmony_ci mov r3, r0 801cabdff1aSopenharmony_ci mov r12, #16 802cabdff1aSopenharmony_ci1: 803cabdff1aSopenharmony_ci @ Loop to add the constant from q8 into all 16x16 outputs 804cabdff1aSopenharmony_ci subs r12, r12, #2 805cabdff1aSopenharmony_ci vld1.16 {q0-q1}, [r0,:128], r1 806cabdff1aSopenharmony_ci vaddw.u16 q9, q8, d0 807cabdff1aSopenharmony_ci vaddw.u16 q10, q8, d1 808cabdff1aSopenharmony_ci vld1.16 {q2-q3}, [r0,:128], r1 809cabdff1aSopenharmony_ci vaddw.u16 q11, q8, d2 810cabdff1aSopenharmony_ci vaddw.u16 q12, q8, d3 811cabdff1aSopenharmony_ci vaddw.u16 q13, q8, d4 812cabdff1aSopenharmony_ci vaddw.u16 q14, q8, d5 813cabdff1aSopenharmony_ci vqmovun.s32 d0, q9 814cabdff1aSopenharmony_ci vaddw.u16 q9, q8, d6 815cabdff1aSopenharmony_ci vqmovun.s32 d1, q10 816cabdff1aSopenharmony_ci vaddw.u16 q10, q8, d7 817cabdff1aSopenharmony_ci vqmovun.s32 d2, q11 818cabdff1aSopenharmony_ci vqmovun.s32 d3, q12 819cabdff1aSopenharmony_ci vqmovun.s32 d4, q13 820cabdff1aSopenharmony_ci vqmovun.s32 d5, q14 821cabdff1aSopenharmony_ci vmin.u16 q0, q0, q15 822cabdff1aSopenharmony_ci vmin.u16 q1, q1, q15 823cabdff1aSopenharmony_ci vqmovun.s32 d6, q9 824cabdff1aSopenharmony_ci vqmovun.s32 d7, q10 825cabdff1aSopenharmony_ci vst1.16 
{q0-q1}, [r3,:128], r1 826cabdff1aSopenharmony_ci vmin.u16 q2, q2, q15 827cabdff1aSopenharmony_ci vmin.u16 q3, q3, q15 828cabdff1aSopenharmony_ci vst1.16 {q2-q3}, [r3,:128], r1 829cabdff1aSopenharmony_ci bne 1b 830cabdff1aSopenharmony_ci 831cabdff1aSopenharmony_ci pop {r4-r9,pc} 832cabdff1aSopenharmony_ciendfunc 833cabdff1aSopenharmony_ci.ltorg 834cabdff1aSopenharmony_ci 835cabdff1aSopenharmony_ci.macro idct16_end 836cabdff1aSopenharmony_ci butterfly d18, d11, d8, d11 @ d18 = t0a, d11 = t7a 837cabdff1aSopenharmony_ci butterfly d19, d22, d9, d22 @ d19 = t1a, d22 = t6 838cabdff1aSopenharmony_ci butterfly d8, d26, d20, d26 @ d8 = t2a, d26 = t5 839cabdff1aSopenharmony_ci butterfly d9, d10, d28, d10 @ d9 = t3a, d10 = t4 840cabdff1aSopenharmony_ci butterfly d20, d28, d16, d24 @ d20 = t8a, d28 = t11a 841cabdff1aSopenharmony_ci butterfly d24, d21, d23, d21 @ d24 = t9, d21 = t10 842cabdff1aSopenharmony_ci butterfly d23, d27, d25, d27 @ d23 = t14, d27 = t13 843cabdff1aSopenharmony_ci butterfly d25, d29, d29, d17 @ d25 = t15a, d29 = t12a 844cabdff1aSopenharmony_ci 845cabdff1aSopenharmony_ci mbutterfly0 d27, d21, d27, d21, d16, d30, q8, q15 @ d27 = t13a, d21 = t10a 846cabdff1aSopenharmony_ci mbutterfly0 d29, d28, d29, d28, d16, d30, q8, q15 @ d29 = t12, d28 = t11 847cabdff1aSopenharmony_ci 848cabdff1aSopenharmony_ci vswp d27, d29 @ d27 = t12, d29 = t13a 849cabdff1aSopenharmony_ci vswp d28, d27 @ d28 = t12, d27 = t11 850cabdff1aSopenharmony_ci butterfly d16, d31, d18, d25 @ d16 = out[0], d31 = out[15] 851cabdff1aSopenharmony_ci butterfly d17, d30, d19, d23 @ d17 = out[1], d30 = out[14] 852cabdff1aSopenharmony_ci butterfly_r d25, d22, d22, d24 @ d25 = out[9], d22 = out[6] 853cabdff1aSopenharmony_ci butterfly d23, d24, d11, d20 @ d23 = out[7], d24 = out[8] 854cabdff1aSopenharmony_ci butterfly d18, d29, d8, d29 @ d18 = out[2], d29 = out[13] 855cabdff1aSopenharmony_ci butterfly d19, d28, d9, d28 @ d19 = out[3], d28 = out[12] 856cabdff1aSopenharmony_ci vmov d8, d21 @ d8 = t10a 
857cabdff1aSopenharmony_ci butterfly d20, d27, d10, d27 @ d20 = out[4], d27 = out[11] 858cabdff1aSopenharmony_ci butterfly d21, d26, d26, d8 @ d21 = out[5], d26 = out[10] 859cabdff1aSopenharmony_ci bx lr 860cabdff1aSopenharmony_ci.endm 861cabdff1aSopenharmony_ci 862cabdff1aSopenharmony_cifunction idct16 863cabdff1aSopenharmony_ci mbutterfly0 d16, d24, d16, d24, d8, d10, q4, q5 @ d16 = t0a, d24 = t1a 864cabdff1aSopenharmony_ci mbutterfly d20, d28, d1[0], d1[1], q4, q5 @ d20 = t2a, d28 = t3a 865cabdff1aSopenharmony_ci mbutterfly d18, d30, d2[0], d2[1], q4, q5 @ d18 = t4a, d30 = t7a 866cabdff1aSopenharmony_ci mbutterfly d26, d22, d3[0], d3[1], q4, q5 @ d26 = t5a, d22 = t6a 867cabdff1aSopenharmony_ci mbutterfly d17, d31, d4[0], d4[1], q4, q5 @ d17 = t8a, d31 = t15a 868cabdff1aSopenharmony_ci mbutterfly d25, d23, d5[0], d5[1], q4, q5 @ d25 = t9a, d23 = t14a 869cabdff1aSopenharmony_ci mbutterfly d21, d27, d6[0], d6[1], q4, q5 @ d21 = t10a, d27 = t13a 870cabdff1aSopenharmony_ci mbutterfly d29, d19, d7[0], d7[1], q4, q5 @ d29 = t11a, d19 = t12a 871cabdff1aSopenharmony_ci 872cabdff1aSopenharmony_ci butterfly d8, d28, d16, d28 @ d8 = t0, d28 = t3 873cabdff1aSopenharmony_ci butterfly d9, d20, d24, d20 @ d9 = t1, d20 = t2 874cabdff1aSopenharmony_ci butterfly d10, d26, d18, d26 @ d10 = t4, d26 = t5 875cabdff1aSopenharmony_ci butterfly d11, d22, d30, d22 @ d11 = t7, d22 = t6 876cabdff1aSopenharmony_ci butterfly d16, d25, d17, d25 @ d16 = t8, d25 = t9 877cabdff1aSopenharmony_ci butterfly d24, d21, d29, d21 @ d24 = t11, d21 = t10 878cabdff1aSopenharmony_ci butterfly d17, d27, d19, d27 @ d17 = t12, d27 = t13 879cabdff1aSopenharmony_ci butterfly d29, d23, d31, d23 @ d29 = t15, d23 = t14 880cabdff1aSopenharmony_ci 881cabdff1aSopenharmony_ci mbutterfly0 d22, d26, d22, d26, d18, d30, q9, q15 @ d22 = t6a, d26 = t5a 882cabdff1aSopenharmony_ci mbutterfly d23, d25, d1[0], d1[1], q9, q15 @ d23 = t9a, d25 = t14a 883cabdff1aSopenharmony_ci mbutterfly d27, d21, d1[0], d1[1], q9, q15, neg=1 @ 
d27 = t13a, d21 = t10a 884cabdff1aSopenharmony_ci idct16_end 885cabdff1aSopenharmony_ciendfunc 886cabdff1aSopenharmony_ci 887cabdff1aSopenharmony_cifunction idct16_half 888cabdff1aSopenharmony_ci mbutterfly0_h d16, d24, d16, d24, d8, d10, q4, q5 @ d16 = t0a, d24 = t1a 889cabdff1aSopenharmony_ci mbutterfly_h1 d20, d28, d1[0], d1[1], q4, q5 @ d20 = t2a, d28 = t3a 890cabdff1aSopenharmony_ci mbutterfly_h1 d18, d30, d2[0], d2[1], q4, q5 @ d18 = t4a, d30 = t7a 891cabdff1aSopenharmony_ci mbutterfly_h2 d26, d22, d3[0], d3[1], q4, q5 @ d26 = t5a, d22 = t6a 892cabdff1aSopenharmony_ci mbutterfly_h1 d17, d31, d4[0], d4[1], q4, q5 @ d17 = t8a, d31 = t15a 893cabdff1aSopenharmony_ci mbutterfly_h2 d25, d23, d5[0], d5[1], q4, q5 @ d25 = t9a, d23 = t14a 894cabdff1aSopenharmony_ci mbutterfly_h1 d21, d27, d6[0], d6[1], q4, q5 @ d21 = t10a, d27 = t13a 895cabdff1aSopenharmony_ci mbutterfly_h2 d29, d19, d7[0], d7[1], q4, q5 @ d29 = t11a, d19 = t12a 896cabdff1aSopenharmony_ci 897cabdff1aSopenharmony_ci butterfly d8, d28, d16, d28 @ d8 = t0, d28 = t3 898cabdff1aSopenharmony_ci butterfly d9, d20, d24, d20 @ d9 = t1, d20 = t2 899cabdff1aSopenharmony_ci butterfly d10, d26, d18, d26 @ d10 = t4, d26 = t5 900cabdff1aSopenharmony_ci butterfly d11, d22, d30, d22 @ d11 = t7, d22 = t6 901cabdff1aSopenharmony_ci butterfly d16, d25, d17, d25 @ d16 = t8, d25 = t9 902cabdff1aSopenharmony_ci butterfly d24, d21, d29, d21 @ d24 = t11, d21 = t10 903cabdff1aSopenharmony_ci butterfly d17, d27, d19, d27 @ d17 = t12, d27 = t13 904cabdff1aSopenharmony_ci butterfly d29, d23, d31, d23 @ d29 = t15, d23 = t14 905cabdff1aSopenharmony_ci 906cabdff1aSopenharmony_ci mbutterfly0 d22, d26, d22, d26, d18, d30, q9, q15 @ d22 = t6a, d26 = t5a 907cabdff1aSopenharmony_ci mbutterfly d23, d25, d1[0], d1[1], q9, q15 @ d23 = t9a, d25 = t14a 908cabdff1aSopenharmony_ci mbutterfly d27, d21, d1[0], d1[1], q9, q15, neg=1 @ d27 = t13a, d21 = t10a 909cabdff1aSopenharmony_ci idct16_end 910cabdff1aSopenharmony_ciendfunc 
@ Variant of idct16 used when only the first 4 input rows (d16-d19) are
@ loaded (see the _quarter pass functions). The first stage is written
@ out as plain multiplies of those four rows by the coefficients in
@ q0-q3; no other input register is read before it is overwritten.
@ Out: d16-d31 = out[0]..out[15] (via idct16_end, which also returns).
function idct16_quarter
        vmov.s64        q12, #0
        vmull.s32       q4, d17, d4[0]
        vmull.s32       q5, d18, d2[1]
        vmull.s32       q15, d18, d2[0]
        vmlsl.s32       q12, d19, d7[1]          @ q12 = -(d19 * d7[1])
        vmull.s32       q14, d17, d4[1]
        vmull.s32       q13, d19, d7[0]
        vmull.s32       q11, d16, d0[0]
        vrshrn.s64      d16, q4, #14
        vrshrn.s64      d11, q5, #14
        vrshrn.s64      d10, q15, #14
        vrshrn.s64      d24, q12, #14
        vrshrn.s64      d29, q14, #14
        vrshrn.s64      d17, q13, #14
        vrshrn.s64      d28, q11, #14

        mbutterfly_l    q10, q11, d17, d24, d1[0], d1[1], neg=1
        mbutterfly_l    q9, q15, d29, d16, d1[0], d1[1]
        vrshrn.s64      d27, q10, #14
        vrshrn.s64      d21, q11, #14
        vrshrn.s64      d23, q9, #14
        vrshrn.s64      d25, q15, #14
        vmov            d8, d28
        vmov            d9, d28
        mbutterfly0     d22, d26, d11, d10, d18, d30, q9, q15
        vmov            d20, d28
        idct16_end
endfunc

@ 16-point inverse ADST on two columns.
@ In:  d16-d31 = the 16 input rows
@ Out: d16-d31 = out[0]..out[15]
@ Loads its own coefficients: iadst16_coeffs in two halves, then the
@ first 8 idct_coeffs, which are what q0/q1 hold on return.
@ Clobbers r12 and q0-q7.
function iadst16
        movrel          r12, iadst16_coeffs
        vld1.16         {q0}, [r12,:128]!
        vmovl.s16       q1, d1
        vmovl.s16       q0, d0

        mbutterfly_l    q3, q2, d31, d16, d0[1], d0[0] @ q3 = t1, q2 = t0
        mbutterfly_l    q5, q4, d23, d24, d2[1], d2[0] @ q5 = t9, q4 = t8
        butterfly_n     d31, d24, q3, q5, q6, q5 @ d31 = t1a, d24 = t9a
        mbutterfly_l    q7, q6, d29, d18, d1[1], d1[0] @ q7 = t3, q6 = t2
        butterfly_n     d16, d23, q2, q4, q3, q4 @ d16 = t0a, d23 = t8a
        mbutterfly_l    q3, q2, d21, d26, d3[1], d3[0] @ q3 = t11, q2 = t10

        @ Second half of iadst16_coeffs
        vld1.16         {q0}, [r12,:128]!
        butterfly_n     d29, d26, q7, q3, q4, q3 @ d29 = t3a, d26 = t11a
        vmovl.s16       q1, d1
        vmovl.s16       q0, d0
        mbutterfly_l    q5, q4, d27, d20, d0[1], d0[0] @ q5 = t5, q4 = t4
        butterfly_n     d18, d21, q6, q2, q3, q2 @ d18 = t2a, d21 = t10a

        mbutterfly_l    q7, q6, d19, d28, d2[1], d2[0] @ q7 = t13, q6 = t12
        butterfly_n     d20, d28, q5, q7, q2, q7 @ d20 = t5a, d28 = t13a
        mbutterfly_l    q3, q2, d25, d22, d1[1], d1[0] @ q3 = t7, q2 = t6
        butterfly_n     d27, d19, q4, q6, q5, q6 @ d27 = t4a, d19 = t12a

        mbutterfly_l    q5, q4, d17, d30, d3[1], d3[0] @ q5 = t15, q4 = t14
        @ The remaining stages use the idct coefficients
        movrel          r12, idct_coeffs
        vld1.16         {q0}, [r12,:128]
        vmovl.s16       q1, d1
        vmovl.s16       q0, d0
        butterfly_n     d22, d30, q3, q5, q6, q5 @ d22 = t7a, d30 = t15a
        mbutterfly_l    q7, q6, d23, d24, d2[0], d2[1] @ q7 = t9, q6 = t8
        butterfly_n     d25, d17, q2, q4, q3, q4 @ d25 = t6a, d17 = t14a

        mbutterfly_l    q2, q3, d28, d19, d2[1], d2[0] @ q2 = t12, q3 = t13
        butterfly_n     d23, d19, q6, q2, q4, q2 @ d23 = t8a, d19 = t12a
        mbutterfly_l    q5, q4, d21, d26, d3[0], d3[1] @ q5 = t11, q4 = t10
        butterfly_r     d4, d27, d16, d27        @ d4 = t4, d27 = t0
        butterfly_n     d24, d28, q7, q3, q6, q3 @ d24 = t9a, d28 = t13a

        mbutterfly_l    q6, q7, d30, d17, d3[1], d3[0] @ q6 = t14, q7 = t15
        butterfly_r     d5, d20, d31, d20        @ d5 = t5, d20 = t1
        butterfly_n     d21, d17, q4, q6, q3, q6 @ d21 = t10a, d17 = t14a
        butterfly_n     d26, d30, q5, q7, q4, q7 @ d26 = t11a, d30 = t15a

        butterfly_r     d6, d25, d18, d25        @ d6 = t6, d25 = t2
        butterfly_r     d7, d22, d29, d22        @ d7 = t7, d22 = t3

        mbutterfly_l    q5, q4, d19, d28, d1[0], d1[1] @ q5 = t13, q4 = t12
        mbutterfly_l    q6, q7, d30, d17, d1[1], d1[0] @ q6 = t14, q7 = t15

        butterfly_n     d18, d30, q4, q6, q8, q6 @ d18 = out[2], d30 = t14a
        butterfly_n     d29, d17, q5, q7, q6, q7 @ d29 = -out[13], d17 = t15a
        vneg.s32        d29, d29                 @ d29 = out[13]

        mbutterfly_l    q5, q4, d4, d5, d1[0], d1[1] @ q5 = t5a, q4 = t4a
        mbutterfly_l    q6, q7, d7, d6, d1[1], d1[0] @ q6 = t6a, q7 = t7a

        butterfly       d2, d6, d27, d25         @ d2 = out[0], d6 = t2a
        butterfly       d3, d7, d23, d21         @ d3 =-out[1], d7 = t10

        butterfly_n     d19, d31, q4, q6, q2, q4 @ d19 = -out[3], d31 = t6
        vneg.s32        d19, d19                 @ d19 = out[3]
        butterfly_n     d28, d16, q5, q7, q2, q5 @ d28 = out[12], d16 = t7

        butterfly       d5, d8, d20, d22         @ d5 =-out[15],d8 = t3a
        butterfly       d4, d9, d24, d26         @ d4 = out[14],d9 = t11

        mbutterfly0     d23, d24, d6, d8, d10, d11, q6, q7, 1 @ d23 = out[7], d24 = out[8]
        mbutterfly0     d20, d27, d16, d31, d10, d11, q6, q7 @ d20 = out[4], d27 = out[11]
        mbutterfly0     d22, d25, d9, d7, d10, d11, q6, q7 @ d22 = out[6], d25 = out[9]
        mbutterfly0     d21, d26, d30, d17, d10, d11, q6, q7, 1 @ d21 = out[5], d26 = out[10]

        vneg.s32        d31, d5                  @ d31 = out[15]
        vneg.s32        d17, d3                  @ d17 = out[1]

        @ out[0] and out[14] were parked in d2/d4 while d16/d30 were still in use
        vmov            d16, d2
        vmov            d30, d4
        bx              lr
endfunc

@ Instantiates the two 1-D pass functions of the 16x16 transform.
@ txfm   = idct or iadst
@ suffix = empty (all 16 input rows), _half (first 8 rows) or
@          _quarter (first 4 rows); selects both how many rows are
@          loaded and which transform function is called.
.macro itxfm16_1d_funcs txfm, suffix
@ Read a vertical 2x16 slice out of a 16x16 matrix, do a transform on it,
@ transpose into a horizontal 16x2 slice and store.
@ r0 = dst (temp buffer)
@ r2 = src
@ The source coefficients are zeroed as they are read.
function \txfm\()16_1d_2x16_pass1\suffix\()_neon
        push            {lr}

        mov             r12, #64
        vmov.s32        q4, #0
.ifb \suffix
.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
        vld1.32         {d\i}, [r2,:64]
        vst1.32         {d8}, [r2,:64], r12
.endr
.endif
.ifc \suffix,_quarter
.irp i, 16, 17, 18, 19
        vld1.32         {d\i}, [r2,:64]
        vst1.32         {d8}, [r2,:64], r12
.endr
.endif
.ifc \suffix,_half
.irp i, 16, 17, 18, 19, 20, 21, 22, 23
        vld1.32         {d\i}, [r2,:64]
        vst1.32         {d8}, [r2,:64], r12
.endr
.endif

        bl              \txfm\()16\suffix

        @ Do eight 2x2 transposes. Originally, d16-d31 contain the
        @ 16 rows. Afterwards, d16-d17, d18-d19 etc contain the eight
        @ transposed 2x2 blocks.
        transpose32_8x_2x2 d16, d17, d18, d19, d20, d21, d22, d23, d24, d25, d26, d27, d28, d29, d30, d31

        @ Store the transposed 2x2 blocks horizontally.
.irp i, 16, 18, 20, 22, 24, 26, 28, 30, 17, 19, 21, 23, 25, 27, 29, 31
        vst1.32         {d\i}, [r0,:64]!
.endr
        pop             {pc}
endfunc

@ Read a vertical 2x16 slice out of a 16x16 matrix, do a transform on it,
@ load the destination pixels (from a similar 2x16 slice), add and store back.
@ r0 = dst
@ r1 = dst stride
@ r2 = src (temp buffer)
@ r9 = maximum pixel value used for the final clamp
function \txfm\()16_1d_2x16_pass2\suffix\()_neon
        push            {lr}

        mov             r12, #64
.ifb \suffix
.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
        vld1.16         {d\i}, [r2,:64], r12
.endr
.endif
.ifc \suffix,_quarter
        @ Only four rows are consumed by the quarter transform, matching
        @ pass 1; a fifth row (d20) used to be loaded here but was always
        @ overwritten before being read.
.irp i, 16, 17, 18, 19
        vld1.16         {d\i}, [r2,:64], r12
.endr
.endif
.ifc \suffix,_half
.irp i, 16, 17, 18, 19, 20, 21, 22, 23
        vld1.16         {d\i}, [r2,:64], r12
.endr
.endif

        @ r0 walks the even rows, r3 the odd rows, both with a doubled stride.
        add             r3, r0, r1
        lsl             r1, r1, #1
        bl              \txfm\()16\suffix

.macro load_add_store coef0, coef1, coef2, coef3
        vrshr.s32       \coef0, \coef0, #6
        vrshr.s32       \coef1, \coef1, #6

        vld1.32         {d8[]}, [r0,:32], r1
        vld1.32         {d8[1]}, [r3,:32], r1
        vrshr.s32       \coef2, \coef2, #6
        vrshr.s32       \coef3, \coef3, #6
        vld1.32         {d9[]}, [r0,:32], r1
        vld1.32         {d9[1]}, [r3,:32], r1
        vaddw.u16       \coef0, \coef0, d8
        vld1.32         {d10[]}, [r0,:32], r1
        vld1.32         {d10[1]}, [r3,:32], r1
        vaddw.u16       \coef1, \coef1, d9
        vld1.32         {d11[]}, [r0,:32], r1
        vld1.32         {d11[1]}, [r3,:32], r1

        vqmovun.s32     d8, \coef0
        @ q8 becomes the clamp limit; it is only overwritten after the
        @ coefficient it may have held has been narrowed above.
        vdup.s16        q8, r9
        vqmovun.s32     d9, \coef1
        @ Rewind both pointers to the rows that were just loaded.
        sub             r0, r0, r1, lsl #2
        sub             r3, r3, r1, lsl #2
        vaddw.u16       \coef2, \coef2, d10
        vaddw.u16       \coef3, \coef3, d11
        vmin.u16        q4, q4, q8
        vst1.32         {d8[0]}, [r0,:32], r1
        vst1.32         {d8[1]}, [r3,:32], r1
        vqmovun.s32     d10, \coef2
        vst1.32         {d9[0]}, [r0,:32], r1
        vst1.32         {d9[1]}, [r3,:32], r1
        vqmovun.s32     d11, \coef3
        vmin.u16        q5, q5, q8

        vst1.32         {d10[0]}, [r0,:32], r1
        vst1.32         {d10[1]}, [r3,:32], r1
        vst1.32         {d11[0]}, [r0,:32], r1
        vst1.32         {d11[1]}, [r3,:32], r1
.endm
        load_add_store  q8, q9, q10, q11
        load_add_store  q12, q13, q14, q15
.purgem load_add_store

        pop             {pc}
endfunc
.endm

itxfm16_1d_funcs idct
itxfm16_1d_funcs iadst
itxfm16_1d_funcs idct, _quarter
itxfm16_1d_funcs idct, _half
.ltorg

@ This is the minimum eob value for each subpartition, in increments of 2
@ (one entry per 2-column slice handled by pass 1)
const min_eob_idct_idct_16, align=4
        .short  0, 3, 10, 22, 38, 62, 89, 121
endconst

@ Instantiates the 16x16 transform+add for one combination of transforms:
@ txfm1 is used for pass 1, txfm2 for pass 2. Emits a shared worker plus
@ the exported 10 and 12 bit entry points, which only differ in the
@ pixel maximum they place in r9.
@ Worker arguments:
@ r0 = dst
@ r1 = dst stride
@ r2 = coefficient block
@ r3 = eob (compared against the thresholds below in the idct_idct case)
@ r9 = maximum pixel value
.macro itxfm_func16x16 txfm1, txfm2
function vp9_\txfm1\()_\txfm2\()_16x16_add_16_neon
.ifc \txfm1\()_\txfm2,idct_idct
        cmp             r3, #1
        beq             idct16x16_dc_add_neon
.endif
.ifnc \txfm1\()_\txfm2,idct_idct
        vpush           {q4-q7}
.else
        vpush           {q4-q5}
.endif

        @ Align the stack, allocate a temp buffer
        @ (r7 = 1024 + misalignment of sp; it is added back on exit)
T       mov             r7, sp
T       and             r7, r7, #15
A       and             r7, sp, #15
        add             r7, r7, #1024
        sub             sp, sp, r7

        mov             r4, r0
        mov             r5, r1
        mov             r6, r2

.ifc \txfm1,idct
        @ q0-q3 = the first 16 idct coefficients, widened to 32 bit
        movrel          r12, idct_coeffs
        vld1.16         {q0-q1}, [r12,:128]
        vmovl.s16       q2, d2
        vmovl.s16       q3, d3
        vmovl.s16       q1, d1
        vmovl.s16       q0, d0
.endif

.ifc \txfm1\()_\txfm2,idct_idct
        @ Small eob values are handed over (by branch, keeping this
        @ stack frame) to the quarter/half implementations.
        cmp             r3, #10
        ble             idct16x16_quarter_add_16_neon
        cmp             r3, #38
        ble             idct16x16_half_add_16_neon

        @ Start at the second table entry; the first slice is always transformed.
        movrel          r8, min_eob_idct_idct_16 + 2
.endif

.irp i, 0, 2, 4, 6, 8, 10, 12, 14
        add             r0, sp, #(\i*64)
.ifc \txfm1\()_\txfm2,idct_idct
.if \i > 0
        @ If eob is at or below this slice's minimum, skip it and all
        @ later slices; r1 = number of remaining slices to zero-fill.
        ldrh_post       r1, r8, #2
        cmp             r3, r1
        it              le
        movle           r1, #(16 - \i)/2
        ble             1f
.endif
.endif
        add             r2, r6, #(\i*4)
        bl              \txfm1\()16_1d_2x16_pass1_neon
.endr

.ifc \txfm1\()_\txfm2,idct_idct
        b               3f
1:
        vmov.i32        q14, #0
        vmov.i32        q15, #0
2:
        subs            r1, r1, #1
        @ Unroll for 2 lines
.rept 2
        @ Fill one line with zeros
        vst1.32         {q14-q15}, [r0,:128]!
        vst1.32         {q14-q15}, [r0,:128]!
.endr
        bne             2b
3:
.endif

.ifc \txfm1\()_\txfm2,iadst_idct
        @ Pass 1 was the iadst, so load the idct coefficients for pass 2.
        movrel          r12, idct_coeffs
        vld1.16         {q0-q1}, [r12,:128]
        vmovl.s16       q2, d2
        vmovl.s16       q3, d3
        vmovl.s16       q1, d1
        vmovl.s16       q0, d0
.endif
.irp i, 0, 2, 4, 6, 8, 10, 12, 14
        add             r0, r4, #(\i*2)
        mov             r1, r5
        add             r2, sp, #(\i*4)
        bl              \txfm2\()16_1d_2x16_pass2_neon
.endr

        add             sp, sp, r7
.ifnc \txfm1\()_\txfm2,idct_idct
        vpop            {q4-q7}
.else
        vpop            {q4-q5}
.endif
        pop             {r4-r9,pc}
endfunc

function ff_vp9_\txfm1\()_\txfm2\()_16x16_add_10_neon, export=1
        push            {r4-r9,lr}
        movw            r9, #0x03ff
        b               vp9_\txfm1\()_\txfm2\()_16x16_add_16_neon
endfunc

function ff_vp9_\txfm1\()_\txfm2\()_16x16_add_12_neon, export=1
        push            {r4-r9,lr}
        movw            r9, #0x0fff
        b               vp9_\txfm1\()_\txfm2\()_16x16_add_16_neon
endfunc
.endm

itxfm_func16x16 idct, idct
itxfm_func16x16 iadst, idct
itxfm_func16x16 idct, iadst
itxfm_func16x16 iadst, iadst
.ltorg

@ Reduced 16x16 idct_idct variants, size is quarter or half.
@ Reached by a branch from vp9_idct_idct_16x16_add_16_neon after that
@ function has pushed q4-q5, allocated the temp buffer (r7 bytes at sp),
@ saved its arguments in r4-r6 and loaded the idct coefficients; this
@ code releases that frame and returns to the original caller.
@ r3 = eob, r4-r6 = saved r0-r2 of the caller
.macro idct16_partial size
function idct16x16_\size\()_add_16_neon
        @ Pass 1 for the slices at i = 0 and 2. For quarter, the slice
        @ at i = 2 is replaced by zeros when eob <= 3.
.irp i, 0, 2
        add             r0,  sp,  #(\i*64)
.ifc \size,quarter
.if \i == 2
        cmp             r3,  #3
        ble             1f
.endif
.endif
        add             r2,  r6,  #(\i*4)
        bl              idct16_1d_2x16_pass1_\size\()_neon
.endr

.ifc \size,half
        @ Half also does the slices at i = 4 and 6; the one at i = 6
        @ is replaced by zeros when eob <= 22.
.irp i, 4, 6
        add             r0,  sp,  #(\i*64)
.if \i == 6
        cmp             r3,  #22
        ble             1f
.endif
        add             r2,  r6,  #(\i*4)
        bl              idct16_1d_2x16_pass1_\size\()_neon
.endr
.endif

        b               3f
1:
        vmov.i32        q14, #0
        vmov.i32        q15, #0

        @ Unroll for 2 lines
.rept 2
        @ Fill one line with zeros
        vst1.32         {q14-q15}, [r0,:128]!
        vst1.32         {q14-q15}, [r0,:128]!
.endr

3:

        @ Pass 2 over all eight slices
.irp i, 0, 2, 4, 6, 8, 10, 12, 14
        add             r0,  r4,  #(\i*2)
        mov             r1,  r5
        add             r2,  sp,  #(\i*4)
        bl              idct16_1d_2x16_pass2_\size\()_neon
.endr

        @ Undo the frame set up by vp9_idct_idct_16x16_add_16_neon and
        @ its entry point
        add             sp,  sp,  r7
        vpop            {q4-q5}
        pop             {r4-r9,pc}
endfunc
.endm

idct16_partial quarter
idct16_partial half

@ DC only 32x32 case, reached from vp9_idct_idct_32x32_add_16_neon when
@ eob == 1, with only the r4-r9, lr push of the entry point on the stack.
@ r0 = dst (rows are read and written with 128 bit alignment)
@ r1 = dst stride in bytes
@ r2 = coefficients; the first 32 bit value is read and then zeroed
@ r9 = max pixel value used for clamping
@ Clobbers r3, r12, q0-q3, q8-q15.
function idct32x32_dc_add_neon
        movrel          r12, idct_coeffs
        vld1.16         {d0}, [r12,:64]

        vmov.i32        q2,  #0
        vmovl.s16       q0,  d0

        @ Scale the dc coefficient twice by idct_coeffs[0], with a
        @ rounding shift by 14 each time, then write 0 back in its place
        vld1.32         {d16[]}, [r2,:32]
        vmull.s32       q8,  d16, d0[0]
        vrshrn.s64      d16, q8,  #14
        vmull.s32       q8,  d16, d0[0]
        vrshrn.s64      d16, q8,  #14
        vdup.32         q8,  d16[0]
        vst1.32         {d4[0]}, [r2,:32]

        vrshr.s32       q8,  q8,  #6
        vdup.s16        q15, r9

        @ r0 is the read pointer and r3 the write pointer. Each row is
        @ handled as two 32 byte halves; the first access post-increments
        @ by 32, so the second one steps by stride - 32 to the next row.
        mov             r3,  r0
        mov             r12, #32
        sub             r1,  r1,  #32
1:
        @ Loop to add the constant from q8 into all 32x32 outputs
        subs            r12, r12, #1
        vld1.16         {q0-q1},  [r0,:128]!
        vaddw.u16       q9,  q8,  d0
        vaddw.u16       q10, q8,  d1
        vld1.16         {q2-q3},  [r0,:128], r1
        vaddw.u16       q11, q8,  d2
        vaddw.u16       q12, q8,  d3
        vaddw.u16       q13, q8,  d4
        vaddw.u16       q14, q8,  d5
        vqmovun.s32     d0,  q9
        vaddw.u16       q9,  q8,  d6
        vqmovun.s32     d1,  q10
        vaddw.u16       q10, q8,  d7
        vqmovun.s32     d2,  q11
        vqmovun.s32     d3,  q12
        vqmovun.s32     d4,  q13
        vqmovun.s32     d5,  q14
        vmin.u16        q0,  q0,  q15
        vmin.u16        q1,  q1,  q15
        vqmovun.s32     d6,  q9
        vqmovun.s32     d7,  q10
        vst1.16         {q0-q1},  [r3,:128]!
        vmin.u16        q2,  q2,  q15
        vmin.u16        q3,  q3,  q15
        vst1.16         {q2-q3},  [r3,:128], r1
        bne             1b

        @ Matches the push done by the exported entry point
        pop             {r4-r9,pc}
endfunc

@ Common tail of the idct32_odd variants. Expects the idct16 coefficients
@ back in q0-q3 and ends with the return to the caller (bx lr).
.macro idct32_end
        butterfly       d16, d9,  d8,  d9  @ d16 = t16a, d9  = t19a
        butterfly       d17, d20, d23, d20 @ d17 = t17,  d20 = t18
        butterfly       d18, d10, d11, d10 @ d18 = t23a, d10 = t20a
        butterfly       d19, d21, d22, d21 @ d19 = t22,  d21 = t21
        butterfly       d8,  d28, d28, d30 @ d8  = t24a, d28 = t27a
        butterfly       d23, d26, d25, d26 @ d23 = t25,  d26 = t26
        butterfly       d11, d29, d29, d31 @ d11 = t31a, d29 = t28a
        butterfly       d22, d27, d24, d27 @ d22 = t30,  d27 = t29

        mbutterfly      d27, d20, d1[0], d1[1], q12, q15        @ d27 = t18a, d20 = t29a
        mbutterfly      d29, d9,  d1[0], d1[1], q12, q15        @ d29 = t19,  d9  = t28
        mbutterfly      d28, d10, d1[0], d1[1], q12, q15, neg=1 @ d28 = t27,  d10 = t20
        mbutterfly      d26, d21, d1[0], d1[1], q12, q15, neg=1 @ d26 = t26a, d21 = t21a

        butterfly       d31, d24, d11, d8  @ d31 = t31,  d24 = t24
        butterfly       d30, d25, d22, d23 @ d30 = t30a, d25 = t25a
        butterfly_r     d23, d16, d16, d18 @ d23 = t23,  d16 = t16
        butterfly_r     d22, d17, d17, d19 @ d22 = t22a, d17 = t17a
        butterfly       d18, d21, d27, d21 @ d18 = t18,  d21 = t21
        butterfly_r     d27, d28, d9,  d28 @ d27 = t27a, d28 = t28a
        butterfly       d8,  d26, d20, d26 @ d8  = t29,  d26 = t26
        butterfly       d19, d20, d29, d10 @ d19 = t19a, d20 = t20
        vmov            d29, d8            @ d29 = t29

        mbutterfly0     d27, d20, d27, d20, d8, d10, q4, q5 @ d27 = t27,  d20 = t20
        mbutterfly0     d26, d21, d26, d21, d8, d10, q4, q5 @ d26 = t26a, d21 = t21a
        mbutterfly0     d25, d22, d25, d22, d8, d10, q4, q5 @ d25 = t25,  d22 = t22
        mbutterfly0     d24, d23, d24, d23, d8, d10, q4, q5 @ d24 = t24a, d23 = t23a
        bx              lr
.endm

@ Odd half of the 32 point idct, operating on d16-d31 in place.
@ Reads the 16 bit coefficients kept in q6-q7 (d12-d15), uses q0-q5 and
@ r12 as scratch, and leaves the lengthened idct16 coefficients in q0-q3
@ again on return.
function idct32_odd
        movrel          r12, idct_coeffs

        @ Overwrite the idct16 coeffs with the stored ones for idct32
        vmovl.s16       q0,  d12
        vmovl.s16       q1,  d13
        vmovl.s16       q2,  d14
        vmovl.s16       q3,  d15

        mbutterfly      d16, d31, d0[0], d0[1], q4, q5 @ d16 = t16a, d31 = t31a
        mbutterfly      d24, d23, d1[0], d1[1], q4, q5 @ d24 = t17a, d23 = t30a
        mbutterfly      d20, d27, d2[0], d2[1], q4, q5 @ d20 = t18a, d27 = t29a
        mbutterfly      d28, d19, d3[0], d3[1], q4, q5 @ d28 = t19a, d19 = t28a
        mbutterfly      d18, d29, d4[0], d4[1], q4, q5 @ d18 = t20a, d29 = t27a
        mbutterfly      d26, d21, d5[0], d5[1], q4, q5 @ d26 = t21a, d21 = t26a
        mbutterfly      d22, d25, d6[0], d6[1], q4, q5 @ d22 = t22a, d25 = t25a
        mbutterfly      d30, d17, d7[0], d7[1], q4, q5 @ d30 = t23a, d17 = t24a

        @ Reload the idct16 coefficients. We could swap the coefficients between
        @ q0-q3 and q6-q7 by narrowing/lengthening, but that's slower than just
        @ loading and lengthening.
        vld1.16         {q0-q1}, [r12,:128]

        butterfly       d8,  d24, d16, d24 @ d8  = t16, d24 = t17
        butterfly       d9,  d20, d28, d20 @ d9  = t19, d20 = t18
        butterfly       d10, d26, d18, d26 @ d10 = t20, d26 = t21
        butterfly       d11, d22, d30, d22 @ d11 = t23, d22 = t22
        vmovl.s16       q2,  d2
        vmovl.s16       q3,  d3
        vmovl.s16       q1,  d1
        vmovl.s16       q0,  d0
        butterfly       d28, d25, d17, d25 @ d28 = t24, d25 = t25
        butterfly       d30, d21, d29, d21 @ d30 = t27, d21 = t26
        butterfly       d29, d23, d31, d23 @ d29 = t31, d23 = t30
        butterfly       d31, d27, d19, d27 @ d31 = t28, d27 = t29

        mbutterfly      d23, d24, d2[0], d2[1], q8, q9        @ d23 = t17a, d24 = t30a
        mbutterfly      d27, d20, d2[0], d2[1], q8, q9, neg=1 @ d27 = t29a, d20 = t18a
        mbutterfly      d21, d26, d3[0], d3[1], q8, q9        @ d21 = t21a, d26 = t26a
        mbutterfly      d25, d22, d3[0], d3[1], q8, q9, neg=1 @ d25 = t25a, d22 = t22a
        idct32_end
endfunc

@ Same as idct32_odd, except that the first stage uses the
@ mbutterfly_h1/mbutterfly_h2 forms instead of the full mbutterfly.
@ Register use and the state of q0-q3 on return are the same.
function idct32_odd_half
        movrel          r12, idct_coeffs

        @ Lengthen the idct32 coefficients from q6-q7 into q0-q3
        vmovl.s16       q0,  d12
        vmovl.s16       q1,  d13
        vmovl.s16       q2,  d14
        vmovl.s16       q3,  d15

        mbutterfly_h1   d16, d31, d0[0], d0[1], q4, q5 @ d16 = t16a, d31 = t31a
        mbutterfly_h2   d24, d23, d1[0], d1[1], q4, q5 @ d24 = t17a, d23 = t30a
        mbutterfly_h1   d20, d27, d2[0], d2[1], q4, q5 @ d20 = t18a, d27 = t29a
        mbutterfly_h2   d28, d19, d3[0], d3[1], q4, q5 @ d28 = t19a, d19 = t28a
        mbutterfly_h1   d18, d29, d4[0], d4[1], q4, q5 @ d18 = t20a, d29 = t27a
        mbutterfly_h2   d26, d21, d5[0], d5[1], q4, q5 @ d26 = t21a, d21 = t26a
        mbutterfly_h1   d22, d25, d6[0], d6[1], q4, q5 @ d22 = t22a, d25 = t25a
        mbutterfly_h2   d30, d17, d7[0], d7[1], q4, q5 @ d30 = t23a, d17 = t24a

        @ Reload the idct16 coefficients; lengthened further down
        vld1.16         {q0-q1}, [r12,:128]

        butterfly       d8,  d24, d16, d24 @ d8  = t16, d24 = t17
        butterfly       d9,  d20, d28, d20 @ d9  = t19, d20 = t18
        butterfly       d10, d26, d18, d26 @ d10 = t20, d26 = t21
        butterfly       d11, d22, d30, d22 @ d11 = t23, d22 = t22
        vmovl.s16       q2,  d2
        vmovl.s16       q3,  d3
        vmovl.s16       q1,  d1
        vmovl.s16       q0,  d0
        butterfly       d28, d25, d17, d25 @ d28 = t24, d25 = t25
        butterfly       d30, d21, d29, d21 @ d30 = t27, d21 = t26
        butterfly       d29, d23, d31, d23 @ d29 = t31, d23 = t30
        butterfly       d31, d27, d19, d27 @ d31 = t28, d27 = t29

        mbutterfly      d23, d24, d2[0], d2[1], q8, q9        @ d23 = t17a, d24 = t30a
        mbutterfly      d27, d20, d2[0], d2[1], q8, q9, neg=1 @ d27 = t29a, d20 = t18a
        mbutterfly      d21, d26, d3[0], d3[1], q8, q9        @ d21 = t21a, d26 = t26a
        mbutterfly      d25, d22, d3[0], d3[1], q8, q9, neg=1 @ d25 = t25a, d22 = t22a
        idct32_end
endfunc

@ Variant of idct32_odd in which only d16-d19 are read as inputs; the
@ first stage is written out as plain multiplies of those four registers.
@ Reads the coefficients kept in q6-q7, uses r12 as scratch and leaves
@ the lengthened idct16 coefficients in q0-q3 again on return.
function idct32_odd_quarter
        movrel          r12, idct_coeffs

        @ Lengthen the idct32 coefficients from q6-q7 into q0-q3
        vmovl.s16       q0,  d12
        vmovl.s16       q1,  d13
        vmovl.s16       q2,  d14
        vmovl.s16       q3,  d15

        @ Accumulators for the two products that are subtracted from zero
        vmov.s64        q14, #0
        vmov.s64        q5,  #0

        vmull.s32       q4,  d16, d0[0]
        vmlsl.s32       q14, d19, d3[1]
        vmull.s32       q15, d16, d0[1]
        vmull.s32       q11, d17, d7[0]
        vmlsl.s32       q5,  d17, d7[1]
        vmull.s32       q13, d19, d3[0]
        vmull.s32       q10, d18, d4[0]
        vmull.s32       q12, d18, d4[1]

        @ Reload the idct16 coefficients; lengthened further down
        vld1.16         {q0-q1}, [r12,:128]

        vrshrn.s64      d8,  q4,  #14
        vrshrn.s64      d9,  q14, #14
        vrshrn.s64      d29, q15, #14
        vrshrn.s64      d28, q11, #14

        vmovl.s16       q2,  d2
        vmovl.s16       q3,  d3
        vmovl.s16       q1,  d1
        vmovl.s16       q0,  d0

        vrshrn.s64      d11, q5,  #14
        vrshrn.s64      d31, q13, #14
        vrshrn.s64      d10, q10, #14
        vrshrn.s64      d30, q12, #14

        mbutterfly_l    q8,  q9,  d29, d8,  d2[0], d2[1]
        mbutterfly_l    q13, q10, d31, d9,  d2[0], d2[1], neg=1
        vrshrn.s64      d23, q8,  #14
        vrshrn.s64      d24, q9,  #14
        vrshrn.s64      d27, q13, #14
        vrshrn.s64      d20, q10, #14
        mbutterfly_l    q8,  q9,  d30, d10, d3[0], d3[1]
        vrshrn.s64      d21, q8,  #14
        vrshrn.s64      d26, q9,  #14
        mbutterfly_l    q8,  q9,  d28, d11, d3[0], d3[1], neg=1
        vrshrn.s64      d25, q8,  #14
        vrshrn.s64      d22, q9,  #14

        idct32_end
endfunc

@ Generate the two pass functions of the 32x32 idct. suffix is empty,
@ _quarter or _half and selects how many input rows are loaded (16, 4 or
@ 8 per half) and which idct16 and idct32_odd variants are called.
.macro idct32_funcs suffix
@ Do a 32-point IDCT of a 2x32 slice out of a 32x32 matrix.
@ We don't have register space to do a single pass IDCT of 2x32 though,
@ but the 32-point IDCT can be decomposed into two 16-point IDCTs;
@ a normal IDCT16 with every other input component (the even ones, with
@ each output written twice), followed by a separate 16-point IDCT
@ of the odd inputs, added/subtracted onto the outputs of the first idct16.
@ r0 = dst (temp buffer)
@ r1 = unused
@ r2 = src
function idct32_1d_2x32_pass1\suffix\()_neon
        push            {lr}

        @ Double stride of the input, since we only read every other line
        mov             r12, #256
        vmov.s32        d8,  #0

        @ d16 = IN(0), d17 = IN(2) ... d31 = IN(30)
        @ Each input that is read is overwritten with zeros right away
.ifb \suffix
.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
        vld1.32         {d\i}, [r2,:64]
        vst1.32         {d8},  [r2,:64], r12
.endr
.endif
.ifc \suffix,_quarter
.irp i, 16, 17, 18, 19
        vld1.32         {d\i}, [r2,:64]
        vst1.32         {d8},  [r2,:64], r12
.endr
.endif
.ifc \suffix,_half
.irp i, 16, 17, 18, 19, 20, 21, 22, 23
        vld1.32         {d\i}, [r2,:64]
        vst1.32         {d8},  [r2,:64], r12
.endr
.endif

        bl              idct16\suffix

        @ Do eight 2x2 transposes. Originally, d16-d31 contain the
        @ 16 rows. Afterwards, d16-d17, d18-d19 etc contain the eight
        @ transposed 2x2 blocks.
        transpose32_8x_2x2 d16, d17, d18, d19, d20, d21, d22, d23, d24, d25, d26, d27, d28, d29, d30, d31

        @ Store the registers a, b, c, d, e, f, g, h horizontally, followed
        @ by the same registers h, g, f, e, d, c, b, a mirrored.
.macro store_rev a, b, c, d, e, f, g, h
.irp i, \a, \b, \c, \d, \e, \f, \g, \h
        vst1.32         {d\i}, [r0,:64]!
        vrev64.32       d\i, d\i
.endr
.irp i, \h, \g, \f, \e, \d, \c, \b, \a
        vst1.32         {d\i}, [r0,:64]!
.endr
.endm
        store_rev       16, 18, 20, 22, 24, 26, 28, 30
        store_rev       17, 19, 21, 23, 25, 27, 29, 31
        @ Two lines of 128 bytes were written, rewind r0 to the first
        sub             r0,  r0,  #256
.purgem store_rev

        @ Move r2 back to the start of the input, and move
        @ to the first odd row
.ifb \suffix
        sub             r2,  r2,  r12, lsl #4
.endif
.ifc \suffix,_quarter
        sub             r2,  r2,  r12, lsl #2
.endif
.ifc \suffix,_half
        sub             r2,  r2,  r12, lsl #3
.endif
        add             r2,  r2,  #128

        vmov.s32        d8,  #0
        @ d16 = IN(1), d17 = IN(3) ... d31 = IN(31)
        @ The coefficients are 32 bit wide, so use the same element size
        @ as for the even rows above
.ifb \suffix
.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
        vld1.32         {d\i}, [r2,:64]
        vst1.32         {d8},  [r2,:64], r12
.endr
.endif
.ifc \suffix,_quarter
.irp i, 16, 17, 18, 19
        vld1.32         {d\i}, [r2,:64]
        vst1.32         {d8},  [r2,:64], r12
.endr
.endif
.ifc \suffix,_half
.irp i, 16, 17, 18, 19, 20, 21, 22, 23
        vld1.32         {d\i}, [r2,:64]
        vst1.32         {d8},  [r2,:64], r12
.endr
.endif

        bl              idct32_odd\suffix

        transpose32_8x_2x2 d31, d30, d29, d28, d27, d26, d25, d24, d23, d22, d21, d20, d19, d18, d17, d16

        @ Store the registers a, b, c, d, e, f, g, h horizontally,
        @ adding into the output first, and then mirrored, subtracted
        @ from the output.
.macro store_rev a, b, c, d, e, f, g, h
.irp i, \a, \b, \c, \d, \e, \f, \g, \h
        vld1.32         {d8},  [r0,:64]
        vadd.s32        d8,  d8,  d\i
        vst1.32         {d8},  [r0,:64]!
        vrev64.32       d\i, d\i
.endr
.irp i, \h, \g, \f, \e, \d, \c, \b, \a
        vld1.32         {d8},  [r0,:64]
        vsub.s32        d8,  d8,  d\i
        vst1.32         {d8},  [r0,:64]!
.endr
.endm

        store_rev       31, 29, 27, 25, 23, 21, 19, 17
        store_rev       30, 28, 26, 24, 22, 20, 18, 16
.purgem store_rev
        pop             {pc}
endfunc
.ltorg

@ This is mostly the same as 2x32_pass1, but without the transpose,
@ and use the source as temp buffer between the two idct passes, and
@ add into the destination.
@ r0 = dst
@ r1 = dst stride
@ r2 = src (temp buffer)
@ r9 = max pixel value used for clamping the output
function idct32_1d_2x32_pass2\suffix\()_neon
        push            {lr}

        mov             r12, #256
        @ d16 = IN(0), d17 = IN(2) ... d31 = IN(30)
        @ r2 is rewound to the first even row after the loads
.ifb \suffix
.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
        vld1.32         {d\i}, [r2,:64], r12
.endr
        sub             r2,  r2,  r12, lsl #4
.endif
.ifc \suffix,_quarter
.irp i, 16, 17, 18, 19
        vld1.32         {d\i}, [r2,:64], r12
.endr
        sub             r2,  r2,  r12, lsl #2
.endif
.ifc \suffix,_half
.irp i, 16, 17, 18, 19, 20, 21, 22, 23
        vld1.32         {d\i}, [r2,:64], r12
.endr
        sub             r2,  r2,  r12, lsl #3
.endif

        bl              idct16\suffix

        @ Park the 16 outputs of the even half in the even rows of the
        @ temp buffer
.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
        vst1.32         {d\i}, [r2,:64], r12
.endr

        @ Rewind and step to the first odd row
        sub             r2,  r2,  r12, lsl #4
        add             r2,  r2,  #128

        @ d16 = IN(1), d17 = IN(3) ... d31 = IN(31)
.ifb \suffix
.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
        vld1.32         {d\i}, [r2,:64], r12
.endr
        sub             r2,  r2,  r12, lsl #4
.endif
.ifc \suffix,_quarter
.irp i, 16, 17, 18, 19
        vld1.32         {d\i}, [r2,:64], r12
.endr
        sub             r2,  r2,  r12, lsl #2
.endif
.ifc \suffix,_half
.irp i, 16, 17, 18, 19, 20, 21, 22, 23
        vld1.32         {d\i}, [r2,:64], r12
.endr
        sub             r2,  r2,  r12, lsl #3
.endif
        @ Back to the first even row, where the parked outputs start
        sub             r2,  r2,  #128

        bl              idct32_odd\suffix

        @ Narrow the idct16 coefficients in q0-q3 into q0-q1, to
        @ allow clobbering q2-q3 below.
        vmovn.s32       d0,  q0
        vmovn.s32       d1,  q1
        vmovn.s32       d2,  q2
        vmovn.s32       d3,  q3

        mov             r12, #256
        vdup.s16        q4,  r9
        @ Load four parked outputs from the temp buffer, add (neg=0) or
        @ subtract (neg=1) the registers a-d, round by 6, add the result
        @ to four rows of two destination pixels each, clamp to
        @ [0, r9] and store the pixels back.
.macro load_acc_store a, b, c, d, neg=0
        vld1.32         {d4},  [r2,:64], r12
        vld1.32         {d5},  [r2,:64], r12
.if \neg == 0
        vadd.s32        d4,  d4,  d\a
        vld1.32         {d6},  [r2,:64], r12
        vadd.s32        d5,  d5,  d\b
        vld1.32         {d7},  [r2,:64], r12
        vadd.s32        d6,  d6,  d\c
        vadd.s32        d7,  d7,  d\d
.else
        vsub.s32        d4,  d4,  d\a
        vld1.32         {d6},  [r2,:64], r12
        vsub.s32        d5,  d5,  d\b
        vld1.32         {d7},  [r2,:64], r12
        vsub.s32        d6,  d6,  d\c
        vsub.s32        d7,  d7,  d\d
.endif
        vld1.32         {d10[]},  [r0,:32], r1
        vld1.32         {d10[1]}, [r0,:32], r1
        vrshr.s32       q2,  q2,  #6
        vld1.32         {d11[]},  [r0,:32], r1
        vrshr.s32       q3,  q3,  #6
        vld1.32         {d11[1]}, [r0,:32], r1
        sub             r0,  r0,  r1, lsl #2
        vaddw.u16       q2,  q2,  d10
        vaddw.u16       q3,  q3,  d11
        vqmovun.s32     d4,  q2
        vqmovun.s32     d5,  q3
        vmin.u16        q2,  q2,  q4
        vst1.32         {d4[0]},  [r0,:32], r1
        vst1.32         {d4[1]},  [r0,:32], r1
        vst1.32         {d5[0]},  [r0,:32], r1
        vst1.32         {d5[1]},  [r0,:32], r1
.endm
        load_acc_store  31, 30, 29, 28
        load_acc_store  27, 26, 25, 24
        load_acc_store  23, 22, 21, 20
        load_acc_store  19, 18, 17, 16
        @ Walk the parked outputs backwards for the second 16 rows
        sub             r2,  r2,  r12
        neg             r12, r12
        load_acc_store  16, 17, 18, 19, 1
        load_acc_store  20, 21, 22, 23, 1
        load_acc_store  24, 25, 26, 27, 1
        load_acc_store  28, 29, 30, 31, 1
.purgem load_acc_store
        @ Lengthen the idct16 coeffs back into 32 bit form
        vmovl.s16       q2,  d2
        vmovl.s16       q3,  d3
        vmovl.s16       q1,  d1
        vmovl.s16       q0,  d0
        pop             {pc}
endfunc
.endm

idct32_funcs
idct32_funcs _quarter
idct32_funcs _half

const min_eob_idct_idct_32, align=4
        .short  0, 3, 9, 21, 34, 51, 70, 98, 135, 176, 240, 258, 336, 357, 448, 472
endconst

@ Shared body of the 32x32 idct_idct entry points below, which push
@ r4-r9 and lr and set r9 before branching here.
@ r0, r1 = dst and dst stride, saved in r4, r5 for pass 2
@ r2     = coefficients, saved in r6 as base of the pass 1 source
@ r3     = eob, compared against the min_eob_idct_idct_32 thresholds
@ r7     = number of bytes taken off the stack (alignment padding + 4096)
@ r8     = read pointer into min_eob_idct_idct_32
@ q0-q3  = first 16 idct coefficients, lengthened to 32 bit
@ q6-q7  = next 16 idct coefficients, kept as 16 bit
function vp9_idct_idct_32x32_add_16_neon
        @ eob == 1 is handled by idct32x32_dc_add_neon
        cmp             r3,  #1
        beq             idct32x32_dc_add_neon
        vpush           {q4-q7}
        @ Skip entry 0 of the table, the slice at i = 0 is always done
        movrel          r8,  min_eob_idct_idct_32 + 2

        @ Align the stack, allocate a temp buffer
        @ (4096 bytes are 32 lines of 128 bytes)
T       mov             r7,  sp
T       and             r7,  r7,  #15
A       and             r7,  sp,  #15
        add             r7,  r7,  #4096
        sub             sp,  sp,  r7

        mov             r4,  r0
        mov             r5,  r1
        mov             r6,  r2

        movrel          r12, idct_coeffs
        vld1.16         {q0-q1}, [r12,:128]!
        vld1.16         {q6-q7}, [r12,:128]
        vmovl.s16       q2,  d2
        vmovl.s16       q3,  d3
        vmovl.s16       q1,  d1
        vmovl.s16       q0,  d0

        @ Small eob values are handed to the partial variants, which
        @ share this stack frame and return to our caller themselves
        cmp             r3,  #34
        ble             idct32x32_quarter_add_16_neon
        cmp             r3,  #135
        ble             idct32x32_half_add_16_neon

        @ Pass 1: one call per slice, writing to the temp buffer at
        @ sp + i*128 and reading from r6 + i*4. Once eob is not above
        @ the threshold for a slice, the rest is zero filled instead,
        @ with r1 as the iteration count of the loop at 2:
.irp i, 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30
        add             r0,  sp,  #(\i*128)
.if \i > 0
        ldrh_post       r1,  r8,  #2
        cmp             r3,  r1
        it              le
        movle           r1,  #(32 - \i)/2
        ble             1f
.endif
        add             r2,  r6,  #(\i*4)
        bl              idct32_1d_2x32_pass1_neon
.endr
        b               3f

1:
        @ Write zeros to the temp buffer for pass 2
        vmov.i16        q14, #0
        vmov.i16        q15, #0
2:
        subs            r1,  r1,  #1
.rept 2
        @ Fill one line with zeros
        vst1.16         {q14-q15}, [r0,:128]!
        vst1.16         {q14-q15}, [r0,:128]!
        vst1.16         {q14-q15}, [r0,:128]!
        vst1.16         {q14-q15}, [r0,:128]!
.endr
        bne             2b
3:
        @ Pass 2: one call per slice, adding into dst at r4 + i*2
.irp i, 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30
        add             r0,  r4,  #(\i*2)
        mov             r1,  r5
        add             r2,  sp,  #(\i*4)
        bl              idct32_1d_2x32_pass2_neon
.endr

        @ Release the temp buffer and undo the pushes, including the
        @ one done by the entry point
        add             sp,  sp,  r7
        vpop            {q4-q7}
        pop             {r4-r9,pc}
endfunc

@ 10 bit entry point: r9 = 0x03ff
function ff_vp9_idct_idct_32x32_add_10_neon, export=1
        push            {r4-r9,lr}
        movw            r9,  #0x03ff
        b               vp9_idct_idct_32x32_add_16_neon
endfunc

@ 12 bit entry point: r9 = 0x0fff
function ff_vp9_idct_idct_32x32_add_12_neon, export=1
        push            {r4-r9,lr}
        movw            r9,  #0x0fff
        b               vp9_idct_idct_32x32_add_16_neon
endfunc

.macro idct32_partial size, rows
function idct32x32_\size\()_add_16_neon
.irp i, 0, 2, 4, 6
        add             r0,  sp,  #(\i*128)
.ifc \size,quarter
.if \i > 0
        ldrh_post       r1,  r8,  #2
        cmp             r3,  r1
        it              le
        movle           r1,  #(\rows - \i)/2
        ble             1f
.endif
.endif
        add             r2,  r6,  #(\i*4)
        bl              idct32_1d_2x32_pass1_\size\()_neon
.endr
.ifc \size,half
        add             r8,  r8,  #8
.irp i, 8, 10, 12, 14
        add             r0,  sp,  #(\i*128)
.if \i > 8
        ldrh_post       r1,  r8,  #2
        cmp             r3,  r1
        it              le
        movle           r1,  #(\rows - \i)/2
        ble             1f
.endif
        add             r2,  r6,  #(\i*4)
        bl              idct32_1d_2x32_pass1_\size\()_neon
.endr
.endif
        b               3f

1:
        @ Write zeros to the temp buffer for pass 2
        vmov.i16        q14, #0
        vmov.i16        q15, #0
2:
        subs            r1,  r1,  #1
.rept 2
        @ Fill one line with zeros
        vst1.16         {q14-q15}, [r0,:128]!
        vst1.16         {q14-q15}, [r0,:128]!
        vst1.16         {q14-q15}, [r0,:128]!
        vst1.16         {q14-q15}, [r0,:128]!
1928cabdff1aSopenharmony_ci.endr 1929cabdff1aSopenharmony_ci bne 2b 1930cabdff1aSopenharmony_ci3: 1931cabdff1aSopenharmony_ci.irp i, 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30 1932cabdff1aSopenharmony_ci add r0, r4, #(\i*2) 1933cabdff1aSopenharmony_ci mov r1, r5 1934cabdff1aSopenharmony_ci add r2, sp, #(\i*4) 1935cabdff1aSopenharmony_ci bl idct32_1d_2x32_pass2_\size\()_neon 1936cabdff1aSopenharmony_ci.endr 1937cabdff1aSopenharmony_ci 1938cabdff1aSopenharmony_ci add sp, sp, r7 1939cabdff1aSopenharmony_ci vpop {q4-q7} 1940cabdff1aSopenharmony_ci pop {r4-r9,pc} 1941cabdff1aSopenharmony_ciendfunc 1942cabdff1aSopenharmony_ci.endm 1943cabdff1aSopenharmony_ci 1944cabdff1aSopenharmony_ciidct32_partial quarter, 8 1945cabdff1aSopenharmony_ciidct32_partial half, 16 1946