/*
 * Copyright (c) 2016 Google Inc.
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/aarch64/asm.S"
#include "neon.S"

// Coefficients for the 4x4 transforms. iadst4_coeffs is placed directly
// after the four idct values so that a single 8-halfword load from
// itxfm4_coeffs fetches both sets (idct in v0.h[0-3], iadst in v0.h[4-7]).
const itxfm4_coeffs, align=4
        .short          11585, 0, 6270, 15137
iadst4_coeffs:
        .short          5283, 15212, 9929, 13377
endconst

// Coefficients for the 8x8 and larger transforms. idct_coeffs starts
// 16 bytes after iadst8_coeffs, so code that loads iadst8_coeffs with a
// post-increment of 16 can continue straight into idct_coeffs.
const iadst8_coeffs, align=4
        .short          16305, 1606, 14449, 7723, 10394, 12665, 4756, 15679
idct_coeffs:
        .short          11585, 0, 6270, 15137, 3196, 16069, 13623, 9102
        .short          1606, 16305, 12665, 10394, 7723, 14449, 15679, 4756
        .short          804, 16364, 12140, 11003, 7005, 14811, 15426, 5520
        .short          3981, 15893, 14053, 8423, 9760, 13160, 16207, 2404
endconst

const iadst16_coeffs, align=4
        .short          16364, 804, 15893, 3981, 11003, 12140, 8423, 14053
        .short          14811, 7005, 13160, 9760, 5520, 15426, 2404, 16207
endconst

// out1 = ((in1 + in2) * v0[0] + (1 << 13)) >> 14
// out2 = ((in1 - in2) * v0[0] + (1 << 13)) >> 14
// in/out are .8h registers; this can do with 4 temp registers, but is
// more efficient if 6 temp registers are available.
// tmp5/tmp6 may be left blank; tmp3/tmp4 are then reused for out2.
// If neg > 0, out1 is multiplied by -v0[0] instead (out2 is unaffected);
// tmp4 holds the negated coefficients and is indexed by lane, so it has to
// be one of v0-v15 in that case.
.macro dmbutterfly0 out1, out2, in1, in2, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, neg=0
.if \neg > 0
        neg             \tmp4\().4h, v0.4h
.endif
        add             \tmp1\().8h, \in1\().8h, \in2\().8h
        sub             \tmp2\().8h, \in1\().8h, \in2\().8h
.if \neg > 0
        smull           \tmp3\().4s, \tmp1\().4h, \tmp4\().h[0]
        smull2          \tmp4\().4s, \tmp1\().8h, \tmp4\().h[0]
.else
        smull           \tmp3\().4s, \tmp1\().4h, v0.h[0]
        smull2          \tmp4\().4s, \tmp1\().8h, v0.h[0]
.endif
.ifb \tmp5
        // Only four temporaries: finish out1 before tmp3/tmp4 are reused
        rshrn           \out1\().4h, \tmp3\().4s, #14
        rshrn2          \out1\().8h, \tmp4\().4s, #14
        smull           \tmp3\().4s, \tmp2\().4h, v0.h[0]
        smull2          \tmp4\().4s, \tmp2\().8h, v0.h[0]
        rshrn           \out2\().4h, \tmp3\().4s, #14
        rshrn2          \out2\().8h, \tmp4\().4s, #14
.else
        smull           \tmp5\().4s, \tmp2\().4h, v0.h[0]
        smull2          \tmp6\().4s, \tmp2\().8h, v0.h[0]
        rshrn           \out1\().4h, \tmp3\().4s, #14
        rshrn2          \out1\().8h, \tmp4\().4s, #14
        rshrn           \out2\().4h, \tmp5\().4s, #14
        rshrn2          \out2\().8h, \tmp6\().4s, #14
.endif
.endm

// Same as dmbutterfly0 above, but treating the input in in2 as zero,
// writing the same output into both out1 and out2.
// in2 and tmp3-tmp6 are accepted but not referenced.
.macro dmbutterfly0_h out1, out2, in1, in2, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6
        smull           \tmp1\().4s, \in1\().4h, v0.h[0]
        smull2          \tmp2\().4s, \in1\().8h, v0.h[0]
        rshrn           \out1\().4h, \tmp1\().4s, #14
        rshrn2          \out1\().8h, \tmp2\().4s, #14
        rshrn           \out2\().4h, \tmp1\().4s, #14
        rshrn2          \out2\().8h, \tmp2\().4s, #14
.endm

// out1,out2 = in1 * coef1 - in2 * coef2
// out3,out4 = in1 * coef2 + in2 * coef1
// out are 4 x .4s registers, in are 2 x .8h registers
// The results are left unrounded at 32 bit; out1/out3 hold the low four
// lanes and out2/out4 the high four lanes.
.macro dmbutterfly_l out1, out2, out3, out4, in1, in2, coef1, coef2
        smull           \out1\().4s, \in1\().4h, \coef1
        smull2          \out2\().4s, \in1\().8h, \coef1
        smull           \out3\().4s, \in1\().4h, \coef2
        smull2          \out4\().4s, \in1\().8h, \coef2
        smlsl           \out1\().4s, \in2\().4h, \coef2
        smlsl2          \out2\().4s, \in2\().8h, \coef2
        smlal           \out3\().4s, \in2\().4h, \coef1
        smlal2          \out4\().4s, \in2\().8h, \coef1
.endm

// inout1 = (inout1 * coef1 - inout2 * coef2 + (1 << 13)) >> 14
// inout2 = (inout1 * coef2 + inout2 * coef1 + (1 << 13)) >> 14
// inout are 2 x .8h registers
// If neg > 0, the 32 bit value for inout2 is negated before it is rounded
// and narrowed.
.macro dmbutterfly inout1, inout2, coef1, coef2, tmp1, tmp2, tmp3, tmp4, neg=0
        dmbutterfly_l   \tmp1, \tmp2, \tmp3, \tmp4, \inout1, \inout2, \coef1, \coef2
.if \neg > 0
        neg             \tmp3\().4s, \tmp3\().4s
        neg             \tmp4\().4s, \tmp4\().4s
.endif
        rshrn           \inout1\().4h, \tmp1\().4s, #14
        rshrn2          \inout1\().8h, \tmp2\().4s, #14
        rshrn           \inout2\().4h, \tmp3\().4s, #14
        rshrn2          \inout2\().8h, \tmp4\().4s, #14
.endm

// Same as dmbutterfly above, but treating the input in inout2 as zero
// inout1 = (inout1 * coef1 + (1 << 13)) >> 14
// inout2 = (inout1 * coef2 + (1 << 13)) >> 14
.macro dmbutterfly_h1 inout1, inout2, coef1, coef2, tmp1, tmp2, tmp3, tmp4
        smull           \tmp1\().4s, \inout1\().4h, \coef1
        smull2          \tmp2\().4s, \inout1\().8h, \coef1
        smull           \tmp3\().4s, \inout1\().4h, \coef2
        smull2          \tmp4\().4s, \inout1\().8h, \coef2
        rshrn           \inout1\().4h, \tmp1\().4s, #14
        rshrn2          \inout1\().8h, \tmp2\().4s, #14
        rshrn           \inout2\().4h, \tmp3\().4s, #14
        rshrn2          \inout2\().8h, \tmp4\().4s, #14
.endm

// Same as dmbutterfly above, but treating the input in inout1 as zero
// inout1 = (-(inout2 * coef2) + (1 << 13)) >> 14
// inout2 = (inout2 * coef1 + (1 << 13)) >> 14
.macro dmbutterfly_h2 inout1, inout2, coef1, coef2, tmp1, tmp2, tmp3, tmp4
        smull           \tmp1\().4s, \inout2\().4h, \coef2
        smull2          \tmp2\().4s, \inout2\().8h, \coef2
        smull           \tmp3\().4s, \inout2\().4h, \coef1
        smull2          \tmp4\().4s, \inout2\().8h, \coef1
        neg             \tmp1\().4s, \tmp1\().4s
        neg             \tmp2\().4s, \tmp2\().4s
        rshrn           \inout2\().4h, \tmp3\().4s, #14
        rshrn2          \inout2\().8h, \tmp4\().4s, #14
        rshrn           \inout1\().4h, \tmp1\().4s, #14
        rshrn2          \inout1\().8h, \tmp2\().4s, #14
.endm

// out1,out2 = in * coef
// out are 2 x .4s registers (low and high four lanes), in is a .8h register
.macro dsmull_h out1, out2, in, coef
        smull           \out1\().4s, \in\().4h, \coef
        smull2          \out2\().4s, \in\().8h, \coef
.endm

// out = (in1,in2 + (1 << (shift - 1))) >> shift
// out is a .8h register, in are 2 x .4s registers; shift is an immediate
// operand such as #14
.macro drshrn_h out, in1, in2, shift
        rshrn           \out\().4h, \in1\().4s, \shift
        rshrn2          \out\().8h, \in2\().4s, \shift
.endm


// out1 = in1 + in2
// out2 = in1 - in2
.macro butterfly_8h out1, out2, in1, in2
        add             \out1\().8h, \in1\().8h, \in2\().8h
        sub             \out2\().8h, \in1\().8h, \in2\().8h
.endm

// out1 = in1 - in2
// out2 = in1 + in2
.macro butterfly_8h_r out1, out2, in1, in2
        sub             \out1\().8h, \in1\().8h, \in2\().8h
        add             \out2\().8h, \in1\().8h, \in2\().8h
.endm

// out1 = (in1,in2 + in3,in4 + (1 << 13)) >> 14
// out2 = (in1,in2 - in3,in4 + (1 << 13)) >> 14
// out are 2 x .8h registers, in are 4 x .4s registers
.macro dbutterfly_n out1, out2, in1, in2, in3, in4, tmp1, tmp2, tmp3, tmp4
        add             \tmp1\().4s, \in1\().4s, \in3\().4s
        add             \tmp2\().4s, \in2\().4s, \in4\().4s
        sub             \tmp3\().4s, \in1\().4s, \in3\().4s
        sub             \tmp4\().4s, \in2\().4s, \in4\().4s
        rshrn           \out1\().4h, \tmp1\().4s, #14
        rshrn2          \out1\().8h, \tmp2\().4s, #14
        rshrn           \out2\().4h, \tmp3\().4s, #14
        rshrn2          \out2\().8h, \tmp4\().4s, #14
.endm

// One 4-point iwht pass over c0-c3 (.4h), computed in place.
// Needs no coefficients; clobbers v16 and v17.
.macro iwht4 c0, c1, c2, c3
        add             \c0\().4h, \c0\().4h, \c1\().4h
        sub             v17.4h, \c2\().4h, \c3\().4h
        sub             v16.4h, \c0\().4h, v17.4h
        sshr            v16.4h, v16.4h, #1
        sub             \c2\().4h, v16.4h, \c1\().4h
        sub             \c1\().4h, v16.4h, \c3\().4h
        add             \c3\().4h, v17.4h, \c2\().4h
        sub             \c0\().4h, \c0\().4h, \c1\().4h
.endm

// One 4-point idct pass over c0-c3 (.4h), computed in place.
// Reads the coefficients from v0.h[0], v0.h[2] and v0.h[3];
// clobbers v16-v20 and v22.
.macro idct4 c0, c1, c2, c3
        smull           v22.4s, \c1\().4h, v0.h[3]
        smull           v20.4s, \c1\().4h, v0.h[2]
        add             v16.4h, \c0\().4h, \c2\().4h
        sub             v17.4h, \c0\().4h, \c2\().4h
        smlal           v22.4s, \c3\().4h, v0.h[2]
        smull           v18.4s, v16.4h, v0.h[0]
        smull           v19.4s, v17.4h, v0.h[0]
        smlsl           v20.4s, \c3\().4h, v0.h[3]
        rshrn           v22.4h, v22.4s, #14
        rshrn           v18.4h, v18.4s, #14
        rshrn           v19.4h, v19.4s, #14
        rshrn           v20.4h, v20.4s, #14
        add             \c0\().4h, v18.4h, v22.4h
        sub             \c3\().4h, v18.4h, v22.4h
        add             \c1\().4h, v19.4h, v20.4h
        sub             \c2\().4h, v19.4h, v20.4h
.endm

// One 4-point iadst pass over c0-c3 (.4h), computed in place.
// Reads the coefficients from v0.h[4]-v0.h[7]; clobbers v16-v21.
.macro iadst4 c0, c1, c2, c3
        smull           v16.4s, \c0\().4h, v0.h[4]
        smlal           v16.4s, \c2\().4h, v0.h[5]
        smlal           v16.4s, \c3\().4h, v0.h[6]
        smull           v17.4s, \c0\().4h, v0.h[6]
        smlsl           v17.4s, \c2\().4h, v0.h[4]
        sub             \c0\().4h, \c0\().4h, \c2\().4h
        smlsl           v17.4s, \c3\().4h, v0.h[5]
        add             \c0\().4h, \c0\().4h, \c3\().4h
        smull           v19.4s, \c1\().4h, v0.h[7]
        smull           v18.4s, \c0\().4h, v0.h[7]
        add             v20.4s, v16.4s, v19.4s
        add             v21.4s, v17.4s, v19.4s
        rshrn           \c0\().4h, v20.4s, #14
        add             v16.4s, v16.4s, v17.4s
        rshrn           \c1\().4h, v21.4s, #14
        sub             v16.4s, v16.4s, v19.4s
        rshrn           \c2\().4h, v18.4s, #14
        rshrn           \c3\().4h, v16.4s, #14
.endm

// The public functions in this file have got the following signature:
// void itxfm_add(uint8_t *dst, ptrdiff_t stride, int16_t *block, int eob);

// Emits ff_vp9_<txfm1>_<txfm2>_4x4_add_neon.
// With the signature above: x0 = dst, x1 = stride, x2 = block, w3 = eob.
// The coefficient block is overwritten with zeros as it is consumed, and
// the result is added to the 4x4 destination with unsigned saturation.
.macro itxfm_func4x4 txfm1, txfm2
function ff_vp9_\txfm1\()_\txfm2\()_4x4_add_neon, export=1
.ifc \txfm1,\txfm2
.ifc \txfm1,idct
        movrel          x4, itxfm4_coeffs
        ld1             {v0.4h}, [x4]
.endif
.ifc \txfm1,iadst
        // iadst4 reads v0.h[4]-v0.h[7], so load into the upper half of v0
        movrel          x4, iadst4_coeffs
        ld1             {v0.d}[1], [x4]
.endif
.else
        // Mixed transforms need both the idct and the iadst coefficients
        movrel          x4, itxfm4_coeffs
        ld1             {v0.8h}, [x4]
.endif

        movi            v31.8h, #0
.ifc \txfm1\()_\txfm2,idct_idct
        cmp             w3, #1
        b.ne            1f
        // DC-only for idct/idct
        ld1             {v2.h}[0], [x2]
        smull           v2.4s, v2.4h, v0.h[0]
        rshrn           v2.4h, v2.4s, #14
        smull           v2.4s, v2.4h, v0.h[0]
        rshrn           v2.4h, v2.4s, #14
        st1             {v31.h}[0], [x2]
        dup             v4.4h, v2.h[0]
        mov             v5.16b, v4.16b
        mov             v6.16b, v4.16b
        mov             v7.16b, v4.16b
        b               2f
.endif

1:
        ld1             {v4.4h,v5.4h,v6.4h,v7.4h}, [x2]
        st1             {v31.8h}, [x2], #16

.ifc \txfm1,iwht
        sshr            v4.4h, v4.4h, #2
        sshr            v5.4h, v5.4h, #2
        sshr            v6.4h, v6.4h, #2
        sshr            v7.4h, v7.4h, #2
.endif

        \txfm1\()4      v4, v5, v6, v7

        st1             {v31.8h}, [x2], #16
        // Transpose 4x4 with 16 bit elements
        transpose_4x4H  v4, v5, v6, v7, v16, v17, v18, v19

        \txfm2\()4      v4, v5, v6, v7
2:
        ld1             {v0.s}[0], [x0], x1
        ld1             {v1.s}[0], [x0], x1
.ifnc \txfm1,iwht
        srshr           v4.4h, v4.4h, #4
        srshr           v5.4h, v5.4h, #4
        srshr           v6.4h, v6.4h, #4
        srshr           v7.4h, v7.4h, #4
.endif
        uaddw           v4.8h, v4.8h, v0.8b
        uaddw           v5.8h, v5.8h, v1.8b
        ld1             {v2.s}[0], [x0], x1
        ld1             {v3.s}[0], [x0], x1
        sqxtun          v0.8b, v4.8h
        sqxtun          v1.8b, v5.8h
        // Rewind dst by the four rows just read, for the stores below
        sub             x0, x0, x1, lsl #2

        uaddw           v6.8h, v6.8h, v2.8b
        uaddw           v7.8h, v7.8h, v3.8b
        st1             {v0.s}[0], [x0], x1
        sqxtun          v2.8b, v6.8h
        sqxtun          v3.8b, v7.8h

        st1             {v1.s}[0], [x0], x1
        st1             {v2.s}[0], [x0], x1
        st1             {v3.s}[0], [x0], x1

        ret
endfunc
.endm

itxfm_func4x4 idct, idct
itxfm_func4x4 iadst, idct
itxfm_func4x4 idct, iadst
itxfm_func4x4 iadst, iadst
itxfm_func4x4 iwht, iwht


// One 8-point idct pass over v16-v23 (.8h), computed in place.
// Reads the coefficients from v0; clobbers v2-v7 and v24-v31.
.macro idct8
        dmbutterfly0    v16, v20, v16, v20, v2, v3, v4, v5, v6, v7 // v16 = t0a, v20 = t1a
        dmbutterfly     v18, v22, v0.h[2], v0.h[3], v2, v3, v4, v5 // v18 = t2a, v22 = t3a
        dmbutterfly     v17, v23, v0.h[4], v0.h[5], v2, v3, v4, v5 // v17 = t4a, v23 = t7a
        dmbutterfly     v21, v19, v0.h[6], v0.h[7], v2, v3, v4, v5 // v21 = t5a, v19 = t6a

        butterfly_8h    v24, v25, v16, v22 // v24 = t0, v25 = t3
        butterfly_8h    v28, v29, v17, v21 // v28 = t4, v29 = t5a
        butterfly_8h    v30, v31, v23, v19 // v30 = t7, v31 = t6a
        butterfly_8h    v26, v27, v20, v18 // v26 = t1, v27 = t2

        dmbutterfly0    v31, v29, v31, v29, v2, v3, v4, v5, v6, v7 // v31 = t6, v29 = t5

        butterfly_8h    v16, v23, v24, v30 // v16 = out[0], v23 = out[7]
        butterfly_8h    v17, v22, v26, v31 // v17 = out[1], v22 = out[6]
        butterfly_8h    v18, v21, v27, v29 // v18 = out[2], v21 = out[5]
        butterfly_8h    v19, v20, v25, v28 // v19 = out[3], v20 = out[4]
.endm

// One 8-point iadst pass over v16-v23 (.8h), computed in place.
// Reads coefficients from v1 and from v0.h[0], v0.h[2], v0.h[3];
// clobbers v2-v7 and v24-v31.
.macro iadst8
        dmbutterfly_l   v24, v25, v26, v27, v23, v16, v1.h[1], v1.h[0] // v24,v25 = t1a, v26,v27 = t0a
        dmbutterfly_l   v28, v29, v30, v31, v21, v18, v1.h[3], v1.h[2] // v28,v29 = t3a, v30,v31 = t2a
        dmbutterfly_l   v2, v3, v4, v5, v19, v20, v1.h[5], v1.h[4] // v2,v3 = t5a, v4,v5 = t4a
        dmbutterfly_l   v16, v18, v21, v23, v17, v22, v1.h[7], v1.h[6] // v16,v18 = t7a, v21,v23 = t6a

        dbutterfly_n    v4, v5, v26, v27, v4, v5, v6, v7, v26, v27 // v4 = t0, v5 = t4
        dbutterfly_n    v2, v3, v24, v25, v2, v3, v6, v7, v26, v27 // v2 = t1, v3 = t5
        dbutterfly_n    v24, v25, v30, v31, v21, v23, v6, v7, v26, v27 // v24 = t2, v25 = t6
        dbutterfly_n    v30, v31, v28, v29, v16, v18, v6, v7, v26, v27 // v30 = t3, v31 = t7

        butterfly_8h    v16, v6, v4, v24 // v16 = out[0], v6 = t2
        butterfly_8h    v23, v7, v2, v30 // v23 = -out[7], v7 = t3
        neg             v23.8h, v23.8h // v23 = out[7]

        dmbutterfly0    v19, v20, v6, v7, v24, v26, v27, v28, v29, v30 // v19 = -out[3], v20 = out[4]
        neg             v19.8h, v19.8h // v19 = out[3]

        dmbutterfly_l   v26, v27, v28, v29, v5, v3, v0.h[2], v0.h[3] // v26,v27 = t5a, v28,v29 = t4a
        dmbutterfly_l   v2, v3, v4, v5, v31, v25, v0.h[3], v0.h[2] // v2,v3 = t6a, v4,v5 = t7a

        dbutterfly_n    v17, v30, v28, v29, v2, v3, v6, v7, v24, v25 // v17 = -out[1], v30 = t6
        dbutterfly_n    v22, v31, v26, v27, v4, v5, v6, v7, v24, v25 // v22 = out[6], v31 = t7
        neg             v17.8h, v17.8h // v17 = out[1]

        dmbutterfly0    v18, v21, v30, v31, v2, v3, v4, v5, v6, v7 // v18 = out[2], v21 = -out[5]
        neg             v21.8h, v21.8h // v21 = out[5]
.endm


// Emits ff_vp9_<txfm1>_<txfm2>_8x8_add_neon.
// With the signature above: x0 = dst, x1 = stride, x2 = block, w3 = eob.
// The coefficient block is overwritten with zeros, and the result is added
// to the 8x8 destination with unsigned saturation.
.macro itxfm_func8x8 txfm1, txfm2
function ff_vp9_\txfm1\()_\txfm2\()_8x8_add_neon, export=1
        // The iadst also uses a few coefficients from
        // idct, so those always need to be loaded.
.ifc \txfm1\()_\txfm2,idct_idct
        movrel          x4, idct_coeffs
.else
        movrel          x4, iadst8_coeffs
        ld1             {v1.8h}, [x4], #16
.endif
        ld1             {v0.8h}, [x4]

        movi            v2.8h, #0
        movi            v3.8h, #0
        movi            v4.8h, #0
        movi            v5.8h, #0

.ifc \txfm1\()_\txfm2,idct_idct
        cmp             w3, #1
        b.ne            1f
        // DC-only for idct/idct
        ld1             {v2.h}[0], [x2]
        smull           v2.4s, v2.4h, v0.h[0]
        rshrn           v2.4h, v2.4s, #14
        smull           v2.4s, v2.4h, v0.h[0]
        rshrn           v2.4h, v2.4s, #14
        st1             {v3.h}[0], [x2]
        dup             v16.8h, v2.h[0]
        mov             v17.16b, v16.16b
        mov             v18.16b, v16.16b
        mov             v19.16b, v16.16b
        mov             v20.16b, v16.16b
        mov             v21.16b, v16.16b
        mov             v22.16b, v16.16b
        mov             v23.16b, v16.16b
        b               2f
.endif
1:
        ld1             {v16.8h,v17.8h,v18.8h,v19.8h}, [x2], #64
        ld1             {v20.8h,v21.8h,v22.8h,v23.8h}, [x2], #64
        // Step back over the 128 bytes just read and clear them
        sub             x2, x2, #128
        st1             {v2.8h,v3.8h,v4.8h,v5.8h}, [x2], #64
        st1             {v2.8h,v3.8h,v4.8h,v5.8h}, [x2], #64

        \txfm1\()8

        // Transpose 8x8 with 16 bit elements
        transpose_8x8H  v16, v17, v18, v19, v20, v21, v22, v23, v24, v25

        \txfm2\()8
2:
        // x0 walks the rows for loading, x3 follows for storing
        mov             x3, x0
        // Add into the destination
        ld1             {v0.8b}, [x0], x1
        srshr           v16.8h, v16.8h, #5
        ld1             {v1.8b}, [x0], x1
        srshr           v17.8h, v17.8h, #5
        ld1             {v2.8b}, [x0], x1
        srshr           v18.8h, v18.8h, #5
        uaddw           v16.8h, v16.8h, v0.8b
        ld1             {v3.8b}, [x0], x1
        srshr           v19.8h, v19.8h, #5
        uaddw           v17.8h, v17.8h, v1.8b
        ld1             {v4.8b}, [x0], x1
        srshr           v20.8h, v20.8h, #5
        uaddw           v18.8h, v18.8h, v2.8b
        sqxtun          v0.8b, v16.8h
        ld1             {v5.8b}, [x0], x1
        srshr           v21.8h, v21.8h, #5
        uaddw           v19.8h, v19.8h, v3.8b
        sqxtun          v1.8b, v17.8h
        ld1             {v6.8b}, [x0], x1
        srshr           v22.8h, v22.8h, #5
        uaddw           v20.8h, v20.8h, v4.8b
        sqxtun          v2.8b, v18.8h
        ld1             {v7.8b}, [x0], x1
        srshr           v23.8h, v23.8h, #5
        uaddw           v21.8h, v21.8h, v5.8b
        sqxtun          v3.8b, v19.8h

        st1             {v0.8b}, [x3], x1
        uaddw           v22.8h, v22.8h, v6.8b
        st1             {v1.8b}, [x3], x1
        sqxtun          v4.8b, v20.8h
        st1             {v2.8b}, [x3], x1
        uaddw           v23.8h, v23.8h, v7.8b
        st1             {v3.8b}, [x3], x1
        sqxtun          v5.8b, v21.8h
        st1             {v4.8b}, [x3], x1
        sqxtun          v6.8b, v22.8h
        st1             {v5.8b}, [x3], x1
        sqxtun          v7.8b, v23.8h

        st1             {v6.8b}, [x3], x1
        st1             {v7.8b}, [x3], x1

        ret
endfunc
.endm

itxfm_func8x8 idct, idct
itxfm_func8x8 iadst, idct
itxfm_func8x8 idct, iadst
itxfm_func8x8 iadst, iadst


// DC-only 16x16 idct/idct: reads the single coefficient at [x2], clears
// it, scales it twice by idct_coeffs[0], and adds the rounded result to
// 16 rows of 16 pixels starting at x0 (row pitch x1), with unsigned
// saturation.
function idct16x16_dc_add_neon
        movrel          x4, idct_coeffs
        ld1             {v0.4h}, [x4]

        movi            v1.4h, #0

        ld1             {v2.h}[0], [x2]
        smull           v2.4s, v2.4h, v0.h[0]
        rshrn           v2.4h, v2.4s, #14
        smull           v2.4s, v2.4h, v0.h[0]
        rshrn           v2.4h, v2.4s, #14
        dup             v2.8h, v2.h[0]
        st1             {v1.h}[0], [x2]

        srshr           v2.8h, v2.8h, #6

        mov             x3, x0
        mov             x4, #16
1:
        // Loop to add the constant from v2 into all 16x16 outputs
        subs            x4, x4, #2
        ld1             {v3.16b}, [x0], x1
        ld1             {v4.16b}, [x0], x1
        uaddw           v16.8h, v2.8h, v3.8b
        uaddw2          v17.8h, v2.8h, v3.16b
        uaddw           v18.8h, v2.8h, v4.8b
        uaddw2          v19.8h, v2.8h, v4.16b
        sqxtun          v3.8b, v16.8h
        sqxtun2         v3.16b, v17.8h
        sqxtun          v4.8b, v18.8h
        sqxtun2         v4.16b, v19.8h
        st1             {v3.16b}, [x3], x1
        st1             {v4.16b}, [x3], x1
        b.ne            1b

        ret
endfunc

// Common tail of idct16, idct16_half and idct16_quarter. Leaves out[0] to
// out[15] in v16-v31 and ends with ret, so it returns from the function
// that expands it.
.macro idct16_end
        butterfly_8h    v18, v7, v4, v7 // v18 = t0a, v7 = t7a
        butterfly_8h    v19, v22, v5, v22 // v19 = t1a, v22 = t6
        butterfly_8h    v4, v26, v20, v26 // v4 = t2a, v26 = t5
        butterfly_8h    v5, v6, v28, v6 // v5 = t3a, v6 = t4
        butterfly_8h    v20, v28, v16, v24 // v20 = t8a, v28 = t11a
        butterfly_8h    v24, v21, v23, v21 // v24 = t9, v21 = t10
        butterfly_8h    v23, v27, v25, v27 // v23 = t14, v27 = t13
        butterfly_8h    v25, v29, v29, v17 // v25 = t15a, v29 = t12a

        dmbutterfly0    v2, v3, v27, v21, v2, v3, v16, v17, v30, v31 // v2 = t13a, v3 = t10a
        dmbutterfly0    v28, v27, v29, v28, v21, v29, v16, v17, v30, v31 // v28 = t12, v27 = t11

        butterfly_8h    v16, v31, v18, v25 // v16 = out[0], v31 = out[15]
        butterfly_8h    v17, v30, v19, v23 // v17 = out[1], v30 = out[14]
        butterfly_8h_r  v25, v22, v22, v24 // v25 = out[9], v22 = out[6]
        butterfly_8h    v23, v24, v7, v20 // v23 = out[7], v24 = out[8]
        butterfly_8h    v18, v29, v4, v2 // v18 = out[2], v29 = out[13]
        butterfly_8h    v19, v28, v5, v28 // v19 = out[3], v28 = out[12]
        butterfly_8h    v20, v27, v6, v27 // v20 = out[4], v27 = out[11]
        butterfly_8h    v21, v26, v26, v3 // v21 = out[5], v26 = out[10]
        ret
.endm

// 16-point idct over v16-v31 (.8h), computed in place.
// Reads the coefficients from v0 and v1; clobbers v2-v7.
function idct16
        dmbutterfly0    v16, v24, v16, v24, v2, v3, v4, v5, v6, v7 // v16 = t0a, v24 = t1a
        dmbutterfly     v20, v28, v0.h[2], v0.h[3], v2, v3, v4, v5 // v20 = t2a, v28 = t3a
        dmbutterfly     v18, v30, v0.h[4], v0.h[5], v2, v3, v4, v5 // v18 = t4a, v30 = t7a
        dmbutterfly     v26, v22, v0.h[6], v0.h[7], v2, v3, v4, v5 // v26 = t5a, v22 = t6a
        dmbutterfly     v17, v31, v1.h[0], v1.h[1], v2, v3, v4, v5 // v17 = t8a, v31 = t15a
        dmbutterfly     v25, v23, v1.h[2], v1.h[3], v2, v3, v4, v5 // v25 = t9a, v23 = t14a
        dmbutterfly     v21, v27, v1.h[4], v1.h[5], v2, v3, v4, v5 // v21 = t10a, v27 = t13a
        dmbutterfly     v29, v19, v1.h[6], v1.h[7], v2, v3, v4, v5 // v29 = t11a, v19 = t12a

        butterfly_8h    v4, v28, v16, v28 // v4 = t0, v28 = t3
        butterfly_8h    v5, v20, v24, v20 // v5 = t1, v20 = t2
        butterfly_8h    v6, v26, v18, v26 // v6 = t4, v26 = t5
        butterfly_8h    v7, v22, v30, v22 // v7 = t7, v22 = t6
        butterfly_8h    v16, v25, v17, v25 // v16 = t8, v25 = t9
        butterfly_8h    v24, v21, v29, v21 // v24 = t11, v21 = t10
        butterfly_8h    v17, v27, v19, v27 // v17 = t12, v27 = t13
        butterfly_8h    v29, v23, v31, v23 // v29 = t15, v23 = t14

        dmbutterfly0    v22, v26, v22, v26, v2, v3, v18, v19, v30, v31 // v22 = t6a, v26 = t5a
        dmbutterfly     v23, v25, v0.h[2], v0.h[3], v18, v19, v30, v31 // v23 = t9a, v25 = t14a
        dmbutterfly     v27, v21, v0.h[2], v0.h[3], v18, v19, v30, v31, neg=1 // v27 = t13a, v21 = t10a
        idct16_end
endfunc

// Same as idct16, but the first stage uses the _h macro variants, which
// treat the inputs in v24-v31 as zero; only v16-v23 are read as input.
function idct16_half
        dmbutterfly0_h  v16, v24, v16, v24, v2, v3, v4, v5, v6, v7 // v16 = t0a, v24 = t1a
        dmbutterfly_h1  v20, v28, v0.h[2], v0.h[3], v2, v3, v4, v5 // v20 = t2a, v28 = t3a
        dmbutterfly_h1  v18, v30, v0.h[4], v0.h[5], v2, v3, v4, v5 // v18 = t4a, v30 = t7a
        dmbutterfly_h2  v26, v22, v0.h[6], v0.h[7], v2, v3, v4, v5 // v26 = t5a, v22 = t6a
        dmbutterfly_h1  v17, v31, v1.h[0], v1.h[1], v2, v3, v4, v5 // v17 = t8a, v31 = t15a
        dmbutterfly_h2  v25, v23, v1.h[2], v1.h[3], v2, v3, v4, v5 // v25 = t9a, v23 = t14a
        dmbutterfly_h1  v21, v27, v1.h[4], v1.h[5], v2, v3, v4, v5 // v21 = t10a, v27 = t13a
        dmbutterfly_h2  v29, v19, v1.h[6], v1.h[7], v2, v3, v4, v5 // v29 = t11a, v19 = t12a

        butterfly_8h    v4, v28, v16, v28 // v4 = t0, v28 = t3
        butterfly_8h    v5, v20, v24, v20 // v5 = t1, v20 = t2
        butterfly_8h    v6, v26, v18, v26 // v6 = t4, v26 = t5
        butterfly_8h    v7, v22, v30, v22 // v7 = t7, v22 = t6
        butterfly_8h    v16, v25, v17, v25 // v16 = t8, v25 = t9
        butterfly_8h    v24, v21, v29, v21 // v24 = t11, v21 = t10
        butterfly_8h    v17, v27, v19, v27 // v17 = t12, v27 = t13
        butterfly_8h    v29, v23, v31, v23 // v29 = t15, v23 = t14

        dmbutterfly0    v22, v26, v22, v26, v2, v3, v18, v19, v30, v31 // v22 = t6a, v26 = t5a
        dmbutterfly     v23, v25, v0.h[2], v0.h[3], v18, v19, v30, v31 // v23 = t9a, v25 = t14a
        dmbutterfly     v27, v21, v0.h[2], v0.h[3], v18, v19, v30, v31, neg=1 // v27 = t13a, v21 = t10a
        idct16_end
endfunc

// Reduced idct16 variant that reads only v16-v19 as input and builds the
// intermediate values expected by idct16_end directly from them.
function idct16_quarter
        dsmull_h        v24, v25, v19, v1.h[7]
        dsmull_h        v4, v5, v17, v1.h[0]
        dsmull_h        v7, v6, v18, v0.h[5]
        dsmull_h        v30, v31, v18, v0.h[4]
        neg             v24.4s, v24.4s
        neg             v25.4s, v25.4s
        dsmull_h        v29, v28, v17, v1.h[1]
        dsmull_h        v26, v27, v19, v1.h[6]
        dsmull_h        v22, v23, v16, v0.h[0]
        drshrn_h        v24, v24, v25, #14
        drshrn_h        v16, v4, v5, #14
        drshrn_h        v7, v7, v6, #14
        drshrn_h        v6, v30, v31, #14
        drshrn_h        v29, v29, v28, #14
        drshrn_h        v17, v26, v27, #14
        drshrn_h        v28, v22, v23, #14

        dmbutterfly_l   v20, v21, v22, v23, v17, v24, v0.h[2], v0.h[3]
        dmbutterfly_l   v18, v19, v30, v31, v29, v16, v0.h[2], v0.h[3]
        neg             v22.4s, v22.4s
        neg             v23.4s, v23.4s
        drshrn_h        v27, v20, v21, #14
        drshrn_h        v21, v22, v23, #14
        drshrn_h        v23, v18, v19, #14
        drshrn_h        v25, v30, v31, #14
        mov             v4.16b, v28.16b
        mov             v5.16b, v28.16b
        dmbutterfly0    v22, v26, v7, v6, v18, v19, v30, v31
        mov             v20.16b, v28.16b
        idct16_end
endfunc

function iadst16
        ld1             {v0.8h,v1.8h}, [x11]

        dmbutterfly_l   v6, v7, v4, v5, v31, v16, v0.h[1], v0.h[0] // v6,v7 = t1, v4,v5 = t0
        dmbutterfly_l   v10, v11, v8, v9, v23, v24, v0.h[5], v0.h[4] // v10,v11 = t9, v8,v9 = t8
        dbutterfly_n    v31, v24, v6, v7, v10, v11, v12, v13, v10, v11 // v31 = t1a, v24 = t9a
        dmbutterfly_l   v14, v15, v12, v13, v29, v18, v0.h[3], v0.h[2] // v14,v15 = t3, v12,v13 = t2
        dbutterfly_n    v16, v23, v4, v5, v8, v9, v6, v7, v8, v9 // v16 = t0a, v23 = t8a

        dmbutterfly_l   v6, v7, v4, v5, v21, v26, v0.h[7], v0.h[6] // v6,v7 = t11, v4,v5 = t10
        dbutterfly_n    v29, v26, v14, v15, v6, v7, v8, v9, v6, v7 // v29 = t3a, v26 = t11a
        dmbutterfly_l   v10, v11, v8, v9, v27, v20, v1.h[1], v1.h[0] // v10,v11 = t5, v8,v9 = t4
        dbutterfly_n    v18, v21, v12, v13, v4, v5, v6, v7, v4, v5 // v18 = t2a, v21 = t10a

        dmbutterfly_l   v14, v15, v12, v13, v19, v28, v1.h[5], v1.h[4] // v14,v15 = t13, v12,v13 = t12
        dbutterfly_n    v20, v28, v10, v11, v14, v15, v4, v5, v14, v15 // v20 = t5a, v28 = t13a
        dmbutterfly_l   v6, v7, v4, v5, v25, v22, v1.h[3], v1.h[2] // v6,v7 = t7, v4,v5 = t6
        dbutterfly_n    v27, v19, v8, v9, v12, v13, v10, v11, v12, v13 // v27 = t4a, v19 = t12a

        dmbutterfly_l   v10, v11, v8, v9, v17, v30, v1.h[7], v1.h[6] // v10,v11 = t15, v8,v9 = t14
        ld1             {v0.8h}, [x10]
648cabdff1aSopenharmony_ci dbutterfly_n v22, v30, v6, v7, v10, v11, v12, v13, v10, v11 // v22 = t7a, v30 = t15a 649cabdff1aSopenharmony_ci dmbutterfly_l v14, v15, v12, v13, v23, v24, v0.h[4], v0.h[5] // v14,v15 = t9, v12,v13 = t8 650cabdff1aSopenharmony_ci dbutterfly_n v25, v17, v4, v5, v8, v9, v6, v7, v8, v9 // v25 = t6a, v17 = t14a 651cabdff1aSopenharmony_ci 652cabdff1aSopenharmony_ci dmbutterfly_l v4, v5, v6, v7, v28, v19, v0.h[5], v0.h[4] // v4,v5 = t12, v6,v7 = t13 653cabdff1aSopenharmony_ci dbutterfly_n v23, v19, v12, v13, v4, v5, v8, v9, v4, v5 // v23 = t8a, v19 = t12a 654cabdff1aSopenharmony_ci dmbutterfly_l v10, v11, v8, v9, v21, v26, v0.h[6], v0.h[7] // v10,v11 = t11, v8,v9 = t10 655cabdff1aSopenharmony_ci butterfly_8h_r v4, v27, v16, v27 // v4 = t4, v27 = t0 656cabdff1aSopenharmony_ci dbutterfly_n v24, v28, v14, v15, v6, v7, v12, v13, v6, v7 // v24 = t9a, v28 = t13a 657cabdff1aSopenharmony_ci 658cabdff1aSopenharmony_ci dmbutterfly_l v12, v13, v14, v15, v30, v17, v0.h[7], v0.h[6] // v12,v13 = t14, v14,v15 = t15 659cabdff1aSopenharmony_ci butterfly_8h_r v5, v20, v31, v20 // v5 = t5, v20 = t1 660cabdff1aSopenharmony_ci dbutterfly_n v21, v17, v8, v9, v12, v13, v6, v7, v12, v13 // v21 = t10a, v17 = t14a 661cabdff1aSopenharmony_ci dbutterfly_n v26, v30, v10, v11, v14, v15, v8, v9, v14, v15 // v26 = t11a, v30 = t15a 662cabdff1aSopenharmony_ci 663cabdff1aSopenharmony_ci butterfly_8h_r v6, v25, v18, v25 // v6 = t6, v25 = t2 664cabdff1aSopenharmony_ci butterfly_8h_r v7, v22, v29, v22 // v7 = t7, v22 = t3 665cabdff1aSopenharmony_ci 666cabdff1aSopenharmony_ci dmbutterfly_l v10, v11, v8, v9, v19, v28, v0.h[2], v0.h[3] // v10,v11 = t13, v8,v9 = t12 667cabdff1aSopenharmony_ci dmbutterfly_l v12, v13, v14, v15, v30, v17, v0.h[3], v0.h[2] // v12,v13 = t14, v14,v15 = t15 668cabdff1aSopenharmony_ci 669cabdff1aSopenharmony_ci dbutterfly_n v18, v30, v8, v9, v12, v13, v16, v17, v12, v13 // v18 = out[2], v30 = t14a 670cabdff1aSopenharmony_ci dbutterfly_n v29, v17, v10, v11, 
v14, v15, v12, v13, v14, v15 // v29 = -out[13], v17 = t15a 671cabdff1aSopenharmony_ci neg v29.8h, v29.8h // v29 = out[13] 672cabdff1aSopenharmony_ci 673cabdff1aSopenharmony_ci dmbutterfly_l v10, v11, v8, v9, v4, v5, v0.h[2], v0.h[3] // v10,v11 = t5a, v8,v9 = t4a 674cabdff1aSopenharmony_ci dmbutterfly_l v12, v13, v14, v15, v7, v6, v0.h[3], v0.h[2] // v12,v13 = t6a, v14,v15 = t7a 675cabdff1aSopenharmony_ci 676cabdff1aSopenharmony_ci butterfly_8h v2, v6, v27, v25 // v2 = out[0], v6 = t2a 677cabdff1aSopenharmony_ci butterfly_8h v3, v7, v23, v21 // v3 =-out[1], v7 = t10 678cabdff1aSopenharmony_ci 679cabdff1aSopenharmony_ci dbutterfly_n v19, v31, v8, v9, v12, v13, v4, v5, v8, v9 // v19 = -out[3], v31 = t6 680cabdff1aSopenharmony_ci neg v19.8h, v19.8h // v19 = out[3] 681cabdff1aSopenharmony_ci dbutterfly_n v28, v16, v10, v11, v14, v15, v4, v5, v10, v11 // v28 = out[12], v16 = t7 682cabdff1aSopenharmony_ci 683cabdff1aSopenharmony_ci butterfly_8h v5, v8, v20, v22 // v5 =-out[15],v8 = t3a 684cabdff1aSopenharmony_ci butterfly_8h v4, v9, v24, v26 // v4 = out[14],v9 = t11 685cabdff1aSopenharmony_ci 686cabdff1aSopenharmony_ci dmbutterfly0 v23, v24, v6, v8, v10, v11, v12, v13, v14, v15, 1 // v23 = out[7], v24 = out[8] 687cabdff1aSopenharmony_ci dmbutterfly0 v21, v26, v30, v17, v10, v11, v12, v13, v14, v15, 1 // v21 = out[5], v26 = out[10] 688cabdff1aSopenharmony_ci dmbutterfly0 v20, v27, v16, v31, v10, v11, v12, v13, v14, v15 // v20 = out[4], v27 = out[11] 689cabdff1aSopenharmony_ci dmbutterfly0 v22, v25, v9, v7, v10, v11, v12, v13, v14, v15 // v22 = out[6], v25 = out[9] 690cabdff1aSopenharmony_ci 691cabdff1aSopenharmony_ci neg v31.8h, v5.8h // v31 = out[15] 692cabdff1aSopenharmony_ci neg v17.8h, v3.8h // v17 = out[1] 693cabdff1aSopenharmony_ci 694cabdff1aSopenharmony_ci mov v16.16b, v2.16b 695cabdff1aSopenharmony_ci mov v30.16b, v4.16b 696cabdff1aSopenharmony_ci ret 697cabdff1aSopenharmony_ciendfunc 698cabdff1aSopenharmony_ci 699cabdff1aSopenharmony_ci// Helper macros; we can't 
// use these expressions directly within
// e.g. .irp due to the extra concatenation \(). Therefore wrap
// them in macros to allow using .irp below.

// Load 8 halfwords into v<i> from [src], post-incrementing src by inc.
.macro load i, src, inc
        ld1             {v\i\().8h},  [\src], \inc
.endm
// Store v<i> (8 halfwords) to [dst], post-incrementing dst by inc.
.macro store i, dst, inc
        st1             {v\i\().8h},  [\dst], \inc
.endm
// movi with the register number and arrangement suffix passed separately.
.macro movi_v i, size, imm
        movi            v\i\()\size,  \imm
.endm
// Load v<i> from [src], then overwrite the same 16 bytes with v2 and
// post-increment src by inc. The callers in this file set v2 to zero first,
// so this clears the coefficients as they are read.
.macro load_clear i, src, inc
        ld1             {v\i\().8h}, [\src]
        st1             {v2.8h},  [\src], \inc
.endm

// Round 8 rows of coefficients (>> 6 with rounding), add them to 8 rows of
// 8 destination pixels, saturate to unsigned 8 bit and store the rows back.
//   coef0-coef7: register operands as passed by the callers (vN.8h)
//   tmp1, tmp2:  scratch operands (vN.8b) for the last two rows
//   x0 / x3:     destination pointers for alternating rows
//   x1:          increment applied to x0 and x3 after each row
// Overwrites v2-v7 and the tmp registers. x0 and x3 are rewound by 4*x1 after
// the loads, so after the stores both end up 4*x1 past their initial value.
// Loads, arithmetic and stores are interleaved; keep the order when editing.
.macro load_add_store coef0, coef1, coef2, coef3, coef4, coef5, coef6, coef7, tmp1, tmp2
        srshr           \coef0, \coef0, #6
        ld1             {v2.8b},  [x0], x1
        srshr           \coef1, \coef1, #6
        ld1             {v3.8b},  [x3], x1
        srshr           \coef2, \coef2, #6
        ld1             {v4.8b},  [x0], x1
        srshr           \coef3, \coef3, #6
        uaddw           \coef0, \coef0, v2.8b
        ld1             {v5.8b},  [x3], x1
        uaddw           \coef1, \coef1, v3.8b
        srshr           \coef4, \coef4, #6
        ld1             {v6.8b},  [x0], x1
        srshr           \coef5, \coef5, #6
        ld1             {v7.8b},  [x3], x1
        sqxtun          v2.8b,  \coef0
        srshr           \coef6, \coef6, #6
        sqxtun          v3.8b,  \coef1
        srshr           \coef7, \coef7, #6
        uaddw           \coef2, \coef2, v4.8b
        ld1             {\tmp1},  [x0], x1
        uaddw           \coef3, \coef3, v5.8b
        ld1             {\tmp2},  [x3], x1
        sqxtun          v4.8b,  \coef2
        // All 8 rows are loaded; step both pointers back to the first row.
        sub             x0,  x0,  x1, lsl #2
        sub             x3,  x3,  x1, lsl #2
        sqxtun          v5.8b,  \coef3
        uaddw           \coef4, \coef4, v6.8b
        st1             {v2.8b},  [x0], x1
        uaddw           \coef5, \coef5, v7.8b
        st1             {v3.8b},  [x3], x1
        sqxtun          v6.8b,  \coef4
        st1             {v4.8b},  [x0], x1
        sqxtun          v7.8b,  \coef5
        st1             {v5.8b},  [x3], x1
        uaddw           \coef6, \coef6, \tmp1
        st1             {v6.8b},  [x0], x1
        uaddw           \coef7, \coef7, \tmp2
        st1             {v7.8b},  [x3], x1
        sqxtun          \tmp1,  \coef6
        sqxtun          \tmp2,  \coef7
        st1             {\tmp1},  [x0], x1
        st1             {\tmp2},  [x3], x1
.endm

// Read a vertical 8x16 slice out of a 16x16 matrix, do a transform on it,
// transpose into a horizontal 16x8 slice and store.
// x0 = dst (temp buffer)
// x1 = slice offset
// x2 = src
// x9 = input stride
// The return address is kept in x14 across the bl to the transform.
.macro itxfm16_1d_funcs txfm
function \txfm\()16_1d_8x16_pass1_neon
        mov             x14, x30

        // v2 is the zero vector that load_clear writes back over the input.
        movi            v2.8h, #0
.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
        load_clear      \i,  x2,  x9
.endr

        bl              \txfm\()16

        // Do two 8x8 transposes. Originally, v16-v31 contain the
        // 16 rows. Afterwards, v16-v23 and v24-v31 contain the two
        // transposed 8x8 blocks.
        transpose_8x8H  v16, v17, v18, v19, v20, v21, v22, v23, v2, v3
        transpose_8x8H  v24, v25, v26, v27, v28, v29, v30, v31, v2, v3

        // Store the transposed 8x8 blocks horizontally.
        cmp             x1,  #8
        b.eq            1f
.irp i, 16, 24, 17, 25, 18, 26, 19, 27, 20, 28, 21, 29, 22, 30, 23, 31
        store           \i,  x0,  #16
.endr
        ret             x14
1:
        // Special case: For the last input column (x1 == 8),
        // which would be stored as the last row in the temp buffer,
        // don't store the first 8x8 block, but keep it in registers
        // for the first slice of the second pass (where it is the
        // last 8x8 block).
.irp i, 24, 25, 26, 27, 28, 29, 30, 31
        add             x0,  x0,  #16
        store           \i,  x0,  #16
.endr
        mov             v24.16b, v16.16b
        mov             v25.16b, v17.16b
        mov             v26.16b, v18.16b
        mov             v27.16b, v19.16b
        mov             v28.16b, v20.16b
        mov             v29.16b, v21.16b
        mov             v30.16b, v22.16b
        mov             v31.16b, v23.16b
        ret             x14
endfunc

// Read a vertical 8x16 slice out of a 16x16 matrix, do a transform on it,
// load the destination pixels (from a similar 8x16 slice), add and store back.
// x0 = dst
// x1 = dst stride
// x2 = src (temp buffer)
// x3 = slice offset
// x9 = temp buffer stride
function \txfm\()16_1d_8x16_pass2_neon
        mov             x14, x30
.irp i, 16, 17, 18, 19, 20, 21, 22, 23
        load            \i,  x2,  x9
.endr
        // For slice offset 0, v24-v31 were left in registers by pass 1.
        cbz             x3,  1f
.irp i, 24, 25, 26, 27, 28, 29, 30, 31
        load            \i,  x2,  x9
.endr
1:

        // load_add_store walks even rows via x0 and odd rows via x3,
        // each with a doubled stride in x1.
        add             x3,  x0,  x1
        lsl             x1,  x1,  #1
        bl              \txfm\()16

        load_add_store  v16.8h, v17.8h, v18.8h, v19.8h, v20.8h, v21.8h, v22.8h, v23.8h, v16.8b, v17.8b
        load_add_store  v24.8h, v25.8h, v26.8h, v27.8h, v28.8h, v29.8h, v30.8h, v31.8h, v16.8b, v17.8b

        ret             x14
endfunc
.endm

itxfm16_1d_funcs idct
itxfm16_1d_funcs iadst

// Exported 16x16 inverse transform + add entry points.
// x0 = dst, x1 = dst stride, x2 = coefficient block; w3 is compared against
// 1, 10 and 38 to select the dc / quarter / half / full idct paths
// (presumably the end-of-block count - confirm against the C prototype).
// A 512 byte temp buffer (16 rows, 32 bytes each) is allocated on the stack.
// x4-x6 keep copies of x0-x2, x15 keeps the return address, x10/x11 point
// at the coefficient tables, x9 is the 32 byte row stride.
.macro itxfm_func16x16 txfm1, txfm2
function ff_vp9_\txfm1\()_\txfm2\()_16x16_add_neon, export=1
.ifc \txfm1\()_\txfm2,idct_idct
        cmp             w3,  #1
        b.eq            idct16x16_dc_add_neon
.endif
        mov             x15, x30
        // iadst16 requires clobbering v8-v15, but idct16 doesn't need to.
.ifnc \txfm1\()_\txfm2,idct_idct
        stp             d14, d15, [sp, #-0x10]!
        stp             d12, d13, [sp, #-0x10]!
        stp             d10, d11, [sp, #-0x10]!
        stp             d8,  d9,  [sp, #-0x10]!
.endif

        sub             sp,  sp,  #512

        mov             x4,  x0
        mov             x5,  x1
        mov             x6,  x2

        movrel          x10, idct_coeffs
.ifnc \txfm1\()_\txfm2,idct_idct
        movrel          x11, iadst16_coeffs
.endif
.ifc \txfm1,idct
        ld1             {v0.8h,v1.8h}, [x10]
.endif
        mov             x9,  #32

.ifc \txfm1\()_\txfm2,idct_idct
        // These are plain branches, not calls: the partial functions free
        // the temp buffer and return through x15 themselves.
        cmp             w3,  #10
        b.le            idct16x16_quarter_add_neon
        cmp             w3,  #38
        b.le            idct16x16_half_add_neon
.endif

.irp i, 0, 8
        add             x0,  sp,  #(\i*32)
.ifc \txfm1\()_\txfm2,idct_idct
.if \i == 8
        cmp             w3,  #38
        b.le            1f
.endif
.endif
        mov             x1,  #\i
        add             x2,  x6,  #(\i*2)
        bl              \txfm1\()16_1d_8x16_pass1_neon
.endr
.ifc \txfm1\()_\txfm2,iadst_idct
        // iadst16 left other coefficients in v0/v1; reload for idct16.
        ld1             {v0.8h,v1.8h}, [x10]
.endif

.ifc \txfm1\()_\txfm2,idct_idct
        b               3f
1:
        // Set v24-v31 to zero, for the in-register passthrough of
        // coefficients to pass 2. Since we only do two slices, this can
        // only ever happen for the second slice. So we only need to store
        // zeros to the temp buffer for the second half of the buffer.
        // Move x0 to the second half, and use x9 == 32 as increment.
        add             x0,  x0,  #16
.irp i, 24, 25, 26, 27, 28, 29, 30, 31
        movi_v          \i,  .16b, #0
        st1             {v24.8h},  [x0], x9
.endr
3:
.endif

.irp i, 0, 8
        add             x0,  x4,  #(\i)
        mov             x1,  x5
        add             x2,  sp,  #(\i*2)
        mov             x3,  #\i
        bl              \txfm2\()16_1d_8x16_pass2_neon
.endr

        add             sp,  sp,  #512
.ifnc \txfm1\()_\txfm2,idct_idct
        ldp             d8,  d9,  [sp], 0x10
        ldp             d10, d11, [sp], 0x10
        ldp             d12, d13, [sp], 0x10
        ldp             d14, d15, [sp], 0x10
.endif
        ret             x15
endfunc
.endm

itxfm_func16x16 idct,  idct
itxfm_func16x16 iadst, idct
itxfm_func16x16 idct,  iadst
itxfm_func16x16 iadst, iadst

// Pass 1 for the quarter case: only four input rows (v16-v19) are loaded
// and cleared. x0 = temp buffer, x2 = src, x9 = stride (used both for the
// input rows and for the temp buffer rows).
function idct16_1d_8x16_pass1_quarter_neon
        mov             x14, x30
        movi            v2.8h, #0
.irp i, 16, 17, 18, 19
        load_clear      \i,  x2,  x9
.endr

        bl              idct16_quarter

        // Do two 8x8 transposes. Originally, v16-v31 contain the
        // 16 rows. Afterwards, v16-v23 and v24-v31 contain the two
        // transposed 8x8 blocks.
        transpose_8x8H  v16, v17, v18, v19, v20, v21, v22, v23, v2, v3
        transpose_8x8H  v24, v25, v26, v27, v28, v29, v30, v31, v2, v3

        // Store the transposed 8x8 blocks horizontally.
        // The first 8x8 block is kept in registers for the second pass,
        // store the rest in the temp buffer.
        // Since only a 4x4 part of the input was nonzero, this means that
        // only 4 rows are nonzero after transposing, and the second pass
        // only reads the topmost 4 rows. Therefore only store the topmost
        // 4 rows.
        add             x0,  x0,  #16
.irp i, 24, 25, 26, 27
        store           \i,  x0,  x9
.endr
        ret             x14
endfunc

// Pass 2 for the quarter case. Same register interface as the generic
// pass 2; for slice offset x3 == 0 the inputs are still in v16-v19 from
// pass 1, so the loads are skipped.
function idct16_1d_8x16_pass2_quarter_neon
        mov             x14, x30
        cbz             x3,  1f
.irp i, 16, 17, 18, 19
        load            \i,  x2,  x9
.endr
1:

        add             x3,  x0,  x1
        lsl             x1,  x1,  #1
        bl              idct16_quarter

        load_add_store  v16.8h, v17.8h, v18.8h, v19.8h, v20.8h, v21.8h, v22.8h, v23.8h, v16.8b, v17.8b
        load_add_store  v24.8h, v25.8h, v26.8h, v27.8h, v28.8h, v29.8h, v30.8h, v31.8h, v16.8b, v17.8b

        ret             x14
endfunc

// Pass 1 for the half case: eight input rows (v16-v23) are loaded and
// cleared. x0 = temp buffer, x2 = src, x9 = stride.
function idct16_1d_8x16_pass1_half_neon
        mov             x14, x30
        movi            v2.8h, #0
.irp i, 16, 17, 18, 19, 20, 21, 22, 23
        load_clear      \i,  x2,  x9
.endr

        bl              idct16_half

        // Do two 8x8 transposes. Originally, v16-v31 contain the
        // 16 rows. Afterwards, v16-v23 and v24-v31 contain the two
        // transposed 8x8 blocks.
        transpose_8x8H  v16, v17, v18, v19, v20, v21, v22, v23, v2, v3
        transpose_8x8H  v24, v25, v26, v27, v28, v29, v30, v31, v2, v3

        // Store the transposed 8x8 blocks horizontally.
        // The first 8x8 block is kept in registers for the second pass,
        // store the rest in the temp buffer.
        add             x0,  x0,  #16
.irp i, 24, 25, 26, 27, 28, 29, 30, 31
        store           \i,  x0,  x9
.endr
        ret             x14
endfunc

// Pass 2 for the half case. For slice offset x3 == 0 the inputs are still
// in v16-v23 from pass 1, so the loads are skipped.
function idct16_1d_8x16_pass2_half_neon
        mov             x14, x30
        cbz             x3,  1f
.irp i, 16, 17, 18, 19, 20, 21, 22, 23
        load            \i,  x2,  x9
.endr
1:

        add             x3,  x0,  x1
        lsl             x1,  x1,  #1
        bl              idct16_half

        load_add_store  v16.8h, v17.8h, v18.8h, v19.8h, v20.8h, v21.8h, v22.8h, v23.8h, v16.8b, v17.8b
        load_add_store  v24.8h, v25.8h, v26.8h, v27.8h, v28.8h, v29.8h, v30.8h, v31.8h, v16.8b, v17.8b

        ret             x14
endfunc

// Continuation of ff_vp9_idct_idct_16x16_add_neon for the quarter / half
// cases. It is branched to (not called) after the 512 byte temp buffer has
// been allocated and x4-x6, x9, x15 have been set up, so it releases the
// buffer and returns through x15 itself. Only one pass 1 slice is run.
.macro idct16_partial size
function idct16x16_\size\()_add_neon
        add             x0,  sp,  #(0*32)
        add             x2,  x6,  #(0*2)
        bl              idct16_1d_8x16_pass1_\size\()_neon
.irp i, 0, 8
        add             x0,  x4,  #(\i)
        mov             x1,  x5
        add             x2,  sp,  #(\i*2)
        mov             x3,  #\i
        bl              idct16_1d_8x16_pass2_\size\()_neon
.endr

        add             sp,  sp,  #512
        ret             x15
endfunc
.endm

idct16_partial quarter
idct16_partial half

// DC-only 32x32 case: x0 = dst, x1 = dst stride, x2 = coefficient block.
// The single DC coefficient is scaled twice by idct_coeffs[0] (11585) with a
// rounding shift by 14, cleared in memory, rounded by 6 bits and added to
// every pixel of the 32x32 destination with unsigned saturation.
function idct32x32_dc_add_neon
        movrel          x4,  idct_coeffs
        ld1             {v0.4h}, [x4]

        movi            v1.4h,  #0

        ld1             {v2.h}[0], [x2]
        smull           v2.4s,  v2.4h,  v0.h[0]
        rshrn           v2.4h,  v2.4s,  #14
        smull           v2.4s,  v2.4h,  v0.h[0]
        rshrn           v2.4h,  v2.4s,  #14
        dup             v2.8h,  v2.h[0]
        st1             {v1.h}[0], [x2]

        srshr           v0.8h,  v2.8h,  #6

        // x0 is the read pointer, x3 the write pointer, x4 the row counter.
        mov             x3,  x0
        mov             x4,  #32
1:
        // Loop to add the constant v0 into all 32x32 outputs
        subs            x4,  x4,  #2
        ld1             {v1.16b,v2.16b},  [x0], x1
        uaddw           v16.8h, v0.8h,  v1.8b
        uaddw2          v17.8h, v0.8h,  v1.16b
        ld1             {v3.16b,v4.16b},  [x0], x1
        uaddw           v18.8h, v0.8h,  v2.8b
        uaddw2          v19.8h, v0.8h,  v2.16b
        uaddw           v20.8h, v0.8h,  v3.8b
        uaddw2          v21.8h, v0.8h,  v3.16b
        uaddw           v22.8h, v0.8h,  v4.8b
        uaddw2          v23.8h, v0.8h,  v4.16b
        sqxtun          v1.8b,  v16.8h
        sqxtun2         v1.16b, v17.8h
        sqxtun          v2.8b,  v18.8h
        sqxtun2         v2.16b, v19.8h
        sqxtun          v3.8b,  v20.8h
        sqxtun2         v3.16b, v21.8h
        st1             {v1.16b,v2.16b},  [x3], x1
        sqxtun          v4.8b,  v22.8h
        sqxtun2         v4.16b, v23.8h
        st1             {v3.16b,v4.16b},  [x3], x1
        b.ne            1b

        ret
endfunc

// Common tail of the idct32_odd* functions. It ends in ret, so expanding it
// also returns from the enclosing function.
.macro idct32_end
        butterfly_8h    v16, v5,  v4,  v5  // v16 = t16a, v5  = t19a
        butterfly_8h    v17, v20, v23, v20 // v17 = t17,  v20 = t18
        butterfly_8h    v18, v6,  v7,  v6  // v18 = t23a, v6  = t20a
        butterfly_8h    v19, v21, v22, v21 // v19 = t22,  v21 = t21
        butterfly_8h    v4,  v28, v28, v30 // v4  = t24a, v28 = t27a
        butterfly_8h    v23, v26, v25, v26 // v23 = t25,  v26 = t26
        butterfly_8h    v7,  v3,  v29, v31 // v7  = t31a, v3  = t28a
        butterfly_8h    v22, v27, v24, v27 // v22 = t30,  v27 = t29

        dmbutterfly     v27, v20, v0.h[2], v0.h[3], v24, v25, v30, v31        // v27 = t18a, v20 = t29a
        dmbutterfly     v3,  v5,  v0.h[2], v0.h[3], v24, v25, v30, v31        // v3  = t19,  v5  = t28
        dmbutterfly     v28, v6,  v0.h[2], v0.h[3], v24, v25, v30, v31, neg=1 // v28 = t27,  v6  = t20
        dmbutterfly     v26, v21, v0.h[2], v0.h[3], v24, v25, v30, v31, neg=1 // v26 = t26a, v21 = t21a

        butterfly_8h    v31, v24, v7,  v4  // v31 = t31,  v24 = t24
        butterfly_8h    v30, v25, v22, v23 // v30 = t30a, v25 = t25a
        butterfly_8h_r  v23, v16, v16, v18 // v23 = t23,  v16 = t16
        butterfly_8h_r  v22, v17, v17, v19 // v22 = t22a, v17 = t17a
        butterfly_8h    v18, v21, v27, v21 // v18 = t18,  v21 = t21
        butterfly_8h_r  v27, v28, v5,  v28 // v27 = t27a, v28 = t28a
        butterfly_8h    v29, v26, v20, v26 // v29 = t29,  v26 = t26
        butterfly_8h    v19, v20, v3,  v6  // v19 = t19a, v20 = t20

        dmbutterfly0    v27, v20, v27, v20, v2, v3, v4, v5, v6, v7 // v27 = t27,  v20 = t20
        dmbutterfly0    v26, v21, v26, v21, v2, v3, v4, v5, v6, v7 // v26 = t26a, v21 = t21a
        dmbutterfly0    v25, v22, v25, v22, v2, v3, v4, v5, v6, v7 // v25 = t25,  v22 = t22
        dmbutterfly0    v24, v23, v24, v23, v2, v3, v4, v5, v6, v7 // v24 = t24a, v23 = t23a
        ret
.endm

// Odd half of the 32-point IDCT on v16-v31 (t16..t31), in place.
// Reads coefficients from v8 and v9 for the first stage and from v0 for the
// later stages; these must be set up by the caller (not visible here).
// Uses v2-v7 as temporaries. Returns through idct32_end.
function idct32_odd
        dmbutterfly     v16, v31, v8.h[0], v8.h[1], v4, v5, v6, v7 // v16 = t16a, v31 = t31a
        dmbutterfly     v24, v23, v8.h[2], v8.h[3], v4, v5, v6, v7 // v24 = t17a, v23 = t30a
        dmbutterfly     v20, v27, v8.h[4], v8.h[5], v4, v5, v6, v7 // v20 = t18a, v27 = t29a
        dmbutterfly     v28, v19, v8.h[6], v8.h[7], v4, v5, v6, v7 // v28 = t19a, v19 = t28a
        dmbutterfly     v18, v29, v9.h[0], v9.h[1], v4, v5, v6, v7 // v18 = t20a, v29 = t27a
        dmbutterfly     v26, v21, v9.h[2], v9.h[3], v4, v5, v6, v7 // v26 = t21a, v21 = t26a
        dmbutterfly     v22, v25, v9.h[4], v9.h[5], v4, v5, v6, v7 // v22 = t22a, v25 = t25a
        dmbutterfly     v30, v17, v9.h[6], v9.h[7], v4, v5, v6, v7 // v30 = t23a, v17 = t24a

        butterfly_8h    v4,  v24, v16, v24 // v4  = t16, v24 = t17
        butterfly_8h    v5,  v20, v28, v20 // v5  = t19, v20 = t18
        butterfly_8h    v6,  v26, v18, v26 // v6  = t20, v26 = t21
        butterfly_8h    v7,  v22, v30, v22 // v7  = t23, v22 = t22
        butterfly_8h    v28, v25, v17, v25 // v28 = t24, v25 = t25
        butterfly_8h    v30, v21, v29, v21 // v30 = t27, v21 = t26
        butterfly_8h    v29, v23, v31, v23 // v29 = t31, v23 = t30
        butterfly_8h    v31, v27, v19, v27 // v31 = t28, v27 = t29

        dmbutterfly     v23, v24, v0.h[4], v0.h[5], v16, v17, v18, v19        // v23 = t17a, v24 = t30a
        dmbutterfly     v27, v20, v0.h[4], v0.h[5], v16, v17, v18, v19, neg=1 // v27 = t29a, v20 = t18a
        dmbutterfly     v21, v26, v0.h[6], v0.h[7], v16, v17, v18, v19        // v21 = t21a, v26 = t26a
        dmbutterfly     v25, v22, v0.h[6], v0.h[7], v16, v17, v18, v19, neg=1 // v25 = t25a, v22 = t22a
        idct32_end
endfunc

// Same as idct32_odd, except that the first stage uses the dmbutterfly_h1 /
// dmbutterfly_h2 macros (defined elsewhere in this file; presumably for
// inputs whose second half is zero - confirm against the callers).
function idct32_odd_half
        dmbutterfly_h1  v16, v31, v8.h[0], v8.h[1], v4, v5, v6, v7 // v16 = t16a, v31 = t31a
        dmbutterfly_h2  v24, v23, v8.h[2], v8.h[3], v4, v5, v6, v7 // v24 = t17a, v23 = t30a
        dmbutterfly_h1  v20, v27, v8.h[4], v8.h[5], v4, v5, v6, v7 // v20 = t18a, v27 = t29a
        dmbutterfly_h2  v28, v19, v8.h[6], v8.h[7], v4, v5, v6, v7 // v28 = t19a, v19 = t28a
        dmbutterfly_h1  v18, v29, v9.h[0], v9.h[1], v4, v5, v6, v7 // v18 = t20a, v29 = t27a
        dmbutterfly_h2  v26, v21, v9.h[2], v9.h[3], v4, v5, v6, v7 // v26 = t21a, v21 = t26a
        dmbutterfly_h1  v22, v25, v9.h[4], v9.h[5], v4, v5, v6, v7 // v22 = t22a, v25 = t25a
        dmbutterfly_h2  v30, v17, v9.h[6], v9.h[7], v4, v5, v6, v7 // v30 = t23a, v17 = t24a

        butterfly_8h    v4,  v24, v16, v24 // v4  = t16, v24 = t17
        butterfly_8h    v5,  v20, v28, v20 // v5  = t19, v20 = t18
        butterfly_8h    v6,  v26, v18, v26 // v6  = t20, v26 = t21
        butterfly_8h    v7,  v22, v30, v22 // v7  = t23, v22 = t22
        butterfly_8h    v28, v25, v17, v25 // v28 = t24, v25 = t25
        butterfly_8h    v30, v21, v29, v21 // v30 = t27, v21 = t26
        butterfly_8h    v29, v23, v31, v23 // v29 = t31, v23 = t30
        butterfly_8h    v31, v27, v19, v27 // v31 = t28, v27 = t29

        dmbutterfly     v23, v24, v0.h[4], v0.h[5], v16, v17, v18, v19        // v23 = t17a, v24 = t30a
        dmbutterfly     v27, v20, v0.h[4], v0.h[5], v16, v17, v18, v19, neg=1 // v27 = t29a, v20 = t18a
        dmbutterfly     v21, v26, v0.h[6], v0.h[7], v16, v17, v18, v19        // v21 = t21a, v26 = t26a
        dmbutterfly     v25, v22, v0.h[6], v0.h[7], v16, v17, v18, v19, neg=1 // v25 = t25a, v22 = t22a
        idct32_end
endfunc

// Variant of idct32_odd that reads only v16-v19 as inputs. Each first-stage
// product is a single widening multiply (dsmull_h), negated where required
// and narrowed with a rounding shift by 14 (drshrn_h).
// Reads coefficients from v8, v9 and v0. Returns through idct32_end.
function idct32_odd_quarter
        dsmull_h        v4,  v5,  v16, v8.h[0]
        dsmull_h        v28, v29, v19, v8.h[7]
        dsmull_h        v30, v31, v16, v8.h[1]
        dsmull_h        v22, v23, v17, v9.h[6]
        dsmull_h        v7,  v6,  v17, v9.h[7]
        dsmull_h        v26, v27, v19, v8.h[6]
        dsmull_h        v20, v21, v18, v9.h[0]
        dsmull_h        v24, v25, v18, v9.h[1]

        neg             v28.4s, v28.4s
        neg             v29.4s, v29.4s
        neg             v7.4s,  v7.4s
        neg             v6.4s,  v6.4s

        drshrn_h        v4,  v4,  v5,  #14
        drshrn_h        v5,  v28, v29, #14
        drshrn_h        v29, v30, v31, #14
        drshrn_h        v28, v22, v23, #14
        drshrn_h        v7,  v7,  v6,  #14
        drshrn_h        v31, v26, v27, #14
        drshrn_h        v6,  v20, v21, #14
        drshrn_h        v30, v24, v25, #14

        dmbutterfly_l   v16, v17, v18, v19, v29, v4,  v0.h[4], v0.h[5]
        dmbutterfly_l   v27, v26, v20, v21, v31, v5,  v0.h[4], v0.h[5]
        drshrn_h        v23, v16, v17, #14
        drshrn_h        v24, v18, v19, #14
        neg             v20.4s, v20.4s
        neg             v21.4s, v21.4s
        drshrn_h        v27, v27, v26, #14
        drshrn_h        v20, v20, v21, #14
        dmbutterfly_l   v16, v17, v18, v19, v30, v6,  v0.h[6], v0.h[7]
        drshrn_h        v21, v16, v17, #14
        drshrn_h        v26, v18, v19, #14
        dmbutterfly_l   v16, v17, v18, v19, v28, v7,  v0.h[6], v0.h[7]
        drshrn_h        v25, v16, v17, #14
        neg             v18.4s, v18.4s
        neg             v19.4s, v19.4s
        drshrn_h        v22, v18, v19, #14

        idct32_end
endfunc

// idct32_funcs suffix
//
// Emits the two 1-D pass functions of the 32x32 inverse DCT:
//   idct32_1d_8x32_pass1<suffix>_neon
//   idct32_1d_8x32_pass2<suffix>_neon
// The suffix is empty, _quarter or _half. It selects how many input rows
// are loaded per half (16, 4 or 8) and which idct16<suffix> /
// idct32_odd<suffix> helpers are called.
.macro idct32_funcs suffix
// Pass 1: run a 32-point IDCT over an 8x32 slice of the 32x32 coefficient
// matrix and write the result, transposed, into the temp buffer.
//
// The 32-point IDCT is split into two 16-point transforms:
//  - a regular idct16 over the even-numbered inputs, whose outputs are
//    stored twice (once in order, once mirrored), and
//  - a 16-point transform over the odd-numbered inputs (idct32_odd), whose
//    outputs are added to the in-order copy and subtracted from the
//    mirrored copy.
//
// In:
//   x0 = dst (temp buffer)
//   x1 = unused
//   x2 = src
//   x9 = double input stride
// Uses x14 to hold the return address across the inner calls.
// x0 is left 512 bytes past its entry value (eight 64-byte rows written).
function idct32_1d_8x32_pass1\suffix\()_neon
        mov             x14, x30
        // v2 is zeroed ahead of load_clear (macro defined elsewhere;
        // presumably it stores v2 over each row after loading it - confirm).
        movi            v2.8h,  #0

        // Even inputs: v16 = IN(0), v17 = IN(2) ... v31 = IN(30)
.ifb \suffix
.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
        load_clear      \i, x2, x9
.endr
.endif
.ifc \suffix,_quarter
.irp i, 16, 17, 18, 19
        load_clear      \i, x2, x9
.endr
.endif
.ifc \suffix,_half
.irp i, 16, 17, 18, 19, 20, 21, 22, 23
        load_clear      \i, x2, x9
.endr
.endif

        bl              idct16\suffix

        // v16-v31 hold the 16 output rows. Transpose them as two
        // independent 8x8 blocks: v16-v23 and v24-v31 (v2/v3 are scratch).
        transpose_8x8H  v16, v17, v18, v19, v20, v21, v22, v23, v2, v3
        transpose_8x8H  v24, v25, v26, v27, v28, v29, v30, v31, v2, v3

        // store_rev a, b: write one 32-element row as  a, b, rev(b), rev(a).
.macro store_rev a, b
        // A full 128-bit element reversal is done in two steps:
        // rev64 reverses the 16-bit lanes inside each 64-bit half, and
        // ext with an 8 byte offset then swaps the two halves.
        rev64           v3.8h,  \b
        st1             {\a},   [x0], #16
        rev64           v2.8h,  \a
        ext             v3.16b, v3.16b, v3.16b, #8
        st1             {\b},   [x0], #16
        ext             v2.16b, v2.16b, v2.16b, #8
        st1             {v3.8h}, [x0], #16
        st1             {v2.8h}, [x0], #16
.endm
        store_rev       v16.8h, v24.8h
        store_rev       v17.8h, v25.8h
        store_rev       v18.8h, v26.8h
        store_rev       v19.8h, v27.8h
        store_rev       v20.8h, v28.8h
        store_rev       v21.8h, v29.8h
        store_rev       v22.8h, v30.8h
        store_rev       v23.8h, v31.8h
        // 8 rows x 64 bytes were written; rewind dst to the first row.
        sub             x0,  x0,  #512
.purgem store_rev

        // Rewind x2 by the number of rows loaded above (16, 4 or 8 double
        // strides), then step 64 bytes forward to the first odd row.
.ifb \suffix
        sub             x2,  x2,  x9, lsl #4
.endif
.ifc \suffix,_quarter
        sub             x2,  x2,  x9, lsl #2
.endif
.ifc \suffix,_half
        sub             x2,  x2,  x9, lsl #3
.endif
        add             x2,  x2,  #64

        // v2 was used as scratch above, so zero it again before load_clear.
        movi            v2.8h,  #0
        // Odd inputs: v16 = IN(1), v17 = IN(3) ... v31 = IN(31)
.ifb \suffix
.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
        load_clear      \i, x2, x9
.endr
.endif
.ifc \suffix,_quarter
.irp i, 16, 17, 18, 19
        load_clear      \i, x2, x9
.endr
.endif
.ifc \suffix,_half
.irp i, 16, 17, 18, 19, 20, 21, 22, 23
        load_clear      \i, x2, x9
.endr
.endif

        bl              idct32_odd\suffix

        // Same two 8x8 transposes, with the register lists in descending
        // order this time.
        transpose_8x8H  v31, v30, v29, v28, v27, v26, v25, v24, v2, v3
        transpose_8x8H  v23, v22, v21, v20, v19, v18, v17, v16, v2, v3

        // store_rev a, b (second definition): read back the row written
        // above and combine it with the odd-half outputs:
        //   first 16 elements  += a, b
        //   last 16 elements   -= rev(b), rev(a)
.macro store_rev a, b
        ld1             {v4.8h},  [x0]
        rev64           v3.8h,  \b
        add             v4.8h,  v4.8h,  \a
        rev64           v2.8h,  \a
        st1             {v4.8h},  [x0], #16
        ext             v3.16b, v3.16b, v3.16b, #8
        ld1             {v5.8h},  [x0]
        ext             v2.16b, v2.16b, v2.16b, #8
        add             v5.8h,  v5.8h,  \b
        st1             {v5.8h},  [x0], #16
        ld1             {v6.8h},  [x0]
        sub             v6.8h,  v6.8h,  v3.8h
        st1             {v6.8h},  [x0], #16
        ld1             {v7.8h},  [x0]
        sub             v7.8h,  v7.8h,  v2.8h
        st1             {v7.8h},  [x0], #16
.endm

        store_rev       v31.8h, v23.8h
        store_rev       v30.8h, v22.8h
        store_rev       v29.8h, v21.8h
        store_rev       v28.8h, v20.8h
        store_rev       v27.8h, v19.8h
        store_rev       v26.8h, v18.8h
        store_rev       v25.8h, v17.8h
        store_rev       v24.8h, v16.8h
.purgem store_rev
        ret             x14
endfunc

// Pass 2: the same even/odd decomposition as pass 1, but with no transpose.
// The source (temp buffer) doubles as scratch space between the two
// 16-point transforms, and the final result is rounded and added to the
// destination pixels.
//
// In:
//   x0 = dst
//   x1 = dst stride
//   x2 = src (temp buffer)
//   x7 = negative double temp buffer stride
//   x9 = double temp buffer stride
// Uses x14 to hold the return address across the inner calls.
// Uses v10/v11 in addition to v2-v7 as temporaries in the output stage.
function idct32_1d_8x32_pass2\suffix\()_neon
        mov             x14, x30
        // Even inputs: v16 = IN(0), v17 = IN(2) ... v31 = IN(30).
        // After the loads x2 is moved back to where it started.
.ifb \suffix
.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
        load            \i, x2, x9
.endr
        sub             x2,  x2,  x9, lsl #4
.endif
.ifc \suffix,_quarter
.irp i, 16, 17, 18, 19
        load            \i, x2, x9
.endr
        sub             x2,  x2,  x9, lsl #2
.endif
.ifc \suffix,_half
.irp i, 16, 17, 18, 19, 20, 21, 22, 23
        load            \i, x2, x9
.endr
        sub             x2,  x2,  x9, lsl #3
.endif

        bl              idct16\suffix

        // Park all 16 idct16 outputs in the even rows of the temp buffer.
.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
        store           \i, x2, x9
.endr

        // Back to the first row, then 64 bytes forward to the first odd row.
        sub             x2,  x2,  x9, lsl #4
        add             x2,  x2,  #64

        // Odd inputs: v16 = IN(1), v17 = IN(3) ... v31 = IN(31)
.ifb \suffix
.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
        load            \i, x2, x9
.endr
        sub             x2,  x2,  x9, lsl #4
.endif
.ifc \suffix,_quarter
.irp i, 16, 17, 18, 19
        load            \i, x2, x9
.endr
        sub             x2,  x2,  x9, lsl #2
.endif
.ifc \suffix,_half
.irp i, 16, 17, 18, 19, 20, 21, 22, 23
        load            \i, x2, x9
.endr
        sub             x2,  x2,  x9, lsl #3
.endif
        // Undo the 64 byte offset: x2 points at the parked idct16 outputs.
        sub             x2,  x2,  #64

        bl              idct32_odd\suffix

        // load_acc_store a, b, c, d, neg
        // Produces four output rows:
        //  1. load four parked idct16 rows from the temp buffer, walking
        //     forwards (stride x9) when neg is 0 and backwards (stride x7)
        //     otherwise,
        //  2. add (neg == 0) or subtract (neg != 0) the odd-half rows
        //     a, b, c, d,
        //  3. rounding shift right by 6,
        //  4. add the 8 destination pixels of each row (widening add),
        //     saturate to unsigned 8 bit and store back to dst.
        // The loads, arithmetic and stores of the four rows are interleaved.
.macro load_acc_store a, b, c, d, neg=0
.if \neg == 0
        ld1             {v4.8h},  [x2], x9
        ld1             {v5.8h},  [x2], x9
        add             v4.8h,  v4.8h,  \a
        ld1             {v6.8h},  [x2], x9
        add             v5.8h,  v5.8h,  \b
        ld1             {v7.8h},  [x2], x9
        add             v6.8h,  v6.8h,  \c
        add             v7.8h,  v7.8h,  \d
.else
        ld1             {v4.8h},  [x2], x7
        ld1             {v5.8h},  [x2], x7
        sub             v4.8h,  v4.8h,  \a
        ld1             {v6.8h},  [x2], x7
        sub             v5.8h,  v5.8h,  \b
        ld1             {v7.8h},  [x2], x7
        sub             v6.8h,  v6.8h,  \c
        sub             v7.8h,  v7.8h,  \d
.endif
        ld1             {v10.8b}, [x0], x1
        ld1             {v11.8b}, [x0], x1
        srshr           v4.8h,  v4.8h,  #6
        ld1             {v2.8b},  [x0], x1
        srshr           v5.8h,  v5.8h,  #6
        uaddw           v4.8h,  v4.8h,  v10.8b
        ld1             {v3.8b},  [x0], x1
        srshr           v6.8h,  v6.8h,  #6
        uaddw           v5.8h,  v5.8h,  v11.8b
        srshr           v7.8h,  v7.8h,  #6
        // Four dst rows were read; step back so the stores hit the same rows.
        sub             x0,  x0,  x1, lsl #2
        uaddw           v6.8h,  v6.8h,  v2.8b
        sqxtun          v4.8b,  v4.8h
        uaddw           v7.8h,  v7.8h,  v3.8b
        sqxtun          v5.8b,  v5.8h
        st1             {v4.8b},  [x0], x1
        sqxtun          v6.8b,  v6.8h
        st1             {v5.8b},  [x0], x1
        sqxtun          v7.8b,  v7.8h
        st1             {v6.8b},  [x0], x1
        st1             {v7.8b},  [x0], x1
.endm
        // Output rows 0-15: parked rows read forwards, plus the odd-half
        // registers taken from v31 down to v16.
        load_acc_store  v31.8h, v30.8h, v29.8h, v28.8h
        load_acc_store  v27.8h, v26.8h, v25.8h, v24.8h
        load_acc_store  v23.8h, v22.8h, v21.8h, v20.8h
        load_acc_store  v19.8h, v18.8h, v17.8h, v16.8h
        // x2 is one step past the last parked row; step back onto it.
        sub             x2,  x2,  x9
        // Output rows 16-31: parked rows read backwards, minus the odd-half
        // registers taken from v16 up to v31.
        load_acc_store  v16.8h, v17.8h, v18.8h, v19.8h, 1
        load_acc_store  v20.8h, v21.8h, v22.8h, v23.8h, 1
        load_acc_store  v24.8h, v25.8h, v26.8h, v27.8h, 1
        load_acc_store  v28.8h, v29.8h, v30.8h, v31.8h, 1
.purgem load_acc_store
        ret             x14
endfunc
.endm

// Instantiate the full and the quarter variants.
idct32_funcs
idct32_funcs _quarter
// Instantiate the half variant of the pass1/pass2 functions.
idct32_funcs _half

// Thresholds that ff_vp9_idct_idct_32x32_add_neon compares w3 against
// before running pass 1 on the second, third and fourth 8-column slice.
// Entry 0 is never read: the function starts reading at byte offset 2.
const min_eob_idct_idct_32, align=4
        .short  0, 34, 135, 336
endconst

// 32x32 inverse DCT, result added to the destination.
//
// In:
//   x0 = dst
//   x1 = dst stride
//   x2 = src coefficients
//   w3 = coefficient count (presumably the eob value; it is compared
//        against 1, 34, 135 and the min_eob_idct_idct_32 table)
//
// Dispatch on w3:
//   w3 == 1    -> idct32x32_dc_add_neon (defined elsewhere), taken before
//                 anything is saved
//   w3 <= 34   -> idct32x32_quarter_add_neon
//   w3 <= 135  -> idct32x32_half_add_neon
//   otherwise  -> the full transform below
// The quarter/half routines are entered with a plain branch after the
// prologue, so they share this stack frame and do the epilogue themselves.
//
// Frame: d8-d11 are saved (v8/v9 hold coefficients, v10/v11 are used by
// pass 2) followed by a 2048 byte temp buffer at sp.
// Register roles after the prologue:
//   x4/x5/x6 = saved dst / dst stride / src
//   x9 = 128 (double row stride), x7 = -128
//   x15 = return address
//   v0, v1, v8, v9 = the 32 halfwords starting at idct_coeffs
function ff_vp9_idct_idct_32x32_add_neon, export=1
        cmp             w3,  #1
        b.eq            idct32x32_dc_add_neon

        movrel          x10, idct_coeffs

        mov             x15, x30

        stp             d10, d11, [sp, #-16]!
        stp             d8,  d9,  [sp, #-16]!

        sub             sp,  sp,  #2048

        mov             x4,  x0
        mov             x5,  x1
        mov             x6,  x2

        // Each 16-point half reads only every other row, hence twice the
        // 64 byte row stride.
        mov             x9,  #128
        neg             x7,  x9

        ld1             {v0.8h,v1.8h}, [x10], #32
        ld1             {v8.8h,v9.8h}, [x10]

        cmp             w3,  #34
        b.le            idct32x32_quarter_add_neon
        cmp             w3,  #135
        b.le            idct32x32_half_add_neon

        // x12 -> second table entry (byte offset 2)
        movrel          x12, min_eob_idct_idct_32, 2

        // Pass 1 over four slices of 8 columns each. From the second slice
        // on, stop early once w3 is not above the table entry; x1 then holds
        // the number of 256 byte chunks of the temp buffer left to clear.
.irp i, 0, 8, 16, 24
        add             x0,  sp,  #(\i*64)
.if \i > 0
        ldrh            w1,  [x12], #2
        cmp             w3,  w1
        mov             x1,  #(32 - \i)/4
        b.le            1f
.endif
        add             x2,  x6,  #(\i*2)
        bl              idct32_1d_8x32_pass1_neon
.endr
        b               3f

1:
        // Zero the part of the temp buffer pass 1 did not produce, so that
        // pass 2 reads zeros there.
        movi            v16.8h, #0
        movi            v17.8h, #0
        movi            v18.8h, #0
        movi            v19.8h, #0
2:
        subs            x1,  x1,  #1
        // 4 x 64 bytes per iteration; the stores do not touch the flags
        st1             {v16.8h,v17.8h,v18.8h,v19.8h}, [x0], #64
        st1             {v16.8h,v17.8h,v18.8h,v19.8h}, [x0], #64
        st1             {v16.8h,v17.8h,v18.8h,v19.8h}, [x0], #64
        st1             {v16.8h,v17.8h,v18.8h,v19.8h}, [x0], #64
        b.ne            2b
3:
        // Pass 2 over four slices of 8 columns each, adding into dst.
.irp i, 0, 8, 16, 24
        add             x0,  x4,  #(\i)
        mov             x1,  x5
        add             x2,  sp,  #(\i*2)
        bl              idct32_1d_8x32_pass2_neon
.endr

        add             sp,  sp,  #2048

        ldp             d8,  d9,  [sp], #16
        ldp             d10, d11, [sp], #16

        ret             x15
endfunc

// idct32_partial size
//
// Emits idct32x32_<size>_add_neon for size = quarter or half: the
// reduced-input continuation of ff_vp9_idct_idct_32x32_add_neon.
// It is reached by a branch from that function, after its prologue, and
// therefore expects x4/x5/x6, x7/x9, x15, the coefficient registers and
// the stack frame exactly as set up there. It releases the frame and
// returns through x15.
// Pass 1 runs on one slice (quarter) or two slices (half); pass 2 always
// runs on all four.
.macro idct32_partial size
function idct32x32_\size\()_add_neon
        // Slice 0: temp buffer offset 0, source offset 0
        mov             x0,  sp
        add             x2,  x6,  #0
        bl              idct32_1d_8x32_pass1_\size\()_neon
.ifc \size,half
        // Slice 1
        add             x0,  sp,  #(8*64)
        add             x2,  x6,  #(8*2)
        bl              idct32_1d_8x32_pass1_\size\()_neon
.endif
.irp i, 0, 8, 16, 24
        add             x0,  x4,  #(\i)
        mov             x1,  x5
        add             x2,  sp,  #(\i*2)
        bl              idct32_1d_8x32_pass2_\size\()_neon
.endr

        add             sp,  sp,  #2048

        ldp             d8,  d9,  [sp], #16
        ldp             d10, d11, [sp], #16

        ret             x15
endfunc
.endm

idct32_partial quarter
idct32_partial half