/*
 * Copyright (c) 2017 Google Inc.
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/aarch64/asm.S"
#include "neon.S"

// Transform coefficient tables, stored as signed 16 bit values. The code
// in this file widens them to 32 bit lanes (sxtl/sxtl2) after loading,
// multiplies by them and round-shifts the products right by 14, i.e. they
// act as fixed point factors with 14 fractional bits.
//
// The layout within each const block matters:
// - iadst4_coeffs directly follows the four values of itxfm4_coeffs, so a
//   single 8 x 16 bit load from itxfm4_coeffs fetches both tables.
// - idct_coeffs directly follows the 16 bytes of iadst8_coeffs, so a
//   post-incremented 16 byte load from iadst8_coeffs leaves the pointer
//   at idct_coeffs.
const itxfm4_coeffs, align=4
        .short          11585, 0, 6270, 15137
iadst4_coeffs:
        .short          5283, 15212, 9929, 13377
endconst

const iadst8_coeffs, align=4
        .short          16305, 1606, 14449, 7723, 10394, 12665, 4756, 15679
idct_coeffs:
        .short          11585, 0, 6270, 15137, 3196, 16069, 13623, 9102
        .short          1606, 16305, 12665, 10394, 7723, 14449, 15679, 4756
        .short          804, 16364, 12140, 11003, 7005, 14811, 15426, 5520
        .short          3981, 15893, 14053, 8423, 9760, 13160, 16207, 2404
endconst

const iadst16_coeffs, align=4
        .short          16364, 804, 15893, 3981, 11003, 12140, 8423, 14053
        .short          14811, 7005, 13160, 9760, 5520, 15426, 2404, 16207
endconst

// Transpose a 4x4 matrix of 32 bit elements in place.
// r0-r3 hold one row each on input and one column each on output.
// r4-r7 are clobbered as temporaries and must not overlap r0-r3.
.macro transpose_4x4s r0, r1, r2, r3, r4, r5, r6, r7
        // Interleave 32 bit elements of row pairs (0,1) and (2,3)
        trn1            \r4\().4s, \r0\().4s, \r1\().4s
        trn2            \r5\().4s, \r0\().4s, \r1\().4s
        trn1            \r6\().4s, \r2\().4s, \r3\().4s
        trn2            \r7\().4s, \r2\().4s, \r3\().4s
        // Interleave the 64 bit halves to finish the transpose
        trn1            \r0\().2d, \r4\().2d, \r6\().2d
        trn2            \r2\().2d, \r4\().2d, \r6\().2d
        trn1            \r1\().2d, \r5\().2d, \r7\().2d
        trn2            \r3\().2d, \r5\().2d, \r7\().2d
.endm

// Transpose a 8x8 matrix of 32 bit elements, where each row is spread out
// over two registers.
.macro transpose_8x8s r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, r10, r11, r12, r13, r14, r15, t0, t1, t2, t3
        // Row n lives in the register pair r(2n), r(2n+1); the even
        // registers hold columns 0-3 and the odd ones columns 4-7.
        // t0-t3 are clobbered as temporaries.

        // The two 4x4 blocks on the diagonal are transposed in place
        transpose_4x4s  \r0, \r2, \r4, \r6, \t0, \t1, \t2, \t3
        transpose_4x4s  \r9, \r11, \r13, \r15, \t0, \t1, \t2, \t3

        // Do 4x4 transposes of r1,r3,r5,r7 and r8,r10,r12,r14
        // while swapping the two 4x4 matrices between each other

        // First step of the 4x4 transpose of r1-r7, into t0-t3
        trn1            \t0\().4s, \r1\().4s, \r3\().4s
        trn2            \t1\().4s, \r1\().4s, \r3\().4s
        trn1            \t2\().4s, \r5\().4s, \r7\().4s
        trn2            \t3\().4s, \r5\().4s, \r7\().4s

        // First step of the 4x4 transpose of r8-r14, into r1-r7
        trn1            \r1\().4s, \r8\().4s, \r10\().4s
        trn2            \r3\().4s, \r8\().4s, \r10\().4s
        trn1            \r5\().4s, \r12\().4s, \r14\().4s
        trn2            \r7\().4s, \r12\().4s, \r14\().4s

        // Second step of the 4x4 transpose of r1-r7 (now in t0-t3), into r8-r14
        trn1            \r8\().2d, \t0\().2d, \t2\().2d
        trn2            \r12\().2d, \t0\().2d, \t2\().2d
        trn1            \r10\().2d, \t1\().2d, \t3\().2d
        trn2            \r14\().2d, \t1\().2d, \t3\().2d

        // Second step of the 4x4 transpose of r8-r14 (now in r1-r7), in place as far as possible
        trn1            \t0\().2d, \r1\().2d, \r5\().2d
        trn2            \r5\().2d, \r1\().2d, \r5\().2d
        trn1            \t1\().2d, \r3\().2d, \r7\().2d
        trn2            \r7\().2d, \r3\().2d, \r7\().2d

        // Move the outputs of trn1 back in place
        mov             \r1\().16b, \t0\().16b
        mov             \r3\().16b, \t1\().16b
.endm

// out1 = ((in1 + in2) * v0.s[0] + (1 << 13)) >> 14
// out2 = ((in1 - in2) * v0.s[0] + (1 << 13)) >> 14
// in/out are .4s registers; this can do with 4 temp registers, but is
// more efficient if 6 temp registers are available.
// If tmp5/tmp6 are left out, tmp3/tmp4 are reused for the second product.
// If neg > 0, the sum is multiplied by -v0.s[0] instead, i.e. out1 is
// negated (before rounding); out2 is not affected.
.macro dmbutterfly0 out1, out2, in1, in2, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, neg=0
.if \neg > 0
        // tmp4 temporarily holds the negated coefficients
        neg             \tmp4\().4s, v0.4s
.endif
        add             \tmp1\().4s, \in1\().4s, \in2\().4s
        sub             \tmp2\().4s, \in1\().4s, \in2\().4s
.if \neg > 0
        smull           \tmp3\().2d, \tmp1\().2s, \tmp4\().s[0]
        smull2          \tmp4\().2d, \tmp1\().4s, \tmp4\().s[0]
.else
        smull           \tmp3\().2d, \tmp1\().2s, v0.s[0]
        smull2          \tmp4\().2d, \tmp1\().4s, v0.s[0]
.endif
.ifb \tmp5
        // Only 4 temporaries: narrow out1 first, then reuse tmp3/tmp4
        rshrn           \out1\().2s, \tmp3\().2d, #14
        rshrn2          \out1\().4s, \tmp4\().2d, #14
        smull           \tmp3\().2d, \tmp2\().2s, v0.s[0]
        smull2          \tmp4\().2d, \tmp2\().4s, v0.s[0]
        rshrn           \out2\().2s, \tmp3\().2d, #14
        rshrn2          \out2\().4s, \tmp4\().2d, #14
.else
        // 6 temporaries: both products are computed before narrowing
        smull           \tmp5\().2d, \tmp2\().2s, v0.s[0]
        smull2          \tmp6\().2d, \tmp2\().4s, v0.s[0]
        rshrn           \out1\().2s, \tmp3\().2d, #14
        rshrn2          \out1\().4s, \tmp4\().2d, #14
        rshrn           \out2\().2s, \tmp5\().2d, #14
        rshrn2          \out2\().4s, \tmp6\().2d, #14
.endif
.endm

// Same as dmbutterfly0 above, but treating the input in in2 as zero,
// writing the same output into both out1 and out2.
// in2 and tmp3-tmp6 are accepted for signature compatibility but unused.
.macro dmbutterfly0_h out1, out2, in1, in2, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6
        smull           \tmp1\().2d, \in1\().2s, v0.s[0]
        smull2          \tmp2\().2d, \in1\().4s, v0.s[0]
        rshrn           \out1\().2s, \tmp1\().2d, #14
        rshrn2          \out1\().4s, \tmp2\().2d, #14
        rshrn           \out2\().2s, \tmp1\().2d, #14
        rshrn2          \out2\().4s, \tmp2\().2d, #14
.endm

// out1,out2 = in1 * coef1 - in2 * coef2
// out3,out4 = in1 * coef2 + in2 * coef1
// out are 4 x .2d registers, in are 2 x .4s registers
// The 64 bit results are not rounded or narrowed here. out1/out3 hold the
// results for the low two lanes of the inputs, out2/out4 for the high two.
.macro dmbutterfly_l out1, out2, out3, out4, in1, in2, coef1, coef2
        smull           \out1\().2d, \in1\().2s, \coef1
        smull2          \out2\().2d, \in1\().4s, \coef1
        smull           \out3\().2d, \in1\().2s, \coef2
        smull2          \out4\().2d, \in1\().4s, \coef2
        smlsl           \out1\().2d, \in2\().2s, \coef2
        smlsl2          \out2\().2d, \in2\().4s, \coef2
        smlal           \out3\().2d, \in2\().2s, \coef1
        smlal2          \out4\().2d, \in2\().4s, \coef1
.endm

// inout1 = (inout1 * coef1 - inout2 * coef2 + (1 << 13)) >> 14
// inout2 = (inout1 * coef2 + inout2 * coef1 + (1 << 13)) >> 14
// inout are 2 x .4s registers
// tmp1-tmp4 are clobbered (they receive the 64 bit intermediates).
// If neg > 0, the value written to inout2 is negated before rounding.
.macro dmbutterfly inout1, inout2, coef1, coef2, tmp1, tmp2, tmp3, tmp4, neg=0
        dmbutterfly_l   \tmp1, \tmp2, \tmp3, \tmp4, \inout1, \inout2, \coef1, \coef2
.if \neg > 0
        neg             \tmp3\().2d, \tmp3\().2d
        neg             \tmp4\().2d, \tmp4\().2d
.endif
        rshrn           \inout1\().2s, \tmp1\().2d, #14
        rshrn2          \inout1\().4s, \tmp2\().2d, #14
        rshrn           \inout2\().2s, \tmp3\().2d, #14
        rshrn2          \inout2\().4s, \tmp4\().2d, #14
.endm

// Same as dmbutterfly above, but treating the input in inout2 as zero
// (inout2 is only written, never read).
.macro dmbutterfly_h1 inout1, inout2, coef1, coef2, tmp1, tmp2, tmp3, tmp4
        smull           \tmp1\().2d, \inout1\().2s, \coef1
        smull2          \tmp2\().2d, \inout1\().4s, \coef1
        smull           \tmp3\().2d, \inout1\().2s, \coef2
        smull2          \tmp4\().2d, \inout1\().4s, \coef2
        rshrn           \inout1\().2s, \tmp1\().2d, #14
        rshrn2          \inout1\().4s, \tmp2\().2d, #14
        rshrn           \inout2\().2s, \tmp3\().2d, #14
        rshrn2          \inout2\().4s, \tmp4\().2d, #14
.endm

// Same as dmbutterfly above, but treating the input in inout1 as zero
// (inout1 is only written, never read).
.macro dmbutterfly_h2 inout1, inout2, coef1, coef2, tmp1, tmp2, tmp3, tmp4
        smull           \tmp1\().2d, \inout2\().2s, \coef2
        smull2          \tmp2\().2d, \inout2\().4s, \coef2
        smull           \tmp3\().2d, \inout2\().2s, \coef1
        smull2          \tmp4\().2d, \inout2\().4s, \coef1
        // inout1 = -inout2 * coef2, so negate before rounding
        neg             \tmp1\().2d, \tmp1\().2d
        neg             \tmp2\().2d, \tmp2\().2d
        // inout2 is narrowed first; all reads of it are done at this point
        rshrn           \inout2\().2s, \tmp3\().2d, #14
        rshrn2          \inout2\().4s, \tmp4\().2d, #14
        rshrn           \inout1\().2s, \tmp1\().2d, #14
        rshrn2          \inout1\().4s, \tmp2\().2d, #14
.endm

// out1,out2 = in * coef
// out are 2 x .2d registers (low/high lanes of in), in is a .4s register
.macro dsmull_h out1, out2, in, coef
        smull           \out1\().2d, \in\().2s, \coef
        smull2          \out2\().2d, \in\().4s, \coef
.endm

// out = (in1,in2 + (1 << (shift - 1))) >> shift
// out is a .4s register, in are 2 x .2d registers; shift is an immediate
// operand including the hash sign
.macro drshrn_h out, in1, in2, shift
        rshrn           \out\().2s, \in1\().2d, \shift
        rshrn2          \out\().4s, \in2\().2d, \shift
.endm


// out1 = in1 + in2
// out2 = in1 - in2
// out1 is written before the inputs are read for out2, so out1 must not
// be the same register as in1 or in2; out2 may be.
.macro butterfly_4s out1, out2, in1, in2
        add             \out1\().4s, \in1\().4s, \in2\().4s
        sub             \out2\().4s, \in1\().4s, \in2\().4s
.endm

// out1 = in1 - in2
// out2 = in1 + in2
// As for butterfly_4s, out1 must not be the same register as in1 or in2.
.macro butterfly_4s_r out1, out2, in1, in2
        sub             \out1\().4s, \in1\().4s, \in2\().4s
        add             \out2\().4s, \in1\().4s, \in2\().4s
.endm

// out1 = (in1,in2 + in3,in4 + (1 << 13)) >> 14
// out2 = (in1,in2 - in3,in4 + (1 << 13)) >> 14
// out are 2 x .4s registers, in are 4 x .2d registers
// tmp1-tmp4 are clobbered. Both sums are formed before the differences,
// so tmp3/tmp4 may be the same registers as in1/in2 (callers in this file
// rely on that), while tmp1/tmp2 must not overlap any input.
.macro dbutterfly_n out1, out2, in1, in2, in3, in4, tmp1, tmp2, tmp3, tmp4
        add             \tmp1\().2d, \in1\().2d, \in3\().2d
        add             \tmp2\().2d, \in2\().2d, \in4\().2d
        sub             \tmp3\().2d, \in1\().2d, \in3\().2d
        sub             \tmp4\().2d, \in2\().2d, \in4\().2d
        rshrn           \out1\().2s, \tmp1\().2d, #14
        rshrn2          \out1\().4s, \tmp2\().2d, #14
        rshrn           \out2\().2s, \tmp3\().2d, #14
        rshrn2          \out2\().4s, \tmp4\().2d, #14
.endm

// One pass of the 4 point inverse WHT over c0-c3 (.4s registers), in place.
// Only adds, subtracts and one arithmetic shift are used; no coefficient
// registers are needed. Clobbers v16 and v17.
.macro iwht4_10 c0, c1, c2, c3
        add             \c0\().4s, \c0\().4s, \c1\().4s
        sub             v17.4s, \c2\().4s, \c3\().4s
        sub             v16.4s, \c0\().4s, v17.4s
        sshr            v16.4s, v16.4s, #1
        sub             \c2\().4s, v16.4s, \c1\().4s
        sub             \c1\().4s, v16.4s, \c3\().4s
        add             \c3\().4s, v17.4s, \c2\().4s
        sub             \c0\().4s, \c0\().4s, \c1\().4s
.endm

// The 12 bit variant expands to exactly the same code as iwht4_10.
.macro iwht4_12 c0, c1, c2, c3
        iwht4_10        \c0, \c1, \c2, \c3
.endm

// One pass of the 4 point inverse DCT over c0-c3 (.4s registers), in place.
// Reads the coefficients from v0.s[0], v0.s[2] and v0.s[3].
// All products are kept in 32 bits. Clobbers v16, v17, v18, v20, v22, v24.
.macro idct4_10 c0, c1, c2, c3
        mul             v22.4s, \c1\().4s, v0.s[3]
        mul             v20.4s, \c1\().4s, v0.s[2]
        add             v16.4s, \c0\().4s, \c2\().4s
        sub             v17.4s, \c0\().4s, \c2\().4s
        mla             v22.4s, \c3\().4s, v0.s[2]
        mul             v18.4s, v16.4s, v0.s[0]
        mul             v24.4s, v17.4s, v0.s[0]
        mls             v20.4s, \c3\().4s, v0.s[3]
        srshr           v22.4s, v22.4s, #14
        srshr           v18.4s, v18.4s, #14
        srshr           v24.4s, v24.4s, #14
        srshr           v20.4s, v20.4s, #14
        add             \c0\().4s, v18.4s, v22.4s
        sub             \c3\().4s, v18.4s, v22.4s
        add             \c1\().4s, v24.4s, v20.4s
        sub             \c2\().4s, v24.4s, v20.4s
.endm

// Same computation as idct4_10, but the products are widened to 64 bits
// (smull/smlal/smlsl) and narrowed back with rounding (rshrn).
// NOTE(review): presumably needed because 12 bit input can overflow the
// 32 bit products used by idct4_10 - confirm.
// Clobbers v16-v25.
.macro idct4_12 c0, c1, c2, c3
        smull           v22.2d, \c1\().2s, v0.s[3]
        smull2          v23.2d, \c1\().4s, v0.s[3]
        smull           v20.2d, \c1\().2s, v0.s[2]
        smull2          v21.2d, \c1\().4s, v0.s[2]
        add             v16.4s, \c0\().4s, \c2\().4s
        sub             v17.4s, \c0\().4s, \c2\().4s
        smlal           v22.2d, \c3\().2s, v0.s[2]
        smlal2          v23.2d, \c3\().4s, v0.s[2]
        smull           v18.2d, v16.2s, v0.s[0]
        smull2          v19.2d, v16.4s, v0.s[0]
        smull           v24.2d, v17.2s, v0.s[0]
        smull2          v25.2d, v17.4s, v0.s[0]
        smlsl           v20.2d, \c3\().2s, v0.s[3]
        smlsl2          v21.2d, \c3\().4s, v0.s[3]
        rshrn           v22.2s, v22.2d, #14
        rshrn2          v22.4s, v23.2d, #14
        rshrn           v18.2s, v18.2d, #14
        rshrn2          v18.4s, v19.2d, #14
        rshrn           v24.2s, v24.2d, #14
        rshrn2          v24.4s, v25.2d, #14
        rshrn           v20.2s, v20.2d, #14
        rshrn2          v20.4s, v21.2d, #14
        add             \c0\().4s, v18.4s, v22.4s
        sub             \c3\().4s, v18.4s, v22.4s
        add             \c1\().4s, v24.4s, v20.4s
        sub             \c2\().4s, v24.4s, v20.4s
.endm

// One pass of the 4 point inverse ADST over c0-c3 (.4s registers), in place.
// Reads the coefficients from v1.s[0] - v1.s[3].
// All products are kept in 32 bits. Clobbers v16, v18, v20, v22, v24, v26.
.macro iadst4_10 c0, c1, c2, c3
        mul             v16.4s, \c0\().4s, v1.s[0]
        mla             v16.4s, \c2\().4s, v1.s[1]
        mla             v16.4s, \c3\().4s, v1.s[2]
        mul             v18.4s, \c0\().4s, v1.s[2]
        mls             v18.4s, \c2\().4s, v1.s[0]
        sub             \c0\().4s, \c0\().4s, \c2\().4s
        mls             v18.4s, \c3\().4s, v1.s[1]
        add             \c0\().4s, \c0\().4s, \c3\().4s
        mul             v22.4s, \c1\().4s, v1.s[3]
        mul             v20.4s, \c0\().4s, v1.s[3]
        add             v24.4s, v16.4s, v22.4s
        add             v26.4s, v18.4s, v22.4s
        srshr           \c0\().4s, v24.4s, #14
        add             v16.4s, v16.4s, v18.4s
        srshr           \c1\().4s, v26.4s, #14
        sub             v16.4s, v16.4s, v22.4s
        srshr           \c2\().4s, v20.4s, #14
        srshr           \c3\().4s, v16.4s, #14
.endm

// Same computation as iadst4_10, but with 64 bit intermediates
// (smull/smlal/smlsl, narrowed with rshrn). Clobbers v16-v27.
.macro iadst4_12 c0, c1, c2, c3
        smull           v16.2d, \c0\().2s, v1.s[0]
        smull2          v17.2d, \c0\().4s, v1.s[0]
        smlal           v16.2d, \c2\().2s, v1.s[1]
        smlal2          v17.2d, \c2\().4s, v1.s[1]
        smlal           v16.2d, \c3\().2s, v1.s[2]
        smlal2          v17.2d, \c3\().4s, v1.s[2]
        smull           v18.2d, \c0\().2s, v1.s[2]
        smull2          v19.2d, \c0\().4s, v1.s[2]
        smlsl           v18.2d, \c2\().2s, v1.s[0]
        smlsl2          v19.2d, \c2\().4s, v1.s[0]
        sub             \c0\().4s, \c0\().4s, \c2\().4s
        smlsl           v18.2d, \c3\().2s, v1.s[1]
        smlsl2          v19.2d, \c3\().4s, v1.s[1]
        add             \c0\().4s, \c0\().4s, \c3\().4s
        smull           v22.2d, \c1\().2s, v1.s[3]
        smull2          v23.2d, \c1\().4s, v1.s[3]
        smull           v20.2d, \c0\().2s, v1.s[3]
        smull2          v21.2d, \c0\().4s, v1.s[3]
        add             v24.2d, v16.2d, v22.2d
        add             v25.2d, v17.2d, v23.2d
        add             v26.2d, v18.2d, v22.2d
        add             v27.2d, v19.2d, v23.2d
        rshrn           \c0\().2s, v24.2d, #14
        rshrn2          \c0\().4s, v25.2d, #14
        add             v16.2d, v16.2d, v18.2d
        add             v17.2d, v17.2d, v19.2d
        rshrn           \c1\().2s, v26.2d, #14
        rshrn2          \c1\().4s, v27.2d, #14
        sub             v16.2d, v16.2d, v22.2d
        sub             v17.2d, v17.2d, v23.2d
        rshrn           \c2\().2s, v20.2d, #14
        rshrn2          \c2\().4s, v21.2d, #14
        rshrn           \c3\().2s, v16.2d, #14
        rshrn2          \c3\().4s, v17.2d, #14
.endm

// The public functions in this file have got the following signature:
// void itxfm_add(uint8_t *dst, ptrdiff_t stride, int16_t *block, int eob);
//
// Register use as seen in the code below: x0 = dst, x1 = stride,
// x2 = block, w3 = eob.
// NOTE(review): the code below reads the block as 32 bit coefficients
// and dst as 16 bit pixels, which does not match the pointer types in
// the prototype above - confirm against the C declarations.

// Emits ff_vp9_<txfm1>_<txfm2>_4x4_add_<bpp>_neon: inverse transforms a
// 4x4 block with txfm1 followed by a transpose and txfm2, adds the result
// to the four destination rows, clamps it to [0, (1 << bpp) - 1] and
// clears the coefficients in the block to zero.
.macro itxfm_func4x4 txfm1, txfm2, bpp
function ff_vp9_\txfm1\()_\txfm2\()_4x4_add_\bpp\()_neon, export=1
        // Load only the coefficients that are needed, widened to 32 bit:
        // v0 = itxfm4_coeffs (idct), v1 = iadst4_coeffs (iadst).
        // iwht/iwht loads nothing.
.ifc \txfm1,\txfm2
.ifc \txfm1,idct
        movrel          x4, itxfm4_coeffs
        ld1             {v0.4h}, [x4]
        sxtl            v0.4s, v0.4h
.endif
.ifc \txfm1,iadst
        movrel          x4, iadst4_coeffs
        ld1             {v0.d}[1], [x4]
        sxtl2           v1.4s, v0.8h
.endif
.else
        // Mixed transforms; the two tables are adjacent, load both at once
        movrel          x4, itxfm4_coeffs
        ld1             {v0.8h}, [x4]
        sxtl2           v1.4s, v0.8h
        sxtl            v0.4s, v0.4h
.endif

        // Zeros, used for clearing the coefficient block
        movi            v30.4s, #0
        movi            v31.4s, #0
.ifc \txfm1\()_\txfm2,idct_idct
        cmp             w3, #1
        b.ne            1f
        // DC-only for idct/idct
        ld1             {v2.s}[0], [x2]
        // Scale the DC coefficient once for each of the two passes
        smull           v2.2d, v2.2s, v0.s[0]
        rshrn           v2.2s, v2.2d, #14
        smull           v2.2d, v2.2s, v0.s[0]
        rshrn           v2.2s, v2.2d, #14
        st1             {v31.s}[0], [x2]
        dup             v4.4s, v2.s[0]
        mov             v5.16b, v4.16b
        mov             v6.16b, v4.16b
        mov             v7.16b, v4.16b
        b               2f
.endif

1:
        ld1             {v4.4s,v5.4s,v6.4s,v7.4s}, [x2]
        // Clear the first half of the coefficient block
        st1             {v30.4s,v31.4s}, [x2], #32

.ifc \txfm1,iwht
        sshr            v4.4s, v4.4s, #2
        sshr            v5.4s, v5.4s, #2
        sshr            v6.4s, v6.4s, #2
        sshr            v7.4s, v7.4s, #2
.endif

        \txfm1\()4_\bpp v4, v5, v6, v7

        // Clear the second half of the coefficient block
        st1             {v30.4s,v31.4s}, [x2], #32
        // Transpose 4x4 with 32 bit elements
        transpose_4x4s  v4, v5, v6, v7, v16, v17, v18, v19

        \txfm2\()4_\bpp v4, v5, v6, v7
2:
        // v31 = (1 << bpp) - 1 in each 16 bit lane (0x03ff for bpp 10,
        // 0x0fff for bpp 12), the maximum pixel value
        mvni            v31.8h, #((0xff << (\bpp - 8)) & 0xff), lsl #8
        ld1             {v0.4h}, [x0], x1
        ld1             {v1.4h}, [x0], x1
.ifnc \txfm1,iwht
        srshr           v4.4s, v4.4s, #4
        srshr           v5.4s, v5.4s, #4
        srshr           v6.4s, v6.4s, #4
        srshr           v7.4s, v7.4s, #4
.endif
        uaddw           v4.4s, v4.4s, v0.4h
        uaddw           v5.4s, v5.4s, v1.4h
        ld1             {v2.4h}, [x0], x1
        ld1             {v3.4h}, [x0], x1
        // Saturating narrowing clamps at 0, umin clamps at the max value
        sqxtun          v0.4h, v4.4s
        sqxtun2         v0.8h, v5.4s
        // Rewind dst to the first row for the stores
        sub             x0, x0, x1, lsl #2

        uaddw           v6.4s, v6.4s, v2.4h
        umin            v0.8h, v0.8h, v31.8h
        uaddw           v7.4s, v7.4s, v3.4h
        st1             {v0.4h}, [x0], x1
        sqxtun          v2.4h, v6.4s
        sqxtun2         v2.8h, v7.4s
        umin            v2.8h, v2.8h, v31.8h

        st1             {v0.d}[1], [x0], x1
        st1             {v2.4h}, [x0], x1
        st1             {v2.d}[1], [x0], x1

        ret
endfunc
.endm

// Instantiate all 4x4 transform combinations for one bit depth.
.macro itxfm_funcs4x4 bpp
itxfm_func4x4 idct,  idct,  \bpp
itxfm_func4x4 iadst, idct,  \bpp
itxfm_func4x4 idct,  iadst, \bpp
itxfm_func4x4 iadst, iadst, \bpp
itxfm_func4x4 iwht,  iwht,  \bpp
.endm

itxfm_funcs4x4 10
itxfm_funcs4x4 12

// DC-only 8x8 idct/idct: scales the first coefficient of the block twice
// by idct_coeffs[0], clears it in the block, and adds the rounded result
// to all 8x8 destination pixels.
// x0 = dst, x1 = stride, x2 = block.
// w5 is read without being set here and is used as the upper clamp for
// the output. NOTE(review): presumably the caller passes the maximum pixel
// value for the bit depth in w5 - confirm against the callers.
function idct8x8_dc_add_neon
        movrel          x4, idct_coeffs
        ld1             {v0.4h}, [x4]

        movi            v1.4h, #0
        sxtl            v0.4s, v0.4h

        ld1             {v2.s}[0], [x2]
        smull           v2.2d, v2.2s, v0.s[0]
        rshrn           v2.2s, v2.2d, #14
        smull           v2.2d, v2.2s, v0.s[0]
        rshrn           v2.2s, v2.2d, #14
        // Clear the DC coefficient in the block
        st1             {v1.s}[0], [x2]
        dup             v2.4s, v2.s[0]

        srshr           v2.4s, v2.4s, #5

        // x4 = rows left; x0 is the read pointer, x3 the write pointer
        mov             x4, #8
        mov             x3, x0
        dup             v31.8h, w5
1:
        // Loop to add the constant from v2 into all 8x8 outputs
        subs            x4, x4, #2
        ld1             {v3.8h}, [x0], x1
        ld1             {v4.8h}, [x0], x1
        uaddw           v16.4s, v2.4s, v3.4h
        uaddw2          v17.4s, v2.4s, v3.8h
        uaddw           v18.4s, v2.4s, v4.4h
        uaddw2          v19.4s, v2.4s, v4.8h
        sqxtun          v3.4h, v16.4s
        sqxtun2         v3.8h, v17.4s
        sqxtun          v4.4h, v18.4s
        sqxtun2         v4.8h, v19.4s
        umin            v3.8h, v3.8h, v31.8h
        umin            v4.8h, v4.8h, v31.8h
        st1             {v3.8h}, [x3], x1
        st1             {v4.8h}, [x3], x1
        b.ne            1b

        ret
endfunc

// One pass of the 8 point inverse DCT over r0-r7 (.4s registers), in place.
// Reads the coefficients from v0.s[0], v0.s[2], v0.s[3] and v1.s[0-3].
// t0-t5 are clobbered as temporaries.
.macro idct8 r0, r1, r2, r3, r4, r5, r6, r7, t0, t1, t2, t3, t4, t5
        dmbutterfly0    \r0, \r4, \r0, \r4, \t0, \t1, \t2, \t3, \t4, \t5 // r0 = t0a, r4 = t1a
        dmbutterfly     \r2, \r6, v0.s[2], v0.s[3], \t0, \t1, \t2, \t3   // r2 = t2a, r6 = t3a
        dmbutterfly     \r1, \r7, v1.s[0], v1.s[1], \t0, \t1, \t2, \t3   // r1 = t4a, r7 = t7a
        dmbutterfly     \r5, \r3, v1.s[2], v1.s[3], \t0, \t1, \t2, \t3   // r5 = t5a, r3 = t6a

        butterfly_4s    \t0, \t1, \r0, \r6 // t0 = t0, t1 = t3
        butterfly_4s    \t2, \r5, \r1, \r5 // t2 = t4, r5 = t5a
        butterfly_4s    \t3, \r6, \r7, \r3 // t3 = t7, r6 = t6a
        butterfly_4s    \r7, \r4, \r4, \r2 // r7 = t1, r4 = t2

        dmbutterfly0    \r6, \r5, \r6, \r5, \r0, \r1, \r2, \r3, \t4, \t5 // r6 = t6, r5 = t5

        butterfly_4s    \r1, \r6, \r7, \r6 // r1 = out[1], r6 = out[6]
        butterfly_4s    \r0, \r7, \t0, \t3 // r0 = out[0], r7 = out[7]
        butterfly_4s    \r2, \r5, \r4, \r5 // r2 = out[2], r5 = out[5]
        butterfly_4s    \r3, \r4, \t1, \t2 // r3 = out[3], r4 = out[4]
.endm

// One pass of the 8 point inverse ADST over r0-r7 (.4s registers), in place.
// Reads the coefficients from v2.s[0-3], v3.s[0-3], v0.s[0], v0.s[2] and
// v0.s[3]. t0-t5 are clobbered as temporaries.
.macro iadst8 r0, r1, r2, r3, r4, r5, r6, r7, t0, t1, t2, t3, t4, t5
        dmbutterfly_l   \t2, \t3, \t0, \t1, \r7, \r0, v2.s[1], v2.s[0]   // t2,t3 = t1a, t0,t1 = t0a
        dmbutterfly_l   \r0, \r7, \t4, \t5, \r3, \r4, v3.s[1], v3.s[0]   // r0,r7 = t5a, t4,t5 = t4a

        dbutterfly_n    \r3, \t0, \t0, \t1, \t4, \t5, \r3, \r4, \t0, \t1 // r3 = t0, t0 = t4
        dbutterfly_n    \r4, \t1, \t2, \t3, \r0, \r7, \r4, \t1, \t4, \t5 // r4 = t1, t1 = t5

        dmbutterfly_l   \t4, \t5, \t2, \t3, \r5, \r2, v2.s[3], v2.s[2]   // t4,t5 = t3a, t2,t3 = t2a
        dmbutterfly_l   \r2, \r5, \r0, \r7, \r1, \r6, v3.s[3], v3.s[2]   // r2,r5 = t7a, r0,r7 = t6a

        dbutterfly_n    \r1, \t2, \t2, \t3, \r0, \r7, \r1, \r6, \t2, \t3 // r1 = t2, t2 = t6
        dbutterfly_n    \r0, \t4, \t4, \t5, \r2, \r5, \r0, \r7, \t4, \t5 // r0 = t3, t4 = t7

        butterfly_4s    \r7, \r4, \r4, \r0   // r7 = -out[7], r4 = t3
        neg             \r7\().4s, \r7\().4s // r7 = out[7]
        butterfly_4s    \r0, \r1, \r3, \r1   // r0 = out[0], r1 = t2

        dmbutterfly_l   \r2, \r3, \t3, \t5, \t0, \t1, v0.s[2], v0.s[3]   // r2,r3 = t5a, t3,t5 = t4a
        dmbutterfly_l   \t0, \t1, \r5, \r6, \t4, \t2, v0.s[3], v0.s[2]   // t0,t1 = t6a, r5,r6 = t7a

        dbutterfly_n    \r6, \t2, \r2, \r3, \r5, \r6, \t2, \t4, \r2, \r3 // r6 = out[6], t2 = t7

        dmbutterfly0    \r3, \r4, \r1, \r4, \t4, \r5, \r1, \r2           // r3 = -out[3], r4 = out[4]
        neg             \r3\().4s, \r3\().4s  // r3 = out[3]

        dbutterfly_n    \r1, \t0, \t3, \t5, \t0, \t1, \r1, \r2, \t0, \t1 // r1 = -out[1], t0 = t6
        neg             \r1\().4s, \r1\().4s  // r1 = out[1]

        dmbutterfly0    \r2, \r5, \t0, \t2, \t1, \t3, \t4, \t5           // r2 = out[2], r5 = -out[5]
        neg             \r5\().4s, \r5\().4s  // r5 = out[5]
.endm


.macro itxfm_func8x8 txfm1, txfm2
function vp9_\txfm1\()_\txfm2\()_8x8_add_16_neon
.ifc \txfm1\()_\txfm2,idct_idct
        cmp             w3, #1
        b.eq            idct8x8_dc_add_neon
.endif
        // The iadst also uses a few coefficients from
        // idct, so those always need to be loaded.
.ifc \txfm1\()_\txfm2,idct_idct
        movrel          x4, idct_coeffs
.else
        movrel          x4, iadst8_coeffs
        ld1             {v1.8h}, [x4], #16
        stp             d8, d9, [sp, #-0x10]!
        sxtl2           v3.4s, v1.8h
        sxtl            v2.4s, v1.4h
.endif
        ld1             {v0.8h}, [x4]
        sxtl2           v1.4s, v0.8h
        sxtl            v0.4s, v0.4h

        movi            v4.4s, #0
        movi            v5.4s, #0
        movi            v6.4s, #0
        movi            v7.4s, #0

1:
        ld1             {v16.4s,v17.4s,v18.4s,v19.4s}, [x2], #64
        ld1             {v20.4s,v21.4s,v22.4s,v23.4s}, [x2], #64
        ld1             {v24.4s,v25.4s,v26.4s,v27.4s}, [x2], #64
        ld1             {v28.4s,v29.4s,v30.4s,v31.4s}, [x2], #64
        sub             x2, x2, #256
        st1             {v4.4s,v5.4s,v6.4s,v7.4s}, [x2], #64
        st1             {v4.4s,v5.4s,v6.4s,v7.4s}, [x2], #64
        st1             {v4.4s,v5.4s,v6.4s,v7.4s}, [x2], #64
        st1             {v4.4s,v5.4s,v6.4s,v7.4s}, [x2], #64

.ifc \txfm1\()_\txfm2,idct_idct
        idct8           v16, v18, v20, v22, v24, v26, v28, v30, v2, v3, v4, v5, v6, v7
        idct8           v17, v19, v21, v23, v25, v27, v29, v31, v2, v3, v4, v5, v6, v7
.else
        \txfm1\()8      v16, v18, v20, v22, v24, v26, v28, v30, v4, v5, v6, v7, v8, v9
        \txfm1\()8      v17, v19, v21, v23, v25, v27, v29, v31, v4, v5, v6, v7, v8, v9
.endif

        // Transpose 8x8 with 16 bit elements
        transpose_8x8s  v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v4, v5, v6, v7

.ifc \txfm1\()_\txfm2,idct_idct
        idct8           v16, v18, v20, v22, v24, v26, v28, v30, v2, v3, v4, v5, v6, v7
        idct8           v17, v19, v21, v23, v25, v27, v29, v31, v2, v3, v4, v5, v6, v7
.else
        \txfm2\()8      v16, v18, v20, v22, v24, v26, v28, v30, v4, v5, v6, v7, v8, v9
        \txfm2\()8      v17, v19, v21, v23, v25, v27, v29, v31, v4, v5, v6, v7, v8, v9
.endif
2:
        mov             x3, x0
        // Add into the destination
        ld1             {v0.8h}, [x0], x1
        srshr           v16.4s, v16.4s, #5
        srshr           v17.4s, v17.4s, #5
        ld1             {v1.8h}, [x0], x1
        srshr           v18.4s, v18.4s, #5
        srshr           v19.4s, v19.4s, #5
        ld1             {v2.8h}, [x0], x1
        srshr           v20.4s, v20.4s, #5
        srshr           v21.4s, v21.4s, #5
        uaddw           v16.4s, v16.4s, v0.4h
        uaddw2          v17.4s, v17.4s, v0.8h
        ld1             {v3.8h}, [x0], x1
        srshr           v22.4s, v22.4s, #5
        srshr           v23.4s, v23.4s, #5
        uaddw           v18.4s, v18.4s, v1.4h
        uaddw2          v19.4s, v19.4s, v1.8h
        ld1             {v4.8h}, [x0], x1
        srshr           v24.4s, v24.4s, #5
        srshr           v25.4s, v25.4s, #5
        uaddw           v20.4s, v20.4s, v2.4h
        uaddw2          v21.4s, v21.4s, v2.8h
        sqxtun          v0.4h, v16.4s
        sqxtun2         v0.8h, v17.4s
        dup             v16.8h, w5
        ld1             {v5.8h}, [x0], x1
        srshr           v26.4s, v26.4s, #5
        srshr           v27.4s, v27.4s, #5
        uaddw           v22.4s, v22.4s, v3.4h
        uaddw2          v23.4s, v23.4s, v3.8h
        sqxtun          v1.4h, v18.4s
        sqxtun2         v1.8h, v19.4s
        umin            v0.8h, v0.8h, v16.8h
        ld1             {v6.8h}, [x0], x1
        srshr           v28.4s, v28.4s, #5
        srshr           v29.4s, v29.4s, #5
        uaddw           v24.4s, v24.4s, v4.4h
        uaddw2          v25.4s, v25.4s, v4.8h
        sqxtun          v2.4h, v20.4s
        sqxtun2         v2.8h, v21.4s
        umin            v1.8h, v1.8h, v16.8h
        ld1             {v7.8h}, [x0], x1
        srshr           v30.4s, v30.4s, #5
        srshr           v31.4s, v31.4s, #5
        uaddw           v26.4s, v26.4s, v5.4h
        uaddw2          v27.4s, v27.4s, v5.8h
        sqxtun          v3.4h, v22.4s
        sqxtun2         v3.8h, v23.4s
        umin            v2.8h, v2.8h, v16.8h

        st1             {v0.8h}, [x3], x1
660cabdff1aSopenharmony_ci uaddw v28.4s, v28.4s, v6.4h 661cabdff1aSopenharmony_ci uaddw2 v29.4s, v29.4s, v6.8h 662cabdff1aSopenharmony_ci st1 {v1.8h}, [x3], x1 663cabdff1aSopenharmony_ci sqxtun v4.4h, v24.4s 664cabdff1aSopenharmony_ci sqxtun2 v4.8h, v25.4s 665cabdff1aSopenharmony_ci umin v3.8h, v3.8h, v16.8h 666cabdff1aSopenharmony_ci st1 {v2.8h}, [x3], x1 667cabdff1aSopenharmony_ci uaddw v30.4s, v30.4s, v7.4h 668cabdff1aSopenharmony_ci uaddw2 v31.4s, v31.4s, v7.8h 669cabdff1aSopenharmony_ci st1 {v3.8h}, [x3], x1 670cabdff1aSopenharmony_ci sqxtun v5.4h, v26.4s 671cabdff1aSopenharmony_ci sqxtun2 v5.8h, v27.4s 672cabdff1aSopenharmony_ci umin v4.8h, v4.8h, v16.8h 673cabdff1aSopenharmony_ci st1 {v4.8h}, [x3], x1 674cabdff1aSopenharmony_ci sqxtun v6.4h, v28.4s 675cabdff1aSopenharmony_ci sqxtun2 v6.8h, v29.4s 676cabdff1aSopenharmony_ci umin v5.8h, v5.8h, v16.8h 677cabdff1aSopenharmony_ci st1 {v5.8h}, [x3], x1 678cabdff1aSopenharmony_ci sqxtun v7.4h, v30.4s 679cabdff1aSopenharmony_ci sqxtun2 v7.8h, v31.4s 680cabdff1aSopenharmony_ci umin v6.8h, v6.8h, v16.8h 681cabdff1aSopenharmony_ci 682cabdff1aSopenharmony_ci st1 {v6.8h}, [x3], x1 683cabdff1aSopenharmony_ci umin v7.8h, v7.8h, v16.8h 684cabdff1aSopenharmony_ci st1 {v7.8h}, [x3], x1 685cabdff1aSopenharmony_ci 686cabdff1aSopenharmony_ci.ifnc \txfm1\()_\txfm2,idct_idct 687cabdff1aSopenharmony_ci ldp d8, d9, [sp], 0x10 688cabdff1aSopenharmony_ci.endif 689cabdff1aSopenharmony_ci ret 690cabdff1aSopenharmony_ciendfunc 691cabdff1aSopenharmony_ci 692cabdff1aSopenharmony_cifunction ff_vp9_\txfm1\()_\txfm2\()_8x8_add_10_neon, export=1 693cabdff1aSopenharmony_ci mov x5, #0x03ff 694cabdff1aSopenharmony_ci b vp9_\txfm1\()_\txfm2\()_8x8_add_16_neon 695cabdff1aSopenharmony_ciendfunc 696cabdff1aSopenharmony_ci 697cabdff1aSopenharmony_cifunction ff_vp9_\txfm1\()_\txfm2\()_8x8_add_12_neon, export=1 698cabdff1aSopenharmony_ci mov x5, #0x0fff 699cabdff1aSopenharmony_ci b vp9_\txfm1\()_\txfm2\()_8x8_add_16_neon 
700cabdff1aSopenharmony_ciendfunc 701cabdff1aSopenharmony_ci.endm 702cabdff1aSopenharmony_ci 703cabdff1aSopenharmony_ciitxfm_func8x8 idct, idct 704cabdff1aSopenharmony_ciitxfm_func8x8 iadst, idct 705cabdff1aSopenharmony_ciitxfm_func8x8 idct, iadst 706cabdff1aSopenharmony_ciitxfm_func8x8 iadst, iadst 707cabdff1aSopenharmony_ci 708cabdff1aSopenharmony_ci 709cabdff1aSopenharmony_cifunction idct16x16_dc_add_neon 710cabdff1aSopenharmony_ci movrel x4, idct_coeffs 711cabdff1aSopenharmony_ci ld1 {v0.4h}, [x4] 712cabdff1aSopenharmony_ci sxtl v0.4s, v0.4h 713cabdff1aSopenharmony_ci 714cabdff1aSopenharmony_ci movi v1.4h, #0 715cabdff1aSopenharmony_ci 716cabdff1aSopenharmony_ci ld1 {v2.s}[0], [x2] 717cabdff1aSopenharmony_ci smull v2.2d, v2.2s, v0.s[0] 718cabdff1aSopenharmony_ci rshrn v2.2s, v2.2d, #14 719cabdff1aSopenharmony_ci smull v2.2d, v2.2s, v0.s[0] 720cabdff1aSopenharmony_ci rshrn v2.2s, v2.2d, #14 721cabdff1aSopenharmony_ci st1 {v1.s}[0], [x2] 722cabdff1aSopenharmony_ci dup v2.4s, v2.s[0] 723cabdff1aSopenharmony_ci 724cabdff1aSopenharmony_ci srshr v0.4s, v2.4s, #6 725cabdff1aSopenharmony_ci 726cabdff1aSopenharmony_ci mov x3, x0 727cabdff1aSopenharmony_ci mov x4, #16 728cabdff1aSopenharmony_ci dup v31.8h, w13 729cabdff1aSopenharmony_ci1: 730cabdff1aSopenharmony_ci // Loop to add the constant from v2 into all 16x16 outputs 731cabdff1aSopenharmony_ci subs x4, x4, #2 732cabdff1aSopenharmony_ci ld1 {v1.8h,v2.8h}, [x0], x1 733cabdff1aSopenharmony_ci uaddw v16.4s, v0.4s, v1.4h 734cabdff1aSopenharmony_ci uaddw2 v17.4s, v0.4s, v1.8h 735cabdff1aSopenharmony_ci ld1 {v3.8h,v4.8h}, [x0], x1 736cabdff1aSopenharmony_ci uaddw v18.4s, v0.4s, v2.4h 737cabdff1aSopenharmony_ci uaddw2 v19.4s, v0.4s, v2.8h 738cabdff1aSopenharmony_ci uaddw v20.4s, v0.4s, v3.4h 739cabdff1aSopenharmony_ci uaddw2 v21.4s, v0.4s, v3.8h 740cabdff1aSopenharmony_ci uaddw v22.4s, v0.4s, v4.4h 741cabdff1aSopenharmony_ci uaddw2 v23.4s, v0.4s, v4.8h 742cabdff1aSopenharmony_ci sqxtun v1.4h, v16.4s 
743cabdff1aSopenharmony_ci sqxtun2 v1.8h, v17.4s 744cabdff1aSopenharmony_ci sqxtun v2.4h, v18.4s 745cabdff1aSopenharmony_ci sqxtun2 v2.8h, v19.4s 746cabdff1aSopenharmony_ci sqxtun v3.4h, v20.4s 747cabdff1aSopenharmony_ci sqxtun2 v3.8h, v21.4s 748cabdff1aSopenharmony_ci sqxtun v4.4h, v22.4s 749cabdff1aSopenharmony_ci sqxtun2 v4.8h, v23.4s 750cabdff1aSopenharmony_ci umin v1.8h, v1.8h, v31.8h 751cabdff1aSopenharmony_ci umin v2.8h, v2.8h, v31.8h 752cabdff1aSopenharmony_ci st1 {v1.8h,v2.8h}, [x3], x1 753cabdff1aSopenharmony_ci umin v3.8h, v3.8h, v31.8h 754cabdff1aSopenharmony_ci umin v4.8h, v4.8h, v31.8h 755cabdff1aSopenharmony_ci st1 {v3.8h,v4.8h}, [x3], x1 756cabdff1aSopenharmony_ci b.ne 1b 757cabdff1aSopenharmony_ci 758cabdff1aSopenharmony_ci ret 759cabdff1aSopenharmony_ciendfunc 760cabdff1aSopenharmony_ci 761cabdff1aSopenharmony_ci.macro idct16_end 762cabdff1aSopenharmony_ci butterfly_4s v18, v7, v4, v7 // v18 = t0a, v7 = t7a 763cabdff1aSopenharmony_ci butterfly_4s v19, v22, v5, v22 // v19 = t1a, v22 = t6 764cabdff1aSopenharmony_ci butterfly_4s v4, v26, v20, v26 // v4 = t2a, v26 = t5 765cabdff1aSopenharmony_ci butterfly_4s v5, v6, v28, v6 // v5 = t3a, v6 = t4 766cabdff1aSopenharmony_ci butterfly_4s v20, v28, v16, v24 // v20 = t8a, v28 = t11a 767cabdff1aSopenharmony_ci butterfly_4s v24, v21, v23, v21 // v24 = t9, v21 = t10 768cabdff1aSopenharmony_ci butterfly_4s v23, v27, v25, v27 // v23 = t14, v27 = t13 769cabdff1aSopenharmony_ci butterfly_4s v25, v29, v29, v17 // v25 = t15a, v29 = t12a 770cabdff1aSopenharmony_ci 771cabdff1aSopenharmony_ci dmbutterfly0 v8, v9, v27, v21, v8, v9, v16, v17, v30, v31 // v8 = t13a, v9 = t10a 772cabdff1aSopenharmony_ci dmbutterfly0 v28, v27, v29, v28, v21, v29, v16, v17, v30, v31 // v28 = t12, v27 = t11 773cabdff1aSopenharmony_ci 774cabdff1aSopenharmony_ci butterfly_4s v16, v31, v18, v25 // v16 = out[0], v31 = out[15] 775cabdff1aSopenharmony_ci butterfly_4s v17, v30, v19, v23 // v17 = out[1], v30 = out[14] 776cabdff1aSopenharmony_ci 
butterfly_4s_r v25, v22, v22, v24 // v25 = out[9], v22 = out[6] 777cabdff1aSopenharmony_ci butterfly_4s v23, v24, v7, v20 // v23 = out[7], v24 = out[8] 778cabdff1aSopenharmony_ci butterfly_4s v18, v29, v4, v8 // v18 = out[2], v29 = out[13] 779cabdff1aSopenharmony_ci butterfly_4s v19, v28, v5, v28 // v19 = out[3], v28 = out[12] 780cabdff1aSopenharmony_ci butterfly_4s v20, v27, v6, v27 // v20 = out[4], v27 = out[11] 781cabdff1aSopenharmony_ci butterfly_4s v21, v26, v26, v9 // v21 = out[5], v26 = out[10] 782cabdff1aSopenharmony_ci ret 783cabdff1aSopenharmony_ci.endm 784cabdff1aSopenharmony_ci 785cabdff1aSopenharmony_cifunction idct16 786cabdff1aSopenharmony_ci dmbutterfly0 v16, v24, v16, v24, v4, v5, v6, v7, v8, v9 // v16 = t0a, v24 = t1a 787cabdff1aSopenharmony_ci dmbutterfly v20, v28, v0.s[2], v0.s[3], v4, v5, v6, v7 // v20 = t2a, v28 = t3a 788cabdff1aSopenharmony_ci dmbutterfly v18, v30, v1.s[0], v1.s[1], v4, v5, v6, v7 // v18 = t4a, v30 = t7a 789cabdff1aSopenharmony_ci dmbutterfly v26, v22, v1.s[2], v1.s[3], v4, v5, v6, v7 // v26 = t5a, v22 = t6a 790cabdff1aSopenharmony_ci dmbutterfly v17, v31, v2.s[0], v2.s[1], v4, v5, v6, v7 // v17 = t8a, v31 = t15a 791cabdff1aSopenharmony_ci dmbutterfly v25, v23, v2.s[2], v2.s[3], v4, v5, v6, v7 // v25 = t9a, v23 = t14a 792cabdff1aSopenharmony_ci dmbutterfly v21, v27, v3.s[0], v3.s[1], v4, v5, v6, v7 // v21 = t10a, v27 = t13a 793cabdff1aSopenharmony_ci dmbutterfly v29, v19, v3.s[2], v3.s[3], v4, v5, v6, v7 // v29 = t11a, v19 = t12a 794cabdff1aSopenharmony_ci 795cabdff1aSopenharmony_ci butterfly_4s v4, v28, v16, v28 // v4 = t0, v28 = t3 796cabdff1aSopenharmony_ci butterfly_4s v5, v20, v24, v20 // v5 = t1, v20 = t2 797cabdff1aSopenharmony_ci butterfly_4s v6, v26, v18, v26 // v6 = t4, v26 = t5 798cabdff1aSopenharmony_ci butterfly_4s v7, v22, v30, v22 // v7 = t7, v22 = t6 799cabdff1aSopenharmony_ci butterfly_4s v16, v25, v17, v25 // v16 = t8, v25 = t9 800cabdff1aSopenharmony_ci butterfly_4s v24, v21, v29, v21 // v24 = t11, v21 = 
t10 801cabdff1aSopenharmony_ci butterfly_4s v17, v27, v19, v27 // v17 = t12, v27 = t13 802cabdff1aSopenharmony_ci butterfly_4s v29, v23, v31, v23 // v29 = t15, v23 = t14 803cabdff1aSopenharmony_ci 804cabdff1aSopenharmony_ci dmbutterfly0 v22, v26, v22, v26, v8, v9, v18, v19, v30, v31 // v22 = t6a, v26 = t5a 805cabdff1aSopenharmony_ci dmbutterfly v23, v25, v0.s[2], v0.s[3], v18, v19, v30, v31 // v23 = t9a, v25 = t14a 806cabdff1aSopenharmony_ci dmbutterfly v27, v21, v0.s[2], v0.s[3], v18, v19, v30, v31, neg=1 // v27 = t13a, v21 = t10a 807cabdff1aSopenharmony_ci idct16_end 808cabdff1aSopenharmony_ciendfunc 809cabdff1aSopenharmony_ci 810cabdff1aSopenharmony_cifunction idct16_half 811cabdff1aSopenharmony_ci dmbutterfly0_h v16, v24, v16, v24, v4, v5, v6, v7, v8, v9 // v16 = t0a, v24 = t1a 812cabdff1aSopenharmony_ci dmbutterfly_h1 v20, v28, v0.s[2], v0.s[3], v4, v5, v6, v7 // v20 = t2a, v28 = t3a 813cabdff1aSopenharmony_ci dmbutterfly_h1 v18, v30, v1.s[0], v1.s[1], v4, v5, v6, v7 // v18 = t4a, v30 = t7a 814cabdff1aSopenharmony_ci dmbutterfly_h2 v26, v22, v1.s[2], v1.s[3], v4, v5, v6, v7 // v26 = t5a, v22 = t6a 815cabdff1aSopenharmony_ci dmbutterfly_h1 v17, v31, v2.s[0], v2.s[1], v4, v5, v6, v7 // v17 = t8a, v31 = t15a 816cabdff1aSopenharmony_ci dmbutterfly_h2 v25, v23, v2.s[2], v2.s[3], v4, v5, v6, v7 // v25 = t9a, v23 = t14a 817cabdff1aSopenharmony_ci dmbutterfly_h1 v21, v27, v3.s[0], v3.s[1], v4, v5, v6, v7 // v21 = t10a, v27 = t13a 818cabdff1aSopenharmony_ci dmbutterfly_h2 v29, v19, v3.s[2], v3.s[3], v4, v5, v6, v7 // v29 = t11a, v19 = t12a 819cabdff1aSopenharmony_ci 820cabdff1aSopenharmony_ci butterfly_4s v4, v28, v16, v28 // v4 = t0, v28 = t3 821cabdff1aSopenharmony_ci butterfly_4s v5, v20, v24, v20 // v5 = t1, v20 = t2 822cabdff1aSopenharmony_ci butterfly_4s v6, v26, v18, v26 // v6 = t4, v26 = t5 823cabdff1aSopenharmony_ci butterfly_4s v7, v22, v30, v22 // v7 = t7, v22 = t6 824cabdff1aSopenharmony_ci butterfly_4s v16, v25, v17, v25 // v16 = t8, v25 = t9 
825cabdff1aSopenharmony_ci butterfly_4s v24, v21, v29, v21 // v24 = t11, v21 = t10 826cabdff1aSopenharmony_ci butterfly_4s v17, v27, v19, v27 // v17 = t12, v27 = t13 827cabdff1aSopenharmony_ci butterfly_4s v29, v23, v31, v23 // v29 = t15, v23 = t14 828cabdff1aSopenharmony_ci 829cabdff1aSopenharmony_ci dmbutterfly0 v22, v26, v22, v26, v8, v9, v18, v19, v30, v31 // v22 = t6a, v26 = t5a 830cabdff1aSopenharmony_ci dmbutterfly v23, v25, v0.s[2], v0.s[3], v18, v19, v30, v31 // v23 = t9a, v25 = t14a 831cabdff1aSopenharmony_ci dmbutterfly v27, v21, v0.s[2], v0.s[3], v18, v19, v30, v31, neg=1 // v27 = t13a, v21 = t10a 832cabdff1aSopenharmony_ci idct16_end 833cabdff1aSopenharmony_ciendfunc 834cabdff1aSopenharmony_ci 835cabdff1aSopenharmony_cifunction idct16_quarter 836cabdff1aSopenharmony_ci dsmull_h v24, v25, v19, v3.s[3] 837cabdff1aSopenharmony_ci dsmull_h v4, v5, v17, v2.s[0] 838cabdff1aSopenharmony_ci dsmull_h v7, v6, v18, v1.s[1] 839cabdff1aSopenharmony_ci dsmull_h v30, v31, v18, v1.s[0] 840cabdff1aSopenharmony_ci neg v24.2d, v24.2d 841cabdff1aSopenharmony_ci neg v25.2d, v25.2d 842cabdff1aSopenharmony_ci dsmull_h v29, v28, v17, v2.s[1] 843cabdff1aSopenharmony_ci dsmull_h v26, v27, v19, v3.s[2] 844cabdff1aSopenharmony_ci dsmull_h v22, v23, v16, v0.s[0] 845cabdff1aSopenharmony_ci drshrn_h v24, v24, v25, #14 846cabdff1aSopenharmony_ci drshrn_h v16, v4, v5, #14 847cabdff1aSopenharmony_ci drshrn_h v7, v7, v6, #14 848cabdff1aSopenharmony_ci drshrn_h v6, v30, v31, #14 849cabdff1aSopenharmony_ci drshrn_h v29, v29, v28, #14 850cabdff1aSopenharmony_ci drshrn_h v17, v26, v27, #14 851cabdff1aSopenharmony_ci drshrn_h v28, v22, v23, #14 852cabdff1aSopenharmony_ci 853cabdff1aSopenharmony_ci dmbutterfly_l v20, v21, v22, v23, v17, v24, v0.s[2], v0.s[3] 854cabdff1aSopenharmony_ci dmbutterfly_l v18, v19, v30, v31, v29, v16, v0.s[2], v0.s[3] 855cabdff1aSopenharmony_ci neg v22.2d, v22.2d 856cabdff1aSopenharmony_ci neg v23.2d, v23.2d 857cabdff1aSopenharmony_ci drshrn_h v27, v20, v21, #14 
858cabdff1aSopenharmony_ci drshrn_h v21, v22, v23, #14 859cabdff1aSopenharmony_ci drshrn_h v23, v18, v19, #14 860cabdff1aSopenharmony_ci drshrn_h v25, v30, v31, #14 861cabdff1aSopenharmony_ci mov v4.16b, v28.16b 862cabdff1aSopenharmony_ci mov v5.16b, v28.16b 863cabdff1aSopenharmony_ci dmbutterfly0 v22, v26, v7, v6, v18, v19, v30, v31 864cabdff1aSopenharmony_ci mov v20.16b, v28.16b 865cabdff1aSopenharmony_ci idct16_end 866cabdff1aSopenharmony_ciendfunc 867cabdff1aSopenharmony_ci 868cabdff1aSopenharmony_cifunction iadst16 869cabdff1aSopenharmony_ci ld1 {v0.8h,v1.8h}, [x11] 870cabdff1aSopenharmony_ci sxtl v2.4s, v1.4h 871cabdff1aSopenharmony_ci sxtl2 v3.4s, v1.8h 872cabdff1aSopenharmony_ci sxtl2 v1.4s, v0.8h 873cabdff1aSopenharmony_ci sxtl v0.4s, v0.4h 874cabdff1aSopenharmony_ci 875cabdff1aSopenharmony_ci dmbutterfly_l v6, v7, v4, v5, v31, v16, v0.s[1], v0.s[0] // v6,v7 = t1, v4,v5 = t0 876cabdff1aSopenharmony_ci dmbutterfly_l v10, v11, v8, v9, v23, v24, v1.s[1], v1.s[0] // v10,v11 = t9, v8,v9 = t8 877cabdff1aSopenharmony_ci dbutterfly_n v31, v24, v6, v7, v10, v11, v12, v13, v10, v11 // v31 = t1a, v24 = t9a 878cabdff1aSopenharmony_ci dmbutterfly_l v14, v15, v12, v13, v29, v18, v0.s[3], v0.s[2] // v14,v15 = t3, v12,v13 = t2 879cabdff1aSopenharmony_ci dbutterfly_n v16, v23, v4, v5, v8, v9, v6, v7, v8, v9 // v16 = t0a, v23 = t8a 880cabdff1aSopenharmony_ci 881cabdff1aSopenharmony_ci dmbutterfly_l v6, v7, v4, v5, v21, v26, v1.s[3], v1.s[2] // v6,v7 = t11, v4,v5 = t10 882cabdff1aSopenharmony_ci dbutterfly_n v29, v26, v14, v15, v6, v7, v8, v9, v6, v7 // v29 = t3a, v26 = t11a 883cabdff1aSopenharmony_ci dmbutterfly_l v10, v11, v8, v9, v27, v20, v2.s[1], v2.s[0] // v10,v11 = t5, v8,v9 = t4 884cabdff1aSopenharmony_ci dbutterfly_n v18, v21, v12, v13, v4, v5, v6, v7, v4, v5 // v18 = t2a, v21 = t10a 885cabdff1aSopenharmony_ci 886cabdff1aSopenharmony_ci dmbutterfly_l v14, v15, v12, v13, v19, v28, v3.s[1], v3.s[0] // v14,v15 = t13, v12,v13 = t12 887cabdff1aSopenharmony_ci 
dbutterfly_n v20, v28, v10, v11, v14, v15, v4, v5, v14, v15 // v20 = t5a, v28 = t13a 888cabdff1aSopenharmony_ci dmbutterfly_l v6, v7, v4, v5, v25, v22, v2.s[3], v2.s[2] // v6,v7 = t7, v4,v5 = t6 889cabdff1aSopenharmony_ci dbutterfly_n v27, v19, v8, v9, v12, v13, v10, v11, v12, v13 // v27 = t4a, v19 = t12a 890cabdff1aSopenharmony_ci 891cabdff1aSopenharmony_ci dmbutterfly_l v10, v11, v8, v9, v17, v30, v3.s[3], v3.s[2] // v10,v11 = t15, v8,v9 = t14 892cabdff1aSopenharmony_ci ld1 {v0.8h}, [x10] 893cabdff1aSopenharmony_ci dbutterfly_n v22, v30, v6, v7, v10, v11, v12, v13, v10, v11 // v22 = t7a, v30 = t15a 894cabdff1aSopenharmony_ci sxtl2 v1.4s, v0.8h 895cabdff1aSopenharmony_ci sxtl v0.4s, v0.4h 896cabdff1aSopenharmony_ci dmbutterfly_l v14, v15, v12, v13, v23, v24, v1.s[0], v1.s[1] // v14,v15 = t9, v12,v13 = t8 897cabdff1aSopenharmony_ci dbutterfly_n v25, v17, v4, v5, v8, v9, v6, v7, v8, v9 // v25 = t6a, v17 = t14a 898cabdff1aSopenharmony_ci 899cabdff1aSopenharmony_ci dmbutterfly_l v4, v5, v6, v7, v28, v19, v1.s[1], v1.s[0] // v4,v5 = t12, v6,v7 = t13 900cabdff1aSopenharmony_ci dbutterfly_n v23, v19, v12, v13, v4, v5, v8, v9, v4, v5 // v23 = t8a, v19 = t12a 901cabdff1aSopenharmony_ci dmbutterfly_l v10, v11, v8, v9, v21, v26, v1.s[2], v1.s[3] // v10,v11 = t11, v8,v9 = t10 902cabdff1aSopenharmony_ci butterfly_4s_r v4, v27, v16, v27 // v4 = t4, v27 = t0 903cabdff1aSopenharmony_ci dbutterfly_n v24, v28, v14, v15, v6, v7, v12, v13, v6, v7 // v24 = t9a, v28 = t13a 904cabdff1aSopenharmony_ci 905cabdff1aSopenharmony_ci dmbutterfly_l v12, v13, v14, v15, v30, v17, v1.s[3], v1.s[2] // v12,v13 = t14, v14,v15 = t15 906cabdff1aSopenharmony_ci butterfly_4s_r v5, v20, v31, v20 // v5 = t5, v20 = t1 907cabdff1aSopenharmony_ci dbutterfly_n v21, v17, v8, v9, v12, v13, v6, v7, v12, v13 // v21 = t10a, v17 = t14a 908cabdff1aSopenharmony_ci dbutterfly_n v26, v30, v10, v11, v14, v15, v8, v9, v14, v15 // v26 = t11a, v30 = t15a 909cabdff1aSopenharmony_ci 910cabdff1aSopenharmony_ci butterfly_4s_r 
v6, v25, v18, v25 // v6 = t6, v25 = t2 911cabdff1aSopenharmony_ci butterfly_4s_r v7, v22, v29, v22 // v7 = t7, v22 = t3 912cabdff1aSopenharmony_ci 913cabdff1aSopenharmony_ci dmbutterfly_l v10, v11, v8, v9, v19, v28, v0.s[2], v0.s[3] // v10,v11 = t13, v8,v9 = t12 914cabdff1aSopenharmony_ci dmbutterfly_l v12, v13, v14, v15, v30, v17, v0.s[3], v0.s[2] // v12,v13 = t14, v14,v15 = t15 915cabdff1aSopenharmony_ci 916cabdff1aSopenharmony_ci dbutterfly_n v18, v30, v8, v9, v12, v13, v16, v17, v12, v13 // v18 = out[2], v30 = t14a 917cabdff1aSopenharmony_ci dbutterfly_n v29, v17, v10, v11, v14, v15, v12, v13, v14, v15 // v29 = -out[13], v17 = t15a 918cabdff1aSopenharmony_ci neg v29.4s, v29.4s // v29 = out[13] 919cabdff1aSopenharmony_ci 920cabdff1aSopenharmony_ci dmbutterfly_l v10, v11, v8, v9, v4, v5, v0.s[2], v0.s[3] // v10,v11 = t5a, v8,v9 = t4a 921cabdff1aSopenharmony_ci dmbutterfly_l v12, v13, v14, v15, v7, v6, v0.s[3], v0.s[2] // v12,v13 = t6a, v14,v15 = t7a 922cabdff1aSopenharmony_ci 923cabdff1aSopenharmony_ci butterfly_4s v2, v6, v27, v25 // v2 = out[0], v6 = t2a 924cabdff1aSopenharmony_ci butterfly_4s v3, v7, v23, v21 // v3 =-out[1], v7 = t10 925cabdff1aSopenharmony_ci 926cabdff1aSopenharmony_ci dbutterfly_n v19, v31, v8, v9, v12, v13, v4, v5, v8, v9 // v19 = -out[3], v31 = t6 927cabdff1aSopenharmony_ci neg v19.4s, v19.4s // v19 = out[3] 928cabdff1aSopenharmony_ci dbutterfly_n v28, v16, v10, v11, v14, v15, v4, v5, v10, v11 // v28 = out[12], v16 = t7 929cabdff1aSopenharmony_ci 930cabdff1aSopenharmony_ci butterfly_4s v5, v8, v20, v22 // v5 =-out[15],v8 = t3a 931cabdff1aSopenharmony_ci butterfly_4s v4, v9, v24, v26 // v4 = out[14],v9 = t11 932cabdff1aSopenharmony_ci 933cabdff1aSopenharmony_ci dmbutterfly0 v23, v24, v6, v8, v10, v11, v12, v13, v14, v15, 1 // v23 = out[7], v24 = out[8] 934cabdff1aSopenharmony_ci dmbutterfly0 v21, v26, v30, v17, v10, v11, v12, v13, v14, v15, 1 // v21 = out[5], v26 = out[10] 935cabdff1aSopenharmony_ci dmbutterfly0 v20, v27, v16, v31, v10, 
v11, v12, v13, v14, v15 // v20 = out[4], v27 = out[11] 936cabdff1aSopenharmony_ci dmbutterfly0 v22, v25, v9, v7, v10, v11, v12, v13, v14, v15 // v22 = out[6], v25 = out[9] 937cabdff1aSopenharmony_ci 938cabdff1aSopenharmony_ci neg v31.4s, v5.4s // v31 = out[15] 939cabdff1aSopenharmony_ci neg v17.4s, v3.4s // v17 = out[1] 940cabdff1aSopenharmony_ci 941cabdff1aSopenharmony_ci mov v16.16b, v2.16b 942cabdff1aSopenharmony_ci mov v30.16b, v4.16b 943cabdff1aSopenharmony_ci ret 944cabdff1aSopenharmony_ciendfunc 945cabdff1aSopenharmony_ci 946cabdff1aSopenharmony_ci// Helper macros; we can't use these expressions directly within 947cabdff1aSopenharmony_ci// e.g. .irp due to the extra concatenation \(). Therefore wrap 948cabdff1aSopenharmony_ci// them in macros to allow using .irp below. 949cabdff1aSopenharmony_ci.macro load i, src, inc 950cabdff1aSopenharmony_ci ld1 {v\i\().4s}, [\src], \inc 951cabdff1aSopenharmony_ci.endm 952cabdff1aSopenharmony_ci.macro store i, dst, inc 953cabdff1aSopenharmony_ci st1 {v\i\().4s}, [\dst], \inc 954cabdff1aSopenharmony_ci.endm 955cabdff1aSopenharmony_ci.macro movi_v i, size, imm 956cabdff1aSopenharmony_ci movi v\i\()\size, \imm 957cabdff1aSopenharmony_ci.endm 958cabdff1aSopenharmony_ci.macro load_clear i, src, inc 959cabdff1aSopenharmony_ci ld1 {v\i\().4s}, [\src] 960cabdff1aSopenharmony_ci st1 {v4.4s}, [\src], \inc 961cabdff1aSopenharmony_ci.endm 962cabdff1aSopenharmony_ci 963cabdff1aSopenharmony_ci.macro load_add_store coef0, coef1, coef2, coef3, coef4, coef5, coef6, coef7 964cabdff1aSopenharmony_ci srshr \coef0, \coef0, #6 965cabdff1aSopenharmony_ci ld1 {v4.4h}, [x0], x1 966cabdff1aSopenharmony_ci srshr \coef1, \coef1, #6 967cabdff1aSopenharmony_ci ld1 {v4.d}[1], [x3], x1 968cabdff1aSopenharmony_ci srshr \coef2, \coef2, #6 969cabdff1aSopenharmony_ci ld1 {v5.4h}, [x0], x1 970cabdff1aSopenharmony_ci srshr \coef3, \coef3, #6 971cabdff1aSopenharmony_ci uaddw \coef0, \coef0, v4.4h 972cabdff1aSopenharmony_ci ld1 {v5.d}[1], [x3], x1 
973cabdff1aSopenharmony_ci srshr \coef4, \coef4, #6 974cabdff1aSopenharmony_ci uaddw2 \coef1, \coef1, v4.8h 975cabdff1aSopenharmony_ci ld1 {v6.4h}, [x0], x1 976cabdff1aSopenharmony_ci srshr \coef5, \coef5, #6 977cabdff1aSopenharmony_ci uaddw \coef2, \coef2, v5.4h 978cabdff1aSopenharmony_ci ld1 {v6.d}[1], [x3], x1 979cabdff1aSopenharmony_ci sqxtun v4.4h, \coef0 980cabdff1aSopenharmony_ci srshr \coef6, \coef6, #6 981cabdff1aSopenharmony_ci uaddw2 \coef3, \coef3, v5.8h 982cabdff1aSopenharmony_ci ld1 {v7.4h}, [x0], x1 983cabdff1aSopenharmony_ci sqxtun2 v4.8h, \coef1 984cabdff1aSopenharmony_ci srshr \coef7, \coef7, #6 985cabdff1aSopenharmony_ci uaddw \coef4, \coef4, v6.4h 986cabdff1aSopenharmony_ci ld1 {v7.d}[1], [x3], x1 987cabdff1aSopenharmony_ci umin v4.8h, v4.8h, v8.8h 988cabdff1aSopenharmony_ci sub x0, x0, x1, lsl #2 989cabdff1aSopenharmony_ci sub x3, x3, x1, lsl #2 990cabdff1aSopenharmony_ci sqxtun v5.4h, \coef2 991cabdff1aSopenharmony_ci uaddw2 \coef5, \coef5, v6.8h 992cabdff1aSopenharmony_ci st1 {v4.4h}, [x0], x1 993cabdff1aSopenharmony_ci sqxtun2 v5.8h, \coef3 994cabdff1aSopenharmony_ci uaddw \coef6, \coef6, v7.4h 995cabdff1aSopenharmony_ci st1 {v4.d}[1], [x3], x1 996cabdff1aSopenharmony_ci umin v5.8h, v5.8h, v8.8h 997cabdff1aSopenharmony_ci sqxtun v6.4h, \coef4 998cabdff1aSopenharmony_ci uaddw2 \coef7, \coef7, v7.8h 999cabdff1aSopenharmony_ci st1 {v5.4h}, [x0], x1 1000cabdff1aSopenharmony_ci sqxtun2 v6.8h, \coef5 1001cabdff1aSopenharmony_ci st1 {v5.d}[1], [x3], x1 1002cabdff1aSopenharmony_ci umin v6.8h, v6.8h, v8.8h 1003cabdff1aSopenharmony_ci sqxtun v7.4h, \coef6 1004cabdff1aSopenharmony_ci st1 {v6.4h}, [x0], x1 1005cabdff1aSopenharmony_ci sqxtun2 v7.8h, \coef7 1006cabdff1aSopenharmony_ci st1 {v6.d}[1], [x3], x1 1007cabdff1aSopenharmony_ci umin v7.8h, v7.8h, v8.8h 1008cabdff1aSopenharmony_ci st1 {v7.4h}, [x0], x1 1009cabdff1aSopenharmony_ci st1 {v7.d}[1], [x3], x1 1010cabdff1aSopenharmony_ci.endm 1011cabdff1aSopenharmony_ci 1012cabdff1aSopenharmony_ci// Read 
a vertical 4x16 slice out of a 16x16 matrix, do a transform on it, 1013cabdff1aSopenharmony_ci// transpose into a horizontal 16x4 slice and store. 1014cabdff1aSopenharmony_ci// x0 = dst (temp buffer) 1015cabdff1aSopenharmony_ci// x1 = slice offset 1016cabdff1aSopenharmony_ci// x2 = src 1017cabdff1aSopenharmony_ci// x9 = input stride 1018cabdff1aSopenharmony_ci.macro itxfm16_1d_funcs txfm 1019cabdff1aSopenharmony_cifunction \txfm\()16_1d_4x16_pass1_neon 1020cabdff1aSopenharmony_ci mov x14, x30 1021cabdff1aSopenharmony_ci 1022cabdff1aSopenharmony_ci movi v4.4s, #0 1023cabdff1aSopenharmony_ci.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31 1024cabdff1aSopenharmony_ci load_clear \i, x2, x9 1025cabdff1aSopenharmony_ci.endr 1026cabdff1aSopenharmony_ci 1027cabdff1aSopenharmony_ci bl \txfm\()16 1028cabdff1aSopenharmony_ci 1029cabdff1aSopenharmony_ci // Do four 4x4 transposes. Originally, v16-v31 contain the 1030cabdff1aSopenharmony_ci // 16 rows. Afterwards, v16-v19, v20-v23, v24-v27 and v28-v31 1031cabdff1aSopenharmony_ci // contain the four transposed 4x4 blocks. 1032cabdff1aSopenharmony_ci transpose_4x4s v16, v17, v18, v19, v4, v5, v6, v7 1033cabdff1aSopenharmony_ci transpose_4x4s v20, v21, v22, v23, v4, v5, v6, v7 1034cabdff1aSopenharmony_ci transpose_4x4s v24, v25, v26, v27, v4, v5, v6, v7 1035cabdff1aSopenharmony_ci transpose_4x4s v28, v29, v30, v31, v4, v5, v6, v7 1036cabdff1aSopenharmony_ci 1037cabdff1aSopenharmony_ci // Store the transposed 4x4 blocks horizontally. 
1038cabdff1aSopenharmony_ci cmp x1, #12 1039cabdff1aSopenharmony_ci b.eq 1f 1040cabdff1aSopenharmony_ci.irp i, 16, 20, 24, 28, 17, 21, 25, 29, 18, 22, 26, 30, 19, 23, 27, 31 1041cabdff1aSopenharmony_ci store \i, x0, #16 1042cabdff1aSopenharmony_ci.endr 1043cabdff1aSopenharmony_ci ret x14 1044cabdff1aSopenharmony_ci1: 1045cabdff1aSopenharmony_ci // Special case: For the last input column (x1 == 12), 1046cabdff1aSopenharmony_ci // which would be stored as the last row in the temp buffer, 1047cabdff1aSopenharmony_ci // don't store the first 4x4 block, but keep it in registers 1048cabdff1aSopenharmony_ci // for the first slice of the second pass (where it is the 1049cabdff1aSopenharmony_ci // last 4x4 block). 1050cabdff1aSopenharmony_ci add x0, x0, #16 1051cabdff1aSopenharmony_ci st1 {v20.4s}, [x0], #16 1052cabdff1aSopenharmony_ci st1 {v24.4s}, [x0], #16 1053cabdff1aSopenharmony_ci st1 {v28.4s}, [x0], #16 1054cabdff1aSopenharmony_ci add x0, x0, #16 1055cabdff1aSopenharmony_ci st1 {v21.4s}, [x0], #16 1056cabdff1aSopenharmony_ci st1 {v25.4s}, [x0], #16 1057cabdff1aSopenharmony_ci st1 {v29.4s}, [x0], #16 1058cabdff1aSopenharmony_ci add x0, x0, #16 1059cabdff1aSopenharmony_ci st1 {v22.4s}, [x0], #16 1060cabdff1aSopenharmony_ci st1 {v26.4s}, [x0], #16 1061cabdff1aSopenharmony_ci st1 {v30.4s}, [x0], #16 1062cabdff1aSopenharmony_ci add x0, x0, #16 1063cabdff1aSopenharmony_ci st1 {v23.4s}, [x0], #16 1064cabdff1aSopenharmony_ci st1 {v27.4s}, [x0], #16 1065cabdff1aSopenharmony_ci st1 {v31.4s}, [x0], #16 1066cabdff1aSopenharmony_ci 1067cabdff1aSopenharmony_ci mov v28.16b, v16.16b 1068cabdff1aSopenharmony_ci mov v29.16b, v17.16b 1069cabdff1aSopenharmony_ci mov v30.16b, v18.16b 1070cabdff1aSopenharmony_ci mov v31.16b, v19.16b 1071cabdff1aSopenharmony_ci ret x14 1072cabdff1aSopenharmony_ciendfunc 1073cabdff1aSopenharmony_ci 1074cabdff1aSopenharmony_ci// Read a vertical 4x16 slice out of a 16x16 matrix, do a transform on it, 1075cabdff1aSopenharmony_ci// load the destination pixels 
// (from a similar 4x16 slice), add and store back.
// x0 = dst
// x1 = dst stride
// x2 = src (temp buffer)
// x3 = slice offset
// x9 = temp buffer stride
function \txfm\()16_1d_4x16_pass2_neon
        mov             x14, x30

.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27
        load            \i,  x2,  x9
.endr
        cbz             x3,  1f
.irp i, 28, 29, 30, 31
        load            \i,  x2,  x9
.endr
1:

        add             x3,  x0,  x1
        lsl             x1,  x1,  #1
        bl              \txfm\()16

        dup             v8.8h, w13
        load_add_store  v16.4s, v17.4s, v18.4s, v19.4s, v20.4s, v21.4s, v22.4s, v23.4s
        load_add_store  v24.4s, v25.4s, v26.4s, v27.4s, v28.4s, v29.4s, v30.4s, v31.4s

        ret             x14
endfunc
.endm

itxfm16_1d_funcs idct
itxfm16_1d_funcs iadst

// This is the minimum eob value for each subpartition, in increments of 4.
// The 16x16 idct_idct first pass compares eob (w3) against the entry for
// each 4 column slice after the first one; if eob is less than or equal to
// the entry, that slice and all following ones are not transformed but
// zero filled in the temp buffer. Entry 0 is never read, the table pointer
// is set up with an offset of 2 bytes.
const min_eob_idct_idct_16, align=4
        .short  0, 10, 38, 89
endconst

// Instantiate the 16x16 inverse transform + add functions for one
// combination of first pass (txfm1) and second pass (txfm2) transform.
// This emits one internal function doing the actual work, plus exported
// 10 and 12 bit entry points that only set x13 (0x03ff / 0x0fff) and
// branch into the internal function.
//
// Register usage of the internal function, as far as visible here:
// x0  = dst          (kept in x4 across the helper calls)
// x1  = dst stride   (kept in x5)
// x2  = coefficients (kept in x6)
// w3  = eob
// x13 = value broadcast into v8 by the second pass helpers; presumably
//       the pixel clamp limit used by load_add_store - TODO confirm
// x9  = 64, byte stride of one row (16 x 32 bit) of the temp buffer
// x10 = idct_coeffs, x11 = iadst16_coeffs (only if any pass is iadst)
// x15 = saved return address (the 1d helpers keep theirs in x14)
// A temp buffer of 1024 bytes (16 rows of 64 bytes) is allocated on
// the stack between the two passes.
.macro itxfm_func16x16 txfm1, txfm2
function vp9_\txfm1\()_\txfm2\()_16x16_add_16_neon
.ifc \txfm1\()_\txfm2,idct_idct
        // eob == 1: hand over to the dc-only function. Nothing has been
        // pushed yet and x30 is untouched, so it returns to our caller.
        cmp             w3,  #1
        b.eq            idct16x16_dc_add_neon
.endif
        mov             x15, x30
        // iadst16 requires clobbering v8-v15, idct16 only clobbers v8-v9.
.ifnc \txfm1\()_\txfm2,idct_idct
        stp             d14, d15, [sp, #-0x10]!
        stp             d12, d13, [sp, #-0x10]!
        stp             d10, d11, [sp, #-0x10]!
.endif
        stp             d8,  d9,  [sp, #-0x10]!

        // Temp buffer for the output of the first pass
        sub             sp,  sp,  #1024

        // x0-x2 are reused as arguments for the helpers below
        mov             x4,  x0
        mov             x5,  x1
        mov             x6,  x2

        movrel          x10, idct_coeffs
.ifnc \txfm1\()_\txfm2,idct_idct
        movrel          x11, iadst16_coeffs
.endif
.ifc \txfm1,idct
        // Load the first 16 idct coefficients and sign extend them
        // to 32 bit, into v0-v3.
        ld1             {v0.8h,v1.8h}, [x10]
        sxtl            v2.4s,  v1.4h
        sxtl2           v3.4s,  v1.8h
        sxtl2           v1.4s,  v0.8h
        sxtl            v0.4s,  v0.4h
.endif
        mov             x9,  #64

.ifc \txfm1\()_\txfm2,idct_idct
        // Small eob: continue in the reduced variants. These are entered
        // with a plain branch; they free the temp buffer, restore d8-d9
        // and return via x15 themselves.
        cmp             w3,  #10
        b.le            idct16x16_quarter_add_16_neon
        cmp             w3,  #38
        b.le            idct16x16_half_add_16_neon

        // Point at the second table entry; the first slice is always
        // transformed.
        movrel          x12, min_eob_idct_idct_16, 2
.endif

        // First pass: one call per slice of 4 input columns. Slice i is
        // read from coefficient byte offset i*4 and written to temp buffer
        // rows i..i+3.
.irp i, 0, 4, 8, 12
        add             x0,  sp,  #(\i*64)
.ifc \txfm1\()_\txfm2,idct_idct
.if \i > 0
        // If eob is at or below the threshold of this slice, skip the
        // transform of this and all remaining slices; x1 is the number
        // of slices left to zero fill at label 1.
        // NOTE(review): eob <= 38 has already branched away above, so
        // only the last threshold (89) appears able to match here,
        // provided the pass 1 helper leaves w3 unchanged - confirm.
        ldrh            w1,  [x12], #2
        cmp             w3,  w1
        mov             x1,  #(16 - \i)/4
        b.le            1f
.endif
.endif
        mov             x1,  #\i
        add             x2,  x6,  #(\i*4)
        bl              \txfm1\()16_1d_4x16_pass1_neon
.endr
.ifc \txfm1\()_\txfm2,iadst_idct
        // The first pass was iadst, so the idct coefficients were not
        // loaded into v0-v3 above; load them for the idct second pass.
        ld1             {v0.8h,v1.8h}, [x10]
        sxtl            v2.4s,  v1.4h
        sxtl2           v3.4s,  v1.8h
        sxtl2           v1.4s,  v0.8h
        sxtl            v0.4s,  v0.4h
.endif

.ifc \txfm1\()_\txfm2,idct_idct
        b               3f
1:
        // Set v28-v31 to zero, for the in-register passthrough of
        // coefficients to pass 2.
        movi            v28.4s,  #0
        movi            v29.4s,  #0
        movi            v30.4s,  #0
        movi            v31.4s,  #0
2:
        // Each iteration clears 4 rows of 64 bytes, i.e. the temp buffer
        // rows of one skipped slice, starting at x0.
        subs            x1,  x1,  #1
.rept 4
        st1             {v28.4s,v29.4s,v30.4s,v31.4s}, [x0], x9
.endr
        b.ne            2b
3:
.endif

        // Second pass: one call per slice of 4 output columns. The dst
        // offset is i*2 bytes (presumably 16 bit pixels), the temp buffer
        // offset is i*4 bytes (32 bit intermediates).
.irp i, 0, 4, 8, 12
        add             x0,  x4,  #(\i*2)
        mov             x1,  x5
        add             x2,  sp,  #(\i*4)
        mov             x3,  #\i
        bl              \txfm2\()16_1d_4x16_pass2_neon
.endr

        // Free the temp buffer and restore the registers in the reverse
        // order of the pushes above.
        add             sp,  sp,  #1024
        ldp             d8,  d9,  [sp], 0x10
.ifnc \txfm1\()_\txfm2,idct_idct
        ldp             d10, d11, [sp], 0x10
        ldp             d12, d13, [sp], 0x10
        ldp             d14, d15, [sp], 0x10
.endif
        ret             x15
endfunc

// Exported 10 bit entry point: x13 = 0x03ff, then continue in the
// shared implementation (tail branch, x30 is left untouched).
function ff_vp9_\txfm1\()_\txfm2\()_16x16_add_10_neon, export=1
        mov             x13, #0x03ff
        b               vp9_\txfm1\()_\txfm2\()_16x16_add_16_neon
endfunc

// Exported 12 bit entry point: x13 = 0x0fff, then continue in the
// shared implementation (tail branch, x30 is left untouched).
function ff_vp9_\txfm1\()_\txfm2\()_16x16_add_12_neon, export=1
        mov             x13, #0x0fff
        b               vp9_\txfm1\()_\txfm2\()_16x16_add_16_neon
endfunc
.endm

itxfm_func16x16 idct,  idct
itxfm_func16x16 iadst, idct
itxfm_func16x16 idct,  iadst
itxfm_func16x16 iadst, iadst

// First pass of the reduced idct16x16 that is used when eob <= 10.
// Only 4 input rows are loaded (and, presumably by load_clear, overwritten
// with the zeroed v4 - TODO confirm against the macro) before running
// idct16_quarter.
// x0 = dst (temp buffer)
// x2 = src
// x9 = input stride
// On return v16-v19 hold the first transposed 4x4 block; it is not
// written to the temp buffer.
function idct16_1d_4x16_pass1_quarter_neon
        mov             x14, x30

        movi            v4.4s, #0
.irp i, 16, 17, 18, 19
        load_clear      \i,  x2,  x9
.endr

        bl              idct16_quarter

        // Do four 4x4 transposes. Originally, v16-v31 contain the
        // 16 rows. Afterwards, v16-v19, v20-v23, v24-v27 and v28-v31
        // contain the four transposed 4x4 blocks.
        transpose_4x4s  v16, v17, v18, v19, v4, v5, v6, v7
        transpose_4x4s  v20, v21, v22, v23, v4, v5, v6, v7
        transpose_4x4s  v24, v25, v26, v27, v4, v5, v6, v7
        transpose_4x4s  v28, v29, v30, v31, v4, v5, v6, v7

        // Store the transposed 4x4 blocks horizontally.
        // The first 4x4 block is kept in registers for the second pass,
        // store the rest in the temp buffer.
        // Each group below writes one 64 byte row: skip the first 16 bytes
        // (the block kept in registers), then store 3 x 16 bytes.
        add             x0,  x0,  #16
        st1             {v20.4s},  [x0], #16
        st1             {v24.4s},  [x0], #16
        st1             {v28.4s},  [x0], #16
        add             x0,  x0,  #16
        st1             {v21.4s},  [x0], #16
        st1             {v25.4s},  [x0], #16
        st1             {v29.4s},  [x0], #16
        add             x0,  x0,  #16
        st1             {v22.4s},  [x0], #16
        st1             {v26.4s},  [x0], #16
        st1             {v30.4s},  [x0], #16
        add             x0,  x0,  #16
        st1             {v23.4s},  [x0], #16
        st1             {v27.4s},  [x0], #16
        st1             {v31.4s},  [x0], #16
        ret             x14
endfunc

// Second pass of the reduced idct16x16 that is used when eob <= 10.
// x0 = dst
// x1 = dst stride
// x2 = src (temp buffer)
// x3 = slice offset
// x9 = temp buffer stride
// x13 = value broadcast into v8 before load_add_store; presumably the
//       pixel clamp limit - TODO confirm against the macro
function idct16_1d_4x16_pass2_quarter_neon
        mov             x14, x30

        // Only load the top 4 lines, and only do it for the later slices.
        // For the first slice, v16-v19 is kept in registers from the first pass.
        cbz             x3,  1f
.irp i, 16, 17, 18, 19
        load            \i,  x2,  x9
.endr
1:

        // x3 = pointer to the second dst row, x1 = doubled stride;
        // presumably load_add_store walks two rows in parallel via
        // x0 and x3 - TODO confirm against the macro.
        add             x3,  x0,  x1
        lsl             x1,  x1,  #1
        bl              idct16_quarter

        dup             v8.8h, w13
        load_add_store  v16.4s, v17.4s, v18.4s, v19.4s, v20.4s, v21.4s, v22.4s, v23.4s
        load_add_store  v24.4s, v25.4s, v26.4s, v27.4s, v28.4s, v29.4s, v30.4s, v31.4s

        ret             x14
endfunc

// First pass of the reduced idct16x16 that is used when 10 < eob <= 38.
// 8 input rows are loaded before running idct16_half.
// x0 = dst (temp buffer)
// x1 = slice offset (0 or 4)
// x2 = src
// x9 = input stride
function idct16_1d_4x16_pass1_half_neon
        mov             x14, x30

        movi            v4.4s, #0
.irp i, 16, 17, 18, 19, 20, 21, 22, 23
        load_clear      \i,  x2,  x9
.endr

        bl              idct16_half

        // Do four 4x4 transposes. Originally, v16-v31 contain the
        // 16 rows. Afterwards, v16-v19, v20-v23, v24-v27 and v28-v31
        // contain the four transposed 4x4 blocks.
        transpose_4x4s  v16, v17, v18, v19, v4, v5, v6, v7
        transpose_4x4s  v20, v21, v22, v23, v4, v5, v6, v7
        transpose_4x4s  v24, v25, v26, v27, v4, v5, v6, v7
        transpose_4x4s  v28, v29, v30, v31, v4, v5, v6, v7

        // Store the transposed 4x4 blocks horizontally.
        cmp             x1,  #4
        b.eq            1f
.irp i, 16, 20, 24, 28, 17, 21, 25, 29, 18, 22, 26, 30, 19, 23, 27, 31
        store           \i,  x0,  #16
.endr
        ret             x14
1:
        // Special case: For the second input column (x1 == 4),
        // which would be stored as the second row in the temp buffer,
        // don't store the first 4x4 block, but keep it in registers
        // for the first slice of the second pass (where it is the
        // second 4x4 block).
        add             x0,  x0,  #16
        st1             {v20.4s},  [x0], #16
        st1             {v24.4s},  [x0], #16
        st1             {v28.4s},  [x0], #16
        add             x0,  x0,  #16
        st1             {v21.4s},  [x0], #16
        st1             {v25.4s},  [x0], #16
        st1             {v29.4s},  [x0], #16
        add             x0,  x0,  #16
        st1             {v22.4s},  [x0], #16
        st1             {v26.4s},  [x0], #16
        st1             {v30.4s},  [x0], #16
        add             x0,  x0,  #16
        st1             {v23.4s},  [x0], #16
        st1             {v27.4s},  [x0], #16
        st1             {v31.4s},  [x0], #16

        // Move the block that was kept back into the registers where the
        // second pass expects rows 4-7 of its first slice.
        mov             v20.16b, v16.16b
        mov             v21.16b, v17.16b
        mov             v22.16b, v18.16b
        mov             v23.16b, v19.16b
        ret             x14
endfunc

// Second pass of the reduced idct16x16 that is used when 10 < eob <= 38.
// x0 = dst
// x1 = dst stride
// x2 = src (temp buffer)
// x3 = slice offset
// x9 = temp buffer stride
// x13 = value broadcast into v8 before load_add_store; presumably the
//       pixel clamp limit - TODO confirm against the macro
function idct16_1d_4x16_pass2_half_neon
        mov             x14, x30

        // Rows 0-3 are always read from the temp buffer. Rows 4-7 are
        // only read for the later slices; for the first slice (x3 == 0)
        // v20-v23 are still in registers from the first pass.
.irp i, 16, 17, 18, 19
        load            \i,  x2,  x9
.endr
        cbz             x3,  1f
.irp i, 20, 21, 22, 23
        load            \i,  x2,  x9
.endr
1:

        // Same dst setup as in the quarter variant: x3 = second dst row,
        // x1 = doubled stride.
        add             x3,  x0,  x1
        lsl             x1,  x1,  #1
        bl              idct16_half

        dup             v8.8h, w13
        load_add_store  v16.4s, v17.4s, v18.4s, v19.4s, v20.4s, v21.4s, v22.4s, v23.4s
        load_add_store  v24.4s, v25.4s, v26.4s, v27.4s, v28.4s, v29.4s, v30.4s, v31.4s

        ret             x14
endfunc

// Continuation of vp9_idct_idct_16x16_add_16_neon for small eob values
// (size is quarter or half). It is entered with a branch, not a call,
// after the main function has saved x30 in x15, pushed d8-d9, allocated
// the 1024 byte temp buffer on the stack and set up x4 (dst),
// x5 (dst stride), x6 (coefficients) and x9 (64). It therefore frees the
// buffer, restores d8-d9 and returns via x15 on its own.
// The quarter variant runs the first pass on one slice of 4 columns,
// the half variant on two; the second pass always covers all 4 slices.
.macro idct16_partial size
function idct16x16_\size\()_add_16_neon
        add             x0,  sp,  #(0*64)
        mov             x1,  #0
        add             x2,  x6,  #(0*4)
        bl              idct16_1d_4x16_pass1_\size\()_neon
.ifc \size,half
        add             x0,  sp,  #(4*64)
        mov             x1,  #4
        add             x2,  x6,  #(4*4)
        bl              idct16_1d_4x16_pass1_\size\()_neon
.endif

.irp i, 0, 4, 8, 12
        add             x0,  x4,  #(\i*2)
        mov             x1,  x5
        add             x2,  sp,  #(\i*4)
        mov             x3,  #\i
        bl              idct16_1d_4x16_pass2_\size\()_neon
.endr

        add             sp,  sp,  #1024
        ldp             d8,  d9,  [sp], 0x10
        ret             x15
endfunc
.endm

idct16_partial quarter
idct16_partial half

// DC-only 32x32 inverse transform + add.
// x0 = dst
// x1 = dst stride
// x2 = coefficients; only the first 32 bit coefficient is read, and it
//      is overwritten with zero
// w13 = upper clamp limit for the output pixels (broadcast into v31)
// The dc value is multiplied by the first idct coefficient and rounded
// by 14 bits twice, then rounded by another 6 bits, and the result is
// added to all 32 rows of 32 pixels (16 bit each).
function idct32x32_dc_add_neon
        movrel          x4,  idct_coeffs
        ld1             {v0.4h}, [x4]
        sxtl            v0.4s,  v0.4h

        movi            v1.4h,  #0

        ld1             {v2.s}[0],  [x2]
        smull           v2.2d,  v2.2s,  v0.s[0]
        rshrn           v2.2s,  v2.2d,  #14
        smull           v2.2d,  v2.2s,  v0.s[0]
        rshrn           v2.2s,  v2.2d,  #14
        // Clear the input coefficient
        st1             {v1.s}[0],  [x2]
        dup             v2.4s,  v2.s[0]

        srshr           v0.4s,  v2.4s,  #6

        // x0 is the read pointer, x3 the write pointer and x4 the row
        // counter. Each row is accessed as 32 bytes with a fixed post
        // increment followed by 32 bytes with a post increment of
        // stride - 32, which ends up at the start of the next row.
        mov             x3,  x0
        mov             x4,  #32
        sub             x1,  x1,  #32
        dup             v31.8h, w13
1:
        // Loop to add the constant v0 into all 32x32 outputs
        subs            x4,  x4,  #1
        ld1             {v1.8h,v2.8h},  [x0], #32
        uaddw           v16.4s, v0.4s,  v1.4h
        uaddw2          v17.4s, v0.4s,  v1.8h
        ld1             {v3.8h,v4.8h},  [x0], x1
        uaddw           v18.4s, v0.4s,  v2.4h
        uaddw2          v19.4s, v0.4s,  v2.8h
        uaddw           v20.4s, v0.4s,  v3.4h
        uaddw2          v21.4s, v0.4s,  v3.8h
        uaddw           v22.4s, v0.4s,  v4.4h
        uaddw2          v23.4s, v0.4s,  v4.8h
        // Narrow back to 16 bit with unsigned saturation (negative sums
        // become 0), then clamp against the upper limit in v31.
        sqxtun          v1.4h,  v16.4s
        sqxtun2         v1.8h,  v17.4s
        sqxtun          v2.4h,  v18.4s
        sqxtun2         v2.8h,  v19.4s
        sqxtun          v3.4h,  v20.4s
        sqxtun2         v3.8h,  v21.4s
        sqxtun          v4.4h,  v22.4s
        sqxtun2         v4.8h,  v23.4s
        umin            v1.8h,  v1.8h,  v31.8h
        umin            v2.8h,  v2.8h,  v31.8h
        st1             {v1.8h,v2.8h},  [x3], #32
        umin            v3.8h,  v3.8h,  v31.8h
        umin            v4.8h,  v4.8h,  v31.8h
        st1             {v3.8h,v4.8h},  [x3], x1
        b.ne            1b

        ret
endfunc

// Shared tail of the idct32_odd functions below. The trailing comments
// name the intermediate that each output register holds after the
// respective step. The macro ends with a ret, so it has to be the last
// thing in the function that uses it. It uses v0 as coefficient register
// and v4-v9 as temporaries in addition to v16-v31.
.macro idct32_end
        butterfly_4s    v16, v5,  v4,  v5  // v16 = t16a, v5  = t19a
        butterfly_4s    v17, v20, v23, v20 // v17 = t17,  v20 = t18
        butterfly_4s    v18, v6,  v7,  v6  // v18 = t23a, v6  = t20a
        butterfly_4s    v19, v21, v22, v21 // v19 = t22,  v21 = t21
        butterfly_4s    v4,  v28, v28, v30 // v4  = t24a, v28 = t27a
        butterfly_4s    v23, v26, v25, v26 // v23 = t25,  v26 = t26
        butterfly_4s    v7,  v8,  v29, v31 // v7  = t31a, v8  = t28a
        butterfly_4s    v22, v27, v24, v27 // v22 = t30,  v27 = t29

        dmbutterfly     v27, v20, v0.s[2], v0.s[3], v24, v25, v30, v31        // v27 = t18a, v20 = t29a
        dmbutterfly     v8,  v5,  v0.s[2], v0.s[3], v24, v25, v30, v31        // v8  = t19,  v5  = t28
        dmbutterfly     v28, v6,  v0.s[2], v0.s[3], v24, v25, v30, v31, neg=1 // v28 = t27,  v6  = t20
        dmbutterfly     v26, v21, v0.s[2], v0.s[3], v24, v25, v30, v31, neg=1 // v26 = t26a, v21 = t21a

        butterfly_4s    v31, v24, v7,  v4  // v31 = t31,  v24 = t24
        butterfly_4s    v30, v25, v22, v23 // v30 = t30a, v25 = t25a
        butterfly_4s_r  v23, v16, v16, v18 // v23 = t23,  v16 = t16
        butterfly_4s_r  v22, v17, v17, v19 // v22 = t22a, v17 = t17a
        butterfly_4s    v18, v21, v27, v21 // v18 = t18,  v21 = t21
        butterfly_4s_r  v27, v28, v5,  v28 // v27 = t27a, v28 = t28a
        butterfly_4s    v29, v26, v20, v26 // v29 = t29,  v26 = t26
        butterfly_4s    v19, v20, v8,  v6  // v19 = t19a, v20 = t20

        dmbutterfly0    v27, v20, v27, v20, v4, v5, v6, v7, v8, v9 // v27 = t27,  v20 = t20
        dmbutterfly0    v26, v21, v26, v21, v4, v5, v6, v7, v8, v9 // v26 = t26a, v21 = t21a
        dmbutterfly0    v25, v22, v25, v22, v4, v5, v6, v7, v8, v9 // v25 = t25,  v22 = t22
        dmbutterfly0    v24, v23, v24, v23, v4, v5, v6, v7, v8, v9 // v24 = t24a, v23 = t23a
        ret
.endm

// Odd half of the 32 point idct, operating in place on v16-v31.
// The coefficients are taken from v10-v13, v1 and (in idct32_end) v0;
// these are presumably loaded by the caller, which is not visible
// here - TODO confirm. v4-v9 are used as temporaries. The function
// returns through the ret at the end of idct32_end.
function idct32_odd
        dmbutterfly     v16, v31, v10.s[0], v10.s[1], v4, v5, v6, v7 // v16 = t16a, v31 = t31a
        dmbutterfly     v24, v23, v10.s[2], v10.s[3], v4, v5, v6, v7 // v24 = t17a, v23 = t30a
        dmbutterfly     v20, v27, v11.s[0], v11.s[1], v4, v5, v6, v7 // v20 = t18a, v27 = t29a
        dmbutterfly     v28, v19, v11.s[2], v11.s[3], v4, v5, v6, v7 // v28 = t19a, v19 = t28a
        dmbutterfly     v18, v29, v12.s[0], v12.s[1], v4, v5, v6, v7 // v18 = t20a, v29 = t27a
        dmbutterfly     v26, v21, v12.s[2], v12.s[3], v4, v5, v6, v7 // v26 = t21a, v21 = t26a
        dmbutterfly     v22, v25, v13.s[0], v13.s[1], v4, v5, v6, v7 // v22 = t22a, v25 = t25a
        dmbutterfly     v30, v17, v13.s[2], v13.s[3], v4, v5, v6, v7 // v30 = t23a, v17 = t24a

        butterfly_4s    v4,  v24, v16, v24 // v4  = t16, v24 = t17
        butterfly_4s    v5,  v20, v28, v20 // v5  = t19, v20 = t18
        butterfly_4s    v6,  v26, v18, v26 // v6  = t20, v26 = t21
        butterfly_4s    v7,  v22, v30, v22 // v7  = t23, v22 = t22
        butterfly_4s    v28, v25, v17, v25 // v28 = t24, v25 = t25
        butterfly_4s    v30, v21, v29, v21 // v30 = t27, v21 = t26
        butterfly_4s    v29, v23, v31, v23 // v29 = t31, v23 = t30
        butterfly_4s    v31, v27, v19, v27 // v31 = t28, v27 = t29

        dmbutterfly     v23, v24, v1.s[0], v1.s[1], v16, v17, v18, v19        // v23 = t17a, v24 = t30a
        dmbutterfly     v27, v20, v1.s[0], v1.s[1], v16, v17, v18, v19, neg=1 // v27 = t29a, v20 = t18a
        dmbutterfly     v21, v26, v1.s[2], v1.s[3], v16, v17, v18, v19        // v21 = t21a, v26 = t26a
        dmbutterfly     v25, v22, v1.s[2], v1.s[3], v16, v17, v18, v19, neg=1 // v25 = t25a, v22 = t22a
        idct32_end
endfunc

// Same as idct32_odd, but the first stage uses the dmbutterfly_h1 and
// dmbutterfly_h2 variants; presumably these omit the multiplications
// with inputs that are known to be zero when only the first half of
// the inputs is set - TODO confirm against the macros.
function idct32_odd_half
        dmbutterfly_h1  v16, v31, v10.s[0], v10.s[1], v4, v5, v6, v7 // v16 = t16a, v31 = t31a
        dmbutterfly_h2  v24, v23, v10.s[2], v10.s[3], v4, v5, v6, v7 // v24 = t17a, v23 = t30a
        dmbutterfly_h1  v20, v27, v11.s[0], v11.s[1], v4, v5, v6, v7 // v20 = t18a, v27 = t29a
        dmbutterfly_h2  v28, v19, v11.s[2], v11.s[3], v4, v5, v6, v7 // v28 = t19a, v19 = t28a
        dmbutterfly_h1  v18, v29, v12.s[0], v12.s[1], v4, v5, v6, v7 // v18 = t20a, v29 = t27a
        dmbutterfly_h2  v26, v21, v12.s[2], v12.s[3], v4, v5, v6, v7 // v26 = t21a, v21 = t26a
        dmbutterfly_h1  v22, v25, v13.s[0], v13.s[1], v4, v5, v6, v7 // v22 = t22a, v25 = t25a
        dmbutterfly_h2  v30, v17, v13.s[2], v13.s[3], v4, v5, v6, v7 // v30 = t23a, v17 = t24a

        butterfly_4s    v4,  v24, v16, v24 // v4  = t16, v24 = t17
        butterfly_4s    v5,  v20, v28, v20 // v5  = t19, v20 = t18
        butterfly_4s    v6,  v26, v18, v26 // v6  = t20, v26 = t21
        butterfly_4s    v7,  v22, v30, v22 // v7  = t23, v22 = t22
        butterfly_4s    v28, v25, v17, v25 // v28 = t24, v25 = t25
        butterfly_4s    v30, v21, v29, v21 // v30 = t27, v21 = t26
        butterfly_4s    v29, v23, v31, v23 // v29 = t31, v23 = t30
        butterfly_4s    v31, v27, v19, v27 // v31 = t28, v27 = t29

        dmbutterfly     v23, v24, v1.s[0], v1.s[1], v16, v17, v18, v19        // v23 = t17a, v24 = t30a
        dmbutterfly     v27, v20, v1.s[0], v1.s[1], v16, v17, v18, v19, neg=1 // v27 = t29a, v20 = t18a
        dmbutterfly     v21, v26, v1.s[2], v1.s[3], v16, v17, v18, v19        // v21 = t21a, v26 = t26a
        dmbutterfly     v25, v22, v1.s[2], v1.s[3], v16, v17, v18, v19, neg=1 // v25 = t25a, v22 = t22a
        idct32_end
endfunc

// Variant of idct32_odd that only reads v16-v19 as inputs. The first
// two stages are written out as plain multiplications (dsmull_h) with
// 14 bit rounding narrowing (drshrn_h) instead of butterflies; the
// results are left in the registers that idct32_end expects.
function idct32_odd_quarter
        dsmull_h        v4,  v5,  v16, v10.s[0]
        dsmull_h        v28, v29, v19, v11.s[3]
        dsmull_h        v30, v31, v16, v10.s[1]
        dsmull_h        v22, v23, v17, v13.s[2]
        dsmull_h        v7,  v6,  v17, v13.s[3]
        dsmull_h        v26, v27, v19, v11.s[2]
        dsmull_h        v20, v21, v18, v12.s[0]
        dsmull_h        v24, v25, v18, v12.s[1]

        // Two of the products are needed with inverted sign; negate the
        // 64 bit products before they are rounded and narrowed.
        neg             v28.2d, v28.2d
        neg             v29.2d, v29.2d
        neg             v7.2d,  v7.2d
        neg             v6.2d,  v6.2d

        drshrn_h        v4,  v4,  v5,  #14
        drshrn_h        v5,  v28, v29, #14
        drshrn_h        v29, v30, v31, #14
        drshrn_h        v28, v22, v23, #14
        drshrn_h        v7,  v7,  v6,  #14
        drshrn_h        v31, v26, v27, #14
        drshrn_h        v6,  v20, v21, #14
        drshrn_h        v30, v24, v25, #14

        dmbutterfly_l   v16, v17, v18, v19, v29, v4,  v1.s[0], v1.s[1]
        dmbutterfly_l   v27, v26, v20, v21, v31, v5,  v1.s[0], v1.s[1]
        drshrn_h        v23, v16, v17, #14
        drshrn_h        v24, v18, v19, #14
        neg             v20.2d, v20.2d
        neg             v21.2d, v21.2d
        drshrn_h        v27, v27, v26, #14
        drshrn_h        v20, v20, v21, #14
        dmbutterfly_l   v16, v17, v18, v19, v30, v6,  v1.s[2], v1.s[3]
        drshrn_h        v21, v16, v17, #14
        drshrn_h        v26, v18, v19, #14
        dmbutterfly_l   v16, v17, v18, v19, v28, v7,  v1.s[2], v1.s[3]
        drshrn_h        v25, v16, v17, #14
        neg             v18.2d, v18.2d
        neg             v19.2d, v19.2d
        drshrn_h        v22, v18, v19, #14

        idct32_end
endfunc

.macro idct32_funcs suffix
// Do an 32-point IDCT of a 4x32 slice out of a 32x32 matrix.
// The 32-point IDCT can be decomposed into two 16-point IDCTs;
// a normal IDCT16 with every other input component (the even ones, with
// each output written twice), followed by a separate 16-point IDCT
// of the odd inputs, added/subtracted onto the outputs of the first idct16.
// x0 = dst (temp buffer)
// x1 = unused
// x2 = src
// x9 = double input stride
function idct32_1d_4x32_pass1\suffix\()_neon
        mov             x14, x30

        movi            v4.4s,  #0

        // v16 = IN(0), v17 = IN(2) ... v31 = IN(30)
.ifb \suffix
.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
        load_clear      \i, x2, x9
.endr
.endif
.ifc \suffix,_quarter
.irp i, 16, 17, 18, 19
        load_clear      \i, x2, x9
.endr
.endif
.ifc \suffix,_half
.irp i, 16, 17, 18, 19, 20, 21, 22, 23
        load_clear      \i, x2, x9
.endr
.endif

        bl              idct16\suffix

        // Do four 4x4 transposes.
Originally, v16-v31 contain the 1615cabdff1aSopenharmony_ci // 16 rows. Afterwards, v16-v19, v20-v23, v24-v27 and v28-v31 1616cabdff1aSopenharmony_ci // contain the four transposed 4x4 blocks. 1617cabdff1aSopenharmony_ci transpose_4x4s v16, v17, v18, v19, v4, v5, v6, v7 1618cabdff1aSopenharmony_ci transpose_4x4s v20, v21, v22, v23, v4, v5, v6, v7 1619cabdff1aSopenharmony_ci transpose_4x4s v24, v25, v26, v27, v4, v5, v6, v7 1620cabdff1aSopenharmony_ci transpose_4x4s v28, v29, v30, v31, v4, v5, v6, v7 1621cabdff1aSopenharmony_ci 1622cabdff1aSopenharmony_ci // Store the registers a, b, c, d horizontally, followed by the 1623cabdff1aSopenharmony_ci // same registers d, c, b, a mirrored. 1624cabdff1aSopenharmony_ci.macro store_rev a, b, c, d 1625cabdff1aSopenharmony_ci // There's no rev128 instruction, but we reverse each 64 bit 1626cabdff1aSopenharmony_ci // half, and then flip them using an ext with 8 bytes offset. 1627cabdff1aSopenharmony_ci rev64 v7.4s, \d 1628cabdff1aSopenharmony_ci st1 {\a}, [x0], #16 1629cabdff1aSopenharmony_ci ext v7.16b, v7.16b, v7.16b, #8 1630cabdff1aSopenharmony_ci st1 {\b}, [x0], #16 1631cabdff1aSopenharmony_ci rev64 v6.4s, \c 1632cabdff1aSopenharmony_ci st1 {\c}, [x0], #16 1633cabdff1aSopenharmony_ci ext v6.16b, v6.16b, v6.16b, #8 1634cabdff1aSopenharmony_ci st1 {\d}, [x0], #16 1635cabdff1aSopenharmony_ci rev64 v5.4s, \b 1636cabdff1aSopenharmony_ci st1 {v7.4s}, [x0], #16 1637cabdff1aSopenharmony_ci ext v5.16b, v5.16b, v5.16b, #8 1638cabdff1aSopenharmony_ci st1 {v6.4s}, [x0], #16 1639cabdff1aSopenharmony_ci rev64 v4.4s, \a 1640cabdff1aSopenharmony_ci st1 {v5.4s}, [x0], #16 1641cabdff1aSopenharmony_ci ext v4.16b, v4.16b, v4.16b, #8 1642cabdff1aSopenharmony_ci st1 {v4.4s}, [x0], #16 1643cabdff1aSopenharmony_ci.endm 1644cabdff1aSopenharmony_ci store_rev v16.4s, v20.4s, v24.4s, v28.4s 1645cabdff1aSopenharmony_ci store_rev v17.4s, v21.4s, v25.4s, v29.4s 1646cabdff1aSopenharmony_ci store_rev v18.4s, v22.4s, v26.4s, v30.4s 
1647cabdff1aSopenharmony_ci store_rev v19.4s, v23.4s, v27.4s, v31.4s 1648cabdff1aSopenharmony_ci sub x0, x0, #512 1649cabdff1aSopenharmony_ci.purgem store_rev 1650cabdff1aSopenharmony_ci 1651cabdff1aSopenharmony_ci // Move x2 back to the start of the input, and move 1652cabdff1aSopenharmony_ci // to the first odd row 1653cabdff1aSopenharmony_ci.ifb \suffix 1654cabdff1aSopenharmony_ci sub x2, x2, x9, lsl #4 1655cabdff1aSopenharmony_ci.endif 1656cabdff1aSopenharmony_ci.ifc \suffix,_quarter 1657cabdff1aSopenharmony_ci sub x2, x2, x9, lsl #2 1658cabdff1aSopenharmony_ci.endif 1659cabdff1aSopenharmony_ci.ifc \suffix,_half 1660cabdff1aSopenharmony_ci sub x2, x2, x9, lsl #3 1661cabdff1aSopenharmony_ci.endif 1662cabdff1aSopenharmony_ci add x2, x2, #128 1663cabdff1aSopenharmony_ci 1664cabdff1aSopenharmony_ci movi v4.4s, #0 1665cabdff1aSopenharmony_ci // v16 = IN(1), v17 = IN(3) ... v31 = IN(31) 1666cabdff1aSopenharmony_ci.ifb \suffix 1667cabdff1aSopenharmony_ci.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31 1668cabdff1aSopenharmony_ci load_clear \i, x2, x9 1669cabdff1aSopenharmony_ci.endr 1670cabdff1aSopenharmony_ci.endif 1671cabdff1aSopenharmony_ci.ifc \suffix,_quarter 1672cabdff1aSopenharmony_ci.irp i, 16, 17, 18, 19 1673cabdff1aSopenharmony_ci load_clear \i, x2, x9 1674cabdff1aSopenharmony_ci.endr 1675cabdff1aSopenharmony_ci.endif 1676cabdff1aSopenharmony_ci.ifc \suffix,_half 1677cabdff1aSopenharmony_ci.irp i, 16, 17, 18, 19, 20, 21, 22, 23 1678cabdff1aSopenharmony_ci load_clear \i, x2, x9 1679cabdff1aSopenharmony_ci.endr 1680cabdff1aSopenharmony_ci.endif 1681cabdff1aSopenharmony_ci 1682cabdff1aSopenharmony_ci bl idct32_odd\suffix 1683cabdff1aSopenharmony_ci 1684cabdff1aSopenharmony_ci transpose_4x4s v31, v30, v29, v28, v4, v5, v6, v7 1685cabdff1aSopenharmony_ci transpose_4x4s v27, v26, v25, v24, v4, v5, v6, v7 1686cabdff1aSopenharmony_ci transpose_4x4s v23, v22, v21, v20, v4, v5, v6, v7 1687cabdff1aSopenharmony_ci transpose_4x4s v19, v18, v17, v16, 
v4, v5, v6, v7 1688cabdff1aSopenharmony_ci 1689cabdff1aSopenharmony_ci // Store the registers a, b, c, d horizontally, 1690cabdff1aSopenharmony_ci // adding into the output first, and the mirrored, 1691cabdff1aSopenharmony_ci // subtracted from the output. 1692cabdff1aSopenharmony_ci.macro store_rev a, b, c, d, a16b, b16b 1693cabdff1aSopenharmony_ci ld1 {v4.4s}, [x0] 1694cabdff1aSopenharmony_ci rev64 v9.4s, \d 1695cabdff1aSopenharmony_ci add v4.4s, v4.4s, \a 1696cabdff1aSopenharmony_ci st1 {v4.4s}, [x0], #16 1697cabdff1aSopenharmony_ci rev64 v8.4s, \c 1698cabdff1aSopenharmony_ci ld1 {v4.4s}, [x0] 1699cabdff1aSopenharmony_ci ext v9.16b, v9.16b, v9.16b, #8 1700cabdff1aSopenharmony_ci add v4.4s, v4.4s, \b 1701cabdff1aSopenharmony_ci st1 {v4.4s}, [x0], #16 1702cabdff1aSopenharmony_ci ext v8.16b, v8.16b, v8.16b, #8 1703cabdff1aSopenharmony_ci ld1 {v4.4s}, [x0] 1704cabdff1aSopenharmony_ci rev64 \b, \b 1705cabdff1aSopenharmony_ci add v4.4s, v4.4s, \c 1706cabdff1aSopenharmony_ci st1 {v4.4s}, [x0], #16 1707cabdff1aSopenharmony_ci rev64 \a, \a 1708cabdff1aSopenharmony_ci ld1 {v4.4s}, [x0] 1709cabdff1aSopenharmony_ci ext \b16b, \b16b, \b16b, #8 1710cabdff1aSopenharmony_ci add v4.4s, v4.4s, \d 1711cabdff1aSopenharmony_ci st1 {v4.4s}, [x0], #16 1712cabdff1aSopenharmony_ci ext \a16b, \a16b, \a16b, #8 1713cabdff1aSopenharmony_ci ld1 {v4.4s}, [x0] 1714cabdff1aSopenharmony_ci sub v4.4s, v4.4s, v9.4s 1715cabdff1aSopenharmony_ci st1 {v4.4s}, [x0], #16 1716cabdff1aSopenharmony_ci ld1 {v4.4s}, [x0] 1717cabdff1aSopenharmony_ci sub v4.4s, v4.4s, v8.4s 1718cabdff1aSopenharmony_ci st1 {v4.4s}, [x0], #16 1719cabdff1aSopenharmony_ci ld1 {v4.4s}, [x0] 1720cabdff1aSopenharmony_ci sub v4.4s, v4.4s, \b 1721cabdff1aSopenharmony_ci st1 {v4.4s}, [x0], #16 1722cabdff1aSopenharmony_ci ld1 {v4.4s}, [x0] 1723cabdff1aSopenharmony_ci sub v4.4s, v4.4s, \a 1724cabdff1aSopenharmony_ci st1 {v4.4s}, [x0], #16 1725cabdff1aSopenharmony_ci.endm 1726cabdff1aSopenharmony_ci 1727cabdff1aSopenharmony_ci store_rev 
v31.4s, v27.4s, v23.4s, v19.4s, v31.16b, v27.16b
        store_rev       v30.4s, v26.4s, v22.4s, v18.4s, v30.16b, v26.16b
        store_rev       v29.4s, v25.4s, v21.4s, v17.4s, v29.16b, v25.16b
        store_rev       v28.4s, v24.4s, v20.4s, v16.4s, v28.16b, v24.16b
.purgem store_rev
        ret             x14
endfunc

// Second pass of the 32 point IDCT for one slice that is four elements wide.
// This is mostly the same as 4x32_pass1, but without the transpose; it uses
// the source as temp buffer between the two halves of the transform (the
// results of the even half are written back over the rows they were read
// from), and adds the rounded result into the destination.
//
// In:
//   x0  = dst
//   x1  = dst stride
//   x2  = src (temp buffer)
//   x7  = negative double temp buffer stride
//   x9  = double temp buffer stride
//   v15 = upper bound applied to every output pixel (umin below); the
//         32x32 callers fill it with the maximum pixel value from w13
// Scratch visible in this function: x14 (return address), v4-v9, v16-v31.
// idct16\suffix and idct32_odd\suffix are defined elsewhere in the file;
// whatever coefficient registers they expect must already be set up by the
// caller - NOTE(review): confirm against those helpers.
// On return x0 has advanced by 32 destination rows (eight groups of four).
function idct32_1d_4x32_pass2\suffix\()_neon
        // Every bl below overwrites x30, so keep the return address in x14
        mov             x14, x30

        // Even half. Only the rows this variant consumes are fetched: 16 for
        // the full transform, 4 for _quarter, 8 for _half. `load` is a helper
        // macro defined outside this excerpt; the rewind after each run
        // implies it post-increments x2 by x9 per register - TODO confirm.
        // v16 = IN(0), v17 = IN(2) ... v31 = IN(30)
.ifb \suffix
.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
        load            \i,  x2,  x9
.endr
        // Rewind x2 by the 16 steps just taken
        sub             x2,  x2,  x9, lsl #4
.endif
.ifc \suffix,_quarter
.irp i, 16, 17, 18, 19
        load            \i,  x2,  x9
.endr
        // Rewind x2 by the 4 steps just taken
        sub             x2,  x2,  x9, lsl #2
.endif
.ifc \suffix,_half
.irp i, 16, 17, 18, 19, 20, 21, 22, 23
        load            \i,  x2,  x9
.endr
        // Rewind x2 by the 8 steps just taken
        sub             x2,  x2,  x9, lsl #3
.endif

        bl              idct16\suffix

        // Park all 16 results of the even half in the temp buffer, starting
        // at the row x2 was rewound to, so that v16-v31 are free for the odd
        // half. All variants write 16 registers here, regardless of how many
        // were loaded above.
.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
        store           \i,  x2,  x9
.endr

        sub             x2,  x2,  x9, lsl #4
        // Step to the in-between rows: x9 is the double stride (the 32x32
        // callers set it to 256), so #128 is a single row.
        add             x2,  x2,  #128

        // Odd half, same per-variant row counts as above.
        // v16 = IN(1), v17 = IN(3) ... v31 = IN(31)
.ifb \suffix
.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
        load            \i,  x2,  x9
.endr
        sub             x2,  x2,  x9, lsl #4
.endif
.ifc \suffix,_quarter
.irp i, 16, 17, 18, 19
        load            \i,  x2,  x9
.endr
        sub             x2,  x2,  x9, lsl #2
.endif
.ifc \suffix,_half
.irp i, 16, 17, 18, 19, 20, 21, 22, 23
        load            \i,  x2,  x9
.endr
        sub             x2,  x2,  x9, lsl #3
.endif
        // Back to the first row, where the even half results were parked
        sub             x2,  x2,  #128

        bl              idct32_odd\suffix

        // load_acc_store a, b, c, d, neg=0
        // Produces four destination rows of four pixels each.
        //   neg == 0: reads four parked rows walking forwards (x9) and adds
        //             a..d to them.
        //   neg != 0: reads four parked rows walking backwards (x7) and
        //             subtracts a..d from them.
        // The four sums are then rounded and shifted right by 6 (srshr),
        // added to the zero extended 16 bit destination pixels
        // (uaddw/uaddw2; rows 0-1 in v8, rows 2-3 in v9), narrowed with
        // unsigned saturation (sqxtun, negative values become 0), limited to
        // v15 (umin) and stored back. The steps are interleaved across the
        // four rows; do not reorder without re-checking the dependencies.
        // Clobbers v4-v9. x0 ends up four rows further down, x2 four steps
        // further in the walking direction.
.macro load_acc_store a, b, c, d, neg=0
.if \neg == 0
        ld1             {v4.4s},  [x2], x9
        ld1             {v5.4s},  [x2], x9
        add             v4.4s,  v4.4s,  \a
        ld1             {v6.4s},  [x2], x9
        add             v5.4s,  v5.4s,  \b
        ld1             {v7.4s},  [x2], x9
        add             v6.4s,  v6.4s,  \c
        add             v7.4s,  v7.4s,  \d
.else
        ld1             {v4.4s},  [x2], x7
        ld1             {v5.4s},  [x2], x7
        sub             v4.4s,  v4.4s,  \a
        ld1             {v6.4s},  [x2], x7
        sub             v5.4s,  v5.4s,  \b
        ld1             {v7.4s},  [x2], x7
        sub             v6.4s,  v6.4s,  \c
        sub             v7.4s,  v7.4s,  \d
.endif
        ld1             {v8.4h},   [x0], x1
        ld1             {v8.d}[1], [x0], x1
        srshr           v4.4s,  v4.4s,  #6
        ld1             {v9.4h},   [x0], x1
        srshr           v5.4s,  v5.4s,  #6
        uaddw           v4.4s,  v4.4s,  v8.4h
        ld1             {v9.d}[1], [x0], x1
        srshr           v6.4s,  v6.4s,  #6
        uaddw2          v5.4s,  v5.4s,  v8.8h
        srshr           v7.4s,  v7.4s,  #6
        // Rewind dst by the four rows just read, so the stores below hit
        // the same rows
        sub             x0,  x0,  x1, lsl #2
        uaddw           v6.4s,  v6.4s,  v9.4h
        sqxtun          v4.4h,  v4.4s
        uaddw2          v7.4s,  v7.4s,  v9.8h
        sqxtun2         v4.8h,  v5.4s
        umin            v4.8h,  v4.8h,  v15.8h
        st1             {v4.4h},   [x0], x1
        sqxtun          v5.4h,  v6.4s
        st1             {v4.d}[1], [x0], x1
        sqxtun2         v5.8h,  v7.4s
        umin            v5.8h,  v5.8h,  v15.8h
        st1             {v5.4h},   [x0], x1
        st1             {v5.d}[1], [x0], x1
.endm
        // First 16 output rows: parked rows in ascending order, each added
        // to the odd half registers taken in descending order (v31 first).
        load_acc_store  v31.4s, v30.4s, v29.4s, v28.4s
        load_acc_store  v27.4s, v26.4s, v25.4s, v24.4s
        load_acc_store  v23.4s, v22.4s, v21.4s, v20.4s
        load_acc_store  v19.4s, v18.4s, v17.4s, v16.4s
        // x2 is now one step past the last parked row; step back onto it
        // before walking the buffer in reverse.
        sub             x2,  x2,  x9
        // Last 16 output rows: parked rows in descending order, minus the
        // odd half registers taken in ascending order (v16 first).
        load_acc_store  v16.4s, v17.4s, v18.4s, v19.4s, 1
        load_acc_store  v20.4s, v21.4s, v22.4s, v23.4s, 1
        load_acc_store  v24.4s, v25.4s, v26.4s, v27.4s, 1
        load_acc_store  v28.4s, v29.4s, v30.4s, v31.4s, 1
.purgem load_acc_store
        ret             x14
endfunc
.endm
// Instantiate the 32 point helper functions three times: the full
// transform, plus the _quarter and _half variants that consume fewer
// input rows.
idct32_funcs
idct32_funcs _quarter
idct32_funcs _half

// eob thresholds for pass 1 of the full 32x32 transform. Starting with the
// second entry, one value is read before each slice after the first; if eob
// is not above it, that slice and all remaining ones are zero filled instead
// of being transformed.
const min_eob_idct_idct_32, align=4
        .short  0, 9, 34, 70
        .short  135, 240, 336, 448
endconst

// Bit depth independent body of the 32x32 idct_idct add functions; entered
// through the _10/_12 entry points further down.
//
// In:
//   x0  = dst
//   x1  = dst stride
//   x2  = input coefficients (handed to pass 1 as its source)
//   w3  = eob
//   w13 = maximum pixel value for the bit depth
// Register roles while the helpers run:
//   x4, x5, x6     = copies of the incoming x0, x1, x2
//   x9, x7         = +256 / -256, i.e. two 128 byte temp rows forwards/back
//   x10            = read cursor into idct_coeffs
//   x12            = read cursor into min_eob_idct_idct_32
//   x15            = return address
//   v0-v3, v10-v13 = idct_coeffs, widened to 32 bit
//   v15            = maximum pixel value in all eight 16 bit lanes
// Stack: 64 bytes for d8-d15 plus a 4096 byte temp buffer (32 rows of
// 128 bytes) that carries the pass 1 output over to pass 2.
function vp9_idct_idct_32x32_add_16_neon
        // eob == 1 is handled by a separate routine. Nothing has been saved
        // yet and x30 still holds the caller's return address.
        cmp             w3,  #1
        b.eq            idct32x32_dc_add_neon

        movrel          x10, idct_coeffs

        // Prologue: every bl below overwrites x30, so keep it in x15, and
        // preserve the low halves of v8-v15.
        mov             x15, x30
        stp             d8,  d9,  [sp, #-16]!
        stp             d10, d11, [sp, #-16]!
        stp             d12, d13, [sp, #-16]!
        stp             d14, d15, [sp, #-16]!

        // Temp buffer between the two passes
        sub             sp,  sp,  #4096

        // x0-x2 are reloaded for each helper call; park the originals
        mov             x4,  x0
        mov             x5,  x1
        mov             x6,  x2

        // The helpers read every other 128 byte row, so they step by two
        // rows: x9 forwards, x7 backwards.
        mov             x9,  #256
        neg             x7,  x9

        // First 16 coefficients, sign extended into v0-v3. v1 and v0 are
        // still needed as sources, so they are overwritten last.
        ld1             {v0.8h,v1.8h},   [x10], #32
        sxtl            v2.4s,  v1.4h
        sxtl2           v3.4s,  v1.8h
        sxtl2           v1.4s,  v0.8h
        sxtl            v0.4s,  v0.4h
        // Next 16 coefficients into v10-v13, same scheme
        ld1             {v10.8h,v11.8h}, [x10]
        sxtl            v12.4s, v11.4h
        sxtl2           v13.4s, v11.8h
        sxtl2           v11.4s, v10.8h
        sxtl            v10.4s, v10.4h

        // Pixel maximum in every lane; pass 2 clamps against it
        dup             v15.8h, w13

        // Small eob: continue in a reduced variant. These are plain
        // branches - the variants run on this frame and return via x15.
        cmp             w3,  #34
        b.le            idct32x32_quarter_add_16_neon
        cmp             w3,  #135
        b.le            idct32x32_half_add_16_neon

        // Threshold cursor, starting at the second table entry
        movrel          x12, min_eob_idct_idct_32, 2

        // Pass 1, first slice: always transformed
        add             x0,  sp,  #(0*128)
        add             x2,  x6,  #(0*4)
        bl              idct32_1d_4x32_pass1_neon

        // Pass 1, remaining slices. Before each one, compare eob with the
        // next threshold; on bail-out x0 already points at the first temp
        // row still to be written and x1 holds the number of four row
        // groups left. The mov sits between cmp and b.le because it leaves
        // the flags alone (and only then may overwrite the loaded w1).
.irp slice, 4, 8, 12, 16, 20, 24, 28
        add             x0,  sp,  #(\slice*128)
        ldrh            w1,  [x12], #2
        cmp             w3,  w1
        mov             x1,  #(32 - \slice)/4
        b.le            1f
        add             x2,  x6,  #(\slice*4)
        bl              idct32_1d_4x32_pass1_neon
.endr
        b               3f

1:
        // Clear the rest of the temp buffer for pass 2
.irp reg, 16, 17, 18, 19
        movi            v\reg\().4s,  #0
.endr
2:
        // One iteration = 8 * 64 bytes = four temp rows; st1 does not touch
        // the flags set by subs.
        subs            x1,  x1,  #1
.rept 8
        st1             {v16.4s,v17.4s,v18.4s,v19.4s},  [x0], #64
.endr
        b.ne            2b
3:
        // Pass 2: eight slices, each four 16 bit pixels wide in dst and
        // four 32 bit values wide in the temp buffer.
.irp col, 0, 4, 8, 12, 16, 20, 24, 28
        add             x0,  x4,  #(\col*2)
        mov             x1,  x5
        add             x2,  sp,  #(\col*4)
        bl              idct32_1d_4x32_pass2_neon
.endr

        // Epilogue: drop the temp buffer, restore d8-d15 in reverse order
        add             sp,  sp,  #4096
        ldp             d14, d15, [sp], #16
        ldp             d12, d13, [sp], #16
        ldp             d10, d11, [sp], #16
        ldp             d8,  d9,  [sp], #16

        ret             x15
endfunc

// Exported 10 bit entry point: sets the pixel maximum to 0x3ff and continues
// in the shared body, which returns directly to the caller.
function ff_vp9_idct_idct_32x32_add_10_neon, export=1
        mov             x13, #0x03ff
        b               vp9_idct_idct_32x32_add_16_neon
endfunc

// Exported 12 bit entry point: as above with a pixel maximum of 0xfff.
function ff_vp9_idct_idct_32x32_add_12_neon, export=1
        mov             x13, #0x0fff
        b               vp9_idct_idct_32x32_add_16_neon
endfunc

// idct32_partial size
// Emits idct32x32_<size>_add_16_neon for size = quarter or half. These are
// continuations of vp9_idct_idct_32x32_add_16_neon, reached by a branch
// after its prologue and register setup (quarter for eob <= 34, half for
// eob <= 135). They therefore have no prologue of their own, only the
// matching epilogue, and rely on x4-x7, x9, x15, w3 and the coefficient and
// clamp vectors being set up as in that function.
//
// Pass 1 transforms at most two (quarter) or four (half) slices. The last
// slice is skipped when eob is small enough (<= 9 for quarter, <= 70 for
// half) and its four temp rows are zero filled instead. Pass 2 always runs
// over all eight slices.
.macro idct32_partial size
function idct32x32_\size\()_add_16_neon
        // Slice 0: always transformed
        add             x0,  sp,  #(0*128)
        add             x2,  x6,  #(0*4)
        bl              idct32_1d_4x32_pass1_\size\()_neon

        // Slice 4: for the quarter variant this is the last one and may be
        // replaced by zeros
        add             x0,  sp,  #(4*128)
.ifc \size,quarter
        cmp             w3,  #9
        b.le            1f
.endif
        add             x2,  x6,  #(4*4)
        bl              idct32_1d_4x32_pass1_\size\()_neon

.ifc \size,half
        // Slice 8
        add             x0,  sp,  #(8*128)
        add             x2,  x6,  #(8*4)
        bl              idct32_1d_4x32_pass1_\size\()_neon

        // Slice 12: last one for the half variant, may be replaced by zeros
        add             x0,  sp,  #(12*128)
        cmp             w3,  #70
        b.le            1f
        add             x2,  x6,  #(12*4)
        bl              idct32_1d_4x32_pass1_\size\()_neon
.endif
        b               3f

1:
        // Zero the four temp rows (8 * 64 bytes) of the skipped slice; x0
        // already points at them.
.irp reg, 16, 17, 18, 19
        movi            v\reg\().4s,  #0
.endr

.rept 8
        st1             {v16.4s,v17.4s,v18.4s,v19.4s},  [x0], #64
.endr

3:
        // Pass 2 over all eight slices
.irp col, 0, 4, 8, 12, 16, 20, 24, 28
        add             x0,  x4,  #(\col*2)
        mov             x1,  x5
        add             x2,  sp,  #(\col*4)
        bl              idct32_1d_4x32_pass2_\size\()_neon
.endr

        // Epilogue matching the prologue of vp9_idct_idct_32x32_add_16_neon
        add             sp,  sp,  #4096
        ldp             d14, d15, [sp], #16
        ldp             d12, d13, [sp], #16
        ldp             d10, d11, [sp], #16
        ldp             d8,  d9,  [sp], #16

        ret             x15
endfunc
.endm

idct32_partial quarter
idct32_partial half