/*
 * Copyright (c) 2017 Google Inc.
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/aarch64/asm.S"
#include "neon.S"

const itxfm4_coeffs, align=4
        .short  11585, 0, 6270, 15137
iadst4_coeffs:
        .short  5283, 15212, 9929, 13377
endconst

const iadst8_coeffs, align=4
        .short  16305, 1606, 14449, 7723, 10394, 12665, 4756, 15679
idct_coeffs:
        .short  11585, 0, 6270, 15137, 3196, 16069, 13623, 9102
        .short  1606, 16305, 12665, 10394, 7723, 14449, 15679, 4756
        .short  804, 16364, 12140, 11003, 7005, 14811, 15426, 5520
        .short  3981, 15893, 14053, 8423, 9760, 13160, 16207, 2404
endconst

const iadst16_coeffs, align=4
        .short  16364, 804, 15893, 3981, 11003, 12140, 8423, 14053
        .short  14811, 7005, 13160, 9760, 5520, 15426, 2404, 16207
endconst

.macro transpose_4x4s r0, r1, r2, r3, r4, r5, r6, r7
        trn1            \r4\().4s, \r0\().4s, \r1\().4s
        trn2            \r5\().4s, \r0\().4s, \r1\().4s
        trn1            \r6\().4s, \r2\().4s, \r3\().4s
        trn2            \r7\().4s, \r2\().4s, \r3\().4s
        trn1            \r0\().2d, \r4\().2d, \r6\().2d
        trn2            \r2\().2d, \r4\().2d, \r6\().2d
        trn1            \r1\().2d, \r5\().2d, \r7\().2d
        trn2            \r3\().2d, \r5\().2d, \r7\().2d
.endm

// Transpose a 8x8 matrix of 32 bit elements, where each row is spread out
// over two registers.
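// The rows are held in the register pairs (r0,r1), (r2,r3), ..., (r14,r15);
// t0-t3 are used as scratch registers.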
.macro transpose_8x8s r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, r10, r11, r12, r13, r14, r15, t0, t1, t2, t3
        transpose_4x4s  \r0,  \r2,  \r4,  \r6,  \t0, \t1, \t2, \t3
        transpose_4x4s  \r9,  \r11, \r13, \r15, \t0, \t1, \t2, \t3

        // Do 4x4 transposes of r1,r3,r5,r7 and r8,r10,r12,r14
        // while swapping the two 4x4 matrices between each other

        // First step of the 4x4 transpose of r1-r7, into t0-t3
        trn1            \t0\().4s,  \r1\().4s,  \r3\().4s
        trn2            \t1\().4s,  \r1\().4s,  \r3\().4s
        trn1            \t2\().4s,  \r5\().4s,  \r7\().4s
        trn2            \t3\().4s,  \r5\().4s,  \r7\().4s

        // First step of the 4x4 transpose of r8-r14, into r1-r7
        trn1            \r1\().4s,  \r8\().4s,  \r10\().4s
        trn2            \r3\().4s,  \r8\().4s,  \r10\().4s
        trn1            \r5\().4s,  \r12\().4s, \r14\().4s
        trn2            \r7\().4s,  \r12\().4s, \r14\().4s

        // Second step of the 4x4 transpose of r1-r7 (now in t0-t3), into r8-r14
        trn1            \r8\().2d,  \t0\().2d,  \t2\().2d
        trn2            \r12\().2d, \t0\().2d,  \t2\().2d
        trn1            \r10\().2d, \t1\().2d,  \t3\().2d
        trn2            \r14\().2d, \t1\().2d,  \t3\().2d

        // Second step of the 4x4 transpose of r8-r14 (now in r1-r7), in place as far as possible
        trn1            \t0\().2d,  \r1\().2d,  \r5\().2d
        trn2            \r5\().2d,  \r1\().2d,  \r5\().2d
        trn1            \t1\().2d,  \r3\().2d,  \r7\().2d
        trn2            \r7\().2d,  \r3\().2d,  \r7\().2d

        // Move the outputs of trn1 back in place
        mov             \r1\().16b, \t0\().16b
        mov             \r3\().16b, \t1\().16b
.endm

// out1 = ((in1 + in2) * v0.s[0] + (1 << 13)) >> 14
// out2 = ((in1 - in2) * v0.s[0] + (1 << 13)) >> 14
// in/out are .4s registers; this can do with 4 temp registers, but is
// more efficient if 6 temp registers are available.
.macro dmbutterfly0 out1, out2, in1, in2, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, neg=0
.if \neg > 0
        neg             \tmp4\().4s, v0.4s
.endif
        add             \tmp1\().4s, \in1\().4s,  \in2\().4s
        sub             \tmp2\().4s, \in1\().4s,  \in2\().4s
.if \neg > 0
        smull           \tmp3\().2d, \tmp1\().2s, \tmp4\().s[0]
        smull2          \tmp4\().2d, \tmp1\().4s, \tmp4\().s[0]
.else
        smull           \tmp3\().2d, \tmp1\().2s, v0.s[0]
        smull2          \tmp4\().2d, \tmp1\().4s, v0.s[0]
.endif
.ifb \tmp5
        rshrn           \out1\().2s, \tmp3\().2d, #14
        rshrn2          \out1\().4s, \tmp4\().2d, #14
        smull           \tmp3\().2d, \tmp2\().2s, v0.s[0]
        smull2          \tmp4\().2d, \tmp2\().4s, v0.s[0]
        rshrn           \out2\().2s, \tmp3\().2d, #14
        rshrn2          \out2\().4s, \tmp4\().2d, #14
.else
        smull           \tmp5\().2d, \tmp2\().2s, v0.s[0]
        smull2          \tmp6\().2d, \tmp2\().4s, v0.s[0]
        rshrn           \out1\().2s, \tmp3\().2d, #14
        rshrn2          \out1\().4s, \tmp4\().2d, #14
        rshrn           \out2\().2s, \tmp5\().2d, #14
        rshrn2          \out2\().4s, \tmp6\().2d, #14
.endif
.endm

// Same as dmbutterfly0 above, but treating the input in in2 as zero,
// writing the same output into both out1 and out2.
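// out1 = out2 = (in1 * v0.s[0] + (1 << 13)) >> 14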
129 .macro dmbutterfly0_h out1, out2, in1, in2, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6 130 smull \tmp1\().2d, \in1\().2s, v0.s[0] 131 smull2 \tmp2\().2d, \in1\().4s, v0.s[0] 132 rshrn \out1\().2s, \tmp1\().2d, #14 133 rshrn2 \out1\().4s, \tmp2\().2d, #14 134 rshrn \out2\().2s, \tmp1\().2d, #14 135 rshrn2 \out2\().4s, \tmp2\().2d, #14 136 .endm 137 138 // out1,out2 = in1 * coef1 - in2 * coef2 139 // out3,out4 = in1 * coef2 + in2 * coef1 140 // out are 4 x .2d registers, in are 2 x .4s registers 141 .macro dmbutterfly_l out1, out2, out3, out4, in1, in2, coef1, coef2 142 smull \out1\().2d, \in1\().2s, \coef1 143 smull2 \out2\().2d, \in1\().4s, \coef1 144 smull \out3\().2d, \in1\().2s, \coef2 145 smull2 \out4\().2d, \in1\().4s, \coef2 146 smlsl \out1\().2d, \in2\().2s, \coef2 147 smlsl2 \out2\().2d, \in2\().4s, \coef2 148 smlal \out3\().2d, \in2\().2s, \coef1 149 smlal2 \out4\().2d, \in2\().4s, \coef1 150 .endm 151 152 // inout1 = (inout1 * coef1 - inout2 * coef2 + (1 << 13)) >> 14 153 // inout2 = (inout1 * coef2 + inout2 * coef1 + (1 << 13)) >> 14 154 // inout are 2 x .4s registers 155 .macro dmbutterfly inout1, inout2, coef1, coef2, tmp1, tmp2, tmp3, tmp4, neg=0 156 dmbutterfly_l \tmp1, \tmp2, \tmp3, \tmp4, \inout1, \inout2, \coef1, \coef2 157 .if \neg > 0 158 neg \tmp3\().2d, \tmp3\().2d 159 neg \tmp4\().2d, \tmp4\().2d 160 .endif 161 rshrn \inout1\().2s, \tmp1\().2d, #14 162 rshrn2 \inout1\().4s, \tmp2\().2d, #14 163 rshrn \inout2\().2s, \tmp3\().2d, #14 164 rshrn2 \inout2\().4s, \tmp4\().2d, #14 165 .endm 166 167 // Same as dmbutterfly above, but treating the input in inout2 as zero 168 .macro dmbutterfly_h1 inout1, inout2, coef1, coef2, tmp1, tmp2, tmp3, tmp4 169 smull \tmp1\().2d, \inout1\().2s, \coef1 170 smull2 \tmp2\().2d, \inout1\().4s, \coef1 171 smull \tmp3\().2d, \inout1\().2s, \coef2 172 smull2 \tmp4\().2d, \inout1\().4s, \coef2 173 rshrn \inout1\().2s, \tmp1\().2d, #14 174 rshrn2 \inout1\().4s, \tmp2\().2d, #14 175 rshrn \inout2\().2s, \tmp3\().2d, #14 176 rshrn2 \inout2\().4s, \tmp4\().2d, #14 177 .endm 178 179 // Same as dmbutterfly above, but treating the input in inout1 as zero 180 .macro dmbutterfly_h2 inout1, inout2, coef1, coef2, tmp1, tmp2, tmp3, tmp4 181 smull \tmp1\().2d, \inout2\().2s, \coef2 182 smull2 \tmp2\().2d, \inout2\().4s, \coef2 183 smull \tmp3\().2d, \inout2\().2s, \coef1 184 smull2 \tmp4\().2d, \inout2\().4s, \coef1 185 neg \tmp1\().2d, \tmp1\().2d 186 neg \tmp2\().2d, \tmp2\().2d 187 rshrn \inout2\().2s, \tmp3\().2d, #14 188 rshrn2 \inout2\().4s, \tmp4\().2d, #14 189 rshrn \inout1\().2s, \tmp1\().2d, #14 190 rshrn2 \inout1\().4s, \tmp2\().2d, #14 191 .endm 192 193 .macro dsmull_h out1, out2, in, coef 194 smull \out1\().2d, \in\().2s, \coef 195 smull2 \out2\().2d, \in\().4s, \coef 196 .endm 197 198 .macro drshrn_h out, in1, in2, shift 199 rshrn \out\().2s, \in1\().2d, \shift 200 rshrn2 \out\().4s, \in2\().2d, \shift 201 .endm 202 203 204 // out1 = in1 + in2 205 // out2 = in1 - in2 206 .macro butterfly_4s out1, out2, in1, in2 207 add \out1\().4s, \in1\().4s, \in2\().4s 208 sub \out2\().4s, \in1\().4s, \in2\().4s 209 .endm 210 211 // out1 = in1 - in2 212 // out2 = in1 + in2 213 .macro butterfly_4s_r out1, out2, in1, in2 214 sub \out1\().4s, \in1\().4s, \in2\().4s 215 add \out2\().4s, \in1\().4s, \in2\().4s 216 .endm 217 218 // out1 = (in1,in2 + in3,in4 + (1 << 13)) >> 14 219 // out2 = (in1,in2 - in3,in4 + (1 << 13)) >> 14 220 // out are 2 x .4s registers, in are 4 x .2d registers 221 .macro dbutterfly_n out1, out2, in1, in2, in3, in4, tmp1, tmp2, tmp3, tmp4 222 
add \tmp1\().2d, \in1\().2d, \in3\().2d 223 add \tmp2\().2d, \in2\().2d, \in4\().2d 224 sub \tmp3\().2d, \in1\().2d, \in3\().2d 225 sub \tmp4\().2d, \in2\().2d, \in4\().2d 226 rshrn \out1\().2s, \tmp1\().2d, #14 227 rshrn2 \out1\().4s, \tmp2\().2d, #14 228 rshrn \out2\().2s, \tmp3\().2d, #14 229 rshrn2 \out2\().4s, \tmp4\().2d, #14 230 .endm 231 232 .macro iwht4_10 c0, c1, c2, c3 233 add \c0\().4s, \c0\().4s, \c1\().4s 234 sub v17.4s, \c2\().4s, \c3\().4s 235 sub v16.4s, \c0\().4s, v17.4s 236 sshr v16.4s, v16.4s, #1 237 sub \c2\().4s, v16.4s, \c1\().4s 238 sub \c1\().4s, v16.4s, \c3\().4s 239 add \c3\().4s, v17.4s, \c2\().4s 240 sub \c0\().4s, \c0\().4s, \c1\().4s 241 .endm 242 243 .macro iwht4_12 c0, c1, c2, c3 244 iwht4_10 \c0, \c1, \c2, \c3 245 .endm 246 247 .macro idct4_10 c0, c1, c2, c3 248 mul v22.4s, \c1\().4s, v0.s[3] 249 mul v20.4s, \c1\().4s, v0.s[2] 250 add v16.4s, \c0\().4s, \c2\().4s 251 sub v17.4s, \c0\().4s, \c2\().4s 252 mla v22.4s, \c3\().4s, v0.s[2] 253 mul v18.4s, v16.4s, v0.s[0] 254 mul v24.4s, v17.4s, v0.s[0] 255 mls v20.4s, \c3\().4s, v0.s[3] 256 srshr v22.4s, v22.4s, #14 257 srshr v18.4s, v18.4s, #14 258 srshr v24.4s, v24.4s, #14 259 srshr v20.4s, v20.4s, #14 260 add \c0\().4s, v18.4s, v22.4s 261 sub \c3\().4s, v18.4s, v22.4s 262 add \c1\().4s, v24.4s, v20.4s 263 sub \c2\().4s, v24.4s, v20.4s 264 .endm 265 266 .macro idct4_12 c0, c1, c2, c3 267 smull v22.2d, \c1\().2s, v0.s[3] 268 smull2 v23.2d, \c1\().4s, v0.s[3] 269 smull v20.2d, \c1\().2s, v0.s[2] 270 smull2 v21.2d, \c1\().4s, v0.s[2] 271 add v16.4s, \c0\().4s, \c2\().4s 272 sub v17.4s, \c0\().4s, \c2\().4s 273 smlal v22.2d, \c3\().2s, v0.s[2] 274 smlal2 v23.2d, \c3\().4s, v0.s[2] 275 smull v18.2d, v16.2s, v0.s[0] 276 smull2 v19.2d, v16.4s, v0.s[0] 277 smull v24.2d, v17.2s, v0.s[0] 278 smull2 v25.2d, v17.4s, v0.s[0] 279 smlsl v20.2d, \c3\().2s, v0.s[3] 280 smlsl2 v21.2d, \c3\().4s, v0.s[3] 281 rshrn v22.2s, v22.2d, #14 282 rshrn2 v22.4s, v23.2d, #14 283 rshrn v18.2s, v18.2d, #14 284 rshrn2 v18.4s, v19.2d, #14 285 rshrn v24.2s, v24.2d, #14 286 rshrn2 v24.4s, v25.2d, #14 287 rshrn v20.2s, v20.2d, #14 288 rshrn2 v20.4s, v21.2d, #14 289 add \c0\().4s, v18.4s, v22.4s 290 sub \c3\().4s, v18.4s, v22.4s 291 add \c1\().4s, v24.4s, v20.4s 292 sub \c2\().4s, v24.4s, v20.4s 293 .endm 294 295 .macro iadst4_10 c0, c1, c2, c3 296 mul v16.4s, \c0\().4s, v1.s[0] 297 mla v16.4s, \c2\().4s, v1.s[1] 298 mla v16.4s, \c3\().4s, v1.s[2] 299 mul v18.4s, \c0\().4s, v1.s[2] 300 mls v18.4s, \c2\().4s, v1.s[0] 301 sub \c0\().4s, \c0\().4s, \c2\().4s 302 mls v18.4s, \c3\().4s, v1.s[1] 303 add \c0\().4s, \c0\().4s, \c3\().4s 304 mul v22.4s, \c1\().4s, v1.s[3] 305 mul v20.4s, \c0\().4s, v1.s[3] 306 add v24.4s, v16.4s, v22.4s 307 add v26.4s, v18.4s, v22.4s 308 srshr \c0\().4s, v24.4s, #14 309 add v16.4s, v16.4s, v18.4s 310 srshr \c1\().4s, v26.4s, #14 311 sub v16.4s, v16.4s, v22.4s 312 srshr \c2\().4s, v20.4s, #14 313 srshr \c3\().4s, v16.4s, #14 314 .endm 315 316 .macro iadst4_12 c0, c1, c2, c3 317 smull v16.2d, \c0\().2s, v1.s[0] 318 smull2 v17.2d, \c0\().4s, v1.s[0] 319 smlal v16.2d, \c2\().2s, v1.s[1] 320 smlal2 v17.2d, \c2\().4s, v1.s[1] 321 smlal v16.2d, \c3\().2s, v1.s[2] 322 smlal2 v17.2d, \c3\().4s, v1.s[2] 323 smull v18.2d, \c0\().2s, v1.s[2] 324 smull2 v19.2d, \c0\().4s, v1.s[2] 325 smlsl v18.2d, \c2\().2s, v1.s[0] 326 smlsl2 v19.2d, \c2\().4s, v1.s[0] 327 sub \c0\().4s, \c0\().4s, \c2\().4s 328 smlsl v18.2d, \c3\().2s, v1.s[1] 329 smlsl2 v19.2d, \c3\().4s, v1.s[1] 330 add \c0\().4s, \c0\().4s, \c3\().4s 331 smull v22.2d, \c1\().2s, 
v1.s[3] 332 smull2 v23.2d, \c1\().4s, v1.s[3] 333 smull v20.2d, \c0\().2s, v1.s[3] 334 smull2 v21.2d, \c0\().4s, v1.s[3] 335 add v24.2d, v16.2d, v22.2d 336 add v25.2d, v17.2d, v23.2d 337 add v26.2d, v18.2d, v22.2d 338 add v27.2d, v19.2d, v23.2d 339 rshrn \c0\().2s, v24.2d, #14 340 rshrn2 \c0\().4s, v25.2d, #14 341 add v16.2d, v16.2d, v18.2d 342 add v17.2d, v17.2d, v19.2d 343 rshrn \c1\().2s, v26.2d, #14 344 rshrn2 \c1\().4s, v27.2d, #14 345 sub v16.2d, v16.2d, v22.2d 346 sub v17.2d, v17.2d, v23.2d 347 rshrn \c2\().2s, v20.2d, #14 348 rshrn2 \c2\().4s, v21.2d, #14 349 rshrn \c3\().2s, v16.2d, #14 350 rshrn2 \c3\().4s, v17.2d, #14 351 .endm 352 353 // The public functions in this file have got the following signature: 354 // void itxfm_add(uint8_t *dst, ptrdiff_t stride, int16_t *block, int eob); 355 356 .macro itxfm_func4x4 txfm1, txfm2, bpp 357 function ff_vp9_\txfm1\()_\txfm2\()_4x4_add_\bpp\()_neon, export=1 358 .ifc \txfm1,\txfm2 359 .ifc \txfm1,idct 360 movrel x4, itxfm4_coeffs 361 ld1 {v0.4h}, [x4] 362 sxtl v0.4s, v0.4h 363 .endif 364 .ifc \txfm1,iadst 365 movrel x4, iadst4_coeffs 366 ld1 {v0.d}[1], [x4] 367 sxtl2 v1.4s, v0.8h 368 .endif 369 .else 370 movrel x4, itxfm4_coeffs 371 ld1 {v0.8h}, [x4] 372 sxtl2 v1.4s, v0.8h 373 sxtl v0.4s, v0.4h 374 .endif 375 376 movi v30.4s, #0 377 movi v31.4s, #0 378 .ifc \txfm1\()_\txfm2,idct_idct 379 cmp w3, #1 380 b.ne 1f 381 // DC-only for idct/idct 382 ld1 {v2.s}[0], [x2] 383 smull v2.2d, v2.2s, v0.s[0] 384 rshrn v2.2s, v2.2d, #14 385 smull v2.2d, v2.2s, v0.s[0] 386 rshrn v2.2s, v2.2d, #14 387 st1 {v31.s}[0], [x2] 388 dup v4.4s, v2.s[0] 389 mov v5.16b, v4.16b 390 mov v6.16b, v4.16b 391 mov v7.16b, v4.16b 392 b 2f 393 .endif 394 395 1: 396 ld1 {v4.4s,v5.4s,v6.4s,v7.4s}, [x2] 397 st1 {v30.4s,v31.4s}, [x2], #32 398 399 .ifc \txfm1,iwht 400 sshr v4.4s, v4.4s, #2 401 sshr v5.4s, v5.4s, #2 402 sshr v6.4s, v6.4s, #2 403 sshr v7.4s, v7.4s, #2 404 .endif 405 406 \txfm1\()4_\bpp v4, v5, v6, v7 407 408 st1 {v30.4s,v31.4s}, [x2], #32 409 // Transpose 4x4 with 32 bit elements 410 transpose_4x4s v4, v5, v6, v7, v16, v17, v18, v19 411 412 \txfm2\()4_\bpp v4, v5, v6, v7 413 2: 414 mvni v31.8h, #((0xff << (\bpp - 8)) & 0xff), lsl #8 415 ld1 {v0.4h}, [x0], x1 416 ld1 {v1.4h}, [x0], x1 417 .ifnc \txfm1,iwht 418 srshr v4.4s, v4.4s, #4 419 srshr v5.4s, v5.4s, #4 420 srshr v6.4s, v6.4s, #4 421 srshr v7.4s, v7.4s, #4 422 .endif 423 uaddw v4.4s, v4.4s, v0.4h 424 uaddw v5.4s, v5.4s, v1.4h 425 ld1 {v2.4h}, [x0], x1 426 ld1 {v3.4h}, [x0], x1 427 sqxtun v0.4h, v4.4s 428 sqxtun2 v0.8h, v5.4s 429 sub x0, x0, x1, lsl #2 430 431 uaddw v6.4s, v6.4s, v2.4h 432 umin v0.8h, v0.8h, v31.8h 433 uaddw v7.4s, v7.4s, v3.4h 434 st1 {v0.4h}, [x0], x1 435 sqxtun v2.4h, v6.4s 436 sqxtun2 v2.8h, v7.4s 437 umin v2.8h, v2.8h, v31.8h 438 439 st1 {v0.d}[1], [x0], x1 440 st1 {v2.4h}, [x0], x1 441 st1 {v2.d}[1], [x0], x1 442 443 ret 444 endfunc 445 .endm 446 447 .macro itxfm_funcs4x4 bpp 448 itxfm_func4x4 idct, idct, \bpp 449 itxfm_func4x4 iadst, idct, \bpp 450 itxfm_func4x4 idct, iadst, \bpp 451 itxfm_func4x4 iadst, iadst, \bpp 452 itxfm_func4x4 iwht, iwht, \bpp 453 .endm 454 455 itxfm_funcs4x4 10 456 itxfm_funcs4x4 12 457 458 function idct8x8_dc_add_neon 459 movrel x4, idct_coeffs 460 ld1 {v0.4h}, [x4] 461 462 movi v1.4h, #0 463 sxtl v0.4s, v0.4h 464 465 ld1 {v2.s}[0], [x2] 466 smull v2.2d, v2.2s, v0.s[0] 467 rshrn v2.2s, v2.2d, #14 468 smull v2.2d, v2.2s, v0.s[0] 469 rshrn v2.2s, v2.2d, #14 470 st1 {v1.s}[0], [x2] 471 dup v2.4s, v2.s[0] 472 473 srshr v2.4s, v2.4s, #5 474 475 mov x4, #8 476 mov 
x3, x0 477 dup v31.8h, w5 478 1: 479 // Loop to add the constant from v2 into all 8x8 outputs 480 subs x4, x4, #2 481 ld1 {v3.8h}, [x0], x1 482 ld1 {v4.8h}, [x0], x1 483 uaddw v16.4s, v2.4s, v3.4h 484 uaddw2 v17.4s, v2.4s, v3.8h 485 uaddw v18.4s, v2.4s, v4.4h 486 uaddw2 v19.4s, v2.4s, v4.8h 487 sqxtun v3.4h, v16.4s 488 sqxtun2 v3.8h, v17.4s 489 sqxtun v4.4h, v18.4s 490 sqxtun2 v4.8h, v19.4s 491 umin v3.8h, v3.8h, v31.8h 492 umin v4.8h, v4.8h, v31.8h 493 st1 {v3.8h}, [x3], x1 494 st1 {v4.8h}, [x3], x1 495 b.ne 1b 496 497 ret 498 endfunc 499 500 .macro idct8 r0, r1, r2, r3, r4, r5, r6, r7, t0, t1, t2, t3, t4, t5 501 dmbutterfly0 \r0, \r4, \r0, \r4, \t0, \t1, \t2, \t3, \t4, \t5 // r0 = t0a, r4 = t1a 502 dmbutterfly \r2, \r6, v0.s[2], v0.s[3], \t0, \t1, \t2, \t3 // r2 = t2a, r6 = t3a 503 dmbutterfly \r1, \r7, v1.s[0], v1.s[1], \t0, \t1, \t2, \t3 // r1 = t4a, r7 = t7a 504 dmbutterfly \r5, \r3, v1.s[2], v1.s[3], \t0, \t1, \t2, \t3 // r5 = t5a, r3 = t6a 505 506 butterfly_4s \t0, \t1, \r0, \r6 // t0 = t0, t1 = t3 507 butterfly_4s \t2, \r5, \r1, \r5 // t2 = t4, r5 = t5a 508 butterfly_4s \t3, \r6, \r7, \r3 // t3 = t7, r6 = t6a 509 butterfly_4s \r7, \r4, \r4, \r2 // r7 = t1, r4 = t2 510 511 dmbutterfly0 \r6, \r5, \r6, \r5, \r0, \r1, \r2, \r3, \t4, \t5 // r6 = t6, r5 = t5 512 513 butterfly_4s \r1, \r6, \r7, \r6 // r1 = out[1], r6 = out[6] 514 butterfly_4s \r0, \r7, \t0, \t3 // r0 = out[0], r7 = out[7] 515 butterfly_4s \r2, \r5, \r4, \r5 // r2 = out[2], r5 = out[5] 516 butterfly_4s \r3, \r4, \t1, \t2 // r3 = out[3], r4 = out[4] 517 .endm 518 519 .macro iadst8 r0, r1, r2, r3, r4, r5, r6, r7, t0, t1, t2, t3, t4, t5 520 dmbutterfly_l \t2, \t3, \t0, \t1, \r7, \r0, v2.s[1], v2.s[0] // t2,t3 = t1a, t0,t1 = t0a 521 dmbutterfly_l \r0, \r7, \t4, \t5, \r3, \r4, v3.s[1], v3.s[0] // r0,r7 = t5a, t4,t5 = t4a 522 523 dbutterfly_n \r3, \t0, \t0, \t1, \t4, \t5, \r3, \r4, \t0, \t1 // r3 = t0, t0 = t4 524 dbutterfly_n \r4, \t1, \t2, \t3, \r0, \r7, \r4, \t1, \t4, \t5 // r4 = t1, t1 = t5 525 526 dmbutterfly_l \t4, \t5, \t2, \t3, \r5, \r2, v2.s[3], v2.s[2] // t4,t5 = t3a, t2,t3 = t2a 527 dmbutterfly_l \r2, \r5, \r0, \r7, \r1, \r6, v3.s[3], v3.s[2] // r2,r5 = t7a, r0,r7 = t6a 528 529 dbutterfly_n \r1, \t2, \t2, \t3, \r0, \r7, \r1, \r6, \t2, \t3 // r1 = t2, t2 = t6 530 dbutterfly_n \r0, \t4, \t4, \t5, \r2, \r5, \r0, \r7, \t4, \t5 // r0 = t3, t4 = t7 531 532 butterfly_4s \r7, \r4, \r4, \r0 // r7 = -out[7], r4 = t3 533 neg \r7\().4s, \r7\().4s // r7 = out[7] 534 butterfly_4s \r0, \r1, \r3, \r1 // r0 = out[0], r1 = t2 535 536 dmbutterfly_l \r2, \r3, \t3, \t5, \t0, \t1, v0.s[2], v0.s[3] // r2,r3 = t5a, t3,t5 = t4a 537 dmbutterfly_l \t0, \t1, \r5, \r6, \t4, \t2, v0.s[3], v0.s[2] // t0,t1 = t6a, r5,r6 = t7a 538 539 dbutterfly_n \r6, \t2, \r2, \r3, \r5, \r6, \t2, \t4, \r2, \r3 // r6 = out[6], t2 = t7 540 541 dmbutterfly0 \r3, \r4, \r1, \r4, \t4, \r5, \r1, \r2 // r3 = -out[3], r4 = out[4] 542 neg \r3\().4s, \r3\().4s // r3 = out[3] 543 544 dbutterfly_n \r1, \t0, \t3, \t5, \t0, \t1, \r1, \r2, \t0, \t1 // r1 = -out[1], t0 = t6 545 neg \r1\().4s, \r1\().4s // r1 = out[1] 546 547 dmbutterfly0 \r2, \r5, \t0, \t2, \t1, \t3, \t4, \t5 // r2 = out[2], r5 = -out[5] 548 neg \r5\().4s, \r5\().4s // r5 = out[5] 549 .endm 550 551 552 .macro itxfm_func8x8 txfm1, txfm2 553 function vp9_\txfm1\()_\txfm2\()_8x8_add_16_neon 554 .ifc \txfm1\()_\txfm2,idct_idct 555 cmp w3, #1 556 b.eq idct8x8_dc_add_neon 557 .endif 558 // The iadst also uses a few coefficients from 559 // idct, so those always need to be loaded. 
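        // (idct_coeffs ends up in v0/v1 below in both branches; iadst8 reads
        // v0.s[0] and v0.s[2]/v0.s[3] from it in addition to iadst8_coeffs.)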
560 .ifc \txfm1\()_\txfm2,idct_idct 561 movrel x4, idct_coeffs 562 .else 563 movrel x4, iadst8_coeffs 564 ld1 {v1.8h}, [x4], #16 565 stp d8, d9, [sp, #-0x10]! 566 sxtl2 v3.4s, v1.8h 567 sxtl v2.4s, v1.4h 568 .endif 569 ld1 {v0.8h}, [x4] 570 sxtl2 v1.4s, v0.8h 571 sxtl v0.4s, v0.4h 572 573 movi v4.4s, #0 574 movi v5.4s, #0 575 movi v6.4s, #0 576 movi v7.4s, #0 577 578 1: 579 ld1 {v16.4s,v17.4s,v18.4s,v19.4s}, [x2], #64 580 ld1 {v20.4s,v21.4s,v22.4s,v23.4s}, [x2], #64 581 ld1 {v24.4s,v25.4s,v26.4s,v27.4s}, [x2], #64 582 ld1 {v28.4s,v29.4s,v30.4s,v31.4s}, [x2], #64 583 sub x2, x2, #256 584 st1 {v4.4s,v5.4s,v6.4s,v7.4s}, [x2], #64 585 st1 {v4.4s,v5.4s,v6.4s,v7.4s}, [x2], #64 586 st1 {v4.4s,v5.4s,v6.4s,v7.4s}, [x2], #64 587 st1 {v4.4s,v5.4s,v6.4s,v7.4s}, [x2], #64 588 589 .ifc \txfm1\()_\txfm2,idct_idct 590 idct8 v16, v18, v20, v22, v24, v26, v28, v30, v2, v3, v4, v5, v6, v7 591 idct8 v17, v19, v21, v23, v25, v27, v29, v31, v2, v3, v4, v5, v6, v7 592 .else 593 \txfm1\()8 v16, v18, v20, v22, v24, v26, v28, v30, v4, v5, v6, v7, v8, v9 594 \txfm1\()8 v17, v19, v21, v23, v25, v27, v29, v31, v4, v5, v6, v7, v8, v9 595 .endif 596 597 // Transpose 8x8 with 16 bit elements 598 transpose_8x8s v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v4, v5, v6, v7 599 600 .ifc \txfm1\()_\txfm2,idct_idct 601 idct8 v16, v18, v20, v22, v24, v26, v28, v30, v2, v3, v4, v5, v6, v7 602 idct8 v17, v19, v21, v23, v25, v27, v29, v31, v2, v3, v4, v5, v6, v7 603 .else 604 \txfm2\()8 v16, v18, v20, v22, v24, v26, v28, v30, v4, v5, v6, v7, v8, v9 605 \txfm2\()8 v17, v19, v21, v23, v25, v27, v29, v31, v4, v5, v6, v7, v8, v9 606 .endif 607 2: 608 mov x3, x0 609 // Add into the destination 610 ld1 {v0.8h}, [x0], x1 611 srshr v16.4s, v16.4s, #5 612 srshr v17.4s, v17.4s, #5 613 ld1 {v1.8h}, [x0], x1 614 srshr v18.4s, v18.4s, #5 615 srshr v19.4s, v19.4s, #5 616 ld1 {v2.8h}, [x0], x1 617 srshr v20.4s, v20.4s, #5 618 srshr v21.4s, v21.4s, #5 619 uaddw v16.4s, v16.4s, v0.4h 620 uaddw2 v17.4s, v17.4s, v0.8h 621 ld1 {v3.8h}, [x0], x1 622 srshr v22.4s, v22.4s, #5 623 srshr v23.4s, v23.4s, #5 624 uaddw v18.4s, v18.4s, v1.4h 625 uaddw2 v19.4s, v19.4s, v1.8h 626 ld1 {v4.8h}, [x0], x1 627 srshr v24.4s, v24.4s, #5 628 srshr v25.4s, v25.4s, #5 629 uaddw v20.4s, v20.4s, v2.4h 630 uaddw2 v21.4s, v21.4s, v2.8h 631 sqxtun v0.4h, v16.4s 632 sqxtun2 v0.8h, v17.4s 633 dup v16.8h, w5 634 ld1 {v5.8h}, [x0], x1 635 srshr v26.4s, v26.4s, #5 636 srshr v27.4s, v27.4s, #5 637 uaddw v22.4s, v22.4s, v3.4h 638 uaddw2 v23.4s, v23.4s, v3.8h 639 sqxtun v1.4h, v18.4s 640 sqxtun2 v1.8h, v19.4s 641 umin v0.8h, v0.8h, v16.8h 642 ld1 {v6.8h}, [x0], x1 643 srshr v28.4s, v28.4s, #5 644 srshr v29.4s, v29.4s, #5 645 uaddw v24.4s, v24.4s, v4.4h 646 uaddw2 v25.4s, v25.4s, v4.8h 647 sqxtun v2.4h, v20.4s 648 sqxtun2 v2.8h, v21.4s 649 umin v1.8h, v1.8h, v16.8h 650 ld1 {v7.8h}, [x0], x1 651 srshr v30.4s, v30.4s, #5 652 srshr v31.4s, v31.4s, #5 653 uaddw v26.4s, v26.4s, v5.4h 654 uaddw2 v27.4s, v27.4s, v5.8h 655 sqxtun v3.4h, v22.4s 656 sqxtun2 v3.8h, v23.4s 657 umin v2.8h, v2.8h, v16.8h 658 659 st1 {v0.8h}, [x3], x1 660 uaddw v28.4s, v28.4s, v6.4h 661 uaddw2 v29.4s, v29.4s, v6.8h 662 st1 {v1.8h}, [x3], x1 663 sqxtun v4.4h, v24.4s 664 sqxtun2 v4.8h, v25.4s 665 umin v3.8h, v3.8h, v16.8h 666 st1 {v2.8h}, [x3], x1 667 uaddw v30.4s, v30.4s, v7.4h 668 uaddw2 v31.4s, v31.4s, v7.8h 669 st1 {v3.8h}, [x3], x1 670 sqxtun v5.4h, v26.4s 671 sqxtun2 v5.8h, v27.4s 672 umin v4.8h, v4.8h, v16.8h 673 st1 {v4.8h}, [x3], x1 674 sqxtun v6.4h, v28.4s 675 sqxtun2 v6.8h, 
v29.4s 676 umin v5.8h, v5.8h, v16.8h 677 st1 {v5.8h}, [x3], x1 678 sqxtun v7.4h, v30.4s 679 sqxtun2 v7.8h, v31.4s 680 umin v6.8h, v6.8h, v16.8h 681 682 st1 {v6.8h}, [x3], x1 683 umin v7.8h, v7.8h, v16.8h 684 st1 {v7.8h}, [x3], x1 685 686 .ifnc \txfm1\()_\txfm2,idct_idct 687 ldp d8, d9, [sp], 0x10 688 .endif 689 ret 690 endfunc 691 692 function ff_vp9_\txfm1\()_\txfm2\()_8x8_add_10_neon, export=1 693 mov x5, #0x03ff 694 b vp9_\txfm1\()_\txfm2\()_8x8_add_16_neon 695 endfunc 696 697 function ff_vp9_\txfm1\()_\txfm2\()_8x8_add_12_neon, export=1 698 mov x5, #0x0fff 699 b vp9_\txfm1\()_\txfm2\()_8x8_add_16_neon 700 endfunc 701 .endm 702 703 itxfm_func8x8 idct, idct 704 itxfm_func8x8 iadst, idct 705 itxfm_func8x8 idct, iadst 706 itxfm_func8x8 iadst, iadst 707 708 709 function idct16x16_dc_add_neon 710 movrel x4, idct_coeffs 711 ld1 {v0.4h}, [x4] 712 sxtl v0.4s, v0.4h 713 714 movi v1.4h, #0 715 716 ld1 {v2.s}[0], [x2] 717 smull v2.2d, v2.2s, v0.s[0] 718 rshrn v2.2s, v2.2d, #14 719 smull v2.2d, v2.2s, v0.s[0] 720 rshrn v2.2s, v2.2d, #14 721 st1 {v1.s}[0], [x2] 722 dup v2.4s, v2.s[0] 723 724 srshr v0.4s, v2.4s, #6 725 726 mov x3, x0 727 mov x4, #16 728 dup v31.8h, w13 729 1: 730 // Loop to add the constant from v2 into all 16x16 outputs 731 subs x4, x4, #2 732 ld1 {v1.8h,v2.8h}, [x0], x1 733 uaddw v16.4s, v0.4s, v1.4h 734 uaddw2 v17.4s, v0.4s, v1.8h 735 ld1 {v3.8h,v4.8h}, [x0], x1 736 uaddw v18.4s, v0.4s, v2.4h 737 uaddw2 v19.4s, v0.4s, v2.8h 738 uaddw v20.4s, v0.4s, v3.4h 739 uaddw2 v21.4s, v0.4s, v3.8h 740 uaddw v22.4s, v0.4s, v4.4h 741 uaddw2 v23.4s, v0.4s, v4.8h 742 sqxtun v1.4h, v16.4s 743 sqxtun2 v1.8h, v17.4s 744 sqxtun v2.4h, v18.4s 745 sqxtun2 v2.8h, v19.4s 746 sqxtun v3.4h, v20.4s 747 sqxtun2 v3.8h, v21.4s 748 sqxtun v4.4h, v22.4s 749 sqxtun2 v4.8h, v23.4s 750 umin v1.8h, v1.8h, v31.8h 751 umin v2.8h, v2.8h, v31.8h 752 st1 {v1.8h,v2.8h}, [x3], x1 753 umin v3.8h, v3.8h, v31.8h 754 umin v4.8h, v4.8h, v31.8h 755 st1 {v3.8h,v4.8h}, [x3], x1 756 b.ne 1b 757 758 ret 759 endfunc 760 761 .macro idct16_end 762 butterfly_4s v18, v7, v4, v7 // v18 = t0a, v7 = t7a 763 butterfly_4s v19, v22, v5, v22 // v19 = t1a, v22 = t6 764 butterfly_4s v4, v26, v20, v26 // v4 = t2a, v26 = t5 765 butterfly_4s v5, v6, v28, v6 // v5 = t3a, v6 = t4 766 butterfly_4s v20, v28, v16, v24 // v20 = t8a, v28 = t11a 767 butterfly_4s v24, v21, v23, v21 // v24 = t9, v21 = t10 768 butterfly_4s v23, v27, v25, v27 // v23 = t14, v27 = t13 769 butterfly_4s v25, v29, v29, v17 // v25 = t15a, v29 = t12a 770 771 dmbutterfly0 v8, v9, v27, v21, v8, v9, v16, v17, v30, v31 // v8 = t13a, v9 = t10a 772 dmbutterfly0 v28, v27, v29, v28, v21, v29, v16, v17, v30, v31 // v28 = t12, v27 = t11 773 774 butterfly_4s v16, v31, v18, v25 // v16 = out[0], v31 = out[15] 775 butterfly_4s v17, v30, v19, v23 // v17 = out[1], v30 = out[14] 776 butterfly_4s_r v25, v22, v22, v24 // v25 = out[9], v22 = out[6] 777 butterfly_4s v23, v24, v7, v20 // v23 = out[7], v24 = out[8] 778 butterfly_4s v18, v29, v4, v8 // v18 = out[2], v29 = out[13] 779 butterfly_4s v19, v28, v5, v28 // v19 = out[3], v28 = out[12] 780 butterfly_4s v20, v27, v6, v27 // v20 = out[4], v27 = out[11] 781 butterfly_4s v21, v26, v26, v9 // v21 = out[5], v26 = out[10] 782 ret 783 .endm 784 785 function idct16 786 dmbutterfly0 v16, v24, v16, v24, v4, v5, v6, v7, v8, v9 // v16 = t0a, v24 = t1a 787 dmbutterfly v20, v28, v0.s[2], v0.s[3], v4, v5, v6, v7 // v20 = t2a, v28 = t3a 788 dmbutterfly v18, v30, v1.s[0], v1.s[1], v4, v5, v6, v7 // v18 = t4a, v30 = t7a 789 dmbutterfly v26, v22, v1.s[2], v1.s[3], v4, 
v5, v6, v7 // v26 = t5a, v22 = t6a 790 dmbutterfly v17, v31, v2.s[0], v2.s[1], v4, v5, v6, v7 // v17 = t8a, v31 = t15a 791 dmbutterfly v25, v23, v2.s[2], v2.s[3], v4, v5, v6, v7 // v25 = t9a, v23 = t14a 792 dmbutterfly v21, v27, v3.s[0], v3.s[1], v4, v5, v6, v7 // v21 = t10a, v27 = t13a 793 dmbutterfly v29, v19, v3.s[2], v3.s[3], v4, v5, v6, v7 // v29 = t11a, v19 = t12a 794 795 butterfly_4s v4, v28, v16, v28 // v4 = t0, v28 = t3 796 butterfly_4s v5, v20, v24, v20 // v5 = t1, v20 = t2 797 butterfly_4s v6, v26, v18, v26 // v6 = t4, v26 = t5 798 butterfly_4s v7, v22, v30, v22 // v7 = t7, v22 = t6 799 butterfly_4s v16, v25, v17, v25 // v16 = t8, v25 = t9 800 butterfly_4s v24, v21, v29, v21 // v24 = t11, v21 = t10 801 butterfly_4s v17, v27, v19, v27 // v17 = t12, v27 = t13 802 butterfly_4s v29, v23, v31, v23 // v29 = t15, v23 = t14 803 804 dmbutterfly0 v22, v26, v22, v26, v8, v9, v18, v19, v30, v31 // v22 = t6a, v26 = t5a 805 dmbutterfly v23, v25, v0.s[2], v0.s[3], v18, v19, v30, v31 // v23 = t9a, v25 = t14a 806 dmbutterfly v27, v21, v0.s[2], v0.s[3], v18, v19, v30, v31, neg=1 // v27 = t13a, v21 = t10a 807 idct16_end 808 endfunc 809 810 function idct16_half 811 dmbutterfly0_h v16, v24, v16, v24, v4, v5, v6, v7, v8, v9 // v16 = t0a, v24 = t1a 812 dmbutterfly_h1 v20, v28, v0.s[2], v0.s[3], v4, v5, v6, v7 // v20 = t2a, v28 = t3a 813 dmbutterfly_h1 v18, v30, v1.s[0], v1.s[1], v4, v5, v6, v7 // v18 = t4a, v30 = t7a 814 dmbutterfly_h2 v26, v22, v1.s[2], v1.s[3], v4, v5, v6, v7 // v26 = t5a, v22 = t6a 815 dmbutterfly_h1 v17, v31, v2.s[0], v2.s[1], v4, v5, v6, v7 // v17 = t8a, v31 = t15a 816 dmbutterfly_h2 v25, v23, v2.s[2], v2.s[3], v4, v5, v6, v7 // v25 = t9a, v23 = t14a 817 dmbutterfly_h1 v21, v27, v3.s[0], v3.s[1], v4, v5, v6, v7 // v21 = t10a, v27 = t13a 818 dmbutterfly_h2 v29, v19, v3.s[2], v3.s[3], v4, v5, v6, v7 // v29 = t11a, v19 = t12a 819 820 butterfly_4s v4, v28, v16, v28 // v4 = t0, v28 = t3 821 butterfly_4s v5, v20, v24, v20 // v5 = t1, v20 = t2 822 butterfly_4s v6, v26, v18, v26 // v6 = t4, v26 = t5 823 butterfly_4s v7, v22, v30, v22 // v7 = t7, v22 = t6 824 butterfly_4s v16, v25, v17, v25 // v16 = t8, v25 = t9 825 butterfly_4s v24, v21, v29, v21 // v24 = t11, v21 = t10 826 butterfly_4s v17, v27, v19, v27 // v17 = t12, v27 = t13 827 butterfly_4s v29, v23, v31, v23 // v29 = t15, v23 = t14 828 829 dmbutterfly0 v22, v26, v22, v26, v8, v9, v18, v19, v30, v31 // v22 = t6a, v26 = t5a 830 dmbutterfly v23, v25, v0.s[2], v0.s[3], v18, v19, v30, v31 // v23 = t9a, v25 = t14a 831 dmbutterfly v27, v21, v0.s[2], v0.s[3], v18, v19, v30, v31, neg=1 // v27 = t13a, v21 = t10a 832 idct16_end 833 endfunc 834 835 function idct16_quarter 836 dsmull_h v24, v25, v19, v3.s[3] 837 dsmull_h v4, v5, v17, v2.s[0] 838 dsmull_h v7, v6, v18, v1.s[1] 839 dsmull_h v30, v31, v18, v1.s[0] 840 neg v24.2d, v24.2d 841 neg v25.2d, v25.2d 842 dsmull_h v29, v28, v17, v2.s[1] 843 dsmull_h v26, v27, v19, v3.s[2] 844 dsmull_h v22, v23, v16, v0.s[0] 845 drshrn_h v24, v24, v25, #14 846 drshrn_h v16, v4, v5, #14 847 drshrn_h v7, v7, v6, #14 848 drshrn_h v6, v30, v31, #14 849 drshrn_h v29, v29, v28, #14 850 drshrn_h v17, v26, v27, #14 851 drshrn_h v28, v22, v23, #14 852 853 dmbutterfly_l v20, v21, v22, v23, v17, v24, v0.s[2], v0.s[3] 854 dmbutterfly_l v18, v19, v30, v31, v29, v16, v0.s[2], v0.s[3] 855 neg v22.2d, v22.2d 856 neg v23.2d, v23.2d 857 drshrn_h v27, v20, v21, #14 858 drshrn_h v21, v22, v23, #14 859 drshrn_h v23, v18, v19, #14 860 drshrn_h v25, v30, v31, #14 861 mov v4.16b, v28.16b 862 mov v5.16b, v28.16b 863 dmbutterfly0 
v22, v26, v7, v6, v18, v19, v30, v31 864 mov v20.16b, v28.16b 865 idct16_end 866 endfunc 867 868 function iadst16 869 ld1 {v0.8h,v1.8h}, [x11] 870 sxtl v2.4s, v1.4h 871 sxtl2 v3.4s, v1.8h 872 sxtl2 v1.4s, v0.8h 873 sxtl v0.4s, v0.4h 874 875 dmbutterfly_l v6, v7, v4, v5, v31, v16, v0.s[1], v0.s[0] // v6,v7 = t1, v4,v5 = t0 876 dmbutterfly_l v10, v11, v8, v9, v23, v24, v1.s[1], v1.s[0] // v10,v11 = t9, v8,v9 = t8 877 dbutterfly_n v31, v24, v6, v7, v10, v11, v12, v13, v10, v11 // v31 = t1a, v24 = t9a 878 dmbutterfly_l v14, v15, v12, v13, v29, v18, v0.s[3], v0.s[2] // v14,v15 = t3, v12,v13 = t2 879 dbutterfly_n v16, v23, v4, v5, v8, v9, v6, v7, v8, v9 // v16 = t0a, v23 = t8a 880 881 dmbutterfly_l v6, v7, v4, v5, v21, v26, v1.s[3], v1.s[2] // v6,v7 = t11, v4,v5 = t10 882 dbutterfly_n v29, v26, v14, v15, v6, v7, v8, v9, v6, v7 // v29 = t3a, v26 = t11a 883 dmbutterfly_l v10, v11, v8, v9, v27, v20, v2.s[1], v2.s[0] // v10,v11 = t5, v8,v9 = t4 884 dbutterfly_n v18, v21, v12, v13, v4, v5, v6, v7, v4, v5 // v18 = t2a, v21 = t10a 885 886 dmbutterfly_l v14, v15, v12, v13, v19, v28, v3.s[1], v3.s[0] // v14,v15 = t13, v12,v13 = t12 887 dbutterfly_n v20, v28, v10, v11, v14, v15, v4, v5, v14, v15 // v20 = t5a, v28 = t13a 888 dmbutterfly_l v6, v7, v4, v5, v25, v22, v2.s[3], v2.s[2] // v6,v7 = t7, v4,v5 = t6 889 dbutterfly_n v27, v19, v8, v9, v12, v13, v10, v11, v12, v13 // v27 = t4a, v19 = t12a 890 891 dmbutterfly_l v10, v11, v8, v9, v17, v30, v3.s[3], v3.s[2] // v10,v11 = t15, v8,v9 = t14 892 ld1 {v0.8h}, [x10] 893 dbutterfly_n v22, v30, v6, v7, v10, v11, v12, v13, v10, v11 // v22 = t7a, v30 = t15a 894 sxtl2 v1.4s, v0.8h 895 sxtl v0.4s, v0.4h 896 dmbutterfly_l v14, v15, v12, v13, v23, v24, v1.s[0], v1.s[1] // v14,v15 = t9, v12,v13 = t8 897 dbutterfly_n v25, v17, v4, v5, v8, v9, v6, v7, v8, v9 // v25 = t6a, v17 = t14a 898 899 dmbutterfly_l v4, v5, v6, v7, v28, v19, v1.s[1], v1.s[0] // v4,v5 = t12, v6,v7 = t13 900 dbutterfly_n v23, v19, v12, v13, v4, v5, v8, v9, v4, v5 // v23 = t8a, v19 = t12a 901 dmbutterfly_l v10, v11, v8, v9, v21, v26, v1.s[2], v1.s[3] // v10,v11 = t11, v8,v9 = t10 902 butterfly_4s_r v4, v27, v16, v27 // v4 = t4, v27 = t0 903 dbutterfly_n v24, v28, v14, v15, v6, v7, v12, v13, v6, v7 // v24 = t9a, v28 = t13a 904 905 dmbutterfly_l v12, v13, v14, v15, v30, v17, v1.s[3], v1.s[2] // v12,v13 = t14, v14,v15 = t15 906 butterfly_4s_r v5, v20, v31, v20 // v5 = t5, v20 = t1 907 dbutterfly_n v21, v17, v8, v9, v12, v13, v6, v7, v12, v13 // v21 = t10a, v17 = t14a 908 dbutterfly_n v26, v30, v10, v11, v14, v15, v8, v9, v14, v15 // v26 = t11a, v30 = t15a 909 910 butterfly_4s_r v6, v25, v18, v25 // v6 = t6, v25 = t2 911 butterfly_4s_r v7, v22, v29, v22 // v7 = t7, v22 = t3 912 913 dmbutterfly_l v10, v11, v8, v9, v19, v28, v0.s[2], v0.s[3] // v10,v11 = t13, v8,v9 = t12 914 dmbutterfly_l v12, v13, v14, v15, v30, v17, v0.s[3], v0.s[2] // v12,v13 = t14, v14,v15 = t15 915 916 dbutterfly_n v18, v30, v8, v9, v12, v13, v16, v17, v12, v13 // v18 = out[2], v30 = t14a 917 dbutterfly_n v29, v17, v10, v11, v14, v15, v12, v13, v14, v15 // v29 = -out[13], v17 = t15a 918 neg v29.4s, v29.4s // v29 = out[13] 919 920 dmbutterfly_l v10, v11, v8, v9, v4, v5, v0.s[2], v0.s[3] // v10,v11 = t5a, v8,v9 = t4a 921 dmbutterfly_l v12, v13, v14, v15, v7, v6, v0.s[3], v0.s[2] // v12,v13 = t6a, v14,v15 = t7a 922 923 butterfly_4s v2, v6, v27, v25 // v2 = out[0], v6 = t2a 924 butterfly_4s v3, v7, v23, v21 // v3 =-out[1], v7 = t10 925 926 dbutterfly_n v19, v31, v8, v9, v12, v13, v4, v5, v8, v9 // v19 = -out[3], v31 = t6 927 neg v19.4s, 
v19.4s // v19 = out[3] 928 dbutterfly_n v28, v16, v10, v11, v14, v15, v4, v5, v10, v11 // v28 = out[12], v16 = t7 929 930 butterfly_4s v5, v8, v20, v22 // v5 =-out[15],v8 = t3a 931 butterfly_4s v4, v9, v24, v26 // v4 = out[14],v9 = t11 932 933 dmbutterfly0 v23, v24, v6, v8, v10, v11, v12, v13, v14, v15, 1 // v23 = out[7], v24 = out[8] 934 dmbutterfly0 v21, v26, v30, v17, v10, v11, v12, v13, v14, v15, 1 // v21 = out[5], v26 = out[10] 935 dmbutterfly0 v20, v27, v16, v31, v10, v11, v12, v13, v14, v15 // v20 = out[4], v27 = out[11] 936 dmbutterfly0 v22, v25, v9, v7, v10, v11, v12, v13, v14, v15 // v22 = out[6], v25 = out[9] 937 938 neg v31.4s, v5.4s // v31 = out[15] 939 neg v17.4s, v3.4s // v17 = out[1] 940 941 mov v16.16b, v2.16b 942 mov v30.16b, v4.16b 943 ret 944 endfunc 945 946 // Helper macros; we can't use these expressions directly within 947 // e.g. .irp due to the extra concatenation \(). Therefore wrap 948 // them in macros to allow using .irp below. 949 .macro load i, src, inc 950 ld1 {v\i\().4s}, [\src], \inc 951 .endm 952 .macro store i, dst, inc 953 st1 {v\i\().4s}, [\dst], \inc 954 .endm 955 .macro movi_v i, size, imm 956 movi v\i\()\size, \imm 957 .endm 958 .macro load_clear i, src, inc 959 ld1 {v\i\().4s}, [\src] 960 st1 {v4.4s}, [\src], \inc 961 .endm 962 963 .macro load_add_store coef0, coef1, coef2, coef3, coef4, coef5, coef6, coef7 964 srshr \coef0, \coef0, #6 965 ld1 {v4.4h}, [x0], x1 966 srshr \coef1, \coef1, #6 967 ld1 {v4.d}[1], [x3], x1 968 srshr \coef2, \coef2, #6 969 ld1 {v5.4h}, [x0], x1 970 srshr \coef3, \coef3, #6 971 uaddw \coef0, \coef0, v4.4h 972 ld1 {v5.d}[1], [x3], x1 973 srshr \coef4, \coef4, #6 974 uaddw2 \coef1, \coef1, v4.8h 975 ld1 {v6.4h}, [x0], x1 976 srshr \coef5, \coef5, #6 977 uaddw \coef2, \coef2, v5.4h 978 ld1 {v6.d}[1], [x3], x1 979 sqxtun v4.4h, \coef0 980 srshr \coef6, \coef6, #6 981 uaddw2 \coef3, \coef3, v5.8h 982 ld1 {v7.4h}, [x0], x1 983 sqxtun2 v4.8h, \coef1 984 srshr \coef7, \coef7, #6 985 uaddw \coef4, \coef4, v6.4h 986 ld1 {v7.d}[1], [x3], x1 987 umin v4.8h, v4.8h, v8.8h 988 sub x0, x0, x1, lsl #2 989 sub x3, x3, x1, lsl #2 990 sqxtun v5.4h, \coef2 991 uaddw2 \coef5, \coef5, v6.8h 992 st1 {v4.4h}, [x0], x1 993 sqxtun2 v5.8h, \coef3 994 uaddw \coef6, \coef6, v7.4h 995 st1 {v4.d}[1], [x3], x1 996 umin v5.8h, v5.8h, v8.8h 997 sqxtun v6.4h, \coef4 998 uaddw2 \coef7, \coef7, v7.8h 999 st1 {v5.4h}, [x0], x1 1000 sqxtun2 v6.8h, \coef5 1001 st1 {v5.d}[1], [x3], x1 1002 umin v6.8h, v6.8h, v8.8h 1003 sqxtun v7.4h, \coef6 1004 st1 {v6.4h}, [x0], x1 1005 sqxtun2 v7.8h, \coef7 1006 st1 {v6.d}[1], [x3], x1 1007 umin v7.8h, v7.8h, v8.8h 1008 st1 {v7.4h}, [x0], x1 1009 st1 {v7.d}[1], [x3], x1 1010 .endm 1011 1012 // Read a vertical 4x16 slice out of a 16x16 matrix, do a transform on it, 1013 // transpose into a horizontal 16x4 slice and store. 1014 // x0 = dst (temp buffer) 1015 // x1 = slice offset 1016 // x2 = src 1017 // x9 = input stride 1018 .macro itxfm16_1d_funcs txfm 1019 function \txfm\()16_1d_4x16_pass1_neon 1020 mov x14, x30 1021 1022 movi v4.4s, #0 1023 .irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31 1024 load_clear \i, x2, x9 1025 .endr 1026 1027 bl \txfm\()16 1028 1029 // Do four 4x4 transposes. Originally, v16-v31 contain the 1030 // 16 rows. Afterwards, v16-v19, v20-v23, v24-v27 and v28-v31 1031 // contain the four transposed 4x4 blocks. 
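        // (Each transpose_4x4s below works in place on four .4s registers,
        // using v4-v7 as scratch.)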
        transpose_4x4s  v16, v17, v18, v19, v4, v5, v6, v7
        transpose_4x4s  v20, v21, v22, v23, v4, v5, v6, v7
        transpose_4x4s  v24, v25, v26, v27, v4, v5, v6, v7
        transpose_4x4s  v28, v29, v30, v31, v4, v5, v6, v7

        // Store the transposed 4x4 blocks horizontally.
        cmp             x1,  #12
        b.eq            1f
.irp i, 16, 20, 24, 28, 17, 21, 25, 29, 18, 22, 26, 30, 19, 23, 27, 31
        store           \i,  x0,  #16
.endr
        ret             x14
1:
        // Special case: For the last input column (x1 == 12),
        // which would be stored as the last row in the temp buffer,
        // don't store the first 4x4 block, but keep it in registers
        // for the first slice of the second pass (where it is the
        // last 4x4 block).
        add             x0,  x0,  #16
        st1             {v20.4s},  [x0], #16
        st1             {v24.4s},  [x0], #16
        st1             {v28.4s},  [x0], #16
        add             x0,  x0,  #16
        st1             {v21.4s},  [x0], #16
        st1             {v25.4s},  [x0], #16
        st1             {v29.4s},  [x0], #16
        add             x0,  x0,  #16
        st1             {v22.4s},  [x0], #16
        st1             {v26.4s},  [x0], #16
        st1             {v30.4s},  [x0], #16
        add             x0,  x0,  #16
        st1             {v23.4s},  [x0], #16
        st1             {v27.4s},  [x0], #16
        st1             {v31.4s},  [x0], #16

        mov             v28.16b, v16.16b
        mov             v29.16b, v17.16b
        mov             v30.16b, v18.16b
        mov             v31.16b, v19.16b
        ret             x14
endfunc

// Read a vertical 4x16 slice out of a 16x16 matrix, do a transform on it,
// load the destination pixels (from a similar 4x16 slice), add and store back.
// x0 = dst
// x1 = dst stride
// x2 = src (temp buffer)
// x3 = slice offset
// x9 = temp buffer stride
function \txfm\()16_1d_4x16_pass2_neon
        mov             x14, x30

.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27
        load            \i,  x2,  x9
.endr
        cbz             x3,  1f
.irp i, 28, 29, 30, 31
        load            \i,  x2,  x9
.endr
1:

        add             x3,  x0,  x1
        lsl             x1,  x1,  #1
        bl              \txfm\()16

        dup             v8.8h, w13
        load_add_store  v16.4s, v17.4s, v18.4s, v19.4s, v20.4s, v21.4s, v22.4s, v23.4s
        load_add_store  v24.4s, v25.4s, v26.4s, v27.4s, v28.4s, v29.4s, v30.4s, v31.4s

        ret             x14
endfunc
.endm

itxfm16_1d_funcs idct
itxfm16_1d_funcs iadst

// This is the minimum eob value for each subpartition, in increments of 4
const min_eob_idct_idct_16, align=4
        .short  0, 10, 38, 89
endconst

.macro itxfm_func16x16 txfm1, txfm2
function vp9_\txfm1\()_\txfm2\()_16x16_add_16_neon
.ifc \txfm1\()_\txfm2,idct_idct
        cmp             w3,  #1
        b.eq            idct16x16_dc_add_neon
.endif
        mov             x15, x30
        // iadst16 requires clobbering v8-v15, idct16 only clobbers v8-v9.
.ifnc \txfm1\()_\txfm2,idct_idct
        stp             d14, d15, [sp, #-0x10]!
        stp             d12, d13, [sp, #-0x10]!
        stp             d10, d11, [sp, #-0x10]!
.endif
        stp             d8,  d9,  [sp, #-0x10]!
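        // Reserve 16*16*4 = 1024 bytes of stack for the 32-bit intermediate
        // coefficients between the two passes.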
1127 1128 sub sp, sp, #1024 1129 1130 mov x4, x0 1131 mov x5, x1 1132 mov x6, x2 1133 1134 movrel x10, idct_coeffs 1135 .ifnc \txfm1\()_\txfm2,idct_idct 1136 movrel x11, iadst16_coeffs 1137 .endif 1138 .ifc \txfm1,idct 1139 ld1 {v0.8h,v1.8h}, [x10] 1140 sxtl v2.4s, v1.4h 1141 sxtl2 v3.4s, v1.8h 1142 sxtl2 v1.4s, v0.8h 1143 sxtl v0.4s, v0.4h 1144 .endif 1145 mov x9, #64 1146 1147 .ifc \txfm1\()_\txfm2,idct_idct 1148 cmp w3, #10 1149 b.le idct16x16_quarter_add_16_neon 1150 cmp w3, #38 1151 b.le idct16x16_half_add_16_neon 1152 1153 movrel x12, min_eob_idct_idct_16, 2 1154 .endif 1155 1156 .irp i, 0, 4, 8, 12 1157 add x0, sp, #(\i*64) 1158 .ifc \txfm1\()_\txfm2,idct_idct 1159 .if \i > 0 1160 ldrh w1, [x12], #2 1161 cmp w3, w1 1162 mov x1, #(16 - \i)/4 1163 b.le 1f 1164 .endif 1165 .endif 1166 mov x1, #\i 1167 add x2, x6, #(\i*4) 1168 bl \txfm1\()16_1d_4x16_pass1_neon 1169 .endr 1170 .ifc \txfm1\()_\txfm2,iadst_idct 1171 ld1 {v0.8h,v1.8h}, [x10] 1172 sxtl v2.4s, v1.4h 1173 sxtl2 v3.4s, v1.8h 1174 sxtl2 v1.4s, v0.8h 1175 sxtl v0.4s, v0.4h 1176 .endif 1177 1178 .ifc \txfm1\()_\txfm2,idct_idct 1179 b 3f 1180 1: 1181 // Set v28-v31 to zero, for the in-register passthrough of 1182 // coefficients to pass 2. 1183 movi v28.4s, #0 1184 movi v29.4s, #0 1185 movi v30.4s, #0 1186 movi v31.4s, #0 1187 2: 1188 subs x1, x1, #1 1189 .rept 4 1190 st1 {v28.4s,v29.4s,v30.4s,v31.4s}, [x0], x9 1191 .endr 1192 b.ne 2b 1193 3: 1194 .endif 1195 1196 .irp i, 0, 4, 8, 12 1197 add x0, x4, #(\i*2) 1198 mov x1, x5 1199 add x2, sp, #(\i*4) 1200 mov x3, #\i 1201 bl \txfm2\()16_1d_4x16_pass2_neon 1202 .endr 1203 1204 add sp, sp, #1024 1205 ldp d8, d9, [sp], 0x10 1206 .ifnc \txfm1\()_\txfm2,idct_idct 1207 ldp d10, d11, [sp], 0x10 1208 ldp d12, d13, [sp], 0x10 1209 ldp d14, d15, [sp], 0x10 1210 .endif 1211 ret x15 1212 endfunc 1213 1214 function ff_vp9_\txfm1\()_\txfm2\()_16x16_add_10_neon, export=1 1215 mov x13, #0x03ff 1216 b vp9_\txfm1\()_\txfm2\()_16x16_add_16_neon 1217 endfunc 1218 1219 function ff_vp9_\txfm1\()_\txfm2\()_16x16_add_12_neon, export=1 1220 mov x13, #0x0fff 1221 b vp9_\txfm1\()_\txfm2\()_16x16_add_16_neon 1222 endfunc 1223 .endm 1224 1225 itxfm_func16x16 idct, idct 1226 itxfm_func16x16 iadst, idct 1227 itxfm_func16x16 idct, iadst 1228 itxfm_func16x16 iadst, iadst 1229 1230 function idct16_1d_4x16_pass1_quarter_neon 1231 mov x14, x30 1232 1233 movi v4.4s, #0 1234 .irp i, 16, 17, 18, 19 1235 load_clear \i, x2, x9 1236 .endr 1237 1238 bl idct16_quarter 1239 1240 // Do four 4x4 transposes. Originally, v16-v31 contain the 1241 // 16 rows. Afterwards, v16-v19, v20-v23, v24-v27 and v28-v31 1242 // contain the four transposed 4x4 blocks. 1243 transpose_4x4s v16, v17, v18, v19, v4, v5, v6, v7 1244 transpose_4x4s v20, v21, v22, v23, v4, v5, v6, v7 1245 transpose_4x4s v24, v25, v26, v27, v4, v5, v6, v7 1246 transpose_4x4s v28, v29, v30, v31, v4, v5, v6, v7 1247 1248 // Store the transposed 4x4 blocks horizontally. 1249 // The first 4x4 block is kept in registers for the second pass, 1250 // store the rest in the temp buffer. 
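        // (v16-v19 stay live here; the first slice of pass 2 skips reloading them.)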
        add             x0,  x0,  #16
        st1             {v20.4s},  [x0], #16
        st1             {v24.4s},  [x0], #16
        st1             {v28.4s},  [x0], #16
        add             x0,  x0,  #16
        st1             {v21.4s},  [x0], #16
        st1             {v25.4s},  [x0], #16
        st1             {v29.4s},  [x0], #16
        add             x0,  x0,  #16
        st1             {v22.4s},  [x0], #16
        st1             {v26.4s},  [x0], #16
        st1             {v30.4s},  [x0], #16
        add             x0,  x0,  #16
        st1             {v23.4s},  [x0], #16
        st1             {v27.4s},  [x0], #16
        st1             {v31.4s},  [x0], #16
        ret             x14
endfunc

function idct16_1d_4x16_pass2_quarter_neon
        mov             x14, x30

        // Only load the top 4 lines, and only do it for the later slices.
        // For the first slice, v16-v19 are kept in registers from the first pass.
        cbz             x3,  1f
.irp i, 16, 17, 18, 19
        load            \i,  x2,  x9
.endr
1:

        add             x3,  x0,  x1
        lsl             x1,  x1,  #1
        bl              idct16_quarter

        dup             v8.8h, w13
        load_add_store  v16.4s, v17.4s, v18.4s, v19.4s, v20.4s, v21.4s, v22.4s, v23.4s
        load_add_store  v24.4s, v25.4s, v26.4s, v27.4s, v28.4s, v29.4s, v30.4s, v31.4s

        ret             x14
endfunc

function idct16_1d_4x16_pass1_half_neon
        mov             x14, x30

        movi            v4.4s, #0
.irp i, 16, 17, 18, 19, 20, 21, 22, 23
        load_clear      \i,  x2,  x9
.endr

        bl              idct16_half

        // Do four 4x4 transposes. Originally, v16-v31 contain the
        // 16 rows. Afterwards, v16-v19, v20-v23, v24-v27 and v28-v31
        // contain the four transposed 4x4 blocks.
        transpose_4x4s  v16, v17, v18, v19, v4, v5, v6, v7
        transpose_4x4s  v20, v21, v22, v23, v4, v5, v6, v7
        transpose_4x4s  v24, v25, v26, v27, v4, v5, v6, v7
        transpose_4x4s  v28, v29, v30, v31, v4, v5, v6, v7

        // Store the transposed 4x4 blocks horizontally.
        cmp             x1,  #4
        b.eq            1f
.irp i, 16, 20, 24, 28, 17, 21, 25, 29, 18, 22, 26, 30, 19, 23, 27, 31
        store           \i,  x0,  #16
.endr
        ret             x14
1:
        // Special case: For the second input column (x1 == 4),
        // which would be stored as the second row in the temp buffer,
        // don't store the first 4x4 block, but keep it in registers
        // for the first slice of the second pass (where it is the
        // second 4x4 block).
1323 add x0, x0, #16 1324 st1 {v20.4s}, [x0], #16 1325 st1 {v24.4s}, [x0], #16 1326 st1 {v28.4s}, [x0], #16 1327 add x0, x0, #16 1328 st1 {v21.4s}, [x0], #16 1329 st1 {v25.4s}, [x0], #16 1330 st1 {v29.4s}, [x0], #16 1331 add x0, x0, #16 1332 st1 {v22.4s}, [x0], #16 1333 st1 {v26.4s}, [x0], #16 1334 st1 {v30.4s}, [x0], #16 1335 add x0, x0, #16 1336 st1 {v23.4s}, [x0], #16 1337 st1 {v27.4s}, [x0], #16 1338 st1 {v31.4s}, [x0], #16 1339 1340 mov v20.16b, v16.16b 1341 mov v21.16b, v17.16b 1342 mov v22.16b, v18.16b 1343 mov v23.16b, v19.16b 1344 ret x14 1345 endfunc 1346 1347 function idct16_1d_4x16_pass2_half_neon 1348 mov x14, x30 1349 1350 .irp i, 16, 17, 18, 19 1351 load \i, x2, x9 1352 .endr 1353 cbz x3, 1f 1354 .irp i, 20, 21, 22, 23 1355 load \i, x2, x9 1356 .endr 1357 1: 1358 1359 add x3, x0, x1 1360 lsl x1, x1, #1 1361 bl idct16_half 1362 1363 dup v8.8h, w13 1364 load_add_store v16.4s, v17.4s, v18.4s, v19.4s, v20.4s, v21.4s, v22.4s, v23.4s 1365 load_add_store v24.4s, v25.4s, v26.4s, v27.4s, v28.4s, v29.4s, v30.4s, v31.4s 1366 1367 ret x14 1368 endfunc 1369 1370 .macro idct16_partial size 1371 function idct16x16_\size\()_add_16_neon 1372 add x0, sp, #(0*64) 1373 mov x1, #0 1374 add x2, x6, #(0*4) 1375 bl idct16_1d_4x16_pass1_\size\()_neon 1376 .ifc \size,half 1377 add x0, sp, #(4*64) 1378 mov x1, #4 1379 add x2, x6, #(4*4) 1380 bl idct16_1d_4x16_pass1_\size\()_neon 1381 .endif 1382 1383 .irp i, 0, 4, 8, 12 1384 add x0, x4, #(\i*2) 1385 mov x1, x5 1386 add x2, sp, #(\i*4) 1387 mov x3, #\i 1388 bl idct16_1d_4x16_pass2_\size\()_neon 1389 .endr 1390 1391 add sp, sp, #1024 1392 ldp d8, d9, [sp], 0x10 1393 ret x15 1394 endfunc 1395 .endm 1396 1397 idct16_partial quarter 1398 idct16_partial half 1399 1400 function idct32x32_dc_add_neon 1401 movrel x4, idct_coeffs 1402 ld1 {v0.4h}, [x4] 1403 sxtl v0.4s, v0.4h 1404 1405 movi v1.4h, #0 1406 1407 ld1 {v2.s}[0], [x2] 1408 smull v2.2d, v2.2s, v0.s[0] 1409 rshrn v2.2s, v2.2d, #14 1410 smull v2.2d, v2.2s, v0.s[0] 1411 rshrn v2.2s, v2.2d, #14 1412 st1 {v1.s}[0], [x2] 1413 dup v2.4s, v2.s[0] 1414 1415 srshr v0.4s, v2.4s, #6 1416 1417 mov x3, x0 1418 mov x4, #32 1419 sub x1, x1, #32 1420 dup v31.8h, w13 1421 1: 1422 // Loop to add the constant v0 into all 32x32 outputs 1423 subs x4, x4, #1 1424 ld1 {v1.8h,v2.8h}, [x0], #32 1425 uaddw v16.4s, v0.4s, v1.4h 1426 uaddw2 v17.4s, v0.4s, v1.8h 1427 ld1 {v3.8h,v4.8h}, [x0], x1 1428 uaddw v18.4s, v0.4s, v2.4h 1429 uaddw2 v19.4s, v0.4s, v2.8h 1430 uaddw v20.4s, v0.4s, v3.4h 1431 uaddw2 v21.4s, v0.4s, v3.8h 1432 uaddw v22.4s, v0.4s, v4.4h 1433 uaddw2 v23.4s, v0.4s, v4.8h 1434 sqxtun v1.4h, v16.4s 1435 sqxtun2 v1.8h, v17.4s 1436 sqxtun v2.4h, v18.4s 1437 sqxtun2 v2.8h, v19.4s 1438 sqxtun v3.4h, v20.4s 1439 sqxtun2 v3.8h, v21.4s 1440 sqxtun v4.4h, v22.4s 1441 sqxtun2 v4.8h, v23.4s 1442 umin v1.8h, v1.8h, v31.8h 1443 umin v2.8h, v2.8h, v31.8h 1444 st1 {v1.8h,v2.8h}, [x3], #32 1445 umin v3.8h, v3.8h, v31.8h 1446 umin v4.8h, v4.8h, v31.8h 1447 st1 {v3.8h,v4.8h}, [x3], x1 1448 b.ne 1b 1449 1450 ret 1451 endfunc 1452 1453 .macro idct32_end 1454 butterfly_4s v16, v5, v4, v5 // v16 = t16a, v5 = t19a 1455 butterfly_4s v17, v20, v23, v20 // v17 = t17, v20 = t18 1456 butterfly_4s v18, v6, v7, v6 // v18 = t23a, v6 = t20a 1457 butterfly_4s v19, v21, v22, v21 // v19 = t22, v21 = t21 1458 butterfly_4s v4, v28, v28, v30 // v4 = t24a, v28 = t27a 1459 butterfly_4s v23, v26, v25, v26 // v23 = t25, v26 = t26 1460 butterfly_4s v7, v8, v29, v31 // v7 = t31a, v3 = t28a 1461 butterfly_4s v22, v27, v24, v27 // v22 = t30, v27 = t29 1462 1463 
dmbutterfly v27, v20, v0.s[2], v0.s[3], v24, v25, v30, v31 // v27 = t18a, v20 = t29a 1464 dmbutterfly v8, v5, v0.s[2], v0.s[3], v24, v25, v30, v31 // v3 = t19, v5 = t28 1465 dmbutterfly v28, v6, v0.s[2], v0.s[3], v24, v25, v30, v31, neg=1 // v28 = t27, v6 = t20 1466 dmbutterfly v26, v21, v0.s[2], v0.s[3], v24, v25, v30, v31, neg=1 // v26 = t26a, v21 = t21a 1467 1468 butterfly_4s v31, v24, v7, v4 // v31 = t31, v24 = t24 1469 butterfly_4s v30, v25, v22, v23 // v30 = t30a, v25 = t25a 1470 butterfly_4s_r v23, v16, v16, v18 // v23 = t23, v16 = t16 1471 butterfly_4s_r v22, v17, v17, v19 // v22 = t22a, v17 = t17a 1472 butterfly_4s v18, v21, v27, v21 // v18 = t18, v21 = t21 1473 butterfly_4s_r v27, v28, v5, v28 // v27 = t27a, v28 = t28a 1474 butterfly_4s v29, v26, v20, v26 // v29 = t29, v26 = t26 1475 butterfly_4s v19, v20, v8, v6 // v19 = t19a, v20 = t20 1476 1477 dmbutterfly0 v27, v20, v27, v20, v4, v5, v6, v7, v8, v9 // v27 = t27, v20 = t20 1478 dmbutterfly0 v26, v21, v26, v21, v4, v5, v6, v7, v8, v9 // v26 = t26a, v21 = t21a 1479 dmbutterfly0 v25, v22, v25, v22, v4, v5, v6, v7, v8, v9 // v25 = t25, v22 = t22 1480 dmbutterfly0 v24, v23, v24, v23, v4, v5, v6, v7, v8, v9 // v24 = t24a, v23 = t23a 1481 ret 1482 .endm 1483 1484 function idct32_odd 1485 dmbutterfly v16, v31, v10.s[0], v10.s[1], v4, v5, v6, v7 // v16 = t16a, v31 = t31a 1486 dmbutterfly v24, v23, v10.s[2], v10.s[3], v4, v5, v6, v7 // v24 = t17a, v23 = t30a 1487 dmbutterfly v20, v27, v11.s[0], v11.s[1], v4, v5, v6, v7 // v20 = t18a, v27 = t29a 1488 dmbutterfly v28, v19, v11.s[2], v11.s[3], v4, v5, v6, v7 // v28 = t19a, v19 = t28a 1489 dmbutterfly v18, v29, v12.s[0], v12.s[1], v4, v5, v6, v7 // v18 = t20a, v29 = t27a 1490 dmbutterfly v26, v21, v12.s[2], v12.s[3], v4, v5, v6, v7 // v26 = t21a, v21 = t26a 1491 dmbutterfly v22, v25, v13.s[0], v13.s[1], v4, v5, v6, v7 // v22 = t22a, v25 = t25a 1492 dmbutterfly v30, v17, v13.s[2], v13.s[3], v4, v5, v6, v7 // v30 = t23a, v17 = t24a 1493 1494 butterfly_4s v4, v24, v16, v24 // v4 = t16, v24 = t17 1495 butterfly_4s v5, v20, v28, v20 // v5 = t19, v20 = t18 1496 butterfly_4s v6, v26, v18, v26 // v6 = t20, v26 = t21 1497 butterfly_4s v7, v22, v30, v22 // v7 = t23, v22 = t22 1498 butterfly_4s v28, v25, v17, v25 // v28 = t24, v25 = t25 1499 butterfly_4s v30, v21, v29, v21 // v30 = t27, v21 = t26 1500 butterfly_4s v29, v23, v31, v23 // v29 = t31, v23 = t30 1501 butterfly_4s v31, v27, v19, v27 // v31 = t28, v27 = t29 1502 1503 dmbutterfly v23, v24, v1.s[0], v1.s[1], v16, v17, v18, v19 // v23 = t17a, v24 = t30a 1504 dmbutterfly v27, v20, v1.s[0], v1.s[1], v16, v17, v18, v19, neg=1 // v27 = t29a, v20 = t18a 1505 dmbutterfly v21, v26, v1.s[2], v1.s[3], v16, v17, v18, v19 // v21 = t21a, v26 = t26a 1506 dmbutterfly v25, v22, v1.s[2], v1.s[3], v16, v17, v18, v19, neg=1 // v25 = t25a, v22 = t22a 1507 idct32_end 1508 endfunc 1509 1510 function idct32_odd_half 1511 dmbutterfly_h1 v16, v31, v10.s[0], v10.s[1], v4, v5, v6, v7 // v16 = t16a, v31 = t31a 1512 dmbutterfly_h2 v24, v23, v10.s[2], v10.s[3], v4, v5, v6, v7 // v24 = t17a, v23 = t30a 1513 dmbutterfly_h1 v20, v27, v11.s[0], v11.s[1], v4, v5, v6, v7 // v20 = t18a, v27 = t29a 1514 dmbutterfly_h2 v28, v19, v11.s[2], v11.s[3], v4, v5, v6, v7 // v28 = t19a, v19 = t28a 1515 dmbutterfly_h1 v18, v29, v12.s[0], v12.s[1], v4, v5, v6, v7 // v18 = t20a, v29 = t27a 1516 dmbutterfly_h2 v26, v21, v12.s[2], v12.s[3], v4, v5, v6, v7 // v26 = t21a, v21 = t26a 1517 dmbutterfly_h1 v22, v25, v13.s[0], v13.s[1], v4, v5, v6, v7 // v22 = t22a, v25 = t25a 1518 dmbutterfly_h2 v30, 
v17, v13.s[2], v13.s[3], v4, v5, v6, v7 // v30 = t23a, v17 = t24a 1519 1520 butterfly_4s v4, v24, v16, v24 // v4 = t16, v24 = t17 1521 butterfly_4s v5, v20, v28, v20 // v5 = t19, v20 = t18 1522 butterfly_4s v6, v26, v18, v26 // v6 = t20, v26 = t21 1523 butterfly_4s v7, v22, v30, v22 // v7 = t23, v22 = t22 1524 butterfly_4s v28, v25, v17, v25 // v28 = t24, v25 = t25 1525 butterfly_4s v30, v21, v29, v21 // v30 = t27, v21 = t26 1526 butterfly_4s v29, v23, v31, v23 // v29 = t31, v23 = t30 1527 butterfly_4s v31, v27, v19, v27 // v31 = t28, v27 = t29 1528 1529 dmbutterfly v23, v24, v1.s[0], v1.s[1], v16, v17, v18, v19 // v23 = t17a, v24 = t30a 1530 dmbutterfly v27, v20, v1.s[0], v1.s[1], v16, v17, v18, v19, neg=1 // v27 = t29a, v20 = t18a 1531 dmbutterfly v21, v26, v1.s[2], v1.s[3], v16, v17, v18, v19 // v21 = t21a, v26 = t26a 1532 dmbutterfly v25, v22, v1.s[2], v1.s[3], v16, v17, v18, v19, neg=1 // v25 = t25a, v22 = t22a 1533 idct32_end 1534 endfunc 1535 1536 function idct32_odd_quarter 1537 dsmull_h v4, v5, v16, v10.s[0] 1538 dsmull_h v28, v29, v19, v11.s[3] 1539 dsmull_h v30, v31, v16, v10.s[1] 1540 dsmull_h v22, v23, v17, v13.s[2] 1541 dsmull_h v7, v6, v17, v13.s[3] 1542 dsmull_h v26, v27, v19, v11.s[2] 1543 dsmull_h v20, v21, v18, v12.s[0] 1544 dsmull_h v24, v25, v18, v12.s[1] 1545 1546 neg v28.2d, v28.2d 1547 neg v29.2d, v29.2d 1548 neg v7.2d, v7.2d 1549 neg v6.2d, v6.2d 1550 1551 drshrn_h v4, v4, v5, #14 1552 drshrn_h v5, v28, v29, #14 1553 drshrn_h v29, v30, v31, #14 1554 drshrn_h v28, v22, v23, #14 1555 drshrn_h v7, v7, v6, #14 1556 drshrn_h v31, v26, v27, #14 1557 drshrn_h v6, v20, v21, #14 1558 drshrn_h v30, v24, v25, #14 1559 1560 dmbutterfly_l v16, v17, v18, v19, v29, v4, v1.s[0], v1.s[1] 1561 dmbutterfly_l v27, v26, v20, v21, v31, v5, v1.s[0], v1.s[1] 1562 drshrn_h v23, v16, v17, #14 1563 drshrn_h v24, v18, v19, #14 1564 neg v20.2d, v20.2d 1565 neg v21.2d, v21.2d 1566 drshrn_h v27, v27, v26, #14 1567 drshrn_h v20, v20, v21, #14 1568 dmbutterfly_l v16, v17, v18, v19, v30, v6, v1.s[2], v1.s[3] 1569 drshrn_h v21, v16, v17, #14 1570 drshrn_h v26, v18, v19, #14 1571 dmbutterfly_l v16, v17, v18, v19, v28, v7, v1.s[2], v1.s[3] 1572 drshrn_h v25, v16, v17, #14 1573 neg v18.2d, v18.2d 1574 neg v19.2d, v19.2d 1575 drshrn_h v22, v18, v19, #14 1576 1577 idct32_end 1578 endfunc 1579 1580 .macro idct32_funcs suffix 1581 // Do an 32-point IDCT of a 4x32 slice out of a 32x32 matrix. 1582 // The 32-point IDCT can be decomposed into two 16-point IDCTs; 1583 // a normal IDCT16 with every other input component (the even ones, with 1584 // each output written twice), followed by a separate 16-point IDCT 1585 // of the odd inputs, added/subtracted onto the outputs of the first idct16. 1586 // x0 = dst (temp buffer) 1587 // x1 = unused 1588 // x2 = src 1589 // x9 = double input stride 1590 function idct32_1d_4x32_pass1\suffix\()_neon 1591 mov x14, x30 1592 1593 movi v4.4s, #0 1594 1595 // v16 = IN(0), v17 = IN(2) ... v31 = IN(30) 1596 .ifb \suffix 1597 .irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31 1598 load_clear \i, x2, x9 1599 .endr 1600 .endif 1601 .ifc \suffix,_quarter 1602 .irp i, 16, 17, 18, 19 1603 load_clear \i, x2, x9 1604 .endr 1605 .endif 1606 .ifc \suffix,_half 1607 .irp i, 16, 17, 18, 19, 20, 21, 22, 23 1608 load_clear \i, x2, x9 1609 .endr 1610 .endif 1611 1612 bl idct16\suffix 1613 1614 // Do four 4x4 transposes. Originally, v16-v31 contain the 1615 // 16 rows. Afterwards, v16-v19, v20-v23, v24-v27 and v28-v31 1616 // contain the four transposed 4x4 blocks. 
1617 transpose_4x4s v16, v17, v18, v19, v4, v5, v6, v7 1618 transpose_4x4s v20, v21, v22, v23, v4, v5, v6, v7 1619 transpose_4x4s v24, v25, v26, v27, v4, v5, v6, v7 1620 transpose_4x4s v28, v29, v30, v31, v4, v5, v6, v7 1621 1622 // Store the registers a, b, c, d horizontally, followed by the 1623 // same registers d, c, b, a mirrored. 1624 .macro store_rev a, b, c, d 1625 // There's no rev128 instruction, but we reverse each 64 bit 1626 // half, and then flip them using an ext with 8 bytes offset. 1627 rev64 v7.4s, \d 1628 st1 {\a}, [x0], #16 1629 ext v7.16b, v7.16b, v7.16b, #8 1630 st1 {\b}, [x0], #16 1631 rev64 v6.4s, \c 1632 st1 {\c}, [x0], #16 1633 ext v6.16b, v6.16b, v6.16b, #8 1634 st1 {\d}, [x0], #16 1635 rev64 v5.4s, \b 1636 st1 {v7.4s}, [x0], #16 1637 ext v5.16b, v5.16b, v5.16b, #8 1638 st1 {v6.4s}, [x0], #16 1639 rev64 v4.4s, \a 1640 st1 {v5.4s}, [x0], #16 1641 ext v4.16b, v4.16b, v4.16b, #8 1642 st1 {v4.4s}, [x0], #16 1643 .endm 1644 store_rev v16.4s, v20.4s, v24.4s, v28.4s 1645 store_rev v17.4s, v21.4s, v25.4s, v29.4s 1646 store_rev v18.4s, v22.4s, v26.4s, v30.4s 1647 store_rev v19.4s, v23.4s, v27.4s, v31.4s 1648 sub x0, x0, #512 1649 .purgem store_rev 1650 1651 // Move x2 back to the start of the input, and move 1652 // to the first odd row 1653 .ifb \suffix 1654 sub x2, x2, x9, lsl #4 1655 .endif 1656 .ifc \suffix,_quarter 1657 sub x2, x2, x9, lsl #2 1658 .endif 1659 .ifc \suffix,_half 1660 sub x2, x2, x9, lsl #3 1661 .endif 1662 add x2, x2, #128 1663 1664 movi v4.4s, #0 1665 // v16 = IN(1), v17 = IN(3) ... v31 = IN(31) 1666 .ifb \suffix 1667 .irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31 1668 load_clear \i, x2, x9 1669 .endr 1670 .endif 1671 .ifc \suffix,_quarter 1672 .irp i, 16, 17, 18, 19 1673 load_clear \i, x2, x9 1674 .endr 1675 .endif 1676 .ifc \suffix,_half 1677 .irp i, 16, 17, 18, 19, 20, 21, 22, 23 1678 load_clear \i, x2, x9 1679 .endr 1680 .endif 1681 1682 bl idct32_odd\suffix 1683 1684 transpose_4x4s v31, v30, v29, v28, v4, v5, v6, v7 1685 transpose_4x4s v27, v26, v25, v24, v4, v5, v6, v7 1686 transpose_4x4s v23, v22, v21, v20, v4, v5, v6, v7 1687 transpose_4x4s v19, v18, v17, v16, v4, v5, v6, v7 1688 1689 // Store the registers a, b, c, d horizontally, 1690 // adding into the output first, and the mirrored, 1691 // subtracted from the output. 
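        // (Each invocation below reads back the even-half values written to
        // [x0] earlier, using v8/v9 for the reversed copies of c and d.)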
.macro store_rev a, b, c, d, a16b, b16b
        ld1             {v4.4s}, [x0]
        rev64           v9.4s, \d
        add             v4.4s, v4.4s, \a
        st1             {v4.4s}, [x0], #16
        rev64           v8.4s, \c
        ld1             {v4.4s}, [x0]
        ext             v9.16b, v9.16b, v9.16b, #8
        add             v4.4s, v4.4s, \b
        st1             {v4.4s}, [x0], #16
        ext             v8.16b, v8.16b, v8.16b, #8
        ld1             {v4.4s}, [x0]
        rev64           \b, \b
        add             v4.4s, v4.4s, \c
        st1             {v4.4s}, [x0], #16
        rev64           \a, \a
        ld1             {v4.4s}, [x0]
        ext             \b16b, \b16b, \b16b, #8
        add             v4.4s, v4.4s, \d
        st1             {v4.4s}, [x0], #16
        ext             \a16b, \a16b, \a16b, #8
        ld1             {v4.4s}, [x0]
        sub             v4.4s, v4.4s, v9.4s
        st1             {v4.4s}, [x0], #16
        ld1             {v4.4s}, [x0]
        sub             v4.4s, v4.4s, v8.4s
        st1             {v4.4s}, [x0], #16
        ld1             {v4.4s}, [x0]
        sub             v4.4s, v4.4s, \b
        st1             {v4.4s}, [x0], #16
        ld1             {v4.4s}, [x0]
        sub             v4.4s, v4.4s, \a
        st1             {v4.4s}, [x0], #16
.endm

        store_rev       v31.4s, v27.4s, v23.4s, v19.4s, v31.16b, v27.16b
        store_rev       v30.4s, v26.4s, v22.4s, v18.4s, v30.16b, v26.16b
        store_rev       v29.4s, v25.4s, v21.4s, v17.4s, v29.16b, v25.16b
        store_rev       v28.4s, v24.4s, v20.4s, v16.4s, v28.16b, v24.16b
.purgem store_rev
        ret             x14
endfunc

// This is mostly the same as 4x32_pass1, but without the transpose; it
// uses the source as a temp buffer between the two idct passes, and
// adds into the destination.
// x0 = dst
// x1 = dst stride
// x2 = src (temp buffer)
// x7 = negative double temp buffer stride
// x9 = double temp buffer stride
function idct32_1d_4x32_pass2\suffix\()_neon
        mov             x14, x30

        // v16 = IN(0), v17 = IN(2) ... v31 = IN(30)
.ifb \suffix
.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
        load            \i, x2, x9
.endr
        sub             x2, x2, x9, lsl #4
.endif
.ifc \suffix,_quarter
.irp i, 16, 17, 18, 19
        load            \i, x2, x9
.endr
        sub             x2, x2, x9, lsl #2
.endif
.ifc \suffix,_half
.irp i, 16, 17, 18, 19, 20, 21, 22, 23
        load            \i, x2, x9
.endr
        sub             x2, x2, x9, lsl #3
.endif

        bl              idct16\suffix

.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
        store           \i, x2, x9
.endr

        sub             x2, x2, x9, lsl #4
        add             x2, x2, #128

        // v16 = IN(1), v17 = IN(3) ... v31 = IN(31)
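        // A temp buffer row is 32 coefficients * 4 bytes = 128 bytes, so the
        // add of #128 above points x2 at row 1; loading with the double
        // stride x9 then skips the even rows and picks up IN(1), IN(3), ...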
.ifb \suffix
.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
        load            \i, x2, x9
.endr
        sub             x2, x2, x9, lsl #4
.endif
.ifc \suffix,_quarter
.irp i, 16, 17, 18, 19
        load            \i, x2, x9
.endr
        sub             x2, x2, x9, lsl #2
.endif
.ifc \suffix,_half
.irp i, 16, 17, 18, 19, 20, 21, 22, 23
        load            \i, x2, x9
.endr
        sub             x2, x2, x9, lsl #3
.endif
        sub             x2, x2, #128

        bl              idct32_odd\suffix

.macro load_acc_store a, b, c, d, neg=0
.if \neg == 0
        ld1             {v4.4s}, [x2], x9
        ld1             {v5.4s}, [x2], x9
        add             v4.4s, v4.4s, \a
        ld1             {v6.4s}, [x2], x9
        add             v5.4s, v5.4s, \b
        ld1             {v7.4s}, [x2], x9
        add             v6.4s, v6.4s, \c
        add             v7.4s, v7.4s, \d
.else
        ld1             {v4.4s}, [x2], x7
        ld1             {v5.4s}, [x2], x7
        sub             v4.4s, v4.4s, \a
        ld1             {v6.4s}, [x2], x7
        sub             v5.4s, v5.4s, \b
        ld1             {v7.4s}, [x2], x7
        sub             v6.4s, v6.4s, \c
        sub             v7.4s, v7.4s, \d
.endif
        ld1             {v8.4h}, [x0], x1
        ld1             {v8.d}[1], [x0], x1
        srshr           v4.4s, v4.4s, #6
        ld1             {v9.4h}, [x0], x1
        srshr           v5.4s, v5.4s, #6
        uaddw           v4.4s, v4.4s, v8.4h
        ld1             {v9.d}[1], [x0], x1
        srshr           v6.4s, v6.4s, #6
        uaddw2          v5.4s, v5.4s, v8.8h
        srshr           v7.4s, v7.4s, #6
        sub             x0, x0, x1, lsl #2
        uaddw           v6.4s, v6.4s, v9.4h
        sqxtun          v4.4h, v4.4s
        uaddw2          v7.4s, v7.4s, v9.8h
        sqxtun2         v4.8h, v5.4s
        umin            v4.8h, v4.8h, v15.8h
        st1             {v4.4h}, [x0], x1
        sqxtun          v5.4h, v6.4s
        st1             {v4.d}[1], [x0], x1
        sqxtun2         v5.8h, v7.4s
        umin            v5.8h, v5.8h, v15.8h
        st1             {v5.4h}, [x0], x1
        st1             {v5.d}[1], [x0], x1
.endm
        load_acc_store  v31.4s, v30.4s, v29.4s, v28.4s
        load_acc_store  v27.4s, v26.4s, v25.4s, v24.4s
        load_acc_store  v23.4s, v22.4s, v21.4s, v20.4s
        load_acc_store  v19.4s, v18.4s, v17.4s, v16.4s
        sub             x2, x2, x9
        load_acc_store  v16.4s, v17.4s, v18.4s, v19.4s, 1
        load_acc_store  v20.4s, v21.4s, v22.4s, v23.4s, 1
        load_acc_store  v24.4s, v25.4s, v26.4s, v27.4s, 1
        load_acc_store  v28.4s, v29.4s, v30.4s, v31.4s, 1
.purgem load_acc_store
        ret             x14
endfunc
.endm

idct32_funcs
idct32_funcs _quarter
idct32_funcs _half

const min_eob_idct_idct_32, align=4
        .short  0, 9, 34, 70, 135, 240, 336, 448
endconst

function vp9_idct_idct_32x32_add_16_neon
        cmp             w3, #1
        b.eq            idct32x32_dc_add_neon

        movrel          x10, idct_coeffs

        mov             x15, x30
        stp             d8, d9, [sp, #-0x10]!
        stp             d10, d11, [sp, #-0x10]!
        stp             d12, d13, [sp, #-0x10]!
        stp             d14, d15, [sp, #-0x10]!
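
        // Allocate a 32x32 buffer of 32-bit coefficients on the stack
        // (32*32*4 = 4096 bytes), used as the temp buffer between the
        // two 1D passes.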
        sub             sp, sp, #4096

        mov             x4, x0
        mov             x5, x1
        mov             x6, x2

        // Double stride of the input, since we only read every other line
        mov             x9, #256
        neg             x7, x9

        ld1             {v0.8h,v1.8h}, [x10], #32
        sxtl            v2.4s, v1.4h
        sxtl2           v3.4s, v1.8h
        sxtl2           v1.4s, v0.8h
        sxtl            v0.4s, v0.4h
        ld1             {v10.8h,v11.8h}, [x10]
        sxtl            v12.4s, v11.4h
        sxtl2           v13.4s, v11.8h
        sxtl2           v11.4s, v10.8h
        sxtl            v10.4s, v10.4h

        dup             v15.8h, w13

        cmp             w3, #34
        b.le            idct32x32_quarter_add_16_neon
        cmp             w3, #135
        b.le            idct32x32_half_add_16_neon

        movrel          x12, min_eob_idct_idct_32, 2

.irp i, 0, 4, 8, 12, 16, 20, 24, 28
        add             x0, sp, #(\i*128)
.if \i > 0
        ldrh            w1, [x12], #2
        cmp             w3, w1
        mov             x1, #(32 - \i)/4
        b.le            1f
.endif
        add             x2, x6, #(\i*4)
        bl              idct32_1d_4x32_pass1_neon
.endr
        b               3f

1:
        // Write zeros to the temp buffer for pass 2
        movi            v16.4s, #0
        movi            v17.4s, #0
        movi            v18.4s, #0
        movi            v19.4s, #0
2:
        subs            x1, x1, #1
.rept 4
        st1             {v16.4s,v17.4s,v18.4s,v19.4s}, [x0], #64
        st1             {v16.4s,v17.4s,v18.4s,v19.4s}, [x0], #64
.endr
        b.ne            2b
3:
.irp i, 0, 4, 8, 12, 16, 20, 24, 28
        add             x0, x4, #(\i*2)
        mov             x1, x5
        add             x2, sp, #(\i*4)
        bl              idct32_1d_4x32_pass2_neon
.endr

        add             sp, sp, #4096
        ldp             d14, d15, [sp], 0x10
        ldp             d12, d13, [sp], 0x10
        ldp             d10, d11, [sp], 0x10
        ldp             d8, d9, [sp], 0x10

        ret             x15
endfunc

function ff_vp9_idct_idct_32x32_add_10_neon, export=1
        mov             x13, #0x03ff
        b               vp9_idct_idct_32x32_add_16_neon
endfunc

function ff_vp9_idct_idct_32x32_add_12_neon, export=1
        mov             x13, #0x0fff
        b               vp9_idct_idct_32x32_add_16_neon
endfunc

.macro idct32_partial size
function idct32x32_\size\()_add_16_neon
.irp i, 0, 4
        add             x0, sp, #(\i*128)
.ifc \size,quarter
.if \i == 4
        cmp             w3, #9
        b.le            1f
.endif
.endif
        add             x2, x6, #(\i*4)
        bl              idct32_1d_4x32_pass1_\size\()_neon
.endr

.ifc \size,half
.irp i, 8, 12
        add             x0, sp, #(\i*128)
.if \i == 12
        cmp             w3, #70
        b.le            1f
.endif
        add             x2, x6, #(\i*4)
        bl              idct32_1d_4x32_pass1_\size\()_neon
.endr
.endif
        b               3f

1:
        // Write zeros to the temp buffer for pass 2
        movi            v16.4s, #0
        movi            v17.4s, #0
        movi            v18.4s, #0
        movi            v19.4s, #0

.rept 4
        st1             {v16.4s,v17.4s,v18.4s,v19.4s}, [x0], #64
        st1             {v16.4s,v17.4s,v18.4s,v19.4s}, [x0], #64
.endr

3:
.irp i, 0, 4, 8, 12, 16, 20, 24, 28
        add             x0, x4, #(\i*2)
        mov             x1, x5
        add             x2, sp, #(\i*4)
        bl              idct32_1d_4x32_pass2_\size\()_neon
.endr

        add             sp, sp, #4096
        ldp             d14, d15, [sp], 0x10
        ldp             d12, d13, [sp], 0x10
        ldp             d10, d11, [sp], 0x10
        ldp             d8, d9, [sp], 0x10

        ret             x15
endfunc
.endm

idct32_partial quarter
idct32_partial half