1/*
2 * Copyright (c) 2017 Google Inc.
3 *
4 * This file is part of FFmpeg.
5 *
6 * FFmpeg is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU Lesser General Public
8 * License as published by the Free Software Foundation; either
9 * version 2.1 of the License, or (at your option) any later version.
10 *
11 * FFmpeg is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
14 * Lesser General Public License for more details.
15 *
16 * You should have received a copy of the GNU Lesser General Public
17 * License along with FFmpeg; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
19 */
20
21#include "libavutil/arm/asm.S"
22#include "neon.S"
23
24const itxfm4_coeffs, align=4
25        .short  11585, 0, 6270, 15137
26iadst4_coeffs:
27        .short  5283, 15212, 9929, 13377
28endconst
29
30const iadst8_coeffs, align=4
31        .short  16305, 1606, 14449, 7723, 10394, 12665, 4756, 15679
32idct_coeffs:
33        .short  11585, 0, 6270, 15137, 3196, 16069, 13623, 9102
34        .short  1606, 16305, 12665, 10394, 7723, 14449, 15679, 4756
35        .short  804, 16364, 12140, 11003, 7005, 14811, 15426, 5520
36        .short  3981, 15893, 14053, 8423, 9760, 13160, 16207, 2404
37endconst
38
39const iadst16_coeffs, align=4
40        .short  16364, 804, 15893, 3981, 11003, 12140, 8423, 14053
41        .short  14811, 7005, 13160, 9760, 5520, 15426, 2404, 16207
42endconst
43
44@ Do two 4x4 transposes, using q registers for the subtransposes that don't
45@ need to address the individual d registers.
46@ r0,r1 == rq1, r2,r3 == rq1, etc
47.macro transpose32_q_2x_4x4 rq0, rq1, rq2, rq3, rq4, rq5, rq6, rq7, r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, r10, r11, r12, r13, r14, r15
48        vswp             \r1,  \r4  @ vtrn.64 \rq0, \rq2
49        vswp             \r3,  \r6  @ vtrn.64 \rq1, \rq3
50        vswp             \r9,  \r12 @ vtrn.64 \rq4, \rq6
51        vswp             \r11, \r14 @ vtrn.64 \rq5, \rq7
52        vtrn.32          \rq0, \rq1
53        vtrn.32          \rq2, \rq3
54        vtrn.32          \rq4, \rq5
55        vtrn.32          \rq6, \rq7
56.endm
57
58@ Do eight 2x2 transposes.
59.macro transpose32_8x_2x2 r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, r10, r11, r12, r13, r14, r15
60        vtrn.32          \r0,  \r1
61        vtrn.32          \r2,  \r3
62        vtrn.32          \r4,  \r5
63        vtrn.32          \r6,  \r7
64        vtrn.32          \r8,  \r9
65        vtrn.32          \r10, \r11
66        vtrn.32          \r12, \r13
67        vtrn.32          \r14, \r15
68.endm
69
70@ out1 = ((in1 + in2) * d0[0] + (1 << 13)) >> 14
71@ out2 = ((in1 - in2) * d0[0] + (1 << 13)) >> 14
72@ in/out are d registers
73.macro mbutterfly0 out1, out2, in1, in2, tmpd1, tmpd2, tmpq3, tmpq4, neg=0
74        vadd.s32        \tmpd1, \in1,  \in2
75        vsub.s32        \tmpd2, \in1,  \in2
76.if \neg > 0
77        vneg.s32        \tmpd1, \tmpd1
78.endif
79        vmull.s32       \tmpq3, \tmpd1, d0[0]
80        vmull.s32       \tmpq4, \tmpd2, d0[0]
81        vrshrn.s64      \out1, \tmpq3, #14
82        vrshrn.s64      \out2, \tmpq4, #14
83.endm
84
85@ Same as mbutterfly0 above, but treating the input in in2 as zero,
86@ writing the same output into both out1 and out2.
87.macro mbutterfly0_h out1, out2, in1, in2, tmpd1, tmpd2, tmpq3, tmpq4
88        vmull.s32       \tmpq3, \in1, d0[0]
89        vrshrn.s64      \out1, \tmpq3, #14
90        vrshrn.s64      \out2, \tmpq3, #14
91.endm
92
93@ out1,out2 = ((in1 + in2) * d0[0] + (1 << 13)) >> 14
94@ out3,out4 = ((in1 - in2) * d0[0] + (1 << 13)) >> 14
95@ Same as mbutterfly0, but with input being 2 q registers, output
96@ being 4 d registers.
97@ This can do with either 4 or 6 temporary q registers.
98.macro dmbutterfly0 out1, out2, out3, out4, in1, in2, tmpq1, tmpq2, tmpd11, tmpd12, tmpd21, tmpd22, tmpq3, tmpq4, tmpq5, tmpq6
99        vadd.s32        \tmpq1, \in1,  \in2
100        vsub.s32        \tmpq2, \in1,  \in2
101        vmull.s32       \tmpq3, \tmpd11, d0[0]
102        vmull.s32       \tmpq4, \tmpd12, d0[0]
103.ifb \tmpq5
104        vrshrn.s64      \out1, \tmpq3, #14
105        vrshrn.s64      \out2, \tmpq4, #14
106        vmull.s32       \tmpq3, \tmpd21, d0[0]
107        vmull.s32       \tmpq4, \tmpd22, d0[0]
108        vrshrn.s64      \out3, \tmpq3, #14
109        vrshrn.s64      \out4, \tmpq4, #14
110.else
111        vmull.s32       \tmpq5, \tmpd21, d0[0]
112        vmull.s32       \tmpq6, \tmpd22, d0[0]
113        vrshrn.s64      \out1, \tmpq3, #14
114        vrshrn.s64      \out2, \tmpq4, #14
115        vrshrn.s64      \out3, \tmpq5, #14
116        vrshrn.s64      \out4, \tmpq6, #14
117.endif
118.endm
119
120@ out1 = in1 * coef1 - in2 * coef2
121@ out2 = in1 * coef2 + in2 * coef1
122@ out are 2 q registers, in are 2 d registers
123.macro mbutterfly_l out1, out2, in1, in2, coef1, coef2, neg=0
124        vmull.s32       \out1, \in1, \coef1
125        vmlsl.s32       \out1, \in2, \coef2
126.if \neg
127        vmov.s64        \out2, #0
128        vmlsl.s32       \out2, \in1, \coef2
129        vmlsl.s32       \out2, \in2, \coef1
130.else
131        vmull.s32       \out2, \in1, \coef2
132        vmlal.s32       \out2, \in2, \coef1
133.endif
134.endm
135
136@ out1,out2 = in1,in2 * coef1 - in3,in4 * coef2
137@ out3,out4 = in1,in2 * coef2 + in3,in4 * coef1
138@ out are 4 q registers, in are 4 d registers
139.macro dmbutterfly_l out1, out2, out3, out4, in1, in2, in3, in4, coef1, coef2
140        vmull.s32       \out1, \in1, \coef1
141        vmull.s32       \out2, \in2, \coef1
142        vmull.s32       \out3, \in1, \coef2
143        vmull.s32       \out4, \in2, \coef2
144        vmlsl.s32       \out1, \in3, \coef2
145        vmlsl.s32       \out2, \in4, \coef2
146        vmlal.s32       \out3, \in3, \coef1
147        vmlal.s32       \out4, \in4, \coef1
148.endm
149
150@ inout1 = (inout1 * coef1 - inout2 * coef2 + (1 << 13)) >> 14
151@ inout2 = (inout1 * coef2 + inout2 * coef1 + (1 << 13)) >> 14
152@ inout are 2 d registers, tmp are 2 q registers
153.macro mbutterfly inout1, inout2, coef1, coef2, tmp1, tmp2, neg=0
154        mbutterfly_l    \tmp1, \tmp2, \inout1, \inout2, \coef1, \coef2, \neg
155        vrshrn.s64      \inout1, \tmp1,  #14
156        vrshrn.s64      \inout2, \tmp2,  #14
157.endm
158
159@ Same as mbutterfly above, but treating the input in inout2 as zero
160.macro mbutterfly_h1 inout1, inout2, coef1, coef2, tmp1, tmp2
161        vmull.s32       \tmp1,   \inout1, \coef1
162        vmull.s32       \tmp2,   \inout1, \coef2
163        vrshrn.s64      \inout1, \tmp1,   #14
164        vrshrn.s64      \inout2, \tmp2,   #14
165.endm
166
167@ Same as mbutterfly above, but treating the input in inout1 as zero
168.macro mbutterfly_h2 inout1, inout2, coef1, coef2, tmp1, tmp2
169        vmov.s64        \tmp1,   #0
170        vmull.s32       \tmp2,   \inout2, \coef1
171        vmlsl.s32       \tmp1,   \inout2, \coef2
172        vrshrn.s64      \inout2, \tmp2,   #14
173        vrshrn.s64      \inout1, \tmp1,   #14
174.endm
175
176@ inout1,inout2 = (inout1,inout2 * coef1 - inout3,inout4 * coef2 + (1 << 13)) >> 14
177@ inout3,inout4 = (inout1,inout2 * coef2 + inout3,inout4 * coef1 + (1 << 13)) >> 14
178@ inout are 4 d registers, tmp are 4 q registers
179.macro dmbutterfly inout1, inout2, inout3, inout4, coef1, coef2, tmp1, tmp2, tmp3, tmp4
180        dmbutterfly_l   \tmp1, \tmp2, \tmp3, \tmp4, \inout1, \inout2, \inout3, \inout4, \coef1, \coef2
181        vrshrn.s64      \inout1, \tmp1,  #14
182        vrshrn.s64      \inout2, \tmp2,  #14
183        vrshrn.s64      \inout3, \tmp3,  #14
184        vrshrn.s64      \inout4, \tmp4,  #14
185.endm
186
187@ out1 = in1 + in2
188@ out2 = in1 - in2
189.macro butterfly out1, out2, in1, in2
190        vadd.s32        \out1, \in1, \in2
191        vsub.s32        \out2, \in1, \in2
192.endm
193
194@ out1 = in1 - in2
195@ out2 = in1 + in2
196.macro butterfly_r out1, out2, in1, in2
197        vsub.s32        \out1, \in1, \in2
198        vadd.s32        \out2, \in1, \in2
199.endm
200
201@ out1 = (in1 + in2 + (1 << 13)) >> 14
202@ out2 = (in1 - in2 + (1 << 13)) >> 14
203@ out are 2 d registers, in are 2 q registers, tmp are 2 q registers
204.macro butterfly_n out1, out2, in1, in2, tmp1, tmp2
205        vadd.s64        \tmp1, \in1, \in2
206        vsub.s64        \tmp2, \in1, \in2
207        vrshrn.s64      \out1, \tmp1,  #14
208        vrshrn.s64      \out2, \tmp2,  #14
209.endm
210
211@ out1,out2 = (in1,in2 + in3,in4 + (1 << 13)) >> 14
212@ out3,out4 = (in1,in2 - in3,in4 + (1 << 13)) >> 14
213@ out are 4 d registers, in are 4 q registers, tmp are 4 q registers
214.macro dbutterfly_n out1, out2, out3, out4, in1, in2, in3, in4, tmp1, tmp2, tmp3, tmp4
215        vadd.s64        \tmp1, \in1, \in3
216        vadd.s64        \tmp2, \in2, \in4
217        vsub.s64        \tmp3, \in1, \in3
218        vsub.s64        \tmp4, \in2, \in4
219        vrshrn.s64      \out1, \tmp1,  #14
220        vrshrn.s64      \out2, \tmp2,  #14
221        vrshrn.s64      \out3, \tmp3,  #14
222        vrshrn.s64      \out4, \tmp4,  #14
223.endm
224
225
226.macro iwht4_10 c0, c1, c2, c3, cd0, cd1, cd2, cd3, cd4, cd5, cd6, cd7
227        vadd.i32        \c0,  \c0,  \c1
228        vsub.i32        q11,  \c2,  \c3
229        vsub.i32        q10,  \c0,  q11
230        vshr.s32        q10,  q10,  #1
231        vsub.i32        \c2,  q10,  \c1
232        vsub.i32        \c1,  q10,  \c3
233        vadd.i32        \c3,  q11,  \c2
234        vsub.i32        \c0,  \c0,  \c1
235.endm
236
237.macro iwht4_12 c0, c1, c2, c3, cd0, cd1, cd2, cd3, cd4, cd5, cd6, cd7
238        iwht4_10        \c0, \c1, \c2, \c3, \cd0, \cd1, \cd2, \cd3, \cd4, \cd5, \cd6, \cd7
239.endm
240
241@ c0 == cd0,cd1, c1 == cd2,cd3
242.macro idct4_10 c0, c1, c2, c3, cd0, cd1, cd2, cd3, cd4, cd5, cd6, cd7
243        vmul.s32        q13,  \c1,  d1[1]
244        vmul.s32        q11,  \c1,  d1[0]
245        vadd.i32        q14,  \c0,  \c2
246        vsub.i32        q15,  \c0,  \c2
247        vmla.s32        q13,  \c3,  d1[0]
248        vmul.s32        q12,  q14,  d0[0]
249        vmul.s32        q10,  q15,  d0[0]
250        vmls.s32        q11,  \c3,  d1[1]
251        vrshr.s32       q13,  q13,  #14
252        vrshr.s32       q12,  q12,  #14
253        vrshr.s32       q10,  q10,  #14
254        vrshr.s32       q11,  q11,  #14
255        vadd.i32        \c0,  q12,  q13
256        vsub.i32        \c3,  q12,  q13
257        vadd.i32        \c1,  q10,  q11
258        vsub.i32        \c2,  q10,  q11
259.endm
260
261.macro idct4_12 c0, c1, c2, c3, cd0, cd1, cd2, cd3, cd4, cd5, cd6, cd7
262        vmull.s32       q13,  \cd2, d1[1]
263        vmull.s32       q15,  \cd3, d1[1]
264        vmull.s32       q11,  \cd2, d1[0]
265        vmull.s32       q3,   \cd3, d1[0]
266        vadd.i32        q14,  \c0,  \c2
267        vsub.i32        q2,   \c0,  \c2
268        vmlal.s32       q13,  \cd6, d1[0]
269        vmlal.s32       q15,  \cd7, d1[0]
270        vmull.s32       q12,  d28,  d0[0]
271        vmull.s32       q14,  d29,  d0[0]
272        vmull.s32       q10,  d4,   d0[0]
273        vmull.s32       q8,   d5,   d0[0]
274        vmlsl.s32       q11,  \cd6, d1[1]
275        vmlsl.s32       q3,   \cd7, d1[1]
276        vrshrn.s64      d26,  q13,  #14
277        vrshrn.s64      d27,  q15,  #14
278        vrshrn.s64      d24,  q12,  #14
279        vrshrn.s64      d25,  q14,  #14
280        vrshrn.s64      d20,  q10,  #14
281        vrshrn.s64      d21,  q8,   #14
282        vrshrn.s64      d22,  q11,  #14
283        vrshrn.s64      d23,  q3,   #14
284        vadd.i32        \c0,  q12,  q13
285        vsub.i32        \c3,  q12,  q13
286        vadd.i32        \c1,  q10,  q11
287        vsub.i32        \c2,  q10,  q11
288.endm
289
290.macro iadst4_10 c0, c1, c2, c3, cd0, cd1, cd2, cd3, cd4, cd5, cd6, cd7
291        vmul.s32        q10,  \c0,  d2[0]
292        vmla.s32        q10,  \c2,  d2[1]
293        vmla.s32        q10,  \c3,  d3[0]
294        vmul.s32        q11,  \c0,  d3[0]
295        vmls.s32        q11,  \c2,  d2[0]
296        vsub.s32        \c0,  \c0,  \c2
297        vmls.s32        q11,  \c3,  d2[1]
298        vadd.s32        \c0,  \c0,  \c3
299        vmul.s32        q13,  \c1,  d3[1]
300        vmul.s32        q12,  \c0,  d3[1]
301        vadd.s32        q14,  q10,  q13
302        vadd.s32        q15,  q11,  q13
303        vrshr.s32       \c0,  q14,  #14
304        vadd.s32        q10,  q10,  q11
305        vrshr.s32       \c1,  q15,  #14
306        vsub.s32        q10,  q10,  q13
307        vrshr.s32       \c2,  q12,  #14
308        vrshr.s32       \c3,  q10,  #14
309.endm
310
311.macro iadst4_12 c0, c1, c2, c3, cd0, cd1, cd2, cd3, cd4, cd5, cd6, cd7
312        vmull.s32       q10,  \cd0, d2[0]
313        vmull.s32       q4,   \cd1, d2[0]
314        vmlal.s32       q10,  \cd4, d2[1]
315        vmlal.s32       q4,   \cd5, d2[1]
316        vmlal.s32       q10,  \cd6, d3[0]
317        vmlal.s32       q4,   \cd7, d3[0]
318        vmull.s32       q11,  \cd0, d3[0]
319        vmull.s32       q5,   \cd1, d3[0]
320        vmlsl.s32       q11,  \cd4, d2[0]
321        vmlsl.s32       q5,   \cd5, d2[0]
322        vsub.s32        \c0,  \c0,  \c2
323        vmlsl.s32       q11,  \cd6, d2[1]
324        vmlsl.s32       q5,   \cd7, d2[1]
325        vadd.s32        \c0,  \c0,  \c3
326        vmull.s32       q13,  \cd2, d3[1]
327        vmull.s32       q6,   \cd3, d3[1]
328        vmull.s32       q12,  \cd0, d3[1]
329        vmull.s32       q7,   \cd1, d3[1]
330        vadd.s64        q14,  q10,  q13
331        vadd.s64        q2,   q4,   q6
332        vadd.s64        q15,  q11,  q13
333        vadd.s64        q3,   q5,   q6
334        vrshrn.s64      \cd1, q2,   #14
335        vrshrn.s64      \cd0, q14,  #14
336        vadd.s64        q10,  q10,  q11
337        vadd.s64        q4,   q4,   q5
338        vrshrn.s64      \cd3, q3,   #14
339        vrshrn.s64      \cd2, q15,  #14
340        vsub.s64        q10,  q10,  q13
341        vsub.s64        q4,   q4,   q6
342        vrshrn.s64      \cd4, q12,  #14
343        vrshrn.s64      \cd5, q7,   #14
344        vrshrn.s64      \cd6, q10,  #14
345        vrshrn.s64      \cd7, q4,   #14
346.endm
347
348@ The public functions in this file have got the following signature:
349@ void itxfm_add(uint8_t *dst, ptrdiff_t stride, int16_t *block, int eob);
350
351.macro itxfm_func4x4 txfm1, txfm2, bpp
352function ff_vp9_\txfm1\()_\txfm2\()_4x4_add_\bpp\()_neon, export=1
353.ifc \txfm1,\txfm2
354.ifc \txfm1,idct
355        movrel          r12, itxfm4_coeffs
356        vld1.16         {d0}, [r12,:64]
357        vmovl.s16       q0,  d0
358.endif
359.ifc \txfm1,iadst
360        movrel          r12, iadst4_coeffs
361        vld1.16         {d1}, [r12,:64]
362        vmovl.s16       q1,  d1
363.endif
364.else
365        movrel          r12, itxfm4_coeffs
366        vld1.16         {q0}, [r12,:128]
367        vmovl.s16       q1,  d1
368        vmovl.s16       q0,  d0
369.endif
370.if \bpp > 10
371.ifnc \txfm1\()_\txfm2,idct_idct
372        @ iadst4_12 needs q4-q7
373        vpush           {q4-q7}
374.endif
375.endif
376
377        vmov.i32        q14, #0
378        vmov.i32        q15, #0
379.ifc \txfm1\()_\txfm2,idct_idct
380        cmp             r3,  #1
381        bne             1f
382        @ DC-only for idct/idct
383        vld1.32         {d4[]},   [r2,:32]
384        vmull.s32       q2,  d4,  d0[0]
385        vrshrn.s64      d4,  q2,  #14
386        vmull.s32       q2,  d4,  d0[0]
387        vrshrn.s64      d4,  q2,  #14
388        vst1.32         {d30[0]}, [r2,:32]
389        vdup.32         q2,  d4[0]
390        vmov            q3,  q2
391        vmov            q8,  q2
392        vmov            q9,  q2
393        b               2f
394.endif
395
3961:
397        vld1.32         {q2-q3},   [r2,:128]
398        vst1.32         {q14-q15}, [r2,:128]!
399        vld1.32         {q8-q9},   [r2,:128]
400
401.ifc \txfm1,iwht
402        vshr.s32        q2,  q2,  #2
403        vshr.s32        q3,  q3,  #2
404        vshr.s32        q8,  q8,  #2
405        vshr.s32        q9,  q9,  #2
406.endif
407
408        vst1.16         {q14-q15}, [r2,:128]!
409        \txfm1\()4_\bpp q2,  q3,  q8,  q9,  d4,  d5,  d6,  d7,  d16, d17, d18, d19
410
411        @ Transpose 4x4 with 32 bit elements
412        vtrn.32         q2,  q3
413        vtrn.32         q8,  q9
414        vswp            d5,  d16
415        vswp            d7,  d18
416
417        \txfm2\()4_\bpp q2,  q3,  q8,  q9,  d4,  d5,  d6,  d7,  d16, d17, d18, d19
4182:
419        vmvn.u16        q15, #((0xffff << \bpp) & 0xffff)
420        vld1.16         {d0},  [r0,:64], r1
421        vld1.16         {d1},  [r0,:64], r1
422.ifnc \txfm1,iwht
423        vrshr.s32       q2,  q2,  #4
424        vrshr.s32       q3,  q3,  #4
425        vrshr.s32       q8,  q8,  #4
426        vrshr.s32       q9,  q9,  #4
427.endif
428        vaddw.u16       q2,  q2,  d0
429        vaddw.u16       q3,  q3,  d1
430        vld1.16         {d2},  [r0,:64], r1
431        vld1.16         {d3},  [r0,:64], r1
432        vqmovun.s32     d0,  q2
433        vqmovun.s32     d1,  q3
434        sub             r0,  r0,  r1, lsl #2
435
436        vaddw.u16       q8,  q8,  d2
437        vmin.u16        q0,  q0,  q15
438        vaddw.u16       q9,  q9,  d3
439        vst1.16         {d0},  [r0,:64], r1
440        vqmovun.s32     d2,  q8
441        vqmovun.s32     d3,  q9
442        vmin.u16        q1,  q1,  q15
443
444        vst1.16         {d1},  [r0,:64], r1
445        vst1.16         {d2},  [r0,:64], r1
446        vst1.16         {d3},  [r0,:64], r1
447
448.if \bpp > 10
449.ifnc \txfm1\()_\txfm2,idct_idct
450        vpop            {q4-q7}
451.endif
452.endif
453        bx              lr
454endfunc
455.endm
456
457.macro itxfm_funcs4x4 bpp
458itxfm_func4x4 idct,  idct,  \bpp
459itxfm_func4x4 iadst, idct,  \bpp
460itxfm_func4x4 idct,  iadst, \bpp
461itxfm_func4x4 iadst, iadst, \bpp
462itxfm_func4x4 iwht,  iwht,  \bpp
463.endm
464
465itxfm_funcs4x4 10
466itxfm_funcs4x4 12
467
468.macro idct8
469        dmbutterfly0    d16, d17, d24, d25, q8,  q12, q2, q4, d4, d5, d8, d9, q3, q2, q5, q4 @ q8 = t0a, q12 = t1a
470        dmbutterfly     d20, d21, d28, d29, d1[0], d1[1], q2,  q3,  q4,  q5 @ q10 = t2a, q14 = t3a
471        dmbutterfly     d18, d19, d30, d31, d2[0], d2[1], q2,  q3,  q4,  q5 @ q9  = t4a, q15 = t7a
472        dmbutterfly     d26, d27, d22, d23, d3[0], d3[1], q2,  q3,  q4,  q5 @ q13 = t5a, q11 = t6a
473
474        butterfly       q2,  q14, q8,  q14 @ q2 = t0, q14 = t3
475        butterfly       q3,  q10, q12, q10 @ q3 = t1, q10 = t2
476        butterfly       q4,  q13, q9,  q13 @ q4 = t4, q13 = t5a
477        butterfly       q5,  q11, q15, q11 @ q5 = t7, q11 = t6a
478
479        butterfly       q8,  q15, q2,  q5  @ q8 = out[0], q15 = out[7]
480
481        dmbutterfly0    d4,  d5,  d10, d11, q11, q13, q9,  q13, d18, d19, d26, d27, q2,  q5, q11, q12 @ q2 = t6, q5 = t5
482
483        butterfly       q11, q12, q14, q4  @ q11 = out[3], q12 = out[4]
484        butterfly       q9,  q14, q3,  q2  @ q9 = out[1],  q14 = out[6]
485        butterfly_r     q13, q10, q10, q5  @ q13 = out[5], q10 = out[2]
486.endm
487
488.macro iadst8
489        movrel          r12, iadst8_coeffs
490        vld1.16         {q1}, [r12,:128]!
491        vmovl.s16       q0,  d2
492        vmovl.s16       q1,  d3
493
494        dmbutterfly_l   q4,  q5,  q2,  q3,  d30, d31, d16, d17, d0[1], d0[0] @ q4,q5  = t1a, q2,q3 = t0a
495        dmbutterfly_l   q8,  q15, q6,  q7,  d22, d23, d24, d25, d2[1], d2[0] @ q8,q15 = t5a, q6,q7 = t4a
496
497        dbutterfly_n    d22, d23, d4,  d5,  q2,  q3,  q6,  q7,  q11, q12, q2,  q3 @ q11 = t0, q2 = t4
498
499        dbutterfly_n    d24, d25, d6,  d7,  q4,  q5,  q8,  q15, q12, q3,  q6,  q7 @ q12 = t1, q3 = t5
500
501        dmbutterfly_l   q6,  q7,  q4,  q5,  d26, d27, d20, d21, d1[1], d1[0] @ q6,q7 = t3a, q4,q5 = t2a
502        dmbutterfly_l   q10, q13, q8,  q15, d18, d19, d28, d29, d3[1], d3[0] @ q10,q13 = t7a, q8,q15 = t6a
503
504        dbutterfly_n    d18, d19, d8,  d9,  q4,  q5,  q8,  q15, q9,  q14, q4, q5 @ q9 = t2, q4 = t6
505        dbutterfly_n    d16, d17, d12, d13, q6,  q7,  q10, q13, q8,  q15, q6, q7 @ q8 = t3, q6 = t7
506
507        movrel          r12, idct_coeffs
508        vld1.16         {q0}, [r12,:128]
509        vmovl.s16       q1,  d1
510        vmovl.s16       q0,  d0
511
512        butterfly       q15, q12, q12, q8 @ q15 = -out[7], q12 = t3
513        vneg.s32        q15, q15          @ q15 = out[7]
514        butterfly       q8,  q9,  q11, q9 @ q8 = out[0], q9 = t2
515
516        dmbutterfly_l   q10, q11, q5,  q7,  d4,  d5,  d6,  d7,  d1[0], d1[1] @ q10,q11 = t5a, q5,q7 = t4a
517        dmbutterfly_l   q2,  q3,  q13, q14, d12, d13, d8,  d9,  d1[1], d1[0] @ q2,q3 = t6a, q13,q14 = t7a
518
519        dbutterfly_n    d28, d29, d8,  d9,  q10, q11, q13, q14, q4,  q6,  q10, q11 @ q14 = out[6], q4 = t7
520
521        dmbutterfly0    d22, d23, d24, d25, q9,  q12, q6, q13, d12, d13, d26, d27, q9, q10 @ q11 = -out[3], q12 = out[4]
522        vneg.s32        q11, q11      @ q11 = out[3]
523
524        dbutterfly_n    d18, d19, d4,  d5,  q5,  q7,  q2,  q3,  q9, q10, q2,  q3 @ q9 = -out[1], q2 = t6
525        vneg.s32        q9,  q9       @ q9 = out[1]
526
527        dmbutterfly0    d20, d21, d26, d27, q2,  q4,  q3, q5,  d6,  d7,  d10, d11, q6,  q7 @ q10 = out[2], q13 = -out[5]
528        vneg.s32        q13, q13      @ q13 = out[5]
529.endm
530
531function idct8x8_dc_add_neon
532        movrel          r12, idct_coeffs
533        vld1.16         {d0}, [r12,:64]
534
535        vmov.i32        q2,  #0
536        vmovl.s16       q0,  d0
537
538        vld1.32         {d16[]}, [r2,:32]
539        vmull.s32       q8,  d16, d0[0]
540        vrshrn.s64      d16, q8,  #14
541        vmull.s32       q8,  d16, d0[0]
542        vrshrn.s64      d16, q8,  #14
543        vdup.32         q8,  d16[0]
544        vst1.32         {d4[0]}, [r2,:32]
545
546        vrshr.s32       q8,  q8,  #5
547        vdup.s16        q15, r8
548
549        mov             r3,  r0
550        mov             r12, #8
5511:
552        @ Loop to add the constant from q8 into all 8x8 outputs
553        subs            r12, r12, #2
554        vld1.16         {q2},  [r0,:128], r1
555        vaddw.u16       q10, q8,  d4
556        vld1.16         {q3},  [r0,:128], r1
557        vaddw.u16       q11, q8,  d5
558        vaddw.u16       q12, q8,  d6
559        vaddw.u16       q13, q8,  d7
560        vqmovun.s32     d4,  q10
561        vqmovun.s32     d5,  q11
562        vqmovun.s32     d6,  q12
563        vqmovun.s32     d7,  q13
564        vmin.u16        q2,  q2,  q15
565        vst1.16         {q2},  [r3,:128], r1
566        vmin.u16        q3,  q3,  q15
567        vst1.16         {q3},  [r3,:128], r1
568        bne             1b
569
570        pop             {r4-r8,pc}
571endfunc
572.ltorg
573
574.macro itxfm8_1d_funcs txfm
575@ Read a vertical 4x8 slice out of a 8x8 matrix, do a transform on it,
576@ transpose into a horizontal 8x4 slice and store.
577@ r0 = dst (temp buffer)
578@ r1 = slice offset
579@ r2 = src
580function \txfm\()8_1d_4x8_pass1_neon
581        mov             r12, #32
582        vmov.s32        q2,  #0
583.irp i, 8, 9, 10, 11, 12, 13, 14, 15
584        vld1.32         {q\i}, [r2,:128]
585        vst1.32         {q2},  [r2,:128], r12
586.endr
587
588        \txfm\()8
589
590        @ Do two 4x4 transposes. Originally, q8-q15 contain the
591        @ 8 rows. Afterwards, q8-q11, q12-q15 contain the transposed
592        @ 4x4 blocks.
593        transpose32_q_2x_4x4 q8,  q9,  q10, q11, q12, q13, q14, q15, d16, d17, d18, d19, d20, d21, d22, d23, d24, d25, d26, d27, d28, d29, d30, d31
594
595        @ Store the transposed 4x4 blocks horizontally.
596        cmp             r1,  #4
597        beq             1f
598.irp i, 8, 12, 9, 13, 10, 14, 11, 15
599        vst1.32         {q\i}, [r0,:128]!
600.endr
601        bx              lr
6021:
603        @ Special case: For the last input column (r1 == 4),
604        @ which would be stored as the last row in the temp buffer,
605        @ don't store the first 4x4 block, but keep it in registers
606        @ for the first slice of the second pass (where it is the
607        @ last 4x4 block).
608.irp i, 12, 13, 14, 15
609        add             r0,  r0,  #16
610        vst1.32         {q\i}, [r0,:128]!
611.endr
612        vmov            q12, q8
613        vmov            q13, q9
614        vmov            q14, q10
615        vmov            q15, q11
616        bx              lr
617endfunc
618
619@ Read a vertical 4x8 slice out of a 8x8 matrix, do a transform on it,
620@ load the destination pixels (from a similar 4x8 slice), add and store back.
621@ r0 = dst
622@ r1 = dst stride
623@ r2 = src (temp buffer)
624@ r3 = slice offset
625function \txfm\()8_1d_4x8_pass2_neon
626        mov             r12, #32
627.irp i, 8, 9, 10, 11
628        vld1.32         {q\i}, [r2,:128], r12
629.endr
630        cmp             r3,  #0
631        beq             1f
632.irp i, 12, 13, 14, 15
633        vld1.32         {q\i}, [r2,:128], r12
634.endr
6351:
636
637        add             r3,  r0,  r1
638        lsl             r1,  r1,  #1
639        \txfm\()8
640
641        vdup.s16        q4,  r8
642.macro load_add_store coef0, coef1, coef2, coef3
643        vld1.16         {d4},   [r0,:64], r1
644        vld1.16         {d5},   [r3,:64], r1
645        vld1.16         {d6},   [r0,:64], r1
646        vld1.16         {d7},   [r3,:64], r1
647
648        vrshr.s32       \coef0, \coef0, #5
649        vrshr.s32       \coef1, \coef1, #5
650        vrshr.s32       \coef2, \coef2, #5
651        vrshr.s32       \coef3, \coef3, #5
652
653        vaddw.u16       \coef0, \coef0, d4
654        vaddw.u16       \coef1, \coef1, d5
655        vaddw.u16       \coef2, \coef2, d6
656        vaddw.u16       \coef3, \coef3, d7
657
658        sub             r0,  r0,  r1, lsl #1
659        sub             r3,  r3,  r1, lsl #1
660
661        vqmovun.s32     d4,  \coef0
662        vqmovun.s32     d5,  \coef1
663        vqmovun.s32     d6,  \coef2
664        vqmovun.s32     d7,  \coef3
665
666        vmin.u16        q2,  q2,  q4
667        vmin.u16        q3,  q3,  q4
668
669        vst1.16         {d4},  [r0,:64], r1
670        vst1.16         {d5},  [r3,:64], r1
671        vst1.16         {d6},  [r0,:64], r1
672        vst1.16         {d7},  [r3,:64], r1
673.endm
674        load_add_store  q8,  q9,  q10, q11
675        load_add_store  q12, q13, q14, q15
676.purgem load_add_store
677
678        bx              lr
679endfunc
680.endm
681
682itxfm8_1d_funcs idct
683itxfm8_1d_funcs iadst
684
685.macro itxfm_func8x8 txfm1, txfm2
686function vp9_\txfm1\()_\txfm2\()_8x8_add_16_neon
687.ifc \txfm1\()_\txfm2,idct_idct
688        cmp             r3,  #1
689        beq             idct8x8_dc_add_neon
690.endif
691.ifnc \txfm1\()_\txfm2,idct_idct
692        vpush           {q4-q7}
693.else
694        vpush           {q4-q5}
695.endif
696
697        @ Align the stack, allocate a temp buffer
698T       mov             r7,  sp
699T       and             r7,  r7,  #15
700A       and             r7,  sp,  #15
701        add             r7,  r7,  #256
702        sub             sp,  sp,  r7
703
704        mov             r4,  r0
705        mov             r5,  r1
706        mov             r6,  r2
707
708.ifc \txfm1,idct
709        movrel          r12, idct_coeffs
710        vld1.16         {q0}, [r12,:128]
711        vmovl.s16       q1,  d1
712        vmovl.s16       q0,  d0
713.endif
714
715.irp i, 0, 4
716        add             r0,  sp,  #(\i*32)
717.ifc \txfm1\()_\txfm2,idct_idct
718.if \i == 4
719        cmp             r3,  #12
720        ble             1f
721.endif
722.endif
723        mov             r1,  #\i
724        add             r2,  r6,  #(\i*4)
725        bl              \txfm1\()8_1d_4x8_pass1_neon
726.endr
727.ifc \txfm1\()_\txfm2,idct_idct
728        b               3f
7291:
730        @ For all-zero slices in pass 1, set q12-q15 to zero, for the in-register
731        @ passthrough of coefficients to pass 2 and clear the end of the temp buffer
732        vmov.i32        q12, #0
733        vmov.i32        q13, #0
734        vmov.i32        q14, #0
735        vmov.i32        q15, #0
736.rept 4
737        vst1.32         {q12-q13}, [r0,:128]!
738.endr
7393:
740.endif
741.ifc \txfm1\()_\txfm2,iadst_idct
742        movrel          r12, idct_coeffs
743        vld1.16         {q0}, [r12,:128]
744        vmovl.s16       q1,  d1
745        vmovl.s16       q0,  d0
746.endif
747.irp i, 0, 4
748        add             r0,  r4,  #(\i*2)
749        mov             r1,  r5
750        add             r2,  sp,  #(\i*4)
751        mov             r3,  #\i
752        bl              \txfm2\()8_1d_4x8_pass2_neon
753.endr
754
755        add             sp,  sp,  r7
756.ifnc \txfm1\()_\txfm2,idct_idct
757        vpop            {q4-q7}
758.else
759        vpop            {q4-q5}
760.endif
761        pop             {r4-r8,pc}
762endfunc
763
764function ff_vp9_\txfm1\()_\txfm2\()_8x8_add_10_neon, export=1
765        push            {r4-r8,lr}
766        movw            r8,  #0x03ff
767        b               vp9_\txfm1\()_\txfm2\()_8x8_add_16_neon
768endfunc
769
770function ff_vp9_\txfm1\()_\txfm2\()_8x8_add_12_neon, export=1
771        push            {r4-r8,lr}
772        movw            r8,  #0x0fff
773        b               vp9_\txfm1\()_\txfm2\()_8x8_add_16_neon
774endfunc
775.endm
776
777itxfm_func8x8 idct,  idct
778itxfm_func8x8 iadst, idct
779itxfm_func8x8 idct,  iadst
780itxfm_func8x8 iadst, iadst
781
782function idct16x16_dc_add_neon
783        movrel          r12, idct_coeffs
784        vld1.16         {d0}, [r12,:64]
785
786        vmov.i32        q2,  #0
787        vmovl.s16       q0,  d0
788
789        vld1.32         {d16[]}, [r2,:32]
790        vmull.s32       q8,  d16, d0[0]
791        vrshrn.s64      d16, q8,  #14
792        vmull.s32       q8,  d16, d0[0]
793        vrshrn.s64      d16, q8,  #14
794        vdup.32         q8,  d16[0]
795        vst1.32         {d4[0]}, [r2,:32]
796
797        vrshr.s32       q8,  q8,  #6
798        vdup.s16        q15, r9
799
800        mov             r3,  r0
801        mov             r12, #16
8021:
803        @ Loop to add the constant from q8 into all 16x16 outputs
804        subs            r12, r12, #2
805        vld1.16         {q0-q1},  [r0,:128], r1
806        vaddw.u16       q9,  q8,  d0
807        vaddw.u16       q10, q8,  d1
808        vld1.16         {q2-q3},  [r0,:128], r1
809        vaddw.u16       q11, q8,  d2
810        vaddw.u16       q12, q8,  d3
811        vaddw.u16       q13, q8,  d4
812        vaddw.u16       q14, q8,  d5
813        vqmovun.s32     d0,  q9
814        vaddw.u16       q9,  q8,  d6
815        vqmovun.s32     d1,  q10
816        vaddw.u16       q10, q8,  d7
817        vqmovun.s32     d2,  q11
818        vqmovun.s32     d3,  q12
819        vqmovun.s32     d4,  q13
820        vqmovun.s32     d5,  q14
821        vmin.u16        q0,  q0,  q15
822        vmin.u16        q1,  q1,  q15
823        vqmovun.s32     d6,  q9
824        vqmovun.s32     d7,  q10
825        vst1.16         {q0-q1},  [r3,:128], r1
826        vmin.u16        q2,  q2,  q15
827        vmin.u16        q3,  q3,  q15
828        vst1.16         {q2-q3},  [r3,:128], r1
829        bne             1b
830
831        pop             {r4-r9,pc}
832endfunc
833.ltorg
834
835.macro idct16_end
836        butterfly       d18, d11, d8,  d11               @ d18 = t0a,  d11 = t7a
837        butterfly       d19, d22, d9,  d22               @ d19 = t1a,  d22 = t6
838        butterfly       d8,  d26, d20, d26               @ d8  = t2a,  d26 = t5
839        butterfly       d9,  d10, d28, d10               @ d9  = t3a,  d10 = t4
840        butterfly       d20, d28, d16, d24               @ d20 = t8a,  d28 = t11a
841        butterfly       d24, d21, d23, d21               @ d24 = t9,   d21 = t10
842        butterfly       d23, d27, d25, d27               @ d23 = t14,  d27 = t13
843        butterfly       d25, d29, d29, d17               @ d25 = t15a, d29 = t12a
844
845        mbutterfly0     d27, d21, d27, d21, d16, d30, q8, q15 @ d27 = t13a, d21 = t10a
846        mbutterfly0     d29, d28, d29, d28, d16, d30, q8, q15 @ d29 = t12,  d28 = t11
847
848        vswp            d27, d29                         @ d27 = t12, d29 = t13a
849        vswp            d28, d27                         @ d28 = t12, d27 = t11
850        butterfly       d16, d31, d18, d25               @ d16 = out[0], d31 = out[15]
851        butterfly       d17, d30, d19, d23               @ d17 = out[1], d30 = out[14]
852        butterfly_r     d25, d22, d22, d24               @ d25 = out[9], d22 = out[6]
853        butterfly       d23, d24, d11, d20               @ d23 = out[7], d24 = out[8]
854        butterfly       d18, d29, d8,  d29               @ d18 = out[2], d29 = out[13]
855        butterfly       d19, d28, d9,  d28               @ d19 = out[3], d28 = out[12]
856        vmov            d8,  d21                         @ d8  = t10a
857        butterfly       d20, d27, d10, d27               @ d20 = out[4], d27 = out[11]
858        butterfly       d21, d26, d26, d8                @ d21 = out[5], d26 = out[10]
859        bx              lr
860.endm
861
862function idct16
863        mbutterfly0     d16, d24, d16, d24, d8, d10, q4,  q5 @ d16 = t0a,  d24 = t1a
864        mbutterfly      d20, d28, d1[0], d1[1], q4,  q5  @ d20 = t2a,  d28 = t3a
865        mbutterfly      d18, d30, d2[0], d2[1], q4,  q5  @ d18 = t4a,  d30 = t7a
866        mbutterfly      d26, d22, d3[0], d3[1], q4,  q5  @ d26 = t5a,  d22 = t6a
867        mbutterfly      d17, d31, d4[0], d4[1], q4,  q5  @ d17 = t8a,  d31 = t15a
868        mbutterfly      d25, d23, d5[0], d5[1], q4,  q5  @ d25 = t9a,  d23 = t14a
869        mbutterfly      d21, d27, d6[0], d6[1], q4,  q5  @ d21 = t10a, d27 = t13a
870        mbutterfly      d29, d19, d7[0], d7[1], q4,  q5  @ d29 = t11a, d19 = t12a
871
872        butterfly       d8,  d28, d16, d28               @ d8  = t0,   d28 = t3
873        butterfly       d9,  d20, d24, d20               @ d9  = t1,   d20 = t2
874        butterfly       d10, d26, d18, d26               @ d10 = t4,   d26 = t5
875        butterfly       d11, d22, d30, d22               @ d11 = t7,   d22 = t6
876        butterfly       d16, d25, d17, d25               @ d16 = t8,   d25 = t9
877        butterfly       d24, d21, d29, d21               @ d24 = t11,  d21 = t10
878        butterfly       d17, d27, d19, d27               @ d17 = t12,  d27 = t13
879        butterfly       d29, d23, d31, d23               @ d29 = t15,  d23 = t14
880
881        mbutterfly0     d22, d26, d22, d26, d18, d30, q9,  q15  @ d22 = t6a, d26 = t5a
882        mbutterfly      d23, d25, d1[0], d1[1], q9,  q15        @ d23 = t9a,  d25 = t14a
883        mbutterfly      d27, d21, d1[0], d1[1], q9,  q15, neg=1 @ d27 = t13a, d21 = t10a
884        idct16_end
885endfunc
886
887function idct16_half
888        mbutterfly0_h   d16, d24, d16, d24, d8, d10, q4,  q5 @ d16 = t0a,  d24 = t1a
889        mbutterfly_h1   d20, d28, d1[0], d1[1], q4,  q5  @ d20 = t2a,  d28 = t3a
890        mbutterfly_h1   d18, d30, d2[0], d2[1], q4,  q5  @ d18 = t4a,  d30 = t7a
891        mbutterfly_h2   d26, d22, d3[0], d3[1], q4,  q5  @ d26 = t5a,  d22 = t6a
892        mbutterfly_h1   d17, d31, d4[0], d4[1], q4,  q5  @ d17 = t8a,  d31 = t15a
893        mbutterfly_h2   d25, d23, d5[0], d5[1], q4,  q5  @ d25 = t9a,  d23 = t14a
894        mbutterfly_h1   d21, d27, d6[0], d6[1], q4,  q5  @ d21 = t10a, d27 = t13a
895        mbutterfly_h2   d29, d19, d7[0], d7[1], q4,  q5  @ d29 = t11a, d19 = t12a
896
897        butterfly       d8,  d28, d16, d28               @ d8  = t0,   d28 = t3
898        butterfly       d9,  d20, d24, d20               @ d9  = t1,   d20 = t2
899        butterfly       d10, d26, d18, d26               @ d10 = t4,   d26 = t5
900        butterfly       d11, d22, d30, d22               @ d11 = t7,   d22 = t6
901        butterfly       d16, d25, d17, d25               @ d16 = t8,   d25 = t9
902        butterfly       d24, d21, d29, d21               @ d24 = t11,  d21 = t10
903        butterfly       d17, d27, d19, d27               @ d17 = t12,  d27 = t13
904        butterfly       d29, d23, d31, d23               @ d29 = t15,  d23 = t14
905
906        mbutterfly0     d22, d26, d22, d26, d18, d30, q9,  q15  @ d22 = t6a, d26 = t5a
907        mbutterfly      d23, d25, d1[0], d1[1], q9,  q15        @ d23 = t9a,  d25 = t14a
908        mbutterfly      d27, d21, d1[0], d1[1], q9,  q15, neg=1 @ d27 = t13a, d21 = t10a
909        idct16_end
910endfunc
911
912function idct16_quarter
913        vmov.s64        q12, #0
914        vmull.s32       q4,  d17, d4[0]
915        vmull.s32       q5,  d18, d2[1]
916        vmull.s32       q15, d18, d2[0]
917        vmlsl.s32       q12, d19, d7[1]
918        vmull.s32       q14, d17, d4[1]
919        vmull.s32       q13, d19, d7[0]
920        vmull.s32       q11, d16, d0[0]
921        vrshrn.s64      d16, q4,  #14
922        vrshrn.s64      d11, q5,  #14
923        vrshrn.s64      d10, q15, #14
924        vrshrn.s64      d24, q12, #14
925        vrshrn.s64      d29, q14, #14
926        vrshrn.s64      d17, q13, #14
927        vrshrn.s64      d28, q11, #14
928
929        mbutterfly_l    q10, q11, d17, d24, d1[0], d1[1], neg=1
930        mbutterfly_l    q9,  q15, d29, d16, d1[0], d1[1]
931        vrshrn.s64      d27, q10, #14
932        vrshrn.s64      d21, q11, #14
933        vrshrn.s64      d23, q9,  #14
934        vrshrn.s64      d25, q15, #14
935        vmov            d8,  d28
936        vmov            d9,  d28
937        mbutterfly0     d22, d26, d11, d10, d18, d30, q9,  q15
938        vmov            d20, d28
939        idct16_end
940endfunc
941
942function iadst16
943        movrel          r12, iadst16_coeffs
944        vld1.16         {q0},  [r12,:128]!
945        vmovl.s16       q1,  d1
946        vmovl.s16       q0,  d0
947
948        mbutterfly_l    q3,  q2,  d31, d16, d0[1], d0[0] @ q3  = t1,   q2  = t0
949        mbutterfly_l    q5,  q4,  d23, d24, d2[1], d2[0] @ q5  = t9,   q4  = t8
950        butterfly_n     d31, d24, q3,  q5,  q6,  q5      @ d31 = t1a,  d24 = t9a
951        mbutterfly_l    q7,  q6,  d29, d18, d1[1], d1[0] @ q7  = t3,   q6  = t2
952        butterfly_n     d16, d23, q2,  q4,  q3,  q4      @ d16 = t0a,  d23 = t8a
953        mbutterfly_l    q3,  q2,  d21, d26, d3[1], d3[0] @ q3  = t11,  q2  = t10
954
955        vld1.16         {q0},  [r12,:128]!
956        butterfly_n     d29, d26, q7,  q3,  q4,  q3      @ d29 = t3a,  d26 = t11a
957        vmovl.s16       q1,  d1
958        vmovl.s16       q0,  d0
959        mbutterfly_l    q5,  q4,  d27, d20, d0[1], d0[0] @ q5  = t5,   q4  = t4
960        butterfly_n     d18, d21, q6,  q2,  q3,  q2      @ d18 = t2a,  d21 = t10a
961
962        mbutterfly_l    q7,  q6,  d19, d28, d2[1], d2[0] @ q7  = t13,  q6  = t12
963        butterfly_n     d20, d28, q5,  q7,  q2,  q7      @ d20 = t5a,  d28 = t13a
964        mbutterfly_l    q3,  q2,  d25, d22, d1[1], d1[0] @ q3  = t7,   q2  = t6
965        butterfly_n     d27, d19, q4,  q6,  q5,  q6      @ d27 = t4a,  d19 = t12a
966
967        mbutterfly_l    q5,  q4,  d17, d30, d3[1], d3[0] @ q5  = t15,  q4  = t14
968        movrel          r12, idct_coeffs
969        vld1.16         {q0}, [r12,:128]
970        vmovl.s16       q1,  d1
971        vmovl.s16       q0,  d0
972        butterfly_n     d22, d30, q3,  q5,  q6,  q5      @ d22 = t7a,  d30 = t15a
973        mbutterfly_l    q7,  q6,  d23, d24, d2[0], d2[1] @ q7  = t9,   q6  = t8
974        butterfly_n     d25, d17, q2,  q4,  q3,  q4      @ d25 = t6a,  d17 = t14a
975
976        mbutterfly_l    q2,  q3,  d28, d19, d2[1], d2[0] @ q2  = t12,  q3  = t13
977        butterfly_n     d23, d19, q6,  q2,  q4,  q2      @ d23 = t8a,  d19 = t12a
978        mbutterfly_l    q5,  q4,  d21, d26, d3[0], d3[1] @ q5  = t11,  q4  = t10
979        butterfly_r     d4,  d27, d16, d27               @ d4  = t4,   d27 = t0
980        butterfly_n     d24, d28, q7,  q3,  q6,  q3      @ d24 = t9a,  d28 = t13a
981
982        mbutterfly_l    q6,  q7,  d30, d17, d3[1], d3[0] @ q6  = t14,  q7  = t15
983        butterfly_r     d5,  d20, d31, d20               @ d5  = t5,   d20 = t1
984        butterfly_n     d21, d17, q4,  q6,  q3,  q6      @ d21 = t10a, d17 = t14a
985        butterfly_n     d26, d30, q5,  q7,  q4,  q7      @ d26 = t11a, d30 = t15a
986
987        butterfly_r     d6,  d25, d18, d25               @ d6  = t6,   d25 = t2
988        butterfly_r     d7,  d22, d29, d22               @ d7  = t7,   d22 = t3
989
990        mbutterfly_l    q5,  q4,  d19, d28, d1[0], d1[1] @ q5  = t13,  q4  = t12
991        mbutterfly_l    q6,  q7,  d30, d17, d1[1], d1[0] @ q6  = t14,  q7  = t15
992
993        butterfly_n     d18, d30, q4,  q6,  q8,  q6      @ d18 = out[2],   d30 = t14a
994        butterfly_n     d29, d17, q5,  q7,  q6,  q7      @ d29 = -out[13], d17 = t15a
995        vneg.s32        d29, d29                         @ d29 = out[13]
996
997        mbutterfly_l    q5,  q4,  d4,  d5,  d1[0], d1[1] @ q5  = t5a,  q4  = t4a
998        mbutterfly_l    q6,  q7,  d7,  d6,  d1[1], d1[0] @ q6  = t6a,  q7  = t7a
999
1000        butterfly       d2,  d6,  d27, d25               @ d2 = out[0], d6 = t2a
1001        butterfly       d3,  d7,  d23, d21               @ d3 =-out[1], d7 = t10
1002
1003        butterfly_n     d19, d31, q4,  q6,  q2,  q4      @ d19 = -out[3],  d31 = t6
1004        vneg.s32        d19, d19                         @ d19 = out[3]
1005        butterfly_n     d28, d16, q5,  q7,  q2,  q5      @ d28 = out[12],  d16 = t7
1006
1007        butterfly       d5,  d8,  d20, d22               @ d5 =-out[15],d8 = t3a
1008        butterfly       d4,  d9,  d24, d26               @ d4 = out[14],d9 = t11
1009
1010        mbutterfly0     d23, d24, d6,  d8,  d10, d11, q6,  q7, 1 @ d23 = out[7], d24 = out[8]
1011        mbutterfly0     d20, d27, d16, d31, d10, d11, q6,  q7    @ d20 = out[4], d27 = out[11]
1012        mbutterfly0     d22, d25, d9,  d7,  d10, d11, q6,  q7    @ d22 = out[6], d25 = out[9]
1013        mbutterfly0     d21, d26, d30, d17, d10, d11, q6,  q7, 1 @ d21 = out[5], d26 = out[10]
1014
1015        vneg.s32        d31, d5                          @ d31 = out[15]
1016        vneg.s32        d17, d3                          @ d17 = out[1]
1017
1018        vmov            d16, d2
1019        vmov            d30, d4
1020        bx              lr
1021endfunc
1022
1023.macro itxfm16_1d_funcs txfm, suffix
1024@ Read a vertical 2x16 slice out of a 16x16 matrix, do a transform on it,
1025@ transpose into a horizontal 16x2 slice and store.
1026@ r0 = dst (temp buffer)
1027@ r2 = src
1028function \txfm\()16_1d_2x16_pass1\suffix\()_neon
1029        push            {lr}
1030
1031        mov             r12, #64
1032        vmov.s32        q4,  #0
1033.ifb \suffix
1034.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
1035        vld1.32         {d\i}, [r2,:64]
1036        vst1.32         {d8},  [r2,:64], r12
1037.endr
1038.endif
1039.ifc \suffix,_quarter
1040.irp i, 16, 17, 18, 19
1041        vld1.32         {d\i}, [r2,:64]
1042        vst1.32         {d8},  [r2,:64], r12
1043.endr
1044.endif
1045.ifc \suffix,_half
1046.irp i, 16, 17, 18, 19, 20, 21, 22, 23
1047        vld1.32         {d\i}, [r2,:64]
1048        vst1.32         {d8},  [r2,:64], r12
1049.endr
1050.endif
1051
1052        bl              \txfm\()16\suffix
1053
1054        @ Do eight 2x2 transposes. Originally, d16-d31 contain the
1055        @ 16 rows. Afterwards, d16-d17, d18-d19 etc contain the eight
1056        @ transposed 2x2 blocks.
1057        transpose32_8x_2x2 d16, d17, d18, d19, d20, d21, d22, d23, d24, d25, d26, d27, d28, d29, d30, d31
1058
1059        @ Store the transposed 2x2 blocks horizontally.
1060.irp i, 16, 18, 20, 22, 24, 26, 28, 30, 17, 19, 21, 23, 25, 27, 29, 31
1061        vst1.32         {d\i}, [r0,:64]!
1062.endr
1063        pop             {pc}
1064endfunc
1065
1066@ Read a vertical 2x16 slice out of a 16x16 matrix, do a transform on it,
1067@ load the destination pixels (from a similar 2x16 slice), add and store back.
1068@ r0 = dst
1069@ r1 = dst stride
1070@ r2 = src (temp buffer)
1071function \txfm\()16_1d_2x16_pass2\suffix\()_neon
1072        push            {lr}
1073
1074        mov             r12, #64
1075.ifb \suffix
1076.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
1077        vld1.16         {d\i}, [r2,:64], r12
1078.endr
1079.endif
1080.ifc \suffix,_quarter
1081.irp i, 16, 17, 18, 19, 20
1082        vld1.16         {d\i}, [r2,:64], r12
1083.endr
1084.endif
1085.ifc \suffix,_half
1086.irp i, 16, 17, 18, 19, 20, 21, 22, 23
1087        vld1.16         {d\i}, [r2,:64], r12
1088.endr
1089.endif
1090
1091        add             r3,  r0,  r1
1092        lsl             r1,  r1,  #1
1093        bl              \txfm\()16\suffix
1094
1095.macro load_add_store coef0, coef1, coef2, coef3
1096        vrshr.s32       \coef0, \coef0, #6
1097        vrshr.s32       \coef1, \coef1, #6
1098
1099        vld1.32         {d8[]},   [r0,:32], r1
1100        vld1.32         {d8[1]},  [r3,:32], r1
1101        vrshr.s32       \coef2, \coef2, #6
1102        vrshr.s32       \coef3, \coef3, #6
1103        vld1.32         {d9[]},   [r0,:32], r1
1104        vld1.32         {d9[1]},  [r3,:32], r1
1105        vaddw.u16       \coef0, \coef0, d8
1106        vld1.32         {d10[]},  [r0,:32], r1
1107        vld1.32         {d10[1]}, [r3,:32], r1
1108        vaddw.u16       \coef1, \coef1, d9
1109        vld1.32         {d11[]},  [r0,:32], r1
1110        vld1.32         {d11[1]}, [r3,:32], r1
1111
1112        vqmovun.s32     d8,  \coef0
1113        vdup.s16        q8,  r9
1114        vqmovun.s32     d9,  \coef1
1115        sub             r0,  r0,  r1, lsl #2
1116        sub             r3,  r3,  r1, lsl #2
1117        vaddw.u16       \coef2, \coef2, d10
1118        vaddw.u16       \coef3, \coef3, d11
1119        vmin.u16        q4,  q4,  q8
1120        vst1.32         {d8[0]},  [r0,:32], r1
1121        vst1.32         {d8[1]},  [r3,:32], r1
1122        vqmovun.s32     d10, \coef2
1123        vst1.32         {d9[0]},  [r0,:32], r1
1124        vst1.32         {d9[1]},  [r3,:32], r1
1125        vqmovun.s32     d11, \coef3
1126        vmin.u16        q5,  q5,  q8
1127
1128        vst1.32         {d10[0]}, [r0,:32], r1
1129        vst1.32         {d10[1]}, [r3,:32], r1
1130        vst1.32         {d11[0]}, [r0,:32], r1
1131        vst1.32         {d11[1]}, [r3,:32], r1
1132.endm
1133        load_add_store  q8,  q9,  q10, q11
1134        load_add_store  q12, q13, q14, q15
1135.purgem load_add_store
1136
1137        pop             {pc}
1138endfunc
1139.endm
1140
1141itxfm16_1d_funcs idct
1142itxfm16_1d_funcs iadst
1143itxfm16_1d_funcs idct, _quarter
1144itxfm16_1d_funcs idct, _half
1145.ltorg
1146
1147@ This is the minimum eob value for each subpartition, in increments of 2
1148const min_eob_idct_idct_16, align=4
1149        .short  0, 3, 10, 22, 38, 62, 89, 121
1150endconst
1151
1152.macro itxfm_func16x16 txfm1, txfm2
1153function vp9_\txfm1\()_\txfm2\()_16x16_add_16_neon
1154.ifc \txfm1\()_\txfm2,idct_idct
1155        cmp             r3,  #1
1156        beq             idct16x16_dc_add_neon
1157.endif
1158.ifnc \txfm1\()_\txfm2,idct_idct
1159        vpush           {q4-q7}
1160.else
1161        vpush           {q4-q5}
1162.endif
1163
1164        @ Align the stack, allocate a temp buffer
1165T       mov             r7,  sp
1166T       and             r7,  r7,  #15
1167A       and             r7,  sp,  #15
1168        add             r7,  r7,  #1024
1169        sub             sp,  sp,  r7
1170
1171        mov             r4,  r0
1172        mov             r5,  r1
1173        mov             r6,  r2
1174
1175.ifc \txfm1,idct
1176        movrel          r12, idct_coeffs
1177        vld1.16         {q0-q1}, [r12,:128]
1178        vmovl.s16       q2,  d2
1179        vmovl.s16       q3,  d3
1180        vmovl.s16       q1,  d1
1181        vmovl.s16       q0,  d0
1182.endif
1183
1184.ifc \txfm1\()_\txfm2,idct_idct
1185        cmp             r3,  #10
1186        ble             idct16x16_quarter_add_16_neon
1187        cmp             r3,  #38
1188        ble             idct16x16_half_add_16_neon
1189
1190        movrel          r8,  min_eob_idct_idct_16 + 2
1191.endif
1192
1193.irp i, 0, 2, 4, 6, 8, 10, 12, 14
1194        add             r0,  sp,  #(\i*64)
1195.ifc \txfm1\()_\txfm2,idct_idct
1196.if \i > 0
1197        ldrh_post       r1,  r8,  #2
1198        cmp             r3,  r1
1199        it              le
1200        movle           r1,  #(16 - \i)/2
1201        ble             1f
1202.endif
1203.endif
1204        add             r2,  r6,  #(\i*4)
1205        bl              \txfm1\()16_1d_2x16_pass1_neon
1206.endr
1207
1208.ifc \txfm1\()_\txfm2,idct_idct
1209        b               3f
12101:
1211        vmov.i32        q14, #0
1212        vmov.i32        q15, #0
12132:
1214        subs            r1,  r1,  #1
1215        @ Unroll for 2 lines
1216.rept 2
1217        @ Fill one line with zeros
1218        vst1.32         {q14-q15}, [r0,:128]!
1219        vst1.32         {q14-q15}, [r0,:128]!
1220.endr
1221        bne             2b
12223:
1223.endif
1224
1225.ifc \txfm1\()_\txfm2,iadst_idct
1226        movrel          r12, idct_coeffs
1227        vld1.16         {q0-q1}, [r12,:128]
1228        vmovl.s16       q2,  d2
1229        vmovl.s16       q3,  d3
1230        vmovl.s16       q1,  d1
1231        vmovl.s16       q0,  d0
1232.endif
1233.irp i, 0, 2, 4, 6, 8, 10, 12, 14
1234        add             r0,  r4,  #(\i*2)
1235        mov             r1,  r5
1236        add             r2,  sp,  #(\i*4)
1237        bl              \txfm2\()16_1d_2x16_pass2_neon
1238.endr
1239
1240        add             sp,  sp,  r7
1241.ifnc \txfm1\()_\txfm2,idct_idct
1242        vpop            {q4-q7}
1243.else
1244        vpop            {q4-q5}
1245.endif
1246        pop             {r4-r9,pc}
1247endfunc
1248
1249function ff_vp9_\txfm1\()_\txfm2\()_16x16_add_10_neon, export=1
1250        push            {r4-r9,lr}
1251        movw            r9,  #0x03ff
1252        b               vp9_\txfm1\()_\txfm2\()_16x16_add_16_neon
1253endfunc
1254
1255function ff_vp9_\txfm1\()_\txfm2\()_16x16_add_12_neon, export=1
1256        push            {r4-r9,lr}
1257        movw            r9,  #0x0fff
1258        b               vp9_\txfm1\()_\txfm2\()_16x16_add_16_neon
1259endfunc
1260.endm
1261
1262itxfm_func16x16 idct,  idct
1263itxfm_func16x16 iadst, idct
1264itxfm_func16x16 idct,  iadst
1265itxfm_func16x16 iadst, iadst
1266.ltorg
1267
1268.macro idct16_partial size
1269function idct16x16_\size\()_add_16_neon
1270.irp i, 0, 2
1271        add             r0,  sp,  #(\i*64)
1272.ifc \size,quarter
1273.if \i == 2
1274        cmp             r3,  #3
1275        ble             1f
1276.endif
1277.endif
1278        add             r2,  r6,  #(\i*4)
1279        bl              idct16_1d_2x16_pass1_\size\()_neon
1280.endr
1281
1282.ifc \size,half
1283.irp i, 4, 6
1284        add             r0,  sp,  #(\i*64)
1285.if \i == 6
1286        cmp             r3,  #22
1287        ble             1f
1288.endif
1289        add             r2,  r6,  #(\i*4)
1290        bl              idct16_1d_2x16_pass1_\size\()_neon
1291.endr
1292.endif
1293
1294        b               3f
12951:
1296        vmov.i32        q14, #0
1297        vmov.i32        q15, #0
1298
1299        @ Unroll for 2 lines
1300.rept 2
1301        @ Fill one line with zeros
1302        vst1.32         {q14-q15}, [r0,:128]!
1303        vst1.32         {q14-q15}, [r0,:128]!
1304.endr
1305
13063:
1307
1308.irp i, 0, 2, 4, 6, 8, 10, 12, 14
1309        add             r0,  r4,  #(\i*2)
1310        mov             r1,  r5
1311        add             r2,  sp,  #(\i*4)
1312        bl              idct16_1d_2x16_pass2_\size\()_neon
1313.endr
1314
1315        add             sp,  sp,  r7
1316        vpop            {q4-q5}
1317        pop             {r4-r9,pc}
1318endfunc
1319.endm
1320
1321idct16_partial quarter
1322idct16_partial half
1323
1324function idct32x32_dc_add_neon
1325        movrel          r12, idct_coeffs
1326        vld1.16         {d0}, [r12,:64]
1327
1328        vmov.i32        q2,  #0
1329        vmovl.s16       q0,  d0
1330
1331        vld1.32         {d16[]}, [r2,:32]
1332        vmull.s32       q8,  d16, d0[0]
1333        vrshrn.s64      d16, q8,  #14
1334        vmull.s32       q8,  d16, d0[0]
1335        vrshrn.s64      d16, q8,  #14
1336        vdup.32         q8,  d16[0]
1337        vst1.32         {d4[0]}, [r2,:32]
1338
1339        vrshr.s32       q8,  q8,  #6
1340        vdup.s16        q15, r9
1341
1342        mov             r3,  r0
1343        mov             r12, #32
1344        sub             r1,  r1,  #32
13451:
1346        @ Loop to add the constant from q8 into all 32x32 outputs
1347        subs            r12, r12, #1
1348        vld1.16         {q0-q1},  [r0,:128]!
1349        vaddw.u16       q9,  q8,  d0
1350        vaddw.u16       q10, q8,  d1
1351        vld1.16         {q2-q3},  [r0,:128], r1
1352        vaddw.u16       q11, q8,  d2
1353        vaddw.u16       q12, q8,  d3
1354        vaddw.u16       q13, q8,  d4
1355        vaddw.u16       q14, q8,  d5
1356        vqmovun.s32     d0,  q9
1357        vaddw.u16       q9,  q8,  d6
1358        vqmovun.s32     d1,  q10
1359        vaddw.u16       q10, q8,  d7
1360        vqmovun.s32     d2,  q11
1361        vqmovun.s32     d3,  q12
1362        vqmovun.s32     d4,  q13
1363        vqmovun.s32     d5,  q14
1364        vmin.u16        q0,  q0,  q15
1365        vmin.u16        q1,  q1,  q15
1366        vqmovun.s32     d6,  q9
1367        vqmovun.s32     d7,  q10
1368        vst1.16         {q0-q1},  [r3,:128]!
1369        vmin.u16        q2,  q2,  q15
1370        vmin.u16        q3,  q3,  q15
1371        vst1.16         {q2-q3},  [r3,:128], r1
1372        bne             1b
1373
1374        pop             {r4-r9,pc}
1375endfunc
1376
1377.macro idct32_end
1378        butterfly       d16, d9,  d8,  d9  @ d16 = t16a, d9  = t19a
1379        butterfly       d17, d20, d23, d20 @ d17 = t17,  d20 = t18
1380        butterfly       d18, d10, d11, d10 @ d18 = t23a, d10 = t20a
1381        butterfly       d19, d21, d22, d21 @ d19 = t22,  d21 = t21
1382        butterfly       d8,  d28, d28, d30 @ d8  = t24a, d28 = t27a
1383        butterfly       d23, d26, d25, d26 @ d23 = t25,  d26 = t26
1384        butterfly       d11, d29, d29, d31 @ d11 = t31a, d29 = t28a
1385        butterfly       d22, d27, d24, d27 @ d22 = t30,  d27 = t29
1386
1387        mbutterfly      d27, d20, d1[0], d1[1], q12, q15        @ d27 = t18a, d20 = t29a
1388        mbutterfly      d29, d9,  d1[0], d1[1], q12, q15        @ d29 = t19,  d9  = t28
1389        mbutterfly      d28, d10, d1[0], d1[1], q12, q15, neg=1 @ d28 = t27,  d10 = t20
1390        mbutterfly      d26, d21, d1[0], d1[1], q12, q15, neg=1 @ d26 = t26a, d21 = t21a
1391
1392        butterfly       d31, d24, d11, d8  @ d31 = t31,  d24 = t24
1393        butterfly       d30, d25, d22, d23 @ d30 = t30a, d25 = t25a
1394        butterfly_r     d23, d16, d16, d18 @ d23 = t23,  d16 = t16
1395        butterfly_r     d22, d17, d17, d19 @ d22 = t22a, d17 = t17a
1396        butterfly       d18, d21, d27, d21 @ d18 = t18,  d21 = t21
1397        butterfly_r     d27, d28, d9,  d28 @ d27 = t27a, d28 = t28a
1398        butterfly       d8,  d26, d20, d26 @ d8  = t29,  d26 = t26
1399        butterfly       d19, d20, d29, d10 @ d19 = t19a, d20 = t20
1400        vmov            d29, d8            @ d29 = t29
1401
1402        mbutterfly0     d27, d20, d27, d20, d8, d10, q4, q5 @ d27 = t27,  d20 = t20
1403        mbutterfly0     d26, d21, d26, d21, d8, d10, q4, q5 @ d26 = t26a, d21 = t21a
1404        mbutterfly0     d25, d22, d25, d22, d8, d10, q4, q5 @ d25 = t25,  d22 = t22
1405        mbutterfly0     d24, d23, d24, d23, d8, d10, q4, q5 @ d24 = t24a, d23 = t23a
1406        bx              lr
1407.endm
1408
1409function idct32_odd
1410        movrel          r12, idct_coeffs
1411
1412        @ Overwrite the idct16 coeffs with the stored ones for idct32
        vmovl.s16       q0,  d12
        vmovl.s16       q1,  d13
        vmovl.s16       q2,  d14
        vmovl.s16       q3,  d15

        mbutterfly      d16, d31, d0[0], d0[1], q4, q5 @ d16 = t16a, d31 = t31a
        mbutterfly      d24, d23, d1[0], d1[1], q4, q5 @ d24 = t17a, d23 = t30a
        mbutterfly      d20, d27, d2[0], d2[1], q4, q5 @ d20 = t18a, d27 = t29a
        mbutterfly      d28, d19, d3[0], d3[1], q4, q5 @ d28 = t19a, d19 = t28a
        mbutterfly      d18, d29, d4[0], d4[1], q4, q5 @ d18 = t20a, d29 = t27a
        mbutterfly      d26, d21, d5[0], d5[1], q4, q5 @ d26 = t21a, d21 = t26a
        mbutterfly      d22, d25, d6[0], d6[1], q4, q5 @ d22 = t22a, d25 = t25a
        mbutterfly      d30, d17, d7[0], d7[1], q4, q5 @ d30 = t23a, d17 = t24a

        @ Reload the idct16 coefficients. We could swap the coefficients between
        @ q0-q3 and q6-q7 by narrowing/lengthening, but that's slower than just
        @ loading and lengthening.
        vld1.16         {q0-q1}, [r12,:128]

        butterfly       d8,  d24, d16, d24 @ d8  = t16, d24 = t17
        butterfly       d9,  d20, d28, d20 @ d9  = t19, d20 = t18
        butterfly       d10, d26, d18, d26 @ d10 = t20, d26 = t21
        butterfly       d11, d22, d30, d22 @ d11 = t23, d22 = t22
        vmovl.s16       q2,  d2
        vmovl.s16       q3,  d3
        vmovl.s16       q1,  d1
        vmovl.s16       q0,  d0
        butterfly       d28, d25, d17, d25 @ d28 = t24, d25 = t25
        butterfly       d30, d21, d29, d21 @ d30 = t27, d21 = t26
        butterfly       d29, d23, d31, d23 @ d29 = t31, d23 = t30
        butterfly       d31, d27, d19, d27 @ d31 = t28, d27 = t29

        mbutterfly      d23, d24, d2[0], d2[1], q8, q9        @ d23 = t17a, d24 = t30a
        mbutterfly      d27, d20, d2[0], d2[1], q8, q9, neg=1 @ d27 = t29a, d20 = t18a
        mbutterfly      d21, d26, d3[0], d3[1], q8, q9        @ d21 = t21a, d26 = t26a
        mbutterfly      d25, d22, d3[0], d3[1], q8, q9, neg=1 @ d25 = t25a, d22 = t22a
        idct32_end
endfunc

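@ Same as idct32_odd, but only the first 8 odd inputs (d16-d23) are
@ assumed to be nonzero, so the first stage uses the _h1/_h2 butterfly
@ variants, which treat the zero-valued input of each pair as such.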
function idct32_odd_half
        movrel          r12, idct_coeffs

        vmovl.s16       q0,  d12
        vmovl.s16       q1,  d13
        vmovl.s16       q2,  d14
        vmovl.s16       q3,  d15

        mbutterfly_h1   d16, d31, d0[0], d0[1], q4, q5 @ d16 = t16a, d31 = t31a
        mbutterfly_h2   d24, d23, d1[0], d1[1], q4, q5 @ d24 = t17a, d23 = t30a
        mbutterfly_h1   d20, d27, d2[0], d2[1], q4, q5 @ d20 = t18a, d27 = t29a
        mbutterfly_h2   d28, d19, d3[0], d3[1], q4, q5 @ d28 = t19a, d19 = t28a
        mbutterfly_h1   d18, d29, d4[0], d4[1], q4, q5 @ d18 = t20a, d29 = t27a
        mbutterfly_h2   d26, d21, d5[0], d5[1], q4, q5 @ d26 = t21a, d21 = t26a
        mbutterfly_h1   d22, d25, d6[0], d6[1], q4, q5 @ d22 = t22a, d25 = t25a
        mbutterfly_h2   d30, d17, d7[0], d7[1], q4, q5 @ d30 = t23a, d17 = t24a

        vld1.16         {q0-q1}, [r12,:128]

        butterfly       d8,  d24, d16, d24 @ d8  = t16, d24 = t17
        butterfly       d9,  d20, d28, d20 @ d9  = t19, d20 = t18
        butterfly       d10, d26, d18, d26 @ d10 = t20, d26 = t21
        butterfly       d11, d22, d30, d22 @ d11 = t23, d22 = t22
        vmovl.s16       q2,  d2
        vmovl.s16       q3,  d3
        vmovl.s16       q1,  d1
        vmovl.s16       q0,  d0
        butterfly       d28, d25, d17, d25 @ d28 = t24, d25 = t25
        butterfly       d30, d21, d29, d21 @ d30 = t27, d21 = t26
        butterfly       d29, d23, d31, d23 @ d29 = t31, d23 = t30
        butterfly       d31, d27, d19, d27 @ d31 = t28, d27 = t29

        mbutterfly      d23, d24, d2[0], d2[1], q8, q9        @ d23 = t17a, d24 = t30a
        mbutterfly      d27, d20, d2[0], d2[1], q8, q9, neg=1 @ d27 = t29a, d20 = t18a
        mbutterfly      d21, d26, d3[0], d3[1], q8, q9        @ d21 = t21a, d26 = t26a
        mbutterfly      d25, d22, d3[0], d3[1], q8, q9, neg=1 @ d25 = t25a, d22 = t22a
        idct32_end
endfunc

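@ Same as idct32_odd, but only the first 4 odd inputs (d16-d19) are
@ assumed to be nonzero, so the first butterfly stage collapses into
@ plain multiplications of d16-d19 by the coefficients.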
function idct32_odd_quarter
        movrel          r12, idct_coeffs

        vmovl.s16       q0,  d12
        vmovl.s16       q1,  d13
        vmovl.s16       q2,  d14
        vmovl.s16       q3,  d15

        vmov.s64        q14, #0
        vmov.s64        q5,  #0

        vmull.s32       q4,  d16, d0[0]
        vmlsl.s32       q14, d19, d3[1]
        vmull.s32       q15, d16, d0[1]
        vmull.s32       q11, d17, d7[0]
        vmlsl.s32       q5,  d17, d7[1]
        vmull.s32       q13, d19, d3[0]
        vmull.s32       q10, d18, d4[0]
        vmull.s32       q12, d18, d4[1]

        vld1.16         {q0-q1}, [r12,:128]

        vrshrn.s64      d8,  q4,  #14
        vrshrn.s64      d9,  q14, #14
        vrshrn.s64      d29, q15, #14
        vrshrn.s64      d28, q11, #14

        vmovl.s16       q2,  d2
        vmovl.s16       q3,  d3
        vmovl.s16       q1,  d1
        vmovl.s16       q0,  d0

        vrshrn.s64      d11, q5,  #14
        vrshrn.s64      d31, q13, #14
        vrshrn.s64      d10, q10, #14
        vrshrn.s64      d30, q12, #14

        mbutterfly_l    q8,  q9,  d29, d8,  d2[0], d2[1]
        mbutterfly_l    q13, q10, d31, d9,  d2[0], d2[1], neg=1
        vrshrn.s64      d23, q8,  #14
        vrshrn.s64      d24, q9,  #14
        vrshrn.s64      d27, q13, #14
        vrshrn.s64      d20, q10, #14
        mbutterfly_l    q8,  q9,  d30, d10, d3[0], d3[1]
        vrshrn.s64      d21, q8,  #14
        vrshrn.s64      d26, q9,  #14
        mbutterfly_l    q8,  q9,  d28, d11, d3[0], d3[1], neg=1
        vrshrn.s64      d25, q8,  #14
        vrshrn.s64      d22, q9,  #14

        idct32_end
endfunc

.macro idct32_funcs suffix
@ Do a 32-point IDCT of a 2x32 slice out of a 32x32 matrix.
@ We don't have the register space for a single-pass IDCT of 2x32,
@ but the 32-point IDCT can be decomposed into two 16-point IDCTs:
@ a normal idct16 of every other input component (the even ones, with
@ each output written twice), followed by a separate 16-point IDCT
@ of the odd inputs, added/subtracted onto the outputs of the first idct16.
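@
@ Roughly, in C-like pseudocode (a sketch only; the names and the exact
@ output ordering of the odd-input half follow the register layout used
@ in the code below):
@
@     idct16(in_even, e);      // in[0], in[2], ..., in[30]
@     idct32_odd(in_odd, o);   // in[1], in[3], ..., in[31]
@     for (i = 0; i < 16; i++) {
@         out[i]      = e[i] + o[i];
@         out[31 - i] = e[i] - o[i];
@     }
@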
@ r0 = dst (temp buffer)
@ r1 = unused
@ r2 = src
function idct32_1d_2x32_pass1\suffix\()_neon
        push            {lr}

        @ Double stride of the input (one line is 32 32-bit coefficients,
        @ i.e. 128 bytes), since we only read every other line.
        mov             r12, #256
        vmov.s32        d8,  #0

        @ d16 = IN(0), d17 = IN(2) ... d31 = IN(30)
.ifb \suffix
.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
        vld1.32         {d\i}, [r2,:64]
        vst1.32         {d8},  [r2,:64], r12
.endr
.endif
.ifc \suffix,_quarter
.irp i, 16, 17, 18, 19
        vld1.32         {d\i}, [r2,:64]
        vst1.32         {d8},  [r2,:64], r12
.endr
.endif
.ifc \suffix,_half
.irp i, 16, 17, 18, 19, 20, 21, 22, 23
        vld1.32         {d\i}, [r2,:64]
        vst1.32         {d8},  [r2,:64], r12
.endr
.endif

        bl              idct16\suffix

        @ Do eight 2x2 transposes. Originally, d16-d31 contain the
        @ 16 rows. Afterwards, d16-d17, d18-d19 etc contain the eight
        @ transposed 2x2 blocks.
        transpose32_8x_2x2 d16, d17, d18, d19, d20, d21, d22, d23, d24, d25, d26, d27, d28, d29, d30, d31

        @ Store the registers a, b, c, d, e, f, g, h horizontally, followed
        @ by the same registers h, g, f, e, d, c, b, a mirrored.
.macro store_rev a, b, c, d, e, f, g, h
.irp i, \a, \b, \c, \d, \e, \f, \g, \h
        vst1.32         {d\i}, [r0,:64]!
        vrev64.32       d\i, d\i
.endr
.irp i, \h, \g, \f, \e, \d, \c, \b, \a
        vst1.32         {d\i}, [r0,:64]!
.endr
.endm
        store_rev       16, 18, 20, 22, 24, 26, 28, 30
        store_rev       17, 19, 21, 23, 25, 27, 29, 31
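        @ The two store_rev invocations above advanced r0 by 2*16
        @ doublewords (256 bytes); rewind it so that the odd-input results
        @ below can be added/subtracted onto the same output block.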
        sub             r0,  r0,  #256
.purgem store_rev

        @ Move r2 back to the start of the input, and move
        @ to the first odd row
.ifb \suffix
        sub             r2,  r2,  r12, lsl #4
.endif
.ifc \suffix,_quarter
        sub             r2,  r2,  r12, lsl #2
.endif
.ifc \suffix,_half
        sub             r2,  r2,  r12, lsl #3
.endif
        add             r2,  r2,  #128

        vmov.s32        d8,  #0
        @ d16 = IN(1), d17 = IN(3) ... d31 = IN(31)
.ifb \suffix
.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
        vld1.32         {d\i}, [r2,:64]
        vst1.32         {d8},  [r2,:64], r12
.endr
.endif
.ifc \suffix,_quarter
.irp i, 16, 17, 18, 19
        vld1.32         {d\i}, [r2,:64]
        vst1.32         {d8},  [r2,:64], r12
.endr
.endif
.ifc \suffix,_half
.irp i, 16, 17, 18, 19, 20, 21, 22, 23
        vld1.32         {d\i}, [r2,:64]
        vst1.32         {d8},  [r2,:64], r12
.endr
.endif

        bl              idct32_odd\suffix

        transpose32_8x_2x2 d31, d30, d29, d28, d27, d26, d25, d24, d23, d22, d21, d20, d19, d18, d17, d16

        @ Store the registers a, b, c, d, e, f, g, h horizontally,
        @ adding into the output first, and then mirrored, subtracted
        @ from the output.
.macro store_rev a, b, c, d, e, f, g, h
.irp i, \a, \b, \c, \d, \e, \f, \g, \h
        vld1.32         {d8},  [r0,:64]
        vadd.s32        d8, d8, d\i
        vst1.32         {d8},  [r0,:64]!
        vrev64.32       d\i, d\i
.endr
.irp i, \h, \g, \f, \e, \d, \c, \b, \a
        vld1.32         {d8},  [r0,:64]
        vsub.s32        d8, d8, d\i
        vst1.32         {d8},  [r0,:64]!
.endr
.endm

        store_rev       31, 29, 27, 25, 23, 21, 19, 17
        store_rev       30, 28, 26, 24, 22, 20, 18, 16
.purgem store_rev
        pop             {pc}
endfunc
.ltorg

@ This is mostly the same as 2x32_pass1, but without the transpose;
@ it uses the source as a temp buffer between the two idct passes,
@ and adds into the destination.
@ r0 = dst
@ r1 = dst stride
@ r2 = src (temp buffer)
function idct32_1d_2x32_pass2\suffix\()_neon
        push            {lr}

        mov             r12, #256
        @ d16 = IN(0), d17 = IN(2) ... d31 = IN(30)
.ifb \suffix
.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
        vld1.32         {d\i}, [r2,:64], r12
.endr
        sub             r2,  r2,  r12, lsl #4
.endif
.ifc \suffix,_quarter
.irp i, 16, 17, 18, 19
        vld1.32         {d\i}, [r2,:64], r12
.endr
        sub             r2,  r2,  r12, lsl #2
.endif
.ifc \suffix,_half
.irp i, 16, 17, 18, 19, 20, 21, 22, 23
        vld1.32         {d\i}, [r2,:64], r12
.endr
        sub             r2,  r2,  r12, lsl #3
.endif

        bl              idct16\suffix

.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
        vst1.32         {d\i}, [r2,:64], r12
.endr

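        @ Move r2 back to the start of the buffer, and on to the
        @ first odd row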
        sub             r2,  r2,  r12, lsl #4
        add             r2,  r2,  #128

        @ d16 = IN(1), d17 = IN(3) ... d31 = IN(31)
.ifb \suffix
.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
        vld1.32         {d\i}, [r2,:64], r12
.endr
        sub             r2,  r2,  r12, lsl #4
.endif
.ifc \suffix,_quarter
.irp i, 16, 17, 18, 19
        vld1.32         {d\i}, [r2,:64], r12
.endr
        sub             r2,  r2,  r12, lsl #2
.endif
.ifc \suffix,_half
.irp i, 16, 17, 18, 19, 20, 21, 22, 23
        vld1.32         {d\i}, [r2,:64], r12
.endr
        sub             r2,  r2,  r12, lsl #3
.endif
        sub             r2,  r2,  #128

        bl              idct32_odd\suffix

        @ Narrow the idct16 coefficients in q0-q3 into q0-q1, to
        @ allow clobbering q2-q3 below.
        vmovn.s32       d0,  q0
        vmovn.s32       d1,  q1
        vmovn.s32       d2,  q2
        vmovn.s32       d3,  q3

        mov             r12, #256
        vdup.s16        q4,  r9
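        @ Load four lines of idct16 output from the temp buffer, add (or,
        @ with neg=1, subtract) the idct32_odd results in d\a-d\d, round
        @ and shift right by 6, add the destination pixels, clamp to
        @ [0, max pixel value] (q4) and store 2 pixels on each of 4 rows.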
.macro load_acc_store a, b, c, d, neg=0
        vld1.32         {d4},  [r2,:64], r12
        vld1.32         {d5},  [r2,:64], r12
.if \neg == 0
        vadd.s32        d4,  d4,  d\a
        vld1.32         {d6},  [r2,:64], r12
        vadd.s32        d5,  d5,  d\b
        vld1.32         {d7},  [r2,:64], r12
        vadd.s32        d6,  d6,  d\c
        vadd.s32        d7,  d7,  d\d
.else
        vsub.s32        d4,  d4,  d\a
        vld1.32         {d6},  [r2,:64], r12
        vsub.s32        d5,  d5,  d\b
        vld1.32         {d7},  [r2,:64], r12
        vsub.s32        d6,  d6,  d\c
        vsub.s32        d7,  d7,  d\d
.endif
        vld1.32         {d10[]},  [r0,:32], r1
        vld1.32         {d10[1]}, [r0,:32], r1
        vrshr.s32       q2,  q2,  #6
        vld1.32         {d11[]},  [r0,:32], r1
        vrshr.s32       q3,  q3,  #6
        vld1.32         {d11[1]}, [r0,:32], r1
        sub             r0,  r0,  r1, lsl #2
        vaddw.u16       q2,  q2,  d10
        vaddw.u16       q3,  q3,  d11
        vqmovun.s32     d4,  q2
        vqmovun.s32     d5,  q3
        vmin.u16        q2,  q2,  q4
        vst1.32         {d4[0]},  [r0,:32], r1
        vst1.32         {d4[1]},  [r0,:32], r1
        vst1.32         {d5[0]},  [r0,:32], r1
        vst1.32         {d5[1]},  [r0,:32], r1
.endm
        load_acc_store  31, 30, 29, 28
        load_acc_store  27, 26, 25, 24
        load_acc_store  23, 22, 21, 20
        load_acc_store  19, 18, 17, 16
        sub             r2,  r2,  r12
        neg             r12, r12
        load_acc_store  16, 17, 18, 19, 1
        load_acc_store  20, 21, 22, 23, 1
        load_acc_store  24, 25, 26, 27, 1
        load_acc_store  28, 29, 30, 31, 1
.purgem load_acc_store
        @ Lengthen the idct16 coeffs back into 32 bit form
        vmovl.s16       q2,  d2
        vmovl.s16       q3,  d3
        vmovl.s16       q1,  d1
        vmovl.s16       q0,  d0
        pop             {pc}
endfunc
.endm

idct32_funcs
idct32_funcs _quarter
idct32_funcs _half

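@ Per 2x32 slice of pass 1, an eob threshold: if the actual eob is at or
@ below it, that slice and all following ones contain only zero
@ coefficients, so they are zero-filled in the temp buffer instead of
@ being transformed.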
const min_eob_idct_idct_32, align=4
        .short  0, 3, 9, 21, 34, 51, 70, 98, 135, 176, 240, 258, 336, 357, 448, 472
endconst

function vp9_idct_idct_32x32_add_16_neon
        cmp             r3,  #1
        beq             idct32x32_dc_add_neon
        vpush           {q4-q7}
        movrel          r8,  min_eob_idct_idct_32 + 2

        @ Align the stack, allocate a 4096 byte temp buffer for the
        @ intermediate 32x32 32-bit coefficients
T       mov             r7,  sp
T       and             r7,  r7,  #15
A       and             r7,  sp,  #15
        add             r7,  r7,  #4096
        sub             sp,  sp,  r7

        mov             r4,  r0
        mov             r5,  r1
        mov             r6,  r2

        movrel          r12, idct_coeffs
        vld1.16         {q0-q1}, [r12,:128]!
        vld1.16         {q6-q7}, [r12,:128]
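        @ Lengthen the idct16 coefficients in q0-q1 into 32 bit form in
        @ q0-q3; the idct32 specific coefficients in q6-q7 are kept in
        @ 16 bit form until idct32_odd lengthens them itself.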
        vmovl.s16       q2,  d2
        vmovl.s16       q3,  d3
        vmovl.s16       q1,  d1
        vmovl.s16       q0,  d0

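        @ For small eob values only the upper left corner of the
        @ coefficients can be nonzero, so dispatch to the reduced
        @ quarter (8x8) or half (16x16) versions.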
        cmp             r3,  #34
        ble             idct32x32_quarter_add_16_neon
        cmp             r3,  #135
        ble             idct32x32_half_add_16_neon

.irp i, 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30
        add             r0,  sp,  #(\i*128)
.if \i > 0
        ldrh_post       r1,  r8,  #2
        cmp             r3,  r1
        it              le
        movle           r1,  #(32 - \i)/2
        ble             1f
.endif
        add             r2,  r6,  #(\i*4)
        bl              idct32_1d_2x32_pass1_neon
.endr
        b               3f

1:
        @ Write zeros to the temp buffer for pass 2
        vmov.i16        q14, #0
        vmov.i16        q15, #0
2:
        subs            r1,  r1,  #1
.rept 2
        @ Fill one line with zeros
        vst1.16         {q14-q15}, [r0,:128]!
        vst1.16         {q14-q15}, [r0,:128]!
        vst1.16         {q14-q15}, [r0,:128]!
        vst1.16         {q14-q15}, [r0,:128]!
.endr
        bne             2b
3:
.irp i, 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30
        add             r0,  r4,  #(\i*2)
        mov             r1,  r5
        add             r2,  sp,  #(\i*4)
        bl              idct32_1d_2x32_pass2_neon
.endr

        add             sp,  sp,  r7
        vpop            {q4-q7}
        pop             {r4-r9,pc}
endfunc

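@ The bitdepth specific entry points only differ in the maximum pixel
@ value they pass on to the shared 16 bpp implementation in r9.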
function ff_vp9_idct_idct_32x32_add_10_neon, export=1
        push            {r4-r9,lr}
        movw            r9,  #0x03ff
        b               vp9_idct_idct_32x32_add_16_neon
endfunc

function ff_vp9_idct_idct_32x32_add_12_neon, export=1
        push            {r4-r9,lr}
        movw            r9,  #0x0fff
        b               vp9_idct_idct_32x32_add_16_neon
endfunc

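@ Partial variants of the 32x32 function for small eob values, where only
@ the upper left \rows x \rows corner of the coefficients can be nonzero
@ (guaranteed by the eob checks in vp9_idct_idct_32x32_add_16_neon);
@ pass 1 only covers that corner, while pass 2 still covers all 32
@ output columns.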
.macro idct32_partial size, rows
function idct32x32_\size\()_add_16_neon
.irp i, 0, 2, 4, 6
        add             r0,  sp,  #(\i*128)
.ifc \size,quarter
.if \i > 0
        ldrh_post       r1,  r8,  #2
        cmp             r3,  r1
        it              le
        movle           r1,  #(\rows - \i)/2
        ble             1f
.endif
.endif
        add             r2,  r6,  #(\i*4)
        bl              idct32_1d_2x32_pass1_\size\()_neon
.endr
.ifc \size,half
        add             r8,  r8,  #8
.irp i, 8, 10, 12, 14
        add             r0,  sp,  #(\i*128)
.if \i > 8
        ldrh_post       r1,  r8,  #2
        cmp             r3,  r1
        it              le
        movle           r1,  #(\rows - \i)/2
        ble             1f
.endif
        add             r2,  r6,  #(\i*4)
        bl              idct32_1d_2x32_pass1_\size\()_neon
.endr
.endif
        b               3f

1:
        @ Write zeros to the temp buffer for pass 2
        vmov.i16        q14, #0
        vmov.i16        q15, #0
2:
        subs            r1,  r1,  #1
.rept 2
        @ Fill one line with zeros
        vst1.16         {q14-q15}, [r0,:128]!
        vst1.16         {q14-q15}, [r0,:128]!
        vst1.16         {q14-q15}, [r0,:128]!
        vst1.16         {q14-q15}, [r0,:128]!
.endr
        bne             2b
3:
.irp i, 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30
        add             r0,  r4,  #(\i*2)
        mov             r1,  r5
        add             r2,  sp,  #(\i*4)
        bl              idct32_1d_2x32_pass2_\size\()_neon
.endr

        add             sp,  sp,  r7
        vpop            {q4-q7}
        pop             {r4-r9,pc}
endfunc
.endm

idct32_partial quarter, 8
idct32_partial half, 16
