1/*
2 * Copyright (c) 2017 Google Inc.
3 *
4 * This file is part of FFmpeg.
5 *
6 * FFmpeg is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU Lesser General Public
8 * License as published by the Free Software Foundation; either
9 * version 2.1 of the License, or (at your option) any later version.
10 *
11 * FFmpeg is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
14 * Lesser General Public License for more details.
15 *
16 * You should have received a copy of the GNU Lesser General Public
17 * License along with FFmpeg; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
19 */
20
21#include "libavutil/aarch64/asm.S"
22#include "neon.S"
23
24const itxfm4_coeffs, align=4
25        .short  11585, 0, 6270, 15137
26iadst4_coeffs:
27        .short  5283, 15212, 9929, 13377
28endconst
29
30const iadst8_coeffs, align=4
31        .short  16305, 1606, 14449, 7723, 10394, 12665, 4756, 15679
32idct_coeffs:
33        .short  11585, 0, 6270, 15137, 3196, 16069, 13623, 9102
34        .short  1606, 16305, 12665, 10394, 7723, 14449, 15679, 4756
35        .short  804, 16364, 12140, 11003, 7005, 14811, 15426, 5520
36        .short  3981, 15893, 14053, 8423, 9760, 13160, 16207, 2404
37endconst
38
39const iadst16_coeffs, align=4
40        .short  16364, 804, 15893, 3981, 11003, 12140, 8423, 14053
41        .short  14811, 7005, 13160, 9760, 5520, 15426, 2404, 16207
42endconst
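
// Note on the tables above: the values are Q14 fixed-point trigonometric
// constants, e.g. 11585 ~ 16384*cos(pi/4), 6270 ~ 16384*sin(pi/8) and
// 15137 ~ 16384*cos(pi/8) (the cospi_*_64 constants in libvpx terms); the
// iadst tables hold the corresponding sine-based constants. The butterflies
// below undo this scaling by rounding with (1 << 13) and shifting right by 14.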
43
44.macro transpose_4x4s r0, r1, r2, r3, r4, r5, r6, r7
45        trn1            \r4\().4s,  \r0\().4s,  \r1\().4s
46        trn2            \r5\().4s,  \r0\().4s,  \r1\().4s
47        trn1            \r6\().4s,  \r2\().4s,  \r3\().4s
48        trn2            \r7\().4s,  \r2\().4s,  \r3\().4s
49        trn1            \r0\().2d,  \r4\().2d,  \r6\().2d
50        trn2            \r2\().2d,  \r4\().2d,  \r6\().2d
51        trn1            \r1\().2d,  \r5\().2d,  \r7\().2d
52        trn2            \r3\().2d,  \r5\().2d,  \r7\().2d
53.endm
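
// Illustration of transpose_4x4s: with input rows
//   r0 = [a0 a1 a2 a3], r1 = [b0 b1 b2 b3], r2 = [c0 c1 c2 c3], r3 = [d0 d1 d2 d3]
// and r4-r7 as scratch, the macro leaves
//   r0 = [a0 b0 c0 d0], r1 = [a1 b1 c1 d1], r2 = [a2 b2 c2 d2], r3 = [a3 b3 c3 d3].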
54
// Transpose an 8x8 matrix of 32 bit elements, where each row is spread out
// over two registers.
57.macro transpose_8x8s r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, r10, r11, r12, r13, r14, r15, t0, t1, t2, t3
58        transpose_4x4s  \r0,  \r2,  \r4,  \r6,  \t0, \t1, \t2, \t3
59        transpose_4x4s  \r9,  \r11, \r13, \r15, \t0, \t1, \t2, \t3
60
61        // Do 4x4 transposes of r1,r3,r5,r7 and r8,r10,r12,r14
62        // while swapping the two 4x4 matrices between each other
63
64        // First step of the 4x4 transpose of r1-r7, into t0-t3
65        trn1            \t0\().4s,  \r1\().4s,  \r3\().4s
66        trn2            \t1\().4s,  \r1\().4s,  \r3\().4s
67        trn1            \t2\().4s,  \r5\().4s,  \r7\().4s
68        trn2            \t3\().4s,  \r5\().4s,  \r7\().4s
69
        // First step of the 4x4 transpose of r8-r14, into r1-r7
71        trn1            \r1\().4s,  \r8\().4s,  \r10\().4s
72        trn2            \r3\().4s,  \r8\().4s,  \r10\().4s
73        trn1            \r5\().4s,  \r12\().4s, \r14\().4s
74        trn2            \r7\().4s,  \r12\().4s, \r14\().4s
75
        // Second step of the 4x4 transpose of r1-r7 (now in t0-t3), into r8-r14
77        trn1            \r8\().2d,  \t0\().2d,  \t2\().2d
78        trn2            \r12\().2d, \t0\().2d,  \t2\().2d
79        trn1            \r10\().2d, \t1\().2d,  \t3\().2d
80        trn2            \r14\().2d, \t1\().2d,  \t3\().2d
81
        // Second step of the 4x4 transpose of r8-r14 (now in r1-r7), in place as far as possible
83        trn1            \t0\().2d,  \r1\().2d,  \r5\().2d
84        trn2            \r5\().2d,  \r1\().2d,  \r5\().2d
85        trn1            \t1\().2d,  \r3\().2d,  \r7\().2d
86        trn2            \r7\().2d,  \r3\().2d,  \r7\().2d
87
88        // Move the outputs of trn1 back in place
89        mov             \r1\().16b,  \t0\().16b
90        mov             \r3\().16b,  \t1\().16b
91.endm
92
// out1 = ((in1 + in2) * v0.s[0] + (1 << 13)) >> 14
// out2 = ((in1 - in2) * v0.s[0] + (1 << 13)) >> 14
// in/out are .4s registers; this can make do with 4 temp registers, but is
// more efficient if 6 temp registers are available.
97.macro dmbutterfly0 out1, out2, in1, in2, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, neg=0
98.if \neg > 0
99        neg             \tmp4\().4s, v0.4s
100.endif
101        add             \tmp1\().4s, \in1\().4s,  \in2\().4s
102        sub             \tmp2\().4s, \in1\().4s,  \in2\().4s
103.if \neg > 0
104        smull           \tmp3\().2d, \tmp1\().2s, \tmp4\().s[0]
105        smull2          \tmp4\().2d, \tmp1\().4s, \tmp4\().s[0]
106.else
107        smull           \tmp3\().2d, \tmp1\().2s, v0.s[0]
108        smull2          \tmp4\().2d, \tmp1\().4s, v0.s[0]
109.endif
110.ifb \tmp5
111        rshrn           \out1\().2s, \tmp3\().2d, #14
112        rshrn2          \out1\().4s, \tmp4\().2d, #14
113        smull           \tmp3\().2d, \tmp2\().2s, v0.s[0]
114        smull2          \tmp4\().2d, \tmp2\().4s, v0.s[0]
115        rshrn           \out2\().2s, \tmp3\().2d, #14
116        rshrn2          \out2\().4s, \tmp4\().2d, #14
117.else
118        smull           \tmp5\().2d, \tmp2\().2s, v0.s[0]
119        smull2          \tmp6\().2d, \tmp2\().4s, v0.s[0]
120        rshrn           \out1\().2s, \tmp3\().2d, #14
121        rshrn2          \out1\().4s, \tmp4\().2d, #14
122        rshrn           \out2\().2s, \tmp5\().2d, #14
123        rshrn2          \out2\().4s, \tmp6\().2d, #14
124.endif
125.endm
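
// Per 32 bit lane, dmbutterfly0 amounts to the following C-style sketch
// (illustrative only; coef = v0.s[0], negated for the first product when neg > 0):
//   int64_t a = (int64_t)(in1 + in2) * coef;
//   int64_t b = (int64_t)(in1 - in2) * coef;
//   out1 = (int32_t)((a + (1 << 13)) >> 14);   // rshrn/rshrn2 do the rounding shift
//   out2 = (int32_t)((b + (1 << 13)) >> 14);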
126
127// Same as dmbutterfly0 above, but treating the input in in2 as zero,
128// writing the same output into both out1 and out2.
129.macro dmbutterfly0_h out1, out2, in1, in2, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6
130        smull           \tmp1\().2d, \in1\().2s,  v0.s[0]
131        smull2          \tmp2\().2d, \in1\().4s,  v0.s[0]
132        rshrn           \out1\().2s, \tmp1\().2d, #14
133        rshrn2          \out1\().4s, \tmp2\().2d, #14
134        rshrn           \out2\().2s, \tmp1\().2d, #14
135        rshrn2          \out2\().4s, \tmp2\().2d, #14
136.endm
137
138// out1,out2 = in1 * coef1 - in2 * coef2
139// out3,out4 = in1 * coef2 + in2 * coef1
140// out are 4 x .2d registers, in are 2 x .4s registers
141.macro dmbutterfly_l out1, out2, out3, out4, in1, in2, coef1, coef2
142        smull           \out1\().2d, \in1\().2s, \coef1
143        smull2          \out2\().2d, \in1\().4s, \coef1
144        smull           \out3\().2d, \in1\().2s, \coef2
145        smull2          \out4\().2d, \in1\().4s, \coef2
146        smlsl           \out1\().2d, \in2\().2s, \coef2
147        smlsl2          \out2\().2d, \in2\().4s, \coef2
148        smlal           \out3\().2d, \in2\().2s, \coef1
149        smlal2          \out4\().2d, \in2\().4s, \coef1
150.endm
151
152// inout1 = (inout1 * coef1 - inout2 * coef2 + (1 << 13)) >> 14
153// inout2 = (inout1 * coef2 + inout2 * coef1 + (1 << 13)) >> 14
154// inout are 2 x .4s registers
155.macro dmbutterfly inout1, inout2, coef1, coef2, tmp1, tmp2, tmp3, tmp4, neg=0
156        dmbutterfly_l   \tmp1, \tmp2, \tmp3, \tmp4, \inout1, \inout2, \coef1, \coef2
157.if \neg > 0
158        neg             \tmp3\().2d, \tmp3\().2d
159        neg             \tmp4\().2d, \tmp4\().2d
160.endif
161        rshrn           \inout1\().2s, \tmp1\().2d,  #14
162        rshrn2          \inout1\().4s, \tmp2\().2d,  #14
163        rshrn           \inout2\().2s, \tmp3\().2d,  #14
164        rshrn2          \inout2\().4s, \tmp4\().2d,  #14
165.endm
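
// In C-style terms, per lane (in1/in2 denote the original inout1/inout2 values;
// illustrative only):
//   inout1 = (int32_t)(((int64_t)in1*coef1 - (int64_t)in2*coef2 + (1 << 13)) >> 14);
//   inout2 = (int32_t)(((int64_t)in1*coef2 + (int64_t)in2*coef1 + (1 << 13)) >> 14);
// With neg=1, the second sum is negated before the rounding shift.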
166
167// Same as dmbutterfly above, but treating the input in inout2 as zero
168.macro dmbutterfly_h1 inout1, inout2, coef1, coef2, tmp1, tmp2, tmp3, tmp4
169        smull           \tmp1\().2d, \inout1\().2s, \coef1
170        smull2          \tmp2\().2d, \inout1\().4s, \coef1
171        smull           \tmp3\().2d, \inout1\().2s, \coef2
172        smull2          \tmp4\().2d, \inout1\().4s, \coef2
173        rshrn           \inout1\().2s, \tmp1\().2d, #14
174        rshrn2          \inout1\().4s, \tmp2\().2d, #14
175        rshrn           \inout2\().2s, \tmp3\().2d, #14
176        rshrn2          \inout2\().4s, \tmp4\().2d, #14
177.endm
178
179// Same as dmbutterfly above, but treating the input in inout1 as zero
180.macro dmbutterfly_h2 inout1, inout2, coef1, coef2, tmp1, tmp2, tmp3, tmp4
181        smull           \tmp1\().2d, \inout2\().2s, \coef2
182        smull2          \tmp2\().2d, \inout2\().4s, \coef2
183        smull           \tmp3\().2d, \inout2\().2s, \coef1
184        smull2          \tmp4\().2d, \inout2\().4s, \coef1
185        neg             \tmp1\().2d, \tmp1\().2d
186        neg             \tmp2\().2d, \tmp2\().2d
187        rshrn           \inout2\().2s, \tmp3\().2d, #14
188        rshrn2          \inout2\().4s, \tmp4\().2d, #14
189        rshrn           \inout1\().2s, \tmp1\().2d, #14
190        rshrn2          \inout1\().4s, \tmp2\().2d, #14
191.endm
192
193.macro dsmull_h out1, out2, in, coef
194        smull           \out1\().2d, \in\().2s, \coef
195        smull2          \out2\().2d, \in\().4s, \coef
196.endm
197
198.macro drshrn_h out, in1, in2, shift
199        rshrn           \out\().2s, \in1\().2d, \shift
200        rshrn2          \out\().4s, \in2\().2d, \shift
201.endm
202
203
204// out1 = in1 + in2
205// out2 = in1 - in2
206.macro butterfly_4s out1, out2, in1, in2
207        add             \out1\().4s, \in1\().4s, \in2\().4s
208        sub             \out2\().4s, \in1\().4s, \in2\().4s
209.endm
210
211// out1 = in1 - in2
212// out2 = in1 + in2
213.macro butterfly_4s_r out1, out2, in1, in2
214        sub             \out1\().4s, \in1\().4s, \in2\().4s
215        add             \out2\().4s, \in1\().4s, \in2\().4s
216.endm
217
218// out1 = (in1,in2 + in3,in4 + (1 << 13)) >> 14
219// out2 = (in1,in2 - in3,in4 + (1 << 13)) >> 14
220// out are 2 x .4s registers, in are 4 x .2d registers
221.macro dbutterfly_n out1, out2, in1, in2, in3, in4, tmp1, tmp2, tmp3, tmp4
222        add             \tmp1\().2d, \in1\().2d, \in3\().2d
223        add             \tmp2\().2d, \in2\().2d, \in4\().2d
224        sub             \tmp3\().2d, \in1\().2d, \in3\().2d
225        sub             \tmp4\().2d, \in2\().2d, \in4\().2d
226        rshrn           \out1\().2s, \tmp1\().2d,  #14
227        rshrn2          \out1\().4s, \tmp2\().2d,  #14
228        rshrn           \out2\().2s, \tmp3\().2d,  #14
229        rshrn2          \out2\().4s, \tmp4\().2d,  #14
230.endm
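
// I.e. per output lane (illustrative): out1 = (a + b + (1 << 13)) >> 14 and
// out2 = (a - b + (1 << 13)) >> 14, where a and b are the 64 bit values held
// in the in1:in2 and in3:in4 register pairs.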
231
232.macro iwht4_10 c0, c1, c2, c3
233        add             \c0\().4s, \c0\().4s, \c1\().4s
234        sub             v17.4s,    \c2\().4s, \c3\().4s
235        sub             v16.4s,    \c0\().4s, v17.4s
236        sshr            v16.4s,    v16.4s,    #1
237        sub             \c2\().4s, v16.4s,    \c1\().4s
238        sub             \c1\().4s, v16.4s,    \c3\().4s
239        add             \c3\().4s, v17.4s,    \c2\().4s
240        sub             \c0\().4s, \c0\().4s, \c1\().4s
241.endm
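
// Scalar equivalent of iwht4_10, derived from the instructions above
// (t0/t1 only name the temporaries v16/v17):
//   c0 += c1;            t1 = c2 - c3;
//   t0 = (c0 - t1) >> 1;
//   c2 = t0 - c1;        c1 = t0 - c3;
//   c3 = t1 + c2;        c0 -= c1;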
242
243.macro iwht4_12 c0, c1, c2, c3
244        iwht4_10        \c0, \c1, \c2, \c3
245.endm
246
247.macro idct4_10 c0, c1, c2, c3
248        mul             v22.4s,    \c1\().4s, v0.s[3]
249        mul             v20.4s,    \c1\().4s, v0.s[2]
250        add             v16.4s,    \c0\().4s, \c2\().4s
251        sub             v17.4s,    \c0\().4s, \c2\().4s
252        mla             v22.4s,    \c3\().4s, v0.s[2]
253        mul             v18.4s,    v16.4s,    v0.s[0]
254        mul             v24.4s,    v17.4s,    v0.s[0]
255        mls             v20.4s,    \c3\().4s, v0.s[3]
256        srshr           v22.4s,    v22.4s,    #14
257        srshr           v18.4s,    v18.4s,    #14
258        srshr           v24.4s,    v24.4s,    #14
259        srshr           v20.4s,    v20.4s,    #14
260        add             \c0\().4s, v18.4s,    v22.4s
261        sub             \c3\().4s, v18.4s,    v22.4s
262        add             \c1\().4s, v24.4s,    v20.4s
263        sub             \c2\().4s, v24.4s,    v20.4s
264.endm
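
// idct4_10 keeps all products in 32 bits (mul/mla/mls followed by srshr #14),
// which leaves enough headroom for 10 bpp input; the idct4_12 variant below
// widens to 64 bits (smull/smlal/smlsl + rshrn) before applying the same
// (x + (1 << 13)) >> 14 rounding.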
265
266.macro idct4_12 c0, c1, c2, c3
267        smull           v22.2d,    \c1\().2s, v0.s[3]
268        smull2          v23.2d,    \c1\().4s, v0.s[3]
269        smull           v20.2d,    \c1\().2s, v0.s[2]
270        smull2          v21.2d,    \c1\().4s, v0.s[2]
271        add             v16.4s,    \c0\().4s, \c2\().4s
272        sub             v17.4s,    \c0\().4s, \c2\().4s
273        smlal           v22.2d,    \c3\().2s, v0.s[2]
274        smlal2          v23.2d,    \c3\().4s, v0.s[2]
275        smull           v18.2d,    v16.2s,    v0.s[0]
276        smull2          v19.2d,    v16.4s,    v0.s[0]
277        smull           v24.2d,    v17.2s,    v0.s[0]
278        smull2          v25.2d,    v17.4s,    v0.s[0]
279        smlsl           v20.2d,    \c3\().2s, v0.s[3]
280        smlsl2          v21.2d,    \c3\().4s, v0.s[3]
281        rshrn           v22.2s,    v22.2d,    #14
282        rshrn2          v22.4s,    v23.2d,    #14
283        rshrn           v18.2s,    v18.2d,    #14
284        rshrn2          v18.4s,    v19.2d,    #14
285        rshrn           v24.2s,    v24.2d,    #14
286        rshrn2          v24.4s,    v25.2d,    #14
287        rshrn           v20.2s,    v20.2d,    #14
288        rshrn2          v20.4s,    v21.2d,    #14
289        add             \c0\().4s, v18.4s,    v22.4s
290        sub             \c3\().4s, v18.4s,    v22.4s
291        add             \c1\().4s, v24.4s,    v20.4s
292        sub             \c2\().4s, v24.4s,    v20.4s
293.endm
294
295.macro iadst4_10 c0, c1, c2, c3
296        mul             v16.4s,    \c0\().4s, v1.s[0]
297        mla             v16.4s,    \c2\().4s, v1.s[1]
298        mla             v16.4s,    \c3\().4s, v1.s[2]
299        mul             v18.4s,    \c0\().4s, v1.s[2]
300        mls             v18.4s,    \c2\().4s, v1.s[0]
301        sub             \c0\().4s, \c0\().4s, \c2\().4s
302        mls             v18.4s,    \c3\().4s, v1.s[1]
303        add             \c0\().4s, \c0\().4s, \c3\().4s
304        mul             v22.4s,    \c1\().4s, v1.s[3]
305        mul             v20.4s,    \c0\().4s, v1.s[3]
306        add             v24.4s,    v16.4s,    v22.4s
307        add             v26.4s,    v18.4s,    v22.4s
308        srshr           \c0\().4s, v24.4s,    #14
309        add             v16.4s,    v16.4s,    v18.4s
310        srshr           \c1\().4s, v26.4s,    #14
311        sub             v16.4s,    v16.4s,    v22.4s
312        srshr           \c2\().4s, v20.4s,    #14
313        srshr           \c3\().4s, v16.4s,    #14
314.endm
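
// C-style sketch of iadst4_10 (s0..s3 = v1.s[0..3], x0..x3 = the incoming
// c0..c3; illustrative only):
//   a = x0*s0 + x2*s1 + x3*s2;          b = x0*s2 - x2*s0 - x3*s1;
//   c = (x0 - x2 + x3)*s3;              d = x1*s3;
//   c0 = (a + d + (1 << 13)) >> 14;     c1 = (b + d + (1 << 13)) >> 14;
//   c2 = (c + (1 << 13)) >> 14;         c3 = (a + b - d + (1 << 13)) >> 14;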
315
316.macro iadst4_12 c0, c1, c2, c3
317        smull           v16.2d,    \c0\().2s, v1.s[0]
318        smull2          v17.2d,    \c0\().4s, v1.s[0]
319        smlal           v16.2d,    \c2\().2s, v1.s[1]
320        smlal2          v17.2d,    \c2\().4s, v1.s[1]
321        smlal           v16.2d,    \c3\().2s, v1.s[2]
322        smlal2          v17.2d,    \c3\().4s, v1.s[2]
323        smull           v18.2d,    \c0\().2s, v1.s[2]
324        smull2          v19.2d,    \c0\().4s, v1.s[2]
325        smlsl           v18.2d,    \c2\().2s, v1.s[0]
326        smlsl2          v19.2d,    \c2\().4s, v1.s[0]
327        sub             \c0\().4s, \c0\().4s, \c2\().4s
328        smlsl           v18.2d,    \c3\().2s, v1.s[1]
329        smlsl2          v19.2d,    \c3\().4s, v1.s[1]
330        add             \c0\().4s, \c0\().4s, \c3\().4s
331        smull           v22.2d,    \c1\().2s, v1.s[3]
332        smull2          v23.2d,    \c1\().4s, v1.s[3]
333        smull           v20.2d,    \c0\().2s, v1.s[3]
334        smull2          v21.2d,    \c0\().4s, v1.s[3]
335        add             v24.2d,    v16.2d,    v22.2d
336        add             v25.2d,    v17.2d,    v23.2d
337        add             v26.2d,    v18.2d,    v22.2d
338        add             v27.2d,    v19.2d,    v23.2d
339        rshrn           \c0\().2s, v24.2d,    #14
340        rshrn2          \c0\().4s, v25.2d,    #14
341        add             v16.2d,    v16.2d,    v18.2d
342        add             v17.2d,    v17.2d,    v19.2d
343        rshrn           \c1\().2s, v26.2d,    #14
344        rshrn2          \c1\().4s, v27.2d,    #14
345        sub             v16.2d,    v16.2d,    v22.2d
346        sub             v17.2d,    v17.2d,    v23.2d
347        rshrn           \c2\().2s, v20.2d,    #14
348        rshrn2          \c2\().4s, v21.2d,    #14
349        rshrn           \c3\().2s, v16.2d,    #14
350        rshrn2          \c3\().4s, v17.2d,    #14
351.endm
352
// The public functions in this file have the following signature:
// void itxfm_add(uint8_t *dst, ptrdiff_t stride, int16_t *block, int eob);
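// In the 16 bpp codepaths below, the coefficients in *block are loaded as
// 32 bit (.4s) elements, and eob (w3) is the coefficient end-of-block index;
// eob == 1 means that only the DC coefficient is nonzero.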
355
356.macro itxfm_func4x4 txfm1, txfm2, bpp
357function ff_vp9_\txfm1\()_\txfm2\()_4x4_add_\bpp\()_neon, export=1
358.ifc \txfm1,\txfm2
359.ifc \txfm1,idct
360        movrel          x4,  itxfm4_coeffs
361        ld1             {v0.4h}, [x4]
362        sxtl            v0.4s,  v0.4h
363.endif
364.ifc \txfm1,iadst
365        movrel          x4,  iadst4_coeffs
366        ld1             {v0.d}[1], [x4]
367        sxtl2           v1.4s,  v0.8h
368.endif
369.else
370        movrel          x4,  itxfm4_coeffs
371        ld1             {v0.8h}, [x4]
372        sxtl2           v1.4s,  v0.8h
373        sxtl            v0.4s,  v0.4h
374.endif
375
376        movi            v30.4s, #0
377        movi            v31.4s, #0
378.ifc \txfm1\()_\txfm2,idct_idct
379        cmp             w3,  #1
380        b.ne            1f
381        // DC-only for idct/idct
382        ld1             {v2.s}[0],  [x2]
383        smull           v2.2d,  v2.2s, v0.s[0]
384        rshrn           v2.2s,  v2.2d, #14
385        smull           v2.2d,  v2.2s, v0.s[0]
386        rshrn           v2.2s,  v2.2d, #14
387        st1             {v31.s}[0], [x2]
388        dup             v4.4s,  v2.s[0]
389        mov             v5.16b, v4.16b
390        mov             v6.16b, v4.16b
391        mov             v7.16b, v4.16b
392        b               2f
393.endif
394
3951:
396        ld1             {v4.4s,v5.4s,v6.4s,v7.4s},  [x2]
397        st1             {v30.4s,v31.4s}, [x2], #32
398
399.ifc \txfm1,iwht
400        sshr            v4.4s,  v4.4s,  #2
401        sshr            v5.4s,  v5.4s,  #2
402        sshr            v6.4s,  v6.4s,  #2
403        sshr            v7.4s,  v7.4s,  #2
404.endif
405
406        \txfm1\()4_\bpp v4,  v5,  v6,  v7
407
408        st1             {v30.4s,v31.4s}, [x2], #32
409        // Transpose 4x4 with 32 bit elements
410        transpose_4x4s  v4,  v5,  v6,  v7,  v16, v17, v18, v19
411
412        \txfm2\()4_\bpp v4,  v5,  v6,  v7
4132:
414        mvni            v31.8h, #((0xff << (\bpp - 8)) & 0xff), lsl #8
415        ld1             {v0.4h},   [x0], x1
416        ld1             {v1.4h},   [x0], x1
417.ifnc \txfm1,iwht
418        srshr           v4.4s,  v4.4s,  #4
419        srshr           v5.4s,  v5.4s,  #4
420        srshr           v6.4s,  v6.4s,  #4
421        srshr           v7.4s,  v7.4s,  #4
422.endif
423        uaddw           v4.4s,  v4.4s,  v0.4h
424        uaddw           v5.4s,  v5.4s,  v1.4h
425        ld1             {v2.4h},   [x0], x1
426        ld1             {v3.4h},   [x0], x1
427        sqxtun          v0.4h,  v4.4s
428        sqxtun2         v0.8h,  v5.4s
429        sub             x0,  x0,  x1, lsl #2
430
431        uaddw           v6.4s,  v6.4s,  v2.4h
432        umin            v0.8h,  v0.8h,  v31.8h
433        uaddw           v7.4s,  v7.4s,  v3.4h
434        st1             {v0.4h},   [x0], x1
435        sqxtun          v2.4h,  v6.4s
436        sqxtun2         v2.8h,  v7.4s
437        umin            v2.8h,  v2.8h,  v31.8h
438
439        st1             {v0.d}[1], [x0], x1
440        st1             {v2.4h},   [x0], x1
441        st1             {v2.d}[1], [x0], x1
442
443        ret
444endfunc
445.endm
446
447.macro itxfm_funcs4x4 bpp
448itxfm_func4x4 idct,  idct,  \bpp
449itxfm_func4x4 iadst, idct,  \bpp
450itxfm_func4x4 idct,  iadst, \bpp
451itxfm_func4x4 iadst, iadst, \bpp
452itxfm_func4x4 iwht,  iwht,  \bpp
453.endm
454
455itxfm_funcs4x4 10
456itxfm_funcs4x4 12
457
458function idct8x8_dc_add_neon
459        movrel          x4,  idct_coeffs
460        ld1             {v0.4h}, [x4]
461
462        movi            v1.4h,  #0
463        sxtl            v0.4s,  v0.4h
464
465        ld1             {v2.s}[0],  [x2]
466        smull           v2.2d,  v2.2s,  v0.s[0]
467        rshrn           v2.2s,  v2.2d,  #14
468        smull           v2.2d,  v2.2s,  v0.s[0]
469        rshrn           v2.2s,  v2.2d,  #14
470        st1             {v1.s}[0],  [x2]
471        dup             v2.4s,  v2.s[0]
472
473        srshr           v2.4s,  v2.4s,  #5
474
475        mov             x4,  #8
476        mov             x3,  x0
477        dup             v31.8h, w5
4781:
479        // Loop to add the constant from v2 into all 8x8 outputs
480        subs            x4,  x4,  #2
481        ld1             {v3.8h},  [x0], x1
482        ld1             {v4.8h},  [x0], x1
483        uaddw           v16.4s, v2.4s,  v3.4h
484        uaddw2          v17.4s, v2.4s,  v3.8h
485        uaddw           v18.4s, v2.4s,  v4.4h
486        uaddw2          v19.4s, v2.4s,  v4.8h
487        sqxtun          v3.4h,  v16.4s
488        sqxtun2         v3.8h,  v17.4s
489        sqxtun          v4.4h,  v18.4s
490        sqxtun2         v4.8h,  v19.4s
491        umin            v3.8h,  v3.8h,  v31.8h
492        umin            v4.8h,  v4.8h,  v31.8h
493        st1             {v3.8h},  [x3], x1
494        st1             {v4.8h},  [x3], x1
495        b.ne            1b
496
497        ret
498endfunc
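
// The DC-only path above boils down to (C-style, for illustration; max_pixel
// is the value passed in w5):
//   dc = (block[0] * 11585 + (1 << 13)) >> 14;  // 1-D idct of the DC coefficient
//   dc = (dc * 11585 + (1 << 13)) >> 14;        // second 1-D pass
//   dst[i] = clip(dst[i] + ((dc + (1 << 4)) >> 5), 0, max_pixel);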
499
500.macro idct8 r0, r1, r2, r3, r4, r5, r6, r7, t0, t1, t2, t3, t4, t5
501        dmbutterfly0    \r0, \r4, \r0, \r4, \t0, \t1, \t2, \t3, \t4, \t5 // r0 = t0a, r4 = t1a
502        dmbutterfly     \r2, \r6, v0.s[2], v0.s[3], \t0, \t1, \t2, \t3   // r2 = t2a, r6 = t3a
503        dmbutterfly     \r1, \r7, v1.s[0], v1.s[1], \t0, \t1, \t2, \t3   // r1 = t4a, r7 = t7a
504        dmbutterfly     \r5, \r3, v1.s[2], v1.s[3], \t0, \t1, \t2, \t3   // r5 = t5a, r3 = t6a
505
506        butterfly_4s    \t0, \t1, \r0, \r6 // t0 = t0, t1 = t3
507        butterfly_4s    \t2, \r5, \r1, \r5 // t2 = t4, r5 = t5a
508        butterfly_4s    \t3, \r6, \r7, \r3 // t3 = t7, r6 = t6a
509        butterfly_4s    \r7, \r4, \r4, \r2 // r7 = t1, r4 = t2
510
511        dmbutterfly0    \r6, \r5, \r6, \r5, \r0, \r1, \r2, \r3, \t4, \t5 // r6 = t6, r5 = t5
512
513        butterfly_4s    \r1, \r6, \r7, \r6 // r1 = out[1], r6 = out[6]
514        butterfly_4s    \r0, \r7, \t0, \t3 // r0 = out[0], r7 = out[7]
515        butterfly_4s    \r2, \r5, \r4, \r5 // r2 = out[2], r5 = out[5]
516        butterfly_4s    \r3, \r4, \t1, \t2 // r3 = out[3], r4 = out[4]
517.endm
518
519.macro iadst8 r0, r1, r2, r3, r4, r5, r6, r7, t0, t1, t2, t3, t4, t5
520        dmbutterfly_l   \t2, \t3, \t0, \t1, \r7, \r0, v2.s[1], v2.s[0]   // t2,t3 = t1a, t0,t1 = t0a
521        dmbutterfly_l   \r0, \r7, \t4, \t5, \r3, \r4, v3.s[1], v3.s[0]   // r0,r7 = t5a, t4,t5 = t4a
522
523        dbutterfly_n    \r3, \t0, \t0, \t1, \t4, \t5, \r3, \r4, \t0, \t1 // r3 = t0, t0 = t4
524        dbutterfly_n    \r4, \t1, \t2, \t3, \r0, \r7, \r4, \t1, \t4, \t5 // r4 = t1, t1 = t5
525
526        dmbutterfly_l   \t4, \t5, \t2, \t3, \r5, \r2, v2.s[3], v2.s[2]   // t4,t5 = t3a, t2,t3 = t2a
527        dmbutterfly_l   \r2, \r5, \r0, \r7, \r1, \r6, v3.s[3], v3.s[2]   // r2,r5 = t7a, r0,r7 = t6a
528
529        dbutterfly_n    \r1, \t2, \t2, \t3, \r0, \r7, \r1, \r6, \t2, \t3 // r1 = t2, t2 = t6
530        dbutterfly_n    \r0, \t4, \t4, \t5, \r2, \r5, \r0, \r7, \t4, \t5 // r0 = t3, t4 = t7
531
532        butterfly_4s    \r7, \r4, \r4, \r0   // r7 = -out[7], r4 = t3
533        neg             \r7\().4s, \r7\().4s // r7 = out[7]
534        butterfly_4s    \r0, \r1, \r3, \r1   // r0 = out[0],  r1 = t2
535
536        dmbutterfly_l   \r2, \r3, \t3, \t5, \t0, \t1, v0.s[2], v0.s[3]   // r2,r3 = t5a, t3,t5 = t4a
537        dmbutterfly_l   \t0, \t1, \r5, \r6, \t4, \t2, v0.s[3], v0.s[2]   // t0,t1 = t6a, r5,r6 = t7a
538
539        dbutterfly_n    \r6, \t2, \r2, \r3, \r5, \r6, \t2, \t4, \r2, \r3 // r6 = out[6],  t2 = t7
540
541        dmbutterfly0    \r3, \r4, \r1, \r4, \t4, \r5, \r1, \r2           // r3 = -out[3], r4 = out[4]
542        neg             \r3\().4s, \r3\().4s  // r3 = out[3]
543
544        dbutterfly_n    \r1, \t0, \t3, \t5, \t0, \t1, \r1, \r2, \t0, \t1 // r1 = -out[1], t0 = t6
545        neg             \r1\().4s, \r1\().4s  // r1 = out[1]
546
547        dmbutterfly0    \r2, \r5, \t0, \t2, \t1, \t3, \t4, \t5           // r2 = out[2],  r5 = -out[5]
548        neg             \r5\().4s, \r5\().4s  // r5 = out[5]
549.endm
550
551
552.macro itxfm_func8x8 txfm1, txfm2
553function vp9_\txfm1\()_\txfm2\()_8x8_add_16_neon
554.ifc \txfm1\()_\txfm2,idct_idct
555        cmp             w3,  #1
556        b.eq            idct8x8_dc_add_neon
557.endif
558        // The iadst also uses a few coefficients from
559        // idct, so those always need to be loaded.
560.ifc \txfm1\()_\txfm2,idct_idct
561        movrel          x4,  idct_coeffs
562.else
563        movrel          x4,  iadst8_coeffs
564        ld1             {v1.8h}, [x4], #16
565        stp             d8,  d9,  [sp, #-0x10]!
566        sxtl2           v3.4s,  v1.8h
567        sxtl            v2.4s,  v1.4h
568.endif
569        ld1             {v0.8h}, [x4]
570        sxtl2           v1.4s,  v0.8h
571        sxtl            v0.4s,  v0.4h
572
573        movi            v4.4s, #0
574        movi            v5.4s, #0
575        movi            v6.4s, #0
576        movi            v7.4s, #0
577
5781:
579        ld1             {v16.4s,v17.4s,v18.4s,v19.4s},  [x2], #64
580        ld1             {v20.4s,v21.4s,v22.4s,v23.4s},  [x2], #64
581        ld1             {v24.4s,v25.4s,v26.4s,v27.4s},  [x2], #64
582        ld1             {v28.4s,v29.4s,v30.4s,v31.4s},  [x2], #64
583        sub             x2,  x2,  #256
584        st1             {v4.4s,v5.4s,v6.4s,v7.4s},      [x2], #64
585        st1             {v4.4s,v5.4s,v6.4s,v7.4s},      [x2], #64
586        st1             {v4.4s,v5.4s,v6.4s,v7.4s},      [x2], #64
587        st1             {v4.4s,v5.4s,v6.4s,v7.4s},      [x2], #64
588
589.ifc \txfm1\()_\txfm2,idct_idct
590        idct8           v16, v18, v20, v22, v24, v26, v28, v30, v2,  v3,  v4,  v5,  v6,  v7
591        idct8           v17, v19, v21, v23, v25, v27, v29, v31, v2,  v3,  v4,  v5,  v6,  v7
592.else
593        \txfm1\()8      v16, v18, v20, v22, v24, v26, v28, v30, v4,  v5,  v6,  v7,  v8,  v9
594        \txfm1\()8      v17, v19, v21, v23, v25, v27, v29, v31, v4,  v5,  v6,  v7,  v8,  v9
595.endif
596
        // Transpose 8x8 with 32 bit elements
598        transpose_8x8s  v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v4, v5, v6, v7
599
600.ifc \txfm1\()_\txfm2,idct_idct
601        idct8           v16, v18, v20, v22, v24, v26, v28, v30, v2,  v3,  v4,  v5,  v6,  v7
602        idct8           v17, v19, v21, v23, v25, v27, v29, v31, v2,  v3,  v4,  v5,  v6,  v7
603.else
604        \txfm2\()8      v16, v18, v20, v22, v24, v26, v28, v30, v4,  v5,  v6,  v7,  v8,  v9
605        \txfm2\()8      v17, v19, v21, v23, v25, v27, v29, v31, v4,  v5,  v6,  v7,  v8,  v9
606.endif
6072:
608        mov             x3,  x0
609        // Add into the destination
610        ld1             {v0.8h},  [x0], x1
611        srshr           v16.4s, v16.4s, #5
612        srshr           v17.4s, v17.4s, #5
613        ld1             {v1.8h},  [x0], x1
614        srshr           v18.4s, v18.4s, #5
615        srshr           v19.4s, v19.4s, #5
616        ld1             {v2.8h},  [x0], x1
617        srshr           v20.4s, v20.4s, #5
618        srshr           v21.4s, v21.4s, #5
619        uaddw           v16.4s, v16.4s, v0.4h
620        uaddw2          v17.4s, v17.4s, v0.8h
621        ld1             {v3.8h},  [x0], x1
622        srshr           v22.4s, v22.4s, #5
623        srshr           v23.4s, v23.4s, #5
624        uaddw           v18.4s, v18.4s, v1.4h
625        uaddw2          v19.4s, v19.4s, v1.8h
626        ld1             {v4.8h},  [x0], x1
627        srshr           v24.4s, v24.4s, #5
628        srshr           v25.4s, v25.4s, #5
629        uaddw           v20.4s, v20.4s, v2.4h
630        uaddw2          v21.4s, v21.4s, v2.8h
631        sqxtun          v0.4h,  v16.4s
632        sqxtun2         v0.8h,  v17.4s
633        dup             v16.8h, w5
634        ld1             {v5.8h},  [x0], x1
635        srshr           v26.4s, v26.4s, #5
636        srshr           v27.4s, v27.4s, #5
637        uaddw           v22.4s, v22.4s, v3.4h
638        uaddw2          v23.4s, v23.4s, v3.8h
639        sqxtun          v1.4h,  v18.4s
640        sqxtun2         v1.8h,  v19.4s
641        umin            v0.8h,  v0.8h,  v16.8h
642        ld1             {v6.8h},  [x0], x1
643        srshr           v28.4s, v28.4s, #5
644        srshr           v29.4s, v29.4s, #5
645        uaddw           v24.4s, v24.4s, v4.4h
646        uaddw2          v25.4s, v25.4s, v4.8h
647        sqxtun          v2.4h,  v20.4s
648        sqxtun2         v2.8h,  v21.4s
649        umin            v1.8h,  v1.8h,  v16.8h
650        ld1             {v7.8h},  [x0], x1
651        srshr           v30.4s, v30.4s, #5
652        srshr           v31.4s, v31.4s, #5
653        uaddw           v26.4s, v26.4s, v5.4h
654        uaddw2          v27.4s, v27.4s, v5.8h
655        sqxtun          v3.4h,  v22.4s
656        sqxtun2         v3.8h,  v23.4s
657        umin            v2.8h,  v2.8h,  v16.8h
658
659        st1             {v0.8h},  [x3], x1
660        uaddw           v28.4s, v28.4s, v6.4h
661        uaddw2          v29.4s, v29.4s, v6.8h
662        st1             {v1.8h},  [x3], x1
663        sqxtun          v4.4h,  v24.4s
664        sqxtun2         v4.8h,  v25.4s
665        umin            v3.8h,  v3.8h,  v16.8h
666        st1             {v2.8h},  [x3], x1
667        uaddw           v30.4s, v30.4s, v7.4h
668        uaddw2          v31.4s, v31.4s, v7.8h
669        st1             {v3.8h},  [x3], x1
670        sqxtun          v5.4h,  v26.4s
671        sqxtun2         v5.8h,  v27.4s
672        umin            v4.8h,  v4.8h,  v16.8h
673        st1             {v4.8h},  [x3], x1
674        sqxtun          v6.4h,  v28.4s
675        sqxtun2         v6.8h,  v29.4s
676        umin            v5.8h,  v5.8h,  v16.8h
677        st1             {v5.8h},  [x3], x1
678        sqxtun          v7.4h,  v30.4s
679        sqxtun2         v7.8h,  v31.4s
680        umin            v6.8h,  v6.8h,  v16.8h
681
682        st1             {v6.8h},  [x3], x1
683        umin            v7.8h,  v7.8h,  v16.8h
684        st1             {v7.8h},  [x3], x1
685
686.ifnc \txfm1\()_\txfm2,idct_idct
687        ldp             d8,  d9,  [sp], 0x10
688.endif
689        ret
690endfunc
691
692function ff_vp9_\txfm1\()_\txfm2\()_8x8_add_10_neon, export=1
693        mov             x5,  #0x03ff
694        b               vp9_\txfm1\()_\txfm2\()_8x8_add_16_neon
695endfunc
696
697function ff_vp9_\txfm1\()_\txfm2\()_8x8_add_12_neon, export=1
698        mov             x5,  #0x0fff
699        b               vp9_\txfm1\()_\txfm2\()_8x8_add_16_neon
700endfunc
701.endm
702
703itxfm_func8x8 idct,  idct
704itxfm_func8x8 iadst, idct
705itxfm_func8x8 idct,  iadst
706itxfm_func8x8 iadst, iadst
707
708
709function idct16x16_dc_add_neon
710        movrel          x4,  idct_coeffs
711        ld1             {v0.4h}, [x4]
712        sxtl            v0.4s,  v0.4h
713
714        movi            v1.4h,  #0
715
716        ld1             {v2.s}[0],  [x2]
717        smull           v2.2d,  v2.2s,  v0.s[0]
718        rshrn           v2.2s,  v2.2d,  #14
719        smull           v2.2d,  v2.2s,  v0.s[0]
720        rshrn           v2.2s,  v2.2d,  #14
721        st1             {v1.s}[0],  [x2]
722        dup             v2.4s,  v2.s[0]
723
724        srshr           v0.4s,  v2.4s,  #6
725
726        mov             x3, x0
727        mov             x4, #16
728        dup             v31.8h, w13
7291:
        // Loop to add the constant from v0 into all 16x16 outputs
731        subs            x4,  x4,  #2
732        ld1             {v1.8h,v2.8h},  [x0], x1
733        uaddw           v16.4s, v0.4s,  v1.4h
734        uaddw2          v17.4s, v0.4s,  v1.8h
735        ld1             {v3.8h,v4.8h},  [x0], x1
736        uaddw           v18.4s, v0.4s,  v2.4h
737        uaddw2          v19.4s, v0.4s,  v2.8h
738        uaddw           v20.4s, v0.4s,  v3.4h
739        uaddw2          v21.4s, v0.4s,  v3.8h
740        uaddw           v22.4s, v0.4s,  v4.4h
741        uaddw2          v23.4s, v0.4s,  v4.8h
742        sqxtun          v1.4h,  v16.4s
743        sqxtun2         v1.8h,  v17.4s
744        sqxtun          v2.4h,  v18.4s
745        sqxtun2         v2.8h,  v19.4s
746        sqxtun          v3.4h,  v20.4s
747        sqxtun2         v3.8h,  v21.4s
748        sqxtun          v4.4h,  v22.4s
749        sqxtun2         v4.8h,  v23.4s
750        umin            v1.8h,  v1.8h,  v31.8h
751        umin            v2.8h,  v2.8h,  v31.8h
752        st1             {v1.8h,v2.8h},  [x3], x1
753        umin            v3.8h,  v3.8h,  v31.8h
754        umin            v4.8h,  v4.8h,  v31.8h
755        st1             {v3.8h,v4.8h},  [x3], x1
756        b.ne            1b
757
758        ret
759endfunc
760
761.macro idct16_end
762        butterfly_4s    v18, v7,  v4,  v7                // v18 = t0a,  v7  = t7a
763        butterfly_4s    v19, v22, v5,  v22               // v19 = t1a,  v22 = t6
764        butterfly_4s    v4,  v26, v20, v26               // v4  = t2a,  v26 = t5
765        butterfly_4s    v5,  v6,  v28, v6                // v5  = t3a,  v6  = t4
766        butterfly_4s    v20, v28, v16, v24               // v20 = t8a,  v28 = t11a
767        butterfly_4s    v24, v21, v23, v21               // v24 = t9,   v21 = t10
768        butterfly_4s    v23, v27, v25, v27               // v23 = t14,  v27 = t13
769        butterfly_4s    v25, v29, v29, v17               // v25 = t15a, v29 = t12a
770
771        dmbutterfly0    v8,  v9,  v27, v21, v8,  v9,  v16, v17, v30, v31 // v8  = t13a, v9  = t10a
772        dmbutterfly0    v28, v27, v29, v28, v21, v29, v16, v17, v30, v31 // v28 = t12,  v27 = t11
773
774        butterfly_4s    v16, v31, v18, v25               // v16 = out[0], v31 = out[15]
775        butterfly_4s    v17, v30, v19, v23               // v17 = out[1], v30 = out[14]
776        butterfly_4s_r  v25, v22, v22, v24               // v25 = out[9], v22 = out[6]
777        butterfly_4s    v23, v24, v7,  v20               // v23 = out[7], v24 = out[8]
778        butterfly_4s    v18, v29, v4,  v8                // v18 = out[2], v29 = out[13]
779        butterfly_4s    v19, v28, v5,  v28               // v19 = out[3], v28 = out[12]
780        butterfly_4s    v20, v27, v6,  v27               // v20 = out[4], v27 = out[11]
781        butterfly_4s    v21, v26, v26, v9                // v21 = out[5], v26 = out[10]
782        ret
783.endm
784
785function idct16
786        dmbutterfly0    v16, v24, v16, v24, v4, v5, v6, v7, v8, v9 // v16 = t0a,  v24 = t1a
787        dmbutterfly     v20, v28, v0.s[2], v0.s[3], v4, v5, v6, v7 // v20 = t2a,  v28 = t3a
788        dmbutterfly     v18, v30, v1.s[0], v1.s[1], v4, v5, v6, v7 // v18 = t4a,  v30 = t7a
789        dmbutterfly     v26, v22, v1.s[2], v1.s[3], v4, v5, v6, v7 // v26 = t5a,  v22 = t6a
790        dmbutterfly     v17, v31, v2.s[0], v2.s[1], v4, v5, v6, v7 // v17 = t8a,  v31 = t15a
791        dmbutterfly     v25, v23, v2.s[2], v2.s[3], v4, v5, v6, v7 // v25 = t9a,  v23 = t14a
792        dmbutterfly     v21, v27, v3.s[0], v3.s[1], v4, v5, v6, v7 // v21 = t10a, v27 = t13a
793        dmbutterfly     v29, v19, v3.s[2], v3.s[3], v4, v5, v6, v7 // v29 = t11a, v19 = t12a
794
795        butterfly_4s    v4,  v28, v16, v28               // v4  = t0,   v28 = t3
796        butterfly_4s    v5,  v20, v24, v20               // v5  = t1,   v20 = t2
797        butterfly_4s    v6,  v26, v18, v26               // v6  = t4,   v26 = t5
798        butterfly_4s    v7,  v22, v30, v22               // v7  = t7,   v22 = t6
799        butterfly_4s    v16, v25, v17, v25               // v16 = t8,   v25 = t9
800        butterfly_4s    v24, v21, v29, v21               // v24 = t11,  v21 = t10
801        butterfly_4s    v17, v27, v19, v27               // v17 = t12,  v27 = t13
802        butterfly_4s    v29, v23, v31, v23               // v29 = t15,  v23 = t14
803
804        dmbutterfly0    v22, v26, v22, v26, v8, v9, v18, v19, v30, v31        // v22 = t6a,  v26 = t5a
805        dmbutterfly     v23, v25, v0.s[2], v0.s[3], v18, v19, v30, v31        // v23 = t9a,  v25 = t14a
806        dmbutterfly     v27, v21, v0.s[2], v0.s[3], v18, v19, v30, v31, neg=1 // v27 = t13a, v21 = t10a
807        idct16_end
808endfunc
809
810function idct16_half
811        dmbutterfly0_h  v16, v24, v16, v24, v4, v5, v6, v7, v8, v9 // v16 = t0a,  v24 = t1a
812        dmbutterfly_h1  v20, v28, v0.s[2], v0.s[3], v4, v5, v6, v7 // v20 = t2a,  v28 = t3a
813        dmbutterfly_h1  v18, v30, v1.s[0], v1.s[1], v4, v5, v6, v7 // v18 = t4a,  v30 = t7a
814        dmbutterfly_h2  v26, v22, v1.s[2], v1.s[3], v4, v5, v6, v7 // v26 = t5a,  v22 = t6a
815        dmbutterfly_h1  v17, v31, v2.s[0], v2.s[1], v4, v5, v6, v7 // v17 = t8a,  v31 = t15a
816        dmbutterfly_h2  v25, v23, v2.s[2], v2.s[3], v4, v5, v6, v7 // v25 = t9a,  v23 = t14a
817        dmbutterfly_h1  v21, v27, v3.s[0], v3.s[1], v4, v5, v6, v7 // v21 = t10a, v27 = t13a
818        dmbutterfly_h2  v29, v19, v3.s[2], v3.s[3], v4, v5, v6, v7 // v29 = t11a, v19 = t12a
819
820        butterfly_4s    v4,  v28, v16, v28               // v4  = t0,   v28 = t3
821        butterfly_4s    v5,  v20, v24, v20               // v5  = t1,   v20 = t2
822        butterfly_4s    v6,  v26, v18, v26               // v6  = t4,   v26 = t5
823        butterfly_4s    v7,  v22, v30, v22               // v7  = t7,   v22 = t6
824        butterfly_4s    v16, v25, v17, v25               // v16 = t8,   v25 = t9
825        butterfly_4s    v24, v21, v29, v21               // v24 = t11,  v21 = t10
826        butterfly_4s    v17, v27, v19, v27               // v17 = t12,  v27 = t13
827        butterfly_4s    v29, v23, v31, v23               // v29 = t15,  v23 = t14
828
829        dmbutterfly0    v22, v26, v22, v26, v8, v9, v18, v19, v30, v31        // v22 = t6a,  v26 = t5a
830        dmbutterfly     v23, v25, v0.s[2], v0.s[3], v18, v19, v30, v31        // v23 = t9a,  v25 = t14a
831        dmbutterfly     v27, v21, v0.s[2], v0.s[3], v18, v19, v30, v31, neg=1 // v27 = t13a, v21 = t10a
832        idct16_end
833endfunc
834
835function idct16_quarter
836        dsmull_h        v24, v25, v19, v3.s[3]
837        dsmull_h        v4,  v5,  v17, v2.s[0]
838        dsmull_h        v7,  v6,  v18, v1.s[1]
839        dsmull_h        v30, v31, v18, v1.s[0]
840        neg             v24.2d,  v24.2d
841        neg             v25.2d,  v25.2d
842        dsmull_h        v29, v28, v17, v2.s[1]
843        dsmull_h        v26, v27, v19, v3.s[2]
844        dsmull_h        v22, v23, v16, v0.s[0]
845        drshrn_h        v24, v24, v25, #14
846        drshrn_h        v16, v4,  v5,  #14
847        drshrn_h        v7,  v7,  v6,  #14
848        drshrn_h        v6,  v30, v31, #14
849        drshrn_h        v29, v29, v28, #14
850        drshrn_h        v17, v26, v27, #14
851        drshrn_h        v28, v22, v23, #14
852
853        dmbutterfly_l   v20, v21, v22, v23, v17, v24, v0.s[2], v0.s[3]
854        dmbutterfly_l   v18, v19, v30, v31, v29, v16, v0.s[2], v0.s[3]
855        neg             v22.2d,  v22.2d
856        neg             v23.2d,  v23.2d
857        drshrn_h        v27, v20, v21, #14
858        drshrn_h        v21, v22, v23, #14
859        drshrn_h        v23, v18, v19, #14
860        drshrn_h        v25, v30, v31, #14
861        mov             v4.16b,  v28.16b
862        mov             v5.16b,  v28.16b
863        dmbutterfly0    v22, v26, v7,  v6,  v18, v19, v30, v31
864        mov             v20.16b, v28.16b
865        idct16_end
866endfunc
867
868function iadst16
869        ld1             {v0.8h,v1.8h}, [x11]
870        sxtl            v2.4s,  v1.4h
871        sxtl2           v3.4s,  v1.8h
872        sxtl2           v1.4s,  v0.8h
873        sxtl            v0.4s,  v0.4h
874
875        dmbutterfly_l   v6,  v7,  v4,  v5,  v31, v16, v0.s[1], v0.s[0]   // v6,v7   = t1,   v4,v5   = t0
876        dmbutterfly_l   v10, v11, v8,  v9,  v23, v24, v1.s[1], v1.s[0]   // v10,v11 = t9,   v8,v9   = t8
877        dbutterfly_n    v31, v24, v6,  v7,  v10, v11, v12, v13, v10, v11 // v31     = t1a,  v24     = t9a
878        dmbutterfly_l   v14, v15, v12, v13, v29, v18, v0.s[3], v0.s[2]   // v14,v15 = t3,   v12,v13 = t2
879        dbutterfly_n    v16, v23, v4,  v5,  v8,  v9,  v6,  v7,  v8,  v9  // v16     = t0a,  v23     = t8a
880
881        dmbutterfly_l   v6,  v7,  v4,  v5,  v21, v26, v1.s[3], v1.s[2]   // v6,v7   = t11,  v4,v5   = t10
882        dbutterfly_n    v29, v26, v14, v15, v6,  v7,  v8,  v9,  v6,  v7  // v29     = t3a,  v26     = t11a
883        dmbutterfly_l   v10, v11, v8,  v9,  v27, v20, v2.s[1], v2.s[0]   // v10,v11 = t5,   v8,v9   = t4
884        dbutterfly_n    v18, v21, v12, v13, v4,  v5,  v6,  v7,  v4,  v5  // v18     = t2a,  v21     = t10a
885
886        dmbutterfly_l   v14, v15, v12, v13, v19, v28, v3.s[1], v3.s[0]   // v14,v15 = t13,  v12,v13 = t12
887        dbutterfly_n    v20, v28, v10, v11, v14, v15, v4,  v5,  v14, v15 // v20     = t5a,  v28     = t13a
888        dmbutterfly_l   v6,  v7,  v4,  v5,  v25, v22, v2.s[3], v2.s[2]   // v6,v7   = t7,   v4,v5   = t6
889        dbutterfly_n    v27, v19, v8,  v9,  v12, v13, v10, v11, v12, v13 // v27     = t4a,  v19     = t12a
890
891        dmbutterfly_l   v10, v11, v8,  v9,  v17, v30, v3.s[3], v3.s[2]   // v10,v11 = t15,  v8,v9   = t14
892        ld1             {v0.8h}, [x10]
893        dbutterfly_n    v22, v30, v6,  v7,  v10, v11, v12, v13, v10, v11 // v22     = t7a,  v30     = t15a
894        sxtl2           v1.4s,  v0.8h
895        sxtl            v0.4s,  v0.4h
896        dmbutterfly_l   v14, v15, v12, v13, v23, v24, v1.s[0], v1.s[1]   // v14,v15 = t9,   v12,v13 = t8
897        dbutterfly_n    v25, v17, v4,  v5,  v8,  v9,  v6,  v7,  v8,  v9  // v25     = t6a,  v17     = t14a
898
899        dmbutterfly_l   v4,  v5,  v6,  v7,  v28, v19, v1.s[1], v1.s[0]   // v4,v5   = t12,  v6,v7   = t13
900        dbutterfly_n    v23, v19, v12, v13, v4,  v5,  v8,  v9,  v4,  v5  // v23     = t8a,  v19     = t12a
901        dmbutterfly_l   v10, v11, v8,  v9,  v21, v26, v1.s[2], v1.s[3]   // v10,v11 = t11,  v8,v9   = t10
902        butterfly_4s_r  v4,  v27, v16, v27               // v4  = t4,   v27 = t0
903        dbutterfly_n    v24, v28, v14, v15, v6,  v7,  v12, v13, v6,  v7  // v24     = t9a,  v28     = t13a
904
905        dmbutterfly_l   v12, v13, v14, v15, v30, v17, v1.s[3], v1.s[2]   // v12,v13 = t14,  v14,v15 = t15
906        butterfly_4s_r  v5,  v20, v31, v20               // v5  = t5, v20 = t1
907        dbutterfly_n    v21, v17, v8,  v9,  v12, v13, v6,  v7,  v12, v13 // v21     = t10a, v17     = t14a
908        dbutterfly_n    v26, v30, v10, v11, v14, v15, v8,  v9,  v14, v15 // v26     = t11a, v30     = t15a
909
910        butterfly_4s_r  v6,  v25, v18, v25               // v6  = t6, v25 = t2
911        butterfly_4s_r  v7,  v22, v29, v22               // v7  = t7, v22 = t3
912
913        dmbutterfly_l   v10, v11, v8,  v9,  v19, v28, v0.s[2], v0.s[3]   // v10,v11 = t13,  v8,v9   = t12
914        dmbutterfly_l   v12, v13, v14, v15, v30, v17, v0.s[3], v0.s[2]   // v12,v13 = t14,  v14,v15 = t15
915
916        dbutterfly_n    v18, v30, v8,  v9,  v12, v13, v16, v17, v12, v13 // v18   = out[2], v30     = t14a
917        dbutterfly_n    v29, v17, v10, v11, v14, v15, v12, v13, v14, v15 // v29 = -out[13], v17     = t15a
918        neg             v29.4s, v29.4s                   // v29 = out[13]
919
920        dmbutterfly_l   v10, v11, v8,  v9,  v4,  v5,  v0.s[2], v0.s[3]   // v10,v11 = t5a,  v8,v9   = t4a
921        dmbutterfly_l   v12, v13, v14, v15, v7,  v6,  v0.s[3], v0.s[2]   // v12,v13 = t6a,  v14,v15 = t7a
922
923        butterfly_4s    v2,  v6,  v27, v25               // v2 = out[0], v6 = t2a
924        butterfly_4s    v3,  v7,  v23, v21               // v3 =-out[1], v7 = t10
925
926        dbutterfly_n    v19, v31, v8,  v9,  v12, v13, v4,  v5,  v8,  v9  // v19 = -out[3],  v31 = t6
927        neg             v19.4s, v19.4s                   // v19 = out[3]
928        dbutterfly_n    v28, v16, v10, v11, v14, v15, v4,  v5,  v10, v11 // v28 = out[12],  v16 = t7
929
930        butterfly_4s    v5,  v8,  v20, v22               // v5 =-out[15],v8 = t3a
931        butterfly_4s    v4,  v9,  v24, v26               // v4 = out[14],v9 = t11
932
933        dmbutterfly0    v23, v24, v6,  v8,  v10, v11, v12, v13, v14, v15, 1 // v23 = out[7], v24 = out[8]
934        dmbutterfly0    v21, v26, v30, v17, v10, v11, v12, v13, v14, v15, 1 // v21 = out[5], v26 = out[10]
935        dmbutterfly0    v20, v27, v16, v31, v10, v11, v12, v13, v14, v15    // v20 = out[4], v27 = out[11]
936        dmbutterfly0    v22, v25, v9,  v7,  v10, v11, v12, v13, v14, v15    // v22 = out[6], v25 = out[9]
937
938        neg             v31.4s,  v5.4s                    // v31 = out[15]
939        neg             v17.4s,  v3.4s                    // v17 = out[1]
940
941        mov             v16.16b, v2.16b
942        mov             v30.16b, v4.16b
943        ret
944endfunc
945
946// Helper macros; we can't use these expressions directly within
947// e.g. .irp due to the extra concatenation \(). Therefore wrap
948// them in macros to allow using .irp below.
949.macro load i, src, inc
950        ld1             {v\i\().4s},  [\src], \inc
951.endm
952.macro store i, dst, inc
953        st1             {v\i\().4s},  [\dst], \inc
954.endm
955.macro movi_v i, size, imm
956        movi            v\i\()\size,  \imm
957.endm
958.macro load_clear i, src, inc
959        ld1             {v\i\().4s}, [\src]
960        st1             {v4.4s},  [\src], \inc
961.endm
962
963.macro load_add_store coef0, coef1, coef2, coef3, coef4, coef5, coef6, coef7
964        srshr           \coef0, \coef0, #6
965        ld1             {v4.4h},   [x0], x1
966        srshr           \coef1, \coef1, #6
967        ld1             {v4.d}[1], [x3], x1
968        srshr           \coef2, \coef2, #6
969        ld1             {v5.4h},   [x0], x1
970        srshr           \coef3, \coef3, #6
971        uaddw           \coef0, \coef0, v4.4h
972        ld1             {v5.d}[1], [x3], x1
973        srshr           \coef4, \coef4, #6
974        uaddw2          \coef1, \coef1, v4.8h
975        ld1             {v6.4h},   [x0], x1
976        srshr           \coef5, \coef5, #6
977        uaddw           \coef2, \coef2, v5.4h
978        ld1             {v6.d}[1], [x3], x1
979        sqxtun          v4.4h,  \coef0
980        srshr           \coef6, \coef6, #6
981        uaddw2          \coef3, \coef3, v5.8h
982        ld1             {v7.4h},   [x0], x1
983        sqxtun2         v4.8h,  \coef1
984        srshr           \coef7, \coef7, #6
985        uaddw           \coef4, \coef4, v6.4h
986        ld1             {v7.d}[1], [x3], x1
987        umin            v4.8h,  v4.8h,  v8.8h
988        sub             x0,  x0,  x1, lsl #2
989        sub             x3,  x3,  x1, lsl #2
990        sqxtun          v5.4h,  \coef2
991        uaddw2          \coef5, \coef5, v6.8h
992        st1             {v4.4h},   [x0], x1
993        sqxtun2         v5.8h,  \coef3
994        uaddw           \coef6, \coef6, v7.4h
995        st1             {v4.d}[1], [x3], x1
996        umin            v5.8h,  v5.8h,  v8.8h
997        sqxtun          v6.4h,  \coef4
998        uaddw2          \coef7, \coef7, v7.8h
999        st1             {v5.4h},   [x0], x1
1000        sqxtun2         v6.8h,  \coef5
1001        st1             {v5.d}[1], [x3], x1
1002        umin            v6.8h,  v6.8h,  v8.8h
1003        sqxtun          v7.4h,  \coef6
1004        st1             {v6.4h},   [x0], x1
1005        sqxtun2         v7.8h,  \coef7
1006        st1             {v6.d}[1], [x3], x1
1007        umin            v7.8h,  v7.8h,  v8.8h
1008        st1             {v7.4h},   [x0], x1
1009        st1             {v7.d}[1], [x3], x1
1010.endm
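
// Per output pixel, load_add_store performs (illustrative):
//   dst[i] = clip(dst[i] + ((coef + (1 << 5)) >> 6), 0, max_pixel)
// where max_pixel is the bitdepth limit the callers duplicate into v8.8h.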
1011
1012// Read a vertical 4x16 slice out of a 16x16 matrix, do a transform on it,
1013// transpose into a horizontal 16x4 slice and store.
1014// x0 = dst (temp buffer)
1015// x1 = slice offset
1016// x2 = src
1017// x9 = input stride
1018.macro itxfm16_1d_funcs txfm
1019function \txfm\()16_1d_4x16_pass1_neon
1020        mov             x14, x30
1021
1022        movi            v4.4s, #0
1023.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
1024        load_clear      \i,  x2,  x9
1025.endr
1026
1027        bl              \txfm\()16
1028
1029        // Do four 4x4 transposes. Originally, v16-v31 contain the
1030        // 16 rows. Afterwards, v16-v19, v20-v23, v24-v27 and v28-v31
1031        // contain the four transposed 4x4 blocks.
1032        transpose_4x4s  v16, v17, v18, v19, v4, v5, v6, v7
1033        transpose_4x4s  v20, v21, v22, v23, v4, v5, v6, v7
1034        transpose_4x4s  v24, v25, v26, v27, v4, v5, v6, v7
1035        transpose_4x4s  v28, v29, v30, v31, v4, v5, v6, v7
1036
1037        // Store the transposed 4x4 blocks horizontally.
1038        cmp             x1,  #12
1039        b.eq            1f
1040.irp i, 16, 20, 24, 28, 17, 21, 25, 29, 18, 22, 26, 30, 19, 23, 27, 31
1041        store           \i,  x0,  #16
1042.endr
1043        ret             x14
10441:
1045        // Special case: For the last input column (x1 == 12),
1046        // which would be stored as the last row in the temp buffer,
1047        // don't store the first 4x4 block, but keep it in registers
1048        // for the first slice of the second pass (where it is the
1049        // last 4x4 block).
1050        add             x0,  x0,  #16
1051        st1             {v20.4s},  [x0], #16
1052        st1             {v24.4s},  [x0], #16
1053        st1             {v28.4s},  [x0], #16
1054        add             x0,  x0,  #16
1055        st1             {v21.4s},  [x0], #16
1056        st1             {v25.4s},  [x0], #16
1057        st1             {v29.4s},  [x0], #16
1058        add             x0,  x0,  #16
1059        st1             {v22.4s},  [x0], #16
1060        st1             {v26.4s},  [x0], #16
1061        st1             {v30.4s},  [x0], #16
1062        add             x0,  x0,  #16
1063        st1             {v23.4s},  [x0], #16
1064        st1             {v27.4s},  [x0], #16
1065        st1             {v31.4s},  [x0], #16
1066
1067        mov             v28.16b, v16.16b
1068        mov             v29.16b, v17.16b
1069        mov             v30.16b, v18.16b
1070        mov             v31.16b, v19.16b
1071        ret             x14
1072endfunc
1073
1074// Read a vertical 4x16 slice out of a 16x16 matrix, do a transform on it,
1075// load the destination pixels (from a similar 4x16 slice), add and store back.
1076// x0 = dst
1077// x1 = dst stride
1078// x2 = src (temp buffer)
1079// x3 = slice offset
1080// x9 = temp buffer stride
1081function \txfm\()16_1d_4x16_pass2_neon
1082        mov             x14, x30
1083
1084.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27
1085        load            \i,  x2,  x9
1086.endr
1087        cbz             x3,  1f
1088.irp i, 28, 29, 30, 31
1089        load            \i,  x2,  x9
1090.endr
10911:
1092
1093        add             x3,  x0,  x1
1094        lsl             x1,  x1,  #1
1095        bl              \txfm\()16
1096
1097        dup             v8.8h, w13
1098        load_add_store  v16.4s, v17.4s, v18.4s, v19.4s, v20.4s, v21.4s, v22.4s, v23.4s
1099        load_add_store  v24.4s, v25.4s, v26.4s, v27.4s, v28.4s, v29.4s, v30.4s, v31.4s
1100
1101        ret             x14
1102endfunc
1103.endm
1104
1105itxfm16_1d_funcs idct
1106itxfm16_1d_funcs iadst
1107
1108// This is the minimum eob value for each subpartition, in increments of 4
1109const min_eob_idct_idct_16, align=4
1110        .short  0, 10, 38, 89
1111endconst
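
// In the idct_idct 16x16 function below, eob <= 10 takes a quarter path (only
// the first 4 input columns assumed nonzero) and eob <= 38 a half path (first
// 8 columns); otherwise the thresholds above are used to skip the transform in
// pass 1 for 4-column slices that are entirely zero (their rows in the temp
// buffer are just cleared).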
1112
1113.macro itxfm_func16x16 txfm1, txfm2
1114function vp9_\txfm1\()_\txfm2\()_16x16_add_16_neon
1115.ifc \txfm1\()_\txfm2,idct_idct
1116        cmp             w3,  #1
1117        b.eq            idct16x16_dc_add_neon
1118.endif
1119        mov             x15, x30
1120        // iadst16 requires clobbering v8-v15, idct16 only clobbers v8-v9.
1121.ifnc \txfm1\()_\txfm2,idct_idct
1122        stp             d14, d15, [sp, #-0x10]!
1123        stp             d12, d13, [sp, #-0x10]!
1124        stp             d10, d11, [sp, #-0x10]!
1125.endif
1126        stp             d8,  d9,  [sp, #-0x10]!
1127
1128        sub             sp,  sp,  #1024
1129
1130        mov             x4,  x0
1131        mov             x5,  x1
1132        mov             x6,  x2
1133
1134        movrel          x10, idct_coeffs
1135.ifnc \txfm1\()_\txfm2,idct_idct
1136        movrel          x11, iadst16_coeffs
1137.endif
1138.ifc \txfm1,idct
1139        ld1             {v0.8h,v1.8h}, [x10]
1140        sxtl            v2.4s,  v1.4h
1141        sxtl2           v3.4s,  v1.8h
1142        sxtl2           v1.4s,  v0.8h
1143        sxtl            v0.4s,  v0.4h
1144.endif
1145        mov             x9,  #64
1146
1147.ifc \txfm1\()_\txfm2,idct_idct
1148        cmp             w3,  #10
1149        b.le            idct16x16_quarter_add_16_neon
1150        cmp             w3,  #38
1151        b.le            idct16x16_half_add_16_neon
1152
1153        movrel          x12, min_eob_idct_idct_16, 2
1154.endif
1155
1156.irp i, 0, 4, 8, 12
1157        add             x0,  sp,  #(\i*64)
1158.ifc \txfm1\()_\txfm2,idct_idct
1159.if \i > 0
1160        ldrh            w1,  [x12], #2
1161        cmp             w3,  w1
1162        mov             x1,  #(16 - \i)/4
1163        b.le            1f
1164.endif
1165.endif
1166        mov             x1,  #\i
1167        add             x2,  x6,  #(\i*4)
1168        bl              \txfm1\()16_1d_4x16_pass1_neon
1169.endr
1170.ifc \txfm1\()_\txfm2,iadst_idct
1171        ld1             {v0.8h,v1.8h}, [x10]
1172        sxtl            v2.4s,  v1.4h
1173        sxtl2           v3.4s,  v1.8h
1174        sxtl2           v1.4s,  v0.8h
1175        sxtl            v0.4s,  v0.4h
1176.endif
1177
1178.ifc \txfm1\()_\txfm2,idct_idct
1179        b               3f
11801:
1181        // Set v28-v31 to zero, for the in-register passthrough of
1182        // coefficients to pass 2.
1183        movi            v28.4s,  #0
1184        movi            v29.4s,  #0
1185        movi            v30.4s,  #0
1186        movi            v31.4s,  #0
11872:
1188        subs            x1,  x1,  #1
1189.rept 4
1190        st1             {v28.4s,v29.4s,v30.4s,v31.4s}, [x0], x9
1191.endr
1192        b.ne            2b
11933:
1194.endif
1195
1196.irp i, 0, 4, 8, 12
1197        add             x0,  x4,  #(\i*2)
1198        mov             x1,  x5
1199        add             x2,  sp,  #(\i*4)
1200        mov             x3,  #\i
1201        bl              \txfm2\()16_1d_4x16_pass2_neon
1202.endr
1203
1204        add             sp,  sp,  #1024
1205        ldp             d8,  d9,  [sp], 0x10
1206.ifnc \txfm1\()_\txfm2,idct_idct
1207        ldp             d10, d11, [sp], 0x10
1208        ldp             d12, d13, [sp], 0x10
1209        ldp             d14, d15, [sp], 0x10
1210.endif
1211        ret             x15
1212endfunc
1213
1214function ff_vp9_\txfm1\()_\txfm2\()_16x16_add_10_neon, export=1
1215        mov             x13, #0x03ff
1216        b               vp9_\txfm1\()_\txfm2\()_16x16_add_16_neon
1217endfunc
1218
1219function ff_vp9_\txfm1\()_\txfm2\()_16x16_add_12_neon, export=1
1220        mov             x13, #0x0fff
1221        b               vp9_\txfm1\()_\txfm2\()_16x16_add_16_neon
1222endfunc
1223.endm
1224
1225itxfm_func16x16 idct,  idct
1226itxfm_func16x16 iadst, idct
1227itxfm_func16x16 idct,  iadst
1228itxfm_func16x16 iadst, iadst
1229
1230function idct16_1d_4x16_pass1_quarter_neon
1231        mov             x14, x30
1232
1233        movi            v4.4s, #0
1234.irp i, 16, 17, 18, 19
1235        load_clear      \i,  x2,  x9
1236.endr
1237
1238        bl              idct16_quarter
1239
1240        // Do four 4x4 transposes. Originally, v16-v31 contain the
1241        // 16 rows. Afterwards, v16-v19, v20-v23, v24-v27 and v28-v31
1242        // contain the four transposed 4x4 blocks.
1243        transpose_4x4s  v16, v17, v18, v19, v4, v5, v6, v7
1244        transpose_4x4s  v20, v21, v22, v23, v4, v5, v6, v7
1245        transpose_4x4s  v24, v25, v26, v27, v4, v5, v6, v7
1246        transpose_4x4s  v28, v29, v30, v31, v4, v5, v6, v7
1247
1248        // Store the transposed 4x4 blocks horizontally.
1249        // The first 4x4 block is kept in registers for the second pass,
1250        // store the rest in the temp buffer.
1251        add             x0,  x0,  #16
1252        st1             {v20.4s},  [x0], #16
1253        st1             {v24.4s},  [x0], #16
1254        st1             {v28.4s},  [x0], #16
1255        add             x0,  x0,  #16
1256        st1             {v21.4s},  [x0], #16
1257        st1             {v25.4s},  [x0], #16
1258        st1             {v29.4s},  [x0], #16
1259        add             x0,  x0,  #16
1260        st1             {v22.4s},  [x0], #16
1261        st1             {v26.4s},  [x0], #16
1262        st1             {v30.4s},  [x0], #16
1263        add             x0,  x0,  #16
1264        st1             {v23.4s},  [x0], #16
1265        st1             {v27.4s},  [x0], #16
1266        st1             {v31.4s},  [x0], #16
1267        ret             x14
1268endfunc
1269
1270function idct16_1d_4x16_pass2_quarter_neon
1271        mov             x14, x30
1272
1273        // Only load the top 4 lines, and only do it for the later slices.
1274        // For the first slice, d16-d19 is kept in registers from the first pass.
1275        cbz             x3,  1f
1276.irp i, 16, 17, 18, 19
1277        load            \i,  x2,  x9
1278.endr
12791:
1280
1281        add             x3,  x0,  x1
1282        lsl             x1,  x1,  #1
1283        bl              idct16_quarter
1284
1285        dup             v8.8h, w13
1286        load_add_store  v16.4s, v17.4s, v18.4s, v19.4s, v20.4s, v21.4s, v22.4s, v23.4s
1287        load_add_store  v24.4s, v25.4s, v26.4s, v27.4s, v28.4s, v29.4s, v30.4s, v31.4s
1288
1289        ret             x14
1290endfunc
1291
1292function idct16_1d_4x16_pass1_half_neon
1293        mov             x14, x30
1294
1295        movi            v4.4s, #0
1296.irp i, 16, 17, 18, 19, 20, 21, 22, 23
1297        load_clear      \i,  x2,  x9
1298.endr
1299
1300        bl              idct16_half
1301
1302        // Do four 4x4 transposes. Originally, v16-v31 contain the
1303        // 16 rows. Afterwards, v16-v19, v20-v23, v24-v27 and v28-v31
1304        // contain the four transposed 4x4 blocks.
1305        transpose_4x4s  v16, v17, v18, v19, v4, v5, v6, v7
1306        transpose_4x4s  v20, v21, v22, v23, v4, v5, v6, v7
1307        transpose_4x4s  v24, v25, v26, v27, v4, v5, v6, v7
1308        transpose_4x4s  v28, v29, v30, v31, v4, v5, v6, v7
1309
1310        // Store the transposed 4x4 blocks horizontally.
1311        cmp             x1,  #4
1312        b.eq            1f
1313.irp i, 16, 20, 24, 28, 17, 21, 25, 29, 18, 22, 26, 30, 19, 23, 27, 31
1314        store           \i,  x0,  #16
1315.endr
1316        ret             x14
13171:
1318        // Special case: For the second input column (r1 == 4),
1319        // which would be stored as the second row in the temp buffer,
1320        // don't store the first 4x4 block, but keep it in registers
1321        // for the first slice of the second pass (where it is the
1322        // second 4x4 block).
1323        add             x0,  x0,  #16
1324        st1             {v20.4s},  [x0], #16
1325        st1             {v24.4s},  [x0], #16
1326        st1             {v28.4s},  [x0], #16
1327        add             x0,  x0,  #16
1328        st1             {v21.4s},  [x0], #16
1329        st1             {v25.4s},  [x0], #16
1330        st1             {v29.4s},  [x0], #16
1331        add             x0,  x0,  #16
1332        st1             {v22.4s},  [x0], #16
1333        st1             {v26.4s},  [x0], #16
1334        st1             {v30.4s},  [x0], #16
1335        add             x0,  x0,  #16
1336        st1             {v23.4s},  [x0], #16
1337        st1             {v27.4s},  [x0], #16
1338        st1             {v31.4s},  [x0], #16
1339
1340        mov             v20.16b, v16.16b
1341        mov             v21.16b, v17.16b
1342        mov             v22.16b, v18.16b
1343        mov             v23.16b, v19.16b
1344        ret             x14
1345endfunc
1346
1347function idct16_1d_4x16_pass2_half_neon
1348        mov             x14, x30
1349
1350.irp i, 16, 17, 18, 19
1351        load            \i,  x2,  x9
1352.endr
1353        cbz             x3,  1f
1354.irp i, 20, 21, 22, 23
1355        load            \i,  x2,  x9
1356.endr
13571:
1358
1359        add             x3,  x0,  x1
1360        lsl             x1,  x1,  #1
1361        bl              idct16_half
1362
1363        dup             v8.8h, w13
1364        load_add_store  v16.4s, v17.4s, v18.4s, v19.4s, v20.4s, v21.4s, v22.4s, v23.4s
1365        load_add_store  v24.4s, v25.4s, v26.4s, v27.4s, v28.4s, v29.4s, v30.4s, v31.4s
1366
1367        ret             x14
1368endfunc
1369
1370.macro idct16_partial size
1371function idct16x16_\size\()_add_16_neon
1372        add             x0,  sp,  #(0*64)
1373        mov             x1,  #0
1374        add             x2,  x6,  #(0*4)
1375        bl              idct16_1d_4x16_pass1_\size\()_neon
1376.ifc \size,half
1377        add             x0,  sp,  #(4*64)
1378        mov             x1,  #4
1379        add             x2,  x6,  #(4*4)
1380        bl              idct16_1d_4x16_pass1_\size\()_neon
1381.endif
1382
1383.irp i, 0, 4, 8, 12
1384        add             x0,  x4,  #(\i*2)
1385        mov             x1,  x5
1386        add             x2,  sp,  #(\i*4)
1387        mov             x3,  #\i
1388        bl              idct16_1d_4x16_pass2_\size\()_neon
1389.endr
1390
1391        add             sp,  sp,  #1024
1392        ldp             d8,  d9,  [sp], 0x10
1393        ret             x15
1394endfunc
1395.endm
1396
1397idct16_partial quarter
1398idct16_partial half
1399
1400function idct32x32_dc_add_neon
1401        movrel          x4,  idct_coeffs
1402        ld1             {v0.4h}, [x4]
1403        sxtl            v0.4s,  v0.4h
1404
1405        movi            v1.4h,  #0
1406
1407        ld1             {v2.s}[0],  [x2]
1408        smull           v2.2d,  v2.2s,  v0.s[0]
1409        rshrn           v2.2s,  v2.2d,  #14
1410        smull           v2.2d,  v2.2s,  v0.s[0]
1411        rshrn           v2.2s,  v2.2d,  #14
1412        st1             {v1.s}[0],  [x2]
1413        dup             v2.4s,  v2.s[0]
1414
1415        srshr           v0.4s,  v2.4s,  #6
1416
1417        mov             x3,  x0
1418        mov             x4,  #32
1419        sub             x1,  x1,  #32
1420        dup             v31.8h, w13
14211:
1422        // Loop to add the constant v0 into all 32x32 outputs
1423        subs            x4,  x4,  #1
1424        ld1             {v1.8h,v2.8h},  [x0], #32
1425        uaddw           v16.4s, v0.4s,  v1.4h
1426        uaddw2          v17.4s, v0.4s,  v1.8h
1427        ld1             {v3.8h,v4.8h},  [x0], x1
1428        uaddw           v18.4s, v0.4s,  v2.4h
1429        uaddw2          v19.4s, v0.4s,  v2.8h
1430        uaddw           v20.4s, v0.4s,  v3.4h
1431        uaddw2          v21.4s, v0.4s,  v3.8h
1432        uaddw           v22.4s, v0.4s,  v4.4h
1433        uaddw2          v23.4s, v0.4s,  v4.8h
1434        sqxtun          v1.4h,  v16.4s
1435        sqxtun2         v1.8h,  v17.4s
1436        sqxtun          v2.4h,  v18.4s
1437        sqxtun2         v2.8h,  v19.4s
1438        sqxtun          v3.4h,  v20.4s
1439        sqxtun2         v3.8h,  v21.4s
1440        sqxtun          v4.4h,  v22.4s
1441        sqxtun2         v4.8h,  v23.4s
1442        umin            v1.8h,  v1.8h,  v31.8h
1443        umin            v2.8h,  v2.8h,  v31.8h
1444        st1             {v1.8h,v2.8h},  [x3], #32
1445        umin            v3.8h,  v3.8h,  v31.8h
1446        umin            v4.8h,  v4.8h,  v31.8h
1447        st1             {v3.8h,v4.8h},  [x3], x1
1448        b.ne            1b
1449
1450        ret
1451endfunc
1452
1453.macro idct32_end
1454        butterfly_4s    v16, v5,  v4,  v5  // v16 = t16a, v5  = t19a
1455        butterfly_4s    v17, v20, v23, v20 // v17 = t17,  v20 = t18
1456        butterfly_4s    v18, v6,  v7,  v6  // v18 = t23a, v6  = t20a
1457        butterfly_4s    v19, v21, v22, v21 // v19 = t22,  v21 = t21
1458        butterfly_4s    v4,  v28, v28, v30 // v4  = t24a, v28 = t27a
1459        butterfly_4s    v23, v26, v25, v26 // v23 = t25,  v26 = t26
1460        butterfly_4s    v7,  v8,  v29, v31 // v7  = t31a, v3  = t28a
1461        butterfly_4s    v22, v27, v24, v27 // v22 = t30,  v27 = t29
1462
1463        dmbutterfly     v27, v20, v0.s[2], v0.s[3], v24, v25, v30, v31        // v27 = t18a, v20 = t29a
1464        dmbutterfly     v8,  v5,  v0.s[2], v0.s[3], v24, v25, v30, v31        // v3  = t19,  v5  = t28
1465        dmbutterfly     v28, v6,  v0.s[2], v0.s[3], v24, v25, v30, v31, neg=1 // v28 = t27,  v6  = t20
1466        dmbutterfly     v26, v21, v0.s[2], v0.s[3], v24, v25, v30, v31, neg=1 // v26 = t26a, v21 = t21a
1467
1468        butterfly_4s    v31, v24, v7,  v4  // v31 = t31,  v24 = t24
1469        butterfly_4s    v30, v25, v22, v23 // v30 = t30a, v25 = t25a
1470        butterfly_4s_r  v23, v16, v16, v18 // v23 = t23,  v16 = t16
1471        butterfly_4s_r  v22, v17, v17, v19 // v22 = t22a, v17 = t17a
1472        butterfly_4s    v18, v21, v27, v21 // v18 = t18,  v21 = t21
1473        butterfly_4s_r  v27, v28, v5,  v28 // v27 = t27a, v28 = t28a
1474        butterfly_4s    v29, v26, v20, v26 // v29 = t29,  v26 = t26
1475        butterfly_4s    v19, v20, v8,  v6  // v19 = t19a, v20 = t20
1476
1477        dmbutterfly0    v27, v20, v27, v20, v4, v5, v6, v7, v8, v9 // v27 = t27,  v20 = t20
1478        dmbutterfly0    v26, v21, v26, v21, v4, v5, v6, v7, v8, v9 // v26 = t26a, v21 = t21a
1479        dmbutterfly0    v25, v22, v25, v22, v4, v5, v6, v7, v8, v9 // v25 = t25,  v22 = t22
1480        dmbutterfly0    v24, v23, v24, v23, v4, v5, v6, v7, v8, v9 // v24 = t24a, v23 = t23a
1481        ret
1482.endm
1483
1484function idct32_odd
1485        dmbutterfly     v16, v31, v10.s[0], v10.s[1], v4, v5, v6, v7 // v16 = t16a, v31 = t31a
1486        dmbutterfly     v24, v23, v10.s[2], v10.s[3], v4, v5, v6, v7 // v24 = t17a, v23 = t30a
1487        dmbutterfly     v20, v27, v11.s[0], v11.s[1], v4, v5, v6, v7 // v20 = t18a, v27 = t29a
1488        dmbutterfly     v28, v19, v11.s[2], v11.s[3], v4, v5, v6, v7 // v28 = t19a, v19 = t28a
1489        dmbutterfly     v18, v29, v12.s[0], v12.s[1], v4, v5, v6, v7 // v18 = t20a, v29 = t27a
1490        dmbutterfly     v26, v21, v12.s[2], v12.s[3], v4, v5, v6, v7 // v26 = t21a, v21 = t26a
1491        dmbutterfly     v22, v25, v13.s[0], v13.s[1], v4, v5, v6, v7 // v22 = t22a, v25 = t25a
1492        dmbutterfly     v30, v17, v13.s[2], v13.s[3], v4, v5, v6, v7 // v30 = t23a, v17 = t24a
1493
1494        butterfly_4s    v4,  v24, v16, v24 // v4  = t16, v24 = t17
1495        butterfly_4s    v5,  v20, v28, v20 // v5  = t19, v20 = t18
1496        butterfly_4s    v6,  v26, v18, v26 // v6  = t20, v26 = t21
1497        butterfly_4s    v7,  v22, v30, v22 // v7  = t23, v22 = t22
1498        butterfly_4s    v28, v25, v17, v25 // v28 = t24, v25 = t25
1499        butterfly_4s    v30, v21, v29, v21 // v30 = t27, v21 = t26
1500        butterfly_4s    v29, v23, v31, v23 // v29 = t31, v23 = t30
1501        butterfly_4s    v31, v27, v19, v27 // v31 = t28, v27 = t29
1502
1503        dmbutterfly     v23, v24, v1.s[0], v1.s[1], v16, v17, v18, v19        // v23 = t17a, v24 = t30a
1504        dmbutterfly     v27, v20, v1.s[0], v1.s[1], v16, v17, v18, v19, neg=1 // v27 = t29a, v20 = t18a
1505        dmbutterfly     v21, v26, v1.s[2], v1.s[3], v16, v17, v18, v19        // v21 = t21a, v26 = t26a
1506        dmbutterfly     v25, v22, v1.s[2], v1.s[3], v16, v17, v18, v19, neg=1 // v25 = t25a, v22 = t22a
1507        idct32_end
1508endfunc
1509
1510function idct32_odd_half
1511        dmbutterfly_h1  v16, v31, v10.s[0], v10.s[1], v4, v5, v6, v7 // v16 = t16a, v31 = t31a
1512        dmbutterfly_h2  v24, v23, v10.s[2], v10.s[3], v4, v5, v6, v7 // v24 = t17a, v23 = t30a
1513        dmbutterfly_h1  v20, v27, v11.s[0], v11.s[1], v4, v5, v6, v7 // v20 = t18a, v27 = t29a
1514        dmbutterfly_h2  v28, v19, v11.s[2], v11.s[3], v4, v5, v6, v7 // v28 = t19a, v19 = t28a
1515        dmbutterfly_h1  v18, v29, v12.s[0], v12.s[1], v4, v5, v6, v7 // v18 = t20a, v29 = t27a
1516        dmbutterfly_h2  v26, v21, v12.s[2], v12.s[3], v4, v5, v6, v7 // v26 = t21a, v21 = t26a
1517        dmbutterfly_h1  v22, v25, v13.s[0], v13.s[1], v4, v5, v6, v7 // v22 = t22a, v25 = t25a
1518        dmbutterfly_h2  v30, v17, v13.s[2], v13.s[3], v4, v5, v6, v7 // v30 = t23a, v17 = t24a
1519
1520        butterfly_4s    v4,  v24, v16, v24 // v4  = t16, v24 = t17
1521        butterfly_4s    v5,  v20, v28, v20 // v5  = t19, v20 = t18
1522        butterfly_4s    v6,  v26, v18, v26 // v6  = t20, v26 = t21
1523        butterfly_4s    v7,  v22, v30, v22 // v7  = t23, v22 = t22
1524        butterfly_4s    v28, v25, v17, v25 // v28 = t24, v25 = t25
1525        butterfly_4s    v30, v21, v29, v21 // v30 = t27, v21 = t26
1526        butterfly_4s    v29, v23, v31, v23 // v29 = t31, v23 = t30
1527        butterfly_4s    v31, v27, v19, v27 // v31 = t28, v27 = t29
1528
1529        dmbutterfly     v23, v24, v1.s[0], v1.s[1], v16, v17, v18, v19        // v23 = t17a, v24 = t30a
1530        dmbutterfly     v27, v20, v1.s[0], v1.s[1], v16, v17, v18, v19, neg=1 // v27 = t29a, v20 = t18a
1531        dmbutterfly     v21, v26, v1.s[2], v1.s[3], v16, v17, v18, v19        // v21 = t21a, v26 = t26a
1532        dmbutterfly     v25, v22, v1.s[2], v1.s[3], v16, v17, v18, v19, neg=1 // v25 = t25a, v22 = t22a
1533        idct32_end
1534endfunc
1535
1536function idct32_odd_quarter
1537        dsmull_h        v4,  v5,  v16, v10.s[0]
1538        dsmull_h        v28, v29, v19, v11.s[3]
1539        dsmull_h        v30, v31, v16, v10.s[1]
1540        dsmull_h        v22, v23, v17, v13.s[2]
1541        dsmull_h        v7,  v6,  v17, v13.s[3]
1542        dsmull_h        v26, v27, v19, v11.s[2]
1543        dsmull_h        v20, v21, v18, v12.s[0]
1544        dsmull_h        v24, v25, v18, v12.s[1]
1545
1546        neg             v28.2d, v28.2d
1547        neg             v29.2d, v29.2d
1548        neg             v7.2d,  v7.2d
1549        neg             v6.2d,  v6.2d
1550
1551        drshrn_h        v4,  v4,  v5,  #14
1552        drshrn_h        v5,  v28, v29, #14
1553        drshrn_h        v29, v30, v31, #14
1554        drshrn_h        v28, v22, v23, #14
1555        drshrn_h        v7,  v7,  v6,  #14
1556        drshrn_h        v31, v26, v27, #14
1557        drshrn_h        v6,  v20, v21, #14
1558        drshrn_h        v30, v24, v25, #14
1559
1560        dmbutterfly_l   v16, v17, v18, v19, v29, v4,  v1.s[0], v1.s[1]
1561        dmbutterfly_l   v27, v26, v20, v21, v31, v5,  v1.s[0], v1.s[1]
1562        drshrn_h        v23, v16, v17, #14
1563        drshrn_h        v24, v18, v19, #14
1564        neg             v20.2d, v20.2d
1565        neg             v21.2d, v21.2d
1566        drshrn_h        v27, v27, v26, #14
1567        drshrn_h        v20, v20, v21, #14
1568        dmbutterfly_l   v16, v17, v18, v19, v30, v6,  v1.s[2], v1.s[3]
1569        drshrn_h        v21, v16, v17, #14
1570        drshrn_h        v26, v18, v19, #14
1571        dmbutterfly_l   v16, v17, v18, v19, v28, v7,  v1.s[2], v1.s[3]
1572        drshrn_h        v25, v16, v17, #14
1573        neg             v18.2d, v18.2d
1574        neg             v19.2d, v19.2d
1575        drshrn_h        v22, v18, v19, #14
1576
1577        idct32_end
1578endfunc
1579
1580.macro idct32_funcs suffix
1581// Do an 32-point IDCT of a 4x32 slice out of a 32x32 matrix.
1582// The 32-point IDCT can be decomposed into two 16-point IDCTs;
1583// a normal IDCT16 with every other input component (the even ones, with
1584// each output written twice), followed by a separate 16-point IDCT
1585// of the odd inputs, added/subtracted onto the outputs of the first idct16.
1586// x0 = dst (temp buffer)
1587// x1 = unused
1588// x2 = src
1589// x9 = double input stride
1590function idct32_1d_4x32_pass1\suffix\()_neon
1591        mov             x14, x30
1592
1593        movi            v4.4s,  #0
1594
1595        // v16 = IN(0), v17 = IN(2) ... v31 = IN(30)
1596.ifb \suffix
1597.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
1598        load_clear      \i, x2, x9
1599.endr
1600.endif
1601.ifc \suffix,_quarter
1602.irp i, 16, 17, 18, 19
1603        load_clear      \i, x2, x9
1604.endr
1605.endif
1606.ifc \suffix,_half
1607.irp i, 16, 17, 18, 19, 20, 21, 22, 23
1608        load_clear      \i, x2, x9
1609.endr
1610.endif
1611
1612        bl              idct16\suffix
1613
1614        // Do four 4x4 transposes. Originally, v16-v31 contain the
1615        // 16 rows. Afterwards, v16-v19, v20-v23, v24-v27 and v28-v31
1616        // contain the four transposed 4x4 blocks.
1617        transpose_4x4s  v16, v17, v18, v19, v4, v5, v6, v7
1618        transpose_4x4s  v20, v21, v22, v23, v4, v5, v6, v7
1619        transpose_4x4s  v24, v25, v26, v27, v4, v5, v6, v7
1620        transpose_4x4s  v28, v29, v30, v31, v4, v5, v6, v7
1621
1622        // Store the registers a, b, c, d horizontally, followed by the
1623        // same registers d, c, b, a mirrored.
1624.macro store_rev a, b, c, d
1625        // There's no rev128 instruction, but we reverse each 64 bit
1626        // half, and then flip them using an ext with 8 bytes offset.
1627        rev64           v7.4s, \d
1628        st1             {\a},  [x0], #16
1629        ext             v7.16b, v7.16b, v7.16b, #8
1630        st1             {\b},  [x0], #16
1631        rev64           v6.4s, \c
1632        st1             {\c},  [x0], #16
1633        ext             v6.16b, v6.16b, v6.16b, #8
1634        st1             {\d},  [x0], #16
1635        rev64           v5.4s, \b
1636        st1             {v7.4s},  [x0], #16
1637        ext             v5.16b, v5.16b, v5.16b, #8
1638        st1             {v6.4s},  [x0], #16
1639        rev64           v4.4s, \a
1640        st1             {v5.4s},  [x0], #16
1641        ext             v4.16b, v4.16b, v4.16b, #8
1642        st1             {v4.4s},  [x0], #16
1643.endm
1644        store_rev       v16.4s, v20.4s, v24.4s, v28.4s
1645        store_rev       v17.4s, v21.4s, v25.4s, v29.4s
1646        store_rev       v18.4s, v22.4s, v26.4s, v30.4s
1647        store_rev       v19.4s, v23.4s, v27.4s, v31.4s
1648        sub             x0,  x0,  #512
1649.purgem store_rev
1650
1651        // Move x2 back to the start of the input, and move
1652        // to the first odd row
1653.ifb \suffix
1654        sub             x2,  x2,  x9, lsl #4
1655.endif
1656.ifc \suffix,_quarter
1657        sub             x2,  x2,  x9, lsl #2
1658.endif
1659.ifc \suffix,_half
1660        sub             x2,  x2,  x9, lsl #3
1661.endif
1662        add             x2,  x2,  #128
1663
1664        movi            v4.4s,  #0
1665        // v16 = IN(1), v17 = IN(3) ... v31 = IN(31)
1666.ifb \suffix
1667.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
1668        load_clear      \i, x2, x9
1669.endr
1670.endif
1671.ifc \suffix,_quarter
1672.irp i, 16, 17, 18, 19
1673        load_clear      \i, x2, x9
1674.endr
1675.endif
1676.ifc \suffix,_half
1677.irp i, 16, 17, 18, 19, 20, 21, 22, 23
1678        load_clear      \i, x2, x9
1679.endr
1680.endif
1681
1682        bl              idct32_odd\suffix
1683
1684        transpose_4x4s  v31, v30, v29, v28, v4, v5, v6, v7
1685        transpose_4x4s  v27, v26, v25, v24, v4, v5, v6, v7
1686        transpose_4x4s  v23, v22, v21, v20, v4, v5, v6, v7
1687        transpose_4x4s  v19, v18, v17, v16, v4, v5, v6, v7
1688
1689        // Store the registers a, b, c, d horizontally,
1690        // adding into the output first, and the mirrored,
1691        // subtracted from the output.
1692.macro store_rev a, b, c, d, a16b, b16b
1693        ld1             {v4.4s},  [x0]
1694        rev64           v9.4s, \d
1695        add             v4.4s, v4.4s, \a
1696        st1             {v4.4s},  [x0], #16
1697        rev64           v8.4s, \c
1698        ld1             {v4.4s},  [x0]
1699        ext             v9.16b, v9.16b, v9.16b, #8
1700        add             v4.4s, v4.4s, \b
1701        st1             {v4.4s},  [x0], #16
1702        ext             v8.16b, v8.16b, v8.16b, #8
1703        ld1             {v4.4s},  [x0]
1704        rev64           \b, \b
1705        add             v4.4s, v4.4s, \c
1706        st1             {v4.4s},  [x0], #16
1707        rev64           \a, \a
1708        ld1             {v4.4s},  [x0]
1709        ext             \b16b, \b16b, \b16b, #8
1710        add             v4.4s, v4.4s, \d
1711        st1             {v4.4s},  [x0], #16
1712        ext             \a16b, \a16b, \a16b, #8
1713        ld1             {v4.4s},  [x0]
1714        sub             v4.4s, v4.4s, v9.4s
1715        st1             {v4.4s},  [x0], #16
1716        ld1             {v4.4s},  [x0]
1717        sub             v4.4s, v4.4s, v8.4s
1718        st1             {v4.4s},  [x0], #16
1719        ld1             {v4.4s},  [x0]
1720        sub             v4.4s, v4.4s, \b
1721        st1             {v4.4s},  [x0], #16
1722        ld1             {v4.4s},  [x0]
1723        sub             v4.4s, v4.4s, \a
1724        st1             {v4.4s},  [x0], #16
1725.endm
1726
1727        store_rev       v31.4s, v27.4s, v23.4s, v19.4s, v31.16b, v27.16b
1728        store_rev       v30.4s, v26.4s, v22.4s, v18.4s, v30.16b, v26.16b
1729        store_rev       v29.4s, v25.4s, v21.4s, v17.4s, v29.16b, v25.16b
1730        store_rev       v28.4s, v24.4s, v20.4s, v16.4s, v28.16b, v24.16b
1731.purgem store_rev
1732        ret             x14
1733endfunc
1734
1735// This is mostly the same as 4x32_pass1, but without the transpose,
1736// and use the source as temp buffer between the two idct passes, and
1737// add into the destination.
1738// x0 = dst
1739// x1 = dst stride
1740// x2 = src (temp buffer)
1741// x7 = negative double temp buffer stride
1742// x9 = double temp buffer stride
1743function idct32_1d_4x32_pass2\suffix\()_neon
1744        mov             x14, x30
1745
1746        // v16 = IN(0), v17 = IN(2) ... v31 = IN(30)
1747.ifb \suffix
1748.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
1749        load            \i, x2, x9
1750.endr
1751        sub             x2,  x2,  x9, lsl #4
1752.endif
1753.ifc \suffix,_quarter
1754.irp i, 16, 17, 18, 19
1755        load            \i, x2, x9
1756.endr
1757        sub             x2,  x2,  x9, lsl #2
1758.endif
1759.ifc \suffix,_half
1760.irp i, 16, 17, 18, 19, 20, 21, 22, 23
1761        load            \i, x2, x9
1762.endr
1763        sub             x2,  x2,  x9, lsl #3
1764.endif
1765
1766        bl              idct16\suffix
1767
1768.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
1769        store           \i, x2, x9
1770.endr
1771
1772        sub             x2,  x2,  x9, lsl #4
1773        add             x2,  x2,  #128
1774
1775        // v16 = IN(1), v17 = IN(3) ... v31 = IN(31)
1776.ifb \suffix
1777.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
1778        load            \i, x2, x9
1779.endr
1780        sub             x2,  x2,  x9, lsl #4
1781.endif
1782.ifc \suffix,_quarter
1783.irp i, 16, 17, 18, 19
1784        load            \i, x2, x9
1785.endr
1786        sub             x2,  x2,  x9, lsl #2
1787.endif
1788.ifc \suffix,_half
1789.irp i, 16, 17, 18, 19, 20, 21, 22, 23
1790        load            \i, x2, x9
1791.endr
1792        sub             x2,  x2,  x9, lsl #3
1793.endif
1794        sub             x2,  x2,  #128
1795
1796        bl              idct32_odd\suffix
1797
1798.macro load_acc_store a, b, c, d, neg=0
1799.if \neg == 0
1800        ld1             {v4.4s},  [x2], x9
1801        ld1             {v5.4s},  [x2], x9
1802        add             v4.4s, v4.4s, \a
1803        ld1             {v6.4s},  [x2], x9
1804        add             v5.4s, v5.4s, \b
1805        ld1             {v7.4s},  [x2], x9
1806        add             v6.4s, v6.4s, \c
1807        add             v7.4s, v7.4s, \d
1808.else
1809        ld1             {v4.4s},  [x2], x7
1810        ld1             {v5.4s},  [x2], x7
1811        sub             v4.4s, v4.4s, \a
1812        ld1             {v6.4s},  [x2], x7
1813        sub             v5.4s, v5.4s, \b
1814        ld1             {v7.4s},  [x2], x7
1815        sub             v6.4s, v6.4s, \c
1816        sub             v7.4s, v7.4s, \d
1817.endif
1818        ld1             {v8.4h},   [x0], x1
1819        ld1             {v8.d}[1], [x0], x1
1820        srshr           v4.4s, v4.4s, #6
1821        ld1             {v9.4h},   [x0], x1
1822        srshr           v5.4s, v5.4s, #6
1823        uaddw           v4.4s, v4.4s, v8.4h
1824        ld1             {v9.d}[1], [x0], x1
1825        srshr           v6.4s, v6.4s, #6
1826        uaddw2          v5.4s, v5.4s, v8.8h
1827        srshr           v7.4s, v7.4s, #6
1828        sub             x0,  x0,  x1, lsl #2
1829        uaddw           v6.4s, v6.4s, v9.4h
1830        sqxtun          v4.4h, v4.4s
1831        uaddw2          v7.4s, v7.4s, v9.8h
1832        sqxtun2         v4.8h, v5.4s
1833        umin            v4.8h, v4.8h, v15.8h
1834        st1             {v4.4h},   [x0], x1
1835        sqxtun          v5.4h, v6.4s
1836        st1             {v4.d}[1], [x0], x1
1837        sqxtun2         v5.8h, v7.4s
1838        umin            v5.8h, v5.8h, v15.8h
1839        st1             {v5.4h},   [x0], x1
1840        st1             {v5.d}[1], [x0], x1
1841.endm
1842        load_acc_store  v31.4s, v30.4s, v29.4s, v28.4s
1843        load_acc_store  v27.4s, v26.4s, v25.4s, v24.4s
1844        load_acc_store  v23.4s, v22.4s, v21.4s, v20.4s
1845        load_acc_store  v19.4s, v18.4s, v17.4s, v16.4s
1846        sub             x2,  x2,  x9
1847        load_acc_store  v16.4s, v17.4s, v18.4s, v19.4s, 1
1848        load_acc_store  v20.4s, v21.4s, v22.4s, v23.4s, 1
1849        load_acc_store  v24.4s, v25.4s, v26.4s, v27.4s, 1
1850        load_acc_store  v28.4s, v29.4s, v30.4s, v31.4s, 1
1851.purgem load_acc_store
1852        ret             x14
1853endfunc
1854.endm
1855
1856idct32_funcs
1857idct32_funcs _quarter
1858idct32_funcs _half
1859
1860const min_eob_idct_idct_32, align=4
1861        .short  0, 9, 34, 70, 135, 240, 336, 448
1862endconst
1863
1864function vp9_idct_idct_32x32_add_16_neon
1865        cmp             w3,  #1
1866        b.eq            idct32x32_dc_add_neon
1867
1868        movrel          x10, idct_coeffs
1869
1870        mov             x15, x30
1871        stp             d8,  d9,  [sp, #-0x10]!
1872        stp             d10, d11, [sp, #-0x10]!
1873        stp             d12, d13, [sp, #-0x10]!
1874        stp             d14, d15, [sp, #-0x10]!
1875
1876        sub             sp,  sp,  #4096
1877
1878        mov             x4,  x0
1879        mov             x5,  x1
1880        mov             x6,  x2
1881
1882        // Double stride of the input, since we only read every other line
1883        mov             x9,  #256
1884        neg             x7,  x9
1885
1886        ld1             {v0.8h,v1.8h},   [x10], #32
1887        sxtl            v2.4s,  v1.4h
1888        sxtl2           v3.4s,  v1.8h
1889        sxtl2           v1.4s,  v0.8h
1890        sxtl            v0.4s,  v0.4h
1891        ld1             {v10.8h,v11.8h}, [x10]
1892        sxtl            v12.4s, v11.4h
1893        sxtl2           v13.4s, v11.8h
1894        sxtl2           v11.4s, v10.8h
1895        sxtl            v10.4s, v10.4h
1896
1897        dup             v15.8h, w13
1898
1899        cmp             w3,  #34
1900        b.le            idct32x32_quarter_add_16_neon
1901        cmp             w3,  #135
1902        b.le            idct32x32_half_add_16_neon
1903
1904        movrel          x12, min_eob_idct_idct_32, 2
1905
1906.irp i, 0, 4, 8, 12, 16, 20, 24, 28
1907        add             x0,  sp,  #(\i*128)
1908.if \i > 0
1909        ldrh            w1,  [x12], #2
1910        cmp             w3,  w1
1911        mov             x1,  #(32 - \i)/4
1912        b.le            1f
1913.endif
1914        add             x2,  x6,  #(\i*4)
1915        bl              idct32_1d_4x32_pass1_neon
1916.endr
1917        b               3f
1918
19191:
1920        // Write zeros to the temp buffer for pass 2
1921        movi            v16.4s,  #0
1922        movi            v17.4s,  #0
1923        movi            v18.4s,  #0
1924        movi            v19.4s,  #0
19252:
1926        subs            x1,  x1,  #1
1927.rept 4
1928        st1             {v16.4s,v17.4s,v18.4s,v19.4s},  [x0], #64
1929        st1             {v16.4s,v17.4s,v18.4s,v19.4s},  [x0], #64
1930.endr
1931        b.ne            2b
19323:
1933.irp i, 0, 4, 8, 12, 16, 20, 24, 28
1934        add             x0,  x4,  #(\i*2)
1935        mov             x1,  x5
1936        add             x2,  sp,  #(\i*4)
1937        bl              idct32_1d_4x32_pass2_neon
1938.endr
1939
1940        add             sp,  sp,  #4096
1941        ldp             d14, d15, [sp], 0x10
1942        ldp             d12, d13, [sp], 0x10
1943        ldp             d10, d11, [sp], 0x10
1944        ldp             d8,  d9,  [sp], 0x10
1945
1946        ret             x15
1947endfunc
1948
1949function ff_vp9_idct_idct_32x32_add_10_neon, export=1
1950        mov             x13, #0x03ff
1951        b               vp9_idct_idct_32x32_add_16_neon
1952endfunc
1953
1954function ff_vp9_idct_idct_32x32_add_12_neon, export=1
1955        mov             x13, #0x0fff
1956        b               vp9_idct_idct_32x32_add_16_neon
1957endfunc
1958
1959.macro idct32_partial size
1960function idct32x32_\size\()_add_16_neon
1961.irp i, 0, 4
1962        add             x0,  sp,  #(\i*128)
1963.ifc \size,quarter
1964.if \i == 4
1965        cmp             w3,  #9
1966        b.le            1f
1967.endif
1968.endif
1969        add             x2,  x6,  #(\i*4)
1970        bl              idct32_1d_4x32_pass1_\size\()_neon
1971.endr
1972
1973.ifc \size,half
1974.irp i, 8, 12
1975        add             x0,  sp,  #(\i*128)
1976.if \i == 12
1977        cmp             w3,  #70
1978        b.le            1f
1979.endif
1980        add             x2,  x6,  #(\i*4)
1981        bl              idct32_1d_4x32_pass1_\size\()_neon
1982.endr
1983.endif
1984        b               3f
1985
19861:
1987        // Write zeros to the temp buffer for pass 2
1988        movi            v16.4s,  #0
1989        movi            v17.4s,  #0
1990        movi            v18.4s,  #0
1991        movi            v19.4s,  #0
1992
1993.rept 4
1994        st1             {v16.4s,v17.4s,v18.4s,v19.4s},  [x0], #64
1995        st1             {v16.4s,v17.4s,v18.4s,v19.4s},  [x0], #64
1996.endr
1997
19983:
1999.irp i, 0, 4, 8, 12, 16, 20, 24, 28
2000        add             x0,  x4,  #(\i*2)
2001        mov             x1,  x5
2002        add             x2,  sp,  #(\i*4)
2003        bl              idct32_1d_4x32_pass2_\size\()_neon
2004.endr
2005
2006        add             sp,  sp,  #4096
2007        ldp             d14, d15, [sp], 0x10
2008        ldp             d12, d13, [sp], 0x10
2009        ldp             d10, d11, [sp], 0x10
2010        ldp             d8,  d9,  [sp], 0x10
2011
2012        ret             x15
2013endfunc
2014.endm
2015
2016idct32_partial quarter
2017idct32_partial half
2018