1/*
2 * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
3 * Copyright (c) 2013 Janne Grunau <janne-libav@jannau.net>
4 *
5 * This file is part of FFmpeg.
6 *
7 * FFmpeg is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU Lesser General Public
9 * License as published by the Free Software Foundation; either
10 * version 2.1 of the License, or (at your option) any later version.
11 *
12 * FFmpeg is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
15 * Lesser General Public License for more details.
16 *
17 * You should have received a copy of the GNU Lesser General Public
18 * License along with FFmpeg; if not, write to the Free Software
19 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
20 */
21
22#include "libavutil/aarch64/asm.S"
23#include "neon.S"
24
// 4x4 inverse transform of the coefficients at x1, rounded, and added to
// the pixels at x0.
//
//   x0  destination: 4 rows of 4 bytes, consecutive rows x2 bytes apart
//   x1  16 signed 16-bit coefficients (32 bytes); overwritten with zeros
//   w2  row stride, sign-extended into x2 below
//
// On return x0 has advanced by 4 * stride and x1 is back at its entry value.
// Writes v0-v7, v16-v19 and v30.
// The .L_ label is the file-local entry used by the dispatch loops in this
// file, which call it through blr.
function ff_h264_idct_add_neon, export=1
.L_ff_h264_idct_add_neon:
        AARCH64_VALID_CALL_TARGET
        ld1             {v0.4h, v1.4h, v2.4h, v3.4h},  [x1]     // r0..r3
        sxtw            x2,     w2
        movi            v30.8h, #0

        // First 1-D pass. The two interleaved 16-byte stores clear the block.
        add             v4.4h,  v0.4h,  v2.4h           // r0 + r2
        sshr            v16.4h, v1.4h,  #1              // r1 >> 1
        st1             {v30.8h},    [x1], #16
        sshr            v17.4h, v3.4h,  #1              // r3 >> 1
        st1             {v30.8h},    [x1], #16
        sub             v5.4h,  v0.4h,  v2.4h           // r0 - r2
        sub             v6.4h,  v16.4h, v3.4h           // (r1 >> 1) - r3
        add             v7.4h,  v1.4h,  v17.4h          // r1 + (r3 >> 1)
        add             v0.4h,  v4.4h,  v7.4h
        add             v1.4h,  v5.4h,  v6.4h
        sub             v2.4h,  v5.4h,  v6.4h
        sub             v3.4h,  v4.4h,  v7.4h

        transpose_4x4H  v0, v1, v2, v3, v4, v5, v6, v7

        // Second 1-D pass, interleaved with loading the four destination rows.
        // The sums end up as 8h vectors: v0 = rows 0|1 and v1 = rows 3|2, so
        // v19 is filled with row 2 in lane 1 and row 3 in lane 0 to match.
        add             v4.4h,  v0.4h,  v2.4h           // r0 + r2
        ld1             {v18.s}[0], [x0], x2            // dst row 0
        sshr            v16.4h,  v3.4h,  #1             // r3 >> 1
        sshr            v17.4h,  v1.4h,  #1             // r1 >> 1
        ld1             {v18.s}[1], [x0], x2            // dst row 1
        sub             v5.4h,  v0.4h,  v2.4h           // r0 - r2
        ld1             {v19.s}[1], [x0], x2            // dst row 2
        add             v6.4h,  v16.4h, v1.4h           // r1 + (r3 >> 1)
        ins             v4.d[1],  v5.d[0]               // v4 = (r0 + r2) | (r0 - r2)
        sub             v7.4h,  v17.4h, v3.4h           // (r1 >> 1) - r3
        ld1             {v19.s}[0], [x0], x2            // dst row 3
        ins             v6.d[1],  v7.d[0]
        sub             x0,  x0,  x2, lsl #2            // rewind to row 0
        add             v0.8h,  v4.8h,  v6.8h           // rows 0 | 1
        sub             v1.8h,  v4.8h,  v6.8h           // rows 3 | 2

        srshr           v0.8h,  v0.8h,  #6              // rounding >> 6
        srshr           v1.8h,  v1.8h,  #6

        uaddw           v0.8h,  v0.8h,  v18.8b          // + widened dst pixels
        uaddw           v1.8h,  v1.8h,  v19.8b

        sqxtun          v0.8b, v0.8h                    // saturate to unsigned 8 bit
        sqxtun          v1.8b, v1.8h

        st1             {v0.s}[0],  [x0], x2            // row 0
        st1             {v0.s}[1],  [x0], x2            // row 1
        st1             {v1.s}[1],  [x0], x2            // row 2
        st1             {v1.s}[0],  [x0], x2            // row 3

        sub             x1,  x1,  #32                   // undo the two post-increments
        ret
endfunc
80
// DC-only 4x4 case: takes the single 16-bit value at [x1], applies a rounding
// shift right by 6, and adds it to every pixel of a 4x4 area.
//
//   x0  destination: 4 rows of 4 bytes, consecutive rows x2 bytes apart
//   x1  points at the 16-bit coefficient; that halfword is set to zero
//   w2  row stride, sign-extended into x2 below
//
// On return x0 has advanced by 4 * stride; x1 is not modified.
// Writes w3 and v0-v4.
// The .L_ label is the file-local entry used by the dispatch loops in this
// file, which call it through blr.
function ff_h264_idct_dc_add_neon, export=1
.L_ff_h264_idct_dc_add_neon:
        AARCH64_VALID_CALL_TARGET
        sxtw            x2,  w2
        mov             w3,       #0
        ld1r            {v2.8h},  [x1]                  // broadcast the value to 8 lanes
        strh            w3,       [x1]                  // clear it in memory
        srshr           v2.8h,  v2.8h,  #6              // rounding >> 6
        ld1             {v0.s}[0],  [x0], x2            // dst row 0
        ld1             {v0.s}[1],  [x0], x2            // dst row 1
        uaddw           v3.8h,  v2.8h,  v0.8b           // rows 0 | 1 + dc
        ld1             {v1.s}[0],  [x0], x2            // dst row 2
        ld1             {v1.s}[1],  [x0], x2            // dst row 3
        uaddw           v4.8h,  v2.8h,  v1.8b           // rows 2 | 3 + dc
        sqxtun          v0.8b,  v3.8h                   // saturate to unsigned 8 bit
        sqxtun          v1.8b,  v4.8h
        sub             x0,  x0,  x2, lsl #2            // rewind to row 0
        st1             {v0.s}[0],  [x0], x2
        st1             {v0.s}[1],  [x0], x2
        st1             {v1.s}[0],  [x0], x2
        st1             {v1.s}[1],  [x0], x2
        ret
endfunc
104
// Runs over 16 consecutive 4x4 blocks and, per block, either skips it or
// calls one of the two 4x4 kernels of this file.
//
//   x0  dest base pointer
//   x1  block_offset: signed 32-bit offsets, one per block, added to dest
//   x2  block: coefficients, 32 bytes per 4x4 block
//   w3  stride (handed to the kernels in w2)
//   x4  byte array of per-block counts, indexed through the scan8 table
//
// Per block, with count = x4[scan8[i]]:
//   count == 0                     -> nothing is called
//   count == 1 and block[0] != 0   -> DC-only kernel
//   otherwise                      -> full 4x4 kernel
//
// State kept across the calls lives in x4-x7, x9, x10 and x12-x14, none of
// which the two kernels write.
function ff_h264_idct_add16_neon, export=1
        mov             x12, x30                // blr overwrites x30; return goes via x12
        movrel          x13, .L_ff_h264_idct_dc_add_neon
        movrel          x14, .L_ff_h264_idct_add_neon
        movrel          x7,  scan8
        mov             x6,  x0                 // dest
        mov             x5,  x1                 // block_offset
        mov             x1,  x2                 // block
        mov             w9,  w3                 // stride
        mov             x10, #16                // blocks left
1:      mov             w2,  w9                 // kernel argument: stride
        ldrb            w3,  [x7], #1           // scan8[i]
        ldrsw           x0,  [x5], #4           // block_offset[i]
        ldrb            w3,  [x4,  w3,  uxtw]   // count for this block
        subs            w3,  w3,  #1            // lt: count == 0, eq: count == 1
        b.lt            2f
        add             x0,  x0,  x6            // kernel argument: dest + offset
        ldrsh           w3,  [x1]               // block[0]
        ccmp            w3,  #0,  #4,  eq       // count == 1: test block[0]; else force Z
        csel            x15, x13, x14, ne       // ne only when count == 1 && block[0] != 0
        blr             x15
2:      subs            x10, x10, #1
        add             x1,  x1,  #32           // next block; the kernels leave x1 as passed
        b.ne            1b
        ret             x12
endfunc
131
// Same walk over 16 consecutive 4x4 blocks as ff_h264_idct_add16_neon, with
// a different per-block decision.
//
//   x0  dest base pointer
//   x1  block_offset: signed 32-bit offsets, one per block, added to dest
//   x2  block: coefficients, 32 bytes per 4x4 block
//   w3  stride (handed to the kernels in w2)
//   x4  byte array of per-block counts, indexed through the scan8 table
//
// Per block, with count = x4[scan8[i]]:
//   count != 0                     -> full 4x4 kernel
//   count == 0 and block[0] != 0   -> DC-only kernel
//   count == 0 and block[0] == 0   -> nothing is called
function ff_h264_idct_add16intra_neon, export=1
        mov             x12, x30                // blr overwrites x30; return goes via x12
        movrel          x13, .L_ff_h264_idct_dc_add_neon
        movrel          x14, .L_ff_h264_idct_add_neon
        movrel          x7,  scan8
        mov             x6,  x0                 // dest
        mov             x5,  x1                 // block_offset
        mov             x1,  x2                 // block
        mov             w9,  w3                 // stride
        mov             x10, #16                // blocks left
1:      mov             w2,  w9                 // kernel argument: stride
        ldrb            w3,  [x7], #1           // scan8[i]
        ldrsw           x0,  [x5], #4           // block_offset[i]
        ldrb            w3,  [x4,  w3,  uxtw]   // count for this block
        cmp             w3,  #0
        add             x0,  x0,  x6            // kernel argument: dest + offset
        ldrsh           w3,  [x1]               // block[0]; flags still from the cmp
        csel            x15, x13, x14, eq       // count == 0 ? DC kernel : full kernel
        ccmp            w3,  #0,  #0,  eq       // count == 0: test block[0]; else clear Z
        b.eq            2f                      // count == 0 && block[0] == 0
        blr             x15
2:      subs            x10, x10, #1
        add             x1,  x1,  #32           // next block; the kernels leave x1 as passed
        b.ne            1b
        ret             x12
endfunc
158
// Handles two groups of four 4x4 blocks: indices 16-19 are added into
// dest[0], indices 32-35 into dest[1].
//
//   x0  pointer to two destination pointers (dest[0], dest[1])
//   x1  block_offset: signed 32-bit offsets, indexed by block number
//   x2  block: coefficients, 32 bytes per 4x4 block
//   w3  stride (handed to the kernels in w2)
//   x4  byte array of per-block counts, indexed through the scan8 table
//
// The three tables are pre-biased by 16 entries, so the loop index x10 runs
// over 0-3 and then 16-19.
//
// Per block, with count = x4[scan8[i]]:
//   count != 0                     -> full 4x4 kernel
//   count == 0 and block[0] != 0   -> DC-only kernel
//   count == 0 and block[0] == 0   -> nothing is called
//
// x19/x20 are callee-saved, so they are spilled to a 0x40-byte stack area
// and restored before returning.
function ff_h264_idct_add8_neon, export=1
        stp             x19, x20, [sp, #-0x40]!
        mov             x12, x30                // blr overwrites x30; return goes via x12
        movrel          x13, .L_ff_h264_idct_dc_add_neon
        movrel          x14, .L_ff_h264_idct_add_neon
        movrel          x7,  scan8, 16
        ldp             x6,  x15, [x0]          // dest[0], dest[1]
        add             x5,  x1,  #16*4         // block_offset
        add             x9,  x2,  #16*32        // block
        mov             w19, w3                 // stride
        mov             x10, #0                 // biased block index
        mov             x11, #16                // value x10 jumps to after the first group
1:      mov             w2,  w19                // kernel argument: stride
        ldrb            w3,  [x7, x10]          // scan8[i]
        ldrsw           x0,  [x5, x10, lsl #2]  // block_offset[i]
        ldrb            w3,  [x4, w3,  uxtw]    // nnzc[ scan8[i] ]
        add             x1,  x9,  x10, lsl #5   // block + i * 16
        add             x0,  x0,  x6            // block_offset[i] + dst[j-1]
        cmp             w3,  #0
        ldrsh           w3,  [x1]               // block[i*16]; flags still from the cmp
        csel            x20, x13, x14, eq       // count == 0 ? DC kernel : full kernel
        ccmp            w3,  #0,  #0,  eq       // count == 0: test block[0]; else clear Z
        b.eq            2f                      // count == 0 && block[0] == 0
        blr             x20
2:      add             x10, x10, #1
        cmp             x10, #4
        csel            x10, x11, x10, eq       // mov x10, #16
        csel            x6,  x15, x6,  eq       // and switch to dest[1]
        cmp             x10, #20
        b.lt            1b
        ldp             x19, x20, [sp], #0x40
        ret             x12
endfunc
194
// One 1-D pass of the 8x8 transform over eight 8h vectors.
//
//   pass 0: lines 0-5 are expected in v24-v29. Lines 6 and 7 are loaded here
//           from [x1] into v30/v31, and those 32 bytes are then overwritten
//           with v19, which the caller must have zeroed. x1 advances by 32.
//           Results are left in v24-v29, v18 (line 6) and v31.
//   pass 1: lines 0-5 in v24-v29, line 6 in v18, line 7 in v31.
//           Results are left in v24-v31 in order.
//
// va/vb name the two registers whose roles differ between the passes.
// v16, v17 and v19 are scratch.
.macro  idct8x8_cols    pass
  .if \pass == 0
        va      .req    v18
        vb      .req    v30
        sshr            v18.8h, v26.8h, #1              // l2 >> 1
        add             v16.8h, v24.8h, v28.8h          // l0 + l4
        ld1             {v30.8h, v31.8h}, [x1]          // l6, l7
        st1             {v19.8h}, [x1],  #16            // v19 is still zero here
        st1             {v19.8h}, [x1],  #16
        sub             v17.8h,  v24.8h, v28.8h         // l0 - l4
        sshr            v19.8h,  v30.8h, #1             // l6 >> 1
        sub             v18.8h,  v18.8h,  v30.8h        // (l2 >> 1) - l6
        add             v19.8h,  v19.8h,  v26.8h        // (l6 >> 1) + l2
  .else
        va      .req    v30
        vb      .req    v18
        sshr            v30.8h, v26.8h, #1              // l2 >> 1
        sshr            v19.8h, v18.8h, #1              // l6 >> 1
        add             v16.8h, v24.8h, v28.8h          // l0 + l4
        sub             v17.8h, v24.8h, v28.8h          // l0 - l4
        sub             v30.8h, v30.8h, v18.8h          // (l2 >> 1) - l6
        add             v19.8h, v19.8h, v26.8h          // (l6 >> 1) + l2
  .endif
        // combine the even-line terms
        add             v26.8h, v17.8h, va.8h
        sub             v28.8h, v17.8h, va.8h
        add             v24.8h, v16.8h, v19.8h
        sub             vb.8h,  v16.8h, v19.8h
        // odd-line terms (l1, l3, l5, l7 in v25, v27, v29, v31)
        sub             v16.8h, v29.8h, v27.8h
        add             v17.8h, v31.8h, v25.8h
        sub             va.8h,  v31.8h, v25.8h
        add             v19.8h, v29.8h, v27.8h
        sub             v16.8h, v16.8h, v31.8h
        sub             v17.8h, v17.8h, v27.8h
        add             va.8h,  va.8h,  v29.8h
        add             v19.8h, v19.8h, v25.8h
        sshr            v25.8h, v25.8h, #1
        sshr            v27.8h, v27.8h, #1
        sshr            v29.8h, v29.8h, #1
        sshr            v31.8h, v31.8h, #1
        sub             v16.8h, v16.8h, v31.8h
        sub             v17.8h, v17.8h, v27.8h
        add             va.8h,  va.8h,  v29.8h
        add             v19.8h, v19.8h, v25.8h
        sshr            v25.8h, v16.8h, #2
        sshr            v27.8h, v17.8h, #2
        sshr            v29.8h, va.8h,  #2
        sshr            v31.8h, v19.8h, #2
        sub             v19.8h, v19.8h, v25.8h
        sub             va.8h,  v27.8h, va.8h
        add             v17.8h, v17.8h, v29.8h
        add             v16.8h, v16.8h, v31.8h
        // final butterflies; only the home of output line 6 differs
  .if \pass == 0
        sub             v31.8h, v24.8h, v19.8h
        add             v24.8h, v24.8h, v19.8h
        add             v25.8h, v26.8h, v18.8h
        sub             v18.8h, v26.8h, v18.8h          // line 6 -> v18
        add             v26.8h, v28.8h, v17.8h
        add             v27.8h, v30.8h, v16.8h
        sub             v29.8h, v28.8h, v17.8h
        sub             v28.8h, v30.8h, v16.8h
  .else
        sub             v31.8h, v24.8h, v19.8h
        add             v24.8h, v24.8h, v19.8h
        add             v25.8h, v26.8h, v30.8h
        sub             v30.8h, v26.8h, v30.8h          // line 6 -> v30
        add             v26.8h, v28.8h, v17.8h
        sub             v29.8h, v28.8h, v17.8h
        add             v27.8h, v18.8h, v16.8h
        sub             v28.8h, v18.8h, v16.8h
  .endif
        .unreq          va
        .unreq          vb
.endm
268
// 8x8 inverse transform of the coefficients at x1, rounded, and added to the
// pixels at x0.
//
//   x0  destination: 8 rows of 8 bytes, consecutive rows x2 bytes apart
//   x1  64 signed 16-bit coefficients (128 bytes); overwritten with zeros
//   w2  row stride, sign-extended into x2 below
//
// On return x0 and x3 have advanced by 8 * stride and x1 is back at its
// entry value.
// Writes v0-v7, v16-v19 and v24-v31.
// The .L_ label is the file-local entry used by ff_h264_idct8_add4_neon,
// which calls it through blr.
function ff_h264_idct8_add_neon, export=1
.L_ff_h264_idct8_add_neon:
        AARCH64_VALID_CALL_TARGET
        movi            v19.8h,   #0
        sxtw            x2,       w2
        // Load lines 0-5 and clear them behind the loads. Lines 6 and 7 are
        // loaded and cleared by pass 0 of idct8x8_cols.
        ld1             {v24.8h, v25.8h}, [x1]
        st1             {v19.8h},  [x1],   #16
        st1             {v19.8h},  [x1],   #16
        ld1             {v26.8h, v27.8h}, [x1]
        st1             {v19.8h},  [x1],   #16
        st1             {v19.8h},  [x1],   #16
        ld1             {v28.8h, v29.8h}, [x1]
        st1             {v19.8h},  [x1],   #16
        st1             {v19.8h},  [x1],   #16

        idct8x8_cols    0
        // pass 0 leaves line 6 in v18, hence v18 in the seventh slot
        transpose_8x8H  v24, v25, v26, v27, v28, v29, v18, v31, v6, v7
        idct8x8_cols    1

        // Rounding shift of the eight result rows, interleaved with loading
        // the eight destination rows. x0 is the load cursor, x3 the store
        // cursor.
        mov             x3,  x0
        srshr           v24.8h, v24.8h, #6
        ld1             {v0.8b},     [x0], x2
        srshr           v25.8h, v25.8h, #6
        ld1             {v1.8b},     [x0], x2
        srshr           v26.8h, v26.8h, #6
        ld1             {v2.8b},     [x0], x2
        srshr           v27.8h, v27.8h, #6
        ld1             {v3.8b},     [x0], x2
        srshr           v28.8h, v28.8h, #6
        ld1             {v4.8b},     [x0], x2
        srshr           v29.8h, v29.8h, #6
        ld1             {v5.8b},     [x0], x2
        srshr           v30.8h, v30.8h, #6
        ld1             {v6.8b},     [x0], x2
        srshr           v31.8h, v31.8h, #6
        ld1             {v7.8b},     [x0], x2
        // Add the widened pixels, saturate to unsigned 8 bit, store.
        uaddw           v24.8h, v24.8h, v0.8b
        uaddw           v25.8h, v25.8h, v1.8b
        uaddw           v26.8h, v26.8h, v2.8b
        sqxtun          v0.8b,  v24.8h
        uaddw           v27.8h, v27.8h, v3.8b
        sqxtun          v1.8b,  v25.8h
        uaddw           v28.8h, v28.8h, v4.8b
        sqxtun          v2.8b,  v26.8h
        st1             {v0.8b},     [x3], x2
        uaddw           v29.8h, v29.8h, v5.8b
        sqxtun          v3.8b,  v27.8h
        st1             {v1.8b},     [x3], x2
        uaddw           v30.8h, v30.8h, v6.8b
        sqxtun          v4.8b,  v28.8h
        st1             {v2.8b},     [x3], x2
        uaddw           v31.8h, v31.8h, v7.8b
        sqxtun          v5.8b,  v29.8h
        st1             {v3.8b},     [x3], x2
        sqxtun          v6.8b,  v30.8h
        sqxtun          v7.8b,  v31.8h
        st1             {v4.8b},     [x3], x2
        st1             {v5.8b},     [x3], x2
        st1             {v6.8b},     [x3], x2
        st1             {v7.8b},     [x3], x2

        sub             x1,  x1,  #128                  // undo the eight 16-byte post-increments
        ret
endfunc
333
// DC-only 8x8 case: takes the single 16-bit value at [x1], applies a rounding
// shift right by 6, and adds it to every pixel of an 8x8 area.
//
//   x0  destination: 8 rows of 8 bytes, consecutive rows x2 bytes apart
//   x1  points at the 16-bit coefficient; that halfword is set to zero
//   w2  row stride, sign-extended into x2 below
//
// On return x0 has advanced by 8 * stride; x1 is not modified.
// Writes w3, v0-v7 and v24-v31.
// The .L_ label is the file-local entry used by ff_h264_idct8_add4_neon,
// which calls it through blr.
function ff_h264_idct8_dc_add_neon, export=1
.L_ff_h264_idct8_dc_add_neon:
        AARCH64_VALID_CALL_TARGET
        mov             w3,       #0
        sxtw            x2,       w2
        ld1r            {v31.8h}, [x1]                  // broadcast the value to 8 lanes
        strh            w3,       [x1]                  // clear it in memory
        ld1             {v0.8b},  [x0], x2
        srshr           v31.8h, v31.8h, #6              // rounding >> 6
        ld1             {v1.8b},     [x0], x2
        ld1             {v2.8b},     [x0], x2
        uaddw           v24.8h, v31.8h, v0.8b           // row n + dc, widened to 16 bit
        ld1             {v3.8b},     [x0], x2
        uaddw           v25.8h, v31.8h, v1.8b
        ld1             {v4.8b},     [x0], x2
        uaddw           v26.8h, v31.8h, v2.8b
        ld1             {v5.8b},     [x0], x2
        uaddw           v27.8h, v31.8h, v3.8b
        ld1             {v6.8b},     [x0], x2
        uaddw           v28.8h, v31.8h, v4.8b
        ld1             {v7.8b},     [x0], x2
        uaddw           v29.8h, v31.8h, v5.8b
        uaddw           v30.8h, v31.8h, v6.8b
        uaddw           v31.8h, v31.8h, v7.8b           // last use of the dc vector
        sqxtun          v0.8b,  v24.8h                  // saturate to unsigned 8 bit
        sqxtun          v1.8b,  v25.8h
        sqxtun          v2.8b,  v26.8h
        sqxtun          v3.8b,  v27.8h
        sub             x0,  x0,  x2, lsl #3            // rewind to row 0
        st1             {v0.8b},     [x0], x2
        sqxtun          v4.8b,  v28.8h
        st1             {v1.8b},     [x0], x2
        sqxtun          v5.8b,  v29.8h
        st1             {v2.8b},     [x0], x2
        sqxtun          v6.8b,  v30.8h
        st1             {v3.8b},     [x0], x2
        sqxtun          v7.8b,  v31.8h
        st1             {v4.8b},     [x0], x2
        st1             {v5.8b},     [x0], x2
        st1             {v6.8b},     [x0], x2
        st1             {v7.8b},     [x0], x2
        ret
endfunc
377
// Runs over four 8x8 blocks and, per block, either skips it or calls one of
// the two 8x8 kernels of this file. Each step advances four entries in scan8
// and in block_offset, and 128 bytes of coefficients.
//
//   x0  dest base pointer
//   x1  block_offset: signed 32-bit offsets; every fourth entry is used
//   x2  block: coefficients, 128 bytes per 8x8 block
//   w3  stride (handed to the kernels in w2)
//   x4  byte array of per-block counts, indexed through the scan8 table
//
// Per block, with count = x4[scan8[i]]:
//   count == 0                     -> nothing is called
//   count == 1 and block[0] != 0   -> DC-only kernel
//   otherwise                      -> full 8x8 kernel
function ff_h264_idct8_add4_neon, export=1
        mov             x12, x30                // blr overwrites x30; return goes via x12
        movrel          x13, .L_ff_h264_idct8_dc_add_neon
        movrel          x14, .L_ff_h264_idct8_add_neon
        movrel          x7,  scan8
        mov             x6,  x0                 // dest
        mov             x5,  x1                 // block_offset
        mov             x1,  x2                 // block
        mov             w2,  w3                 // stride; set once, the kernels only sign-extend it
        mov             w10, #16                // counts down in steps of 4
1:      ldrb            w9,  [x7], #4           // scan8[i]
        ldrsw           x0,  [x5], #16          // block_offset[i]
        ldrb            w9,  [x4, w9, uxtw]     // count for this block
        subs            w9,  w9,  #1            // lt: count == 0, eq: count == 1
        b.lt            2f
        add             x0,  x6,  x0            // kernel argument: dest + offset
        ldrsh           w11, [x1]               // block[0]
        ccmp            w11, #0,  #4,  eq       // count == 1: test block[0]; else force Z
        csel            x15, x13, x14, ne       // ne only when count == 1 && block[0] != 0
        blr             x15
2:      subs            w10, w10, #4
        add             x1,  x1,  #128          // next block; the kernels leave x1 as passed
        b.ne            1b
        ret             x12
endfunc
403
// Block index -> offset into the count array that the dispatch functions in
// this file receive in x4. Every entry is written as column + row * 8.
// The 4x4 dispatch loops walk entries 0-15 and the 8x8 one takes every fourth
// of them; ff_h264_idct_add8_neon reads entries 16-19 and 32-35.
const   scan8
        // entries  0-15 (rows 1-4)
        .byte           4 +  1*8, 5 +  1*8, 4 +  2*8, 5 +  2*8
        .byte           6 +  1*8, 7 +  1*8, 6 +  2*8, 7 +  2*8
        .byte           4 +  3*8, 5 +  3*8, 4 +  4*8, 5 +  4*8
        .byte           6 +  3*8, 7 +  3*8, 6 +  4*8, 7 +  4*8
        // entries 16-31 (rows 6-9)
        .byte           4 +  6*8, 5 +  6*8, 4 +  7*8, 5 +  7*8
        .byte           6 +  6*8, 7 +  6*8, 6 +  7*8, 7 +  7*8
        .byte           4 +  8*8, 5 +  8*8, 4 +  9*8, 5 +  9*8
        .byte           6 +  8*8, 7 +  8*8, 6 +  9*8, 7 +  9*8
        // entries 32-47 (rows 11-14)
        .byte           4 + 11*8, 5 + 11*8, 4 + 12*8, 5 + 12*8
        .byte           6 + 11*8, 7 + 11*8, 6 + 12*8, 7 + 12*8
        .byte           4 + 13*8, 5 + 13*8, 4 + 14*8, 5 + 14*8
        .byte           6 + 13*8, 7 + 13*8, 6 + 14*8, 7 + 14*8
endconst
418