/*
 * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
 * Copyright (c) 2013 Janne Grunau <janne-libav@jannau.net>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/aarch64/asm.S"
#include "neon.S"

// 4x4 inverse transform of one coefficient block, added onto the destination.
//
// In:   x0 = destination pixels, 4 rows of 4 bytes, rows x2 bytes apart
//       x1 = 16 int16 coefficients (32 bytes); overwritten with zeros
//       w2 = stride (signed 32 bit, widened into x2 below)
// Out:  x0 advanced by 4 * stride, x1 back at its entry value
// Clobbers: v0-v7, v16-v19, v30 (v4-v7 are also handed to transpose_4x4H
//           as scratch)
//
// The .L_ alias is the address the dispatch loops in this file load with
// movrel and call through blr.
// NOTE(review): AARCH64_VALID_CALL_TARGET comes from asm.S, presumably the
// indirect-call landing pad for that alias; confirm there.
function ff_h264_idct_add_neon, export=1
.L_ff_h264_idct_add_neon:
        AARCH64_VALID_CALL_TARGET
        ld1             {v0.4H, v1.4H, v2.4H, v3.4H},  [x1]    // v0-v3 = the four 4-coefficient vectors
        sxtw            x2,     w2                              // stride is used as a 64-bit post-increment
        movi            v30.8H, #0                              // zero vector used to clear the block

        // First 1-D pass; the two block-clearing stores are interleaved.
        add             v4.4H,  v0.4H,  v2.4H                   // z0 = in0 + in2
        sshr            v16.4H, v1.4H,  #1
        st1             {v30.8H},    [x1], #16                  // clear bytes 0-15 of the block
        sshr            v17.4H, v3.4H,  #1
        st1             {v30.8H},    [x1], #16                  // clear bytes 16-31 of the block
        sub             v5.4H,  v0.4H,  v2.4H                   // z1 = in0 - in2
        sub             v6.4H,  v16.4H, v3.4H                   // z2 = (in1 >> 1) - in3
        add             v7.4H,  v1.4H,  v17.4H                  // z3 = in1 + (in3 >> 1)
        add             v0.4H,  v4.4H,  v7.4H                   // z0 + z3
        add             v1.4H,  v5.4H,  v6.4H                   // z1 + z2
        sub             v2.4H,  v5.4H,  v6.4H                   // z1 - z2
        sub             v3.4H,  v4.4H,  v7.4H                   // z0 - z3

        transpose_4x4H  v0, v1, v2, v3, v4, v5, v6, v7

        // Second 1-D pass, interleaved with the destination loads.
        // The halves are packed so that two 8-lane ops finish the butterfly:
        //   v4 = [z0 | z1], v6 = [z3 | z2]
        //   v0 = v4 + v6 = [out0 | out1], v1 = v4 - v6 = [out3 | out2]
        // v1 holds its rows in swapped order, so v19 is loaded and stored
        // with lane 1 before lane 0.
        add             v4.4H,  v0.4H,  v2.4H                   // z0
        ld1             {v18.S}[0], [x0], x2                    // dst row 0
        sshr            v16.4H,  v3.4H,  #1
        sshr            v17.4H,  v1.4H,  #1
        ld1             {v18.S}[1], [x0], x2                    // dst row 1
        sub             v5.4H,  v0.4H,  v2.4H                   // z1
        ld1             {v19.S}[1], [x0], x2                    // dst row 2 -> upper lane
        add             v6.4H,  v16.4H, v1.4H                   // z3
        ins             v4.D[1],  v5.D[0]
        sub             v7.4H,  v17.4H, v3.4H                   // z2
        ld1             {v19.S}[0], [x0], x2                    // dst row 3 -> lower lane
        ins             v6.D[1],  v7.D[0]
        sub             x0,  x0,  x2, lsl #2                    // rewind dst to row 0 for the stores
        add             v0.8H,  v4.8H,  v6.8H
        sub             v1.8H,  v4.8H,  v6.8H

        srshr           v0.8H,  v0.8H,  #6                      // rounding shift right by 6
        srshr           v1.8H,  v1.8H,  #6

        uaddw           v0.8H,  v0.8H,  v18.8B                  // add the zero-extended dst pixels
        uaddw           v1.8H,  v1.8H,  v19.8B

        sqxtun          v0.8B, v0.8H                            // saturate to unsigned 8 bit
        sqxtun          v1.8B, v1.8H

        st1             {v0.S}[0],  [x0], x2                    // row 0
        st1             {v0.S}[1],  [x0], x2                    // row 1
        st1             {v1.S}[1],  [x0], x2                    // row 2
        st1             {v1.S}[0],  [x0], x2                    // row 3

        sub             x1,  x1,  #32                           // undo the two post-increments on the block pointer
        ret
endfunc

// DC-only variant for a 4x4 block: one rounded value is added to every pixel.
//
// In:   x0 = destination pixels, 4 rows of 4 bytes, rows x2 bytes apart
//       x1 = coefficient block; only the first int16 is read, then zeroed
//       w2 = stride (signed 32 bit, widened into x2 below)
// Out:  x0 advanced by 4 * stride, x1 unchanged
// Clobbers: w3, v0-v4
//
// The .L_ alias is the address the dispatch loops in this file load with
// movrel and call through blr.
function ff_h264_idct_dc_add_neon, export=1
.L_ff_h264_idct_dc_add_neon:
        AARCH64_VALID_CALL_TARGET
        sxtw            x2,  w2
        mov             w3,       #0
        ld1r            {v2.8H},  [x1]                          // replicate block[0] into all 8 lanes
        strh            w3,       [x1]                          // block[0] = 0
        srshr           v2.8H,  v2.8H,  #6                      // rounding shift right by 6
        ld1             {v0.S}[0],  [x0], x2                    // dst row 0
        ld1             {v0.S}[1],  [x0], x2                    // dst row 1
        uaddw           v3.8H,  v2.8H,  v0.8B                   // rows 0-1 + dc
        ld1             {v1.S}[0],  [x0], x2                    // dst row 2
        ld1             {v1.S}[1],  [x0], x2                    // dst row 3
        uaddw           v4.8H,  v2.8H,  v1.8B                   // rows 2-3 + dc
        sqxtun          v0.8B,  v3.8H                           // saturate to unsigned 8 bit
        sqxtun          v1.8B,  v4.8H
        sub             x0,  x0,  x2, lsl #2                    // rewind dst to row 0 for the stores
        st1             {v0.S}[0],  [x0], x2
        st1             {v0.S}[1],  [x0], x2
        st1             {v1.S}[0],  [x0], x2
        st1             {v1.S}[1],  [x0], x2
        ret
endfunc

// Walk 16 consecutive 4x4 blocks and add the residual of each one.
//
// In:   x0 = destination base
//       x1 = block_offset, one int32 per block (added to the base)
//       x2 = coefficients, 32 bytes per block
//       w3 = stride
//       x4 = byte table indexed by scan8[i]; the byte acts as a per-block
//            count (0 means skip the block)
// Per block:
//   count == 0                    -> skip
//   count == 1 and block[0] != 0  -> DC-only add
//   anything else                 -> full 4x4 transform add
//
// blr overwrites x30, so the return address is parked in x12. x5-x7, x9,
// x10 and x12-x14 stay live across the calls; the instructions visible in
// both callees only write x0-x3 and vector registers, and leave x1 at its
// entry value.
function ff_h264_idct_add16_neon, export=1
        mov             x12, x30
        mov             x6,  x0         // dest
        mov             x5,  x1         // block_offset
        mov             x1,  x2         // block
        mov             w9,  w3         // stride
        movrel          x7,  scan8
        mov             x10, #16        // blocks left
        movrel          x13, .L_ff_h264_idct_dc_add_neon
        movrel          x14, .L_ff_h264_idct_add_neon
1:      mov             w2,  w9         // callee argument: stride
        ldrb            w3,  [x7], #1   // scan8[i]
        ldrsw           x0,  [x5], #4   // block_offset[i]
        ldrb            w3,  [x4,  w3,  uxtw]   // count for this block
        subs            w3,  w3,  #1    // Z set when count == 1
        b.lt            2f              // count == 0: nothing to add
        ldrsh           w3,  [x1]       // block[0]
        add             x0,  x0,  x6    // callee argument: dest + block_offset[i]
        ccmp            w3,  #0,  #4,  eq       // count == 1: test block[0]; otherwise force Z
        csel            x15, x13, x14, ne       // ne only when count == 1 and block[0] != 0
        blr             x15
2:      subs            x10, x10, #1
        add             x1,  x1,  #32   // next block of 16 coefficients
        b.ne            1b
        ret             x12
endfunc

// Same walk over 16 4x4 blocks as the add16 routine in this file, with a
// different rule for picking the kernel.
//
// In:   x0 = destination base
//       x1 = block_offset, one int32 per block (added to the base)
//       x2 = coefficients, 32 bytes per block
//       w3 = stride
//       x4 = byte table indexed by scan8[i] (per-block count)
// Per block:
//   count != 0                    -> full 4x4 transform add
//   count == 0 and block[0] != 0  -> DC-only add
//   count == 0 and block[0] == 0  -> skip
//
// blr overwrites x30, so the return address is parked in x12. x5-x7, x9,
// x10 and x12-x14 stay live across the calls; the instructions visible in
// both callees only write x0-x3 and vector registers.
function ff_h264_idct_add16intra_neon, export=1
        mov             x12, x30
        mov             x6,  x0         // dest
        mov             x5,  x1         // block_offset
        mov             x1,  x2         // block
        mov             w9,  w3         // stride
        movrel          x7,  scan8
        mov             x10, #16        // blocks left
        movrel          x13, .L_ff_h264_idct_dc_add_neon
        movrel          x14, .L_ff_h264_idct_add_neon
1:      mov             w2,  w9         // callee argument: stride
        ldrb            w3,  [x7], #1   // scan8[i]
        ldrsw           x0,  [x5], #4   // block_offset[i]
        ldrb            w3,  [x4,  w3,  uxtw]   // count for this block
        add             x0,  x0,  x6    // callee argument: dest + block_offset[i]
        cmp             w3,  #0
        ldrsh           w3,  [x1]       // block[0]; loads leave the flags alone
        csel            x15, x13, x14, eq       // count == 0: DC kernel, else full kernel
        ccmp            w3,  #0,  #0,  eq       // count == 0: test block[0]; otherwise clear Z
        b.eq            2f              // count == 0 and block[0] == 0
        blr             x15
2:      subs            x10, x10, #1
        add             x1,  x1,  #32   // next block of 16 coefficients
        b.ne            1b
        ret             x12
endfunc

// Add the residual of two groups of four 4x4 blocks, one group per
// destination pointer.
//
// In:   x0 = pointer to two destination pointers (dest[0], dest[1])
//       x1 = block_offset (int32 entries); used from entry 16 onward
//       x2 = coefficients; used from block 16 onward (32 bytes per block)
//       w3 = stride
//       x4 = byte table indexed through scan8 (nnzc)
// The index in x10 takes the values 0..3 with dest[0], then 16..19 with
// dest[1]. Every table base is pre-biased by 16 entries, so the entries
// actually touched are 16..19 and 32..35.
// Per block:
//   count != 0                    -> full 4x4 transform add
//   count == 0 and block[0] != 0  -> DC-only add
//   count == 0 and block[0] == 0  -> skip
//
// x19/x20 are callee-saved and therefore spilled; the frame is 0x40 bytes
// although only its low 16 bytes are written. The return address is
// parked in x12 because blr overwrites x30.
// NOTE(review): the two destinations are presumably the chroma planes;
// confirm against the C caller.
function ff_h264_idct_add8_neon, export=1
        sub             sp,  sp, #0x40
        stp             x19, x20, [sp]
        mov             x12, x30
        ldp             x6,  x15, [x0]          // dest[0], dest[1]
        add             x5,  x1,  #16*4         // &block_offset[16]
        add             x9,  x2,  #16*32        // block + 16 blocks
        mov             w19, w3                 // stride
        movrel          x13, .L_ff_h264_idct_dc_add_neon
        movrel          x14, .L_ff_h264_idct_add_neon
        movrel          x7,  scan8, 16          // &scan8[16]
        mov             x10, #0                 // i
        mov             x11, #16                // value i jumps to after the first group
1:      mov             w2,  w19                // callee argument: stride
        ldrb            w3,  [x7, x10]          // scan8[16 + i]
        ldrsw           x0,  [x5, x10, lsl #2]  // block_offset[16 + i]
        ldrb            w3,  [x4, w3,  uxtw]    // nnzc[ scan8[16 + i] ]
        add             x0,  x0,  x6            // callee argument: current dest + offset
        add             x1,  x9,  x10, lsl #5   // callee argument: coefficients of block 16 + i
        cmp             w3,  #0
        ldrsh           w3,  [x1]               // block[0]; loads leave the flags alone
        csel            x20, x13, x14, eq       // count == 0: DC kernel, else full kernel
        ccmp            w3,  #0,  #0,  eq       // count == 0: test block[0]; otherwise clear Z
        b.eq            2f                      // count == 0 and block[0] == 0
        blr             x20
2:      add             x10, x10, #1
        cmp             x10, #4
        csel            x10, x11, x10, eq       // after the first four blocks, i = 16
        csel            x6,  x15, x6,  eq       // ...and switch to dest[1]
        cmp             x10, #20
        b.lt            1b
        ldp             x19, x20, [sp]
        add             sp,  sp,  #0x40
        ret             x12
endfunc

// One 1-D pass of the 8x8 inverse transform over eight 8-lane vectors.
//
// pass 0: inputs 0-5 are in v24-v29; inputs 6 and 7 are loaded from [x1]
//         into v30/v31 and those 32 bytes are then overwritten with v19,
//         so v19 must be zero on entry. x1 advances by 32.
//         Results: v24-v29, v18, v31 (result 6 lands in v18).
// pass 1: inputs are in v24-v29, v18, v31 (the pass 0 result layout).
//         Results: v24-v31.
// Scratch in both passes: v16, v17, v19, plus whichever of v18/v30 is not
// carrying a result. va/vb are assembler aliases that let the shared
// middle section work for either register assignment.
.macro  idct8x8_cols    pass
  .if \pass == 0
        va      .req    v18
        vb      .req    v30
        sshr            v18.8H, v26.8H, #1
        add             v16.8H, v24.8H, v28.8H          // in0 + in4
        ld1             {v30.8H, v31.8H}, [x1]          // in6, in7
        st1             {v19.8H}, [x1],  #16            // clear them in memory
        st1             {v19.8H}, [x1],  #16
        sub             v17.8H,  v24.8H, v28.8H         // in0 - in4
        sshr            v19.8H,  v30.8H, #1             // v19 becomes scratch from here on
        sub             v18.8H,  v18.8H,  v30.8H        // (in2 >> 1) - in6
        add             v19.8H,  v19.8H,  v26.8H        // (in6 >> 1) + in2
  .else
        va      .req    v30
        vb      .req    v18
        sshr            v30.8H, v26.8H, #1
        sshr            v19.8H, v18.8H, #1
        add             v16.8H, v24.8H, v28.8H          // in0 + in4
        sub             v17.8H, v24.8H, v28.8H          // in0 - in4
        sub             v30.8H, v30.8H, v18.8H          // (in2 >> 1) - in6
        add             v19.8H, v19.8H, v26.8H          // (in6 >> 1) + in2
  .endif
        // Even half: combine the four terms above.
        add             v26.8H, v17.8H, va.8H
        sub             v28.8H, v17.8H, va.8H
        add             v24.8H, v16.8H, v19.8H
        sub             vb.8H,  v16.8H, v19.8H
        // Odd half: works on in1, in3, in5, in7 (v25, v27, v29, v31).
        sub             v16.8H, v29.8H, v27.8H
        add             v17.8H, v31.8H, v25.8H
        sub             va.8H,  v31.8H, v25.8H
        add             v19.8H, v29.8H, v27.8H
        sub             v16.8H, v16.8H, v31.8H
        sub             v17.8H, v17.8H, v27.8H
        add             va.8H,  va.8H,  v29.8H
        add             v19.8H, v19.8H, v25.8H
        sshr            v25.8H, v25.8H, #1
        sshr            v27.8H, v27.8H, #1
        sshr            v29.8H, v29.8H, #1
        sshr            v31.8H, v31.8H, #1
        sub             v16.8H, v16.8H, v31.8H
        sub             v17.8H, v17.8H, v27.8H
        add             va.8H,  va.8H,  v29.8H
        add             v19.8H, v19.8H, v25.8H
        sshr            v25.8H, v16.8H, #2
        sshr            v27.8H, v17.8H, #2
        sshr            v29.8H, va.8H,  #2
        sshr            v31.8H, v19.8H, #2
        sub             v19.8H, v19.8H, v25.8H
        sub             va.8H,  v27.8H, va.8H
        add             v17.8H, v17.8H, v29.8H
        add             v16.8H, v16.8H, v31.8H
        // Final butterflies between the even and odd halves. The two
        // variants differ only in which of v18/v30 holds which value.
  .if \pass == 0
        sub             v31.8H, v24.8H, v19.8H
        add             v24.8H, v24.8H, v19.8H
        add             v25.8H, v26.8H, v18.8H
        sub             v18.8H, v26.8H, v18.8H          // result 6 stays in v18
        add             v26.8H, v28.8H, v17.8H
        add             v27.8H, v30.8H, v16.8H
        sub             v29.8H, v28.8H, v17.8H
        sub             v28.8H, v30.8H, v16.8H
  .else
        sub             v31.8H, v24.8H, v19.8H
        add             v24.8H, v24.8H, v19.8H
        add             v25.8H, v26.8H, v30.8H
        sub             v30.8H, v26.8H, v30.8H          // result 6 ends up in v30
        add             v26.8H, v28.8H, v17.8H
        sub             v29.8H, v28.8H, v17.8H
        add             v27.8H, v18.8H, v16.8H
        sub             v28.8H, v18.8H, v16.8H
  .endif
        .unreq          va
        .unreq          vb
.endm

// 8x8 inverse transform of one coefficient block, added onto the destination.
//
// In:   x0 = destination pixels, 8 rows of 8 bytes, rows x2 bytes apart
//       x1 = 64 int16 coefficients (128 bytes); overwritten with zeros
//       w2 = stride (signed 32 bit, widened into x2 below)
// Out:  x0 and x3 advanced by 8 * stride, x1 back at its entry value
// Clobbers: x3, v0-v7, v16-v19, v24-v31 (v6/v7 are also handed to
//           transpose_8x8H as scratch)
//
// The .L_ alias is the address ff_h264_idct8_add4_neon loads with movrel
// and calls through blr.
function ff_h264_idct8_add_neon, export=1
.L_ff_h264_idct8_add_neon:
        AARCH64_VALID_CALL_TARGET
        movi            v19.8H,   #0                    // zero vector; idct8x8_cols 0 also stores it
        sxtw            x2,       w2
        // Load coefficient vectors 0-5 and clear them behind the loads.
        // Vectors 6 and 7 are loaded and cleared inside idct8x8_cols 0.
        ld1             {v24.8H, v25.8H}, [x1]
        st1             {v19.8H},  [x1],   #16
        st1             {v19.8H},  [x1],   #16
        ld1             {v26.8H, v27.8H}, [x1]
        st1             {v19.8H},  [x1],   #16
        st1             {v19.8H},  [x1],   #16
        ld1             {v28.8H, v29.8H}, [x1]
        st1             {v19.8H},  [x1],   #16
        st1             {v19.8H},  [x1],   #16

        idct8x8_cols    0
        // Pass 0 leaves result 6 in v18, hence v18 in the seventh slot.
        transpose_8x8H  v24, v25, v26, v27, v28, v29, v18, v31, v6, v7
        idct8x8_cols    1

        // x0 walks the destination for the loads, x3 for the stores.
        mov             x3,  x0
        srshr           v24.8H, v24.8H, #6              // rounding shift right by 6
        ld1             {v0.8B},     [x0], x2
        srshr           v25.8H, v25.8H, #6
        ld1             {v1.8B},     [x0], x2
        srshr           v26.8H, v26.8H, #6
        ld1             {v2.8B},     [x0], x2
        srshr           v27.8H, v27.8H, #6
        ld1             {v3.8B},     [x0], x2
        srshr           v28.8H, v28.8H, #6
        ld1             {v4.8B},     [x0], x2
        srshr           v29.8H, v29.8H, #6
        ld1             {v5.8B},     [x0], x2
        srshr           v30.8H, v30.8H, #6
        ld1             {v6.8B},     [x0], x2
        srshr           v31.8H, v31.8H, #6
        ld1             {v7.8B},     [x0], x2
        // Add the zero-extended pixels, saturate to unsigned 8 bit, store.
        uaddw           v24.8H, v24.8H, v0.8B
        uaddw           v25.8H, v25.8H, v1.8B
        uaddw           v26.8H, v26.8H, v2.8B
        sqxtun          v0.8B,  v24.8H
        uaddw           v27.8H, v27.8H, v3.8B
        sqxtun          v1.8B,  v25.8H
        uaddw           v28.8H, v28.8H, v4.8B
        sqxtun          v2.8B,  v26.8H
        st1             {v0.8B},     [x3], x2
        uaddw           v29.8H, v29.8H, v5.8B
        sqxtun          v3.8B,  v27.8H
        st1             {v1.8B},     [x3], x2
        uaddw           v30.8H, v30.8H, v6.8B
        sqxtun          v4.8B,  v28.8H
        st1             {v2.8B},     [x3], x2
        uaddw           v31.8H, v31.8H, v7.8B
        sqxtun          v5.8B,  v29.8H
        st1             {v3.8B},     [x3], x2
        sqxtun          v6.8B,  v30.8H
        sqxtun          v7.8B,  v31.8H
        st1             {v4.8B},     [x3], x2
        st1             {v5.8B},     [x3], x2
        st1             {v6.8B},     [x3], x2
        st1             {v7.8B},     [x3], x2

        sub             x1,  x1,  #128                  // 96 bytes stepped here + 32 inside the macro
        ret
endfunc

// DC-only variant for an 8x8 block: one rounded value is added to every pixel.
//
// In:   x0 = destination pixels, 8 rows of 8 bytes, rows x2 bytes apart
//       x1 = coefficient block; only the first int16 is read, then zeroed
//       w2 = stride (signed 32 bit, widened into x2 below)
// Out:  x0 advanced by 8 * stride, x1 unchanged
// Clobbers: w3, v0-v7, v24-v31
//
// The .L_ alias is the address ff_h264_idct8_add4_neon loads with movrel
// and calls through blr.
function ff_h264_idct8_dc_add_neon, export=1
.L_ff_h264_idct8_dc_add_neon:
        AARCH64_VALID_CALL_TARGET
        mov             w3,       #0
        sxtw            x2,       w2
        ld1r            {v31.8H}, [x1]                  // replicate block[0] into all 8 lanes
        strh            w3,       [x1]                  // block[0] = 0
        ld1             {v0.8B},  [x0], x2
        srshr           v31.8H, v31.8H, #6              // rounding shift right by 6
        ld1             {v1.8B},     [x0], x2
        ld1             {v2.8B},     [x0], x2
        uaddw           v24.8H, v31.8H, v0.8B           // row + dc, widened to 16 bit
        ld1             {v3.8B},     [x0], x2
        uaddw           v25.8H, v31.8H, v1.8B
        ld1             {v4.8B},     [x0], x2
        uaddw           v26.8H, v31.8H, v2.8B
        ld1             {v5.8B},     [x0], x2
        uaddw           v27.8H, v31.8H, v3.8B
        ld1             {v6.8B},     [x0], x2
        uaddw           v28.8H, v31.8H, v4.8B
        ld1             {v7.8B},     [x0], x2
        uaddw           v29.8H, v31.8H, v5.8B
        uaddw           v30.8H, v31.8H, v6.8B
        uaddw           v31.8H, v31.8H, v7.8B           // last use of the dc vector, so v31 can be reused
        sqxtun          v0.8B,  v24.8H                  // saturate to unsigned 8 bit
        sqxtun          v1.8B,  v25.8H
        sqxtun          v2.8B,  v26.8H
        sqxtun          v3.8B,  v27.8H
        sub             x0,  x0,  x2, lsl #3            // rewind dst to row 0 for the stores
        st1             {v0.8B},     [x0], x2
        sqxtun          v4.8B,  v28.8H
        st1             {v1.8B},     [x0], x2
        sqxtun          v5.8B,  v29.8H
        st1             {v2.8B},     [x0], x2
        sqxtun          v6.8B,  v30.8H
        st1             {v3.8B},     [x0], x2
        sqxtun          v7.8B,  v31.8H
        st1             {v4.8B},     [x0], x2
        st1             {v5.8B},     [x0], x2
        st1             {v6.8B},     [x0], x2
        st1             {v7.8B},     [x0], x2
        ret
endfunc

// Walk four 8x8 blocks and add the residual of each one.
//
// In:   x0 = destination base
//       x1 = block_offset (int32 entries); every 4th entry is used
//       x2 = coefficients, 128 bytes per 8x8 block
//       w3 = stride
//       x4 = byte table indexed by scan8[i] for i = 0, 4, 8, 12
// Per block:
//   count == 0                    -> skip
//   count == 1 and block[0] != 0  -> DC-only add
//   anything else                 -> full 8x8 transform add
//
// The stride is placed in w2 once, before the loop: both callees only
// sign-extend w2 into x2, which leaves w2 itself unchanged. w9 and w11
// are used for temporaries instead of w3 because the callees write x3/w3.
// blr overwrites x30, so the return address is parked in x12.
function ff_h264_idct8_add4_neon, export=1
        mov             x12, x30
        mov             x6,  x0                 // dest
        mov             x5,  x1                 // block_offset
        mov             x1,  x2                 // block
        mov             w2,  w3                 // stride, callee argument for every call
        movrel          x7,  scan8
        mov             w10, #16                // counts down in steps of 4
        movrel          x13, .L_ff_h264_idct8_dc_add_neon
        movrel          x14, .L_ff_h264_idct8_add_neon
1:      ldrb            w9,  [x7], #4           // scan8[i], i += 4
        ldrsw           x0,  [x5], #16          // block_offset[i]
        ldrb            w9,  [x4, w9, UXTW]     // count for this block
        subs            w9,  w9,  #1            // Z set when count == 1
        b.lt            2f                      // count == 0: nothing to add
        ldrsh           w11,  [x1]              // block[0]
        add             x0,  x6,  x0            // callee argument: dest + block_offset[i]
        ccmp            w11, #0,  #4,  eq       // count == 1: test block[0]; otherwise force Z
        csel            x15, x13, x14, ne       // ne only when count == 1 and block[0] != 0
        blr             x15
2:      subs            w10, w10, #4
        add             x1,  x1,  #128          // next 8x8 block of coefficients
        b.ne            1b
        ret             x12
endfunc

// Byte table of 48 entries, each written as x + y*8, i.e. a position in a
// layout with 8 entries per row. The dispatch loops in this file use
// scan8[i] as a byte index into the table passed in x4.
// Entries 0-15 are read by the add16, add16intra and idct8_add4 loops;
// the add8 loop reads entries 16-19 and 32-35.
const   scan8
        .byte           4+ 1*8, 5+ 1*8, 4+ 2*8, 5+ 2*8
        .byte           6+ 1*8, 7+ 1*8, 6+ 2*8, 7+ 2*8
        .byte           4+ 3*8, 5+ 3*8, 4+ 4*8, 5+ 4*8
        .byte           6+ 3*8, 7+ 3*8, 6+ 4*8, 7+ 4*8
        .byte           4+ 6*8, 5+ 6*8, 4+ 7*8, 5+ 7*8
        .byte           6+ 6*8, 7+ 6*8, 6+ 7*8, 7+ 7*8
        .byte           4+ 8*8, 5+ 8*8, 4+ 9*8, 5+ 9*8
        .byte           6+ 8*8, 7+ 8*8, 6+ 9*8, 7+ 9*8
        .byte           4+11*8, 5+11*8, 4+12*8, 5+12*8
        .byte           6+11*8, 7+11*8, 6+12*8, 7+12*8
        .byte           4+13*8, 5+13*8, 4+14*8, 5+14*8
        .byte           6+13*8, 7+13*8, 6+14*8, 7+14*8
endconst