;******************************************************************************
;* AAC Spectral Band Replication decoding functions
;* Copyright (C) 2012 Christophe Gisquet <christophe.gisquet@gmail.com>
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "libavutil/x86/x86util.asm"

SECTION_RODATA
; sign masks equivalent to multiplying by -1.0, 1.0
ps_mask         times 2 dd 1<<31, 0
ps_mask2        times 2 dd 0, 1<<31
ps_mask3        dd  0, 0, 0, 1<<31
ps_noise0       times 2 dd  1.0,  0.0
ps_noise2       times 2 dd -1.0,  0.0
ps_noise13      dd  0.0,  1.0, 0.0, -1.0
                dd  0.0, -1.0, 0.0,  1.0
                dd  0.0,  1.0, 0.0, -1.0
cextern         sbr_noise_table
cextern         ps_neg

SECTION .text

INIT_XMM sse
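; float ff_sbr_sum_square_sse(float (*x)[2], int n)
; Returns the sum of x[i][0]^2 + x[i][1]^2 over n complex values.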
cglobal sbr_sum_square, 2, 3, 6
    mov        r2d, r1d
    xorps       m0, m0
    xorps       m1, m1
    sar         r2, 3
    jz          .prepare
.loop:
    movu        m2, [r0 +  0]
    movu        m3, [r0 + 16]
    movu        m4, [r0 + 32]
    movu        m5, [r0 + 48]
    mulps       m2, m2
    mulps       m3, m3
    mulps       m4, m4
    mulps       m5, m5
    addps       m0, m2
    addps       m1, m3
    addps       m0, m4
    addps       m1, m5
    add         r0, 64
    dec         r2
    jnz         .loop
.prepare:
    and         r1, 7
    sar         r1, 1
    jz          .end
; len is a multiple of 2, so the tail can be processed 4 floats (2 complex values) at a time
.endloop:
    movu        m2, [r0]
    add         r0, 16
    mulps       m2, m2
    dec         r1
    addps       m0, m2
    jnz         .endloop
.end:
    addps       m0, m1
    movhlps     m2, m0
    addps       m0, m2
    movss       m1, m0
    shufps      m0, m0, 1
    addss       m0, m1
%if ARCH_X86_64 == 0
    movss       r0m,  m0
    fld         dword r0m
%endif
    RET

%define STEP  40*4*2
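; void ff_sbr_hf_g_filt_sse(float (*Y)[2], const float (*X_high)[40][2],
;                           const float *g_filt, int m_max, intptr_t ixh)
; Y[m] = X_high[m][ixh] * g_filt[m] for each of the m_max complex values.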
cglobal sbr_hf_g_filt, 5, 6, 5
    lea         r1, [r1 + 8*r4] ; offset by ixh elements into X_high
    mov         r5, r3
    and         r3, 0xFC
    lea         r2, [r2 + r3*4]
    lea         r0, [r0 + r3*8]
    neg         r3
    jz          .loop1
.loop4:
    movlps      m0, [r2 + 4*r3 + 0]
    movlps      m1, [r2 + 4*r3 + 8]
    movlps      m2, [r1 + 0*STEP]
    movlps      m3, [r1 + 2*STEP]
    movhps      m2, [r1 + 1*STEP]
    movhps      m3, [r1 + 3*STEP]
    unpcklps    m0, m0
    unpcklps    m1, m1
    mulps       m0, m2
    mulps       m1, m3
    movu        [r0 + 8*r3 +  0], m0
    movu        [r0 + 8*r3 + 16], m1
    add         r1, 4*STEP
    add         r3, 4
    jnz         .loop4
    and         r5, 3 ; number of single element loops
    jz          .end
.loop1: ; element 0 and 1 can be computed at the same time
    movss       m0, [r2]
    movlps      m2, [r1]
    unpcklps    m0, m0
    mulps       m2, m0
    movlps    [r0], m2
    add         r0, 8
    add         r2, 4
    add         r1, STEP
    dec         r5
    jnz         .loop1
.end:
    RET

; void ff_sbr_hf_gen_sse(float (*X_high)[2], const float (*X_low)[2],
;                        const float alpha0[2], const float alpha1[2],
;                        float bw, int start, int end)
;
cglobal sbr_hf_gen, 4,4,8, X_high, X_low, alpha0, alpha1, BW, S, E
    ; load alpha factors
%define bw m0
%if ARCH_X86_64 == 0 || WIN64
    movss      bw, BWm
%endif
    movlps     m2, [alpha1q]
    movlps     m1, [alpha0q]
    shufps     bw, bw, 0
    mulps      m2, bw             ; (a1[0] a1[1])*bw
    mulps      m1, bw             ; (a0[0] a0[1])*bw    = (a2 a3)
    mulps      m2, bw             ; (a1[0] a1[1])*bw*bw = (a0 a1)
    mova       m3, m1
    mova       m4, m2

    ; Set pointers
%if ARCH_X86_64 == 0 || WIN64
    ; start and end are the 6th and 7th args, passed on the stack
    mov        r2d, Sm
    mov        r3d, Em
    DEFINE_ARGS X_high, X_low, start, end
%else
; BW does not actually occupy a register, so shift by 1
    DEFINE_ARGS X_high, X_low, alpha0, alpha1, start, end
    movsxd  startq, startd
    movsxd    endq, endd
%endif
    sub     startq, endq         ; neg num of loops
    lea    X_highq, [X_highq + endq*2*4]
    lea     X_lowq, [X_lowq  + endq*2*4 - 2*2*4]
    shl     startq, 3            ; offset from num loops

    mova        m0, [X_lowq + startq]
    shufps      m3, m3, q1111
    shufps      m4, m4, q1111
    xorps       m3, [ps_mask]
    shufps      m1, m1, q0000
    shufps      m2, m2, q0000
    xorps       m4, [ps_mask]
.loop2:
    movu        m7, [X_lowq + startq + 8]       ; BbCc
    mova        m6, m0
    mova        m5, m7
    shufps      m0, m0, q2301                   ; aAbB
    shufps      m7, m7, q2301                   ; bBcC
    mulps       m0, m4
    mulps       m7, m3
    mulps       m6, m2
    mulps       m5, m1
    addps       m7, m0
    mova        m0, [X_lowq + startq + 16]      ; CcDd
    addps       m7, m0
    addps       m6, m5
    addps       m7, m6
    mova  [X_highq + startq], m7
    add     startq, 16
    jnz         .loop2
    RET

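; void ff_sbr_sum64x5_sse(float *z)
; z[k] += z[k+64] + z[k+128] + z[k+192] + z[k+256] for k = 0..63 (in place).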
cglobal sbr_sum64x5, 1,2,4,z
    lea    r1q, [zq+ 256]
.loop:
    mova    m0, [zq+   0]
    mova    m2, [zq+  16]
    mova    m1, [zq+ 256]
    mova    m3, [zq+ 272]
    addps   m0, [zq+ 512]
    addps   m2, [zq+ 528]
    addps   m1, [zq+ 768]
    addps   m3, [zq+ 784]
    addps   m0, [zq+1024]
    addps   m2, [zq+1040]
    addps   m0, m1
    addps   m2, m3
    mova  [zq], m0
    mova  [zq+16], m2
    add     zq, 32
    cmp     zq, r1q
    jne  .loop
    REP_RET

INIT_XMM sse
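; void ff_sbr_qmf_post_shuffle_sse(float W[32][2], const float *z)
; W[k][0] = -z[63 - k], W[k][1] = z[k] for k = 0..31.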
cglobal sbr_qmf_post_shuffle, 2,3,4,W,z
    lea              r2q, [zq + (64-4)*4]
    mova              m3, [ps_neg]
.loop:
    mova              m1, [zq]
    xorps             m0, m3, [r2q]
    shufps            m0, m0, m0, q0123
    unpcklps          m2, m0, m1
    unpckhps          m0, m0, m1
    mova       [Wq +  0], m2
    mova       [Wq + 16], m0
    add               Wq, 32
    sub              r2q, 16
    add               zq, 16
    cmp               zq, r2q
    jl             .loop
    REP_RET

INIT_XMM sse
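; void ff_sbr_neg_odd_64_sse(float *x)
; Negates the odd-indexed elements of x[0..63].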
cglobal sbr_neg_odd_64, 1,2,4,z
    lea        r1q, [zq+256]
.loop:
    mova        m0, [zq+ 0]
    mova        m1, [zq+16]
    mova        m2, [zq+32]
    mova        m3, [zq+48]
    xorps       m0, [ps_mask2]
    xorps       m1, [ps_mask2]
    xorps       m2, [ps_mask2]
    xorps       m3, [ps_mask2]
    mova   [zq+ 0], m0
    mova   [zq+16], m1
    mova   [zq+32], m2
    mova   [zq+48], m3
    add         zq, 64
    cmp         zq, r1q
    jne      .loop
    REP_RET

; void ff_sbr_qmf_deint_bfly_sse2(float *v, const float *src0, const float *src1)
INIT_XMM sse2
cglobal sbr_qmf_deint_bfly, 3,5,8, v,src0,src1,vrev,c
    mov               cq, 64*4-2*mmsize
    lea            vrevq, [vq + 64*4]
.loop:
    mova              m0, [src0q+cq]
    mova              m1, [src1q]
    mova              m4, [src0q+cq+mmsize]
    mova              m5, [src1q+mmsize]
    pshufd            m2, m0, q0123
    pshufd            m3, m1, q0123
    pshufd            m6, m4, q0123
    pshufd            m7, m5, q0123
    addps             m5, m2
    subps             m0, m7
    addps             m1, m6
    subps             m4, m3
    mova         [vrevq], m1
    mova  [vrevq+mmsize], m5
    mova         [vq+cq], m0
    mova  [vq+cq+mmsize], m4
    add            src1q, 2*mmsize
    add            vrevq, 2*mmsize
    sub               cq, 2*mmsize
    jge            .loop
    REP_RET

INIT_XMM sse2
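; void ff_sbr_qmf_pre_shuffle_sse2(float *z)
; z[64] = z[0], z[65] = z[1], then
; z[64 + 2*k] = -z[64 - k], z[64 + 2*k + 1] = z[k + 1] for k = 1..31.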
cglobal sbr_qmf_pre_shuffle, 1,4,6,z
%define OFFSET  (32*4-2*mmsize)
    mov       r3q, OFFSET
    lea       r1q, [zq + (32+1)*4]
    lea       r2q, [zq + 64*4]
    mova       m5, [ps_neg]
.loop:
    movu       m0, [r1q]
    movu       m2, [r1q + mmsize]
    movu       m1, [zq + r3q + 4 + mmsize]
    movu       m3, [zq + r3q + 4]

    pxor       m2, m5
    pxor       m0, m5
    pshufd     m2, m2, q0123
    pshufd     m0, m0, q0123
    SBUTTERFLY dq, 2, 3, 4
    SBUTTERFLY dq, 0, 1, 4
    mova  [r2q + 2*r3q + 0*mmsize], m2
    mova  [r2q + 2*r3q + 1*mmsize], m3
    mova  [r2q + 2*r3q + 2*mmsize], m0
    mova  [r2q + 2*r3q + 3*mmsize], m1
    add       r1q, 2*mmsize
    sub       r3q, 2*mmsize
    jge      .loop
    movq       m2, [zq]
    movq    [r2q], m2
    REP_RET

%ifdef PIC
%define NREGS 1
%if UNIX64
%define NOISE_TABLE r6q ; r5q is m_max
%else
%define NOISE_TABLE r5q
%endif
%else
%define NREGS 0
%define NOISE_TABLE sbr_noise_table
%endif

%macro LOAD_NST  1
%ifdef PIC
    lea  NOISE_TABLE, [%1]
    mova          m0, [kxq + NOISE_TABLE]
%else
    mova          m0, [kxq + %1]
%endif
%endmacro

INIT_XMM sse2
; sbr_hf_apply_noise_0(float (*Y)[2], const float *s_m,
;                      const float *q_filt, int noise,
;                      int kx, int m_max)
cglobal sbr_hf_apply_noise_0, 5,5+NREGS+UNIX64,8, Y,s_m,q_filt,noise,kx,m_max
    mova       m0, [ps_noise0]
    jmp apply_noise_main

; sbr_hf_apply_noise_1(float (*Y)[2], const float *s_m,
;                      const float *q_filt, int noise,
;                      int kx, int m_max)
cglobal sbr_hf_apply_noise_1, 5,5+NREGS+UNIX64,8, Y,s_m,q_filt,noise,kx,m_max
    and       kxq, 1
    shl       kxq, 4
    LOAD_NST  ps_noise13
    jmp apply_noise_main

; sbr_hf_apply_noise_2(float (*Y)[2], const float *s_m,
;                      const float *q_filt, int noise,
;                      int kx, int m_max)
cglobal sbr_hf_apply_noise_2, 5,5+NREGS+UNIX64,8, Y,s_m,q_filt,noise,kx,m_max
    mova       m0, [ps_noise2]
    jmp apply_noise_main

; sbr_hf_apply_noise_3(float (*Y)[2], const float *s_m,
;                      const float *q_filt, int noise,
;                      int kx, int m_max)
cglobal sbr_hf_apply_noise_3, 5,5+NREGS+UNIX64,8, Y,s_m,q_filt,noise,kx,m_max
    and       kxq, 1
    shl       kxq, 4
    LOAD_NST  ps_noise13+16

apply_noise_main:
%if ARCH_X86_64 == 0 || WIN64
    mov       kxd, m_maxm
    DEFINE_ARGS Y, s_m, q_filt, noise, count
%else
    DEFINE_ARGS Y, s_m, q_filt, noise, kx, count
%endif
    movsxdifnidn    noiseq, noised
    dec    noiseq
    shl    countd, 2
%ifdef PIC
    lea NOISE_TABLE, [sbr_noise_table]
%endif
    lea        Yq, [Yq + 2*countq]
    add      s_mq, countq
    add   q_filtq, countq
    shl    noiseq, 3
    pxor       m5, m5
    neg    countq
.loop:
    mova       m1, [q_filtq + countq]
    movu       m3, [noiseq + NOISE_TABLE + 1*mmsize]
    movu       m4, [noiseq + NOISE_TABLE + 2*mmsize]
    add    noiseq, 2*mmsize
    and    noiseq, 0x1ff<<3
    punpckhdq  m2, m1, m1
    punpckldq  m1, m1
    mulps      m1, m3 ; m1 = q_filt[m] * ff_sbr_noise_table[noise]
    mulps      m2, m4 ; m2 = q_filt[m] * ff_sbr_noise_table[noise]
    mova       m3, [s_mq + countq]
    ; TODO: replace by a vpermd in AVX2
    punpckhdq  m4, m3, m3
    punpckldq  m3, m3
    pcmpeqd    m6, m3, m5 ; m6 = (s_m == 0) mask
    pcmpeqd    m7, m4, m5 ; m7 = (s_m == 0) mask
    mulps      m3, m0 ; s_m[m] * phi_sign
    mulps      m4, m0 ; s_m[m] * phi_sign
    pand       m1, m6
    pand       m2, m7
    movu       m6, [Yq + 2*countq]
    movu       m7, [Yq + 2*countq + mmsize]
    addps      m3, m1
    addps      m4, m2
    addps      m6, m3
    addps      m7, m4
    movu    [Yq + 2*countq], m6
    movu    [Yq + 2*countq + mmsize], m7
    add    countq, mmsize
    jl      .loop
    RET

INIT_XMM sse
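; void ff_sbr_qmf_deint_neg_sse(float *v, const float *src)
; v[i] = src[63 - 2*i], v[63 - i] = -src[62 - 2*i] for i = 0..31.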
cglobal sbr_qmf_deint_neg, 2,4,4,v,src,vrev,c
%define COUNT  32*4
%define OFFSET 32*4
    mov        cq, -COUNT
    lea     vrevq, [vq + OFFSET + COUNT]
    add        vq, OFFSET-mmsize
    add      srcq, 2*COUNT
    mova       m3, [ps_neg]
.loop:
    mova       m0, [srcq + 2*cq + 0*mmsize]
    mova       m1, [srcq + 2*cq + 1*mmsize]
    shufps     m2, m0, m1, q2020
    shufps     m1, m0, q1313
    xorps      m2, m3
    mova     [vq], m1
    mova  [vrevq + cq], m2
    sub        vq, mmsize
    add        cq, mmsize
    jl      .loop
    REP_RET

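; sbr_autocorrelate(const float x[40][2], float phi[3][2][2])
; Computes the complex autocorrelation of x at lags 0, 1 and 2 into phi.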
%macro SBR_AUTOCORRELATE 0
cglobal sbr_autocorrelate, 2,3,8,32, x, phi, cnt
    mov   cntq, 37*8
    add     xq, cntq
    neg   cntq

%if cpuflag(sse3)
%define   MOVH  movsd
    movddup m5, [xq+cntq]
%else
%define   MOVH  movlps
    movlps  m5, [xq+cntq]
    movlhps m5, m5
%endif
    MOVH    m7, [xq+cntq+8 ]
    MOVH    m1, [xq+cntq+16]
    shufps  m7, m7, q0110
    shufps  m1, m1, q0110
    mulps   m3, m5, m7   ;              x[0][0] * x[1][0], x[0][1] * x[1][1], x[0][0] * x[1][1], x[0][1] * x[1][0]
    mulps   m4, m5, m5   ;              x[0][0] * x[0][0], x[0][1] * x[0][1];
    mulps   m5, m1       ; real_sum2  = x[0][0] * x[2][0], x[0][1] * x[2][1]; imag_sum2 = x[0][0] * x[2][1], x[0][1] * x[2][0]
    movaps  [rsp   ], m3
    movaps  [rsp+16], m4
    add   cntq, 8

    MOVH    m2, [xq+cntq+16]
    movlhps m7, m7
    shufps  m2, m2, q0110
    mulps   m6, m7, m1   ; real_sum1  = x[1][0] * x[2][0], x[1][1] * x[2][1]; imag_sum1  = x[1][0] * x[2][1], x[1][1] * x[2][0]
    mulps   m4, m7, m2
    mulps   m7, m7       ; real_sum0  = x[1][0] * x[1][0], x[1][1] * x[1][1];
    addps   m5, m4       ; real_sum2 += x[1][0] * x[3][0], x[1][1] * x[3][1]; imag_sum2 += x[1][0] * x[3][1], x[1][1] * x[3][0]

align 16
.loop:
    add   cntq, 8
    MOVH    m0, [xq+cntq+16]
    movlhps m1, m1
    shufps  m0, m0, q0110
    mulps   m3, m1, m2
    mulps   m4, m1, m0
    mulps   m1, m1
    addps   m6, m3       ; real_sum1 += x[i][0] * x[i + 1][0], x[i][1] * x[i + 1][1]; imag_sum1 += x[i][0] * x[i + 1][1], x[i][1] * x[i + 1][0];
    addps   m5, m4       ; real_sum2 += x[i][0] * x[i + 2][0], x[i][1] * x[i + 2][1]; imag_sum2 += x[i][0] * x[i + 2][1], x[i][1] * x[i + 2][0];
    addps   m7, m1       ; real_sum0 += x[i][0] * x[i][0],     x[i][1] * x[i][1];
    add   cntq, 8
    MOVH    m1, [xq+cntq+16]
    movlhps m2, m2
    shufps  m1, m1, q0110
    mulps   m3, m2, m0
    mulps   m4, m2, m1
    mulps   m2, m2
    addps   m6, m3       ; real_sum1 += x[i][0] * x[i + 1][0], x[i][1] * x[i + 1][1]; imag_sum1 += x[i][0] * x[i + 1][1], x[i][1] * x[i + 1][0];
    addps   m5, m4       ; real_sum2 += x[i][0] * x[i + 2][0], x[i][1] * x[i + 2][1]; imag_sum2 += x[i][0] * x[i + 2][1], x[i][1] * x[i + 2][0];
    addps   m7, m2       ; real_sum0 += x[i][0] * x[i][0],     x[i][1] * x[i][1];
    add   cntq, 8
    MOVH    m2, [xq+cntq+16]
    movlhps m0, m0
    shufps  m2, m2, q0110
    mulps   m3, m0, m1
    mulps   m4, m0, m2
    mulps   m0, m0
    addps   m6, m3       ; real_sum1 += x[i][0] * x[i + 1][0], x[i][1] * x[i + 1][1]; imag_sum1 += x[i][0] * x[i + 1][1], x[i][1] * x[i + 1][0];
    addps   m5, m4       ; real_sum2 += x[i][0] * x[i + 2][0], x[i][1] * x[i + 2][1]; imag_sum2 += x[i][0] * x[i + 2][1], x[i][1] * x[i + 2][0];
    addps   m7, m0       ; real_sum0 += x[i][0] * x[i][0],     x[i][1] * x[i][1];
    jl .loop

    movlhps m1, m1
    mulps   m2, m1
    mulps   m1, m1
    addps   m2, m6       ; real_sum1 + x[38][0] * x[39][0], x[38][1] * x[39][1]; imag_sum1 + x[38][0] * x[39][1], x[38][1] * x[39][0];
    addps   m1, m7       ; real_sum0 + x[38][0] * x[38][0], x[38][1] * x[38][1];
    addps   m6, [rsp   ] ; real_sum1 + x[ 0][0] * x[ 1][0], x[ 0][1] * x[ 1][1]; imag_sum1 + x[ 0][0] * x[ 1][1], x[ 0][1] * x[ 1][0];
    addps   m7, [rsp+16] ; real_sum0 + x[ 0][0] * x[ 0][0], x[ 0][1] * x[ 0][1];

    xorps   m2, [ps_mask3]
    xorps   m5, [ps_mask3]
    xorps   m6, [ps_mask3]
    HADDPS  m2, m5, m3
    HADDPS  m7, m6, m4
%if cpuflag(sse3)
    movshdup m0, m1
%else
    movss   m0, m1
    shufps  m1, m1, q0001
%endif
    addss   m1, m0
    movaps  [phiq     ], m2
    movhps  [phiq+0x18], m7
    movss   [phiq+0x28], m7
    movss   [phiq+0x10], m1
    RET
%endmacro

INIT_XMM sse
SBR_AUTOCORRELATE
INIT_XMM sse3
SBR_AUTOCORRELATE