;******************************************************************************
;* SIMD optimized Opus encoder DSP function
;*
;* Copyright (C) 2017 Ivan Kalvachev <ikalvachev@gmail.com>
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "config.asm"
%include "libavutil/x86/x86util.asm"

; NASM only: load the smartalign package and select its p6 mode for the
; "align" directives used in the code below.
%ifdef __NASM_VER__
%use "smartalign"
ALIGNMODE p6
%endif

SECTION_RODATA 64

; Every table below holds 8 dwords (32 bytes).
;
; const_align_abs_edge has to stay immediately after const_float_abs_mask:
; PVQ_FAST_SEARCH does an unaligned load that straddles the boundary between
; the two tables, which yields the abs mask for the valid lanes of the last
; vector and zero for its padding lanes.
const_float_abs_mask:   times 8 dd 0x7fffffff
const_align_abs_edge:   times 8 dd 0

const_float_0_5:        times 8 dd 0.5
const_float_1:          times 8 dd 1.0
const_float_sign_mask:  times 8 dd 0x80000000

; Byte offset of each dword lane from the start of the table (0, 4, 8, ...).
const_int32_offsets:
                        %rep 8
                                dd $-const_int32_offsets
                        %endrep
SECTION .text

;
;   Setup High Register to be used
;   for holding memory constants
;
; %1 - the move opcode used to load the constant (e.g. movaps, movdqa).
; %2 - the register to be used, assumes it is >= mm8
; %3 - name of the constant.
;
; Subsequent opcodes are going to use the constant in the form
; "addps m0, mm_const_name" and it would be turned into:
; "addps m0, [const_name]" when num_mmregs <= 8 (32 bit arch) or
; "addps m0, m8"           when num_mmregs >  8 (64 bit arch)
;
; The load is emitted only in the second case; in the first case the macro
; just defines the mm_<const_name> alias and generates no code.
%macro SET_HI_REG_MM_CONSTANT 3 ; movop, reg, const_name
%if num_mmregs > 8
    %define  mm_%3   %2
    %{1}        %2, [%3]    ; movaps m8, [const_name]
%else
    %define  mm_%3  [%3]
%endif
%endmacro

;
;   Set Position Independent Code
;       Base address of a constant
; %1 - the opcode used to load the address (e.g. lea).
; %2 - the register to be used, if PIC is set
; %3 - name of the constant.
;
; Subsequent opcodes are going to use the base address in the form
; "movaps m0, [pic_base_constant_name+r4]" and it would be turned into
; "movaps m0, [r5 + r4]" if PIC is enabled
; "movaps m0, [constant_name + r4]" if textrel are used
;
; Without PIC no code is emitted and the register (%2) is left untouched.
%macro SET_PIC_BASE 3 ; leaop, reg, const_label
%ifdef PIC
    %{1}     %2, [%3]      ; lea r5, [rip+const]
    %define  pic_base_%3 %2
%else
    %define  pic_base_%3 %3
%endif
%endmacro

;
;   Find the position whose score p is the largest when a single pulse is
;   added to / removed from it, then apply that pulse to tmpY[] and to the
;   running sums.
;
; %1 - "add" to add one pulse, "sub" to remove one pulse.
;
; In/out registers:
;   m6 - Syy_norm (the caller keeps it halved), updated
;   m7 - Sxy_norm, updated
; Uses tmpX, tmpY, Nd and the mm_const_float_0_5, mm_const_float_1 and
; mm_const_int32_offsets aliases created by SET_HI_REG_MM_CONSTANT.
; Clobbers m0-m5 and r4.
%macro PULSES_SEARCH 1
; m6 Syy_norm
; m7 Sxy_norm
    addps          m6, mm_const_float_0_5   ; Syy_norm += 1.0/2
    pxor           m1, m1                   ; max_idx
    xorps          m3, m3                   ; p_max
    xor           r4d, r4d                  ; r4 = byte offset of the current vector
align 16
%%distortion_search:
    movd          xm2, dword r4d    ; movd zero extends
%ifidn %1,add
    movaps         m4, [tmpY + r4]  ; y[i]
    movaps         m5, [tmpX + r4]  ; X[i]

  %if USE_APPROXIMATION == 1
    xorps          m0, m0
    cmpps          m0, m0, m5, 4    ; m0 = (X[i] != 0.0)
  %endif

    addps          m4, m6           ; m4 = Syy_new = y[i] + Syy_norm
    addps          m5, m7           ; m5 = Sxy_new = X[i] + Sxy_norm

  %if USE_APPROXIMATION == 1
    andps          m5, m0           ; if(X[i] == 0) Sxy_new = 0; Prevent approximation error from setting pulses in array padding.
  %endif

%else
    movaps         m5, [tmpY + r4]      ; m5 = y[i]

    xorps          m0, m0               ; m0 = 0;
    cmpps          m0, m0, m5, 1        ; m0 = (0<y)

    subps          m4, m6, m5           ; m4 = Syy_new = Syy_norm - y[i]
    subps          m5, m7, [tmpX + r4]  ; m5 = Sxy_new = Sxy_norm - X[i]
    andps          m5, m0               ; (0<y)?m5:0 ; only positions that hold a pulse may lose one
%endif

%if USE_APPROXIMATION == 1
    rsqrtps        m4, m4
    mulps          m5, m4           ; m5 = p = Sxy_new*approx(1/sqrt(Syy) )
%else
    mulps          m5, m5
    divps          m5, m4           ; m5 = p = Sxy_new*Sxy_new/Syy
%endif
    VPBROADCASTD   m2, xm2          ; m2=i (all lanes get same values, we add the offset-per-lane, later)

    cmpps          m0, m3, m5, 1    ; m0 = (m3 < m5) ; (p_max < p) ; (p > p_max)
    maxps          m3, m5           ; m3=max(p_max,p)
                                    ; maxps here is faster than blendvps, despite blend having lower latency.

    pand           m2, m0           ; This version seems faster than sse41 pblendvb
    pmaxsw         m1, m2           ; SSE2 signed word, so it would work for N < 32768/4
                                    ; i only grows, so the max keeps the offset of the latest improvement.

    add           r4d, mmsize
    cmp           r4d, Nd
    jb   %%distortion_search

    por            m1, mm_const_int32_offsets  ; max_idx offsets per individual lane (skipped in the inner loop)
                                               ; i is a multiple of mmsize and the offsets are below mmsize, so "or" acts as "add".
    movdqa         m4, m1                      ; needed for the aligned y[max_idx]+=1; processing

%if mmsize >= 32
; Merge parallel maximums round 8 (4 vs 4)

    vextractf128  xm5, ym3, 1       ; xmm5 = ymm3[1x128] = ymm3[255..128b]
    cmpps         xm0, xm3, xm5, 1  ; m0 = (m3 < m5) = ( p[0x128] < p[1x128] )

    vextracti128  xm2, ym1, 1       ; xmm2 = ymm1[1x128] = ymm1[255..128b]
    BLENDVPS      xm3, xm5, xm0     ; p       = m0 ? p[1x128]       : p[0x128]
    PBLENDVB      xm1, xm2, xm0     ; max_idx = m0 ? max_idx[1x128] : max_idx[0x128]
%endif

; Merge parallel maximums round 4 (2 vs 2)
                                    ; m3=p[3210]
    movhlps       xm5, xm3          ; m5=p[xx32]
    cmpps         xm0, xm3, xm5, 1  ; m0 = (m3 < m5) = ( p[1,0] < p[3,2] )

    pshufd        xm2, xm1, q3232
    BLENDVPS      xm3, xm5, xm0     ; p       = m0 ? p[3,2]       : p[1,0]
    PBLENDVB      xm1, xm2, xm0     ; max_idx = m0 ? max_idx[3,2] : max_idx[1,0]

; Merge parallel maximums final round (1 vs 1)
    shufps        xm0, xm3, xm3, q1111  ; m0 = m3[1] = p[1]
    cmpss         xm0, xm3, 5           ; predicate 5 is "not less than": m0 = !(m0 < m3) = !( p[1] < p[0] )

    pshufd        xm2, xm1, q1111
    PBLENDVB      xm1, xm2, xm0         ; max_idx = m0 ? max_idx[1] : max_idx[0]

    movd    dword r4d, xm1          ; zero extends to the rest of r4q

    VBROADCASTSS   m3, [tmpX + r4]
    %{1}ps         m7, m3           ; Sxy += X[max_idx]   (-= when %1 is sub)

    VBROADCASTSS   m5, [tmpY + r4]
    %{1}ps         m6, m5           ; Syy += Y[max_idx]   (-= when %1 is sub)

    ; We have to update a single element in Y[i]
    ; However writing 4 bytes and then doing 16 byte load in the inner loop
    ; could cause a stall due to breaking write forwarding.
    VPBROADCASTD   m1, xm1
    pcmpeqd        m1, m1, m4           ; exactly 1 element matches max_idx and this finds it

    and           r4d, ~(mmsize-1)      ; align address down, so the value pointed by max_idx is inside a mmsize load
    movaps         m5, [tmpY + r4]      ; m5 = Y[y3...ym...y0]
    andps          m1, mm_const_float_1 ; m1 =  [ 0...1.0...0]
    %{1}ps         m5, m1               ; m5 = Y[y3...ym...y0] +/- [0...1.0...0]
    movaps [tmpY + r4], m5              ; Y[max_idx] +-= 1.0;
%endmacro

;
; We need one more register for
; PIC relative addressing. Use this
; to count it in cglobal:
; num_pic_regs is 1 when PIC is defined and 0 otherwise.
;
%ifdef PIC
  %define num_pic_regs 1
%else
  %define num_pic_regs 0
%endif

;
; Pyramid Vector Quantization Search implementation
;
; %1 - suffix appended to the function name (pvq_search%1).
;
; float * inX   - Unaligned (SIMD) access, it will be overread,
;                 but extra data is masked away.
; int32 * outY  - Should be aligned and padded buffer.
;                 It is used as temp buffer.
; uint32 K      - Number of pulses to have after quantizations.
; uint32 N      - Number of vector elements. Must be 0 < N < 256
;
; Returns (float) Syy, the sum of y[i]*y[i] of the result, or 1.0 when the
; sum of abs(X[i]) is zero (outY is filled with zeroes in that case).
;
; tmpX is 256*4 bytes of stack holding abs(X[i]); tmpY aliases outY and holds
; the pulses as floats until they are converted to int32 in the last loop.
;
%macro PVQ_FAST_SEARCH 1
cglobal pvq_search%1, 4, 5+num_pic_regs, 11, 256*4, inX, outY, K, N
%define tmpX rsp
%define tmpY outYq

    movaps     m0, [const_float_abs_mask]
    shl        Nd, 2    ; N *= sizeof(float); also 32 bit operation zeroes the high 32 bits in 64 bit mode.
    mov       r4d, Nd

    neg       r4d
    and       r4d, mmsize-1         ; r4 = number of padding bytes needed to reach a multiple of mmsize

    SET_PIC_BASE lea, r5, const_align_abs_edge  ; rip+const
    movups     m2, [pic_base_const_align_abs_edge + r4 - mmsize]   ; m2 = abs mask for the valid lanes, 0 for the padding lanes

    add        Nd, r4d              ; N = align(N, mmsize)

    lea       r4d, [Nd - mmsize]    ; N is rounded up (aligned up) to mmsize, so r4 can't become negative here, unless N=0.
    movups     m1, [inXq + r4]
    andps      m1, m2
    movaps  [tmpX + r4], m1         ; Sx = abs( X[N-1] )

align 16
%%loop_abs_sum:
    sub       r4d, mmsize
    jc   %%end_loop_abs_sum

    movups     m2, [inXq + r4]
    andps      m2, m0

    movaps  [tmpX + r4], m2 ; tmpX[i]=abs(X[i])
    addps      m1, m2       ; Sx += abs(X[i])
    jmp  %%loop_abs_sum

align 16
%%end_loop_abs_sum:

    HSUMPS     m1, m2       ; m1  = Sx

    xorps      m0, m0
    comiss    xm0, xm1      ;
    jz   %%zero_input       ; if (Sx==0) goto zero_input

    cvtsi2ss  xm0, dword Kd ; m0 = K
%if USE_APPROXIMATION == 1
    rcpss     xm1, xm1      ; m1 = approx(1/Sx)
    mulss     xm0, xm1      ; m0 = K*(1/Sx)
%else
    divss     xm0, xm1      ; m0 = K/Sx
%endif

    VBROADCASTSS  m0, xm0

    lea       r4d, [Nd - mmsize]
    pxor       m5, m5             ; Sy    ( Sum of abs( y[i]) )
    xorps      m6, m6             ; Syy   ( Sum of y[i]*y[i]  )
    xorps      m7, m7             ; Sxy   ( Sum of X[i]*y[i]  )
align 16
%%loop_guess:
    movaps     m1, [tmpX + r4]    ; m1   = X[i]
    mulps      m2, m0, m1         ; m2   = res*X[i]
    cvtps2dq   m2, m2             ; yt   = (int)lrintf( res*X[i] )
    paddd      m5, m2             ; Sy  += yt
    cvtdq2ps   m2, m2             ; yt   = (float)yt
    mulps      m1, m2             ; m1   = X[i]*yt
    movaps  [tmpY + r4], m2       ; y[i] = m2
    addps      m7, m1             ; Sxy += m1;
    mulps      m2, m2             ; m2   = yt*yt
    addps      m6, m2             ; Syy += m2

    sub       r4d, mmsize
    jnc  %%loop_guess

    HSUMPS     m6, m1       ; Syy_norm
    HADDD      m5, m4       ; pulses

    movd  dword r4d, xm5    ; zero extends to the rest of r4q

    sub        Kd, r4d      ; K -= pulses , also 32 bit operation zeroes high 32 bit in 64 bit mode.
    jz   %%finish           ; K - pulses == 0

    ; The carry flag set by the "sub" above is consumed by the "jc" below;
    ; none of the SIMD instructions in between touch the integer flags.
    SET_HI_REG_MM_CONSTANT movaps,  m8, const_float_0_5
    SET_HI_REG_MM_CONSTANT movaps,  m9, const_float_1
    SET_HI_REG_MM_CONSTANT movdqa, m10, const_int32_offsets
    ; Use Syy/2 in distortion parameter calculations.
    ; Saves pre and post-calculation to correct Y[] values.
    ; Same precision, since float mantissa is normalized.
    ; The SQRT approximation does differ.
    HSUMPS     m7, m0         ; Sxy_norm
    mulps      m6, mm_const_float_0_5

    jc   %%remove_pulses_loop   ; K - pulses < 0

align 16                        ; K - pulses > 0
%%add_pulses_loop:

    PULSES_SEARCH add   ; m6 Syy_norm ; m7 Sxy_norm

    sub        Kd, 1
    jnz  %%add_pulses_loop

    addps      m6, m6 ; Syy*=2

    jmp  %%finish

align 16
%%remove_pulses_loop:

    PULSES_SEARCH sub   ; m6 Syy_norm ; m7 Sxy_norm

    add        Kd, 1    ; K is negative here and counts up to zero
    jnz  %%remove_pulses_loop

    addps      m6, m6 ; Syy*=2

align 16
%%finish:
    lea       r4d, [Nd - mmsize]
    movaps     m2, [const_float_sign_mask]

align 16
%%restore_sign_loop:
    movaps     m0, [tmpY + r4]    ; m0 = Y[i]
    movups     m1, [inXq + r4]    ; m1 = X[i]
    andps      m1, m2             ; m1 = sign(X[i])
    orps       m0, m1             ; m0 = Y[i]*sign
    cvtps2dq   m3, m0             ; m3 = (int)m0
    movaps  [outYq + r4], m3

    sub       r4d, mmsize
    jnc  %%restore_sign_loop
%%return:

%if ARCH_X86_64 == 0    ; sbrdsp
    movss     r0m, xm6  ; return (float)Syy_norm
    fld dword r0m       ; the value is handed back through the x87 stack
%else
    movaps     m0, m6   ; return (float)Syy_norm
%endif

    RET

align 16
%%zero_input:
    lea       r4d, [Nd - mmsize]
    xorps      m0, m0
%%zero_loop:
    movaps  [outYq + r4], m0      ; outY[i] = 0
    sub       r4d, mmsize
    jnc  %%zero_loop

    movaps     m6, [const_float_1]  ; return 1.0
    jmp  %%return
%endmacro

; if 1, use a float op that give half precision but execute for around 3 cycles.
; On Skylake & Ryzen the division is much faster (around 11c/3),
; that makes the full precision code about 2% slower.
; Opus also does use rsqrt approximation in their intrinsics code.
;
; USE_APPROXIMATION is evaluated while PVQ_FAST_SEARCH is expanded, so each
; %define below applies to the instantiations that follow it.
%define USE_APPROXIMATION   1

INIT_XMM sse2
PVQ_FAST_SEARCH _approx

INIT_XMM sse4
PVQ_FAST_SEARCH _approx

%define USE_APPROXIMATION   0

INIT_XMM avx
PVQ_FAST_SEARCH _exact