;******************************************************************************
;* SIMD optimized Opus encoder DSP function
;*
;* Copyright (C) 2017 Ivan Kalvachev <ikalvachev@gmail.com>
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "config.asm"
%include "libavutil/x86/x86util.asm"

%ifdef __NASM_VER__
%use "smartalign"
ALIGNMODE p6
%endif

SECTION_RODATA 64

; Layout matters: const_align_abs_edge MUST directly follow
; const_float_abs_mask. The code does an unaligned load at
; [const_align_abs_edge + pad_bytes - mmsize], which yields the abs mask
; for the valid lanes and zeros for the padding lanes of the last vector.
const_float_abs_mask:   times 8 dd 0x7fffffff
const_align_abs_edge:   times 8 dd 0

const_float_0_5:        times 8 dd 0.5
const_float_1:          times 8 dd 1.0
const_float_sign_mask:  times 8 dd 0x80000000

; Byte offset of each 32-bit lane inside a vector: 0, 4, 8, ... 28
const_int32_offsets:
                        %rep 8
                                dd $-const_int32_offsets
                        %endrep
SECTION .text

;
; Setup High Register to be used
; for holding memory constants
;
; %1 - the move instruction used to load the constant (e.g. movaps, movdqa)
; %2 - the register to be used, assumes it is >= mm8
; %3 - name of the constant.
;
; Subsequent opcodes are going to use the constant in the form
; "addps m0, mm_const_name" and it would be turned into:
; "addps m0, [const_name]" on 32 bit arch or
; "addps m0, m8" on 64 bit arch
%macro SET_HI_REG_MM_CONSTANT 3 ; movop, reg, const_name
%if num_mmregs > 8
       %define      mm_%3 %2
       %{1}         %2, [%3]    ; movaps m8, [const_name]
%else
       %define      mm_%3 [%3]
%endif
%endmacro

;
; Set Position Independent Code
;       Base address of a constant
; %1 - the instruction used to load the address (e.g. lea)
; %2 - the register to be used, if PIC is set
; %3 - name of the constant.
;
; Subsequent opcodes are going to use the base address in the form
; "movaps m0, [pic_base_constant_name+r4]" and it would be turned into
; "movaps m0, [r5 + r4]" if PIC is enabled
; "movaps m0, [constant_name + r4]" if text relocations (textrel) are used
%macro SET_PIC_BASE 3 ; leaop, reg, const_label
%ifdef PIC
        %{1}        %2, [%3]    ; lea r5, [rip+const]
        %define     pic_base_%3 %2
%else
        %define     pic_base_%3 %3
%endif
%endmacro

;
; Find the position where adding (or removing) one pulse gives the best
; distortion metric, then apply that pulse to Y[] and update the running sums.
;
; %1 - "add" to place a pulse, "sub" to take one away.
;
; In/Out: m6 = Syy_norm (kept as Syy/2 by the caller), m7 = Sxy_norm
; Reads:  tmpX, tmpY, Nd (byte length, multiple of mmsize),
;         mm_const_float_0_5, mm_const_float_1, mm_const_int32_offsets
; Clobbers: m0-m5, r4, flags
;
%macro PULSES_SEARCH 1
; m6 Syy_norm
; m7 Sxy_norm
        addps          m6, mm_const_float_0_5   ; Syy_norm += 1.0/2
        pxor           m1, m1                   ; max_idx
        xorps          m3, m3                   ; p_max
        xor           r4d, r4d
align 16
%%distortion_search:
        movd          xm2, dword r4d    ; movd zero extends
%ifidn %1,add
        movaps         m4, [tmpY + r4]  ; y[i]
        movaps         m5, [tmpX + r4]  ; X[i]

  %if USE_APPROXIMATION == 1
        xorps          m0, m0
        cmpps          m0, m0, m5, 4    ; m0 = (X[i] != 0.0)
  %endif

        addps          m4, m6           ; m4 = Syy_new = y[i] + Syy_norm
        addps          m5, m7           ; m5 = Sxy_new = X[i] + Sxy_norm

  %if USE_APPROXIMATION == 1
        andps          m5, m0           ; if(X[i] == 0) Sxy_new = 0; Prevent approximation error from setting pulses in array padding.
  %endif

%else
        movaps         m5, [tmpY + r4]          ; m5 = y[i]

        xorps          m0, m0                   ; m0 = 0;
        cmpps          m0, m0, m5, 1            ; m0 = (0<y)

        subps          m4, m6, m5               ; m4 = Syy_new = Syy_norm - y[i]
        subps          m5, m7, [tmpX + r4]      ; m5 = Sxy_new = Sxy_norm - X[i]
        andps          m5, m0                   ; (0<y)?m5:0
%endif

%if USE_APPROXIMATION == 1
        rsqrtps        m4, m4
        mulps          m5, m4           ; m5 = p = Sxy_new*approx(1/sqrt(Syy) )
%else
        mulps          m5, m5
        divps          m5, m4           ; m5 = p = Sxy_new*Sxy_new/Syy
%endif
        VPBROADCASTD   m2, xm2          ; m2=i (all lanes get same values, we add the offset-per-lane, later)

        cmpps          m0, m3, m5, 1    ; m0 = (m3 < m5) ; (p_max < p) ; (p > p_max)
        maxps          m3, m5           ; m3=max(p_max,p)
                                        ; maxps here is faster than blendvps, despite blend having lower latency.

        pand           m2, m0           ; This version seems faster than sse41 pblendvb
        pmaxsw         m1, m2           ; SSE2 signed word, so it would work for N < 32768/4

        add           r4d, mmsize
        cmp           r4d, Nd
        jb   %%distortion_search

        por            m1, mm_const_int32_offsets  ; max_idx offsets per individual lane (skipped in the inner loop)
        movdqa         m4, m1                      ; needed for the aligned y[max_idx]+=1; processing

%if mmsize >= 32
; Merge parallel maximums round 8 (4 vs 4)

        vextractf128  xm5, ym3, 1       ; xmm5 = ymm3[1x128] = ymm3[255..128b]
        cmpps         xm0, xm3, xm5, 1  ; m0 = (m3 < m5) = ( p[0x128] < p[1x128] )

        vextracti128  xm2, ym1, 1       ; xmm2 = ymm1[1x128] = ymm1[255..128b]
        BLENDVPS      xm3, xm5, xm0     ; p       = m0 ? p[1x128]       : p[0x128]
        PBLENDVB      xm1, xm2, xm0     ; max_idx = m0 ? max_idx[1x128] : max_idx[0x128]
%endif

; Merge parallel maximums round 4 (2 vs 2)
                                        ; m3=p[3210]
        movhlps       xm5, xm3          ; m5=p[xx32]
        cmpps         xm0, xm3, xm5, 1  ; m0 = (m3 < m5) = ( p[1,0] < p[3,2] )

        pshufd        xm2, xm1, q3232
        BLENDVPS      xm3, xm5, xm0     ; p       = m0 ? p[3,2]       : p[1,0]
        PBLENDVB      xm1, xm2, xm0     ; max_idx = m0 ? max_idx[3,2] : max_idx[1,0]

; Merge parallel maximums final round (1 vs 1)
        shufps        xm0, xm3, xm3, q1111  ; m0 = m3[1] = p[1]
        cmpss         xm0, xm3, 5           ; predicate 5 = NLT: m0 = !(m0 < m3) = ( p[1] >= p[0] )

        pshufd        xm2, xm1, q1111
        PBLENDVB      xm1, xm2, xm0         ; max_idx = m0 ? max_idx[1] : max_idx[0]

        movd    dword r4d, xm1          ; zero extends to the rest of r4q

        VBROADCASTSS   m3, [tmpX + r4]
        %{1}ps         m7, m3           ; Sxy += X[max_idx]

        VBROADCASTSS   m5, [tmpY + r4]
        %{1}ps         m6, m5           ; Syy += Y[max_idx]

        ; We have to update a single element in Y[i]
        ; However writing 4 bytes and then doing 16 byte load in the inner loop
        ; could cause a stall due to breaking write forwarding.
        VPBROADCASTD   m1, xm1
        pcmpeqd        m1, m1, m4       ; exactly 1 element matches max_idx and this finds it

        and           r4d, ~(mmsize-1)  ; align address down, so the value pointed by max_idx is inside a mmsize load
        movaps         m5, [tmpY + r4]  ; m5 = Y[y3...ym...y0]
        andps          m1, mm_const_float_1     ; m1 =  [ 0...1.0...0]
        %{1}ps         m5, m1           ; m5 = Y[y3...ym...y0] +/- [0...1.0...0]
        movaps [tmpY + r4], m5          ; Y[max_idx] +-= 1.0;
%endmacro

;
; We need one more register for
; PIC relative addressing. Use this
; to count it in cglobal
;
%ifdef PIC
  %define num_pic_regs 1
%else
  %define num_pic_regs 0
%endif

;
; Pyramid Vector Quantization Search implementation
;
;   float * inX   - Unaligned (SIMD) access, it will be overread,
;                   but extra data is masked away.
;   int32 * outY  - Should be aligned and padded buffer.
;                   It is used as temp buffer.
;   uint32 K      - Number of pulses to have after quantizations.
;   uint32 N      - Number of vector elements.
;                   Must be 0 < N < 256
;
; Returns (float) Syy, the sum of y[i]*y[i] of the quantized vector
; (1.0 when the input is all zeros). Returned in st0 on x86-32 and in
; xmm0 on x86-64.
;
; %1 - suffix appended to the function name (pvq_search%1).
;
; Internal register use:
;   r4      - scratch byte offset / loop counter
;   r5      - base of const_align_abs_edge, only when PIC is defined
;   m6, m7  - Syy and Sxy accumulators
;   m8-m10  - cached constants when more than 8 vector registers exist
;   tmpX    - abs(X[]) copy on the stack (256*4 bytes reserved by cglobal)
;   tmpY    - outY itself, holding float pulses until the final conversion
;
%macro PVQ_FAST_SEARCH 1
cglobal pvq_search%1, 4, 5+num_pic_regs, 11, 256*4, inX, outY, K, N
%define tmpX rsp
%define tmpY outYq

        movaps     m0, [const_float_abs_mask]
        shl        Nd, 2    ; N *= sizeof(float); also 32 bit operation zeroes the high 32 bits in 64 bit mode.
        mov       r4d, Nd

        neg       r4d
        and       r4d, mmsize-1         ; r4 = number of padding bytes needed to reach a multiple of mmsize

        SET_PIC_BASE lea, r5, const_align_abs_edge    ; rip+const
        movups     m2, [pic_base_const_align_abs_edge + r4 - mmsize] ; abs mask for valid lanes, 0 for padding lanes

        add        Nd, r4d              ; N = align(N, mmsize)

        lea       r4d, [Nd - mmsize]    ; N is rounded up (aligned up) to mmsize, so r4 can't become negative here, unless N=0.
        movups     m1, [inXq + r4]
        andps      m1, m2
        movaps  [tmpX + r4], m1         ; Sx = abs( X[N-1] ), padding lanes zeroed

align 16
%%loop_abs_sum:
        sub       r4d, mmsize
        jc          %%end_loop_abs_sum

        movups     m2, [inXq + r4]
        andps      m2, m0

        movaps  [tmpX + r4], m2 ; tmpX[i]=abs(X[i])
        addps      m1, m2       ; Sx += abs(X[i])
        jmp         %%loop_abs_sum

align 16
%%end_loop_abs_sum:

        HSUMPS     m1, m2       ; m1  = Sx

        xorps      m0, m0
        comiss    xm0, xm1      ; compare 0.0 with Sx
        jz   %%zero_input       ; if (Sx==0) goto zero_input

        cvtsi2ss  xm0, dword Kd ; m0 = K
%if USE_APPROXIMATION == 1
        rcpss     xm1, xm1      ; m1 = approx(1/Sx)
        mulss     xm0, xm1      ; m0 = K*(1/Sx)
%else
        divss     xm0, xm1      ; m0 = K/Sx
%endif

        VBROADCASTSS  m0, xm0

        lea       r4d, [Nd - mmsize]
        pxor       m5, m5             ; Sy    ( Sum of abs( y[i]) )
        xorps      m6, m6             ; Syy   ( Sum of y[i]*y[i]  )
        xorps      m7, m7             ; Sxy   ( Sum of X[i]*y[i]  )
align 16
%%loop_guess:
        movaps     m1, [tmpX + r4]    ; m1   = X[i]
        mulps      m2, m0, m1         ; m2   = res*X[i]
        cvtps2dq   m2, m2             ; yt   = (int)lrintf( res*X[i] )
        paddd      m5, m2             ; Sy  += yt
        cvtdq2ps   m2, m2             ; yt   = (float)yt
        mulps      m1, m2             ; m1   = X[i]*yt
        movaps [tmpY + r4], m2        ; y[i] = m2
        addps      m7, m1             ; Sxy += m1;
        mulps      m2, m2             ; m2   = yt*yt
        addps      m6, m2             ; Syy += m2

        sub       r4d, mmsize
        jnc  %%loop_guess

        HSUMPS     m6, m1       ; Syy_norm
        HADDD      m5, m4       ; pulses

        movd  dword r4d, xm5    ; zero extends to the rest of r4q

        sub        Kd, r4d      ; K -= pulses , also 32 bit operation zeroes high 32 bit in 64 bit mode.
        jz   %%finish           ; K - pulses == 0

        SET_HI_REG_MM_CONSTANT movaps,  m8, const_float_0_5
        SET_HI_REG_MM_CONSTANT movaps,  m9, const_float_1
        SET_HI_REG_MM_CONSTANT movdqa, m10, const_int32_offsets
        ; Use Syy/2 in distortion parameter calculations.
        ; Saves pre and post-calculation to correct Y[] values.
        ; Same precision, since float mantissa is normalized.
        ; The SQRT approximation does differ.
        HSUMPS     m7, m0       ; Sxy_norm
        mulps      m6, mm_const_float_0_5

        ; The jc below consumes the carry produced by "sub Kd, r4d" above;
        ; it relies on nothing in between modifying the flags
        ; (NOTE(review): HSUMPS comes from x86util.asm - confirm it expands to SIMD ops only).
        jc   %%remove_pulses_loop   ; K - pulses < 0

align 16                            ; K - pulses > 0
%%add_pulses_loop:

        PULSES_SEARCH add   ; m6 Syy_norm ; m7 Sxy_norm

        sub        Kd, 1
        jnz  %%add_pulses_loop

        addps      m6, m6 ; Syy*=2

        jmp        %%finish

align 16
%%remove_pulses_loop:

        PULSES_SEARCH sub   ; m6 Syy_norm ; m7 Sxy_norm

        add        Kd, 1
        jnz  %%remove_pulses_loop

        addps      m6, m6 ; Syy*=2

align 16
%%finish:
        lea       r4d, [Nd - mmsize]
        movaps     m2, [const_float_sign_mask]

align 16
%%restore_sign_loop:
        movaps     m0, [tmpY + r4]    ; m0 = Y[i]
        movups     m1, [inXq + r4]    ; m1 = X[i]
        andps      m1, m2             ; m1 = sign(X[i])
        orps       m0, m1             ; m0 = Y[i]*sign
        cvtps2dq   m3, m0             ; m3 = (int)m0
        movaps  [outYq + r4], m3

        sub       r4d, mmsize
        jnc  %%restore_sign_loop
%%return:

%if ARCH_X86_64 == 0    ; x86-32: the float result is handed back on the x87 stack
        movss     r0m, xm6      ; return (float)Syy_norm
        fld dword r0m
%else
        movaps     m0, m6       ; return (float)Syy_norm
%endif

        RET

align 16
%%zero_input:
        lea       r4d, [Nd - mmsize]
        xorps      m0, m0
%%zero_loop:
        movaps  [outYq + r4], m0      ; outY[i] = 0
        sub       r4d, mmsize
        jnc  %%zero_loop

        movaps     m6, [const_float_1] ; return value for all-zero input is 1.0
        jmp  %%return
%endmacro

; if 1, use a float op that give half precision but execute for around 3 cycles.
; On Skylake & Ryzen the division is much faster (around 11c/3),
; that makes the full precision code about 2% slower.
; Opus also does use rsqrt approximation in their intrinsics code.
%define USE_APPROXIMATION 1

INIT_XMM sse2
PVQ_FAST_SEARCH _approx

INIT_XMM sse4
PVQ_FAST_SEARCH _approx

%define USE_APPROXIMATION 0

INIT_XMM avx
PVQ_FAST_SEARCH _exact