1cabdff1aSopenharmony_ci;******************************************************************************
2cabdff1aSopenharmony_ci;* Copyright (c) 2012 Michael Niedermayer
3cabdff1aSopenharmony_ci;* Copyright (c) 2014 James Almer <jamrial <at> gmail.com>
4cabdff1aSopenharmony_ci;* Copyright (c) 2014 Ronald S. Bultje <rsbultje@gmail.com>
5cabdff1aSopenharmony_ci;*
6cabdff1aSopenharmony_ci;* This file is part of FFmpeg.
7cabdff1aSopenharmony_ci;*
8cabdff1aSopenharmony_ci;* FFmpeg is free software; you can redistribute it and/or
9cabdff1aSopenharmony_ci;* modify it under the terms of the GNU Lesser General Public
10cabdff1aSopenharmony_ci;* License as published by the Free Software Foundation; either
11cabdff1aSopenharmony_ci;* version 2.1 of the License, or (at your option) any later version.
12cabdff1aSopenharmony_ci;*
13cabdff1aSopenharmony_ci;* FFmpeg is distributed in the hope that it will be useful,
14cabdff1aSopenharmony_ci;* but WITHOUT ANY WARRANTY; without even the implied warranty of
15cabdff1aSopenharmony_ci;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
16cabdff1aSopenharmony_ci;* Lesser General Public License for more details.
17cabdff1aSopenharmony_ci;*
18cabdff1aSopenharmony_ci;* You should have received a copy of the GNU Lesser General Public
19cabdff1aSopenharmony_ci;* License along with FFmpeg; if not, write to the Free Software
20cabdff1aSopenharmony_ci;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
21cabdff1aSopenharmony_ci;******************************************************************************
22cabdff1aSopenharmony_ci
23cabdff1aSopenharmony_ci%include "libavutil/x86/x86util.asm"
24cabdff1aSopenharmony_ci
; 'pointer' is the reservation directive used for pointer-sized struc fields:
; a qword (resq) when ARCH_X86_64 is set, a dword (resd) otherwise.
25cabdff1aSopenharmony_ci%if ARCH_X86_64
26cabdff1aSopenharmony_ci%define pointer resq
27cabdff1aSopenharmony_ci%else
28cabdff1aSopenharmony_ci%define pointer resd
29cabdff1aSopenharmony_ci%endif
30cabdff1aSopenharmony_ci
; Leading fields of the ResampleContext that the resample functions receive as
; their 'ctx' argument. Only the fields up to phase_count are declared; the
; resulting offsets must agree with the definition used by the callers
; (presumably the C-side struct - verify whenever that layout changes).
31cabdff1aSopenharmony_cistruc ResampleContext
32cabdff1aSopenharmony_ci    .av_class:              pointer 1  ; not referenced in the code visible here
33cabdff1aSopenharmony_ci    .filter_bank:           pointer 1  ; base address of the filter coefficients
34cabdff1aSopenharmony_ci    .filter_length:         resd 1  ; shifted left by log2_bps to get the inner-loop byte count
35cabdff1aSopenharmony_ci    .filter_alloc:          resd 1  ; multiplied by index to select one filter inside filter_bank
36cabdff1aSopenharmony_ci    .ideal_dst_incr:        resd 1  ; not referenced in the code visible here
37cabdff1aSopenharmony_ci    .dst_incr:              resd 1  ; not referenced in the code visible here
38cabdff1aSopenharmony_ci    .dst_incr_div:          resd 1  ; added to index after every output sample
39cabdff1aSopenharmony_ci    .dst_incr_mod:          resd 1  ; added to frac after every output sample
40cabdff1aSopenharmony_ci    .index:                 resd 1  ; current filter index; wrapped at phase_count
41cabdff1aSopenharmony_ci    .frac:                  resd 1  ; fractional accumulator; wrapped at src_incr
42cabdff1aSopenharmony_ci    .src_incr:              resd 1  ; wrap limit for frac
43cabdff1aSopenharmony_ci    .compensation_distance: resd 1  ; not referenced in the code visible here
44cabdff1aSopenharmony_ci    .phase_count:           resd 1  ; wrap limit for index
45cabdff1aSopenharmony_ci
46cabdff1aSopenharmony_ci    ; there are a few more fields after these, but only the leading ones are needed here
47cabdff1aSopenharmony_ciendstruc
48cabdff1aSopenharmony_ci
; Read-only constants. pf_1 and pdbl_1 are not referenced by name in the code
; visible here; presumably they are passed as the "1.0 constant" macro argument
; (%5) of RESAMPLE_FNS - confirm against the macro instantiations.
49cabdff1aSopenharmony_ciSECTION_RODATA
50cabdff1aSopenharmony_ci
51cabdff1aSopenharmony_cipf_1:      dd 1.0  ; 1.0 as a single-precision float
52cabdff1aSopenharmony_cipdbl_1:    dq 1.0  ; 1.0 as a double-precision float
53cabdff1aSopenharmony_cipd_0x4000: dd 0x4000  ; initial int16 accumulator: half of 1<<15, so the later >>15 rounds to nearest
54cabdff1aSopenharmony_ci
55cabdff1aSopenharmony_ciSECTION .text
56cabdff1aSopenharmony_ci
57cabdff1aSopenharmony_ci; FIXME remove unneeded variables (index_incr, phase_mask)
58cabdff1aSopenharmony_ci%macro RESAMPLE_FNS 3-5 ; format [float or int16], bps, log2_bps, float op suffix [s or d], 1.0 constant
59cabdff1aSopenharmony_ci; int resample_common_$format(ResampleContext *ctx, $format *dst,
60cabdff1aSopenharmony_ci;                             const $format *src, int size, int update_ctx)
;
; Writes 'size' output samples to dst. Each output sample is the dot product
; of the source samples at src with one filter taken from
; filter_bank + index * filter_alloc (in elements); no blending between two
; filters is done in this function.
; After every output sample: index += dst_incr_div and frac += dst_incr_mod;
; when frac reaches src_incr it is reduced by src_incr and index is bumped once
; more; while index >= phase_count, index -= phase_count and src advances by
; one sample.
; If update_ctx is non-zero, frac and index are stored back into ctx and the
; return value is the number of source samples consumed. Otherwise the return
; register is not set (see the note before .skip_store).
61cabdff1aSopenharmony_ci%if ARCH_X86_64 ; unix64 and win64
62cabdff1aSopenharmony_cicglobal resample_common_%1, 0, 15, 2, ctx, dst, src, phase_count, index, frac, \
63cabdff1aSopenharmony_ci                                      dst_incr_mod, size, min_filter_count_x4, \
64cabdff1aSopenharmony_ci                                      min_filter_len_x4, dst_incr_div, src_incr, \
65cabdff1aSopenharmony_ci                                      phase_mask, dst_end, filter_bank
66cabdff1aSopenharmony_ci
67cabdff1aSopenharmony_ci    ; use red-zone for variable storage
    ; NOTE(review): these below-rsp slots are used on win64 as well, whose ABI
    ; defines no red zone - confirm this is intended.
68cabdff1aSopenharmony_ci%define ctx_stackq            [rsp-0x8]
69cabdff1aSopenharmony_ci%define src_stackq            [rsp-0x10]
70cabdff1aSopenharmony_ci%if WIN64
71cabdff1aSopenharmony_ci%define update_context_stackd r4m
72cabdff1aSopenharmony_ci%else ; unix64
73cabdff1aSopenharmony_ci%define update_context_stackd [rsp-0x14]
74cabdff1aSopenharmony_ci%endif
75cabdff1aSopenharmony_ci
76cabdff1aSopenharmony_ci    ; load as many variables in registers as possible; for the rest, store
77cabdff1aSopenharmony_ci    ; on stack so that we have 'ctx' available as one extra register
78cabdff1aSopenharmony_ci    mov                        sized, r3d  ; 'size' arrives in r3, which is named phase_count here
79cabdff1aSopenharmony_ci%if UNIX64
80cabdff1aSopenharmony_ci    mov        update_context_stackd, r4d  ; spill update_ctx: r4 is reused as 'index' next
81cabdff1aSopenharmony_ci%endif
82cabdff1aSopenharmony_ci    mov                       indexd, [ctxq+ResampleContext.index]
83cabdff1aSopenharmony_ci    mov                        fracd, [ctxq+ResampleContext.frac]
84cabdff1aSopenharmony_ci    mov                dst_incr_modd, [ctxq+ResampleContext.dst_incr_mod]
85cabdff1aSopenharmony_ci    mov                 filter_bankq, [ctxq+ResampleContext.filter_bank]
86cabdff1aSopenharmony_ci    mov                    src_incrd, [ctxq+ResampleContext.src_incr]
87cabdff1aSopenharmony_ci    mov                   ctx_stackq, ctxq  ; keep ctx for the write-back at the end
88cabdff1aSopenharmony_ci    mov           min_filter_len_x4d, [ctxq+ResampleContext.filter_length]
89cabdff1aSopenharmony_ci    mov                dst_incr_divd, [ctxq+ResampleContext.dst_incr_div]
90cabdff1aSopenharmony_ci    shl           min_filter_len_x4d, %3  ; filter length in bytes (<< log2_bps)
91cabdff1aSopenharmony_ci    lea                     dst_endq, [dstq+sizeq*%2]  ; dst_end = dst + size * bps
92cabdff1aSopenharmony_ci
93cabdff1aSopenharmony_ci%if UNIX64
94cabdff1aSopenharmony_ci    mov                          ecx, [ctxq+ResampleContext.phase_count]  ; ecx is named phase_count below
95cabdff1aSopenharmony_ci    mov                          edi, [ctxq+ResampleContext.filter_alloc]  ; edi is named filter_alloc below; ctx stays in ctx_stackq
96cabdff1aSopenharmony_ci
97cabdff1aSopenharmony_ci    DEFINE_ARGS filter_alloc, dst, src, phase_count, index, frac, dst_incr_mod, \
98cabdff1aSopenharmony_ci                filter, min_filter_count_x4, min_filter_len_x4, dst_incr_div, \
99cabdff1aSopenharmony_ci                src_incr, phase_mask, dst_end, filter_bank
100cabdff1aSopenharmony_ci%elif WIN64
101cabdff1aSopenharmony_ci    mov                          R9d, [ctxq+ResampleContext.filter_alloc]  ; R9d is named filter_alloc below
102cabdff1aSopenharmony_ci    mov                          ecx, [ctxq+ResampleContext.phase_count]  ; ecx is named phase_count below; this is the last read through ctxq
103cabdff1aSopenharmony_ci
104cabdff1aSopenharmony_ci    DEFINE_ARGS phase_count, dst, src, filter_alloc, index, frac, dst_incr_mod, \
105cabdff1aSopenharmony_ci                filter, min_filter_count_x4, min_filter_len_x4, dst_incr_div, \
106cabdff1aSopenharmony_ci                src_incr, phase_mask, dst_end, filter_bank
107cabdff1aSopenharmony_ci%endif
108cabdff1aSopenharmony_ci
    ; Bias filter_bank and src forward by the filter length in bytes so the
    ; inner loop can index them with a negative byte counter that runs up
    ; towards zero.
109cabdff1aSopenharmony_ci    neg           min_filter_len_x4q
110cabdff1aSopenharmony_ci    sub                 filter_bankq, min_filter_len_x4q
111cabdff1aSopenharmony_ci    sub                         srcq, min_filter_len_x4q
112cabdff1aSopenharmony_ci    mov                   src_stackq, srcq  ; starting (biased) src, used to compute the return value
113cabdff1aSopenharmony_ci%else ; x86-32
114cabdff1aSopenharmony_cicglobal resample_common_%1, 1, 7, 2, ctx, phase_count, dst, frac, \
115cabdff1aSopenharmony_ci                                     index, min_filter_length_x4, filter_bank
116cabdff1aSopenharmony_ci
117cabdff1aSopenharmony_ci    ; push temp variables to stack
    ; ctx, src and update_ctx are accessed through the r0/r2/r4 argument
    ; locations (x86inc rNm/rNmp names)
118cabdff1aSopenharmony_ci%define ctx_stackq            r0mp
119cabdff1aSopenharmony_ci%define src_stackq            r2mp
120cabdff1aSopenharmony_ci%define update_context_stackd r4m
121cabdff1aSopenharmony_ci
122cabdff1aSopenharmony_ci    mov                         dstq, r1mp
123cabdff1aSopenharmony_ci    mov                           r3, r3mp  ; r3 = size
124cabdff1aSopenharmony_ci    lea                           r3, [dstq+r3*%2]  ; r3 = dst + size * bps (end of output)
    ; the push order below must match the [rsp+N] definitions that follow
125cabdff1aSopenharmony_ci    PUSH                              dword [ctxq+ResampleContext.dst_incr_div]
126cabdff1aSopenharmony_ci    PUSH                              dword [ctxq+ResampleContext.dst_incr_mod]
127cabdff1aSopenharmony_ci    PUSH                              dword [ctxq+ResampleContext.filter_alloc]
128cabdff1aSopenharmony_ci    PUSH                              r3
129cabdff1aSopenharmony_ci    PUSH                              dword [ctxq+ResampleContext.phase_count]  ; unneeded replacement for phase_mask
130cabdff1aSopenharmony_ci    PUSH                              dword [ctxq+ResampleContext.src_incr]
131cabdff1aSopenharmony_ci    mov        min_filter_length_x4d, [ctxq+ResampleContext.filter_length]
132cabdff1aSopenharmony_ci    mov                       indexd, [ctxq+ResampleContext.index]
133cabdff1aSopenharmony_ci    shl        min_filter_length_x4d, %3  ; filter length in bytes (<< log2_bps)
134cabdff1aSopenharmony_ci    mov                        fracd, [ctxq+ResampleContext.frac]
135cabdff1aSopenharmony_ci    neg        min_filter_length_x4q  ; negative byte count, see the inner loop
136cabdff1aSopenharmony_ci    mov                 filter_bankq, [ctxq+ResampleContext.filter_bank]
137cabdff1aSopenharmony_ci    sub                         r2mp, min_filter_length_x4q  ; bias the src argument in place: src += filter length in bytes
138cabdff1aSopenharmony_ci    sub                 filter_bankq, min_filter_length_x4q  ; same bias for the filter base
139cabdff1aSopenharmony_ci    PUSH                              min_filter_length_x4q
140cabdff1aSopenharmony_ci    PUSH                              filter_bankq
141cabdff1aSopenharmony_ci    mov                 phase_countd, [ctxq+ResampleContext.phase_count]
142cabdff1aSopenharmony_ci
143cabdff1aSopenharmony_ci    DEFINE_ARGS src, phase_count, dst, frac, index, min_filter_count_x4, filter
144cabdff1aSopenharmony_ci
    ; stack-resident variables, in reverse order of the pushes above
145cabdff1aSopenharmony_ci%define filter_bankq          dword [rsp+0x0]
146cabdff1aSopenharmony_ci%define min_filter_length_x4q dword [rsp+0x4]
147cabdff1aSopenharmony_ci%define src_incrd             dword [rsp+0x8]
148cabdff1aSopenharmony_ci%define phase_maskd           dword [rsp+0xc]
149cabdff1aSopenharmony_ci%define dst_endq              dword [rsp+0x10]
150cabdff1aSopenharmony_ci%define filter_allocd         dword [rsp+0x14]
151cabdff1aSopenharmony_ci%define dst_incr_modd         dword [rsp+0x18]
152cabdff1aSopenharmony_ci%define dst_incr_divd         dword [rsp+0x1c]
153cabdff1aSopenharmony_ci
154cabdff1aSopenharmony_ci    mov                         srcq, r2mp  ; load the biased src; srcq now names the register that held ctx
155cabdff1aSopenharmony_ci%endif
156cabdff1aSopenharmony_ci
157cabdff1aSopenharmony_ci.loop:  ; one iteration per output sample
158cabdff1aSopenharmony_ci    mov                      filterd, filter_allocd
159cabdff1aSopenharmony_ci    imul                     filterd, indexd  ; element offset of the filter selected by index
160cabdff1aSopenharmony_ci%if ARCH_X86_64
161cabdff1aSopenharmony_ci    mov         min_filter_count_x4q, min_filter_len_x4q  ; reset the negative byte counter
162cabdff1aSopenharmony_ci    lea                      filterq, [filter_bankq+filterq*%2]  ; filter = filter_bank + offset * bps
163cabdff1aSopenharmony_ci%else ; x86-32
164cabdff1aSopenharmony_ci    mov         min_filter_count_x4q, filter_bankq  ; counter register doubles as a temp for the stack-held base
165cabdff1aSopenharmony_ci    lea                      filterq, [min_filter_count_x4q+filterq*%2]  ; filter = filter_bank + offset * bps
166cabdff1aSopenharmony_ci    mov         min_filter_count_x4q, min_filter_length_x4q  ; reset the negative byte counter
167cabdff1aSopenharmony_ci%endif
168cabdff1aSopenharmony_ci%ifidn %1, int16
169cabdff1aSopenharmony_ci    movd                          m0, [pd_0x4000]  ; accumulator starts at the rounding bias
170cabdff1aSopenharmony_ci%else ; float/double
171cabdff1aSopenharmony_ci    xorps                         m0, m0, m0  ; accumulator = 0
172cabdff1aSopenharmony_ci%endif
173cabdff1aSopenharmony_ci
174cabdff1aSopenharmony_ci    align 16
175cabdff1aSopenharmony_ci.inner_loop:  ; accumulate src * filter, mmsize bytes per iteration
176cabdff1aSopenharmony_ci    movu                          m1, [srcq+min_filter_count_x4q*1]
177cabdff1aSopenharmony_ci%ifidn %1, int16
178cabdff1aSopenharmony_ci%if cpuflag(xop)
179cabdff1aSopenharmony_ci    vpmadcswd                     m0, m1, [filterq+min_filter_count_x4q*1], m0
180cabdff1aSopenharmony_ci%else
181cabdff1aSopenharmony_ci    pmaddwd                       m1, [filterq+min_filter_count_x4q*1]
182cabdff1aSopenharmony_ci    paddd                         m0, m1
183cabdff1aSopenharmony_ci%endif
184cabdff1aSopenharmony_ci%else ; float/double
185cabdff1aSopenharmony_ci%if cpuflag(fma4) || cpuflag(fma3)
186cabdff1aSopenharmony_ci    fmaddp%4                      m0, m1, [filterq+min_filter_count_x4q*1], m0
187cabdff1aSopenharmony_ci%else
188cabdff1aSopenharmony_ci    mulp%4                        m1, m1, [filterq+min_filter_count_x4q*1]
189cabdff1aSopenharmony_ci    addp%4                        m0, m0, m1
190cabdff1aSopenharmony_ci%endif ; cpuflag
191cabdff1aSopenharmony_ci%endif
192cabdff1aSopenharmony_ci    add         min_filter_count_x4q, mmsize
193cabdff1aSopenharmony_ci    js .inner_loop  ; continue while the byte counter is still negative
194cabdff1aSopenharmony_ci
195cabdff1aSopenharmony_ci%ifidn %1, int16
196cabdff1aSopenharmony_ci    HADDD                         m0, m1  ; x86util macro; presumably sums the dword lanes of m0 with m1 as scratch
197cabdff1aSopenharmony_ci    psrad                         m0, 15  ; scale the sum down by 15 bits
198cabdff1aSopenharmony_ci    add                        fracd, dst_incr_modd
199cabdff1aSopenharmony_ci    packssdw                      m0, m0  ; saturate to signed 16 bit
200cabdff1aSopenharmony_ci    add                       indexd, dst_incr_divd
    ; NOTE(review): movd stores 4 bytes while dst advances by %2 per sample
    ; (presumably 2 for int16), so the store reaches into the following output
    ; slot, and past dst_end on the last sample - confirm dst has room.
201cabdff1aSopenharmony_ci    movd                      [dstq], m0
202cabdff1aSopenharmony_ci%else ; float/double
203cabdff1aSopenharmony_ci    ; horizontal sum & store
204cabdff1aSopenharmony_ci%if mmsize == 32
205cabdff1aSopenharmony_ci    vextractf128                 xm1, m0, 0x1  ; fold the upper 128 bits onto the lower
206cabdff1aSopenharmony_ci    addp%4                       xm0, xm1
207cabdff1aSopenharmony_ci%endif
208cabdff1aSopenharmony_ci    movhlps                      xm1, xm0  ; xm1 low half = xm0 high half
209cabdff1aSopenharmony_ci%ifidn %1, float
210cabdff1aSopenharmony_ci    addps                        xm0, xm1
211cabdff1aSopenharmony_ci    shufps                       xm1, xm0, xm0, q0001  ; bring element 1 down to element 0
212cabdff1aSopenharmony_ci%endif
213cabdff1aSopenharmony_ci    add                        fracd, dst_incr_modd
214cabdff1aSopenharmony_ci    addp%4                       xm0, xm1  ; element 0 now holds the full sum
215cabdff1aSopenharmony_ci    add                       indexd, dst_incr_divd
216cabdff1aSopenharmony_ci    movs%4                    [dstq], xm0  ; store one scalar result
217cabdff1aSopenharmony_ci%endif
218cabdff1aSopenharmony_ci    cmp                        fracd, src_incrd
219cabdff1aSopenharmony_ci    jl .skip  ; frac < src_incr (signed): no carry into index
220cabdff1aSopenharmony_ci    sub                        fracd, src_incrd
221cabdff1aSopenharmony_ci    inc                       indexd
222cabdff1aSopenharmony_ci
    ; rename only: the register called 'filter' so far becomes 'index_incr'
    ; for the text that follows (it is not used under that name in this function)
223cabdff1aSopenharmony_ci%if UNIX64
224cabdff1aSopenharmony_ci    DEFINE_ARGS filter_alloc, dst, src, phase_count, index, frac, dst_incr_mod, \
225cabdff1aSopenharmony_ci                index_incr, min_filter_count_x4, min_filter_len_x4, dst_incr_div, \
226cabdff1aSopenharmony_ci                src_incr, phase_mask, dst_end, filter_bank
227cabdff1aSopenharmony_ci%elif WIN64
228cabdff1aSopenharmony_ci    DEFINE_ARGS phase_count, dst, src, filter_alloc, index, frac, dst_incr_mod, \
229cabdff1aSopenharmony_ci                index_incr, min_filter_count_x4, min_filter_len_x4, dst_incr_div, \
230cabdff1aSopenharmony_ci                src_incr, phase_mask, dst_end, filter_bank
231cabdff1aSopenharmony_ci%else ; x86-32
232cabdff1aSopenharmony_ci    DEFINE_ARGS src, phase_count, dst, frac, index, index_incr
233cabdff1aSopenharmony_ci%endif
234cabdff1aSopenharmony_ci
235cabdff1aSopenharmony_ci.skip:
236cabdff1aSopenharmony_ci    add                         dstq, %2  ; next output sample
237cabdff1aSopenharmony_ci    cmp                       indexd, phase_countd
238cabdff1aSopenharmony_ci    jb .index_skip  ; index < phase_count (unsigned): src stays put
239cabdff1aSopenharmony_ci.index_while:  ; index wrapped: consume one source sample per phase_count
240cabdff1aSopenharmony_ci    sub                       indexd, phase_countd
241cabdff1aSopenharmony_ci    lea                         srcq, [srcq+%2]  ; src += bps without touching flags
242cabdff1aSopenharmony_ci    cmp                       indexd, phase_countd
243cabdff1aSopenharmony_ci    jnb .index_while
244cabdff1aSopenharmony_ci.index_skip:
245cabdff1aSopenharmony_ci    cmp                         dstq, dst_endq
246cabdff1aSopenharmony_ci    jne .loop  ; until dst reaches dst + size * bps
247cabdff1aSopenharmony_ci
    ; rename only: give the epilogue a register called ctx again
248cabdff1aSopenharmony_ci%if ARCH_X86_64
249cabdff1aSopenharmony_ci    DEFINE_ARGS ctx, dst, src, phase_count, index, frac
250cabdff1aSopenharmony_ci%else ; x86-32
251cabdff1aSopenharmony_ci    DEFINE_ARGS src, ctx, update_context, frac, index
252cabdff1aSopenharmony_ci%endif
253cabdff1aSopenharmony_ci
254cabdff1aSopenharmony_ci    cmp  dword update_context_stackd, 0
255cabdff1aSopenharmony_ci    jz .skip_store
256cabdff1aSopenharmony_ci    ; strictly speaking, the function should always return the consumed
257cabdff1aSopenharmony_ci    ; number of bytes; however, we only use the value if update_context
258cabdff1aSopenharmony_ci    ; is true, so let's just leave it uninitialized otherwise
259cabdff1aSopenharmony_ci    mov                         ctxq, ctx_stackq  ; reload the ctx pointer saved at entry
260cabdff1aSopenharmony_ci    movifnidn                    rax, srcq  ; rax = current src (no move is emitted if srcq already is rax)
261cabdff1aSopenharmony_ci    mov [ctxq+ResampleContext.frac ], fracd
262cabdff1aSopenharmony_ci    sub                          rax, src_stackq  ; bytes of source consumed
263cabdff1aSopenharmony_ci    mov [ctxq+ResampleContext.index], indexd
264cabdff1aSopenharmony_ci    shr                          rax, %3  ; bytes -> samples (>> log2_bps)
265cabdff1aSopenharmony_ci
266cabdff1aSopenharmony_ci.skip_store:
267cabdff1aSopenharmony_ci%if ARCH_X86_32
268cabdff1aSopenharmony_ci    ADD                          rsp, 0x20  ; drop the eight dwords pushed in the prologue
269cabdff1aSopenharmony_ci%endif
270cabdff1aSopenharmony_ci    RET
271cabdff1aSopenharmony_ci
272cabdff1aSopenharmony_ci; int resample_linear_$format(ResampleContext *ctx, $format *dst,
273cabdff1aSopenharmony_ci;                             const $format *src, int size, int update_ctx)
274cabdff1aSopenharmony_ci%if ARCH_X86_64 ; unix64 and win64
275cabdff1aSopenharmony_ci%if UNIX64
276cabdff1aSopenharmony_cicglobal resample_linear_%1, 0, 15, 5, ctx, dst, phase_mask, phase_count, index, frac, \
277cabdff1aSopenharmony_ci                                      size, dst_incr_mod, min_filter_count_x4, \
278cabdff1aSopenharmony_ci                                      min_filter_len_x4, dst_incr_div, src_incr, \
279cabdff1aSopenharmony_ci                                      src, dst_end, filter_bank
280cabdff1aSopenharmony_ci
281cabdff1aSopenharmony_ci    mov                         srcq, r2mp
282cabdff1aSopenharmony_ci%else ; win64
283cabdff1aSopenharmony_cicglobal resample_linear_%1, 0, 15, 5, ctx, phase_mask, src, phase_count, index, frac, \
284cabdff1aSopenharmony_ci                                      size, dst_incr_mod, min_filter_count_x4, \
285cabdff1aSopenharmony_ci                                      min_filter_len_x4, dst_incr_div, src_incr, \
286cabdff1aSopenharmony_ci                                      dst, dst_end, filter_bank
287cabdff1aSopenharmony_ci
288cabdff1aSopenharmony_ci    mov                         dstq, r1mp
289cabdff1aSopenharmony_ci%endif
290cabdff1aSopenharmony_ci
291cabdff1aSopenharmony_ci    ; use red-zone for variable storage
292cabdff1aSopenharmony_ci%define ctx_stackq            [rsp-0x8]
293cabdff1aSopenharmony_ci%define src_stackq            [rsp-0x10]
294cabdff1aSopenharmony_ci%define phase_mask_stackd     [rsp-0x14]
295cabdff1aSopenharmony_ci%if WIN64
296cabdff1aSopenharmony_ci%define update_context_stackd r4m
297cabdff1aSopenharmony_ci%else ; unix64
298cabdff1aSopenharmony_ci%define update_context_stackd [rsp-0x18]
299cabdff1aSopenharmony_ci%endif
300cabdff1aSopenharmony_ci
301cabdff1aSopenharmony_ci    ; load as many variables in registers as possible; for the rest, store
302cabdff1aSopenharmony_ci    ; on stack so that we have 'ctx' available as one extra register
303cabdff1aSopenharmony_ci    mov                        sized, r3d
304cabdff1aSopenharmony_ci%if UNIX64
305cabdff1aSopenharmony_ci    mov        update_context_stackd, r4d
306cabdff1aSopenharmony_ci%endif
307cabdff1aSopenharmony_ci    mov                       indexd, [ctxq+ResampleContext.index]
308cabdff1aSopenharmony_ci    mov                        fracd, [ctxq+ResampleContext.frac]
309cabdff1aSopenharmony_ci    mov                dst_incr_modd, [ctxq+ResampleContext.dst_incr_mod]
310cabdff1aSopenharmony_ci    mov                 filter_bankq, [ctxq+ResampleContext.filter_bank]
311cabdff1aSopenharmony_ci    mov                    src_incrd, [ctxq+ResampleContext.src_incr]
312cabdff1aSopenharmony_ci    mov                   ctx_stackq, ctxq
313cabdff1aSopenharmony_ci    mov           min_filter_len_x4d, [ctxq+ResampleContext.filter_length]
314cabdff1aSopenharmony_ci%ifidn %1, int16
315cabdff1aSopenharmony_ci    movd                          m4, [pd_0x4000]
316cabdff1aSopenharmony_ci%else ; float/double
317cabdff1aSopenharmony_ci    cvtsi2s%4                    xm0, src_incrd
318cabdff1aSopenharmony_ci    movs%4                       xm4, [%5]
319cabdff1aSopenharmony_ci    divs%4                       xm4, xm0
320cabdff1aSopenharmony_ci%endif
321cabdff1aSopenharmony_ci    mov                dst_incr_divd, [ctxq+ResampleContext.dst_incr_div]
322cabdff1aSopenharmony_ci    shl           min_filter_len_x4d, %3
323cabdff1aSopenharmony_ci    lea                     dst_endq, [dstq+sizeq*%2]
324cabdff1aSopenharmony_ci
325cabdff1aSopenharmony_ci%if UNIX64
326cabdff1aSopenharmony_ci    mov                          ecx, [ctxq+ResampleContext.phase_count]
327cabdff1aSopenharmony_ci    mov                          edi, [ctxq+ResampleContext.filter_alloc]
328cabdff1aSopenharmony_ci
329cabdff1aSopenharmony_ci    DEFINE_ARGS filter_alloc, dst, filter2, phase_count, index, frac, filter1, \
330cabdff1aSopenharmony_ci                dst_incr_mod, min_filter_count_x4, min_filter_len_x4, \
331cabdff1aSopenharmony_ci                dst_incr_div, src_incr, src, dst_end, filter_bank
332cabdff1aSopenharmony_ci%elif WIN64
333cabdff1aSopenharmony_ci    mov                          R9d, [ctxq+ResampleContext.filter_alloc]
334cabdff1aSopenharmony_ci    mov                          ecx, [ctxq+ResampleContext.phase_count]
335cabdff1aSopenharmony_ci
336cabdff1aSopenharmony_ci    DEFINE_ARGS phase_count, filter2, src, filter_alloc, index, frac, filter1, \
337cabdff1aSopenharmony_ci                dst_incr_mod, min_filter_count_x4, min_filter_len_x4, \
338cabdff1aSopenharmony_ci                dst_incr_div, src_incr, dst, dst_end, filter_bank
339cabdff1aSopenharmony_ci%endif
340cabdff1aSopenharmony_ci
341cabdff1aSopenharmony_ci    neg           min_filter_len_x4q
342cabdff1aSopenharmony_ci    sub                 filter_bankq, min_filter_len_x4q
343cabdff1aSopenharmony_ci    sub                         srcq, min_filter_len_x4q
344cabdff1aSopenharmony_ci    mov                   src_stackq, srcq
345cabdff1aSopenharmony_ci%else ; x86-32
346cabdff1aSopenharmony_cicglobal resample_linear_%1, 1, 7, 5, ctx, min_filter_length_x4, filter2, \
347cabdff1aSopenharmony_ci                                     frac, index, dst, filter_bank
348cabdff1aSopenharmony_ci
349cabdff1aSopenharmony_ci    ; push temp variables to stack
350cabdff1aSopenharmony_ci%define ctx_stackq            r0mp
351cabdff1aSopenharmony_ci%define src_stackq            r2mp
352cabdff1aSopenharmony_ci%define update_context_stackd r4m
353cabdff1aSopenharmony_ci
354cabdff1aSopenharmony_ci    mov                         dstq, r1mp
355cabdff1aSopenharmony_ci    mov                           r3, r3mp
356cabdff1aSopenharmony_ci    lea                           r3, [dstq+r3*%2]
357cabdff1aSopenharmony_ci    PUSH                              dword [ctxq+ResampleContext.dst_incr_div]
358cabdff1aSopenharmony_ci    PUSH                              r3
359cabdff1aSopenharmony_ci    mov                           r3, dword [ctxq+ResampleContext.filter_alloc]
360cabdff1aSopenharmony_ci    PUSH                              dword [ctxq+ResampleContext.dst_incr_mod]
361cabdff1aSopenharmony_ci    PUSH                              r3
362cabdff1aSopenharmony_ci    shl                           r3, %3
363cabdff1aSopenharmony_ci    PUSH                              r3
364cabdff1aSopenharmony_ci    mov                           r3, dword [ctxq+ResampleContext.src_incr]
365cabdff1aSopenharmony_ci    PUSH                              dword [ctxq+ResampleContext.phase_count]  ; unneeded replacement of phase_mask
366cabdff1aSopenharmony_ci    PUSH                              r3d
367cabdff1aSopenharmony_ci%ifidn %1, int16
368cabdff1aSopenharmony_ci    movd                          m4, [pd_0x4000]
369cabdff1aSopenharmony_ci%else ; float/double
370cabdff1aSopenharmony_ci    cvtsi2s%4                    xm0, r3d
371cabdff1aSopenharmony_ci    movs%4                       xm4, [%5]
372cabdff1aSopenharmony_ci    divs%4                       xm4, xm0
373cabdff1aSopenharmony_ci%endif
374cabdff1aSopenharmony_ci    mov        min_filter_length_x4d, [ctxq+ResampleContext.filter_length]
375cabdff1aSopenharmony_ci    mov                       indexd, [ctxq+ResampleContext.index]
376cabdff1aSopenharmony_ci    shl        min_filter_length_x4d, %3
377cabdff1aSopenharmony_ci    mov                        fracd, [ctxq+ResampleContext.frac]
378cabdff1aSopenharmony_ci    neg        min_filter_length_x4q
379cabdff1aSopenharmony_ci    mov                 filter_bankq, [ctxq+ResampleContext.filter_bank]
380cabdff1aSopenharmony_ci    sub                         r2mp, min_filter_length_x4q
381cabdff1aSopenharmony_ci    sub                 filter_bankq, min_filter_length_x4q
382cabdff1aSopenharmony_ci    PUSH                              min_filter_length_x4q
383cabdff1aSopenharmony_ci    PUSH                              filter_bankq
384cabdff1aSopenharmony_ci    PUSH                              dword [ctxq+ResampleContext.phase_count]
385cabdff1aSopenharmony_ci
386cabdff1aSopenharmony_ci    DEFINE_ARGS filter1, min_filter_count_x4, filter2, frac, index, dst, src
387cabdff1aSopenharmony_ci
388cabdff1aSopenharmony_ci%define phase_count_stackd    dword [rsp+0x0]
389cabdff1aSopenharmony_ci%define filter_bankq          dword [rsp+0x4]
390cabdff1aSopenharmony_ci%define min_filter_length_x4q dword [rsp+0x8]
391cabdff1aSopenharmony_ci%define src_incrd             dword [rsp+0xc]
392cabdff1aSopenharmony_ci%define phase_mask_stackd     dword [rsp+0x10]
393cabdff1aSopenharmony_ci%define filter_alloc_x4q      dword [rsp+0x14]
394cabdff1aSopenharmony_ci%define filter_allocd         dword [rsp+0x18]
395cabdff1aSopenharmony_ci%define dst_incr_modd         dword [rsp+0x1c]
396cabdff1aSopenharmony_ci%define dst_endq              dword [rsp+0x20]
397cabdff1aSopenharmony_ci%define dst_incr_divd         dword [rsp+0x24]
398cabdff1aSopenharmony_ci
399cabdff1aSopenharmony_ci    mov                         srcq, r2mp
400cabdff1aSopenharmony_ci%endif
401cabdff1aSopenharmony_ci
402cabdff1aSopenharmony_ci.loop:
403cabdff1aSopenharmony_ci    mov                     filter1d, filter_allocd
404cabdff1aSopenharmony_ci    imul                    filter1d, indexd
405cabdff1aSopenharmony_ci%if ARCH_X86_64
406cabdff1aSopenharmony_ci    mov         min_filter_count_x4q, min_filter_len_x4q
407cabdff1aSopenharmony_ci    lea                     filter1q, [filter_bankq+filter1q*%2]
408cabdff1aSopenharmony_ci    lea                     filter2q, [filter1q+filter_allocq*%2]
409cabdff1aSopenharmony_ci%else ; x86-32
410cabdff1aSopenharmony_ci    mov         min_filter_count_x4q, filter_bankq
411cabdff1aSopenharmony_ci    lea                     filter1q, [min_filter_count_x4q+filter1q*%2]
412cabdff1aSopenharmony_ci    mov         min_filter_count_x4q, min_filter_length_x4q
413cabdff1aSopenharmony_ci    mov                     filter2q, filter1q
414cabdff1aSopenharmony_ci    add                     filter2q, filter_alloc_x4q
415cabdff1aSopenharmony_ci%endif
416cabdff1aSopenharmony_ci%ifidn %1, int16
417cabdff1aSopenharmony_ci    mova                          m0, m4
418cabdff1aSopenharmony_ci    mova                          m2, m4
419cabdff1aSopenharmony_ci%else ; float/double
420cabdff1aSopenharmony_ci    xorps                         m0, m0, m0
421cabdff1aSopenharmony_ci    xorps                         m2, m2, m2
422cabdff1aSopenharmony_ci%endif
423cabdff1aSopenharmony_ci
424cabdff1aSopenharmony_ci    align 16
425cabdff1aSopenharmony_ci.inner_loop:
426cabdff1aSopenharmony_ci    movu                          m1, [srcq+min_filter_count_x4q*1]
427cabdff1aSopenharmony_ci%ifidn %1, int16
428cabdff1aSopenharmony_ci%if cpuflag(xop)
429cabdff1aSopenharmony_ci    vpmadcswd                     m2, m1, [filter2q+min_filter_count_x4q*1], m2
430cabdff1aSopenharmony_ci    vpmadcswd                     m0, m1, [filter1q+min_filter_count_x4q*1], m0
431cabdff1aSopenharmony_ci%else
432cabdff1aSopenharmony_ci    pmaddwd                       m3, m1, [filter2q+min_filter_count_x4q*1]
433cabdff1aSopenharmony_ci    pmaddwd                       m1, [filter1q+min_filter_count_x4q*1]
434cabdff1aSopenharmony_ci    paddd                         m2, m3
435cabdff1aSopenharmony_ci    paddd                         m0, m1
436cabdff1aSopenharmony_ci%endif ; cpuflag
437cabdff1aSopenharmony_ci%else ; float/double
438cabdff1aSopenharmony_ci%if cpuflag(fma4) || cpuflag(fma3)
439cabdff1aSopenharmony_ci    fmaddp%4                      m2, m1, [filter2q+min_filter_count_x4q*1], m2
440cabdff1aSopenharmony_ci    fmaddp%4                      m0, m1, [filter1q+min_filter_count_x4q*1], m0
441cabdff1aSopenharmony_ci%else
442cabdff1aSopenharmony_ci    mulp%4                        m3, m1, [filter2q+min_filter_count_x4q*1]
443cabdff1aSopenharmony_ci    mulp%4                        m1, m1, [filter1q+min_filter_count_x4q*1]
444cabdff1aSopenharmony_ci    addp%4                        m2, m2, m3
445cabdff1aSopenharmony_ci    addp%4                        m0, m0, m1
446cabdff1aSopenharmony_ci%endif ; cpuflag
447cabdff1aSopenharmony_ci%endif
448cabdff1aSopenharmony_ci    add         min_filter_count_x4q, mmsize
449cabdff1aSopenharmony_ci    js .inner_loop
450cabdff1aSopenharmony_ci
451cabdff1aSopenharmony_ci%ifidn %1, int16
452cabdff1aSopenharmony_ci%if mmsize == 16
453cabdff1aSopenharmony_ci%if cpuflag(xop)
454cabdff1aSopenharmony_ci    vphadddq                      m2, m2
455cabdff1aSopenharmony_ci    vphadddq                      m0, m0
456cabdff1aSopenharmony_ci%endif
457cabdff1aSopenharmony_ci    pshufd                        m3, m2, q0032
458cabdff1aSopenharmony_ci    pshufd                        m1, m0, q0032
459cabdff1aSopenharmony_ci    paddd                         m2, m3
460cabdff1aSopenharmony_ci    paddd                         m0, m1
461cabdff1aSopenharmony_ci%endif
462cabdff1aSopenharmony_ci%if notcpuflag(xop)
463cabdff1aSopenharmony_ci    PSHUFLW                       m3, m2, q0032
464cabdff1aSopenharmony_ci    PSHUFLW                       m1, m0, q0032
465cabdff1aSopenharmony_ci    paddd                         m2, m3
466cabdff1aSopenharmony_ci    paddd                         m0, m1
467cabdff1aSopenharmony_ci%endif
468cabdff1aSopenharmony_ci    psubd                         m2, m0
469cabdff1aSopenharmony_ci    ; This is probably a really bad idea on atom and other machines with a
470cabdff1aSopenharmony_ci    ; long transfer latency between GPRs and XMMs (atom). However, it does
471cabdff1aSopenharmony_ci    ; make the clip a lot simpler...
472cabdff1aSopenharmony_ci    movd                         eax, m2
473cabdff1aSopenharmony_ci    add                       indexd, dst_incr_divd
474cabdff1aSopenharmony_ci    imul                              fracd
475cabdff1aSopenharmony_ci    idiv                              src_incrd
476cabdff1aSopenharmony_ci    movd                          m1, eax
477cabdff1aSopenharmony_ci    add                        fracd, dst_incr_modd
478cabdff1aSopenharmony_ci    paddd                         m0, m1
479cabdff1aSopenharmony_ci    psrad                         m0, 15
480cabdff1aSopenharmony_ci    packssdw                      m0, m0
481cabdff1aSopenharmony_ci    movd                      [dstq], m0
482cabdff1aSopenharmony_ci
483cabdff1aSopenharmony_ci    ; note that for imul/idiv, I need to move filter to edx/eax for each:
484cabdff1aSopenharmony_ci    ; - 32bit: eax=r0[filter1], edx=r2[filter2]
485cabdff1aSopenharmony_ci    ; - win64: eax=r6[filter1], edx=r1[todo]
486cabdff1aSopenharmony_ci    ; - unix64: eax=r6[filter1], edx=r2[todo]
487cabdff1aSopenharmony_ci%else ; float/double
488cabdff1aSopenharmony_ci    ; val += (v2 - val) * (FELEML) frac / c->src_incr;
489cabdff1aSopenharmony_ci%if mmsize == 32
490cabdff1aSopenharmony_ci    vextractf128                 xm1, m0, 0x1
491cabdff1aSopenharmony_ci    vextractf128                 xm3, m2, 0x1
492cabdff1aSopenharmony_ci    addp%4                       xm0, xm1
493cabdff1aSopenharmony_ci    addp%4                       xm2, xm3
494cabdff1aSopenharmony_ci%endif
495cabdff1aSopenharmony_ci    cvtsi2s%4                    xm1, fracd
496cabdff1aSopenharmony_ci    subp%4                       xm2, xm0
497cabdff1aSopenharmony_ci    mulp%4                       xm1, xm4
498cabdff1aSopenharmony_ci    shufp%4                      xm1, xm1, q0000
499cabdff1aSopenharmony_ci%if cpuflag(fma4) || cpuflag(fma3)
500cabdff1aSopenharmony_ci    fmaddp%4                     xm0, xm2, xm1, xm0
501cabdff1aSopenharmony_ci%else
502cabdff1aSopenharmony_ci    mulp%4                       xm2, xm1
503cabdff1aSopenharmony_ci    addp%4                       xm0, xm2
504cabdff1aSopenharmony_ci%endif ; cpuflag
505cabdff1aSopenharmony_ci
506cabdff1aSopenharmony_ci    ; horizontal sum & store
507cabdff1aSopenharmony_ci    movhlps                      xm1, xm0
508cabdff1aSopenharmony_ci%ifidn %1, float
509cabdff1aSopenharmony_ci    addps                        xm0, xm1
510cabdff1aSopenharmony_ci    shufps                       xm1, xm0, xm0, q0001
511cabdff1aSopenharmony_ci%endif
512cabdff1aSopenharmony_ci    add                        fracd, dst_incr_modd
513cabdff1aSopenharmony_ci    addp%4                       xm0, xm1
514cabdff1aSopenharmony_ci    add                       indexd, dst_incr_divd
515cabdff1aSopenharmony_ci    movs%4                    [dstq], xm0
516cabdff1aSopenharmony_ci%endif
517cabdff1aSopenharmony_ci    cmp                        fracd, src_incrd
518cabdff1aSopenharmony_ci    jl .skip
519cabdff1aSopenharmony_ci    sub                        fracd, src_incrd
520cabdff1aSopenharmony_ci    inc                       indexd
521cabdff1aSopenharmony_ci
522cabdff1aSopenharmony_ci%if UNIX64
523cabdff1aSopenharmony_ci    DEFINE_ARGS filter_alloc, dst, filter2, phase_count, index, frac, index_incr, \
524cabdff1aSopenharmony_ci                dst_incr_mod, min_filter_count_x4, min_filter_len_x4, \
525cabdff1aSopenharmony_ci                dst_incr_div, src_incr, src, dst_end, filter_bank
526cabdff1aSopenharmony_ci%elif WIN64
527cabdff1aSopenharmony_ci    DEFINE_ARGS phase_count, filter2, src, filter_alloc, index, frac, index_incr, \
528cabdff1aSopenharmony_ci                dst_incr_mod, min_filter_count_x4, min_filter_len_x4, \
529cabdff1aSopenharmony_ci                dst_incr_div, src_incr, dst, dst_end, filter_bank
530cabdff1aSopenharmony_ci%else ; x86-32
531cabdff1aSopenharmony_ci    DEFINE_ARGS filter1, phase_count, index_incr, frac, index, dst, src
532cabdff1aSopenharmony_ci%endif
533cabdff1aSopenharmony_ci
534cabdff1aSopenharmony_ci.skip:
535cabdff1aSopenharmony_ci%if ARCH_X86_32
536cabdff1aSopenharmony_ci    mov                 phase_countd, phase_count_stackd
537cabdff1aSopenharmony_ci%endif
538cabdff1aSopenharmony_ci    add                         dstq, %2
539cabdff1aSopenharmony_ci    cmp                       indexd, phase_countd
540cabdff1aSopenharmony_ci    jb .index_skip
541cabdff1aSopenharmony_ci.index_while:
542cabdff1aSopenharmony_ci    sub                       indexd, phase_countd
543cabdff1aSopenharmony_ci    lea                         srcq, [srcq+%2]
544cabdff1aSopenharmony_ci    cmp                       indexd, phase_countd
545cabdff1aSopenharmony_ci    jnb .index_while
546cabdff1aSopenharmony_ci.index_skip:
547cabdff1aSopenharmony_ci    cmp                         dstq, dst_endq
548cabdff1aSopenharmony_ci    jne .loop
549cabdff1aSopenharmony_ci
550cabdff1aSopenharmony_ci%if UNIX64
551cabdff1aSopenharmony_ci    DEFINE_ARGS ctx, dst, filter2, phase_count, index, frac, index_incr, \
552cabdff1aSopenharmony_ci                dst_incr_mod, min_filter_count_x4, min_filter_len_x4, \
553cabdff1aSopenharmony_ci                dst_incr_div, src_incr, src, dst_end, filter_bank
554cabdff1aSopenharmony_ci%elif WIN64
555cabdff1aSopenharmony_ci    DEFINE_ARGS ctx, filter2, src, phase_count, index, frac, index_incr, \
556cabdff1aSopenharmony_ci                dst_incr_mod, min_filter_count_x4, min_filter_len_x4, \
557cabdff1aSopenharmony_ci                dst_incr_div, src_incr, dst, dst_end, filter_bank
558cabdff1aSopenharmony_ci%else ; x86-32
559cabdff1aSopenharmony_ci    DEFINE_ARGS filter1, ctx, update_context, frac, index, dst, src
560cabdff1aSopenharmony_ci%endif
561cabdff1aSopenharmony_ci
562cabdff1aSopenharmony_ci    cmp  dword update_context_stackd, 0
563cabdff1aSopenharmony_ci    jz .skip_store
564cabdff1aSopenharmony_ci    ; strictly speaking, the function should always return the number of
565cabdff1aSopenharmony_ci    ; consumed samples; however, we only use the value if update_context
566cabdff1aSopenharmony_ci    ; is true, so let's just leave it uninitialized otherwise
567cabdff1aSopenharmony_ci    mov                         ctxq, ctx_stackq
568cabdff1aSopenharmony_ci    movifnidn                    rax, srcq
569cabdff1aSopenharmony_ci    mov [ctxq+ResampleContext.frac ], fracd
570cabdff1aSopenharmony_ci    sub                          rax, src_stackq
571cabdff1aSopenharmony_ci    mov [ctxq+ResampleContext.index], indexd
572cabdff1aSopenharmony_ci    shr                          rax, %3
573cabdff1aSopenharmony_ci
574cabdff1aSopenharmony_ci.skip_store:
575cabdff1aSopenharmony_ci%if ARCH_X86_32
576cabdff1aSopenharmony_ci    ADD                          rsp, 0x28
577cabdff1aSopenharmony_ci%endif
578cabdff1aSopenharmony_ci    RET
579cabdff1aSopenharmony_ci%endmacro
580cabdff1aSopenharmony_ci
; ---------------------------------------------------------------------------
; Instantiate the resampling functions for single-precision float samples.
;
; RESAMPLE_FNS arguments, as they are used by the macro body:
;   float - sample format name; the body compares it (ifidn) against "int16"
;           and "float" to pick the integer or floating-point code path
;   4     - sample size in bytes; dst is advanced by this amount per output
;           sample and src by this amount per consumed input sample
;   2     - right shift applied to the distance src has advanced (in bytes)
;           before it is returned, i.e. log2 of the sample size
;   s     - suffix appended to the SSE mnemonic stems on the float/double
;           path (addp<s>, mulp<s>, movs<s>, cvtsi2s<s>, ...)
;   pf_1  - NOTE(review): presumably the label of a 1.0f constant; its use is
;           in the part of the macro outside this section - confirm
;
; One expansion is emitted per INIT_XMM/INIT_YMM line. The macro body tests
; mmsize (an extra 128-bit lane reduction is assembled when mmsize == 32) and
; cpuflag(fma3)/cpuflag(fma4) (fused multiply-add instead of mul + add).
; Note that the FMA4 variant is built with INIT_XMM, unlike AVX and FMA3.
;
; NOTE(review): the HAVE_*_EXTERNAL symbols are presumably build-configuration
; flags stating that the assembler can encode the instruction set - confirm.
; ---------------------------------------------------------------------------
581cabdff1aSopenharmony_ciINIT_XMM sse
582cabdff1aSopenharmony_ciRESAMPLE_FNS float, 4, 2, s, pf_1
583cabdff1aSopenharmony_ci
584cabdff1aSopenharmony_ci%if HAVE_AVX_EXTERNAL
585cabdff1aSopenharmony_ciINIT_YMM avx
586cabdff1aSopenharmony_ciRESAMPLE_FNS float, 4, 2, s, pf_1
587cabdff1aSopenharmony_ci%endif
588cabdff1aSopenharmony_ci%if HAVE_FMA3_EXTERNAL
589cabdff1aSopenharmony_ciINIT_YMM fma3
590cabdff1aSopenharmony_ciRESAMPLE_FNS float, 4, 2, s, pf_1
591cabdff1aSopenharmony_ci%endif
592cabdff1aSopenharmony_ci%if HAVE_FMA4_EXTERNAL
593cabdff1aSopenharmony_ciINIT_XMM fma4
594cabdff1aSopenharmony_ciRESAMPLE_FNS float, 4, 2, s, pf_1
595cabdff1aSopenharmony_ci%endif
596cabdff1aSopenharmony_ci
; ---------------------------------------------------------------------------
; Instantiate the resampling functions for signed 16-bit integer samples.
;
; RESAMPLE_FNS arguments, as they are used by the macro body:
;   int16 - sample format name; selects the integer code path (ifidn int16)
;   2     - sample size in bytes; dst/src pointer step per sample
;   1     - right shift that converts the consumed byte distance of src into
;           the returned count, i.e. log2 of the sample size
; Only three arguments are passed: in the macro code visible in this section
; the mnemonic-suffix argument is referenced only on the float/double path.
;
; The XOP build differs from the SSE2 one where the macro body tests
; cpuflag(xop)/notcpuflag(xop): vphadddq is used for part of the horizontal
; add and the PSHUFLW + paddd step is assembled only when XOP is absent.
;
; NOTE(review): HAVE_XOP_EXTERNAL is presumably a build-configuration flag
; stating that the assembler can encode XOP instructions - confirm.
; ---------------------------------------------------------------------------
597cabdff1aSopenharmony_ciINIT_XMM sse2
598cabdff1aSopenharmony_ciRESAMPLE_FNS int16, 2, 1
599cabdff1aSopenharmony_ci%if HAVE_XOP_EXTERNAL
600cabdff1aSopenharmony_ciINIT_XMM xop
601cabdff1aSopenharmony_ciRESAMPLE_FNS int16, 2, 1
602cabdff1aSopenharmony_ci%endif
603cabdff1aSopenharmony_ci
; ---------------------------------------------------------------------------
; Instantiate the resampling functions for double-precision float samples.
;
; RESAMPLE_FNS arguments, as they are used by the macro body:
;   double - sample format name; being neither "int16" nor "float" it takes
;            the floating-point path without the extra float-only shuffle/add
;            in the horizontal sum
;   8      - sample size in bytes; dst/src pointer step per sample
;   3      - right shift that converts the consumed byte distance of src into
;            the returned count, i.e. log2 of the sample size
;   d      - suffix appended to the SSE mnemonic stems on the float/double
;            path (addp<d>, mulp<d>, movs<d>, cvtsi2s<d>, ...)
;   pdbl_1 - NOTE(review): presumably the label of a 1.0 double constant; its
;            use is in the part of the macro outside this section - confirm
;
; SSE2 is the baseline here; AVX and FMA3 variants use YMM registers
; (mmsize == 32 in the macro body). No FMA4 variant is instantiated for
; double in this section.
;
; NOTE(review): the HAVE_*_EXTERNAL symbols are presumably build-configuration
; flags stating that the assembler can encode the instruction set - confirm.
; ---------------------------------------------------------------------------
604cabdff1aSopenharmony_ciINIT_XMM sse2
605cabdff1aSopenharmony_ciRESAMPLE_FNS double, 8, 3, d, pdbl_1
606cabdff1aSopenharmony_ci
607cabdff1aSopenharmony_ci%if HAVE_AVX_EXTERNAL
608cabdff1aSopenharmony_ciINIT_YMM avx
609cabdff1aSopenharmony_ciRESAMPLE_FNS double, 8, 3, d, pdbl_1
610cabdff1aSopenharmony_ci%endif
611cabdff1aSopenharmony_ci%if HAVE_FMA3_EXTERNAL
612cabdff1aSopenharmony_ciINIT_YMM fma3
613cabdff1aSopenharmony_ciRESAMPLE_FNS double, 8, 3, d, pdbl_1
614cabdff1aSopenharmony_ci%endif
615