1cabdff1aSopenharmony_ci;******************************************************************************
2cabdff1aSopenharmony_ci;* TAK DSP SIMD optimizations
3cabdff1aSopenharmony_ci;*
4cabdff1aSopenharmony_ci;* Copyright (C) 2015 Paul B Mahol
5cabdff1aSopenharmony_ci;*
6cabdff1aSopenharmony_ci;* This file is part of FFmpeg.
7cabdff1aSopenharmony_ci;*
8cabdff1aSopenharmony_ci;* FFmpeg is free software; you can redistribute it and/or
9cabdff1aSopenharmony_ci;* modify it under the terms of the GNU Lesser General Public
10cabdff1aSopenharmony_ci;* License as published by the Free Software Foundation; either
11cabdff1aSopenharmony_ci;* version 2.1 of the License, or (at your option) any later version.
12cabdff1aSopenharmony_ci;*
13cabdff1aSopenharmony_ci;* FFmpeg is distributed in the hope that it will be useful,
14cabdff1aSopenharmony_ci;* but WITHOUT ANY WARRANTY; without even the implied warranty of
15cabdff1aSopenharmony_ci;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
16cabdff1aSopenharmony_ci;* Lesser General Public License for more details.
17cabdff1aSopenharmony_ci;*
18cabdff1aSopenharmony_ci;* You should have received a copy of the GNU Lesser General Public
19cabdff1aSopenharmony_ci;* License along with FFmpeg; if not, write to the Free Software
20cabdff1aSopenharmony_ci;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
21cabdff1aSopenharmony_ci;******************************************************************************
22cabdff1aSopenharmony_ci
23cabdff1aSopenharmony_ci%include "libavutil/x86/x86util.asm"
24cabdff1aSopenharmony_ci
25cabdff1aSopenharmony_ciSECTION_RODATA
26cabdff1aSopenharmony_ci
; Four packed dwords of 128. tak_decorrelate_sf adds this to each product
; before its arithmetic right shift by 8, i.e. it is the +0.5 rounding term
; for that shift. It is read with mova, so it has to be 16-byte aligned
; (presumably guaranteed by SECTION_RODATA -- confirm in x86inc).
pd_128: times 4 dd 128
28cabdff1aSopenharmony_ci
29cabdff1aSopenharmony_ciSECTION .text
30cabdff1aSopenharmony_ci
INIT_XMM sse2
;------------------------------------------------------------------------------
; tak_decorrelate_ls(p1, p2, length)
;
; For every 32-bit element i:   p2[i] = p2[i] + p1[i]
; p1 is only read, p2 is updated in place.
;
; length counts 32-bit elements (it is scaled by 4 to a byte count below).
; Each pass handles 2*mmsize bytes and the loop body always executes at least
; once, so the buffers are presumably padded to a multiple of 8 elements --
; confirm against the callers. All memory accesses are aligned ones, so both
; pointers must be mmsize-aligned.
;------------------------------------------------------------------------------
cglobal tak_decorrelate_ls, 3, 3, 2, p1, p2, length
    shl                     lengthd, 2                  ; element count -> byte count
    add                         p2q, lengthq            ; both pointers now sit one-past-the-end
    add                         p1q, lengthq
    neg                     lengthq                     ; negative byte offset, climbs toward 0
.loop:
    ; the addition is commutative: start from p2 and accumulate p1 onto it
    mova                         m1, [p2q+lengthq+mmsize*0]
    mova                         m0, [p2q+lengthq+mmsize*1]
    paddd                        m1, [p1q+lengthq+mmsize*0]
    paddd                        m0, [p1q+lengthq+mmsize*1]
    mova     [p2q+lengthq+mmsize*0], m1
    mova     [p2q+lengthq+mmsize*1], m0
    add                     lengthq, mmsize*2           ; sets the flags consumed by jl
    jl .loop
    REP_RET
47cabdff1aSopenharmony_ci
;------------------------------------------------------------------------------
; tak_decorrelate_sr(p1, p2, length)
;
; For every 32-bit element i:   p1[i] = p2[i] - p1[i]
; p2 is only read, p1 is updated in place.
;
; length counts 32-bit elements. Two vectors (2*mmsize bytes) are handled per
; pass and the body always runs at least once; the buffers are presumably
; padded accordingly -- confirm against the callers. Both pointers must be
; mmsize-aligned because only aligned accesses are used.
;------------------------------------------------------------------------------
cglobal tak_decorrelate_sr, 3, 3, 2, p1, p2, length
    shl                     lengthd, 2                  ; element count -> byte count
    add                         p2q, lengthq            ; point both buffers at their end
    add                         p1q, lengthq
    neg                     lengthq                     ; index runs from -bytes up to 0

.loop:
    ; minuend comes from p2, subtrahend straight from memory at p1
    mova                         m1, [p2q+lengthq+mmsize*1]
    mova                         m0, [p2q+lengthq+mmsize*0]
    psubd                        m1, [p1q+lengthq+mmsize*1]
    psubd                        m0, [p1q+lengthq+mmsize*0]
    mova     [p1q+lengthq+mmsize*0], m0
    mova     [p1q+lengthq+mmsize*1], m1
    add                     lengthq, mmsize*2           ; sets the flags consumed by jl
    jl .loop
    REP_RET
64cabdff1aSopenharmony_ci
;------------------------------------------------------------------------------
; tak_decorrelate_sm(p1, p2, length)
;
; For every 32-bit element i, with a = p1[i] and b = p2[i]:
;     a'    = a - (b >> 1)          (arithmetic shift)
;     p1[i] = a'
;     p2[i] = a' + b
; Both buffers are updated in place.
;
; length counts 32-bit elements. Two vectors per buffer are processed per pass
; and the body always runs at least once; the buffers are presumably padded
; to a multiple of 8 elements -- confirm against the callers. Both pointers
; must be mmsize-aligned because only aligned moves are used.
;
; Register roles inside the loop:
;     m0/m1 = b  (first/second vector), later a' + b
;     m2/m3 = a  (first/second vector), later a'
;     m4/m5 = b >> 1
;------------------------------------------------------------------------------
cglobal tak_decorrelate_sm, 3, 3, 6, p1, p2, length
    shl                     lengthd, 2                  ; element count -> byte count
    add                         p2q, lengthq            ; point both buffers at their end
    add                         p1q, lengthq
    neg                     lengthq                     ; index runs from -bytes up to 0

.loop:
    mova                         m0, [p2q+lengthq+mmsize*0]
    mova                         m1, [p2q+lengthq+mmsize*1]
    mova                         m2, [p1q+lengthq+mmsize*0]
    mova                         m3, [p1q+lengthq+mmsize*1]
    mova                         m4, m0                 ; keep b intact, halve a copy
    mova                         m5, m1
    psrad                        m4, 1
    psrad                        m5, 1
    psubd                        m2, m4                 ; a' = a - (b >> 1)
    psubd                        m3, m5
    paddd                        m0, m2                 ; b + a'
    paddd                        m1, m3
    mova     [p1q+lengthq+mmsize*0], m2
    mova     [p2q+lengthq+mmsize*0], m0
    mova     [p1q+lengthq+mmsize*1], m3
    mova     [p2q+lengthq+mmsize*1], m1
    add                     lengthq, mmsize*2           ; sets the flags consumed by jl
    jl .loop
    REP_RET
91cabdff1aSopenharmony_ci
INIT_XMM sse4
;------------------------------------------------------------------------------
; tak_decorrelate_sf(p1, p2, length, dshift, dfactor)
;
; For every 32-bit element i, with a = p1[i] and b = p2[i]:
;     t     = (((b >> dshift) * dfactor + 128) >> 8) << dshift
;     p1[i] = t - a
; The right shifts are arithmetic and the multiply keeps the low 32 bits of
; each product (pmulld, hence the SSE4 requirement). p2 is only read.
;
; length counts 32-bit elements. One vector (mmsize bytes) is processed per
; pass and the body always runs at least once; the buffers are presumably
; padded to a multiple of 4 elements -- confirm against the callers. Both
; pointers must be mmsize-aligned because only aligned moves are used.
;
; Loop-invariant registers:
;     m2 = dfactor broadcast to all four lanes
;     m3 = dshift as a scalar shift count
;     m4 = 128 in every lane (rounding term for the >> 8)
;------------------------------------------------------------------------------
cglobal tak_decorrelate_sf, 3, 3, 5, p1, p2, length, dshift, dfactor
    shl             lengthd, 2                          ; element count -> byte count
    add                 p2q, lengthq                    ; point both buffers at their end
    add                 p1q, lengthq
    neg             lengthq                             ; index runs from -bytes up to 0

    mova                 m4, [pd_128]
    movd                 m2, dfactorm
    movd                 m3, dshiftm                    ; movd clears the upper bits, as the shift count needs
    pshufd               m2, m2, 0                      ; replicate dfactor across the vector

.loop:
    mova                 m0, [p2q+lengthq]              ; b
    mova                 m1, [p1q+lengthq]              ; a
    psrad                m0, m3                         ; b >> dshift
    pmulld               m0, m2                         ; * dfactor
    paddd                m0, m4                         ; + 128
    psrad                m0, 8
    pslld                m0, m3                         ; << dshift
    psubd                m0, m1                         ; t - a
    mova      [p1q+lengthq], m0
    add             lengthq, mmsize                     ; sets the flags consumed by jl
    jl .loop
    REP_RET
117