1/*
2 * MLP DSP functions x86-optimized
3 * Copyright (c) 2009 Ramiro Polla
4 *
5 * This file is part of FFmpeg.
6 *
7 * FFmpeg is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU Lesser General Public
9 * License as published by the Free Software Foundation; either
10 * version 2.1 of the License, or (at your option) any later version.
11 *
12 * FFmpeg is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
15 * Lesser General Public License for more details.
16 *
17 * You should have received a copy of the GNU Lesser General Public
18 * License along with FFmpeg; if not, write to the Free Software
19 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
20 */
21
22#include <stdint.h>
23#include "config.h"
24#include "libavutil/attributes.h"
25#include "libavutil/cpu.h"
26#include "libavutil/macros.h"
27#include "libavutil/x86/asm.h"
28#include "libavutil/x86/cpu.h"
29#include "libavcodec/mlpdsp.h"
30#include "libavcodec/mlp.h"
31
/*
 * Emit the prototype of ff_mlp_rematrix_channel_<opt>. Only a declaration is
 * produced; the matching definitions are provided outside this file.
 */
#define REMATRIX_CHANNEL_FUNC(opt)                                        \
void ff_mlp_rematrix_channel_##opt(int32_t       *samples,                \
                                   const int32_t *coeffs,                 \
                                   const uint8_t *bypassed_lsbs,          \
                                   const int8_t  *noise_buffer,           \
                                   int            index,                  \
                                   unsigned int   dest_ch,                \
                                   uint16_t       blockpos,               \
                                   unsigned int   maxchan,                \
                                   int            matrix_noise_shift,     \
                                   int            access_unit_size_pow2,  \
                                   int32_t        mask);

/* One prototype for each variant that ff_mlpdsp_init_x86() can install. */
REMATRIX_CHANNEL_FUNC(sse4)
REMATRIX_CHANNEL_FUNC(avx2_bmi2)
47
48#if HAVE_7REGS && HAVE_INLINE_ASM && HAVE_INLINE_ASM_NONLOCAL_LABELS
49
/*
 * Entry labels of the unrolled FIR and IIR multiply chains. The labels are
 * emitted through LABEL_MANGLE() inside the inline assembly of
 * mlp_filter_channel_x86(); they are declared as char objects here only so
 * that their addresses can be stored in the jump tables below.
 */
extern char ff_mlp_firorder_0, ff_mlp_firorder_1, ff_mlp_firorder_2,
            ff_mlp_firorder_3, ff_mlp_firorder_4, ff_mlp_firorder_5,
            ff_mlp_firorder_6, ff_mlp_firorder_7, ff_mlp_firorder_8;

extern char ff_mlp_iirorder_0, ff_mlp_iirorder_1, ff_mlp_iirorder_2,
            ff_mlp_iirorder_3, ff_mlp_iirorder_4;
65
66static const void * const firtable[9] = { &ff_mlp_firorder_0, &ff_mlp_firorder_1,
67                                          &ff_mlp_firorder_2, &ff_mlp_firorder_3,
68                                          &ff_mlp_firorder_4, &ff_mlp_firorder_5,
69                                          &ff_mlp_firorder_6, &ff_mlp_firorder_7,
70                                          &ff_mlp_firorder_8 };
71static const void * const iirtable[5] = { &ff_mlp_iirorder_0, &ff_mlp_iirorder_1,
72                                          &ff_mlp_iirorder_2, &ff_mlp_iirorder_3,
73                                          &ff_mlp_iirorder_4 };
74
/*
 * Architecture-specific building blocks for the inline assembly in
 * mlp_filter_channel_x86(). Every macro expands to string literals that are
 * pasted into the asm template, so the text inside the quotes must stay
 * exactly as written. %0 and %1 are the state and coeff pointer operands.
 */
#if ARCH_X86_64

/*
 * 64-bit variant: the running sum is kept in %rsi.
 *
 * MLPMUL defines the global label `label`, sign-extends the 32-bit words at
 * offset+offs(%0) and offset+offc(%1) to 64 bits, multiplies them and adds
 * the product to %rsi. %rax and %rdx are used as scratch.
 */
#define MLPMUL(label, offset, offs, offc)   \
    LABEL_MANGLE(label)":             \n\t" \
    "movslq "offset"+"offs"(%0), %%rax\n\t" \
    "movslq "offset"+"offc"(%1), %%rdx\n\t" \
    "imul                 %%rdx, %%rax\n\t" \
    "add                  %%rax, %%rsi\n\t"

/*
 * Same as MLPMUL, but the second factor is taken from asm operand number
 * `firc` (a register) instead of being loaded from memory.
 */
#define FIRMULREG(label, offset, firc)\
    LABEL_MANGLE(label)":       \n\t" \
    "movslq "#offset"(%0), %%rax\n\t" \
    "imul        %"#firc", %%rax\n\t" \
    "add            %%rax, %%rsi\n\t"

/* Zero the running sum. */
#define CLEAR_ACCUM                   \
    "xor            %%rsi, %%rsi\n\t"

/* Shift the running sum right by the count in %cl. */
#define SHIFT_ACCUM                   \
    "shr     %%cl,         %%rsi\n\t"

/* Registers used after SHIFT_ACCUM: ACCUM receives a copy of the shifted sum,
 * RESULT/RESULT32 are the full-width and 32-bit names of the working value. */
#define ACCUM    "%%rdx"
#define RESULT   "%%rsi"
#define RESULT32 "%%esi"

#else /* if ARCH_X86_32 */

/*
 * 32-bit variant: the running sum is the register pair %ecx:%esi.
 *
 * MLPMUL defines the global label `label`, loads offset+offs(%0) into %eax,
 * multiplies it with offset+offc(%1) using one-operand imull (product in
 * %edx:%eax) and adds the product to %ecx:%esi with add/adc.
 */
#define MLPMUL(label, offset, offs, offc)  \
    LABEL_MANGLE(label)":            \n\t" \
    "mov   "offset"+"offs"(%0), %%eax\n\t" \
    "imull "offset"+"offc"(%1)       \n\t" \
    "add                %%eax , %%esi\n\t" \
    "adc                %%edx , %%ecx\n\t"

/* No register-held coefficient here: `firc` is ignored and the step is a
 * plain MLPMUL with zero extra offsets. */
#define FIRMULREG(label, offset, firc)  \
    MLPMUL(label, #offset, "0", "0")

/* Zero both halves of the running sum. */
#define CLEAR_ACCUM                  \
    "xor           %%esi, %%esi\n\t" \
    "xor           %%ecx, %%ecx\n\t"

/*
 * Copy the running sum to %edx:%eax, load the low byte of asm operand 7 (the
 * shift count) into %ecx and shift %eax right by %cl, filling from %edx.
 * The last line deliberately has no trailing backslash, so the macro does
 * not depend on the line that follows it.
 */
#define SHIFT_ACCUM                  \
    "mov           %%ecx, %%edx\n\t" \
    "mov           %%esi, %%eax\n\t" \
    "movzbl        %7   , %%ecx\n\t" \
    "shrd    %%cl, %%edx, %%eax\n\t"

/* Registers used after SHIFT_ACCUM: ACCUM receives a copy of the shifted sum,
 * RESULT/RESULT32 both name the 32-bit working value. */
#define ACCUM    "%%edx"
#define RESULT   "%%eax"
#define RESULT32 "%%eax"

#endif /* !ARCH_X86_64 */
127
/*
 * Numeric expressions pasted into the asm text as strings. AV_STRINGIFY is
 * defined outside this file (presumably it stringifies its argument after
 * macro expansion -- confirm in libavutil/macros.h). Because the argument
 * text ends up in the assembler input, keep its spelling unchanged.
 *
 * BINC  - added to the sample_buffer pointer after every sample.
 * IOFFS - extra displacement from the state pointer used by the IIR steps
 *         and by the final store of each loop iteration.
 * IOFFC - extra displacement from the coeff pointer used by the IIR steps.
 */
#define BINC  AV_STRINGIFY(4* MAX_CHANNELS)
#define IOFFS AV_STRINGIFY(4*(MAX_FIR_ORDER + MAX_BLOCKSIZE))
#define IOFFC AV_STRINGIFY(4* MAX_FIR_ORDER)

/* One FIR step: operands taken directly at `offset` from state and coeff. */
#define FIRMUL(label, offset) MLPMUL(label, #offset,   "0",   "0")
/* One IIR step: same as FIRMUL but displaced by IOFFS (state) / IOFFC (coeff). */
#define IIRMUL(label, offset) MLPMUL(label, #offset, IOFFS, IOFFC)
134
135static void mlp_filter_channel_x86(int32_t *state, const int32_t *coeff,
136                                   int firorder, int iirorder,
137                                   unsigned int filter_shift, int32_t mask,
138                                   int blocksize, int32_t *sample_buffer)
139{
140    const void *firjump = firtable[firorder];
141    const void *iirjump = iirtable[iirorder];
142
143    blocksize = -blocksize;
144
145    __asm__ volatile(
146        "1:                           \n\t"
147        CLEAR_ACCUM
148        "jmp  *%5                     \n\t"
149        FIRMUL   (ff_mlp_firorder_8, 0x1c   )
150        FIRMUL   (ff_mlp_firorder_7, 0x18   )
151        FIRMUL   (ff_mlp_firorder_6, 0x14   )
152        FIRMUL   (ff_mlp_firorder_5, 0x10   )
153        FIRMUL   (ff_mlp_firorder_4, 0x0c   )
154        FIRMUL   (ff_mlp_firorder_3, 0x08   )
155        FIRMUL   (ff_mlp_firorder_2, 0x04   )
156        FIRMULREG(ff_mlp_firorder_1, 0x00, 8)
157        LABEL_MANGLE(ff_mlp_firorder_0)":\n\t"
158        "jmp  *%6                     \n\t"
159        IIRMUL   (ff_mlp_iirorder_4, 0x0c   )
160        IIRMUL   (ff_mlp_iirorder_3, 0x08   )
161        IIRMUL   (ff_mlp_iirorder_2, 0x04   )
162        IIRMUL   (ff_mlp_iirorder_1, 0x00   )
163        LABEL_MANGLE(ff_mlp_iirorder_0)":\n\t"
164        SHIFT_ACCUM
165        "mov  "RESULT"  ,"ACCUM"      \n\t"
166        "add  (%2)      ,"RESULT"     \n\t"
167        "and   %4       ,"RESULT"     \n\t"
168        "sub   $4       ,  %0         \n\t"
169        "mov  "RESULT32", (%0)        \n\t"
170        "mov  "RESULT32", (%2)        \n\t"
171        "add $"BINC"    ,  %2         \n\t"
172        "sub  "ACCUM"   ,"RESULT"     \n\t"
173        "mov  "RESULT32","IOFFS"(%0)  \n\t"
174        "incl              %3         \n\t"
175        "js 1b                        \n\t"
176        : /* 0*/"+r"(state),
177          /* 1*/"+r"(coeff),
178          /* 2*/"+r"(sample_buffer),
179#if ARCH_X86_64
180          /* 3*/"+r"(blocksize)
181        : /* 4*/"r"((x86_reg)mask), /* 5*/"r"(firjump),
182          /* 6*/"r"(iirjump)      , /* 7*/"c"(filter_shift)
183        , /* 8*/"r"((int64_t)coeff[0])
184        : "rax", "rdx", "rsi"
185#else /* ARCH_X86_32 */
186          /* 3*/"+m"(blocksize)
187        : /* 4*/"m"(         mask), /* 5*/"m"(firjump),
188          /* 6*/"m"(iirjump)      , /* 7*/"m"(filter_shift)
189        : "eax", "edx", "esi", "ecx"
190#endif /* !ARCH_X86_64 */
191    );
192}
193
#endif /* HAVE_7REGS && HAVE_INLINE_ASM && HAVE_INLINE_ASM_NONLOCAL_LABELS */
195
196av_cold void ff_mlpdsp_init_x86(MLPDSPContext *c)
197{
198    int cpu_flags = av_get_cpu_flags();
199#if HAVE_7REGS && HAVE_INLINE_ASM && HAVE_INLINE_ASM_NONLOCAL_LABELS
200    if (INLINE_MMX(cpu_flags))
201        c->mlp_filter_channel = mlp_filter_channel_x86;
202#endif
203    if (ARCH_X86_64 && EXTERNAL_SSE4(cpu_flags))
204        c->mlp_rematrix_channel = ff_mlp_rematrix_channel_sse4;
205    if (ARCH_X86_64 && EXTERNAL_AVX2_FAST(cpu_flags) && cpu_flags & AV_CPU_FLAG_BMI2)
206        c->mlp_rematrix_channel = ff_mlp_rematrix_channel_avx2_bmi2;
207}
208