1/* 2 * MLP DSP functions x86-optimized 3 * Copyright (c) 2009 Ramiro Polla 4 * 5 * This file is part of FFmpeg. 6 * 7 * FFmpeg is free software; you can redistribute it and/or 8 * modify it under the terms of the GNU Lesser General Public 9 * License as published by the Free Software Foundation; either 10 * version 2.1 of the License, or (at your option) any later version. 11 * 12 * FFmpeg is distributed in the hope that it will be useful, 13 * but WITHOUT ANY WARRANTY; without even the implied warranty of 14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 15 * Lesser General Public License for more details. 16 * 17 * You should have received a copy of the GNU Lesser General Public 18 * License along with FFmpeg; if not, write to the Free Software 19 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA 20 */ 21 22#include <stdint.h> 23#include "config.h" 24#include "libavutil/attributes.h" 25#include "libavutil/cpu.h" 26#include "libavutil/macros.h" 27#include "libavutil/x86/asm.h" 28#include "libavutil/x86/cpu.h" 29#include "libavcodec/mlpdsp.h" 30#include "libavcodec/mlp.h" 31 32#define REMATRIX_CHANNEL_FUNC(opt) \ 33void ff_mlp_rematrix_channel_##opt(int32_t *samples, \ 34 const int32_t *coeffs, \ 35 const uint8_t *bypassed_lsbs, \ 36 const int8_t *noise_buffer, \ 37 int index, \ 38 unsigned int dest_ch, \ 39 uint16_t blockpos, \ 40 unsigned int maxchan, \ 41 int matrix_noise_shift, \ 42 int access_unit_size_pow2, \ 43 int32_t mask); 44 45REMATRIX_CHANNEL_FUNC(sse4) 46REMATRIX_CHANNEL_FUNC(avx2_bmi2) 47 48#if HAVE_7REGS && HAVE_INLINE_ASM && HAVE_INLINE_ASM_NONLOCAL_LABELS 49 50extern char ff_mlp_firorder_8; 51extern char ff_mlp_firorder_7; 52extern char ff_mlp_firorder_6; 53extern char ff_mlp_firorder_5; 54extern char ff_mlp_firorder_4; 55extern char ff_mlp_firorder_3; 56extern char ff_mlp_firorder_2; 57extern char ff_mlp_firorder_1; 58extern char ff_mlp_firorder_0; 59 60extern char ff_mlp_iirorder_4; 61extern char ff_mlp_iirorder_3; 62extern char ff_mlp_iirorder_2; 63extern char ff_mlp_iirorder_1; 64extern char ff_mlp_iirorder_0; 65 66static const void * const firtable[9] = { &ff_mlp_firorder_0, &ff_mlp_firorder_1, 67 &ff_mlp_firorder_2, &ff_mlp_firorder_3, 68 &ff_mlp_firorder_4, &ff_mlp_firorder_5, 69 &ff_mlp_firorder_6, &ff_mlp_firorder_7, 70 &ff_mlp_firorder_8 }; 71static const void * const iirtable[5] = { &ff_mlp_iirorder_0, &ff_mlp_iirorder_1, 72 &ff_mlp_iirorder_2, &ff_mlp_iirorder_3, 73 &ff_mlp_iirorder_4 }; 74 75#if ARCH_X86_64 76 77#define MLPMUL(label, offset, offs, offc) \ 78 LABEL_MANGLE(label)": \n\t" \ 79 "movslq "offset"+"offs"(%0), %%rax\n\t" \ 80 "movslq "offset"+"offc"(%1), %%rdx\n\t" \ 81 "imul %%rdx, %%rax\n\t" \ 82 "add %%rax, %%rsi\n\t" 83 84#define FIRMULREG(label, offset, firc)\ 85 LABEL_MANGLE(label)": \n\t" \ 86 "movslq "#offset"(%0), %%rax\n\t" \ 87 "imul %"#firc", %%rax\n\t" \ 88 "add %%rax, %%rsi\n\t" 89 90#define CLEAR_ACCUM \ 91 "xor %%rsi, %%rsi\n\t" 92 93#define SHIFT_ACCUM \ 94 "shr %%cl, %%rsi\n\t" 95 96#define ACCUM "%%rdx" 97#define RESULT "%%rsi" 98#define RESULT32 "%%esi" 99 100#else /* if ARCH_X86_32 */ 101 102#define MLPMUL(label, offset, offs, offc) \ 103 LABEL_MANGLE(label)": \n\t" \ 104 "mov "offset"+"offs"(%0), %%eax\n\t" \ 105 "imull "offset"+"offc"(%1) \n\t" \ 106 "add %%eax , %%esi\n\t" \ 107 "adc %%edx , %%ecx\n\t" 108 109#define FIRMULREG(label, offset, firc) \ 110 MLPMUL(label, #offset, "0", "0") 111 112#define CLEAR_ACCUM \ 113 "xor %%esi, %%esi\n\t" \ 114 "xor %%ecx, %%ecx\n\t" 115 116#define SHIFT_ACCUM \ 117 "mov %%ecx, %%edx\n\t" \ 118 "mov %%esi, %%eax\n\t" \ 119 "movzbl %7 , %%ecx\n\t" \ 120 "shrd %%cl, %%edx, %%eax\n\t" \ 121 122#define ACCUM "%%edx" 123#define RESULT "%%eax" 124#define RESULT32 "%%eax" 125 126#endif /* !ARCH_X86_64 */ 127 128#define BINC AV_STRINGIFY(4* MAX_CHANNELS) 129#define IOFFS AV_STRINGIFY(4*(MAX_FIR_ORDER + MAX_BLOCKSIZE)) 130#define IOFFC AV_STRINGIFY(4* MAX_FIR_ORDER) 131 132#define FIRMUL(label, offset) MLPMUL(label, #offset, "0", "0") 133#define IIRMUL(label, offset) MLPMUL(label, #offset, IOFFS, IOFFC) 134 135static void mlp_filter_channel_x86(int32_t *state, const int32_t *coeff, 136 int firorder, int iirorder, 137 unsigned int filter_shift, int32_t mask, 138 int blocksize, int32_t *sample_buffer) 139{ 140 const void *firjump = firtable[firorder]; 141 const void *iirjump = iirtable[iirorder]; 142 143 blocksize = -blocksize; 144 145 __asm__ volatile( 146 "1: \n\t" 147 CLEAR_ACCUM 148 "jmp *%5 \n\t" 149 FIRMUL (ff_mlp_firorder_8, 0x1c ) 150 FIRMUL (ff_mlp_firorder_7, 0x18 ) 151 FIRMUL (ff_mlp_firorder_6, 0x14 ) 152 FIRMUL (ff_mlp_firorder_5, 0x10 ) 153 FIRMUL (ff_mlp_firorder_4, 0x0c ) 154 FIRMUL (ff_mlp_firorder_3, 0x08 ) 155 FIRMUL (ff_mlp_firorder_2, 0x04 ) 156 FIRMULREG(ff_mlp_firorder_1, 0x00, 8) 157 LABEL_MANGLE(ff_mlp_firorder_0)":\n\t" 158 "jmp *%6 \n\t" 159 IIRMUL (ff_mlp_iirorder_4, 0x0c ) 160 IIRMUL (ff_mlp_iirorder_3, 0x08 ) 161 IIRMUL (ff_mlp_iirorder_2, 0x04 ) 162 IIRMUL (ff_mlp_iirorder_1, 0x00 ) 163 LABEL_MANGLE(ff_mlp_iirorder_0)":\n\t" 164 SHIFT_ACCUM 165 "mov "RESULT" ,"ACCUM" \n\t" 166 "add (%2) ,"RESULT" \n\t" 167 "and %4 ,"RESULT" \n\t" 168 "sub $4 , %0 \n\t" 169 "mov "RESULT32", (%0) \n\t" 170 "mov "RESULT32", (%2) \n\t" 171 "add $"BINC" , %2 \n\t" 172 "sub "ACCUM" ,"RESULT" \n\t" 173 "mov "RESULT32","IOFFS"(%0) \n\t" 174 "incl %3 \n\t" 175 "js 1b \n\t" 176 : /* 0*/"+r"(state), 177 /* 1*/"+r"(coeff), 178 /* 2*/"+r"(sample_buffer), 179#if ARCH_X86_64 180 /* 3*/"+r"(blocksize) 181 : /* 4*/"r"((x86_reg)mask), /* 5*/"r"(firjump), 182 /* 6*/"r"(iirjump) , /* 7*/"c"(filter_shift) 183 , /* 8*/"r"((int64_t)coeff[0]) 184 : "rax", "rdx", "rsi" 185#else /* ARCH_X86_32 */ 186 /* 3*/"+m"(blocksize) 187 : /* 4*/"m"( mask), /* 5*/"m"(firjump), 188 /* 6*/"m"(iirjump) , /* 7*/"m"(filter_shift) 189 : "eax", "edx", "esi", "ecx" 190#endif /* !ARCH_X86_64 */ 191 ); 192} 193 194#endif /* HAVE_7REGS && HAVE_INLINE_ASM */ 195 196av_cold void ff_mlpdsp_init_x86(MLPDSPContext *c) 197{ 198 int cpu_flags = av_get_cpu_flags(); 199#if HAVE_7REGS && HAVE_INLINE_ASM && HAVE_INLINE_ASM_NONLOCAL_LABELS 200 if (INLINE_MMX(cpu_flags)) 201 c->mlp_filter_channel = mlp_filter_channel_x86; 202#endif 203 if (ARCH_X86_64 && EXTERNAL_SSE4(cpu_flags)) 204 c->mlp_rematrix_channel = ff_mlp_rematrix_channel_sse4; 205 if (ARCH_X86_64 && EXTERNAL_AVX2_FAST(cpu_flags) && cpu_flags & AV_CPU_FLAG_BMI2) 206 c->mlp_rematrix_channel = ff_mlp_rematrix_channel_avx2_bmi2; 207} 208