1/* 2 * Bluetooth low-complexity, subband codec (SBC) 3 * 4 * Copyright (C) 2017 Aurelien Jacobs <aurel@gnuage.org> 5 * Copyright (C) 2012-2013 Intel Corporation 6 * Copyright (C) 2008-2010 Nokia Corporation 7 * Copyright (C) 2004-2010 Marcel Holtmann <marcel@holtmann.org> 8 * Copyright (C) 2004-2005 Henryk Ploetz <henryk@ploetzli.ch> 9 * Copyright (C) 2005-2006 Brad Midgley <bmidgley@xmission.com> 10 * 11 * This file is part of FFmpeg. 12 * 13 * FFmpeg is free software; you can redistribute it and/or 14 * modify it under the terms of the GNU Lesser General Public 15 * License as published by the Free Software Foundation; either 16 * version 2.1 of the License, or (at your option) any later version. 17 * 18 * FFmpeg is distributed in the hope that it will be useful, 19 * but WITHOUT ANY WARRANTY; without even the implied warranty of 20 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 21 * Lesser General Public License for more details. 22 * 23 * You should have received a copy of the GNU Lesser General Public 24 * License along with FFmpeg; if not, write to the Free Software 25 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA 26 */ 27 28/** 29 * @file 30 * SBC basic "building bricks" 31 */ 32 33#include <stdint.h> 34#include <limits.h> 35#include <string.h> 36#include "libavutil/common.h" 37#include "libavutil/intmath.h" 38#include "libavutil/intreadwrite.h" 39#include "sbc.h" 40#include "sbcdsp.h" 41#include "sbcdsp_data.h" 42 43/* 44 * A reference C code of analysis filter with SIMD-friendly tables 45 * reordering and code layout. This code can be used to develop platform 46 * specific SIMD optimizations. Also it may be used as some kind of test 47 * for compiler autovectorization capabilities (who knows, if the compiler 48 * is very good at this stuff, hand optimized assembly may be not strictly 49 * needed for some platform). 50 * 51 * Note: It is also possible to make a simple variant of analysis filter, 52 * which needs only a single constants table without taking care about 53 * even/odd cases. This simple variant of filter can be implemented without 54 * input data permutation. The only thing that would be lost is the 55 * possibility to use pairwise SIMD multiplications. But for some simple 56 * CPU cores without SIMD extensions it can be useful. If anybody is 57 * interested in implementing such variant of a filter, sourcecode from 58 * bluez versions 4.26/4.27 can be used as a reference and the history of 59 * the changes in git repository done around that time may be worth checking. 60 */ 61 62static av_always_inline void sbc_analyze_simd(const int16_t *in, int32_t *out, 63 const int16_t *consts, 64 unsigned subbands) 65{ 66 int32_t t1[8]; 67 int16_t t2[8]; 68 int i, j, hop = 0; 69 70 /* rounding coefficient */ 71 for (i = 0; i < subbands; i++) 72 t1[i] = 1 << (SBC_PROTO_FIXED_SCALE - 1); 73 74 /* low pass polyphase filter */ 75 for (hop = 0; hop < 10*subbands; hop += 2*subbands) 76 for (i = 0; i < 2*subbands; i++) 77 t1[i >> 1] += in[hop + i] * consts[hop + i]; 78 79 /* scaling */ 80 for (i = 0; i < subbands; i++) 81 t2[i] = t1[i] >> SBC_PROTO_FIXED_SCALE; 82 83 memset(t1, 0, sizeof(t1)); 84 85 /* do the cos transform */ 86 for (i = 0; i < subbands/2; i++) 87 for (j = 0; j < 2*subbands; j++) 88 t1[j>>1] += t2[i * 2 + (j&1)] * consts[10*subbands + i*2*subbands + j]; 89 90 for (i = 0; i < subbands; i++) 91 out[i] = t1[i] >> (SBC_COS_TABLE_FIXED_SCALE - SCALE_OUT_BITS); 92} 93 94static void sbc_analyze_4_simd(const int16_t *in, int32_t *out, 95 const int16_t *consts) 96{ 97 sbc_analyze_simd(in, out, consts, 4); 98} 99 100static void sbc_analyze_8_simd(const int16_t *in, int32_t *out, 101 const int16_t *consts) 102{ 103 sbc_analyze_simd(in, out, consts, 8); 104} 105 106static inline void sbc_analyze_4b_4s_simd(SBCDSPContext *s, 107 int16_t *x, int32_t *out, int out_stride) 108{ 109 /* Analyze blocks */ 110 s->sbc_analyze_4(x + 12, out, ff_sbcdsp_analysis_consts_fixed4_simd_odd); 111 out += out_stride; 112 s->sbc_analyze_4(x + 8, out, ff_sbcdsp_analysis_consts_fixed4_simd_even); 113 out += out_stride; 114 s->sbc_analyze_4(x + 4, out, ff_sbcdsp_analysis_consts_fixed4_simd_odd); 115 out += out_stride; 116 s->sbc_analyze_4(x + 0, out, ff_sbcdsp_analysis_consts_fixed4_simd_even); 117} 118 119static inline void sbc_analyze_4b_8s_simd(SBCDSPContext *s, 120 int16_t *x, int32_t *out, int out_stride) 121{ 122 /* Analyze blocks */ 123 s->sbc_analyze_8(x + 24, out, ff_sbcdsp_analysis_consts_fixed8_simd_odd); 124 out += out_stride; 125 s->sbc_analyze_8(x + 16, out, ff_sbcdsp_analysis_consts_fixed8_simd_even); 126 out += out_stride; 127 s->sbc_analyze_8(x + 8, out, ff_sbcdsp_analysis_consts_fixed8_simd_odd); 128 out += out_stride; 129 s->sbc_analyze_8(x + 0, out, ff_sbcdsp_analysis_consts_fixed8_simd_even); 130} 131 132static inline void sbc_analyze_1b_8s_simd_even(SBCDSPContext *s, 133 int16_t *x, int32_t *out, 134 int out_stride); 135 136static inline void sbc_analyze_1b_8s_simd_odd(SBCDSPContext *s, 137 int16_t *x, int32_t *out, 138 int out_stride) 139{ 140 s->sbc_analyze_8(x, out, ff_sbcdsp_analysis_consts_fixed8_simd_odd); 141 s->sbc_analyze_8s = sbc_analyze_1b_8s_simd_even; 142} 143 144static inline void sbc_analyze_1b_8s_simd_even(SBCDSPContext *s, 145 int16_t *x, int32_t *out, 146 int out_stride) 147{ 148 s->sbc_analyze_8(x, out, ff_sbcdsp_analysis_consts_fixed8_simd_even); 149 s->sbc_analyze_8s = sbc_analyze_1b_8s_simd_odd; 150} 151 152/* 153 * Input data processing functions. The data is endian converted if needed, 154 * channels are deintrleaved and audio samples are reordered for use in 155 * SIMD-friendly analysis filter function. The results are put into "X" 156 * array, getting appended to the previous data (or it is better to say 157 * prepended, as the buffer is filled from top to bottom). Old data is 158 * discarded when neededed, but availability of (10 * nrof_subbands) 159 * contiguous samples is always guaranteed for the input to the analysis 160 * filter. This is achieved by copying a sufficient part of old data 161 * to the top of the buffer on buffer wraparound. 162 */ 163 164static int sbc_enc_process_input_4s(int position, const uint8_t *pcm, 165 int16_t X[2][SBC_X_BUFFER_SIZE], 166 int nsamples, int nchannels) 167{ 168 int c; 169 170 /* handle X buffer wraparound */ 171 if (position < nsamples) { 172 for (c = 0; c < nchannels; c++) 173 memcpy(&X[c][SBC_X_BUFFER_SIZE - 40], &X[c][position], 174 36 * sizeof(int16_t)); 175 position = SBC_X_BUFFER_SIZE - 40; 176 } 177 178 /* copy/permutate audio samples */ 179 for (; nsamples >= 8; nsamples -= 8, pcm += 16 * nchannels) { 180 position -= 8; 181 for (c = 0; c < nchannels; c++) { 182 int16_t *x = &X[c][position]; 183 x[0] = AV_RN16(pcm + 14*nchannels + 2*c); 184 x[1] = AV_RN16(pcm + 6*nchannels + 2*c); 185 x[2] = AV_RN16(pcm + 12*nchannels + 2*c); 186 x[3] = AV_RN16(pcm + 8*nchannels + 2*c); 187 x[4] = AV_RN16(pcm + 0*nchannels + 2*c); 188 x[5] = AV_RN16(pcm + 4*nchannels + 2*c); 189 x[6] = AV_RN16(pcm + 2*nchannels + 2*c); 190 x[7] = AV_RN16(pcm + 10*nchannels + 2*c); 191 } 192 } 193 194 return position; 195} 196 197static int sbc_enc_process_input_8s(int position, const uint8_t *pcm, 198 int16_t X[2][SBC_X_BUFFER_SIZE], 199 int nsamples, int nchannels) 200{ 201 int c; 202 203 /* handle X buffer wraparound */ 204 if (position < nsamples) { 205 for (c = 0; c < nchannels; c++) 206 memcpy(&X[c][SBC_X_BUFFER_SIZE - 72], &X[c][position], 207 72 * sizeof(int16_t)); 208 position = SBC_X_BUFFER_SIZE - 72; 209 } 210 211 if (position % 16 == 8) { 212 position -= 8; 213 nsamples -= 8; 214 for (c = 0; c < nchannels; c++) { 215 int16_t *x = &X[c][position]; 216 x[0] = AV_RN16(pcm + 14*nchannels + 2*c); 217 x[2] = AV_RN16(pcm + 12*nchannels + 2*c); 218 x[3] = AV_RN16(pcm + 0*nchannels + 2*c); 219 x[4] = AV_RN16(pcm + 10*nchannels + 2*c); 220 x[5] = AV_RN16(pcm + 2*nchannels + 2*c); 221 x[6] = AV_RN16(pcm + 8*nchannels + 2*c); 222 x[7] = AV_RN16(pcm + 4*nchannels + 2*c); 223 x[8] = AV_RN16(pcm + 6*nchannels + 2*c); 224 } 225 pcm += 16 * nchannels; 226 } 227 228 /* copy/permutate audio samples */ 229 for (; nsamples >= 16; nsamples -= 16, pcm += 32 * nchannels) { 230 position -= 16; 231 for (c = 0; c < nchannels; c++) { 232 int16_t *x = &X[c][position]; 233 x[0] = AV_RN16(pcm + 30*nchannels + 2*c); 234 x[1] = AV_RN16(pcm + 14*nchannels + 2*c); 235 x[2] = AV_RN16(pcm + 28*nchannels + 2*c); 236 x[3] = AV_RN16(pcm + 16*nchannels + 2*c); 237 x[4] = AV_RN16(pcm + 26*nchannels + 2*c); 238 x[5] = AV_RN16(pcm + 18*nchannels + 2*c); 239 x[6] = AV_RN16(pcm + 24*nchannels + 2*c); 240 x[7] = AV_RN16(pcm + 20*nchannels + 2*c); 241 x[8] = AV_RN16(pcm + 22*nchannels + 2*c); 242 x[9] = AV_RN16(pcm + 6*nchannels + 2*c); 243 x[10] = AV_RN16(pcm + 12*nchannels + 2*c); 244 x[11] = AV_RN16(pcm + 0*nchannels + 2*c); 245 x[12] = AV_RN16(pcm + 10*nchannels + 2*c); 246 x[13] = AV_RN16(pcm + 2*nchannels + 2*c); 247 x[14] = AV_RN16(pcm + 8*nchannels + 2*c); 248 x[15] = AV_RN16(pcm + 4*nchannels + 2*c); 249 } 250 } 251 252 if (nsamples == 8) { 253 position -= 8; 254 for (c = 0; c < nchannels; c++) { 255 int16_t *x = &X[c][position]; 256 x[-7] = AV_RN16(pcm + 14*nchannels + 2*c); 257 x[1] = AV_RN16(pcm + 6*nchannels + 2*c); 258 x[2] = AV_RN16(pcm + 12*nchannels + 2*c); 259 x[3] = AV_RN16(pcm + 0*nchannels + 2*c); 260 x[4] = AV_RN16(pcm + 10*nchannels + 2*c); 261 x[5] = AV_RN16(pcm + 2*nchannels + 2*c); 262 x[6] = AV_RN16(pcm + 8*nchannels + 2*c); 263 x[7] = AV_RN16(pcm + 4*nchannels + 2*c); 264 } 265 } 266 267 return position; 268} 269 270static void sbc_calc_scalefactors(int32_t sb_sample_f[16][2][8], 271 uint32_t scale_factor[2][8], 272 int blocks, int channels, int subbands) 273{ 274 int ch, sb, blk; 275 for (ch = 0; ch < channels; ch++) { 276 for (sb = 0; sb < subbands; sb++) { 277 uint32_t x = 1 << SCALE_OUT_BITS; 278 for (blk = 0; blk < blocks; blk++) { 279 int32_t tmp = FFABS(sb_sample_f[blk][ch][sb]); 280 if (tmp != 0) 281 x |= tmp - 1; 282 } 283 scale_factor[ch][sb] = (31 - SCALE_OUT_BITS) - ff_clz(x); 284 } 285 } 286} 287 288static int sbc_calc_scalefactors_j(int32_t sb_sample_f[16][2][8], 289 uint32_t scale_factor[2][8], 290 int blocks, int subbands) 291{ 292 int blk, joint = 0; 293 int32_t tmp0, tmp1; 294 uint32_t x, y; 295 296 /* last subband does not use joint stereo */ 297 int sb = subbands - 1; 298 x = 1 << SCALE_OUT_BITS; 299 y = 1 << SCALE_OUT_BITS; 300 for (blk = 0; blk < blocks; blk++) { 301 tmp0 = FFABS(sb_sample_f[blk][0][sb]); 302 tmp1 = FFABS(sb_sample_f[blk][1][sb]); 303 if (tmp0 != 0) 304 x |= tmp0 - 1; 305 if (tmp1 != 0) 306 y |= tmp1 - 1; 307 } 308 scale_factor[0][sb] = (31 - SCALE_OUT_BITS) - ff_clz(x); 309 scale_factor[1][sb] = (31 - SCALE_OUT_BITS) - ff_clz(y); 310 311 /* the rest of subbands can use joint stereo */ 312 while (--sb >= 0) { 313 int32_t sb_sample_j[16][2]; 314 x = 1 << SCALE_OUT_BITS; 315 y = 1 << SCALE_OUT_BITS; 316 for (blk = 0; blk < blocks; blk++) { 317 tmp0 = sb_sample_f[blk][0][sb]; 318 tmp1 = sb_sample_f[blk][1][sb]; 319 sb_sample_j[blk][0] = (tmp0 >> 1) + (tmp1 >> 1); 320 sb_sample_j[blk][1] = (tmp0 >> 1) - (tmp1 >> 1); 321 tmp0 = FFABS(tmp0); 322 tmp1 = FFABS(tmp1); 323 if (tmp0 != 0) 324 x |= tmp0 - 1; 325 if (tmp1 != 0) 326 y |= tmp1 - 1; 327 } 328 scale_factor[0][sb] = (31 - SCALE_OUT_BITS) - 329 ff_clz(x); 330 scale_factor[1][sb] = (31 - SCALE_OUT_BITS) - 331 ff_clz(y); 332 x = 1 << SCALE_OUT_BITS; 333 y = 1 << SCALE_OUT_BITS; 334 for (blk = 0; blk < blocks; blk++) { 335 tmp0 = FFABS(sb_sample_j[blk][0]); 336 tmp1 = FFABS(sb_sample_j[blk][1]); 337 if (tmp0 != 0) 338 x |= tmp0 - 1; 339 if (tmp1 != 0) 340 y |= tmp1 - 1; 341 } 342 x = (31 - SCALE_OUT_BITS) - ff_clz(x); 343 y = (31 - SCALE_OUT_BITS) - ff_clz(y); 344 345 /* decide whether to use joint stereo for this subband */ 346 if ((scale_factor[0][sb] + scale_factor[1][sb]) > x + y) { 347 joint |= 1 << (subbands - 1 - sb); 348 scale_factor[0][sb] = x; 349 scale_factor[1][sb] = y; 350 for (blk = 0; blk < blocks; blk++) { 351 sb_sample_f[blk][0][sb] = sb_sample_j[blk][0]; 352 sb_sample_f[blk][1][sb] = sb_sample_j[blk][1]; 353 } 354 } 355 } 356 357 /* bitmask with the information about subbands using joint stereo */ 358 return joint; 359} 360 361/* 362 * Detect CPU features and setup function pointers 363 */ 364av_cold void ff_sbcdsp_init(SBCDSPContext *s) 365{ 366 /* Default implementation for analyze functions */ 367 s->sbc_analyze_4 = sbc_analyze_4_simd; 368 s->sbc_analyze_8 = sbc_analyze_8_simd; 369 s->sbc_analyze_4s = sbc_analyze_4b_4s_simd; 370 if (s->increment == 1) 371 s->sbc_analyze_8s = sbc_analyze_1b_8s_simd_odd; 372 else 373 s->sbc_analyze_8s = sbc_analyze_4b_8s_simd; 374 375 /* Default implementation for input reordering / deinterleaving */ 376 s->sbc_enc_process_input_4s = sbc_enc_process_input_4s; 377 s->sbc_enc_process_input_8s = sbc_enc_process_input_8s; 378 379 /* Default implementation for scale factors calculation */ 380 s->sbc_calc_scalefactors = sbc_calc_scalefactors; 381 s->sbc_calc_scalefactors_j = sbc_calc_scalefactors_j; 382 383#if ARCH_ARM 384 ff_sbcdsp_init_arm(s); 385#elif ARCH_X86 386 ff_sbcdsp_init_x86(s); 387#endif 388} 389