1/* 2 * Copyright 2002-2008 Xiph.org Foundation 3 * Copyright 2002-2008 Jean-Marc Valin 4 * Copyright 2005-2007 Analog Devices Inc. 5 * Copyright 2005-2008 Commonwealth Scientific and Industrial Research Organisation (CSIRO) 6 * Copyright 1993, 2002, 2006 David Rowe 7 * Copyright 2003 EpicGames 8 * Copyright 1992-1994 Jutta Degener, Carsten Bormann 9 10 * Redistribution and use in source and binary forms, with or without 11 * modification, are permitted provided that the following conditions 12 * are met: 13 14 * - Redistributions of source code must retain the above copyright 15 * notice, this list of conditions and the following disclaimer. 16 17 * - Redistributions in binary form must reproduce the above copyright 18 * notice, this list of conditions and the following disclaimer in the 19 * documentation and/or other materials provided with the distribution. 20 21 * - Neither the name of the Xiph.org Foundation nor the names of its 22 * contributors may be used to endorse or promote products derived from 23 * this software without specific prior written permission. 24 25 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 26 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 27 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 28 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR 29 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 30 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 31 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 32 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF 33 * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING 34 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 35 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 36 * 37 * This file is part of FFmpeg. 38 * 39 * FFmpeg is free software; you can redistribute it and/or 40 * modify it under the terms of the GNU Lesser General Public 41 * License as published by the Free Software Foundation; either 42 * version 2.1 of the License, or (at your option) any later version. 43 * 44 * FFmpeg is distributed in the hope that it will be useful, 45 * but WITHOUT ANY WARRANTY; without even the implied warranty of 46 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 47 * Lesser General Public License for more details. 48 * 49 * You should have received a copy of the GNU Lesser General Public 50 * License along with FFmpeg; if not, write to the Free Software 51 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA 52 */ 53 54#include "libavutil/avassert.h" 55#include "libavutil/float_dsp.h" 56#include "avcodec.h" 57#include "bytestream.h" 58#include "codec_internal.h" 59#include "get_bits.h" 60#include "internal.h" 61#include "speexdata.h" 62 63#define SPEEX_NB_MODES 3 64#define SPEEX_INBAND_STEREO 9 65 66#define QMF_ORDER 64 67#define NB_ORDER 10 68#define NB_FRAME_SIZE 160 69#define NB_SUBMODES 9 70#define NB_SUBMODE_BITS 4 71#define SB_SUBMODE_BITS 3 72 73#define NB_SUBFRAME_SIZE 40 74#define NB_NB_SUBFRAMES 4 75#define NB_PITCH_START 17 76#define NB_PITCH_END 144 77 78#define NB_DEC_BUFFER (NB_FRAME_SIZE + 2 * NB_PITCH_END + NB_SUBFRAME_SIZE + 12) 79 80#define SPEEX_MEMSET(dst, c, n) (memset((dst), (c), (n) * sizeof(*(dst)))) 81#define SPEEX_COPY(dst, src, n) (memcpy((dst), (src), (n) * sizeof(*(dst)))) 82 83#define LSP_LINEAR(i) (.25f * (i) + .25f) 84#define LSP_LINEAR_HIGH(i) (.3125f * (i) + .75f) 85#define LSP_DIV_256(x) (0.00390625f * (x)) 86#define LSP_DIV_512(x) (0.001953125f * (x)) 87#define LSP_DIV_1024(x) (0.0009765625f * (x)) 88 89typedef struct LtpParams { 90 const int8_t *gain_cdbk; 91 int gain_bits; 92 int pitch_bits; 93} LtpParam; 94 95static const LtpParam ltp_params_vlbr = { gain_cdbk_lbr, 5, 0 }; 96static const LtpParam ltp_params_lbr = { gain_cdbk_lbr, 5, 7 }; 97static const LtpParam ltp_params_med = { gain_cdbk_lbr, 5, 7 }; 98static const LtpParam ltp_params_nb = { gain_cdbk_nb, 7, 7 }; 99 100typedef struct SplitCodebookParams { 101 int subvect_size; 102 int nb_subvect; 103 const signed char *shape_cb; 104 int shape_bits; 105 int have_sign; 106} SplitCodebookParams; 107 108static const SplitCodebookParams split_cb_nb_ulbr = { 20, 2, exc_20_32_table, 5, 0 }; 109static const SplitCodebookParams split_cb_nb_vlbr = { 10, 4, exc_10_16_table, 4, 0 }; 110static const SplitCodebookParams split_cb_nb_lbr = { 10, 4, exc_10_32_table, 5, 0 }; 111static const SplitCodebookParams split_cb_nb_med = { 8, 5, exc_8_128_table, 7, 0 }; 112static const SplitCodebookParams split_cb_nb = { 5, 8, exc_5_64_table, 6, 0 }; 113static const SplitCodebookParams split_cb_sb = { 5, 8, exc_5_256_table, 8, 0 }; 114static const SplitCodebookParams split_cb_high = { 8, 5, hexc_table, 7, 1 }; 115static const SplitCodebookParams split_cb_high_lbr= { 10, 4, hexc_10_32_table,5, 0 }; 116 117/** Quantizes LSPs */ 118typedef void (*lsp_quant_func)(float *, float *, int, GetBitContext *); 119 120/** Decodes quantized LSPs */ 121typedef void (*lsp_unquant_func)(float *, int, GetBitContext *); 122 123/** Long-term predictor quantization */ 124typedef int (*ltp_quant_func)(float *, float *, float *, 125 float *, float *, float *, 126 const void *, int, int, float, int, int, 127 GetBitContext *, char *, float *, 128 float *, int, int, int, float *); 129 130/** Long-term un-quantize */ 131typedef void (*ltp_unquant_func)(float *, float *, int, int, 132 float, const void *, int, int *, 133 float *, GetBitContext *, int, int, 134 float, int); 135 136/** Innovation quantization function */ 137typedef void (*innovation_quant_func)(float *, float *, 138 float *, float *, const void *, 139 int, int, float *, float *, 140 GetBitContext *, char *, int, int); 141 142/** Innovation unquantization function */ 143typedef void (*innovation_unquant_func)(float *, const void *, int, 144 GetBitContext *, uint32_t *); 145 146typedef struct SpeexSubmode { 147 int lbr_pitch; /**< Set to -1 for "normal" modes, otherwise encode pitch using 148 a global pitch and allowing a +- lbr_pitch variation (for 149 low not-rates)*/ 150 int forced_pitch_gain; /**< Use the same (forced) pitch gain for all 151 sub-frames */ 152 int have_subframe_gain; /**< Number of bits to use as sub-frame innovation 153 gain */ 154 int double_codebook; /**< Apply innovation quantization twice for higher 155 quality (and higher bit-rate)*/ 156 lsp_unquant_func lsp_unquant; /**< LSP unquantization function */ 157 158 ltp_unquant_func ltp_unquant; /**< Long-term predictor (pitch) un-quantizer */ 159 const void *LtpParam; /**< Pitch parameters (options) */ 160 161 innovation_unquant_func innovation_unquant; /**< Innovation un-quantization */ 162 const void *innovation_params; /**< Innovation quantization parameters*/ 163 164 float comb_gain; /**< Gain of enhancer comb filter */ 165} SpeexSubmode; 166 167typedef struct SpeexMode { 168 int modeID; /**< ID of the mode */ 169 int (*decode)(AVCodecContext *avctx, void *dec, GetBitContext *gb, float *out); 170 int frame_size; /**< Size of frames used for decoding */ 171 int subframe_size; /**< Size of sub-frames used for decoding */ 172 int lpc_size; /**< Order of LPC filter */ 173 float folding_gain; /**< Folding gain */ 174 const SpeexSubmode *submodes[NB_SUBMODES]; /**< Sub-mode data for the mode */ 175 int default_submode; /**< Default sub-mode to use when decoding */ 176} SpeexMode; 177 178typedef struct DecoderState { 179 const SpeexMode *mode; 180 int modeID; /**< ID of the decoder mode */ 181 int first; /**< Is first frame */ 182 int full_frame_size; /**< Length of full-band frames */ 183 int is_wideband; /**< If wideband is present */ 184 int count_lost; /**< Was the last frame lost? */ 185 int frame_size; /**< Length of high-band frames */ 186 int subframe_size; /**< Length of high-band sub-frames */ 187 int nb_subframes; /**< Number of high-band sub-frames */ 188 int lpc_size; /**< Order of high-band LPC analysis */ 189 float last_ol_gain; /**< Open-loop gain for previous frame */ 190 float *innov_save; /**< If non-NULL, innovation is copied here */ 191 192 /* This is used in packet loss concealment */ 193 int last_pitch; /**< Pitch of last correctly decoded frame */ 194 float last_pitch_gain; /**< Pitch gain of last correctly decoded frame */ 195 uint32_t seed; /**< Seed used for random number generation */ 196 197 int encode_submode; 198 const SpeexSubmode *const *submodes; /**< Sub-mode data */ 199 int submodeID; /**< Activated sub-mode */ 200 int lpc_enh_enabled; /**< 1 when LPC enhancer is on, 0 otherwise */ 201 202 /* Vocoder data */ 203 float voc_m1; 204 float voc_m2; 205 float voc_mean; 206 int voc_offset; 207 208 int dtx_enabled; 209 int highpass_enabled; /**< Is the input filter enabled */ 210 211 float *exc; /**< Start of excitation frame */ 212 float mem_hp[2]; /**< High-pass filter memory */ 213 float exc_buf[NB_DEC_BUFFER]; /**< Excitation buffer */ 214 float old_qlsp[NB_ORDER]; /**< Quantized LSPs for previous frame */ 215 float interp_qlpc[NB_ORDER]; /**< Interpolated quantized LPCs */ 216 float mem_sp[NB_ORDER]; /**< Filter memory for synthesis signal */ 217 float g0_mem[QMF_ORDER]; 218 float g1_mem[QMF_ORDER]; 219 float pi_gain[NB_NB_SUBFRAMES]; /**< Gain of LPC filter at theta=pi (fe/2) */ 220 float exc_rms[NB_NB_SUBFRAMES]; /**< RMS of excitation per subframe */ 221} DecoderState; 222 223/* Default handler for user callbacks: skip it */ 224static int speex_default_user_handler(GetBitContext *gb, void *state, void *data) 225{ 226 const int req_size = get_bits(gb, 4); 227 skip_bits_long(gb, 5 + 8 * req_size); 228 return 0; 229} 230 231typedef struct StereoState { 232 float balance; /**< Left/right balance info */ 233 float e_ratio; /**< Ratio of energies: E(left+right)/[E(left)+E(right)] */ 234 float smooth_left; /**< Smoothed left channel gain */ 235 float smooth_right; /**< Smoothed right channel gain */ 236} StereoState; 237 238typedef struct SpeexContext { 239 AVClass *class; 240 GetBitContext gb; 241 242 int32_t version_id; /**< Version for Speex (for checking compatibility) */ 243 int32_t rate; /**< Sampling rate used */ 244 int32_t mode; /**< Mode used (0 for narrowband, 1 for wideband) */ 245 int32_t bitstream_version; /**< Version ID of the bit-stream */ 246 int32_t nb_channels; /**< Number of channels decoded */ 247 int32_t bitrate; /**< Bit-rate used */ 248 int32_t frame_size; /**< Size of frames */ 249 int32_t vbr; /**< 1 for a VBR decoding, 0 otherwise */ 250 int32_t frames_per_packet; /**< Number of frames stored per Ogg packet */ 251 int32_t extra_headers; /**< Number of additional headers after the comments */ 252 253 int pkt_size; 254 255 StereoState stereo; 256 DecoderState st[SPEEX_NB_MODES]; 257 258 AVFloatDSPContext *fdsp; 259} SpeexContext; 260 261static void lsp_unquant_lbr(float *lsp, int order, GetBitContext *gb) 262{ 263 int id; 264 265 for (int i = 0; i < order; i++) 266 lsp[i] = LSP_LINEAR(i); 267 268 id = get_bits(gb, 6); 269 for (int i = 0; i < 10; i++) 270 lsp[i] += LSP_DIV_256(cdbk_nb[id * 10 + i]); 271 272 id = get_bits(gb, 6); 273 for (int i = 0; i < 5; i++) 274 lsp[i] += LSP_DIV_512(cdbk_nb_low1[id * 5 + i]); 275 276 id = get_bits(gb, 6); 277 for (int i = 0; i < 5; i++) 278 lsp[i + 5] += LSP_DIV_512(cdbk_nb_high1[id * 5 + i]); 279} 280 281static void forced_pitch_unquant(float *exc, float *exc_out, int start, int end, 282 float pitch_coef, const void *par, int nsf, 283 int *pitch_val, float *gain_val, GetBitContext *gb, int count_lost, 284 int subframe_offset, float last_pitch_gain, int cdbk_offset) 285{ 286 av_assert0(!isnan(pitch_coef)); 287 pitch_coef = fminf(pitch_coef, .99f); 288 for (int i = 0; i < nsf; i++) { 289 exc_out[i] = exc[i - start] * pitch_coef; 290 exc[i] = exc_out[i]; 291 } 292 pitch_val[0] = start; 293 gain_val[0] = gain_val[2] = 0.f; 294 gain_val[1] = pitch_coef; 295} 296 297static inline float speex_rand(float std, uint32_t *seed) 298{ 299 const uint32_t jflone = 0x3f800000; 300 const uint32_t jflmsk = 0x007fffff; 301 float fran; 302 uint32_t ran; 303 seed[0] = 1664525 * seed[0] + 1013904223; 304 ran = jflone | (jflmsk & seed[0]); 305 fran = av_int2float(ran); 306 fran -= 1.5f; 307 fran *= std; 308 return fran; 309} 310 311static void noise_codebook_unquant(float *exc, const void *par, int nsf, 312 GetBitContext *gb, uint32_t *seed) 313{ 314 for (int i = 0; i < nsf; i++) 315 exc[i] = speex_rand(1.f, seed); 316} 317 318static void split_cb_shape_sign_unquant(float *exc, const void *par, int nsf, 319 GetBitContext *gb, uint32_t *seed) 320{ 321 int subvect_size, nb_subvect, have_sign, shape_bits; 322 const SplitCodebookParams *params; 323 const signed char *shape_cb; 324 int signs[10], ind[10]; 325 326 params = par; 327 subvect_size = params->subvect_size; 328 nb_subvect = params->nb_subvect; 329 330 shape_cb = params->shape_cb; 331 have_sign = params->have_sign; 332 shape_bits = params->shape_bits; 333 334 /* Decode codewords and gains */ 335 for (int i = 0; i < nb_subvect; i++) { 336 signs[i] = have_sign ? get_bits1(gb) : 0; 337 ind[i] = get_bitsz(gb, shape_bits); 338 } 339 /* Compute decoded excitation */ 340 for (int i = 0; i < nb_subvect; i++) { 341 const float s = signs[i] ? -1.f : 1.f; 342 343 for (int j = 0; j < subvect_size; j++) 344 exc[subvect_size * i + j] += s * 0.03125f * shape_cb[ind[i] * subvect_size + j]; 345 } 346} 347 348#define SUBMODE(x) st->submodes[st->submodeID]->x 349 350#define gain_3tap_to_1tap(g) (FFABS(g[1]) + (g[0] > 0.f ? g[0] : -.5f * g[0]) + (g[2] > 0.f ? g[2] : -.5f * g[2])) 351 352static void 353pitch_unquant_3tap(float *exc, float *exc_out, int start, int end, float pitch_coef, 354 const void *par, int nsf, int *pitch_val, float *gain_val, GetBitContext *gb, 355 int count_lost, int subframe_offset, float last_pitch_gain, int cdbk_offset) 356{ 357 int pitch, gain_index, gain_cdbk_size; 358 const int8_t *gain_cdbk; 359 const LtpParam *params; 360 float gain[3]; 361 362 params = (const LtpParam *)par; 363 gain_cdbk_size = 1 << params->gain_bits; 364 gain_cdbk = params->gain_cdbk + 4 * gain_cdbk_size * cdbk_offset; 365 366 pitch = get_bitsz(gb, params->pitch_bits); 367 pitch += start; 368 gain_index = get_bitsz(gb, params->gain_bits); 369 gain[0] = 0.015625f * gain_cdbk[gain_index * 4] + .5f; 370 gain[1] = 0.015625f * gain_cdbk[gain_index * 4 + 1] + .5f; 371 gain[2] = 0.015625f * gain_cdbk[gain_index * 4 + 2] + .5f; 372 373 if (count_lost && pitch > subframe_offset) { 374 float tmp = count_lost < 4 ? last_pitch_gain : 0.5f * last_pitch_gain; 375 float gain_sum; 376 377 tmp = fminf(tmp, .95f); 378 gain_sum = gain_3tap_to_1tap(gain); 379 380 if (gain_sum > tmp && gain_sum > 0.f) { 381 float fact = tmp / gain_sum; 382 for (int i = 0; i < 3; i++) 383 gain[i] *= fact; 384 } 385 } 386 387 pitch_val[0] = pitch; 388 gain_val[0] = gain[0]; 389 gain_val[1] = gain[1]; 390 gain_val[2] = gain[2]; 391 SPEEX_MEMSET(exc_out, 0, nsf); 392 393 for (int i = 0; i < 3; i++) { 394 int tmp1, tmp3; 395 int pp = pitch + 1 - i; 396 tmp1 = nsf; 397 if (tmp1 > pp) 398 tmp1 = pp; 399 for (int j = 0; j < tmp1; j++) 400 exc_out[j] += gain[2 - i] * exc[j - pp]; 401 tmp3 = nsf; 402 if (tmp3 > pp + pitch) 403 tmp3 = pp + pitch; 404 for (int j = tmp1; j < tmp3; j++) 405 exc_out[j] += gain[2 - i] * exc[j - pp - pitch]; 406 } 407} 408 409static void lsp_unquant_nb(float *lsp, int order, GetBitContext *gb) 410{ 411 int id; 412 413 for (int i = 0; i < order; i++) 414 lsp[i] = LSP_LINEAR(i); 415 416 id = get_bits(gb, 6); 417 for (int i = 0; i < 10; i++) 418 lsp[i] += LSP_DIV_256(cdbk_nb[id * 10 + i]); 419 420 id = get_bits(gb, 6); 421 for (int i = 0; i < 5; i++) 422 lsp[i] += LSP_DIV_512(cdbk_nb_low1[id * 5 + i]); 423 424 id = get_bits(gb, 6); 425 for (int i = 0; i < 5; i++) 426 lsp[i] += LSP_DIV_1024(cdbk_nb_low2[id * 5 + i]); 427 428 id = get_bits(gb, 6); 429 for (int i = 0; i < 5; i++) 430 lsp[i + 5] += LSP_DIV_512(cdbk_nb_high1[id * 5 + i]); 431 432 id = get_bits(gb, 6); 433 for (int i = 0; i < 5; i++) 434 lsp[i + 5] += LSP_DIV_1024(cdbk_nb_high2[id * 5 + i]); 435} 436 437static void lsp_unquant_high(float *lsp, int order, GetBitContext *gb) 438{ 439 int id; 440 441 for (int i = 0; i < order; i++) 442 lsp[i] = LSP_LINEAR_HIGH(i); 443 444 id = get_bits(gb, 6); 445 for (int i = 0; i < order; i++) 446 lsp[i] += LSP_DIV_256(high_lsp_cdbk[id * order + i]); 447 448 id = get_bits(gb, 6); 449 for (int i = 0; i < order; i++) 450 lsp[i] += LSP_DIV_512(high_lsp_cdbk2[id * order + i]); 451} 452 453/* 2150 bps "vocoder-like" mode for comfort noise */ 454static const SpeexSubmode nb_submode1 = { 455 0, 1, 0, 0, lsp_unquant_lbr, forced_pitch_unquant, NULL, 456 noise_codebook_unquant, NULL, -1.f 457}; 458 459/* 5.95 kbps very low bit-rate mode */ 460static const SpeexSubmode nb_submode2 = { 461 0, 0, 0, 0, lsp_unquant_lbr, pitch_unquant_3tap, <p_params_vlbr, 462 split_cb_shape_sign_unquant, &split_cb_nb_vlbr, .6f 463}; 464 465/* 8 kbps low bit-rate mode */ 466static const SpeexSubmode nb_submode3 = { 467 -1, 0, 1, 0, lsp_unquant_lbr, pitch_unquant_3tap, <p_params_lbr, 468 split_cb_shape_sign_unquant, &split_cb_nb_lbr, .55f 469}; 470 471/* 11 kbps medium bit-rate mode */ 472static const SpeexSubmode nb_submode4 = { 473 -1, 0, 1, 0, lsp_unquant_lbr, pitch_unquant_3tap, <p_params_med, 474 split_cb_shape_sign_unquant, &split_cb_nb_med, .45f 475}; 476 477/* 15 kbps high bit-rate mode */ 478static const SpeexSubmode nb_submode5 = { 479 -1, 0, 3, 0, lsp_unquant_nb, pitch_unquant_3tap, <p_params_nb, 480 split_cb_shape_sign_unquant, &split_cb_nb, .25f 481}; 482 483/* 18.2 high bit-rate mode */ 484static const SpeexSubmode nb_submode6 = { 485 -1, 0, 3, 0, lsp_unquant_nb, pitch_unquant_3tap, <p_params_nb, 486 split_cb_shape_sign_unquant, &split_cb_sb, .15f 487}; 488 489/* 24.6 kbps high bit-rate mode */ 490static const SpeexSubmode nb_submode7 = { 491 -1, 0, 3, 1, lsp_unquant_nb, pitch_unquant_3tap, <p_params_nb, 492 split_cb_shape_sign_unquant, &split_cb_nb, 0.05f 493}; 494 495/* 3.95 kbps very low bit-rate mode */ 496static const SpeexSubmode nb_submode8 = { 497 0, 1, 0, 0, lsp_unquant_lbr, forced_pitch_unquant, NULL, 498 split_cb_shape_sign_unquant, &split_cb_nb_ulbr, .5f 499}; 500 501static const SpeexSubmode wb_submode1 = { 502 0, 0, 1, 0, lsp_unquant_high, NULL, NULL, 503 NULL, NULL, -1.f 504}; 505 506static const SpeexSubmode wb_submode2 = { 507 0, 0, 1, 0, lsp_unquant_high, NULL, NULL, 508 split_cb_shape_sign_unquant, &split_cb_high_lbr, -1.f 509}; 510 511static const SpeexSubmode wb_submode3 = { 512 0, 0, 1, 0, lsp_unquant_high, NULL, NULL, 513 split_cb_shape_sign_unquant, &split_cb_high, -1.f 514}; 515 516static const SpeexSubmode wb_submode4 = { 517 0, 0, 1, 1, lsp_unquant_high, NULL, NULL, 518 split_cb_shape_sign_unquant, &split_cb_high, -1.f 519}; 520 521static int nb_decode(AVCodecContext *, void *, GetBitContext *, float *); 522static int sb_decode(AVCodecContext *, void *, GetBitContext *, float *); 523 524static const SpeexMode speex_modes[SPEEX_NB_MODES] = { 525 { 526 .modeID = 0, 527 .decode = nb_decode, 528 .frame_size = NB_FRAME_SIZE, 529 .subframe_size = NB_SUBFRAME_SIZE, 530 .lpc_size = NB_ORDER, 531 .submodes = { 532 NULL, &nb_submode1, &nb_submode2, &nb_submode3, &nb_submode4, 533 &nb_submode5, &nb_submode6, &nb_submode7, &nb_submode8 534 }, 535 .default_submode = 5, 536 }, 537 { 538 .modeID = 1, 539 .decode = sb_decode, 540 .frame_size = NB_FRAME_SIZE, 541 .subframe_size = NB_SUBFRAME_SIZE, 542 .lpc_size = 8, 543 .folding_gain = 0.9f, 544 .submodes = { 545 NULL, &wb_submode1, &wb_submode2, &wb_submode3, &wb_submode4 546 }, 547 .default_submode = 3, 548 }, 549 { 550 .modeID = 2, 551 .decode = sb_decode, 552 .frame_size = 320, 553 .subframe_size = 80, 554 .lpc_size = 8, 555 .folding_gain = 0.7f, 556 .submodes = { 557 NULL, &wb_submode1 558 }, 559 .default_submode = 1, 560 }, 561}; 562 563static float compute_rms(const float *x, int len) 564{ 565 float sum = 0.f; 566 567 for (int i = 0; i < len; i++) 568 sum += x[i] * x[i]; 569 570 av_assert0(len > 0); 571 return sqrtf(.1f + sum / len); 572} 573 574static void bw_lpc(float gamma, const float *lpc_in, 575 float *lpc_out, int order) 576{ 577 float tmp = gamma; 578 579 for (int i = 0; i < order; i++) { 580 lpc_out[i] = tmp * lpc_in[i]; 581 tmp *= gamma; 582 } 583} 584 585static void iir_mem(const float *x, const float *den, 586 float *y, int N, int ord, float *mem) 587{ 588 for (int i = 0; i < N; i++) { 589 float yi = x[i] + mem[0]; 590 float nyi = -yi; 591 for (int j = 0; j < ord - 1; j++) 592 mem[j] = mem[j + 1] + den[j] * nyi; 593 mem[ord - 1] = den[ord - 1] * nyi; 594 y[i] = yi; 595 } 596} 597 598static void highpass(const float *x, float *y, int len, float *mem, int wide) 599{ 600 static const float Pcoef[2][3] = {{ 1.00000f, -1.92683f, 0.93071f }, { 1.00000f, -1.97226f, 0.97332f } }; 601 static const float Zcoef[2][3] = {{ 0.96446f, -1.92879f, 0.96446f }, { 0.98645f, -1.97277f, 0.98645f } }; 602 const float *den, *num; 603 604 den = Pcoef[wide]; 605 num = Zcoef[wide]; 606 for (int i = 0; i < len; i++) { 607 float yi = num[0] * x[i] + mem[0]; 608 mem[0] = mem[1] + num[1] * x[i] + -den[1] * yi; 609 mem[1] = num[2] * x[i] + -den[2] * yi; 610 y[i] = yi; 611 } 612} 613 614#define median3(a, b, c) \ 615 ((a) < (b) ? ((b) < (c) ? (b) : ((a) < (c) ? (c) : (a))) \ 616 : ((c) < (b) ? (b) : ((c) < (a) ? (c) : (a)))) 617 618static int speex_std_stereo(GetBitContext *gb, void *state, void *data) 619{ 620 StereoState *stereo = data; 621 float sign = get_bits1(gb) ? -1.f : 1.f; 622 623 stereo->balance = exp(sign * .25f * get_bits(gb, 5)); 624 stereo->e_ratio = e_ratio_quant[get_bits(gb, 2)]; 625 626 return 0; 627} 628 629static int speex_inband_handler(GetBitContext *gb, void *state, StereoState *stereo) 630{ 631 int id = get_bits(gb, 4); 632 633 if (id == SPEEX_INBAND_STEREO) { 634 return speex_std_stereo(gb, state, stereo); 635 } else { 636 int adv; 637 638 if (id < 2) 639 adv = 1; 640 else if (id < 8) 641 adv = 4; 642 else if (id < 10) 643 adv = 8; 644 else if (id < 12) 645 adv = 16; 646 else if (id < 14) 647 adv = 32; 648 else 649 adv = 64; 650 skip_bits_long(gb, adv); 651 } 652 return 0; 653} 654 655static void sanitize_values(float *vec, float min_val, float max_val, int len) 656{ 657 for (int i = 0; i < len; i++) { 658 if (!isnormal(vec[i]) || fabsf(vec[i]) < 1e-8f) 659 vec[i] = 0.f; 660 else 661 vec[i] = av_clipf(vec[i], min_val, max_val); 662 } 663} 664 665static void signal_mul(const float *x, float *y, float scale, int len) 666{ 667 for (int i = 0; i < len; i++) 668 y[i] = scale * x[i]; 669} 670 671static float inner_prod(const float *x, const float *y, int len) 672{ 673 float sum = 0.f; 674 675 for (int i = 0; i < len; i += 8) { 676 float part = 0.f; 677 part += x[i + 0] * y[i + 0]; 678 part += x[i + 1] * y[i + 1]; 679 part += x[i + 2] * y[i + 2]; 680 part += x[i + 3] * y[i + 3]; 681 part += x[i + 4] * y[i + 4]; 682 part += x[i + 5] * y[i + 5]; 683 part += x[i + 6] * y[i + 6]; 684 part += x[i + 7] * y[i + 7]; 685 sum += part; 686 } 687 688 return sum; 689} 690 691static int interp_pitch(const float *exc, float *interp, int pitch, int len) 692{ 693 float corr[4][7], maxcorr; 694 int maxi, maxj; 695 696 for (int i = 0; i < 7; i++) 697 corr[0][i] = inner_prod(exc, exc - pitch - 3 + i, len); 698 for (int i = 0; i < 3; i++) { 699 for (int j = 0; j < 7; j++) { 700 int i1, i2; 701 float tmp = 0.f; 702 703 i1 = 3 - j; 704 if (i1 < 0) 705 i1 = 0; 706 i2 = 10 - j; 707 if (i2 > 7) 708 i2 = 7; 709 for (int k = i1; k < i2; k++) 710 tmp += shift_filt[i][k] * corr[0][j + k - 3]; 711 corr[i + 1][j] = tmp; 712 } 713 } 714 maxi = maxj = 0; 715 maxcorr = corr[0][0]; 716 for (int i = 0; i < 4; i++) { 717 for (int j = 0; j < 7; j++) { 718 if (corr[i][j] > maxcorr) { 719 maxcorr = corr[i][j]; 720 maxi = i; 721 maxj = j; 722 } 723 } 724 } 725 for (int i = 0; i < len; i++) { 726 float tmp = 0.f; 727 if (maxi > 0.f) { 728 for (int k = 0; k < 7; k++) 729 tmp += exc[i - (pitch - maxj + 3) + k - 3] * shift_filt[maxi - 1][k]; 730 } else { 731 tmp = exc[i - (pitch - maxj + 3)]; 732 } 733 interp[i] = tmp; 734 } 735 return pitch - maxj + 3; 736} 737 738static void multicomb(const float *exc, float *new_exc, float *ak, int p, int nsf, 739 int pitch, int max_pitch, float comb_gain) 740{ 741 float old_ener, new_ener; 742 float iexc0_mag, iexc1_mag, exc_mag; 743 float iexc[4 * NB_SUBFRAME_SIZE]; 744 float corr0, corr1, gain0, gain1; 745 float pgain1, pgain2; 746 float c1, c2, g1, g2; 747 float ngain, gg1, gg2; 748 int corr_pitch = pitch; 749 750 interp_pitch(exc, iexc, corr_pitch, 80); 751 if (corr_pitch > max_pitch) 752 interp_pitch(exc, iexc + nsf, 2 * corr_pitch, 80); 753 else 754 interp_pitch(exc, iexc + nsf, -corr_pitch, 80); 755 756 iexc0_mag = sqrtf(1000.f + inner_prod(iexc, iexc, nsf)); 757 iexc1_mag = sqrtf(1000.f + inner_prod(iexc + nsf, iexc + nsf, nsf)); 758 exc_mag = sqrtf(1.f + inner_prod(exc, exc, nsf)); 759 corr0 = inner_prod(iexc, exc, nsf); 760 corr1 = inner_prod(iexc + nsf, exc, nsf); 761 if (corr0 > iexc0_mag * exc_mag) 762 pgain1 = 1.f; 763 else 764 pgain1 = (corr0 / exc_mag) / iexc0_mag; 765 if (corr1 > iexc1_mag * exc_mag) 766 pgain2 = 1.f; 767 else 768 pgain2 = (corr1 / exc_mag) / iexc1_mag; 769 gg1 = exc_mag / iexc0_mag; 770 gg2 = exc_mag / iexc1_mag; 771 if (comb_gain > 0.f) { 772 c1 = .4f * comb_gain + .07f; 773 c2 = .5f + 1.72f * (c1 - .07f); 774 } else { 775 c1 = c2 = 0.f; 776 } 777 g1 = 1.f - c2 * pgain1 * pgain1; 778 g2 = 1.f - c2 * pgain2 * pgain2; 779 g1 = fmaxf(g1, c1); 780 g2 = fmaxf(g2, c1); 781 g1 = c1 / g1; 782 g2 = c1 / g2; 783 784 if (corr_pitch > max_pitch) { 785 gain0 = .7f * g1 * gg1; 786 gain1 = .3f * g2 * gg2; 787 } else { 788 gain0 = .6f * g1 * gg1; 789 gain1 = .6f * g2 * gg2; 790 } 791 for (int i = 0; i < nsf; i++) 792 new_exc[i] = exc[i] + (gain0 * iexc[i]) + (gain1 * iexc[i + nsf]); 793 new_ener = compute_rms(new_exc, nsf); 794 old_ener = compute_rms(exc, nsf); 795 796 old_ener = fmaxf(old_ener, 1.f); 797 new_ener = fmaxf(new_ener, 1.f); 798 old_ener = fminf(old_ener, new_ener); 799 ngain = old_ener / new_ener; 800 801 for (int i = 0; i < nsf; i++) 802 new_exc[i] *= ngain; 803} 804 805static void lsp_interpolate(const float *old_lsp, const float *new_lsp, 806 float *lsp, int len, int subframe, 807 int nb_subframes, float margin) 808{ 809 const float tmp = (1.f + subframe) / nb_subframes; 810 811 for (int i = 0; i < len; i++) { 812 lsp[i] = (1.f - tmp) * old_lsp[i] + tmp * new_lsp[i]; 813 lsp[i] = av_clipf(lsp[i], margin, M_PI - margin); 814 } 815 for (int i = 1; i < len - 1; i++) { 816 lsp[i] = fmaxf(lsp[i], lsp[i - 1] + margin); 817 if (lsp[i] > lsp[i + 1] - margin) 818 lsp[i] = .5f * (lsp[i] + lsp[i + 1] - margin); 819 } 820} 821 822static void lsp_to_lpc(const float *freq, float *ak, int lpcrdr) 823{ 824 float xout1, xout2, xin1, xin2; 825 float *pw, *n0; 826 float Wp[4 * NB_ORDER + 2] = { 0 }; 827 float x_freq[NB_ORDER]; 828 const int m = lpcrdr >> 1; 829 830 pw = Wp; 831 832 xin1 = xin2 = 1.f; 833 834 for (int i = 0; i < lpcrdr; i++) 835 x_freq[i] = -cosf(freq[i]); 836 837 /* reconstruct P(z) and Q(z) by cascading second order 838 * polynomials in form 1 - 2xz(-1) +z(-2), where x is the 839 * LSP coefficient 840 */ 841 for (int j = 0; j <= lpcrdr; j++) { 842 int i2 = 0; 843 for (int i = 0; i < m; i++, i2 += 2) { 844 n0 = pw + (i * 4); 845 xout1 = xin1 + 2.f * x_freq[i2 ] * n0[0] + n0[1]; 846 xout2 = xin2 + 2.f * x_freq[i2 + 1] * n0[2] + n0[3]; 847 n0[1] = n0[0]; 848 n0[3] = n0[2]; 849 n0[0] = xin1; 850 n0[2] = xin2; 851 xin1 = xout1; 852 xin2 = xout2; 853 } 854 xout1 = xin1 + n0[4]; 855 xout2 = xin2 - n0[5]; 856 if (j > 0) 857 ak[j - 1] = (xout1 + xout2) * 0.5f; 858 n0[4] = xin1; 859 n0[5] = xin2; 860 861 xin1 = 0.f; 862 xin2 = 0.f; 863 } 864} 865 866static int nb_decode(AVCodecContext *avctx, void *ptr_st, 867 GetBitContext *gb, float *out) 868{ 869 DecoderState *st = ptr_st; 870 float ol_gain = 0, ol_pitch_coef = 0, best_pitch_gain = 0, pitch_average = 0; 871 int m, pitch, wideband, ol_pitch = 0, best_pitch = 40; 872 SpeexContext *s = avctx->priv_data; 873 float innov[NB_SUBFRAME_SIZE]; 874 float exc32[NB_SUBFRAME_SIZE]; 875 float interp_qlsp[NB_ORDER]; 876 float qlsp[NB_ORDER]; 877 float ak[NB_ORDER]; 878 float pitch_gain[3] = { 0 }; 879 880 st->exc = st->exc_buf + 2 * NB_PITCH_END + NB_SUBFRAME_SIZE + 6; 881 882 if (st->encode_submode) { 883 do { /* Search for next narrowband block (handle requests, skip wideband blocks) */ 884 if (get_bits_left(gb) < 5) 885 return AVERROR_INVALIDDATA; 886 wideband = get_bits1(gb); 887 if (wideband) /* Skip wideband block (for compatibility) */ { 888 int submode, advance; 889 890 submode = get_bits(gb, SB_SUBMODE_BITS); 891 advance = wb_skip_table[submode]; 892 advance -= SB_SUBMODE_BITS + 1; 893 if (advance < 0) 894 return AVERROR_INVALIDDATA; 895 skip_bits_long(gb, advance); 896 897 if (get_bits_left(gb) < 5) 898 return AVERROR_INVALIDDATA; 899 wideband = get_bits1(gb); 900 if (wideband) { 901 submode = get_bits(gb, SB_SUBMODE_BITS); 902 advance = wb_skip_table[submode]; 903 advance -= SB_SUBMODE_BITS + 1; 904 if (advance < 0) 905 return AVERROR_INVALIDDATA; 906 skip_bits_long(gb, advance); 907 wideband = get_bits1(gb); 908 if (wideband) { 909 av_log(avctx, AV_LOG_ERROR, "more than two wideband layers found\n"); 910 return AVERROR_INVALIDDATA; 911 } 912 } 913 } 914 if (get_bits_left(gb) < 4) 915 return AVERROR_INVALIDDATA; 916 m = get_bits(gb, 4); 917 if (m == 15) /* We found a terminator */ { 918 return AVERROR_INVALIDDATA; 919 } else if (m == 14) /* Speex in-band request */ { 920 int ret = speex_inband_handler(gb, st, &s->stereo); 921 if (ret) 922 return ret; 923 } else if (m == 13) /* User in-band request */ { 924 int ret = speex_default_user_handler(gb, st, NULL); 925 if (ret) 926 return ret; 927 } else if (m > 8) /* Invalid mode */ { 928 return AVERROR_INVALIDDATA; 929 } 930 } while (m > 8); 931 932 st->submodeID = m; /* Get the sub-mode that was used */ 933 } 934 935 /* Shift all buffers by one frame */ 936 memmove(st->exc_buf, st->exc_buf + NB_FRAME_SIZE, (2 * NB_PITCH_END + NB_SUBFRAME_SIZE + 12) * sizeof(float)); 937 938 /* If null mode (no transmission), just set a couple things to zero */ 939 if (st->submodes[st->submodeID] == NULL) { 940 float lpc[NB_ORDER]; 941 float innov_gain = 0.f; 942 943 bw_lpc(0.93f, st->interp_qlpc, lpc, NB_ORDER); 944 innov_gain = compute_rms(st->exc, NB_FRAME_SIZE); 945 for (int i = 0; i < NB_FRAME_SIZE; i++) 946 st->exc[i] = speex_rand(innov_gain, &st->seed); 947 948 /* Final signal synthesis from excitation */ 949 iir_mem(st->exc, lpc, out, NB_FRAME_SIZE, NB_ORDER, st->mem_sp); 950 st->count_lost = 0; 951 952 return 0; 953 } 954 955 /* Unquantize LSPs */ 956 SUBMODE(lsp_unquant)(qlsp, NB_ORDER, gb); 957 958 /* Damp memory if a frame was lost and the LSP changed too much */ 959 if (st->count_lost) { 960 float fact, lsp_dist = 0; 961 962 for (int i = 0; i < NB_ORDER; i++) 963 lsp_dist = lsp_dist + FFABS(st->old_qlsp[i] - qlsp[i]); 964 fact = .6f * exp(-.2f * lsp_dist); 965 for (int i = 0; i < NB_ORDER; i++) 966 st->mem_sp[i] = fact * st->mem_sp[i]; 967 } 968 969 /* Handle first frame and lost-packet case */ 970 if (st->first || st->count_lost) 971 memcpy(st->old_qlsp, qlsp, sizeof(st->old_qlsp)); 972 973 /* Get open-loop pitch estimation for low bit-rate pitch coding */ 974 if (SUBMODE(lbr_pitch) != -1) 975 ol_pitch = NB_PITCH_START + get_bits(gb, 7); 976 977 if (SUBMODE(forced_pitch_gain)) 978 ol_pitch_coef = 0.066667f * get_bits(gb, 4); 979 980 /* Get global excitation gain */ 981 ol_gain = expf(get_bits(gb, 5) / 3.5f); 982 983 if (st->submodeID == 1) 984 st->dtx_enabled = get_bits(gb, 4) == 15; 985 986 if (st->submodeID > 1) 987 st->dtx_enabled = 0; 988 989 for (int sub = 0; sub < NB_NB_SUBFRAMES; sub++) { /* Loop on subframes */ 990 float *exc, *innov_save = NULL, tmp, ener; 991 int pit_min, pit_max, offset, q_energy; 992 993 offset = NB_SUBFRAME_SIZE * sub; /* Offset relative to start of frame */ 994 exc = st->exc + offset; /* Excitation */ 995 if (st->innov_save) /* Original signal */ 996 innov_save = st->innov_save + offset; 997 998 SPEEX_MEMSET(exc, 0, NB_SUBFRAME_SIZE); /* Reset excitation */ 999 1000 /* Adaptive codebook contribution */ 1001 av_assert0(SUBMODE(ltp_unquant)); 1002 /* Handle pitch constraints if any */ 1003 if (SUBMODE(lbr_pitch) != -1) { 1004 int margin = SUBMODE(lbr_pitch); 1005 1006 if (margin) { 1007 pit_min = ol_pitch - margin + 1; 1008 pit_min = FFMAX(pit_min, NB_PITCH_START); 1009 pit_max = ol_pitch + margin; 1010 pit_max = FFMIN(pit_max, NB_PITCH_START); 1011 } else { 1012 pit_min = pit_max = ol_pitch; 1013 } 1014 } else { 1015 pit_min = NB_PITCH_START; 1016 pit_max = NB_PITCH_END; 1017 } 1018 1019 SUBMODE(ltp_unquant)(exc, exc32, pit_min, pit_max, ol_pitch_coef, SUBMODE(LtpParam), 1020 NB_SUBFRAME_SIZE, &pitch, pitch_gain, gb, st->count_lost, offset, 1021 st->last_pitch_gain, 0); 1022 1023 sanitize_values(exc32, -32000, 32000, NB_SUBFRAME_SIZE); 1024 1025 tmp = gain_3tap_to_1tap(pitch_gain); 1026 1027 pitch_average += tmp; 1028 if ((tmp > best_pitch_gain && 1029 FFABS(2 * best_pitch - pitch) >= 3 && 1030 FFABS(3 * best_pitch - pitch) >= 4 && 1031 FFABS(4 * best_pitch - pitch) >= 5) || 1032 (tmp > .6f * best_pitch_gain && 1033 (FFABS(best_pitch - 2 * pitch) < 3 || 1034 FFABS(best_pitch - 3 * pitch) < 4 || 1035 FFABS(best_pitch - 4 * pitch) < 5)) || 1036 ((.67f * tmp) > best_pitch_gain && 1037 (FFABS(2 * best_pitch - pitch) < 3 || 1038 FFABS(3 * best_pitch - pitch) < 4 || 1039 FFABS(4 * best_pitch - pitch) < 5))) { 1040 best_pitch = pitch; 1041 if (tmp > best_pitch_gain) 1042 best_pitch_gain = tmp; 1043 } 1044 1045 memset(innov, 0, sizeof(innov)); 1046 1047 /* Decode sub-frame gain correction */ 1048 if (SUBMODE(have_subframe_gain) == 3) { 1049 q_energy = get_bits(gb, 3); 1050 ener = exc_gain_quant_scal3[q_energy] * ol_gain; 1051 } else if (SUBMODE(have_subframe_gain) == 1) { 1052 q_energy = get_bits1(gb); 1053 ener = exc_gain_quant_scal1[q_energy] * ol_gain; 1054 } else { 1055 ener = ol_gain; 1056 } 1057 1058 av_assert0(SUBMODE(innovation_unquant)); 1059 /* Fixed codebook contribution */ 1060 SUBMODE(innovation_unquant)(innov, SUBMODE(innovation_params), NB_SUBFRAME_SIZE, gb, &st->seed); 1061 /* De-normalize innovation and update excitation */ 1062 1063 signal_mul(innov, innov, ener, NB_SUBFRAME_SIZE); 1064 1065 /* Decode second codebook (only for some modes) */ 1066 if (SUBMODE(double_codebook)) { 1067 float innov2[NB_SUBFRAME_SIZE] = { 0 }; 1068 1069 SUBMODE(innovation_unquant)(innov2, SUBMODE(innovation_params), NB_SUBFRAME_SIZE, gb, &st->seed); 1070 signal_mul(innov2, innov2, 0.454545f * ener, NB_SUBFRAME_SIZE); 1071 for (int i = 0; i < NB_SUBFRAME_SIZE; i++) 1072 innov[i] += innov2[i]; 1073 } 1074 for (int i = 0; i < NB_SUBFRAME_SIZE; i++) 1075 exc[i] = exc32[i] + innov[i]; 1076 if (innov_save) 1077 memcpy(innov_save, innov, sizeof(innov)); 1078 1079 /* Vocoder mode */ 1080 if (st->submodeID == 1) { 1081 float g = ol_pitch_coef; 1082 1083 g = av_clipf(1.5f * (g - .2f), 0.f, 1.f); 1084 1085 SPEEX_MEMSET(exc, 0, NB_SUBFRAME_SIZE); 1086 while (st->voc_offset < NB_SUBFRAME_SIZE) { 1087 if (st->voc_offset >= 0) 1088 exc[st->voc_offset] = sqrtf(2.f * ol_pitch) * (g * ol_gain); 1089 st->voc_offset += ol_pitch; 1090 } 1091 st->voc_offset -= NB_SUBFRAME_SIZE; 1092 1093 for (int i = 0; i < NB_SUBFRAME_SIZE; i++) { 1094 float exci = exc[i]; 1095 exc[i] = (.7f * exc[i] + .3f * st->voc_m1) + ((1.f - .85f * g) * innov[i]) - .15f * g * st->voc_m2; 1096 st->voc_m1 = exci; 1097 st->voc_m2 = innov[i]; 1098 st->voc_mean = .8f * st->voc_mean + .2f * exc[i]; 1099 exc[i] -= st->voc_mean; 1100 } 1101 } 1102 } 1103 1104 if (st->lpc_enh_enabled && SUBMODE(comb_gain) > 0 && !st->count_lost) { 1105 multicomb(st->exc - NB_SUBFRAME_SIZE, out, st->interp_qlpc, NB_ORDER, 1106 2 * NB_SUBFRAME_SIZE, best_pitch, 40, SUBMODE(comb_gain)); 1107 multicomb(st->exc + NB_SUBFRAME_SIZE, out + 2 * NB_SUBFRAME_SIZE, 1108 st->interp_qlpc, NB_ORDER, 2 * NB_SUBFRAME_SIZE, best_pitch, 40, 1109 SUBMODE(comb_gain)); 1110 } else { 1111 SPEEX_COPY(out, &st->exc[-NB_SUBFRAME_SIZE], NB_FRAME_SIZE); 1112 } 1113 1114 /* If the last packet was lost, re-scale the excitation to obtain the same 1115 * energy as encoded in ol_gain */ 1116 if (st->count_lost) { 1117 float exc_ener, gain; 1118 1119 exc_ener = compute_rms(st->exc, NB_FRAME_SIZE); 1120 av_assert0(exc_ener + 1.f > 0.f); 1121 gain = fminf(ol_gain / (exc_ener + 1.f), 2.f); 1122 for (int i = 0; i < NB_FRAME_SIZE; i++) { 1123 st->exc[i] *= gain; 1124 out[i] = st->exc[i - NB_SUBFRAME_SIZE]; 1125 } 1126 } 1127 1128 for (int sub = 0; sub < NB_NB_SUBFRAMES; sub++) { /* Loop on subframes */ 1129 const int offset = NB_SUBFRAME_SIZE * sub; /* Offset relative to start of frame */ 1130 float pi_g = 1.f, *sp = out + offset; /* Original signal */ 1131 1132 lsp_interpolate(st->old_qlsp, qlsp, interp_qlsp, NB_ORDER, sub, NB_NB_SUBFRAMES, 0.002f); 1133 lsp_to_lpc(interp_qlsp, ak, NB_ORDER); /* Compute interpolated LPCs (unquantized) */ 1134 1135 for (int i = 0; i < NB_ORDER; i += 2) /* Compute analysis filter at w=pi */ 1136 pi_g += ak[i + 1] - ak[i]; 1137 st->pi_gain[sub] = pi_g; 1138 st->exc_rms[sub] = compute_rms(st->exc + offset, NB_SUBFRAME_SIZE); 1139 1140 iir_mem(sp, st->interp_qlpc, sp, NB_SUBFRAME_SIZE, NB_ORDER, st->mem_sp); 1141 1142 memcpy(st->interp_qlpc, ak, sizeof(st->interp_qlpc)); 1143 } 1144 1145 if (st->highpass_enabled) 1146 highpass(out, out, NB_FRAME_SIZE, st->mem_hp, st->is_wideband); 1147 1148 /* Store the LSPs for interpolation in the next frame */ 1149 memcpy(st->old_qlsp, qlsp, sizeof(st->old_qlsp)); 1150 1151 st->count_lost = 0; 1152 st->last_pitch = best_pitch; 1153 st->last_pitch_gain = .25f * pitch_average; 1154 st->last_ol_gain = ol_gain; 1155 st->first = 0; 1156 1157 return 0; 1158} 1159 1160static void qmf_synth(const float *x1, const float *x2, const float *a, float *y, int N, int M, float *mem1, float *mem2) 1161{ 1162 const int M2 = M >> 1, N2 = N >> 1; 1163 float xx1[352], xx2[352]; 1164 1165 for (int i = 0; i < N2; i++) 1166 xx1[i] = x1[N2-1-i]; 1167 for (int i = 0; i < M2; i++) 1168 xx1[N2+i] = mem1[2*i+1]; 1169 for (int i = 0; i < N2; i++) 1170 xx2[i] = x2[N2-1-i]; 1171 for (int i = 0; i < M2; i++) 1172 xx2[N2+i] = mem2[2*i+1]; 1173 1174 for (int i = 0; i < N2; i += 2) { 1175 float y0, y1, y2, y3; 1176 float x10, x20; 1177 1178 y0 = y1 = y2 = y3 = 0.f; 1179 x10 = xx1[N2-2-i]; 1180 x20 = xx2[N2-2-i]; 1181 1182 for (int j = 0; j < M2; j += 2) { 1183 float x11, x21; 1184 float a0, a1; 1185 1186 a0 = a[2*j]; 1187 a1 = a[2*j+1]; 1188 x11 = xx1[N2-1+j-i]; 1189 x21 = xx2[N2-1+j-i]; 1190 1191 y0 += a0 * (x11-x21); 1192 y1 += a1 * (x11+x21); 1193 y2 += a0 * (x10-x20); 1194 y3 += a1 * (x10+x20); 1195 a0 = a[2*j+2]; 1196 a1 = a[2*j+3]; 1197 x10 = xx1[N2+j-i]; 1198 x20 = xx2[N2+j-i]; 1199 1200 y0 += a0 * (x10-x20); 1201 y1 += a1 * (x10+x20); 1202 y2 += a0 * (x11-x21); 1203 y3 += a1 * (x11+x21); 1204 } 1205 y[2 * i ] = 2.f * y0; 1206 y[2 * i+1] = 2.f * y1; 1207 y[2 * i+2] = 2.f * y2; 1208 y[2 * i+3] = 2.f * y3; 1209 } 1210 1211 for (int i = 0; i < M2; i++) 1212 mem1[2*i+1] = xx1[i]; 1213 for (int i = 0; i < M2; i++) 1214 mem2[2*i+1] = xx2[i]; 1215} 1216 1217static int sb_decode(AVCodecContext *avctx, void *ptr_st, 1218 GetBitContext *gb, float *out) 1219{ 1220 SpeexContext *s = avctx->priv_data; 1221 DecoderState *st = ptr_st; 1222 float low_pi_gain[NB_NB_SUBFRAMES]; 1223 float low_exc_rms[NB_NB_SUBFRAMES]; 1224 float interp_qlsp[NB_ORDER]; 1225 int ret, wideband; 1226 float *low_innov_alias; 1227 float qlsp[NB_ORDER]; 1228 float ak[NB_ORDER]; 1229 const SpeexMode *mode; 1230 1231 mode = st->mode; 1232 1233 if (st->modeID > 0) { 1234 low_innov_alias = out + st->frame_size; 1235 s->st[st->modeID - 1].innov_save = low_innov_alias; 1236 ret = speex_modes[st->modeID - 1].decode(avctx, &s->st[st->modeID - 1], gb, out); 1237 if (ret < 0) 1238 return ret; 1239 } 1240 1241 if (st->encode_submode) { /* Check "wideband bit" */ 1242 if (get_bits_left(gb) > 0) 1243 wideband = show_bits1(gb); 1244 else 1245 wideband = 0; 1246 if (wideband) { /* Regular wideband frame, read the submode */ 1247 wideband = get_bits1(gb); 1248 st->submodeID = get_bits(gb, SB_SUBMODE_BITS); 1249 } else { /* Was a narrowband frame, set "null submode" */ 1250 st->submodeID = 0; 1251 } 1252 if (st->submodeID != 0 && st->submodes[st->submodeID] == NULL) 1253 return AVERROR_INVALIDDATA; 1254 } 1255 1256 /* If null mode (no transmission), just set a couple things to zero */ 1257 if (st->submodes[st->submodeID] == NULL) { 1258 for (int i = 0; i < st->frame_size; i++) 1259 out[st->frame_size + i] = 1e-15f; 1260 1261 st->first = 1; 1262 1263 /* Final signal synthesis from excitation */ 1264 iir_mem(out + st->frame_size, st->interp_qlpc, out + st->frame_size, st->frame_size, st->lpc_size, st->mem_sp); 1265 1266 qmf_synth(out, out + st->frame_size, h0, out, st->full_frame_size, QMF_ORDER, st->g0_mem, st->g1_mem); 1267 1268 return 0; 1269 } 1270 1271 memcpy(low_pi_gain, s->st[st->modeID - 1].pi_gain, sizeof(low_pi_gain)); 1272 memcpy(low_exc_rms, s->st[st->modeID - 1].exc_rms, sizeof(low_exc_rms)); 1273 1274 SUBMODE(lsp_unquant)(qlsp, st->lpc_size, gb); 1275 1276 if (st->first) 1277 memcpy(st->old_qlsp, qlsp, sizeof(st->old_qlsp)); 1278 1279 for (int sub = 0; sub < st->nb_subframes; sub++) { 1280 float filter_ratio, el, rl, rh; 1281 float *innov_save = NULL, *sp; 1282 float exc[80]; 1283 int offset; 1284 1285 offset = st->subframe_size * sub; 1286 sp = out + st->frame_size + offset; 1287 /* Pointer for saving innovation */ 1288 if (st->innov_save) { 1289 innov_save = st->innov_save + 2 * offset; 1290 SPEEX_MEMSET(innov_save, 0, 2 * st->subframe_size); 1291 } 1292 1293 av_assert0(st->nb_subframes > 0); 1294 lsp_interpolate(st->old_qlsp, qlsp, interp_qlsp, st->lpc_size, sub, st->nb_subframes, 0.05f); 1295 lsp_to_lpc(interp_qlsp, ak, st->lpc_size); 1296 1297 /* Calculate reponse ratio between the low and high filter in the middle 1298 of the band (4000 Hz) */ 1299 st->pi_gain[sub] = 1.f; 1300 rh = 1.f; 1301 for (int i = 0; i < st->lpc_size; i += 2) { 1302 rh += ak[i + 1] - ak[i]; 1303 st->pi_gain[sub] += ak[i] + ak[i + 1]; 1304 } 1305 1306 rl = low_pi_gain[sub]; 1307 filter_ratio = (rl + .01f) / (rh + .01f); 1308 1309 SPEEX_MEMSET(exc, 0, st->subframe_size); 1310 if (!SUBMODE(innovation_unquant)) { 1311 const int x = get_bits(gb, 5); 1312 const float g = expf(.125f * (x - 10)) / filter_ratio; 1313 1314 for (int i = 0; i < st->subframe_size; i += 2) { 1315 exc[i ] = mode->folding_gain * low_innov_alias[offset + i ] * g; 1316 exc[i + 1] = -mode->folding_gain * low_innov_alias[offset + i + 1] * g; 1317 } 1318 } else { 1319 float gc, scale; 1320 1321 el = low_exc_rms[sub]; 1322 gc = 0.87360f * gc_quant_bound[get_bits(gb, 4)]; 1323 1324 if (st->subframe_size == 80) 1325 gc *= M_SQRT2; 1326 1327 scale = (gc * el) / filter_ratio; 1328 SUBMODE(innovation_unquant) 1329 (exc, SUBMODE(innovation_params), st->subframe_size, 1330 gb, &st->seed); 1331 1332 signal_mul(exc, exc, scale, st->subframe_size); 1333 if (SUBMODE(double_codebook)) { 1334 float innov2[80]; 1335 1336 SPEEX_MEMSET(innov2, 0, st->subframe_size); 1337 SUBMODE(innovation_unquant)(innov2, SUBMODE(innovation_params), st->subframe_size, gb, &st->seed); 1338 signal_mul(innov2, innov2, 0.4f * scale, st->subframe_size); 1339 for (int i = 0; i < st->subframe_size; i++) 1340 exc[i] += innov2[i]; 1341 } 1342 } 1343 1344 if (st->innov_save) { 1345 for (int i = 0; i < st->subframe_size; i++) 1346 innov_save[2 * i] = exc[i]; 1347 } 1348 1349 iir_mem(st->exc_buf, st->interp_qlpc, sp, st->subframe_size, st->lpc_size, st->mem_sp); 1350 memcpy(st->exc_buf, exc, sizeof(exc)); 1351 memcpy(st->interp_qlpc, ak, sizeof(st->interp_qlpc)); 1352 st->exc_rms[sub] = compute_rms(st->exc_buf, st->subframe_size); 1353 } 1354 1355 qmf_synth(out, out + st->frame_size, h0, out, st->full_frame_size, QMF_ORDER, st->g0_mem, st->g1_mem); 1356 memcpy(st->old_qlsp, qlsp, sizeof(st->old_qlsp)); 1357 1358 st->first = 0; 1359 1360 return 0; 1361} 1362 1363static int decoder_init(SpeexContext *s, DecoderState *st, const SpeexMode *mode) 1364{ 1365 st->mode = mode; 1366 st->modeID = mode->modeID; 1367 1368 st->first = 1; 1369 st->encode_submode = 1; 1370 st->is_wideband = st->modeID > 0; 1371 st->innov_save = NULL; 1372 1373 st->submodes = mode->submodes; 1374 st->submodeID = mode->default_submode; 1375 st->subframe_size = mode->subframe_size; 1376 st->lpc_size = mode->lpc_size; 1377 st->full_frame_size = (1 + (st->modeID > 0)) * mode->frame_size; 1378 st->nb_subframes = mode->frame_size / mode->subframe_size; 1379 st->frame_size = mode->frame_size; 1380 1381 st->lpc_enh_enabled = 1; 1382 1383 st->last_pitch = 40; 1384 st->count_lost = 0; 1385 st->seed = 1000; 1386 st->last_ol_gain = 0; 1387 1388 st->voc_m1 = st->voc_m2 = st->voc_mean = 0; 1389 st->voc_offset = 0; 1390 st->dtx_enabled = 0; 1391 st->highpass_enabled = mode->modeID == 0; 1392 1393 return 0; 1394} 1395 1396static int parse_speex_extradata(AVCodecContext *avctx, 1397 const uint8_t *extradata, int extradata_size) 1398{ 1399 SpeexContext *s = avctx->priv_data; 1400 const uint8_t *buf = extradata; 1401 1402 if (memcmp(buf, "Speex ", 8)) 1403 return AVERROR_INVALIDDATA; 1404 1405 buf += 28; 1406 1407 s->version_id = bytestream_get_le32(&buf); 1408 buf += 4; 1409 s->rate = bytestream_get_le32(&buf); 1410 if (s->rate <= 0) 1411 return AVERROR_INVALIDDATA; 1412 s->mode = bytestream_get_le32(&buf); 1413 if (s->mode < 0 || s->mode >= SPEEX_NB_MODES) 1414 return AVERROR_INVALIDDATA; 1415 s->bitstream_version = bytestream_get_le32(&buf); 1416 if (s->bitstream_version != 4) 1417 return AVERROR_INVALIDDATA; 1418 s->nb_channels = bytestream_get_le32(&buf); 1419 if (s->nb_channels <= 0 || s->nb_channels > 2) 1420 return AVERROR_INVALIDDATA; 1421 s->bitrate = bytestream_get_le32(&buf); 1422 s->frame_size = bytestream_get_le32(&buf); 1423 if (s->frame_size < NB_FRAME_SIZE << s->mode) 1424 return AVERROR_INVALIDDATA; 1425 s->vbr = bytestream_get_le32(&buf); 1426 s->frames_per_packet = bytestream_get_le32(&buf); 1427 if (s->frames_per_packet <= 0 || 1428 s->frames_per_packet > 64 || 1429 s->frames_per_packet >= INT32_MAX / s->nb_channels / s->frame_size) 1430 return AVERROR_INVALIDDATA; 1431 s->extra_headers = bytestream_get_le32(&buf); 1432 1433 return 0; 1434} 1435 1436static av_cold int speex_decode_init(AVCodecContext *avctx) 1437{ 1438 SpeexContext *s = avctx->priv_data; 1439 int ret; 1440 1441 s->fdsp = avpriv_float_dsp_alloc(0); 1442 if (!s->fdsp) 1443 return AVERROR(ENOMEM); 1444 1445 if (avctx->extradata && avctx->extradata_size >= 80) { 1446 ret = parse_speex_extradata(avctx, avctx->extradata, avctx->extradata_size); 1447 if (ret < 0) 1448 return ret; 1449 } else { 1450 s->rate = avctx->sample_rate; 1451 if (s->rate <= 0) 1452 return AVERROR_INVALIDDATA; 1453 1454 s->nb_channels = avctx->ch_layout.nb_channels; 1455 if (s->nb_channels <= 0 || s->nb_channels > 2) 1456 return AVERROR_INVALIDDATA; 1457 1458 switch (s->rate) { 1459 case 8000: s->mode = 0; break; 1460 case 16000: s->mode = 1; break; 1461 case 32000: s->mode = 2; break; 1462 default: s->mode = 2; 1463 } 1464 1465 s->frames_per_packet = 1; 1466 s->frame_size = NB_FRAME_SIZE << s->mode; 1467 } 1468 1469 if (avctx->codec_tag == MKTAG('S', 'P', 'X', 'N')) { 1470 int quality; 1471 1472 if (!avctx->extradata || avctx->extradata && avctx->extradata_size < 47) { 1473 av_log(avctx, AV_LOG_ERROR, "Missing or invalid extradata.\n"); 1474 return AVERROR_INVALIDDATA; 1475 } 1476 1477 quality = avctx->extradata[37]; 1478 if (quality > 10) { 1479 av_log(avctx, AV_LOG_ERROR, "Unsupported quality mode %d.\n", quality); 1480 return AVERROR_PATCHWELCOME; 1481 } 1482 1483 s->pkt_size = ((const uint8_t[]){ 5, 10, 15, 20, 20, 28, 28, 38, 38, 46, 62 })[quality]; 1484 1485 s->mode = 0; 1486 s->nb_channels = 1; 1487 s->rate = avctx->sample_rate; 1488 if (s->rate <= 0) 1489 return AVERROR_INVALIDDATA; 1490 s->frames_per_packet = 1; 1491 s->frame_size = NB_FRAME_SIZE; 1492 } 1493 1494 if (s->bitrate > 0) 1495 avctx->bit_rate = s->bitrate; 1496 av_channel_layout_uninit(&avctx->ch_layout); 1497 avctx->ch_layout.order = AV_CHANNEL_ORDER_UNSPEC; 1498 avctx->ch_layout.nb_channels = s->nb_channels; 1499 avctx->sample_rate = s->rate; 1500 avctx->sample_fmt = AV_SAMPLE_FMT_FLT; 1501 1502 for (int m = 0; m <= s->mode; m++) { 1503 ret = decoder_init(s, &s->st[m], &speex_modes[m]); 1504 if (ret < 0) 1505 return ret; 1506 } 1507 1508 s->stereo.balance = 1.f; 1509 s->stereo.e_ratio = .5f; 1510 s->stereo.smooth_left = 1.f; 1511 s->stereo.smooth_right = 1.f; 1512 1513 return 0; 1514} 1515 1516static void speex_decode_stereo(float *data, int frame_size, StereoState *stereo) 1517{ 1518 float balance, e_left, e_right, e_ratio; 1519 1520 balance = stereo->balance; 1521 e_ratio = stereo->e_ratio; 1522 1523 /* These two are Q14, with max value just below 2. */ 1524 e_right = 1.f / sqrtf(e_ratio * (1.f + balance)); 1525 e_left = sqrtf(balance) * e_right; 1526 1527 for (int i = frame_size - 1; i >= 0; i--) { 1528 float tmp = data[i]; 1529 stereo->smooth_left = stereo->smooth_left * 0.98f + e_left * 0.02f; 1530 stereo->smooth_right = stereo->smooth_right * 0.98f + e_right * 0.02f; 1531 data[2 * i ] = stereo->smooth_left * tmp; 1532 data[2 * i + 1] = stereo->smooth_right * tmp; 1533 } 1534} 1535 1536static int speex_decode_frame(AVCodecContext *avctx, AVFrame *frame, 1537 int *got_frame_ptr, AVPacket *avpkt) 1538{ 1539 SpeexContext *s = avctx->priv_data; 1540 const float scale = 1.f / 32768.f; 1541 int buf_size = avpkt->size; 1542 float *dst; 1543 int ret; 1544 1545 if (s->pkt_size && avpkt->size == 62) 1546 buf_size = s->pkt_size; 1547 if ((ret = init_get_bits8(&s->gb, avpkt->data, buf_size)) < 0) 1548 return ret; 1549 1550 frame->nb_samples = FFALIGN(s->frame_size * s->frames_per_packet, 4); 1551 if ((ret = ff_get_buffer(avctx, frame, 0)) < 0) 1552 return ret; 1553 1554 dst = (float *)frame->extended_data[0]; 1555 for (int i = 0; i < s->frames_per_packet; i++) { 1556 ret = speex_modes[s->mode].decode(avctx, &s->st[s->mode], &s->gb, dst + i * s->frame_size); 1557 if (ret < 0) 1558 return ret; 1559 if (avctx->ch_layout.nb_channels == 2) 1560 speex_decode_stereo(dst + i * s->frame_size, s->frame_size, &s->stereo); 1561 } 1562 1563 dst = (float *)frame->extended_data[0]; 1564 s->fdsp->vector_fmul_scalar(dst, dst, scale, frame->nb_samples * frame->ch_layout.nb_channels); 1565 frame->nb_samples = s->frame_size * s->frames_per_packet; 1566 1567 *got_frame_ptr = 1; 1568 1569 return buf_size; 1570} 1571 1572static av_cold int speex_decode_close(AVCodecContext *avctx) 1573{ 1574 SpeexContext *s = avctx->priv_data; 1575 av_freep(&s->fdsp); 1576 return 0; 1577} 1578 1579const FFCodec ff_speex_decoder = { 1580 .p.name = "speex", 1581 .p.long_name = NULL_IF_CONFIG_SMALL("Speex"), 1582 .p.type = AVMEDIA_TYPE_AUDIO, 1583 .p.id = AV_CODEC_ID_SPEEX, 1584 .init = speex_decode_init, 1585 FF_CODEC_DECODE_CB(speex_decode_frame), 1586 .close = speex_decode_close, 1587 .p.capabilities = AV_CODEC_CAP_DR1 | AV_CODEC_CAP_CHANNEL_CONF, 1588 .priv_data_size = sizeof(SpeexContext), 1589 .caps_internal = FF_CODEC_CAP_INIT_THREADSAFE | FF_CODEC_CAP_INIT_CLEANUP, 1590}; 1591