xref: /third_party/ffmpeg/libavcodec/wmavoice.c (revision cabdff1a)
1/*
2 * Windows Media Audio Voice decoder.
3 * Copyright (c) 2009 Ronald S. Bultje
4 *
5 * This file is part of FFmpeg.
6 *
7 * FFmpeg is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU Lesser General Public
9 * License as published by the Free Software Foundation; either
10 * version 2.1 of the License, or (at your option) any later version.
11 *
12 * FFmpeg is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
15 * Lesser General Public License for more details.
16 *
17 * You should have received a copy of the GNU Lesser General Public
18 * License along with FFmpeg; if not, write to the Free Software
19 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
20 */
21
22/**
23 * @file
24 * @brief Windows Media Audio Voice compatible decoder
25 * @author Ronald S. Bultje <rsbultje@gmail.com>
26 */
27
28#include <math.h>
29
30#include "libavutil/channel_layout.h"
31#include "libavutil/float_dsp.h"
32#include "libavutil/mem_internal.h"
33#include "libavutil/thread.h"
34#include "avcodec.h"
35#include "codec_internal.h"
36#include "internal.h"
37#include "get_bits.h"
38#include "put_bits.h"
39#include "wmavoice_data.h"
40#include "celp_filters.h"
41#include "acelp_vectors.h"
42#include "acelp_filters.h"
43#include "lsp.h"
44#include "dct.h"
45#include "rdft.h"
46#include "sinewin.h"
47
48#define MAX_BLOCKS           8   ///< maximum number of blocks per frame
49#define MAX_LSPS             16  ///< maximum filter order
50#define MAX_LSPS_ALIGN16     16  ///< same as #MAX_LSPS; needs to be multiple
51                                 ///< of 16 for ASM input buffer alignment
52#define MAX_FRAMES           3   ///< maximum number of frames per superframe
53#define MAX_FRAMESIZE        160 ///< maximum number of samples per frame
54#define MAX_SIGNAL_HISTORY   416 ///< maximum excitation signal history
55#define MAX_SFRAMESIZE       (MAX_FRAMESIZE * MAX_FRAMES)
56                                 ///< maximum number of samples per superframe
57#define SFRAME_CACHE_MAXSIZE 256 ///< maximum cache size for frame data that
58                                 ///< was split over two packets
59#define VLC_NBITS            6   ///< number of bits to read per VLC iteration
60
61/**
62 * Frame type VLC coding.
63 */
64static VLC frame_type_vlc;
65
66/**
67 * Adaptive codebook types.
68 */
69enum {
70    ACB_TYPE_NONE       = 0, ///< no adaptive codebook (only hardcoded fixed)
71    ACB_TYPE_ASYMMETRIC = 1, ///< adaptive codebook with per-frame pitch, which
72                             ///< we interpolate to get a per-sample pitch.
73                             ///< Signal is generated using an asymmetric sinc
74                             ///< window function
75                             ///< @note see #wmavoice_ipol1_coeffs
76    ACB_TYPE_HAMMING    = 2  ///< Per-block pitch with signal generation using
77                             ///< a Hamming sinc window function
78                             ///< @note see #wmavoice_ipol2_coeffs
79};
80
81/**
82 * Fixed codebook types.
83 */
84enum {
85    FCB_TYPE_SILENCE    = 0, ///< comfort noise during silence
86                             ///< generated from a hardcoded (fixed) codebook
87                             ///< with per-frame (low) gain values
88    FCB_TYPE_HARDCODED  = 1, ///< hardcoded (fixed) codebook with per-block
89                             ///< gain values
90    FCB_TYPE_AW_PULSES  = 2, ///< Pitch-adaptive window (AW) pulse signals,
91                             ///< used in particular for low-bitrate streams
92    FCB_TYPE_EXC_PULSES = 3, ///< Innovation (fixed) codebook pulse sets in
93                             ///< combinations of either single pulses or
94                             ///< pulse pairs
95};
96
97/**
98 * Description of frame types.
99 */
100static const struct frame_type_desc {
101    uint8_t n_blocks;     ///< amount of blocks per frame (each block
102                          ///< (contains 160/#n_blocks samples)
103    uint8_t log_n_blocks; ///< log2(#n_blocks)
104    uint8_t acb_type;     ///< Adaptive codebook type (ACB_TYPE_*)
105    uint8_t fcb_type;     ///< Fixed codebook type (FCB_TYPE_*)
106    uint8_t dbl_pulses;   ///< how many pulse vectors have pulse pairs
107                          ///< (rather than just one single pulse)
108                          ///< only if #fcb_type == #FCB_TYPE_EXC_PULSES
109} frame_descs[17] = {
110    { 1, 0, ACB_TYPE_NONE,       FCB_TYPE_SILENCE,    0 },
111    { 2, 1, ACB_TYPE_NONE,       FCB_TYPE_HARDCODED,  0 },
112    { 2, 1, ACB_TYPE_ASYMMETRIC, FCB_TYPE_AW_PULSES,  0 },
113    { 2, 1, ACB_TYPE_ASYMMETRIC, FCB_TYPE_EXC_PULSES, 2 },
114    { 2, 1, ACB_TYPE_ASYMMETRIC, FCB_TYPE_EXC_PULSES, 5 },
115    { 4, 2, ACB_TYPE_ASYMMETRIC, FCB_TYPE_EXC_PULSES, 0 },
116    { 4, 2, ACB_TYPE_ASYMMETRIC, FCB_TYPE_EXC_PULSES, 2 },
117    { 4, 2, ACB_TYPE_ASYMMETRIC, FCB_TYPE_EXC_PULSES, 5 },
118    { 2, 1, ACB_TYPE_HAMMING,    FCB_TYPE_EXC_PULSES, 0 },
119    { 2, 1, ACB_TYPE_HAMMING,    FCB_TYPE_EXC_PULSES, 2 },
120    { 2, 1, ACB_TYPE_HAMMING,    FCB_TYPE_EXC_PULSES, 5 },
121    { 4, 2, ACB_TYPE_HAMMING,    FCB_TYPE_EXC_PULSES, 0 },
122    { 4, 2, ACB_TYPE_HAMMING,    FCB_TYPE_EXC_PULSES, 2 },
123    { 4, 2, ACB_TYPE_HAMMING,    FCB_TYPE_EXC_PULSES, 5 },
124    { 8, 3, ACB_TYPE_HAMMING,    FCB_TYPE_EXC_PULSES, 0 },
125    { 8, 3, ACB_TYPE_HAMMING,    FCB_TYPE_EXC_PULSES, 2 },
126    { 8, 3, ACB_TYPE_HAMMING,    FCB_TYPE_EXC_PULSES, 5 }
127};
128
129/**
130 * WMA Voice decoding context.
131 */
132typedef struct WMAVoiceContext {
133    /**
134     * @name Global values specified in the stream header / extradata or used all over.
135     * @{
136     */
137    GetBitContext gb;             ///< packet bitreader. During decoder init,
138                                  ///< it contains the extradata from the
139                                  ///< demuxer. During decoding, it contains
140                                  ///< packet data.
141    int8_t vbm_tree[25];          ///< converts VLC codes to frame type
142
143    int spillover_bitsize;        ///< number of bits used to specify
144                                  ///< #spillover_nbits in the packet header
145                                  ///< = ceil(log2(ctx->block_align << 3))
146    int history_nsamples;         ///< number of samples in history for signal
147                                  ///< prediction (through ACB)
148
149    /* postfilter specific values */
150    int do_apf;                   ///< whether to apply the averaged
151                                  ///< projection filter (APF)
152    int denoise_strength;         ///< strength of denoising in Wiener filter
153                                  ///< [0-11]
154    int denoise_tilt_corr;        ///< Whether to apply tilt correction to the
155                                  ///< Wiener filter coefficients (postfilter)
156    int dc_level;                 ///< Predicted amount of DC noise, based
157                                  ///< on which a DC removal filter is used
158
159    int lsps;                     ///< number of LSPs per frame [10 or 16]
160    int lsp_q_mode;               ///< defines quantizer defaults [0, 1]
161    int lsp_def_mode;             ///< defines different sets of LSP defaults
162                                  ///< [0, 1]
163
164    int min_pitch_val;            ///< base value for pitch parsing code
165    int max_pitch_val;            ///< max value + 1 for pitch parsing
166    int pitch_nbits;              ///< number of bits used to specify the
167                                  ///< pitch value in the frame header
168    int block_pitch_nbits;        ///< number of bits used to specify the
169                                  ///< first block's pitch value
170    int block_pitch_range;        ///< range of the block pitch
171    int block_delta_pitch_nbits;  ///< number of bits used to specify the
172                                  ///< delta pitch between this and the last
173                                  ///< block's pitch value, used in all but
174                                  ///< first block
175    int block_delta_pitch_hrange; ///< 1/2 range of the delta (full range is
176                                  ///< from -this to +this-1)
177    uint16_t block_conv_table[4]; ///< boundaries for block pitch unit/scale
178                                  ///< conversion
179
180    /**
181     * @}
182     *
183     * @name Packet values specified in the packet header or related to a packet.
184     *
185     * A packet is considered to be a single unit of data provided to this
186     * decoder by the demuxer.
187     * @{
188     */
189    int spillover_nbits;          ///< number of bits of the previous packet's
190                                  ///< last superframe preceding this
191                                  ///< packet's first full superframe (useful
192                                  ///< for re-synchronization also)
193    int has_residual_lsps;        ///< if set, superframes contain one set of
194                                  ///< LSPs that cover all frames, encoded as
195                                  ///< independent and residual LSPs; if not
196                                  ///< set, each frame contains its own, fully
197                                  ///< independent, LSPs
198    int skip_bits_next;           ///< number of bits to skip at the next call
199                                  ///< to #wmavoice_decode_packet() (since
200                                  ///< they're part of the previous superframe)
201
202    uint8_t sframe_cache[SFRAME_CACHE_MAXSIZE + AV_INPUT_BUFFER_PADDING_SIZE];
203                                  ///< cache for superframe data split over
204                                  ///< multiple packets
205    int sframe_cache_size;        ///< set to >0 if we have data from an
206                                  ///< (incomplete) superframe from a previous
207                                  ///< packet that spilled over in the current
208                                  ///< packet; specifies the amount of bits in
209                                  ///< #sframe_cache
210    PutBitContext pb;             ///< bitstream writer for #sframe_cache
211
212    /**
213     * @}
214     *
215     * @name Frame and superframe values
216     * Superframe and frame data - these can change from frame to frame,
217     * although some of them do in that case serve as a cache / history for
218     * the next frame or superframe.
219     * @{
220     */
221    double prev_lsps[MAX_LSPS];   ///< LSPs of the last frame of the previous
222                                  ///< superframe
223    int last_pitch_val;           ///< pitch value of the previous frame
224    int last_acb_type;            ///< frame type [0-2] of the previous frame
225    int pitch_diff_sh16;          ///< ((cur_pitch_val - #last_pitch_val)
226                                  ///< << 16) / #MAX_FRAMESIZE
227    float silence_gain;           ///< set for use in blocks if #ACB_TYPE_NONE
228
229    int aw_idx_is_ext;            ///< whether the AW index was encoded in
230                                  ///< 8 bits (instead of 6)
231    int aw_pulse_range;           ///< the range over which #aw_pulse_set1()
232                                  ///< can apply the pulse, relative to the
233                                  ///< value in aw_first_pulse_off. The exact
234                                  ///< position of the first AW-pulse is within
235                                  ///< [pulse_off, pulse_off + this], and
236                                  ///< depends on bitstream values; [16 or 24]
237    int aw_n_pulses[2];           ///< number of AW-pulses in each block; note
238                                  ///< that this number can be negative (in
239                                  ///< which case it basically means "zero")
240    int aw_first_pulse_off[2];    ///< index of first sample to which to
241                                  ///< apply AW-pulses, or -0xff if unset
242    int aw_next_pulse_off_cache;  ///< the position (relative to start of the
243                                  ///< second block) at which pulses should
244                                  ///< start to be positioned, serves as a
245                                  ///< cache for pitch-adaptive window pulses
246                                  ///< between blocks
247
248    int frame_cntr;               ///< current frame index [0 - 0xFFFE]; is
249                                  ///< only used for comfort noise in #pRNG()
250    int nb_superframes;           ///< number of superframes in current packet
251    float gain_pred_err[6];       ///< cache for gain prediction
252    float excitation_history[MAX_SIGNAL_HISTORY];
253                                  ///< cache of the signal of previous
254                                  ///< superframes, used as a history for
255                                  ///< signal generation
256    float synth_history[MAX_LSPS]; ///< see #excitation_history
257    /**
258     * @}
259     *
260     * @name Postfilter values
261     *
262     * Variables used for postfilter implementation, mostly history for
263     * smoothing and so on, and context variables for FFT/iFFT.
264     * @{
265     */
266    RDFTContext rdft, irdft;      ///< contexts for FFT-calculation in the
267                                  ///< postfilter (for denoise filter)
268    DCTContext dct, dst;          ///< contexts for phase shift (in Hilbert
269                                  ///< transform, part of postfilter)
270    float sin[511], cos[511];     ///< 8-bit cosine/sine windows over [-pi,pi]
271                                  ///< range
272    float postfilter_agc;         ///< gain control memory, used in
273                                  ///< #adaptive_gain_control()
274    float dcf_mem[2];             ///< DC filter history
275    float zero_exc_pf[MAX_SIGNAL_HISTORY + MAX_SFRAMESIZE];
276                                  ///< zero filter output (i.e. excitation)
277                                  ///< by postfilter
278    float denoise_filter_cache[MAX_FRAMESIZE];
279    int   denoise_filter_cache_size; ///< samples in #denoise_filter_cache
280    DECLARE_ALIGNED(32, float, tilted_lpcs_pf)[0x80];
281                                  ///< aligned buffer for LPC tilting
282    DECLARE_ALIGNED(32, float, denoise_coeffs_pf)[0x80];
283                                  ///< aligned buffer for denoise coefficients
284    DECLARE_ALIGNED(32, float, synth_filter_out_buf)[0x80 + MAX_LSPS_ALIGN16];
285                                  ///< aligned buffer for postfilter speech
286                                  ///< synthesis
287    /**
288     * @}
289     */
290} WMAVoiceContext;
291
292/**
293 * Set up the variable bit mode (VBM) tree from container extradata.
294 * @param gb bit I/O context.
295 *           The bit context (s->gb) should be loaded with byte 23-46 of the
296 *           container extradata (i.e. the ones containing the VBM tree).
297 * @param vbm_tree pointer to array to which the decoded VBM tree will be
298 *                 written.
299 * @return 0 on success, <0 on error.
300 */
301static av_cold int decode_vbmtree(GetBitContext *gb, int8_t vbm_tree[25])
302{
303    int cntr[8] = { 0 }, n, res;
304
305    memset(vbm_tree, 0xff, sizeof(vbm_tree[0]) * 25);
306    for (n = 0; n < 17; n++) {
307        res = get_bits(gb, 3);
308        if (cntr[res] > 3) // should be >= 3 + (res == 7))
309            return -1;
310        vbm_tree[res * 3 + cntr[res]++] = n;
311    }
312    return 0;
313}
314
315static av_cold void wmavoice_init_static_data(void)
316{
317    static const uint8_t bits[] = {
318         2,  2,  2,  4,  4,  4,
319         6,  6,  6,  8,  8,  8,
320        10, 10, 10, 12, 12, 12,
321        14, 14, 14, 14
322    };
323    static const uint16_t codes[] = {
324          0x0000, 0x0001, 0x0002,        //              00/01/10
325          0x000c, 0x000d, 0x000e,        //           11+00/01/10
326          0x003c, 0x003d, 0x003e,        //         1111+00/01/10
327          0x00fc, 0x00fd, 0x00fe,        //       111111+00/01/10
328          0x03fc, 0x03fd, 0x03fe,        //     11111111+00/01/10
329          0x0ffc, 0x0ffd, 0x0ffe,        //   1111111111+00/01/10
330          0x3ffc, 0x3ffd, 0x3ffe, 0x3fff // 111111111111+xx
331    };
332
333    INIT_VLC_STATIC(&frame_type_vlc, VLC_NBITS, sizeof(bits),
334                    bits, 1, 1, codes, 2, 2, 132);
335}
336
337static av_cold void wmavoice_flush(AVCodecContext *ctx)
338{
339    WMAVoiceContext *s = ctx->priv_data;
340    int n;
341
342    s->postfilter_agc    = 0;
343    s->sframe_cache_size = 0;
344    s->skip_bits_next    = 0;
345    for (n = 0; n < s->lsps; n++)
346        s->prev_lsps[n] = M_PI * (n + 1.0) / (s->lsps + 1.0);
347    memset(s->excitation_history, 0,
348           sizeof(*s->excitation_history) * MAX_SIGNAL_HISTORY);
349    memset(s->synth_history,      0,
350           sizeof(*s->synth_history)      * MAX_LSPS);
351    memset(s->gain_pred_err,      0,
352           sizeof(s->gain_pred_err));
353
354    if (s->do_apf) {
355        memset(&s->synth_filter_out_buf[MAX_LSPS_ALIGN16 - s->lsps], 0,
356               sizeof(*s->synth_filter_out_buf) * s->lsps);
357        memset(s->dcf_mem,              0,
358               sizeof(*s->dcf_mem)              * 2);
359        memset(s->zero_exc_pf,          0,
360               sizeof(*s->zero_exc_pf)          * s->history_nsamples);
361        memset(s->denoise_filter_cache, 0, sizeof(s->denoise_filter_cache));
362    }
363}
364
365/**
366 * Set up decoder with parameters from demuxer (extradata etc.).
367 */
368static av_cold int wmavoice_decode_init(AVCodecContext *ctx)
369{
370    static AVOnce init_static_once = AV_ONCE_INIT;
371    int n, flags, pitch_range, lsp16_flag, ret;
372    WMAVoiceContext *s = ctx->priv_data;
373
374    ff_thread_once(&init_static_once, wmavoice_init_static_data);
375
376    /**
377     * Extradata layout:
378     * - byte  0-18: WMAPro-in-WMAVoice extradata (see wmaprodec.c),
379     * - byte 19-22: flags field (annoyingly in LE; see below for known
380     *               values),
381     * - byte 23-46: variable bitmode tree (really just 17 * 3 bits,
382     *               rest is 0).
383     */
384    if (ctx->extradata_size != 46) {
385        av_log(ctx, AV_LOG_ERROR,
386               "Invalid extradata size %d (should be 46)\n",
387               ctx->extradata_size);
388        return AVERROR_INVALIDDATA;
389    }
390    if (ctx->block_align <= 0 || ctx->block_align > (1<<22)) {
391        av_log(ctx, AV_LOG_ERROR, "Invalid block alignment %d.\n", ctx->block_align);
392        return AVERROR_INVALIDDATA;
393    }
394
395    flags                = AV_RL32(ctx->extradata + 18);
396    s->spillover_bitsize = 3 + av_ceil_log2(ctx->block_align);
397    s->do_apf            =    flags & 0x1;
398    if (s->do_apf) {
399        if ((ret = ff_rdft_init(&s->rdft,  7,  DFT_R2C)) < 0 ||
400            (ret = ff_rdft_init(&s->irdft, 7, IDFT_C2R)) < 0 ||
401            (ret = ff_dct_init (&s->dct,   6,    DCT_I)) < 0 ||
402            (ret = ff_dct_init (&s->dst,   6,    DST_I)) < 0)
403            return ret;
404
405        ff_sine_window_init(s->cos, 256);
406        memcpy(&s->sin[255], s->cos, 256 * sizeof(s->cos[0]));
407        for (n = 0; n < 255; n++) {
408            s->sin[n]       = -s->sin[510 - n];
409            s->cos[510 - n] =  s->cos[n];
410        }
411    }
412    s->denoise_strength  =   (flags >> 2) & 0xF;
413    if (s->denoise_strength >= 12) {
414        av_log(ctx, AV_LOG_ERROR,
415               "Invalid denoise filter strength %d (max=11)\n",
416               s->denoise_strength);
417        return AVERROR_INVALIDDATA;
418    }
419    s->denoise_tilt_corr = !!(flags & 0x40);
420    s->dc_level          =   (flags >> 7) & 0xF;
421    s->lsp_q_mode        = !!(flags & 0x2000);
422    s->lsp_def_mode      = !!(flags & 0x4000);
423    lsp16_flag           =    flags & 0x1000;
424    if (lsp16_flag) {
425        s->lsps               = 16;
426    } else {
427        s->lsps               = 10;
428    }
429    for (n = 0; n < s->lsps; n++)
430        s->prev_lsps[n] = M_PI * (n + 1.0) / (s->lsps + 1.0);
431
432    init_get_bits(&s->gb, ctx->extradata + 22, (ctx->extradata_size - 22) << 3);
433    if (decode_vbmtree(&s->gb, s->vbm_tree) < 0) {
434        av_log(ctx, AV_LOG_ERROR, "Invalid VBM tree; broken extradata?\n");
435        return AVERROR_INVALIDDATA;
436    }
437
438    if (ctx->sample_rate >= INT_MAX / (256 * 37))
439        return AVERROR_INVALIDDATA;
440
441    s->min_pitch_val    = ((ctx->sample_rate << 8)      /  400 + 50) >> 8;
442    s->max_pitch_val    = ((ctx->sample_rate << 8) * 37 / 2000 + 50) >> 8;
443    pitch_range         = s->max_pitch_val - s->min_pitch_val;
444    if (pitch_range <= 0) {
445        av_log(ctx, AV_LOG_ERROR, "Invalid pitch range; broken extradata?\n");
446        return AVERROR_INVALIDDATA;
447    }
448    s->pitch_nbits      = av_ceil_log2(pitch_range);
449    s->last_pitch_val   = 40;
450    s->last_acb_type    = ACB_TYPE_NONE;
451    s->history_nsamples = s->max_pitch_val + 8;
452
453    if (s->min_pitch_val < 1 || s->history_nsamples > MAX_SIGNAL_HISTORY) {
454        int min_sr = ((((1 << 8) - 50) * 400) + 0xFF) >> 8,
455            max_sr = ((((MAX_SIGNAL_HISTORY - 8) << 8) + 205) * 2000 / 37) >> 8;
456
457        av_log(ctx, AV_LOG_ERROR,
458               "Unsupported samplerate %d (min=%d, max=%d)\n",
459               ctx->sample_rate, min_sr, max_sr); // 322-22097 Hz
460
461        return AVERROR(ENOSYS);
462    }
463
464    s->block_conv_table[0]      = s->min_pitch_val;
465    s->block_conv_table[1]      = (pitch_range * 25) >> 6;
466    s->block_conv_table[2]      = (pitch_range * 44) >> 6;
467    s->block_conv_table[3]      = s->max_pitch_val - 1;
468    s->block_delta_pitch_hrange = (pitch_range >> 3) & ~0xF;
469    if (s->block_delta_pitch_hrange <= 0) {
470        av_log(ctx, AV_LOG_ERROR, "Invalid delta pitch hrange; broken extradata?\n");
471        return AVERROR_INVALIDDATA;
472    }
473    s->block_delta_pitch_nbits  = 1 + av_ceil_log2(s->block_delta_pitch_hrange);
474    s->block_pitch_range        = s->block_conv_table[2] +
475                                  s->block_conv_table[3] + 1 +
476                                  2 * (s->block_conv_table[1] - 2 * s->min_pitch_val);
477    s->block_pitch_nbits        = av_ceil_log2(s->block_pitch_range);
478
479    av_channel_layout_uninit(&ctx->ch_layout);
480    ctx->ch_layout = (AVChannelLayout)AV_CHANNEL_LAYOUT_MONO;
481    ctx->sample_fmt             = AV_SAMPLE_FMT_FLT;
482
483    return 0;
484}
485
486/**
487 * @name Postfilter functions
488 * Postfilter functions (gain control, wiener denoise filter, DC filter,
489 * kalman smoothening, plus surrounding code to wrap it)
490 * @{
491 */
492/**
493 * Adaptive gain control (as used in postfilter).
494 *
495 * Identical to #ff_adaptive_gain_control() in acelp_vectors.c, except
496 * that the energy here is calculated using sum(abs(...)), whereas the
497 * other codecs (e.g. AMR-NB, SIPRO) use sqrt(dotproduct(...)).
498 *
499 * @param out output buffer for filtered samples
500 * @param in input buffer containing the samples as they are after the
501 *           postfilter steps so far
502 * @param speech_synth input buffer containing speech synth before postfilter
503 * @param size input buffer size
504 * @param alpha exponential filter factor
505 * @param gain_mem pointer to filter memory (single float)
506 */
507static void adaptive_gain_control(float *out, const float *in,
508                                  const float *speech_synth,
509                                  int size, float alpha, float *gain_mem)
510{
511    int i;
512    float speech_energy = 0.0, postfilter_energy = 0.0, gain_scale_factor;
513    float mem = *gain_mem;
514
515    for (i = 0; i < size; i++) {
516        speech_energy     += fabsf(speech_synth[i]);
517        postfilter_energy += fabsf(in[i]);
518    }
519    gain_scale_factor = postfilter_energy == 0.0 ? 0.0 :
520                        (1.0 - alpha) * speech_energy / postfilter_energy;
521
522    for (i = 0; i < size; i++) {
523        mem = alpha * mem + gain_scale_factor;
524        out[i] = in[i] * mem;
525    }
526
527    *gain_mem = mem;
528}
529
530/**
531 * Kalman smoothing function.
532 *
533 * This function looks back pitch +/- 3 samples back into history to find
534 * the best fitting curve (that one giving the optimal gain of the two
535 * signals, i.e. the highest dot product between the two), and then
536 * uses that signal history to smoothen the output of the speech synthesis
537 * filter.
538 *
539 * @param s WMA Voice decoding context
540 * @param pitch pitch of the speech signal
541 * @param in input speech signal
542 * @param out output pointer for smoothened signal
543 * @param size input/output buffer size
544 *
545 * @returns -1 if no smoothening took place, e.g. because no optimal
546 *          fit could be found, or 0 on success.
547 */
548static int kalman_smoothen(WMAVoiceContext *s, int pitch,
549                           const float *in, float *out, int size)
550{
551    int n;
552    float optimal_gain = 0, dot;
553    const float *ptr = &in[-FFMAX(s->min_pitch_val, pitch - 3)],
554                *end = &in[-FFMIN(s->max_pitch_val, pitch + 3)],
555                *best_hist_ptr = NULL;
556
557    /* find best fitting point in history */
558    do {
559        dot = avpriv_scalarproduct_float_c(in, ptr, size);
560        if (dot > optimal_gain) {
561            optimal_gain  = dot;
562            best_hist_ptr = ptr;
563        }
564    } while (--ptr >= end);
565
566    if (optimal_gain <= 0)
567        return -1;
568    dot = avpriv_scalarproduct_float_c(best_hist_ptr, best_hist_ptr, size);
569    if (dot <= 0) // would be 1.0
570        return -1;
571
572    if (optimal_gain <= dot) {
573        dot = dot / (dot + 0.6 * optimal_gain); // 0.625-1.000
574    } else
575        dot = 0.625;
576
577    /* actual smoothing */
578    for (n = 0; n < size; n++)
579        out[n] = best_hist_ptr[n] + dot * (in[n] - best_hist_ptr[n]);
580
581    return 0;
582}
583
584/**
585 * Get the tilt factor of a formant filter from its transfer function
586 * @see #tilt_factor() in amrnbdec.c, which does essentially the same,
587 *      but somehow (??) it does a speech synthesis filter in the
588 *      middle, which is missing here
589 *
590 * @param lpcs LPC coefficients
591 * @param n_lpcs Size of LPC buffer
592 * @returns the tilt factor
593 */
594static float tilt_factor(const float *lpcs, int n_lpcs)
595{
596    float rh0, rh1;
597
598    rh0 = 1.0     + avpriv_scalarproduct_float_c(lpcs,  lpcs,    n_lpcs);
599    rh1 = lpcs[0] + avpriv_scalarproduct_float_c(lpcs, &lpcs[1], n_lpcs - 1);
600
601    return rh1 / rh0;
602}
603
604/**
605 * Derive denoise filter coefficients (in real domain) from the LPCs.
606 */
607static void calc_input_response(WMAVoiceContext *s, float *lpcs,
608                                int fcb_type, float *coeffs, int remainder)
609{
610    float last_coeff, min = 15.0, max = -15.0;
611    float irange, angle_mul, gain_mul, range, sq;
612    int n, idx;
613
614    /* Create frequency power spectrum of speech input (i.e. RDFT of LPCs) */
615    s->rdft.rdft_calc(&s->rdft, lpcs);
616#define log_range(var, assign) do { \
617        float tmp = log10f(assign);  var = tmp; \
618        max       = FFMAX(max, tmp); min = FFMIN(min, tmp); \
619    } while (0)
620    log_range(last_coeff,  lpcs[1]         * lpcs[1]);
621    for (n = 1; n < 64; n++)
622        log_range(lpcs[n], lpcs[n * 2]     * lpcs[n * 2] +
623                           lpcs[n * 2 + 1] * lpcs[n * 2 + 1]);
624    log_range(lpcs[0],     lpcs[0]         * lpcs[0]);
625#undef log_range
626    range    = max - min;
627    lpcs[64] = last_coeff;
628
629    /* Now, use this spectrum to pick out these frequencies with higher
630     * (relative) power/energy (which we then take to be "not noise"),
631     * and set up a table (still in lpc[]) of (relative) gains per frequency.
632     * These frequencies will be maintained, while others ("noise") will be
633     * decreased in the filter output. */
634    irange    = 64.0 / range; // so irange*(max-value) is in the range [0, 63]
635    gain_mul  = range * (fcb_type == FCB_TYPE_HARDCODED ? (5.0 / 13.0) :
636                                                          (5.0 / 14.7));
637    angle_mul = gain_mul * (8.0 * M_LN10 / M_PI);
638    for (n = 0; n <= 64; n++) {
639        float pwr;
640
641        idx = lrint((max - lpcs[n]) * irange - 1);
642        idx = FFMAX(0, idx);
643        pwr = wmavoice_denoise_power_table[s->denoise_strength][idx];
644        lpcs[n] = angle_mul * pwr;
645
646        /* 70.57 =~ 1/log10(1.0331663) */
647        idx = av_clipf((pwr * gain_mul - 0.0295) * 70.570526123, 0, INT_MAX / 2);
648
649        if (idx > 127) { // fall back if index falls outside table range
650            coeffs[n] = wmavoice_energy_table[127] *
651                        powf(1.0331663, idx - 127);
652        } else
653            coeffs[n] = wmavoice_energy_table[FFMAX(0, idx)];
654    }
655
656    /* calculate the Hilbert transform of the gains, which we do (since this
657     * is a sine input) by doing a phase shift (in theory, H(sin())=cos()).
658     * Hilbert_Transform(RDFT(x)) = Laplace_Transform(x), which calculates the
659     * "moment" of the LPCs in this filter. */
660    s->dct.dct_calc(&s->dct, lpcs);
661    s->dst.dct_calc(&s->dst, lpcs);
662
663    /* Split out the coefficient indexes into phase/magnitude pairs */
664    idx = 255 + av_clip(lpcs[64],               -255, 255);
665    coeffs[0]  = coeffs[0]  * s->cos[idx];
666    idx = 255 + av_clip(lpcs[64] - 2 * lpcs[63], -255, 255);
667    last_coeff = coeffs[64] * s->cos[idx];
668    for (n = 63;; n--) {
669        idx = 255 + av_clip(-lpcs[64] - 2 * lpcs[n - 1], -255, 255);
670        coeffs[n * 2 + 1] = coeffs[n] * s->sin[idx];
671        coeffs[n * 2]     = coeffs[n] * s->cos[idx];
672
673        if (!--n) break;
674
675        idx = 255 + av_clip( lpcs[64] - 2 * lpcs[n - 1], -255, 255);
676        coeffs[n * 2 + 1] = coeffs[n] * s->sin[idx];
677        coeffs[n * 2]     = coeffs[n] * s->cos[idx];
678    }
679    coeffs[1] = last_coeff;
680
681    /* move into real domain */
682    s->irdft.rdft_calc(&s->irdft, coeffs);
683
684    /* tilt correction and normalize scale */
685    memset(&coeffs[remainder], 0, sizeof(coeffs[0]) * (128 - remainder));
686    if (s->denoise_tilt_corr) {
687        float tilt_mem = 0;
688
689        coeffs[remainder - 1] = 0;
690        ff_tilt_compensation(&tilt_mem,
691                             -1.8 * tilt_factor(coeffs, remainder - 1),
692                             coeffs, remainder);
693    }
694    sq = (1.0 / 64.0) * sqrtf(1 / avpriv_scalarproduct_float_c(coeffs, coeffs,
695                                                               remainder));
696    for (n = 0; n < remainder; n++)
697        coeffs[n] *= sq;
698}
699
700/**
701 * This function applies a Wiener filter on the (noisy) speech signal as
702 * a means to denoise it.
703 *
704 * - take RDFT of LPCs to get the power spectrum of the noise + speech;
705 * - using this power spectrum, calculate (for each frequency) the Wiener
706 *    filter gain, which depends on the frequency power and desired level
707 *    of noise subtraction (when set too high, this leads to artifacts)
708 *    We can do this symmetrically over the X-axis (so 0-4kHz is the inverse
709 *    of 4-8kHz);
710 * - by doing a phase shift, calculate the Hilbert transform of this array
711 *    of per-frequency filter-gains to get the filtering coefficients;
712 * - smoothen/normalize/de-tilt these filter coefficients as desired;
713 * - take RDFT of noisy sound, apply the coefficients and take its IRDFT
714 *    to get the denoised speech signal;
715 * - the leftover (i.e. output of the IRDFT on denoised speech data beyond
716 *    the frame boundary) are saved and applied to subsequent frames by an
717 *    overlap-add method (otherwise you get clicking-artifacts).
718 *
719 * @param s WMA Voice decoding context
720 * @param fcb_type Frame (codebook) type
721 * @param synth_pf input: the noisy speech signal, output: denoised speech
722 *                 data; should be 16-byte aligned (for ASM purposes)
723 * @param size size of the speech data
724 * @param lpcs LPCs used to synthesize this frame's speech data
725 */
726static void wiener_denoise(WMAVoiceContext *s, int fcb_type,
727                           float *synth_pf, int size,
728                           const float *lpcs)
729{
730    int remainder, lim, n;
731
732    if (fcb_type != FCB_TYPE_SILENCE) {
733        float *tilted_lpcs = s->tilted_lpcs_pf,
734              *coeffs = s->denoise_coeffs_pf, tilt_mem = 0;
735
736        tilted_lpcs[0]           = 1.0;
737        memcpy(&tilted_lpcs[1], lpcs, sizeof(lpcs[0]) * s->lsps);
738        memset(&tilted_lpcs[s->lsps + 1], 0,
739               sizeof(tilted_lpcs[0]) * (128 - s->lsps - 1));
740        ff_tilt_compensation(&tilt_mem, 0.7 * tilt_factor(lpcs, s->lsps),
741                             tilted_lpcs, s->lsps + 2);
742
743        /* The IRDFT output (127 samples for 7-bit filter) beyond the frame
744         * size is applied to the next frame. All input beyond this is zero,
745         * and thus all output beyond this will go towards zero, hence we can
746         * limit to min(size-1, 127-size) as a performance consideration. */
747        remainder = FFMIN(127 - size, size - 1);
748        calc_input_response(s, tilted_lpcs, fcb_type, coeffs, remainder);
749
750        /* apply coefficients (in frequency spectrum domain), i.e. complex
751         * number multiplication */
752        memset(&synth_pf[size], 0, sizeof(synth_pf[0]) * (128 - size));
753        s->rdft.rdft_calc(&s->rdft, synth_pf);
754        s->rdft.rdft_calc(&s->rdft, coeffs);
755        synth_pf[0] *= coeffs[0];
756        synth_pf[1] *= coeffs[1];
757        for (n = 1; n < 64; n++) {
758            float v1 = synth_pf[n * 2], v2 = synth_pf[n * 2 + 1];
759            synth_pf[n * 2]     = v1 * coeffs[n * 2] - v2 * coeffs[n * 2 + 1];
760            synth_pf[n * 2 + 1] = v2 * coeffs[n * 2] + v1 * coeffs[n * 2 + 1];
761        }
762        s->irdft.rdft_calc(&s->irdft, synth_pf);
763    }
764
765    /* merge filter output with the history of previous runs */
766    if (s->denoise_filter_cache_size) {
767        lim = FFMIN(s->denoise_filter_cache_size, size);
768        for (n = 0; n < lim; n++)
769            synth_pf[n] += s->denoise_filter_cache[n];
770        s->denoise_filter_cache_size -= lim;
771        memmove(s->denoise_filter_cache, &s->denoise_filter_cache[size],
772                sizeof(s->denoise_filter_cache[0]) * s->denoise_filter_cache_size);
773    }
774
775    /* move remainder of filter output into a cache for future runs */
776    if (fcb_type != FCB_TYPE_SILENCE) {
777        lim = FFMIN(remainder, s->denoise_filter_cache_size);
778        for (n = 0; n < lim; n++)
779            s->denoise_filter_cache[n] += synth_pf[size + n];
780        if (lim < remainder) {
781            memcpy(&s->denoise_filter_cache[lim], &synth_pf[size + lim],
782                   sizeof(s->denoise_filter_cache[0]) * (remainder - lim));
783            s->denoise_filter_cache_size = remainder;
784        }
785    }
786}
787
788/**
789 * Averaging projection filter, the postfilter used in WMAVoice.
790 *
791 * This uses the following steps:
792 * - A zero-synthesis filter (generate excitation from synth signal)
793 * - Kalman smoothing on excitation, based on pitch
794 * - Re-synthesized smoothened output
795 * - Iterative Wiener denoise filter
796 * - Adaptive gain filter
797 * - DC filter
798 *
799 * @param s WMAVoice decoding context
800 * @param synth Speech synthesis output (before postfilter)
801 * @param samples Output buffer for filtered samples
802 * @param size Buffer size of synth & samples
803 * @param lpcs Generated LPCs used for speech synthesis
804 * @param zero_exc_pf destination for zero synthesis filter (16-byte aligned)
805 * @param fcb_type Frame type (silence, hardcoded, AW-pulses or FCB-pulses)
806 * @param pitch Pitch of the input signal
807 */
808static void postfilter(WMAVoiceContext *s, const float *synth,
809                       float *samples,    int size,
810                       const float *lpcs, float *zero_exc_pf,
811                       int fcb_type,      int pitch)
812{
813    float synth_filter_in_buf[MAX_FRAMESIZE / 2],
814          *synth_pf = &s->synth_filter_out_buf[MAX_LSPS_ALIGN16],
815          *synth_filter_in = zero_exc_pf;
816
817    av_assert0(size <= MAX_FRAMESIZE / 2);
818
819    /* generate excitation from input signal */
820    ff_celp_lp_zero_synthesis_filterf(zero_exc_pf, lpcs, synth, size, s->lsps);
821
822    if (fcb_type >= FCB_TYPE_AW_PULSES &&
823        !kalman_smoothen(s, pitch, zero_exc_pf, synth_filter_in_buf, size))
824        synth_filter_in = synth_filter_in_buf;
825
826    /* re-synthesize speech after smoothening, and keep history */
827    ff_celp_lp_synthesis_filterf(synth_pf, lpcs,
828                                 synth_filter_in, size, s->lsps);
829    memcpy(&synth_pf[-s->lsps], &synth_pf[size - s->lsps],
830           sizeof(synth_pf[0]) * s->lsps);
831
832    wiener_denoise(s, fcb_type, synth_pf, size, lpcs);
833
834    adaptive_gain_control(samples, synth_pf, synth, size, 0.99,
835                          &s->postfilter_agc);
836
837    if (s->dc_level > 8) {
838        /* remove ultra-low frequency DC noise / highpass filter;
839         * coefficients are identical to those used in SIPR decoding,
840         * and very closely resemble those used in AMR-NB decoding. */
841        ff_acelp_apply_order_2_transfer_function(samples, samples,
842            (const float[2]) { -1.99997,      1.0 },
843            (const float[2]) { -1.9330735188, 0.93589198496 },
844            0.93980580475, s->dcf_mem, size);
845    }
846}
847/**
848 * @}
849 */
850
851/**
852 * Dequantize LSPs
853 * @param lsps output pointer to the array that will hold the LSPs
854 * @param num number of LSPs to be dequantized
855 * @param values quantized values, contains n_stages values
856 * @param sizes range (i.e. max value) of each quantized value
857 * @param n_stages number of dequantization runs
858 * @param table dequantization table to be used
859 * @param mul_q LSF multiplier
860 * @param base_q base (lowest) LSF values
861 */
862static void dequant_lsps(double *lsps, int num,
863                         const uint16_t *values,
864                         const uint16_t *sizes,
865                         int n_stages, const uint8_t *table,
866                         const double *mul_q,
867                         const double *base_q)
868{
869    int n, m;
870
871    memset(lsps, 0, num * sizeof(*lsps));
872    for (n = 0; n < n_stages; n++) {
873        const uint8_t *t_off = &table[values[n] * num];
874        double base = base_q[n], mul = mul_q[n];
875
876        for (m = 0; m < num; m++)
877            lsps[m] += base + mul * t_off[m];
878
879        table += sizes[n] * num;
880    }
881}
882
883/**
884 * @name LSP dequantization routines
885 * LSP dequantization routines, for 10/16LSPs and independent/residual coding.
886 * lsp10i() consumes 24 bits; lsp10r() consumes an additional 24 bits;
887 * lsp16i() consumes 34 bits; lsp16r() consumes an additional 26 bits.
888 * @{
889 */
890/**
891 * Parse 10 independently-coded LSPs.
892 */
893static void dequant_lsp10i(GetBitContext *gb, double *lsps)
894{
895    static const uint16_t vec_sizes[4] = { 256, 64, 32, 32 };
896    static const double mul_lsf[4] = {
897        5.2187144800e-3,    1.4626986422e-3,
898        9.6179549166e-4,    1.1325736225e-3
899    };
900    static const double base_lsf[4] = {
901        M_PI * -2.15522e-1, M_PI * -6.1646e-2,
902        M_PI * -3.3486e-2,  M_PI * -5.7408e-2
903    };
904    uint16_t v[4];
905
906    v[0] = get_bits(gb, 8);
907    v[1] = get_bits(gb, 6);
908    v[2] = get_bits(gb, 5);
909    v[3] = get_bits(gb, 5);
910
911    dequant_lsps(lsps, 10, v, vec_sizes, 4, wmavoice_dq_lsp10i,
912                 mul_lsf, base_lsf);
913}
914
915/**
916 * Parse 10 independently-coded LSPs, and then derive the tables to
917 * generate LSPs for the other frames from them (residual coding).
918 */
919static void dequant_lsp10r(GetBitContext *gb,
920                           double *i_lsps, const double *old,
921                           double *a1, double *a2, int q_mode)
922{
923    static const uint16_t vec_sizes[3] = { 128, 64, 64 };
924    static const double mul_lsf[3] = {
925        2.5807601174e-3,    1.2354460219e-3,   1.1763821673e-3
926    };
927    static const double base_lsf[3] = {
928        M_PI * -1.07448e-1, M_PI * -5.2706e-2, M_PI * -5.1634e-2
929    };
930    const float (*ipol_tab)[2][10] = q_mode ?
931        wmavoice_lsp10_intercoeff_b : wmavoice_lsp10_intercoeff_a;
932    uint16_t interpol, v[3];
933    int n;
934
935    dequant_lsp10i(gb, i_lsps);
936
937    interpol = get_bits(gb, 5);
938    v[0]     = get_bits(gb, 7);
939    v[1]     = get_bits(gb, 6);
940    v[2]     = get_bits(gb, 6);
941
942    for (n = 0; n < 10; n++) {
943        double delta = old[n] - i_lsps[n];
944        a1[n]        = ipol_tab[interpol][0][n] * delta + i_lsps[n];
945        a1[10 + n]   = ipol_tab[interpol][1][n] * delta + i_lsps[n];
946    }
947
948    dequant_lsps(a2, 20, v, vec_sizes, 3, wmavoice_dq_lsp10r,
949                 mul_lsf, base_lsf);
950}
951
952/**
953 * Parse 16 independently-coded LSPs.
954 */
955static void dequant_lsp16i(GetBitContext *gb, double *lsps)
956{
957    static const uint16_t vec_sizes[5] = { 256, 64, 128, 64, 128 };
958    static const double mul_lsf[5] = {
959        3.3439586280e-3,    6.9908173703e-4,
960        3.3216608306e-3,    1.0334960326e-3,
961        3.1899104283e-3
962    };
963    static const double base_lsf[5] = {
964        M_PI * -1.27576e-1, M_PI * -2.4292e-2,
965        M_PI * -1.28094e-1, M_PI * -3.2128e-2,
966        M_PI * -1.29816e-1
967    };
968    uint16_t v[5];
969
970    v[0] = get_bits(gb, 8);
971    v[1] = get_bits(gb, 6);
972    v[2] = get_bits(gb, 7);
973    v[3] = get_bits(gb, 6);
974    v[4] = get_bits(gb, 7);
975
976    dequant_lsps( lsps,     5,  v,     vec_sizes,    2,
977                 wmavoice_dq_lsp16i1,  mul_lsf,     base_lsf);
978    dequant_lsps(&lsps[5],  5, &v[2], &vec_sizes[2], 2,
979                 wmavoice_dq_lsp16i2, &mul_lsf[2], &base_lsf[2]);
980    dequant_lsps(&lsps[10], 6, &v[4], &vec_sizes[4], 1,
981                 wmavoice_dq_lsp16i3, &mul_lsf[4], &base_lsf[4]);
982}
983
984/**
985 * Parse 16 independently-coded LSPs, and then derive the tables to
986 * generate LSPs for the other frames from them (residual coding).
987 */
988static void dequant_lsp16r(GetBitContext *gb,
989                           double *i_lsps, const double *old,
990                           double *a1, double *a2, int q_mode)
991{
992    static const uint16_t vec_sizes[3] = { 128, 128, 128 };
993    static const double mul_lsf[3] = {
994        1.2232979501e-3,   1.4062241527e-3,   1.6114744851e-3
995    };
996    static const double base_lsf[3] = {
997        M_PI * -5.5830e-2, M_PI * -5.2908e-2, M_PI * -5.4776e-2
998    };
999    const float (*ipol_tab)[2][16] = q_mode ?
1000        wmavoice_lsp16_intercoeff_b : wmavoice_lsp16_intercoeff_a;
1001    uint16_t interpol, v[3];
1002    int n;
1003
1004    dequant_lsp16i(gb, i_lsps);
1005
1006    interpol = get_bits(gb, 5);
1007    v[0]     = get_bits(gb, 7);
1008    v[1]     = get_bits(gb, 7);
1009    v[2]     = get_bits(gb, 7);
1010
1011    for (n = 0; n < 16; n++) {
1012        double delta = old[n] - i_lsps[n];
1013        a1[n]        = ipol_tab[interpol][0][n] * delta + i_lsps[n];
1014        a1[16 + n]   = ipol_tab[interpol][1][n] * delta + i_lsps[n];
1015    }
1016
1017    dequant_lsps( a2,     10,  v,     vec_sizes,    1,
1018                 wmavoice_dq_lsp16r1,  mul_lsf,     base_lsf);
1019    dequant_lsps(&a2[10], 10, &v[1], &vec_sizes[1], 1,
1020                 wmavoice_dq_lsp16r2, &mul_lsf[1], &base_lsf[1]);
1021    dequant_lsps(&a2[20], 12, &v[2], &vec_sizes[2], 1,
1022                 wmavoice_dq_lsp16r3, &mul_lsf[2], &base_lsf[2]);
1023}
1024
1025/**
1026 * @}
1027 * @name Pitch-adaptive window coding functions
1028 * The next few functions are for pitch-adaptive window coding.
1029 * @{
1030 */
1031/**
1032 * Parse the offset of the first pitch-adaptive window pulses, and
1033 * the distribution of pulses between the two blocks in this frame.
1034 * @param s WMA Voice decoding context private data
1035 * @param gb bit I/O context
1036 * @param pitch pitch for each block in this frame
1037 */
1038static void aw_parse_coords(WMAVoiceContext *s, GetBitContext *gb,
1039                            const int *pitch)
1040{
1041    static const int16_t start_offset[94] = {
1042        -11,  -9,  -7,  -5,  -3,  -1,   1,   3,   5,   7,   9,  11,
1043         13,  15,  18,  17,  19,  20,  21,  22,  23,  24,  25,  26,
1044         27,  28,  29,  30,  31,  32,  33,  35,  37,  39,  41,  43,
1045         45,  47,  49,  51,  53,  55,  57,  59,  61,  63,  65,  67,
1046         69,  71,  73,  75,  77,  79,  81,  83,  85,  87,  89,  91,
1047         93,  95,  97,  99, 101, 103, 105, 107, 109, 111, 113, 115,
1048        117, 119, 121, 123, 125, 127, 129, 131, 133, 135, 137, 139,
1049        141, 143, 145, 147, 149, 151, 153, 155, 157, 159
1050    };
1051    int bits, offset;
1052
1053    /* position of pulse */
1054    s->aw_idx_is_ext = 0;
1055    if ((bits = get_bits(gb, 6)) >= 54) {
1056        s->aw_idx_is_ext = 1;
1057        bits += (bits - 54) * 3 + get_bits(gb, 2);
1058    }
1059
1060    /* for a repeated pulse at pulse_off with a pitch_lag of pitch[], count
1061     * the distribution of the pulses in each block contained in this frame. */
1062    s->aw_pulse_range        = FFMIN(pitch[0], pitch[1]) > 32 ? 24 : 16;
1063    for (offset = start_offset[bits]; offset < 0; offset += pitch[0]) ;
1064    s->aw_n_pulses[0]        = (pitch[0] - 1 + MAX_FRAMESIZE / 2 - offset) / pitch[0];
1065    s->aw_first_pulse_off[0] = offset - s->aw_pulse_range / 2;
1066    offset                  += s->aw_n_pulses[0] * pitch[0];
1067    s->aw_n_pulses[1]        = (pitch[1] - 1 + MAX_FRAMESIZE - offset) / pitch[1];
1068    s->aw_first_pulse_off[1] = offset - (MAX_FRAMESIZE + s->aw_pulse_range) / 2;
1069
1070    /* if continuing from a position before the block, reset position to
1071     * start of block (when corrected for the range over which it can be
1072     * spread in aw_pulse_set1()). */
1073    if (start_offset[bits] < MAX_FRAMESIZE / 2) {
1074        while (s->aw_first_pulse_off[1] - pitch[1] + s->aw_pulse_range > 0)
1075            s->aw_first_pulse_off[1] -= pitch[1];
1076        if (start_offset[bits] < 0)
1077            while (s->aw_first_pulse_off[0] - pitch[0] + s->aw_pulse_range > 0)
1078                s->aw_first_pulse_off[0] -= pitch[0];
1079    }
1080}
1081
1082/**
1083 * Apply second set of pitch-adaptive window pulses.
1084 * @param s WMA Voice decoding context private data
1085 * @param gb bit I/O context
1086 * @param block_idx block index in frame [0, 1]
1087 * @param fcb structure containing fixed codebook vector info
1088 * @return -1 on error, 0 otherwise
1089 */
1090static int aw_pulse_set2(WMAVoiceContext *s, GetBitContext *gb,
1091                         int block_idx, AMRFixed *fcb)
1092{
1093    uint16_t use_mask_mem[9]; // only 5 are used, rest is padding
1094    uint16_t *use_mask = use_mask_mem + 2;
1095    /* in this function, idx is the index in the 80-bit (+ padding) use_mask
1096     * bit-array. Since use_mask consists of 16-bit values, the lower 4 bits
1097     * of idx are the position of the bit within a particular item in the
1098     * array (0 being the most significant bit, and 15 being the least
1099     * significant bit), and the remainder (>> 4) is the index in the
1100     * use_mask[]-array. This is faster and uses less memory than using a
1101     * 80-byte/80-int array. */
1102    int pulse_off = s->aw_first_pulse_off[block_idx],
1103        pulse_start, n, idx, range, aidx, start_off = 0;
1104
1105    /* set offset of first pulse to within this block */
1106    if (s->aw_n_pulses[block_idx] > 0)
1107        while (pulse_off + s->aw_pulse_range < 1)
1108            pulse_off += fcb->pitch_lag;
1109
1110    /* find range per pulse */
1111    if (s->aw_n_pulses[0] > 0) {
1112        if (block_idx == 0) {
1113            range = 32;
1114        } else /* block_idx = 1 */ {
1115            range = 8;
1116            if (s->aw_n_pulses[block_idx] > 0)
1117                pulse_off = s->aw_next_pulse_off_cache;
1118        }
1119    } else
1120        range = 16;
1121    pulse_start = s->aw_n_pulses[block_idx] > 0 ? pulse_off - range / 2 : 0;
1122
1123    /* aw_pulse_set1() already applies pulses around pulse_off (to be exactly,
1124     * in the range of [pulse_off, pulse_off + s->aw_pulse_range], and thus
1125     * we exclude that range from being pulsed again in this function. */
1126    memset(&use_mask[-2], 0, 2 * sizeof(use_mask[0]));
1127    memset( use_mask,   -1, 5 * sizeof(use_mask[0]));
1128    memset(&use_mask[5], 0, 2 * sizeof(use_mask[0]));
1129    if (s->aw_n_pulses[block_idx] > 0)
1130        for (idx = pulse_off; idx < MAX_FRAMESIZE / 2; idx += fcb->pitch_lag) {
1131            int excl_range         = s->aw_pulse_range; // always 16 or 24
1132            uint16_t *use_mask_ptr = &use_mask[idx >> 4];
1133            int first_sh           = 16 - (idx & 15);
1134            *use_mask_ptr++       &= 0xFFFFu << first_sh;
1135            excl_range            -= first_sh;
1136            if (excl_range >= 16) {
1137                *use_mask_ptr++    = 0;
1138                *use_mask_ptr     &= 0xFFFF >> (excl_range - 16);
1139            } else
1140                *use_mask_ptr     &= 0xFFFF >> excl_range;
1141        }
1142
1143    /* find the 'aidx'th offset that is not excluded */
1144    aidx = get_bits(gb, s->aw_n_pulses[0] > 0 ? 5 - 2 * block_idx : 4);
1145    for (n = 0; n <= aidx; pulse_start++) {
1146        for (idx = pulse_start; idx < 0; idx += fcb->pitch_lag) ;
1147        if (idx >= MAX_FRAMESIZE / 2) { // find from zero
1148            if (use_mask[0])      idx = 0x0F;
1149            else if (use_mask[1]) idx = 0x1F;
1150            else if (use_mask[2]) idx = 0x2F;
1151            else if (use_mask[3]) idx = 0x3F;
1152            else if (use_mask[4]) idx = 0x4F;
1153            else return -1;
1154            idx -= av_log2_16bit(use_mask[idx >> 4]);
1155        }
1156        if (use_mask[idx >> 4] & (0x8000 >> (idx & 15))) {
1157            use_mask[idx >> 4] &= ~(0x8000 >> (idx & 15));
1158            n++;
1159            start_off = idx;
1160        }
1161    }
1162
1163    fcb->x[fcb->n] = start_off;
1164    fcb->y[fcb->n] = get_bits1(gb) ? -1.0 : 1.0;
1165    fcb->n++;
1166
1167    /* set offset for next block, relative to start of that block */
1168    n = (MAX_FRAMESIZE / 2 - start_off) % fcb->pitch_lag;
1169    s->aw_next_pulse_off_cache = n ? fcb->pitch_lag - n : 0;
1170    return 0;
1171}
1172
1173/**
1174 * Apply first set of pitch-adaptive window pulses.
1175 * @param s WMA Voice decoding context private data
1176 * @param gb bit I/O context
1177 * @param block_idx block index in frame [0, 1]
1178 * @param fcb storage location for fixed codebook pulse info
1179 */
1180static void aw_pulse_set1(WMAVoiceContext *s, GetBitContext *gb,
1181                          int block_idx, AMRFixed *fcb)
1182{
1183    int val = get_bits(gb, 12 - 2 * (s->aw_idx_is_ext && !block_idx));
1184    float v;
1185
1186    if (s->aw_n_pulses[block_idx] > 0) {
1187        int n, v_mask, i_mask, sh, n_pulses;
1188
1189        if (s->aw_pulse_range == 24) { // 3 pulses, 1:sign + 3:index each
1190            n_pulses = 3;
1191            v_mask   = 8;
1192            i_mask   = 7;
1193            sh       = 4;
1194        } else { // 4 pulses, 1:sign + 2:index each
1195            n_pulses = 4;
1196            v_mask   = 4;
1197            i_mask   = 3;
1198            sh       = 3;
1199        }
1200
1201        for (n = n_pulses - 1; n >= 0; n--, val >>= sh) {
1202            fcb->y[fcb->n] = (val & v_mask) ? -1.0 : 1.0;
1203            fcb->x[fcb->n] = (val & i_mask) * n_pulses + n +
1204                                 s->aw_first_pulse_off[block_idx];
1205            while (fcb->x[fcb->n] < 0)
1206                fcb->x[fcb->n] += fcb->pitch_lag;
1207            if (fcb->x[fcb->n] < MAX_FRAMESIZE / 2)
1208                fcb->n++;
1209        }
1210    } else {
1211        int num2 = (val & 0x1FF) >> 1, delta, idx;
1212
1213        if (num2 < 1 * 79)      { delta = 1; idx = num2 + 1; }
1214        else if (num2 < 2 * 78) { delta = 3; idx = num2 + 1 - 1 * 77; }
1215        else if (num2 < 3 * 77) { delta = 5; idx = num2 + 1 - 2 * 76; }
1216        else                    { delta = 7; idx = num2 + 1 - 3 * 75; }
1217        v = (val & 0x200) ? -1.0 : 1.0;
1218
1219        fcb->no_repeat_mask |= 3 << fcb->n;
1220        fcb->x[fcb->n]       = idx - delta;
1221        fcb->y[fcb->n]       = v;
1222        fcb->x[fcb->n + 1]   = idx;
1223        fcb->y[fcb->n + 1]   = (val & 1) ? -v : v;
1224        fcb->n              += 2;
1225    }
1226}
1227
1228/**
1229 * @}
1230 *
1231 * Generate a random number from frame_cntr and block_idx, which will live
1232 * in the range [0, 1000 - block_size] (so it can be used as an index in a
1233 * table of size 1000 of which you want to read block_size entries).
1234 *
1235 * @param frame_cntr current frame number
1236 * @param block_num current block index
1237 * @param block_size amount of entries we want to read from a table
1238 *                   that has 1000 entries
1239 * @return a (non-)random number in the [0, 1000 - block_size] range.
1240 */
1241static int pRNG(int frame_cntr, int block_num, int block_size)
1242{
1243    /* array to simplify the calculation of z:
1244     * y = (x % 9) * 5 + 6;
1245     * z = (49995 * x) / y;
1246     * Since y only has 9 values, we can remove the division by using a
1247     * LUT and using FASTDIV-style divisions. For each of the 9 values
1248     * of y, we can rewrite z as:
1249     * z = x * (49995 / y) + x * ((49995 % y) / y)
1250     * In this table, each col represents one possible value of y, the
1251     * first number is 49995 / y, and the second is the FASTDIV variant
1252     * of 49995 % y / y. */
1253    static const unsigned int div_tbl[9][2] = {
1254        { 8332,  3 * 715827883U }, // y =  6
1255        { 4545,  0 * 390451573U }, // y = 11
1256        { 3124, 11 * 268435456U }, // y = 16
1257        { 2380, 15 * 204522253U }, // y = 21
1258        { 1922, 23 * 165191050U }, // y = 26
1259        { 1612, 23 * 138547333U }, // y = 31
1260        { 1388, 27 * 119304648U }, // y = 36
1261        { 1219, 16 * 104755300U }, // y = 41
1262        { 1086, 39 *  93368855U }  // y = 46
1263    };
1264    unsigned int z, y, x = MUL16(block_num, 1877) + frame_cntr;
1265    if (x >= 0xFFFF) x -= 0xFFFF;   // max value of x is 8*1877+0xFFFE=0x13AA6,
1266                                    // so this is effectively a modulo (%)
1267    y = x - 9 * MULH(477218589, x); // x % 9
1268    z = (uint16_t) (x * div_tbl[y][0] + UMULH(x, div_tbl[y][1]));
1269                                    // z = x * 49995 / (y * 5 + 6)
1270    return z % (1000 - block_size);
1271}
1272
1273/**
1274 * Parse hardcoded signal for a single block.
1275 * @note see #synth_block().
1276 */
1277static void synth_block_hardcoded(WMAVoiceContext *s, GetBitContext *gb,
1278                                 int block_idx, int size,
1279                                 const struct frame_type_desc *frame_desc,
1280                                 float *excitation)
1281{
1282    float gain;
1283    int n, r_idx;
1284
1285    av_assert0(size <= MAX_FRAMESIZE);
1286
1287    /* Set the offset from which we start reading wmavoice_std_codebook */
1288    if (frame_desc->fcb_type == FCB_TYPE_SILENCE) {
1289        r_idx = pRNG(s->frame_cntr, block_idx, size);
1290        gain  = s->silence_gain;
1291    } else /* FCB_TYPE_HARDCODED */ {
1292        r_idx = get_bits(gb, 8);
1293        gain  = wmavoice_gain_universal[get_bits(gb, 6)];
1294    }
1295
1296    /* Clear gain prediction parameters */
1297    memset(s->gain_pred_err, 0, sizeof(s->gain_pred_err));
1298
1299    /* Apply gain to hardcoded codebook and use that as excitation signal */
1300    for (n = 0; n < size; n++)
1301        excitation[n] = wmavoice_std_codebook[r_idx + n] * gain;
1302}
1303
1304/**
1305 * Parse FCB/ACB signal for a single block.
1306 * @note see #synth_block().
1307 */
1308static void synth_block_fcb_acb(WMAVoiceContext *s, GetBitContext *gb,
1309                                int block_idx, int size,
1310                                int block_pitch_sh2,
1311                                const struct frame_type_desc *frame_desc,
1312                                float *excitation)
1313{
1314    static const float gain_coeff[6] = {
1315        0.8169, -0.06545, 0.1726, 0.0185, -0.0359, 0.0458
1316    };
1317    float pulses[MAX_FRAMESIZE / 2], pred_err, acb_gain, fcb_gain;
1318    int n, idx, gain_weight;
1319    AMRFixed fcb;
1320
1321    av_assert0(size <= MAX_FRAMESIZE / 2);
1322    memset(pulses, 0, sizeof(*pulses) * size);
1323
1324    fcb.pitch_lag      = block_pitch_sh2 >> 2;
1325    fcb.pitch_fac      = 1.0;
1326    fcb.no_repeat_mask = 0;
1327    fcb.n              = 0;
1328
1329    /* For the other frame types, this is where we apply the innovation
1330     * (fixed) codebook pulses of the speech signal. */
1331    if (frame_desc->fcb_type == FCB_TYPE_AW_PULSES) {
1332        aw_pulse_set1(s, gb, block_idx, &fcb);
1333        if (aw_pulse_set2(s, gb, block_idx, &fcb)) {
1334            /* Conceal the block with silence and return.
1335             * Skip the correct amount of bits to read the next
1336             * block from the correct offset. */
1337            int r_idx = pRNG(s->frame_cntr, block_idx, size);
1338
1339            for (n = 0; n < size; n++)
1340                excitation[n] =
1341                    wmavoice_std_codebook[r_idx + n] * s->silence_gain;
1342            skip_bits(gb, 7 + 1);
1343            return;
1344        }
1345    } else /* FCB_TYPE_EXC_PULSES */ {
1346        int offset_nbits = 5 - frame_desc->log_n_blocks;
1347
1348        fcb.no_repeat_mask = -1;
1349        /* similar to ff_decode_10_pulses_35bits(), but with single pulses
1350         * (instead of double) for a subset of pulses */
1351        for (n = 0; n < 5; n++) {
1352            float sign;
1353            int pos1, pos2;
1354
1355            sign           = get_bits1(gb) ? 1.0 : -1.0;
1356            pos1           = get_bits(gb, offset_nbits);
1357            fcb.x[fcb.n]   = n + 5 * pos1;
1358            fcb.y[fcb.n++] = sign;
1359            if (n < frame_desc->dbl_pulses) {
1360                pos2           = get_bits(gb, offset_nbits);
1361                fcb.x[fcb.n]   = n + 5 * pos2;
1362                fcb.y[fcb.n++] = (pos1 < pos2) ? -sign : sign;
1363            }
1364        }
1365    }
1366    ff_set_fixed_vector(pulses, &fcb, 1.0, size);
1367
1368    /* Calculate gain for adaptive & fixed codebook signal.
1369     * see ff_amr_set_fixed_gain(). */
1370    idx = get_bits(gb, 7);
1371    fcb_gain = expf(avpriv_scalarproduct_float_c(s->gain_pred_err,
1372                                                 gain_coeff, 6) -
1373                    5.2409161640 + wmavoice_gain_codebook_fcb[idx]);
1374    acb_gain = wmavoice_gain_codebook_acb[idx];
1375    pred_err = av_clipf(wmavoice_gain_codebook_fcb[idx],
1376                        -2.9957322736 /* log(0.05) */,
1377                         1.6094379124 /* log(5.0)  */);
1378
1379    gain_weight = 8 >> frame_desc->log_n_blocks;
1380    memmove(&s->gain_pred_err[gain_weight], s->gain_pred_err,
1381            sizeof(*s->gain_pred_err) * (6 - gain_weight));
1382    for (n = 0; n < gain_weight; n++)
1383        s->gain_pred_err[n] = pred_err;
1384
1385    /* Calculation of adaptive codebook */
1386    if (frame_desc->acb_type == ACB_TYPE_ASYMMETRIC) {
1387        int len;
1388        for (n = 0; n < size; n += len) {
1389            int next_idx_sh16;
1390            int abs_idx    = block_idx * size + n;
1391            int pitch_sh16 = (s->last_pitch_val << 16) +
1392                             s->pitch_diff_sh16 * abs_idx;
1393            int pitch      = (pitch_sh16 + 0x6FFF) >> 16;
1394            int idx_sh16   = ((pitch << 16) - pitch_sh16) * 8 + 0x58000;
1395            idx            = idx_sh16 >> 16;
1396            if (s->pitch_diff_sh16) {
1397                if (s->pitch_diff_sh16 > 0) {
1398                    next_idx_sh16 = (idx_sh16) &~ 0xFFFF;
1399                } else
1400                    next_idx_sh16 = (idx_sh16 + 0x10000) &~ 0xFFFF;
1401                len = av_clip((idx_sh16 - next_idx_sh16) / s->pitch_diff_sh16 / 8,
1402                              1, size - n);
1403            } else
1404                len = size;
1405
1406            ff_acelp_interpolatef(&excitation[n], &excitation[n - pitch],
1407                                  wmavoice_ipol1_coeffs, 17,
1408                                  idx, 9, len);
1409        }
1410    } else /* ACB_TYPE_HAMMING */ {
1411        int block_pitch = block_pitch_sh2 >> 2;
1412        idx             = block_pitch_sh2 & 3;
1413        if (idx) {
1414            ff_acelp_interpolatef(excitation, &excitation[-block_pitch],
1415                                  wmavoice_ipol2_coeffs, 4,
1416                                  idx, 8, size);
1417        } else
1418            av_memcpy_backptr((uint8_t *) excitation, sizeof(float) * block_pitch,
1419                              sizeof(float) * size);
1420    }
1421
1422    /* Interpolate ACB/FCB and use as excitation signal */
1423    ff_weighted_vector_sumf(excitation, excitation, pulses,
1424                            acb_gain, fcb_gain, size);
1425}
1426
1427/**
1428 * Parse data in a single block.
1429 *
1430 * @param s WMA Voice decoding context private data
1431 * @param gb bit I/O context
1432 * @param block_idx index of the to-be-read block
1433 * @param size amount of samples to be read in this block
1434 * @param block_pitch_sh2 pitch for this block << 2
1435 * @param lsps LSPs for (the end of) this frame
1436 * @param prev_lsps LSPs for the last frame
1437 * @param frame_desc frame type descriptor
1438 * @param excitation target memory for the ACB+FCB interpolated signal
1439 * @param synth target memory for the speech synthesis filter output
1440 * @return 0 on success, <0 on error.
1441 */
1442static void synth_block(WMAVoiceContext *s, GetBitContext *gb,
1443                        int block_idx, int size,
1444                        int block_pitch_sh2,
1445                        const double *lsps, const double *prev_lsps,
1446                        const struct frame_type_desc *frame_desc,
1447                        float *excitation, float *synth)
1448{
1449    double i_lsps[MAX_LSPS];
1450    float lpcs[MAX_LSPS];
1451    float fac;
1452    int n;
1453
1454    if (frame_desc->acb_type == ACB_TYPE_NONE)
1455        synth_block_hardcoded(s, gb, block_idx, size, frame_desc, excitation);
1456    else
1457        synth_block_fcb_acb(s, gb, block_idx, size, block_pitch_sh2,
1458                            frame_desc, excitation);
1459
1460    /* convert interpolated LSPs to LPCs */
1461    fac = (block_idx + 0.5) / frame_desc->n_blocks;
1462    for (n = 0; n < s->lsps; n++) // LSF -> LSP
1463        i_lsps[n] = cos(prev_lsps[n] + fac * (lsps[n] - prev_lsps[n]));
1464    ff_acelp_lspd2lpc(i_lsps, lpcs, s->lsps >> 1);
1465
1466    /* Speech synthesis */
1467    ff_celp_lp_synthesis_filterf(synth, lpcs, excitation, size, s->lsps);
1468}
1469
1470/**
1471 * Synthesize output samples for a single frame.
1472 *
1473 * @param ctx WMA Voice decoder context
1474 * @param gb bit I/O context (s->gb or one for cross-packet superframes)
1475 * @param frame_idx Frame number within superframe [0-2]
1476 * @param samples pointer to output sample buffer, has space for at least 160
1477 *                samples
1478 * @param lsps LSP array
1479 * @param prev_lsps array of previous frame's LSPs
1480 * @param excitation target buffer for excitation signal
1481 * @param synth target buffer for synthesized speech data
1482 * @return 0 on success, <0 on error.
1483 */
1484static int synth_frame(AVCodecContext *ctx, GetBitContext *gb, int frame_idx,
1485                       float *samples,
1486                       const double *lsps, const double *prev_lsps,
1487                       float *excitation, float *synth)
1488{
1489    WMAVoiceContext *s = ctx->priv_data;
1490    int n, n_blocks_x2, log_n_blocks_x2, av_uninit(cur_pitch_val);
1491    int pitch[MAX_BLOCKS], av_uninit(last_block_pitch);
1492
1493    /* Parse frame type ("frame header"), see frame_descs */
1494    int bd_idx = s->vbm_tree[get_vlc2(gb, frame_type_vlc.table, 6, 3)], block_nsamples;
1495
1496    pitch[0] = INT_MAX;
1497
1498    if (bd_idx < 0) {
1499        av_log(ctx, AV_LOG_ERROR,
1500               "Invalid frame type VLC code, skipping\n");
1501        return AVERROR_INVALIDDATA;
1502    }
1503
1504    block_nsamples = MAX_FRAMESIZE / frame_descs[bd_idx].n_blocks;
1505
1506    /* Pitch calculation for ACB_TYPE_ASYMMETRIC ("pitch-per-frame") */
1507    if (frame_descs[bd_idx].acb_type == ACB_TYPE_ASYMMETRIC) {
1508        /* Pitch is provided per frame, which is interpreted as the pitch of
1509         * the last sample of the last block of this frame. We can interpolate
1510         * the pitch of other blocks (and even pitch-per-sample) by gradually
1511         * incrementing/decrementing prev_frame_pitch to cur_pitch_val. */
1512        n_blocks_x2      = frame_descs[bd_idx].n_blocks << 1;
1513        log_n_blocks_x2  = frame_descs[bd_idx].log_n_blocks + 1;
1514        cur_pitch_val    = s->min_pitch_val + get_bits(gb, s->pitch_nbits);
1515        cur_pitch_val    = FFMIN(cur_pitch_val, s->max_pitch_val - 1);
1516        if (s->last_acb_type == ACB_TYPE_NONE ||
1517            20 * abs(cur_pitch_val - s->last_pitch_val) >
1518                (cur_pitch_val + s->last_pitch_val))
1519            s->last_pitch_val = cur_pitch_val;
1520
1521        /* pitch per block */
1522        for (n = 0; n < frame_descs[bd_idx].n_blocks; n++) {
1523            int fac = n * 2 + 1;
1524
1525            pitch[n] = (MUL16(fac,                 cur_pitch_val) +
1526                        MUL16((n_blocks_x2 - fac), s->last_pitch_val) +
1527                        frame_descs[bd_idx].n_blocks) >> log_n_blocks_x2;
1528        }
1529
1530        /* "pitch-diff-per-sample" for calculation of pitch per sample */
1531        s->pitch_diff_sh16 =
1532            (cur_pitch_val - s->last_pitch_val) * (1 << 16) / MAX_FRAMESIZE;
1533    }
1534
1535    /* Global gain (if silence) and pitch-adaptive window coordinates */
1536    switch (frame_descs[bd_idx].fcb_type) {
1537    case FCB_TYPE_SILENCE:
1538        s->silence_gain = wmavoice_gain_silence[get_bits(gb, 8)];
1539        break;
1540    case FCB_TYPE_AW_PULSES:
1541        aw_parse_coords(s, gb, pitch);
1542        break;
1543    }
1544
1545    for (n = 0; n < frame_descs[bd_idx].n_blocks; n++) {
1546        int bl_pitch_sh2;
1547
1548        /* Pitch calculation for ACB_TYPE_HAMMING ("pitch-per-block") */
1549        switch (frame_descs[bd_idx].acb_type) {
1550        case ACB_TYPE_HAMMING: {
1551            /* Pitch is given per block. Per-block pitches are encoded as an
1552             * absolute value for the first block, and then delta values
1553             * relative to this value) for all subsequent blocks. The scale of
1554             * this pitch value is semi-logarithmic compared to its use in the
1555             * decoder, so we convert it to normal scale also. */
1556            int block_pitch,
1557                t1 = (s->block_conv_table[1] - s->block_conv_table[0]) << 2,
1558                t2 = (s->block_conv_table[2] - s->block_conv_table[1]) << 1,
1559                t3 =  s->block_conv_table[3] - s->block_conv_table[2] + 1;
1560
1561            if (n == 0) {
1562                block_pitch = get_bits(gb, s->block_pitch_nbits);
1563            } else
1564                block_pitch = last_block_pitch - s->block_delta_pitch_hrange +
1565                                 get_bits(gb, s->block_delta_pitch_nbits);
1566            /* Convert last_ so that any next delta is within _range */
1567            last_block_pitch = av_clip(block_pitch,
1568                                       s->block_delta_pitch_hrange,
1569                                       s->block_pitch_range -
1570                                           s->block_delta_pitch_hrange);
1571
1572            /* Convert semi-log-style scale back to normal scale */
1573            if (block_pitch < t1) {
1574                bl_pitch_sh2 = (s->block_conv_table[0] << 2) + block_pitch;
1575            } else {
1576                block_pitch -= t1;
1577                if (block_pitch < t2) {
1578                    bl_pitch_sh2 =
1579                        (s->block_conv_table[1] << 2) + (block_pitch << 1);
1580                } else {
1581                    block_pitch -= t2;
1582                    if (block_pitch < t3) {
1583                        bl_pitch_sh2 =
1584                            (s->block_conv_table[2] + block_pitch) << 2;
1585                    } else
1586                        bl_pitch_sh2 = s->block_conv_table[3] << 2;
1587                }
1588            }
1589            pitch[n] = bl_pitch_sh2 >> 2;
1590            break;
1591        }
1592
1593        case ACB_TYPE_ASYMMETRIC: {
1594            bl_pitch_sh2 = pitch[n] << 2;
1595            break;
1596        }
1597
1598        default: // ACB_TYPE_NONE has no pitch
1599            bl_pitch_sh2 = 0;
1600            break;
1601        }
1602
1603        synth_block(s, gb, n, block_nsamples, bl_pitch_sh2,
1604                    lsps, prev_lsps, &frame_descs[bd_idx],
1605                    &excitation[n * block_nsamples],
1606                    &synth[n * block_nsamples]);
1607    }
1608
1609    /* Averaging projection filter, if applicable. Else, just copy samples
1610     * from synthesis buffer */
1611    if (s->do_apf) {
1612        double i_lsps[MAX_LSPS];
1613        float lpcs[MAX_LSPS];
1614
1615        if(frame_descs[bd_idx].fcb_type >= FCB_TYPE_AW_PULSES && pitch[0] == INT_MAX)
1616            return AVERROR_INVALIDDATA;
1617
1618        for (n = 0; n < s->lsps; n++) // LSF -> LSP
1619            i_lsps[n] = cos(0.5 * (prev_lsps[n] + lsps[n]));
1620        ff_acelp_lspd2lpc(i_lsps, lpcs, s->lsps >> 1);
1621        postfilter(s, synth, samples, 80, lpcs,
1622                   &s->zero_exc_pf[s->history_nsamples + MAX_FRAMESIZE * frame_idx],
1623                   frame_descs[bd_idx].fcb_type, pitch[0]);
1624
1625        for (n = 0; n < s->lsps; n++) // LSF -> LSP
1626            i_lsps[n] = cos(lsps[n]);
1627        ff_acelp_lspd2lpc(i_lsps, lpcs, s->lsps >> 1);
1628        postfilter(s, &synth[80], &samples[80], 80, lpcs,
1629                   &s->zero_exc_pf[s->history_nsamples + MAX_FRAMESIZE * frame_idx + 80],
1630                   frame_descs[bd_idx].fcb_type, pitch[0]);
1631    } else
1632        memcpy(samples, synth, 160 * sizeof(synth[0]));
1633
1634    /* Cache values for next frame */
1635    s->frame_cntr++;
1636    if (s->frame_cntr >= 0xFFFF) s->frame_cntr -= 0xFFFF; // i.e. modulo (%)
1637    s->last_acb_type = frame_descs[bd_idx].acb_type;
1638    switch (frame_descs[bd_idx].acb_type) {
1639    case ACB_TYPE_NONE:
1640        s->last_pitch_val = 0;
1641        break;
1642    case ACB_TYPE_ASYMMETRIC:
1643        s->last_pitch_val = cur_pitch_val;
1644        break;
1645    case ACB_TYPE_HAMMING:
1646        s->last_pitch_val = pitch[frame_descs[bd_idx].n_blocks - 1];
1647        break;
1648    }
1649
1650    return 0;
1651}
1652
1653/**
1654 * Ensure minimum value for first item, maximum value for last value,
1655 * proper spacing between each value and proper ordering.
1656 *
1657 * @param lsps array of LSPs
1658 * @param num size of LSP array
1659 *
1660 * @note basically a double version of #ff_acelp_reorder_lsf(), might be
1661 *       useful to put in a generic location later on. Parts are also
1662 *       present in #ff_set_min_dist_lsf() + #ff_sort_nearly_sorted_floats(),
1663 *       which is in float.
1664 */
1665static void stabilize_lsps(double *lsps, int num)
1666{
1667    int n, m, l;
1668
1669    /* set minimum value for first, maximum value for last and minimum
1670     * spacing between LSF values.
1671     * Very similar to ff_set_min_dist_lsf(), but in double. */
1672    lsps[0]       = FFMAX(lsps[0],       0.0015 * M_PI);
1673    for (n = 1; n < num; n++)
1674        lsps[n]   = FFMAX(lsps[n],       lsps[n - 1] + 0.0125 * M_PI);
1675    lsps[num - 1] = FFMIN(lsps[num - 1], 0.9985 * M_PI);
1676
1677    /* reorder (looks like one-time / non-recursed bubblesort).
1678     * Very similar to ff_sort_nearly_sorted_floats(), but in double. */
1679    for (n = 1; n < num; n++) {
1680        if (lsps[n] < lsps[n - 1]) {
1681            for (m = 1; m < num; m++) {
1682                double tmp = lsps[m];
1683                for (l = m - 1; l >= 0; l--) {
1684                    if (lsps[l] <= tmp) break;
1685                    lsps[l + 1] = lsps[l];
1686                }
1687                lsps[l + 1] = tmp;
1688            }
1689            break;
1690        }
1691    }
1692}
1693
1694/**
1695 * Synthesize output samples for a single superframe. If we have any data
1696 * cached in s->sframe_cache, that will be used instead of whatever is loaded
1697 * in s->gb.
1698 *
1699 * WMA Voice superframes contain 3 frames, each containing 160 audio samples,
1700 * to give a total of 480 samples per frame. See #synth_frame() for frame
1701 * parsing. In addition to 3 frames, superframes can also contain the LSPs
1702 * (if these are globally specified for all frames (residually); they can
1703 * also be specified individually per-frame. See the s->has_residual_lsps
1704 * option), and can specify the number of samples encoded in this superframe
1705 * (if less than 480), usually used to prevent blanks at track boundaries.
1706 *
1707 * @param ctx WMA Voice decoder context
1708 * @return 0 on success, <0 on error or 1 if there was not enough data to
1709 *         fully parse the superframe
1710 */
1711static int synth_superframe(AVCodecContext *ctx, AVFrame *frame,
1712                            int *got_frame_ptr)
1713{
1714    WMAVoiceContext *s = ctx->priv_data;
1715    GetBitContext *gb = &s->gb, s_gb;
1716    int n, res, n_samples = MAX_SFRAMESIZE;
1717    double lsps[MAX_FRAMES][MAX_LSPS];
1718    const double *mean_lsf = s->lsps == 16 ?
1719        wmavoice_mean_lsf16[s->lsp_def_mode] : wmavoice_mean_lsf10[s->lsp_def_mode];
1720    float excitation[MAX_SIGNAL_HISTORY + MAX_SFRAMESIZE + 12];
1721    float synth[MAX_LSPS + MAX_SFRAMESIZE];
1722    float *samples;
1723
1724    memcpy(synth,      s->synth_history,
1725           s->lsps             * sizeof(*synth));
1726    memcpy(excitation, s->excitation_history,
1727           s->history_nsamples * sizeof(*excitation));
1728
1729    if (s->sframe_cache_size > 0) {
1730        gb = &s_gb;
1731        init_get_bits(gb, s->sframe_cache, s->sframe_cache_size);
1732        s->sframe_cache_size = 0;
1733    }
1734
1735    /* First bit is speech/music bit, it differentiates between WMAVoice
1736     * speech samples (the actual codec) and WMAVoice music samples, which
1737     * are really WMAPro-in-WMAVoice-superframes. I've never seen those in
1738     * the wild yet. */
1739    if (!get_bits1(gb)) {
1740        avpriv_request_sample(ctx, "WMAPro-in-WMAVoice");
1741        return AVERROR_PATCHWELCOME;
1742    }
1743
1744    /* (optional) nr. of samples in superframe; always <= 480 and >= 0 */
1745    if (get_bits1(gb)) {
1746        if ((n_samples = get_bits(gb, 12)) > MAX_SFRAMESIZE) {
1747            av_log(ctx, AV_LOG_ERROR,
1748                   "Superframe encodes > %d samples (%d), not allowed\n",
1749                   MAX_SFRAMESIZE, n_samples);
1750            return AVERROR_INVALIDDATA;
1751        }
1752    }
1753
1754    /* Parse LSPs, if global for the superframe (can also be per-frame). */
1755    if (s->has_residual_lsps) {
1756        double prev_lsps[MAX_LSPS], a1[MAX_LSPS * 2], a2[MAX_LSPS * 2];
1757
1758        for (n = 0; n < s->lsps; n++)
1759            prev_lsps[n] = s->prev_lsps[n] - mean_lsf[n];
1760
1761        if (s->lsps == 10) {
1762            dequant_lsp10r(gb, lsps[2], prev_lsps, a1, a2, s->lsp_q_mode);
1763        } else /* s->lsps == 16 */
1764            dequant_lsp16r(gb, lsps[2], prev_lsps, a1, a2, s->lsp_q_mode);
1765
1766        for (n = 0; n < s->lsps; n++) {
1767            lsps[0][n]  = mean_lsf[n] + (a1[n]           - a2[n * 2]);
1768            lsps[1][n]  = mean_lsf[n] + (a1[s->lsps + n] - a2[n * 2 + 1]);
1769            lsps[2][n] += mean_lsf[n];
1770        }
1771        for (n = 0; n < 3; n++)
1772            stabilize_lsps(lsps[n], s->lsps);
1773    }
1774
1775    /* synth_superframe can run multiple times per packet
1776     * free potential previous frame */
1777    av_frame_unref(frame);
1778
1779    /* get output buffer */
1780    frame->nb_samples = MAX_SFRAMESIZE;
1781    if ((res = ff_get_buffer(ctx, frame, 0)) < 0)
1782        return res;
1783    frame->nb_samples = n_samples;
1784    samples = (float *)frame->data[0];
1785
1786    /* Parse frames, optionally preceded by per-frame (independent) LSPs. */
1787    for (n = 0; n < 3; n++) {
1788        if (!s->has_residual_lsps) {
1789            int m;
1790
1791            if (s->lsps == 10) {
1792                dequant_lsp10i(gb, lsps[n]);
1793            } else /* s->lsps == 16 */
1794                dequant_lsp16i(gb, lsps[n]);
1795
1796            for (m = 0; m < s->lsps; m++)
1797                lsps[n][m] += mean_lsf[m];
1798            stabilize_lsps(lsps[n], s->lsps);
1799        }
1800
1801        if ((res = synth_frame(ctx, gb, n,
1802                               &samples[n * MAX_FRAMESIZE],
1803                               lsps[n], n == 0 ? s->prev_lsps : lsps[n - 1],
1804                               &excitation[s->history_nsamples + n * MAX_FRAMESIZE],
1805                               &synth[s->lsps + n * MAX_FRAMESIZE]))) {
1806            *got_frame_ptr = 0;
1807            return res;
1808        }
1809    }
1810
1811    /* Statistics? FIXME - we don't check for length, a slight overrun
1812     * will be caught by internal buffer padding, and anything else
1813     * will be skipped, not read. */
1814    if (get_bits1(gb)) {
1815        res = get_bits(gb, 4);
1816        skip_bits(gb, 10 * (res + 1));
1817    }
1818
1819    if (get_bits_left(gb) < 0) {
1820        wmavoice_flush(ctx);
1821        return AVERROR_INVALIDDATA;
1822    }
1823
1824    *got_frame_ptr = 1;
1825
1826    /* Update history */
1827    memcpy(s->prev_lsps,           lsps[2],
1828           s->lsps             * sizeof(*s->prev_lsps));
1829    memcpy(s->synth_history,      &synth[MAX_SFRAMESIZE],
1830           s->lsps             * sizeof(*synth));
1831    memcpy(s->excitation_history, &excitation[MAX_SFRAMESIZE],
1832           s->history_nsamples * sizeof(*excitation));
1833    if (s->do_apf)
1834        memmove(s->zero_exc_pf,       &s->zero_exc_pf[MAX_SFRAMESIZE],
1835                s->history_nsamples * sizeof(*s->zero_exc_pf));
1836
1837    return 0;
1838}
1839
1840/**
1841 * Parse the packet header at the start of each packet (input data to this
1842 * decoder).
1843 *
1844 * @param s WMA Voice decoding context private data
1845 * @return <0 on error, nb_superframes on success.
1846 */
1847static int parse_packet_header(WMAVoiceContext *s)
1848{
1849    GetBitContext *gb = &s->gb;
1850    unsigned int res, n_superframes = 0;
1851
1852    skip_bits(gb, 4);          // packet sequence number
1853    s->has_residual_lsps = get_bits1(gb);
1854    do {
1855        if (get_bits_left(gb) < 6 + s->spillover_bitsize)
1856            return AVERROR_INVALIDDATA;
1857
1858        res = get_bits(gb, 6); // number of superframes per packet
1859                               // (minus first one if there is spillover)
1860        n_superframes += res;
1861    } while (res == 0x3F);
1862    s->spillover_nbits   = get_bits(gb, s->spillover_bitsize);
1863
1864    return get_bits_left(gb) >= 0 ? n_superframes : AVERROR_INVALIDDATA;
1865}
1866
1867/**
1868 * Copy (unaligned) bits from gb/data/size to pb.
1869 *
1870 * @param pb target buffer to copy bits into
1871 * @param data source buffer to copy bits from
1872 * @param size size of the source data, in bytes
1873 * @param gb bit I/O context specifying the current position in the source.
1874 *           data. This function might use this to align the bit position to
1875 *           a whole-byte boundary before calling #ff_copy_bits() on aligned
1876 *           source data
1877 * @param nbits the amount of bits to copy from source to target
1878 *
1879 * @note after calling this function, the current position in the input bit
1880 *       I/O context is undefined.
1881 */
1882static void copy_bits(PutBitContext *pb,
1883                      const uint8_t *data, int size,
1884                      GetBitContext *gb, int nbits)
1885{
1886    int rmn_bytes, rmn_bits;
1887
1888    rmn_bits = rmn_bytes = get_bits_left(gb);
1889    if (rmn_bits < nbits)
1890        return;
1891    if (nbits > put_bits_left(pb))
1892        return;
1893    rmn_bits &= 7; rmn_bytes >>= 3;
1894    if ((rmn_bits = FFMIN(rmn_bits, nbits)) > 0)
1895        put_bits(pb, rmn_bits, get_bits(gb, rmn_bits));
1896    ff_copy_bits(pb, data + size - rmn_bytes,
1897                 FFMIN(nbits - rmn_bits, rmn_bytes << 3));
1898}
1899
1900/**
1901 * Packet decoding: a packet is anything that the (ASF) demuxer contains,
1902 * and we expect that the demuxer / application provides it to us as such
1903 * (else you'll probably get garbage as output). Every packet has a size of
1904 * ctx->block_align bytes, starts with a packet header (see
1905 * #parse_packet_header()), and then a series of superframes. Superframe
1906 * boundaries may exceed packets, i.e. superframes can split data over
1907 * multiple (two) packets.
1908 *
1909 * For more information about frames, see #synth_superframe().
1910 */
1911static int wmavoice_decode_packet(AVCodecContext *ctx, AVFrame *frame,
1912                                  int *got_frame_ptr, AVPacket *avpkt)
1913{
1914    WMAVoiceContext *s = ctx->priv_data;
1915    GetBitContext *gb = &s->gb;
1916    int size, res, pos;
1917
1918    /* Packets are sometimes a multiple of ctx->block_align, with a packet
1919     * header at each ctx->block_align bytes. However, FFmpeg's ASF demuxer
1920     * feeds us ASF packets, which may concatenate multiple "codec" packets
1921     * in a single "muxer" packet, so we artificially emulate that by
1922     * capping the packet size at ctx->block_align. */
1923    for (size = avpkt->size; size > ctx->block_align; size -= ctx->block_align);
1924    init_get_bits8(&s->gb, avpkt->data, size);
1925
1926    /* size == ctx->block_align is used to indicate whether we are dealing with
1927     * a new packet or a packet of which we already read the packet header
1928     * previously. */
1929    if (!(size % ctx->block_align)) { // new packet header
1930        if (!size) {
1931            s->spillover_nbits = 0;
1932            s->nb_superframes = 0;
1933        } else {
1934            if ((res = parse_packet_header(s)) < 0)
1935                return res;
1936            s->nb_superframes = res;
1937        }
1938
1939        /* If the packet header specifies a s->spillover_nbits, then we want
1940         * to push out all data of the previous packet (+ spillover) before
1941         * continuing to parse new superframes in the current packet. */
1942        if (s->sframe_cache_size > 0) {
1943            int cnt = get_bits_count(gb);
1944            if (cnt + s->spillover_nbits > avpkt->size * 8) {
1945                s->spillover_nbits = avpkt->size * 8 - cnt;
1946            }
1947            copy_bits(&s->pb, avpkt->data, size, gb, s->spillover_nbits);
1948            flush_put_bits(&s->pb);
1949            s->sframe_cache_size += s->spillover_nbits;
1950            if ((res = synth_superframe(ctx, frame, got_frame_ptr)) == 0 &&
1951                *got_frame_ptr) {
1952                cnt += s->spillover_nbits;
1953                s->skip_bits_next = cnt & 7;
1954                res = cnt >> 3;
1955                return res;
1956            } else
1957                skip_bits_long (gb, s->spillover_nbits - cnt +
1958                                get_bits_count(gb)); // resync
1959        } else if (s->spillover_nbits) {
1960            skip_bits_long(gb, s->spillover_nbits);  // resync
1961        }
1962    } else if (s->skip_bits_next)
1963        skip_bits(gb, s->skip_bits_next);
1964
1965    /* Try parsing superframes in current packet */
1966    s->sframe_cache_size = 0;
1967    s->skip_bits_next = 0;
1968    pos = get_bits_left(gb);
1969    if (s->nb_superframes-- == 0) {
1970        *got_frame_ptr = 0;
1971        return size;
1972    } else if (s->nb_superframes > 0) {
1973        if ((res = synth_superframe(ctx, frame, got_frame_ptr)) < 0) {
1974            return res;
1975        } else if (*got_frame_ptr) {
1976            int cnt = get_bits_count(gb);
1977            s->skip_bits_next = cnt & 7;
1978            res = cnt >> 3;
1979            return res;
1980        }
1981    } else if ((s->sframe_cache_size = pos) > 0) {
1982        /* ... cache it for spillover in next packet */
1983        init_put_bits(&s->pb, s->sframe_cache, SFRAME_CACHE_MAXSIZE);
1984        copy_bits(&s->pb, avpkt->data, size, gb, s->sframe_cache_size);
1985        // FIXME bad - just copy bytes as whole and add use the
1986        // skip_bits_next field
1987    }
1988
1989    return size;
1990}
1991
1992static av_cold int wmavoice_decode_end(AVCodecContext *ctx)
1993{
1994    WMAVoiceContext *s = ctx->priv_data;
1995
1996    if (s->do_apf) {
1997        ff_rdft_end(&s->rdft);
1998        ff_rdft_end(&s->irdft);
1999        ff_dct_end(&s->dct);
2000        ff_dct_end(&s->dst);
2001    }
2002
2003    return 0;
2004}
2005
2006const FFCodec ff_wmavoice_decoder = {
2007    .p.name           = "wmavoice",
2008    .p.long_name      = NULL_IF_CONFIG_SMALL("Windows Media Audio Voice"),
2009    .p.type           = AVMEDIA_TYPE_AUDIO,
2010    .p.id             = AV_CODEC_ID_WMAVOICE,
2011    .priv_data_size   = sizeof(WMAVoiceContext),
2012    .init             = wmavoice_decode_init,
2013    .close            = wmavoice_decode_end,
2014    FF_CODEC_DECODE_CB(wmavoice_decode_packet),
2015    .p.capabilities   = AV_CODEC_CAP_SUBFRAMES | AV_CODEC_CAP_DR1 | AV_CODEC_CAP_DELAY,
2016    .caps_internal    = FF_CODEC_CAP_INIT_THREADSAFE | FF_CODEC_CAP_INIT_CLEANUP,
2017    .flush            = wmavoice_flush,
2018};
2019