xref: /third_party/ffmpeg/libavcodec/vp8.c (revision cabdff1a)
1/*
2 * VP7/VP8 compatible video decoder
3 *
4 * Copyright (C) 2010 David Conrad
5 * Copyright (C) 2010 Ronald S. Bultje
6 * Copyright (C) 2010 Fiona Glaser
7 * Copyright (C) 2012 Daniel Kang
8 * Copyright (C) 2014 Peter Ross
9 *
10 * This file is part of FFmpeg.
11 *
12 * FFmpeg is free software; you can redistribute it and/or
13 * modify it under the terms of the GNU Lesser General Public
14 * License as published by the Free Software Foundation; either
15 * version 2.1 of the License, or (at your option) any later version.
16 *
17 * FFmpeg is distributed in the hope that it will be useful,
18 * but WITHOUT ANY WARRANTY; without even the implied warranty of
19 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
20 * Lesser General Public License for more details.
21 *
22 * You should have received a copy of the GNU Lesser General Public
23 * License along with FFmpeg; if not, write to the Free Software
24 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
25 */
26
27#include "config_components.h"
28
29#include "libavutil/imgutils.h"
30#include "libavutil/mem_internal.h"
31
32#include "avcodec.h"
33#include "codec_internal.h"
34#include "hwconfig.h"
35#include "internal.h"
36#include "mathops.h"
37#include "rectangle.h"
38#include "thread.h"
39#include "threadframe.h"
40#include "vp8.h"
41#include "vp8data.h"
42
43#if ARCH_ARM
44#   include "arm/vp8.h"
45#endif
46
47#if CONFIG_VP7_DECODER && CONFIG_VP8_DECODER
48#define VPX(vp7, f) (vp7 ? vp7_ ## f : vp8_ ## f)
49#elif CONFIG_VP7_DECODER
50#define VPX(vp7, f) vp7_ ## f
51#else // CONFIG_VP8_DECODER
52#define VPX(vp7, f) vp8_ ## f
53#endif
54
55static void free_buffers(VP8Context *s)
56{
57    int i;
58    if (s->thread_data)
59        for (i = 0; i < MAX_THREADS; i++) {
60#if HAVE_THREADS
61            pthread_cond_destroy(&s->thread_data[i].cond);
62            pthread_mutex_destroy(&s->thread_data[i].lock);
63#endif
64            av_freep(&s->thread_data[i].filter_strength);
65        }
66    av_freep(&s->thread_data);
67    av_freep(&s->macroblocks_base);
68    av_freep(&s->intra4x4_pred_mode_top);
69    av_freep(&s->top_nnz);
70    av_freep(&s->top_border);
71
72    s->macroblocks = NULL;
73}
74
75static int vp8_alloc_frame(VP8Context *s, VP8Frame *f, int ref)
76{
77    int ret;
78    if ((ret = ff_thread_get_ext_buffer(s->avctx, &f->tf,
79                                        ref ? AV_GET_BUFFER_FLAG_REF : 0)) < 0)
80        return ret;
81    if (!(f->seg_map = av_buffer_allocz(s->mb_width * s->mb_height)))
82        goto fail;
83    if (s->avctx->hwaccel) {
84        const AVHWAccel *hwaccel = s->avctx->hwaccel;
85        if (hwaccel->frame_priv_data_size) {
86            f->hwaccel_priv_buf = av_buffer_allocz(hwaccel->frame_priv_data_size);
87            if (!f->hwaccel_priv_buf)
88                goto fail;
89            f->hwaccel_picture_private = f->hwaccel_priv_buf->data;
90        }
91    }
92    return 0;
93
94fail:
95    av_buffer_unref(&f->seg_map);
96    ff_thread_release_ext_buffer(s->avctx, &f->tf);
97    return AVERROR(ENOMEM);
98}
99
100static void vp8_release_frame(VP8Context *s, VP8Frame *f)
101{
102    av_buffer_unref(&f->seg_map);
103    av_buffer_unref(&f->hwaccel_priv_buf);
104    f->hwaccel_picture_private = NULL;
105    ff_thread_release_ext_buffer(s->avctx, &f->tf);
106}
107
108#if CONFIG_VP8_DECODER
109static int vp8_ref_frame(VP8Context *s, VP8Frame *dst, VP8Frame *src)
110{
111    int ret;
112
113    vp8_release_frame(s, dst);
114
115    if ((ret = ff_thread_ref_frame(&dst->tf, &src->tf)) < 0)
116        return ret;
117    if (src->seg_map &&
118        !(dst->seg_map = av_buffer_ref(src->seg_map))) {
119        vp8_release_frame(s, dst);
120        return AVERROR(ENOMEM);
121    }
122    if (src->hwaccel_picture_private) {
123        dst->hwaccel_priv_buf = av_buffer_ref(src->hwaccel_priv_buf);
124        if (!dst->hwaccel_priv_buf)
125            return AVERROR(ENOMEM);
126        dst->hwaccel_picture_private = dst->hwaccel_priv_buf->data;
127    }
128
129    return 0;
130}
131#endif /* CONFIG_VP8_DECODER */
132
133static void vp8_decode_flush_impl(AVCodecContext *avctx, int free_mem)
134{
135    VP8Context *s = avctx->priv_data;
136    int i;
137
138    for (i = 0; i < FF_ARRAY_ELEMS(s->frames); i++)
139        vp8_release_frame(s, &s->frames[i]);
140    memset(s->framep, 0, sizeof(s->framep));
141
142    if (free_mem)
143        free_buffers(s);
144}
145
/* FFCodec.flush callback: release frame references but keep the
 * per-context allocations for reuse. */
static void vp8_decode_flush(AVCodecContext *avctx)
{
    vp8_decode_flush_impl(avctx, 0);
}
150
151static VP8Frame *vp8_find_free_buffer(VP8Context *s)
152{
153    VP8Frame *frame = NULL;
154    int i;
155
156    // find a free buffer
157    for (i = 0; i < 5; i++)
158        if (&s->frames[i] != s->framep[VP56_FRAME_CURRENT]  &&
159            &s->frames[i] != s->framep[VP56_FRAME_PREVIOUS] &&
160            &s->frames[i] != s->framep[VP56_FRAME_GOLDEN]   &&
161            &s->frames[i] != s->framep[VP56_FRAME_GOLDEN2]) {
162            frame = &s->frames[i];
163            break;
164        }
165    if (i == 5) {
166        av_log(s->avctx, AV_LOG_FATAL, "Ran out of free frames!\n");
167        abort();
168    }
169    if (frame->tf.f->buf[0])
170        vp8_release_frame(s, frame);
171
172    return frame;
173}
174
175static enum AVPixelFormat get_pixel_format(VP8Context *s)
176{
177    enum AVPixelFormat pix_fmts[] = {
178#if CONFIG_VP8_VAAPI_HWACCEL
179        AV_PIX_FMT_VAAPI,
180#endif
181#if CONFIG_VP8_NVDEC_HWACCEL
182        AV_PIX_FMT_CUDA,
183#endif
184        AV_PIX_FMT_YUV420P,
185        AV_PIX_FMT_NONE,
186    };
187
188    return ff_get_format(s->avctx, pix_fmts);
189}
190
/**
 * (Re)initialize the decoder for the given frame dimensions: set the
 * avctx dimensions, negotiate the pixel format, and (re)allocate the
 * macroblock array, top-row caches and per-thread data.
 *
 * @return 0 on success, negative AVERROR on failure (buffers freed)
 */
static av_always_inline
int update_dimensions(VP8Context *s, int width, int height, int is_vp7)
{
    AVCodecContext *avctx = s->avctx;
    int i, ret, dim_reset = 0;

    /* Flush and resize when the display size changed, or when the
     * macroblock grid changed while old-sized buffers are still allocated.
     * NOTE: the mixed &&/|| relies on precedence (&& binds tighter). */
    if (width  != s->avctx->width || ((width+15)/16 != s->mb_width || (height+15)/16 != s->mb_height) && s->macroblocks_base ||
        height != s->avctx->height) {
        vp8_decode_flush_impl(s->avctx, 1);

        ret = ff_set_dimensions(s->avctx, width, height);
        if (ret < 0)
            return ret;

        /* remember whether we are re-initializing (vs. first init) */
        dim_reset = (s->macroblocks_base != NULL);
    }

    /* pixel format negotiation is skipped for WebP (set by its caller —
     * see actually_webp) and for VP7, which has no hwaccels here */
    if ((s->pix_fmt == AV_PIX_FMT_NONE || dim_reset) &&
         !s->actually_webp && !is_vp7) {
        s->pix_fmt = get_pixel_format(s);
        if (s->pix_fmt < 0)
            return AVERROR(EINVAL);
        avctx->pix_fmt = s->pix_fmt;
    }

    s->mb_width  = (s->avctx->coded_width  + 15) / 16;
    s->mb_height = (s->avctx->coded_height + 15) / 16;

    /* mb_layout selects the full-frame macroblock layout, used by VP7 and
     * by slice-threaded VP8 decoding */
    s->mb_layout = is_vp7 || avctx->active_thread_type == FF_THREAD_SLICE &&
                   avctx->thread_count > 1;
    if (!s->mb_layout) { // Frame threading and one thread
        s->macroblocks_base       = av_mallocz((s->mb_width + s->mb_height * 2 + 1) *
                                               sizeof(*s->macroblocks));
        s->intra4x4_pred_mode_top = av_mallocz(s->mb_width * 4);
    } else // Sliced threading
        s->macroblocks_base = av_mallocz((s->mb_width + 2) * (s->mb_height + 2) *
                                         sizeof(*s->macroblocks));
    s->top_nnz     = av_mallocz(s->mb_width * sizeof(*s->top_nnz));
    s->top_border  = av_mallocz((s->mb_width + 1) * sizeof(*s->top_border));
    s->thread_data = av_mallocz(MAX_THREADS * sizeof(VP8ThreadData));

    /* intra4x4_pred_mode_top is only needed for the !mb_layout case */
    if (!s->macroblocks_base || !s->top_nnz || !s->top_border ||
        !s->thread_data || (!s->intra4x4_pred_mode_top && !s->mb_layout)) {
        free_buffers(s);
        return AVERROR(ENOMEM);
    }

    for (i = 0; i < MAX_THREADS; i++) {
        s->thread_data[i].filter_strength =
            av_mallocz(s->mb_width * sizeof(*s->thread_data[0].filter_strength));
        if (!s->thread_data[i].filter_strength) {
            free_buffers(s);
            return AVERROR(ENOMEM);
        }
#if HAVE_THREADS
        pthread_mutex_init(&s->thread_data[i].lock, NULL);
        pthread_cond_init(&s->thread_data[i].cond, NULL);
#endif
    }

    /* offset by one so mb[-1] addresses the left-neighbour column */
    s->macroblocks = s->macroblocks_base + 1;

    return 0;
}
255
/* VP7 entry point for update_dimensions(); the constant is_vp7 argument
 * lets the always-inline body be specialized at compile time. */
static int vp7_update_dimensions(VP8Context *s, int width, int height)
{
    return update_dimensions(s, width, height, IS_VP7);
}
260
/* VP8 entry point for update_dimensions(); the constant is_vp7 argument
 * lets the always-inline body be specialized at compile time. */
static int vp8_update_dimensions(VP8Context *s, int width, int height)
{
    return update_dimensions(s, width, height, IS_VP8);
}
265
266
/* Parse the segmentation portion of the frame header: optional per-segment
 * quantizer/filter-level data and optional segment-id tree probabilities.
 * All range-coder reads happen in strict bitstream order. */
static void parse_segment_info(VP8Context *s)
{
    VP56RangeCoder *c = &s->c;
    int i;

    s->segmentation.update_map = vp8_rac_get(c);
    s->segmentation.update_feature_data = vp8_rac_get(c);

    if (s->segmentation.update_feature_data) {
        /* absolute_vals: per-segment values replace the frame values
         * instead of being added to them (see vp8_get_quants) */
        s->segmentation.absolute_vals = vp8_rac_get(c);

        for (i = 0; i < 4; i++)
            s->segmentation.base_quant[i]   = vp8_rac_get_sint(c, 7);

        for (i = 0; i < 4; i++)
            s->segmentation.filter_level[i] = vp8_rac_get_sint(c, 6);
    }
    if (s->segmentation.update_map)
        /* each tree probability is either transmitted or defaults to 255 */
        for (i = 0; i < 3; i++)
            s->prob->segmentid[i] = vp8_rac_get(c) ? vp8_rac_get_uint(c, 8) : 255;
}
288
289static void update_lf_deltas(VP8Context *s)
290{
291    VP56RangeCoder *c = &s->c;
292    int i;
293
294    for (i = 0; i < 4; i++) {
295        if (vp8_rac_get(c)) {
296            s->lf_delta.ref[i] = vp8_rac_get_uint(c, 6);
297
298            if (vp8_rac_get(c))
299                s->lf_delta.ref[i] = -s->lf_delta.ref[i];
300        }
301    }
302
303    for (i = MODE_I4x4; i <= VP8_MVMODE_SPLIT; i++) {
304        if (vp8_rac_get(c)) {
305            s->lf_delta.mode[i] = vp8_rac_get_uint(c, 6);
306
307            if (vp8_rac_get(c))
308                s->lf_delta.mode[i] = -s->lf_delta.mode[i];
309        }
310    }
311}
312
313static int setup_partitions(VP8Context *s, const uint8_t *buf, int buf_size)
314{
315    const uint8_t *sizes = buf;
316    int i;
317    int ret;
318
319    s->num_coeff_partitions = 1 << vp8_rac_get_uint(&s->c, 2);
320
321    buf      += 3 * (s->num_coeff_partitions - 1);
322    buf_size -= 3 * (s->num_coeff_partitions - 1);
323    if (buf_size < 0)
324        return -1;
325
326    for (i = 0; i < s->num_coeff_partitions - 1; i++) {
327        int size = AV_RL24(sizes + 3 * i);
328        if (buf_size - size < 0)
329            return -1;
330        s->coeff_partition_size[i] = size;
331
332        ret = ff_vp56_init_range_decoder(&s->coeff_partition[i], buf, size);
333        if (ret < 0)
334            return ret;
335        buf      += size;
336        buf_size -= size;
337    }
338
339    s->coeff_partition_size[i] = buf_size;
340    ff_vp56_init_range_decoder(&s->coeff_partition[i], buf, buf_size);
341
342    return 0;
343}
344
345static void vp7_get_quants(VP8Context *s)
346{
347    VP56RangeCoder *c = &s->c;
348
349    int yac_qi  = vp8_rac_get_uint(c, 7);
350    int ydc_qi  = vp8_rac_get(c) ? vp8_rac_get_uint(c, 7) : yac_qi;
351    int y2dc_qi = vp8_rac_get(c) ? vp8_rac_get_uint(c, 7) : yac_qi;
352    int y2ac_qi = vp8_rac_get(c) ? vp8_rac_get_uint(c, 7) : yac_qi;
353    int uvdc_qi = vp8_rac_get(c) ? vp8_rac_get_uint(c, 7) : yac_qi;
354    int uvac_qi = vp8_rac_get(c) ? vp8_rac_get_uint(c, 7) : yac_qi;
355
356    s->qmat[0].luma_qmul[0]    =       vp7_ydc_qlookup[ydc_qi];
357    s->qmat[0].luma_qmul[1]    =       vp7_yac_qlookup[yac_qi];
358    s->qmat[0].luma_dc_qmul[0] =       vp7_y2dc_qlookup[y2dc_qi];
359    s->qmat[0].luma_dc_qmul[1] =       vp7_y2ac_qlookup[y2ac_qi];
360    s->qmat[0].chroma_qmul[0]  = FFMIN(vp7_ydc_qlookup[uvdc_qi], 132);
361    s->qmat[0].chroma_qmul[1]  =       vp7_yac_qlookup[uvac_qi];
362}
363
/* Parse the VP8 dequantization indices (base luma AC index plus five
 * signed deltas) and build the per-segment quantizer multiplier tables. */
static void vp8_get_quants(VP8Context *s)
{
    VP56RangeCoder *c = &s->c;
    int i, base_qi;

    /* reads happen in strict bitstream order */
    s->quant.yac_qi     = vp8_rac_get_uint(c, 7);
    s->quant.ydc_delta  = vp8_rac_get_sint(c, 4);
    s->quant.y2dc_delta = vp8_rac_get_sint(c, 4);
    s->quant.y2ac_delta = vp8_rac_get_sint(c, 4);
    s->quant.uvdc_delta = vp8_rac_get_sint(c, 4);
    s->quant.uvac_delta = vp8_rac_get_sint(c, 4);

    for (i = 0; i < 4; i++) {
        /* with segmentation, each segment provides its own base index,
         * either absolute or relative to yac_qi */
        if (s->segmentation.enabled) {
            base_qi = s->segmentation.base_quant[i];
            if (!s->segmentation.absolute_vals)
                base_qi += s->quant.yac_qi;
        } else
            base_qi = s->quant.yac_qi;

        /* all indices are clamped to [0, 127] before the table lookup */
        s->qmat[i].luma_qmul[0]    = vp8_dc_qlookup[av_clip_uintp2(base_qi + s->quant.ydc_delta,  7)];
        s->qmat[i].luma_qmul[1]    = vp8_ac_qlookup[av_clip_uintp2(base_qi,              7)];
        s->qmat[i].luma_dc_qmul[0] = vp8_dc_qlookup[av_clip_uintp2(base_qi + s->quant.y2dc_delta, 7)] * 2;
        /* 101581>>16 is equivalent to 155/100 */
        s->qmat[i].luma_dc_qmul[1] = vp8_ac_qlookup[av_clip_uintp2(base_qi + s->quant.y2ac_delta, 7)] * 101581 >> 16;
        s->qmat[i].chroma_qmul[0]  = vp8_dc_qlookup[av_clip_uintp2(base_qi + s->quant.uvdc_delta, 7)];
        s->qmat[i].chroma_qmul[1]  = vp8_ac_qlookup[av_clip_uintp2(base_qi + s->quant.uvac_delta, 7)];

        /* second-order luma DC has a floor of 8; chroma DC a ceiling of 132 */
        s->qmat[i].luma_dc_qmul[1] = FFMAX(s->qmat[i].luma_dc_qmul[1], 8);
        s->qmat[i].chroma_qmul[0]  = FFMIN(s->qmat[i].chroma_qmul[0], 132);
    }
}
396
397/**
398 * Determine which buffers golden and altref should be updated with after this frame.
399 * The spec isn't clear here, so I'm going by my understanding of what libvpx does
400 *
401 * Intra frames update all 3 references
402 * Inter frames update VP56_FRAME_PREVIOUS if the update_last flag is set
403 * If the update (golden|altref) flag is set, it's updated with the current frame
404 *      if update_last is set, and VP56_FRAME_PREVIOUS otherwise.
405 * If the flag is not set, the number read means:
406 *      0: no update
407 *      1: VP56_FRAME_PREVIOUS
408 *      2: update golden with altref, or update altref with golden
409 */
410static VP56Frame ref_to_update(VP8Context *s, int update, VP56Frame ref)
411{
412    VP56RangeCoder *c = &s->c;
413
414    if (update)
415        return VP56_FRAME_CURRENT;
416
417    switch (vp8_rac_get_uint(c, 2)) {
418    case 1:
419        return VP56_FRAME_PREVIOUS;
420    case 2:
421        return (ref == VP56_FRAME_GOLDEN) ? VP56_FRAME_GOLDEN2 : VP56_FRAME_GOLDEN;
422    }
423    return VP56_FRAME_NONE;
424}
425
/* Reset the DCT token probabilities to their defaults, expanding the
 * per-band default tables to all 16 scan positions via vp8_coeff_band. */
static void vp78_reset_probability_tables(VP8Context *s)
{
    int i, j;
    for (i = 0; i < 4; i++)
        for (j = 0; j < 16; j++)
            memcpy(s->prob->token[i][j], vp8_token_default_probs[i][vp8_coeff_band[j]],
                   sizeof(s->prob->token[i][j]));
}
434
/* DCT token probability updates: for every (block type, band, context,
 * token) position the bitstream may carry a replacement probability,
 * which is then mirrored into every scan position belonging to that band. */
static void vp78_update_probability_tables(VP8Context *s)
{
    VP56RangeCoder *c = &s->c;
    int i, j, k, l, m;

    for (i = 0; i < 4; i++)
        for (j = 0; j < 8; j++)
            for (k = 0; k < 3; k++)
                for (l = 0; l < NUM_DCT_TOKENS-1; l++)
                    if (vp56_rac_get_prob_branchy(c, vp8_token_update_probs[i][j][k][l])) {
                        int prob = vp8_rac_get_uint(c, 8);
                        /* vp8_coeff_band_indexes rows are -1 terminated */
                        for (m = 0; vp8_coeff_band_indexes[j][m] >= 0; m++)
                            s->prob->token[i][vp8_coeff_band_indexes[j][m]][k][l] = prob;
                    }
}
450
451#define VP7_MVC_SIZE 17
452#define VP8_MVC_SIZE 19
453
/* Optional updates of the inter-frame intra-mode probabilities followed by
 * the MV probability update (spec 17.2).  mvc_size is VP7_MVC_SIZE (17)
 * for VP7 and VP8_MVC_SIZE (19) for VP8. */
static void vp78_update_pred16x16_pred8x8_mvc_probabilities(VP8Context *s,
                                                            int mvc_size)
{
    VP56RangeCoder *c = &s->c;
    int i, j;

    if (vp8_rac_get(c))
        for (i = 0; i < 4; i++)
            s->prob->pred16x16[i] = vp8_rac_get_uint(c, 8);
    if (vp8_rac_get(c))
        for (i = 0; i < 3; i++)
            s->prob->pred8x8c[i]  = vp8_rac_get_uint(c, 8);

    // 17.2 MV probability update
    for (i = 0; i < 2; i++)
        for (j = 0; j < mvc_size; j++)
            if (vp56_rac_get_prob_branchy(c, vp8_mv_update_prob[i][j]))
                s->prob->mvc[i][j] = vp8_rac_get_nn(c);
}
473
474static void update_refs(VP8Context *s)
475{
476    VP56RangeCoder *c = &s->c;
477
478    int update_golden = vp8_rac_get(c);
479    int update_altref = vp8_rac_get(c);
480
481    s->update_golden = ref_to_update(s, update_golden, VP56_FRAME_GOLDEN);
482    s->update_altref = ref_to_update(s, update_altref, VP56_FRAME_GOLDEN2);
483}
484
485static void copy_chroma(AVFrame *dst, AVFrame *src, int width, int height)
486{
487    int i, j;
488
489    for (j = 1; j < 3; j++) {
490        for (i = 0; i < height / 2; i++)
491            memcpy(dst->data[j] + i * dst->linesize[j],
492                   src->data[j] + i * src->linesize[j], width / 2);
493    }
494}
495
/* Apply a VP7 fade to one plane: out = clip_uint8(y + y*beta/256 + alpha).
 * src and dst may point to the same plane (in-place fade). */
static void fade(uint8_t *dst, ptrdiff_t dst_linesize,
                 const uint8_t *src, ptrdiff_t src_linesize,
                 int width, int height,
                 int alpha, int beta)
{
    int x, y;

    for (y = 0; y < height; y++) {
        const uint8_t *srow = src + y * src_linesize;
        uint8_t       *drow = dst + y * dst_linesize;
        for (x = 0; x < width; x++) {
            int v = srow[x];
            drow[x] = av_clip_uint8(v + ((v * beta) >> 8) + alpha);
        }
    }
}
511
/**
 * Apply VP7 fading to the previous frame.  When the previous frame is
 * shared with the golden frame, a fresh previous-frame buffer is
 * allocated first so the golden frame stays untouched.
 *
 * No-op on keyframes or when alpha and beta are both zero.
 *
 * @return 0 on success, negative AVERROR on missing references or
 *         allocation failure
 */
static int vp7_fade_frame(VP8Context *s, int alpha, int beta)
{
    int ret;

    if (!s->keyframe && (alpha || beta)) {
        int width  = s->mb_width * 16;
        int height = s->mb_height * 16;
        AVFrame *src, *dst;

        if (!s->framep[VP56_FRAME_PREVIOUS] ||
            !s->framep[VP56_FRAME_GOLDEN]) {
            av_log(s->avctx, AV_LOG_WARNING, "Discarding interframe without a prior keyframe!\n");
            return AVERROR_INVALIDDATA;
        }

        /* by default, fade in place */
        dst =
        src = s->framep[VP56_FRAME_PREVIOUS]->tf.f;

        /* preserve the golden frame, write a new previous frame */
        if (s->framep[VP56_FRAME_GOLDEN] == s->framep[VP56_FRAME_PREVIOUS]) {
            s->framep[VP56_FRAME_PREVIOUS] = vp8_find_free_buffer(s);
            if ((ret = vp8_alloc_frame(s, s->framep[VP56_FRAME_PREVIOUS], 1)) < 0)
                return ret;

            dst = s->framep[VP56_FRAME_PREVIOUS]->tf.f;

            /* chroma is not faded, only copied into the new buffer */
            copy_chroma(dst, src, width, height);
        }

        fade(dst->data[0], dst->linesize[0],
             src->data[0], src->linesize[0],
             width, height, alpha, beta);
    }

    return 0;
}
548
/**
 * Parse a complete VP7 frame header (uncompressed bytes plus the first
 * compressed partition).  All range-coder reads below occur in strict
 * bitstream order; the lettered comments follow the spec's section labels.
 *
 * @return 0 on success, negative AVERROR on invalid or truncated data
 */
static int vp7_decode_frame_header(VP8Context *s, const uint8_t *buf, int buf_size)
{
    VP56RangeCoder *c = &s->c;
    int part1_size, hscale, vscale, i, j, ret;
    int width  = s->avctx->width;
    int height = s->avctx->height;
    int alpha = 0;
    int beta  = 0;

    if (buf_size < 4) {
        return AVERROR_INVALIDDATA;
    }

    s->profile = (buf[0] >> 1) & 7;
    if (s->profile > 1) {
        avpriv_request_sample(s->avctx, "Unknown profile %d", s->profile);
        return AVERROR_INVALIDDATA;
    }

    s->keyframe  = !(buf[0] & 1);
    s->invisible = 0;
    part1_size   = AV_RL24(buf) >> 4;

    if (buf_size < 4 - s->profile + part1_size) {
        av_log(s->avctx, AV_LOG_ERROR, "Buffer size %d is too small, needed : %d\n", buf_size, 4 - s->profile + part1_size);
        return AVERROR_INVALIDDATA;
    }

    /* the uncompressed prefix is 4 - profile bytes long */
    buf      += 4 - s->profile;
    buf_size -= 4 - s->profile;

    /* VP7 always uses the sixtap (epel) motion compensation functions */
    memcpy(s->put_pixels_tab, s->vp8dsp.put_vp8_epel_pixels_tab, sizeof(s->put_pixels_tab));

    ret = ff_vp56_init_range_decoder(c, buf, part1_size);
    if (ret < 0)
        return ret;
    buf      += part1_size;
    buf_size -= part1_size;

    /* A. Dimension information (keyframes only) */
    if (s->keyframe) {
        width  = vp8_rac_get_uint(c, 12);
        height = vp8_rac_get_uint(c, 12);
        hscale = vp8_rac_get_uint(c, 2);
        vscale = vp8_rac_get_uint(c, 2);
        if (hscale || vscale)
            avpriv_request_sample(s->avctx, "Upscaling");

        /* keyframes reset all probabilities and reference state */
        s->update_golden = s->update_altref = VP56_FRAME_CURRENT;
        vp78_reset_probability_tables(s);
        memcpy(s->prob->pred16x16, vp8_pred16x16_prob_inter,
               sizeof(s->prob->pred16x16));
        memcpy(s->prob->pred8x8c, vp8_pred8x8c_prob_inter,
               sizeof(s->prob->pred8x8c));
        for (i = 0; i < 2; i++)
            memcpy(s->prob->mvc[i], vp7_mv_default_prob[i],
                   sizeof(vp7_mv_default_prob[i]));
        memset(&s->segmentation, 0, sizeof(s->segmentation));
        memset(&s->lf_delta, 0, sizeof(s->lf_delta));
        memcpy(s->prob[0].scan, ff_zigzag_scan, sizeof(s->prob[0].scan));
    }

    if (s->keyframe || s->profile > 0)
        memset(s->inter_dc_pred, 0 , sizeof(s->inter_dc_pred));

    /* B. Decoding information for all four macroblock-level features */
    for (i = 0; i < 4; i++) {
        s->feature_enabled[i] = vp8_rac_get(c);
        if (s->feature_enabled[i]) {
             s->feature_present_prob[i] = vp8_rac_get_uint(c, 8);

             for (j = 0; j < 3; j++)
                 s->feature_index_prob[i][j] =
                     vp8_rac_get(c) ? vp8_rac_get_uint(c, 8) : 255;

             if (vp7_feature_value_size[s->profile][i])
                 for (j = 0; j < 4; j++)
                     s->feature_value[i][j] =
                        vp8_rac_get(c) ? vp8_rac_get_uint(c, vp7_feature_value_size[s->profile][i]) : 0;
        }
    }

    /* VP7 has no segmentation or loop-filter deltas */
    s->segmentation.enabled    = 0;
    s->segmentation.update_map = 0;
    s->lf_delta.enabled        = 0;

    /* VP7 always uses a single coefficient partition */
    s->num_coeff_partitions = 1;
    ret = ff_vp56_init_range_decoder(&s->coeff_partition[0], buf, buf_size);
    if (ret < 0)
        return ret;

    if (!s->macroblocks_base || /* first frame */
        width != s->avctx->width || height != s->avctx->height ||
        (width + 15) / 16 != s->mb_width || (height + 15) / 16 != s->mb_height) {
        if ((ret = vp7_update_dimensions(s, width, height)) < 0)
            return ret;
    }

    /* C. Dequantization indices */
    vp7_get_quants(s);

    /* D. Golden frame update flag (a Flag) for interframes only */
    if (!s->keyframe) {
        s->update_golden = vp8_rac_get(c) ? VP56_FRAME_CURRENT : VP56_FRAME_NONE;
        s->sign_bias[VP56_FRAME_GOLDEN] = 0;
    }

    s->update_last          = 1;
    s->update_probabilities = 1;
    s->fade_present         = 1;

    if (s->profile > 0) {
        s->update_probabilities = vp8_rac_get(c);
        /* keep a private copy of the probabilities if not persisting them */
        if (!s->update_probabilities)
            s->prob[1] = s->prob[0];

        if (!s->keyframe)
            s->fade_present = vp8_rac_get(c);
    }

    if (vpX_rac_is_end(c))
        return AVERROR_INVALIDDATA;
    /* E. Fading information for previous frame */
    if (s->fade_present && vp8_rac_get(c)) {
        alpha = (int8_t) vp8_rac_get_uint(c, 8);
        beta  = (int8_t) vp8_rac_get_uint(c, 8);
    }

    /* F. Loop filter type */
    if (!s->profile)
        s->filter.simple = vp8_rac_get(c);

    /* G. DCT coefficient ordering specification */
    if (vp8_rac_get(c))
        for (i = 1; i < 16; i++)
            s->prob[0].scan[i] = ff_zigzag_scan[vp8_rac_get_uint(c, 4)];

    /* H. Loop filter levels  */
    if (s->profile > 0)
        s->filter.simple = vp8_rac_get(c);
    s->filter.level     = vp8_rac_get_uint(c, 6);
    s->filter.sharpness = vp8_rac_get_uint(c, 3);

    /* I. DCT coefficient probability update; 13.3 Token Probability Updates */
    vp78_update_probability_tables(s);

    s->mbskip_enabled = 0;

    /* J. The remaining frame header data occurs ONLY FOR INTERFRAMES */
    if (!s->keyframe) {
        s->prob->intra  = vp8_rac_get_uint(c, 8);
        s->prob->last   = vp8_rac_get_uint(c, 8);
        vp78_update_pred16x16_pred8x8_mvc_probabilities(s, VP7_MVC_SIZE);
    }

    if (vpX_rac_is_end(c))
        return AVERROR_INVALIDDATA;

    if ((ret = vp7_fade_frame(s, alpha, beta)) < 0)
        return ret;

    return 0;
}
712
713static int vp8_decode_frame_header(VP8Context *s, const uint8_t *buf, int buf_size)
714{
715    VP56RangeCoder *c = &s->c;
716    int header_size, hscale, vscale, ret;
717    int width  = s->avctx->width;
718    int height = s->avctx->height;
719
720    if (buf_size < 3) {
721        av_log(s->avctx, AV_LOG_ERROR, "Insufficent data (%d) for header\n", buf_size);
722        return AVERROR_INVALIDDATA;
723    }
724
725    s->keyframe  = !(buf[0] & 1);
726    s->profile   =  (buf[0]>>1) & 7;
727    s->invisible = !(buf[0] & 0x10);
728    header_size  = AV_RL24(buf) >> 5;
729    buf      += 3;
730    buf_size -= 3;
731
732    s->header_partition_size = header_size;
733
734    if (s->profile > 3)
735        av_log(s->avctx, AV_LOG_WARNING, "Unknown profile %d\n", s->profile);
736
737    if (!s->profile)
738        memcpy(s->put_pixels_tab, s->vp8dsp.put_vp8_epel_pixels_tab,
739               sizeof(s->put_pixels_tab));
740    else    // profile 1-3 use bilinear, 4+ aren't defined so whatever
741        memcpy(s->put_pixels_tab, s->vp8dsp.put_vp8_bilinear_pixels_tab,
742               sizeof(s->put_pixels_tab));
743
744    if (header_size > buf_size - 7 * s->keyframe) {
745        av_log(s->avctx, AV_LOG_ERROR, "Header size larger than data provided\n");
746        return AVERROR_INVALIDDATA;
747    }
748
749    if (s->keyframe) {
750        if (AV_RL24(buf) != 0x2a019d) {
751            av_log(s->avctx, AV_LOG_ERROR,
752                   "Invalid start code 0x%x\n", AV_RL24(buf));
753            return AVERROR_INVALIDDATA;
754        }
755        width     = AV_RL16(buf + 3) & 0x3fff;
756        height    = AV_RL16(buf + 5) & 0x3fff;
757        hscale    = buf[4] >> 6;
758        vscale    = buf[6] >> 6;
759        buf      += 7;
760        buf_size -= 7;
761
762        if (hscale || vscale)
763            avpriv_request_sample(s->avctx, "Upscaling");
764
765        s->update_golden = s->update_altref = VP56_FRAME_CURRENT;
766        vp78_reset_probability_tables(s);
767        memcpy(s->prob->pred16x16, vp8_pred16x16_prob_inter,
768               sizeof(s->prob->pred16x16));
769        memcpy(s->prob->pred8x8c, vp8_pred8x8c_prob_inter,
770               sizeof(s->prob->pred8x8c));
771        memcpy(s->prob->mvc, vp8_mv_default_prob,
772               sizeof(s->prob->mvc));
773        memset(&s->segmentation, 0, sizeof(s->segmentation));
774        memset(&s->lf_delta, 0, sizeof(s->lf_delta));
775    }
776
777    ret = ff_vp56_init_range_decoder(c, buf, header_size);
778    if (ret < 0)
779        return ret;
780    buf      += header_size;
781    buf_size -= header_size;
782
783    if (s->keyframe) {
784        s->colorspace = vp8_rac_get(c);
785        if (s->colorspace)
786            av_log(s->avctx, AV_LOG_WARNING, "Unspecified colorspace\n");
787        s->fullrange = vp8_rac_get(c);
788    }
789
790    if ((s->segmentation.enabled = vp8_rac_get(c)))
791        parse_segment_info(s);
792    else
793        s->segmentation.update_map = 0; // FIXME: move this to some init function?
794
795    s->filter.simple    = vp8_rac_get(c);
796    s->filter.level     = vp8_rac_get_uint(c, 6);
797    s->filter.sharpness = vp8_rac_get_uint(c, 3);
798
799    if ((s->lf_delta.enabled = vp8_rac_get(c))) {
800        s->lf_delta.update = vp8_rac_get(c);
801        if (s->lf_delta.update)
802            update_lf_deltas(s);
803    }
804
805    if (setup_partitions(s, buf, buf_size)) {
806        av_log(s->avctx, AV_LOG_ERROR, "Invalid partitions\n");
807        return AVERROR_INVALIDDATA;
808    }
809
810    if (!s->macroblocks_base || /* first frame */
811        width != s->avctx->width || height != s->avctx->height ||
812        (width+15)/16 != s->mb_width || (height+15)/16 != s->mb_height)
813        if ((ret = vp8_update_dimensions(s, width, height)) < 0)
814            return ret;
815
816    vp8_get_quants(s);
817
818    if (!s->keyframe) {
819        update_refs(s);
820        s->sign_bias[VP56_FRAME_GOLDEN]               = vp8_rac_get(c);
821        s->sign_bias[VP56_FRAME_GOLDEN2 /* altref */] = vp8_rac_get(c);
822    }
823
824    // if we aren't saving this frame's probabilities for future frames,
825    // make a copy of the current probabilities
826    if (!(s->update_probabilities = vp8_rac_get(c)))
827        s->prob[1] = s->prob[0];
828
829    s->update_last = s->keyframe || vp8_rac_get(c);
830
831    vp78_update_probability_tables(s);
832
833    if ((s->mbskip_enabled = vp8_rac_get(c)))
834        s->prob->mbskip = vp8_rac_get_uint(c, 8);
835
836    if (!s->keyframe) {
837        s->prob->intra  = vp8_rac_get_uint(c, 8);
838        s->prob->last   = vp8_rac_get_uint(c, 8);
839        s->prob->golden = vp8_rac_get_uint(c, 8);
840        vp78_update_pred16x16_pred8x8_mvc_probabilities(s, VP8_MVC_SIZE);
841    }
842
843    // Record the entropy coder state here so that hwaccels can use it.
844    s->c.code_word = vp56_rac_renorm(&s->c);
845    s->coder_state_at_header_end.input     = s->c.buffer - (-s->c.bits / 8);
846    s->coder_state_at_header_end.range     = s->c.high;
847    s->coder_state_at_header_end.value     = s->c.code_word >> 16;
848    s->coder_state_at_header_end.bit_count = -s->c.bits % 8;
849
850    return 0;
851}
852
853static av_always_inline
854void clamp_mv(VP8mvbounds *s, VP56mv *dst, const VP56mv *src)
855{
856    dst->x = av_clip(src->x, av_clip(s->mv_min.x, INT16_MIN, INT16_MAX),
857                             av_clip(s->mv_max.x, INT16_MIN, INT16_MAX));
858    dst->y = av_clip(src->y, av_clip(s->mv_min.y, INT16_MIN, INT16_MAX),
859                             av_clip(s->mv_max.y, INT16_MIN, INT16_MAX));
860}
861
/**
 * Motion vector coding, 17.1.
 *
 * Decode one MV component: either a raw-bit magnitude (large MVs, with a
 * shorter range for VP7) or a value from the small-MV tree, followed by
 * a sign bit whenever the magnitude is non-zero.
 */
static av_always_inline int read_mv_component(VP56RangeCoder *c, const uint8_t *p, int vp7)
{
    int bit, x = 0;

    if (vp56_rac_get_prob_branchy(c, p[0])) {
        int i;

        /* low 3 bits first, then the high bits from the top down to bit 4 */
        for (i = 0; i < 3; i++)
            x += vp56_rac_get_prob(c, p[9 + i]) << i;
        for (i = (vp7 ? 7 : 9); i > 3; i--)
            x += vp56_rac_get_prob(c, p[9 + i]) << i;
        /* bit 3 is implied when all upper bits are zero, otherwise coded
         * with probability p[12] */
        if (!(x & (vp7 ? 0xF0 : 0xFFF0)) || vp56_rac_get_prob(c, p[12]))
            x += 8;
    } else {
        // small_mvtree
        const uint8_t *ps = p + 2;
        bit = vp56_rac_get_prob(c, *ps);
        ps += 1 + 3 * bit;
        x  += 4 * bit;
        bit = vp56_rac_get_prob(c, *ps);
        ps += 1 + bit;
        x  += 2 * bit;
        x  += vp56_rac_get_prob(c, *ps);
    }

    /* sign bit is only coded for non-zero magnitudes */
    return (x && vp56_rac_get_prob(c, p[1])) ? -x : x;
}
892
/* VP7 flavour of read_mv_component(). */
static int vp7_read_mv_component(VP56RangeCoder *c, const uint8_t *p)
{
    return read_mv_component(c, p, 1);
}
897
/* VP8 flavour of read_mv_component(). */
static int vp8_read_mv_component(VP56RangeCoder *c, const uint8_t *p)
{
    return read_mv_component(c, p, 0);
}
902
903static av_always_inline
904const uint8_t *get_submv_prob(uint32_t left, uint32_t top, int is_vp7)
905{
906    if (is_vp7)
907        return vp7_submv_prob;
908
909    if (left == top)
910        return vp8_submv_prob[4 - !!left];
911    if (!top)
912        return vp8_submv_prob[2];
913    return vp8_submv_prob[1 - !!left];
914}
915
916/**
917 * Split motion vector prediction, 16.4.
918 * @returns the number of motion vectors parsed (2, 4 or 16)
919 */
920static av_always_inline
921int decode_splitmvs(VP8Context *s, VP56RangeCoder *c, VP8Macroblock *mb,
922                    int layout, int is_vp7)
923{
924    int part_idx;
925    int n, num;
926    VP8Macroblock *top_mb;
927    VP8Macroblock *left_mb = &mb[-1];
928    const uint8_t *mbsplits_left = vp8_mbsplits[left_mb->partitioning];
929    const uint8_t *mbsplits_top, *mbsplits_cur, *firstidx;
930    VP56mv *top_mv;
931    VP56mv *left_mv = left_mb->bmv;
932    VP56mv *cur_mv  = mb->bmv;
933
934    if (!layout) // layout is inlined, s->mb_layout is not
935        top_mb = &mb[2];
936    else
937        top_mb = &mb[-s->mb_width - 1];
938    mbsplits_top = vp8_mbsplits[top_mb->partitioning];
939    top_mv       = top_mb->bmv;
940
941    if (vp56_rac_get_prob_branchy(c, vp8_mbsplit_prob[0])) {
942        if (vp56_rac_get_prob_branchy(c, vp8_mbsplit_prob[1]))
943            part_idx = VP8_SPLITMVMODE_16x8 + vp56_rac_get_prob(c, vp8_mbsplit_prob[2]);
944        else
945            part_idx = VP8_SPLITMVMODE_8x8;
946    } else {
947        part_idx = VP8_SPLITMVMODE_4x4;
948    }
949
950    num              = vp8_mbsplit_count[part_idx];
951    mbsplits_cur     = vp8_mbsplits[part_idx],
952    firstidx         = vp8_mbfirstidx[part_idx];
953    mb->partitioning = part_idx;
954
955    for (n = 0; n < num; n++) {
956        int k = firstidx[n];
957        uint32_t left, above;
958        const uint8_t *submv_prob;
959
960        if (!(k & 3))
961            left = AV_RN32A(&left_mv[mbsplits_left[k + 3]]);
962        else
963            left = AV_RN32A(&cur_mv[mbsplits_cur[k - 1]]);
964        if (k <= 3)
965            above = AV_RN32A(&top_mv[mbsplits_top[k + 12]]);
966        else
967            above = AV_RN32A(&cur_mv[mbsplits_cur[k - 4]]);
968
969        submv_prob = get_submv_prob(left, above, is_vp7);
970
971        if (vp56_rac_get_prob_branchy(c, submv_prob[0])) {
972            if (vp56_rac_get_prob_branchy(c, submv_prob[1])) {
973                if (vp56_rac_get_prob_branchy(c, submv_prob[2])) {
974                    mb->bmv[n].y = mb->mv.y +
975                                   read_mv_component(c, s->prob->mvc[0], is_vp7);
976                    mb->bmv[n].x = mb->mv.x +
977                                   read_mv_component(c, s->prob->mvc[1], is_vp7);
978                } else {
979                    AV_ZERO32(&mb->bmv[n]);
980                }
981            } else {
982                AV_WN32A(&mb->bmv[n], above);
983            }
984        } else {
985            AV_WN32A(&mb->bmv[n], left);
986        }
987    }
988
989    return num;
990}
991
992/**
993 * The vp7 reference decoder uses a padding macroblock column (added to right
994 * edge of the frame) to guard against illegal macroblock offsets. The
995 * algorithm has bugs that permit offsets to straddle the padding column.
996 * This function replicates those bugs.
997 *
998 * @param[out] edge_x macroblock x address
999 * @param[out] edge_y macroblock y address
1000 *
1001 * @return macroblock offset legal (boolean)
1002 */
1003static int vp7_calculate_mb_offset(int mb_x, int mb_y, int mb_width,
1004                                   int xoffset, int yoffset, int boundary,
1005                                   int *edge_x, int *edge_y)
1006{
1007    int vwidth = mb_width + 1;
1008    int new = (mb_y + yoffset) * vwidth + mb_x + xoffset;
1009    if (new < boundary || new % vwidth == vwidth - 1)
1010        return 0;
1011    *edge_y = new / vwidth;
1012    *edge_x = new % vwidth;
1013    return 1;
1014}
1015
1016static const VP56mv *get_bmv_ptr(const VP8Macroblock *mb, int subblock)
1017{
1018    return &mb->bmv[mb->mode == VP8_MVMODE_SPLIT ? vp8_mbsplits[mb->partitioning][subblock] : 0];
1019}
1020
/**
 * Decode the motion vector mode and MV for one VP7 inter macroblock.
 *
 * Scores a fixed list of neighbor MV predictors into zero/nearest/near
 * buckets, then uses the scores as contexts for the mode decisions.
 */
static av_always_inline
void vp7_decode_mvs(VP8Context *s, VP8Macroblock *mb,
                    int mb_x, int mb_y, int layout)
{
    VP8Macroblock *mb_edge[12];
    enum { CNT_ZERO, CNT_NEAREST, CNT_NEAR };
    enum { VP8_EDGE_TOP, VP8_EDGE_LEFT, VP8_EDGE_TOPLEFT };
    int idx = CNT_ZERO;
    VP56mv near_mv[3];
    uint8_t cnt[3] = { 0 };
    VP56RangeCoder *c = &s->c;
    int i;

    AV_ZERO32(&near_mv[0]);
    AV_ZERO32(&near_mv[1]);
    AV_ZERO32(&near_mv[2]);

    // walk the fixed predictor position list, accumulating each
    // position's score on the bucket its MV falls into
    for (i = 0; i < VP7_MV_PRED_COUNT; i++) {
        const VP7MVPred * pred = &vp7_mv_pred[i];
        int edge_x, edge_y;

        if (vp7_calculate_mb_offset(mb_x, mb_y, s->mb_width, pred->xoffset,
                                    pred->yoffset, !s->profile, &edge_x, &edge_y)) {
            // address the neighbor macroblock in whichever of the two
            // macroblock layouts is in use
            VP8Macroblock *edge = mb_edge[i] = (s->mb_layout == 1)
                                             ? s->macroblocks_base + 1 + edge_x +
                                               (s->mb_width + 1) * (edge_y + 1)
                                             : s->macroblocks + edge_x +
                                               (s->mb_height - edge_y - 1) * 2;
            uint32_t mv = AV_RN32A(get_bmv_ptr(edge, vp7_mv_pred[i].subblock));
            if (mv) {
                // non-zero MV: match it against the nearest/near slots,
                // filling an empty slot if it is a new distinct value
                if (AV_RN32A(&near_mv[CNT_NEAREST])) {
                    if (mv == AV_RN32A(&near_mv[CNT_NEAREST])) {
                        idx = CNT_NEAREST;
                    } else if (AV_RN32A(&near_mv[CNT_NEAR])) {
                        if (mv != AV_RN32A(&near_mv[CNT_NEAR]))
                            continue;
                        idx = CNT_NEAR;
                    } else {
                        AV_WN32A(&near_mv[CNT_NEAR], mv);
                        idx = CNT_NEAR;
                    }
                } else {
                    AV_WN32A(&near_mv[CNT_NEAREST], mv);
                    idx = CNT_NEAREST;
                }
            } else {
                idx = CNT_ZERO;
            }
        } else {
            idx = CNT_ZERO;
        }
        cnt[idx] += vp7_mv_pred[i].score;
    }

    mb->partitioning = VP8_SPLITMVMODE_NONE;

    // mode decision tree, each branch conditioned on the bucket scores
    if (vp56_rac_get_prob_branchy(c, vp7_mode_contexts[cnt[CNT_ZERO]][0])) {
        mb->mode = VP8_MVMODE_MV;

        if (vp56_rac_get_prob_branchy(c, vp7_mode_contexts[cnt[CNT_NEAREST]][1])) {

            if (vp56_rac_get_prob_branchy(c, vp7_mode_contexts[cnt[CNT_NEAR]][2])) {

                // base MV for NEW/SPLIT: best of zero/nearest/near by score
                if (cnt[CNT_NEAREST] > cnt[CNT_NEAR])
                    AV_WN32A(&mb->mv, cnt[CNT_ZERO] > cnt[CNT_NEAREST] ? 0 : AV_RN32A(&near_mv[CNT_NEAREST]));
                else
                    AV_WN32A(&mb->mv, cnt[CNT_ZERO] > cnt[CNT_NEAR]    ? 0 : AV_RN32A(&near_mv[CNT_NEAR]));

                if (vp56_rac_get_prob_branchy(c, vp7_mode_contexts[cnt[CNT_NEAR]][3])) {
                    mb->mode = VP8_MVMODE_SPLIT;
                    mb->mv = mb->bmv[decode_splitmvs(s, c, mb, layout, IS_VP7) - 1];
                } else {
                    mb->mv.y += vp7_read_mv_component(c, s->prob->mvc[0]);
                    mb->mv.x += vp7_read_mv_component(c, s->prob->mvc[1]);
                    mb->bmv[0] = mb->mv;
                }
            } else {
                mb->mv = near_mv[CNT_NEAR];
                mb->bmv[0] = mb->mv;
            }
        } else {
            mb->mv = near_mv[CNT_NEAREST];
            mb->bmv[0] = mb->mv;
        }
    } else {
        mb->mode = VP8_MVMODE_ZERO;
        AV_ZERO32(&mb->mv);
        mb->bmv[0] = mb->mv;
    }
}
1111
/**
 * Decode the motion vector mode and MV for one VP8 inter macroblock,
 * predicting from the top, left and top-left neighbors (16.3).
 */
static av_always_inline
void vp8_decode_mvs(VP8Context *s, VP8mvbounds *mv_bounds, VP8Macroblock *mb,
                    int mb_x, int mb_y, int layout)
{
    VP8Macroblock *mb_edge[3] = { 0      /* top */,
                                  mb - 1 /* left */,
                                  0      /* top-left */ };
    enum { CNT_ZERO, CNT_NEAREST, CNT_NEAR, CNT_SPLITMV };
    enum { VP8_EDGE_TOP, VP8_EDGE_LEFT, VP8_EDGE_TOPLEFT };
    int idx = CNT_ZERO;
    int cur_sign_bias = s->sign_bias[mb->ref_frame];
    int8_t *sign_bias = s->sign_bias;
    VP56mv near_mv[4];
    uint8_t cnt[4] = { 0 };
    VP56RangeCoder *c = &s->c;

    // top/top-left neighbor addresses depend on the macroblock layout
    if (!layout) { // layout is inlined (s->mb_layout is not)
        mb_edge[0] = mb + 2;
        mb_edge[2] = mb + 1;
    } else {
        mb_edge[0] = mb - s->mb_width - 1;
        mb_edge[2] = mb - s->mb_width - 2;
    }

    AV_ZERO32(&near_mv[0]);
    AV_ZERO32(&near_mv[1]);
    AV_ZERO32(&near_mv[2]);

    /* Process MB on top, left and top-left.
     * A neighbor's MV is negated (SWAR on both 16-bit halves at once) when
     * its reference frame has the opposite sign bias; distinct MVs fill
     * successive near_mv slots, and the counts are weighted 2 for top/left,
     * 1 for top-left. */
#define MV_EDGE_CHECK(n)                                                      \
    {                                                                         \
        VP8Macroblock *edge = mb_edge[n];                                     \
        int edge_ref = edge->ref_frame;                                       \
        if (edge_ref != VP56_FRAME_CURRENT) {                                 \
            uint32_t mv = AV_RN32A(&edge->mv);                                \
            if (mv) {                                                         \
                if (cur_sign_bias != sign_bias[edge_ref]) {                   \
                    /* SWAR negate of the values in mv. */                    \
                    mv = ~mv;                                                 \
                    mv = ((mv & 0x7fff7fff) +                                 \
                          0x00010001) ^ (mv & 0x80008000);                    \
                }                                                             \
                if (!n || mv != AV_RN32A(&near_mv[idx]))                      \
                    AV_WN32A(&near_mv[++idx], mv);                            \
                cnt[idx] += 1 + (n != 2);                                     \
            } else                                                            \
                cnt[CNT_ZERO] += 1 + (n != 2);                                \
        }                                                                     \
    }

    MV_EDGE_CHECK(0)
    MV_EDGE_CHECK(1)
    MV_EDGE_CHECK(2)

    mb->partitioning = VP8_SPLITMVMODE_NONE;
    if (vp56_rac_get_prob_branchy(c, vp8_mode_contexts[cnt[CNT_ZERO]][0])) {
        mb->mode = VP8_MVMODE_MV;

        /* If we have three distinct MVs, merge first and last if they're the same */
        if (cnt[CNT_SPLITMV] &&
            AV_RN32A(&near_mv[1 + VP8_EDGE_TOP]) == AV_RN32A(&near_mv[1 + VP8_EDGE_TOPLEFT]))
            cnt[CNT_NEAREST] += 1;

        /* Swap near and nearest if necessary */
        if (cnt[CNT_NEAR] > cnt[CNT_NEAREST]) {
            FFSWAP(uint8_t,     cnt[CNT_NEAREST],     cnt[CNT_NEAR]);
            FFSWAP( VP56mv, near_mv[CNT_NEAREST], near_mv[CNT_NEAR]);
        }

        if (vp56_rac_get_prob_branchy(c, vp8_mode_contexts[cnt[CNT_NEAREST]][1])) {
            if (vp56_rac_get_prob_branchy(c, vp8_mode_contexts[cnt[CNT_NEAR]][2])) {
                /* Choose the best mv out of 0,0 and the nearest mv */
                clamp_mv(mv_bounds, &mb->mv, &near_mv[CNT_ZERO + (cnt[CNT_NEAREST] >= cnt[CNT_ZERO])]);
                // context for the split decision: how many neighbors are split
                cnt[CNT_SPLITMV] = ((mb_edge[VP8_EDGE_LEFT]->mode    == VP8_MVMODE_SPLIT) +
                                    (mb_edge[VP8_EDGE_TOP]->mode     == VP8_MVMODE_SPLIT)) * 2 +
                                    (mb_edge[VP8_EDGE_TOPLEFT]->mode == VP8_MVMODE_SPLIT);

                if (vp56_rac_get_prob_branchy(c, vp8_mode_contexts[cnt[CNT_SPLITMV]][3])) {
                    mb->mode = VP8_MVMODE_SPLIT;
                    mb->mv = mb->bmv[decode_splitmvs(s, c, mb, layout, IS_VP8) - 1];
                } else {
                    mb->mv.y  += vp8_read_mv_component(c, s->prob->mvc[0]);
                    mb->mv.x  += vp8_read_mv_component(c, s->prob->mvc[1]);
                    mb->bmv[0] = mb->mv;
                }
            } else {
                clamp_mv(mv_bounds, &mb->mv, &near_mv[CNT_NEAR]);
                mb->bmv[0] = mb->mv;
            }
        } else {
            clamp_mv(mv_bounds, &mb->mv, &near_mv[CNT_NEAREST]);
            mb->bmv[0] = mb->mv;
        }
    } else {
        mb->mode = VP8_MVMODE_ZERO;
        AV_ZERO32(&mb->mv);
        mb->bmv[0] = mb->mv;
    }
}
1211
/**
 * Decode the 16 4x4 intra prediction modes of an I4x4 macroblock.
 *
 * On keyframes each mode is coded with a context taken from the modes
 * above and to the left; on inter frames a single fixed probability
 * table is used.
 */
static av_always_inline
void decode_intra4x4_modes(VP8Context *s, VP56RangeCoder *c, VP8Macroblock *mb,
                           int mb_x, int keyframe, int layout)
{
    uint8_t *intra4x4 = mb->intra4x4_pred_mode_mb;

    if (layout) {
        // copy the bottom row of modes from the macroblock above
        VP8Macroblock *mb_top = mb - s->mb_width - 1;
        memcpy(mb->intra4x4_pred_mode_top, mb_top->intra4x4_pred_mode_top, 4);
    }
    if (keyframe) {
        int x, y;
        uint8_t *top;
        uint8_t *const left = s->intra4x4_pred_mode_left;
        if (layout)
            top = mb->intra4x4_pred_mode_top;
        else
            top = s->intra4x4_pred_mode_top + 4 * mb_x;
        for (y = 0; y < 4; y++) {
            for (x = 0; x < 4; x++) {
                const uint8_t *ctx;
                ctx       = vp8_pred4x4_prob_intra[top[x]][left[y]];
                *intra4x4 = vp8_rac_get_tree(c, vp8_pred4x4_tree, ctx);
                // decoded mode becomes the left/top context of the
                // following subblocks
                left[y]   = top[x] = *intra4x4;
                intra4x4++;
            }
        }
    } else {
        int i;
        for (i = 0; i < 16; i++)
            intra4x4[i] = vp8_rac_get_tree(c, vp8_pred4x4_tree,
                                           vp8_pred4x4_prob_inter);
    }
}
1246
/**
 * Decode the per-macroblock mode header: segment id, skip flag, intra/inter
 * mode, reference frame and motion vectors (16.1-16.3).
 */
static av_always_inline
void decode_mb_mode(VP8Context *s, VP8mvbounds *mv_bounds,
                    VP8Macroblock *mb, int mb_x, int mb_y,
                    uint8_t *segment, uint8_t *ref, int layout, int is_vp7)
{
    VP56RangeCoder *c = &s->c;
    static const char * const vp7_feature_name[] = { "q-index",
                                                     "lf-delta",
                                                     "partial-golden-update",
                                                     "blit-pitch" };
    if (is_vp7) {
        // VP7 per-MB features: parsed (to keep the bitstream in sync) but
        // only reported with a warning, not applied
        int i;
        *segment = 0;
        for (i = 0; i < 4; i++) {
            if (s->feature_enabled[i]) {
                if (vp56_rac_get_prob_branchy(c, s->feature_present_prob[i])) {
                      int index = vp8_rac_get_tree(c, vp7_feature_index_tree,
                                                   s->feature_index_prob[i]);
                      av_log(s->avctx, AV_LOG_WARNING,
                             "Feature %s present in macroblock (value 0x%x)\n",
                             vp7_feature_name[i], s->feature_value[i][index]);
                }
           }
        }
    } else if (s->segmentation.update_map) {
        // explicit segment id coded in the bitstream
        int bit  = vp56_rac_get_prob(c, s->prob->segmentid[0]);
        *segment = vp56_rac_get_prob(c, s->prob->segmentid[1+bit]) + 2*bit;
    } else if (s->segmentation.enabled)
        // otherwise reuse the id from the reference map, if any
        *segment = ref ? *ref : *segment;
    mb->segment = *segment;

    mb->skip = s->mbskip_enabled ? vp56_rac_get_prob(c, s->prob->mbskip) : 0;

    if (s->keyframe) {
        mb->mode = vp8_rac_get_tree(c, vp8_pred16x16_tree_intra,
                                    vp8_pred16x16_prob_intra);

        if (mb->mode == MODE_I4x4) {
            decode_intra4x4_modes(s, c, mb, mb_x, 1, layout);
        } else {
            // fill the 4x4-mode context rows with the equivalent of the
            // 16x16 mode, so neighboring I4x4 blocks predict correctly
            const uint32_t modes = (is_vp7 ? vp7_pred4x4_mode
                                           : vp8_pred4x4_mode)[mb->mode] * 0x01010101u;
            if (s->mb_layout)
                AV_WN32A(mb->intra4x4_pred_mode_top, modes);
            else
                AV_WN32A(s->intra4x4_pred_mode_top + 4 * mb_x, modes);
            AV_WN32A(s->intra4x4_pred_mode_left, modes);
        }

        mb->chroma_pred_mode = vp8_rac_get_tree(c, vp8_pred8x8c_tree,
                                                vp8_pred8x8c_prob_intra);
        mb->ref_frame        = VP56_FRAME_CURRENT;
    } else if (vp56_rac_get_prob_branchy(c, s->prob->intra)) {
        // inter MB, 16.2
        if (vp56_rac_get_prob_branchy(c, s->prob->last))
            mb->ref_frame =
                (!is_vp7 && vp56_rac_get_prob(c, s->prob->golden)) ? VP56_FRAME_GOLDEN2 /* altref */
                                                                   : VP56_FRAME_GOLDEN;
        else
            mb->ref_frame = VP56_FRAME_PREVIOUS;
        s->ref_count[mb->ref_frame - 1]++;

        // motion vectors, 16.3
        if (is_vp7)
            vp7_decode_mvs(s, mb, mb_x, mb_y, layout);
        else
            vp8_decode_mvs(s, mv_bounds, mb, mb_x, mb_y, layout);
    } else {
        // intra MB, 16.1
        mb->mode = vp8_rac_get_tree(c, vp8_pred16x16_tree_inter, s->prob->pred16x16);

        if (mb->mode == MODE_I4x4)
            decode_intra4x4_modes(s, c, mb, mb_x, 0, layout);

        mb->chroma_pred_mode = vp8_rac_get_tree(c, vp8_pred8x8c_tree,
                                                s->prob->pred8x8c);
        mb->ref_frame        = VP56_FRAME_CURRENT;
        mb->partitioning     = VP8_SPLITMVMODE_NONE;
        AV_ZERO32(&mb->bmv[0]);
    }
}
1328
1329/**
1330 * @param r     arithmetic bitstream reader context
1331 * @param block destination for block coefficients
1332 * @param probs probabilities to use when reading trees from the bitstream
1333 * @param i     initial coeff index, 0 unless a separate DC block is coded
1334 * @param qmul  array holding the dc/ac dequant factor at position 0/1
1335 *
1336 * @return 0 if no coeffs were decoded
1337 *         otherwise, the index of the last coeff decoded plus one
1338 */
1339static av_always_inline
1340int decode_block_coeffs_internal(VP56RangeCoder *r, int16_t block[16],
1341                                 uint8_t probs[16][3][NUM_DCT_TOKENS - 1],
1342                                 int i, uint8_t *token_prob, int16_t qmul[2],
1343                                 const uint8_t scan[16], int vp7)
1344{
1345    VP56RangeCoder c = *r;
1346    goto skip_eob;
1347    do {
1348        int coeff;
1349restart:
1350        if (!vp56_rac_get_prob_branchy(&c, token_prob[0]))   // DCT_EOB
1351            break;
1352
1353skip_eob:
1354        if (!vp56_rac_get_prob_branchy(&c, token_prob[1])) { // DCT_0
1355            if (++i == 16)
1356                break; // invalid input; blocks should end with EOB
1357            token_prob = probs[i][0];
1358            if (vp7)
1359                goto restart;
1360            goto skip_eob;
1361        }
1362
1363        if (!vp56_rac_get_prob_branchy(&c, token_prob[2])) { // DCT_1
1364            coeff = 1;
1365            token_prob = probs[i + 1][1];
1366        } else {
1367            if (!vp56_rac_get_prob_branchy(&c, token_prob[3])) { // DCT 2,3,4
1368                coeff = vp56_rac_get_prob_branchy(&c, token_prob[4]);
1369                if (coeff)
1370                    coeff += vp56_rac_get_prob(&c, token_prob[5]);
1371                coeff += 2;
1372            } else {
1373                // DCT_CAT*
1374                if (!vp56_rac_get_prob_branchy(&c, token_prob[6])) {
1375                    if (!vp56_rac_get_prob_branchy(&c, token_prob[7])) { // DCT_CAT1
1376                        coeff = 5 + vp56_rac_get_prob(&c, vp8_dct_cat1_prob[0]);
1377                    } else {                                    // DCT_CAT2
1378                        coeff  = 7;
1379                        coeff += vp56_rac_get_prob(&c, vp8_dct_cat2_prob[0]) << 1;
1380                        coeff += vp56_rac_get_prob(&c, vp8_dct_cat2_prob[1]);
1381                    }
1382                } else {    // DCT_CAT3 and up
1383                    int a   = vp56_rac_get_prob(&c, token_prob[8]);
1384                    int b   = vp56_rac_get_prob(&c, token_prob[9 + a]);
1385                    int cat = (a << 1) + b;
1386                    coeff  = 3 + (8 << cat);
1387                    coeff += vp8_rac_get_coeff(&c, ff_vp8_dct_cat_prob[cat]);
1388                }
1389            }
1390            token_prob = probs[i + 1][2];
1391        }
1392        block[scan[i]] = (vp8_rac_get(&c) ? -coeff : coeff) * qmul[!!i];
1393    } while (++i < 16);
1394
1395    *r = c;
1396    return i;
1397}
1398
1399static av_always_inline
1400int inter_predict_dc(int16_t block[16], int16_t pred[2])
1401{
1402    int16_t dc = block[0];
1403    int ret = 0;
1404
1405    if (pred[1] > 3) {
1406        dc += pred[0];
1407        ret = 1;
1408    }
1409
1410    if (!pred[0] | !dc | ((int32_t)pred[0] ^ (int32_t)dc) >> 31) {
1411        block[0] = pred[0] = dc;
1412        pred[1] = 0;
1413    } else {
1414        if (pred[0] == dc)
1415            pred[1]++;
1416        block[0] = pred[0] = dc;
1417    }
1418
1419    return ret;
1420}
1421
/* VP7 entry point for decode_block_coeffs_internal() with an explicit
 * scan pattern. */
static int vp7_decode_block_coeffs_internal(VP56RangeCoder *r,
                                            int16_t block[16],
                                            uint8_t probs[16][3][NUM_DCT_TOKENS - 1],
                                            int i, uint8_t *token_prob,
                                            int16_t qmul[2],
                                            const uint8_t scan[16])
{
    return decode_block_coeffs_internal(r, block, probs, i,
                                        token_prob, qmul, scan, IS_VP7);
}
1432
/* VP8 entry point, always using the zigzag scan; the #ifndef guard lets
 * arch-specific headers provide an optimized replacement. */
#ifndef vp8_decode_block_coeffs_internal
static int vp8_decode_block_coeffs_internal(VP56RangeCoder *r,
                                            int16_t block[16],
                                            uint8_t probs[16][3][NUM_DCT_TOKENS - 1],
                                            int i, uint8_t *token_prob,
                                            int16_t qmul[2])
{
    return decode_block_coeffs_internal(r, block, probs, i,
                                        token_prob, qmul, ff_zigzag_scan, IS_VP8);
}
#endif
1444
1445/**
1446 * @param c          arithmetic bitstream reader context
1447 * @param block      destination for block coefficients
1448 * @param probs      probabilities to use when reading trees from the bitstream
1449 * @param i          initial coeff index, 0 unless a separate DC block is coded
1450 * @param zero_nhood the initial prediction context for number of surrounding
1451 *                   all-zero blocks (only left/top, so 0-2)
1452 * @param qmul       array holding the dc/ac dequant factor at position 0/1
1453 * @param scan       scan pattern (VP7 only)
1454 *
1455 * @return 0 if no coeffs were decoded
1456 *         otherwise, the index of the last coeff decoded plus one
1457 */
1458static av_always_inline
1459int decode_block_coeffs(VP56RangeCoder *c, int16_t block[16],
1460                        uint8_t probs[16][3][NUM_DCT_TOKENS - 1],
1461                        int i, int zero_nhood, int16_t qmul[2],
1462                        const uint8_t scan[16], int vp7)
1463{
1464    uint8_t *token_prob = probs[i][zero_nhood];
1465    if (!vp56_rac_get_prob_branchy(c, token_prob[0]))   // DCT_EOB
1466        return 0;
1467    return vp7 ? vp7_decode_block_coeffs_internal(c, block, probs, i,
1468                                                  token_prob, qmul, scan)
1469               : vp8_decode_block_coeffs_internal(c, block, probs, i,
1470                                                  token_prob, qmul);
1471}
1472
/**
 * Decode all DCT coefficients of one macroblock: the separate luma DC
 * block (when present), the 16 luma blocks and the 8 chroma blocks.
 * Maintains the left/top non-zero contexts (t_nnz/l_nnz) used as
 * prediction for the token probabilities.
 */
static av_always_inline
void decode_mb_coeffs(VP8Context *s, VP8ThreadData *td, VP56RangeCoder *c,
                      VP8Macroblock *mb, uint8_t t_nnz[9], uint8_t l_nnz[9],
                      int is_vp7)
{
    int i, x, y, luma_start = 0, luma_ctx = 3;
    int nnz_pred, nnz, nnz_total = 0;
    int segment = mb->segment;
    int block_dc = 0;

    // a separate DC block ("Y2") is coded unless the MB is I4x4 or
    // (VP8 only) uses split MVs
    if (mb->mode != MODE_I4x4 && (is_vp7 || mb->mode != VP8_MVMODE_SPLIT)) {
        nnz_pred = t_nnz[8] + l_nnz[8];

        // decode DC values and do hadamard
        nnz = decode_block_coeffs(c, td->block_dc, s->prob->token[1], 0,
                                  nnz_pred, s->qmat[segment].luma_dc_qmul,
                                  ff_zigzag_scan, is_vp7);
        l_nnz[8] = t_nnz[8] = !!nnz;

        if (is_vp7 && mb->mode > MODE_I4x4) {
            nnz |=  inter_predict_dc(td->block_dc,
                                     s->inter_dc_pred[mb->ref_frame - 1]);
        }

        if (nnz) {
            nnz_total += nnz;
            block_dc   = 1;
            if (nnz == 1)
                s->vp8dsp.vp8_luma_dc_wht_dc(td->block, td->block_dc);
            else
                s->vp8dsp.vp8_luma_dc_wht(td->block, td->block_dc);
        }
        // with a separate DC block, luma AC coefficients start at index 1
        // and use a different token context
        luma_start = 1;
        luma_ctx   = 0;
    }

    // luma blocks
    for (y = 0; y < 4; y++)
        for (x = 0; x < 4; x++) {
            nnz_pred = l_nnz[y] + t_nnz[x];
            nnz = decode_block_coeffs(c, td->block[y][x],
                                      s->prob->token[luma_ctx],
                                      luma_start, nnz_pred,
                                      s->qmat[segment].luma_qmul,
                                      s->prob[0].scan, is_vp7);
            /* nnz+block_dc may be one more than the actual last index,
             * but we don't care */
            td->non_zero_count_cache[y][x] = nnz + block_dc;
            t_nnz[x] = l_nnz[y] = !!nnz;
            nnz_total += nnz;
        }

    // chroma blocks
    // TODO: what to do about dimensions? 2nd dim for luma is x,
    // but for chroma it's (y<<1)|x
    for (i = 4; i < 6; i++)
        for (y = 0; y < 2; y++)
            for (x = 0; x < 2; x++) {
                nnz_pred = l_nnz[i + 2 * y] + t_nnz[i + 2 * x];
                nnz = decode_block_coeffs(c, td->block[i][(y << 1) + x],
                                          s->prob->token[2], 0, nnz_pred,
                                          s->qmat[segment].chroma_qmul,
                                          s->prob[0].scan, is_vp7);
                td->non_zero_count_cache[i][(y << 1) + x] = nnz;
                t_nnz[i + 2 * x] = l_nnz[i + 2 * y] = !!nnz;
                nnz_total += nnz;
            }

    // if there were no coded coeffs despite the macroblock not being marked skip,
    // we MUST not do the inner loop filter and should not do IDCT
    // Since skip isn't used for bitstream prediction, just manually set it.
    if (!nnz_total)
        mb->skip = 1;
}
1547
1548static av_always_inline
1549void backup_mb_border(uint8_t *top_border, uint8_t *src_y,
1550                      uint8_t *src_cb, uint8_t *src_cr,
1551                      ptrdiff_t linesize, ptrdiff_t uvlinesize, int simple)
1552{
1553    AV_COPY128(top_border, src_y + 15 * linesize);
1554    if (!simple) {
1555        AV_COPY64(top_border + 16, src_cb + 7 * uvlinesize);
1556        AV_COPY64(top_border + 24, src_cr + 7 * uvlinesize);
1557    }
1558}
1559
/**
 * Exchange (or copy, depending on xchg) the pixels bordering a macroblock
 * with the saved top-border buffer, so intra prediction sees the correct
 * unfiltered edge pixels.
 */
static av_always_inline
void xchg_mb_border(uint8_t *top_border, uint8_t *src_y, uint8_t *src_cb,
                    uint8_t *src_cr, ptrdiff_t linesize, ptrdiff_t uvlinesize, int mb_x,
                    int mb_y, int mb_width, int simple, int xchg)
{
    uint8_t *top_border_m1 = top_border - 32;     // for TL prediction
    src_y  -= linesize;
    src_cb -= uvlinesize;
    src_cr -= uvlinesize;

#define XCHG(a, b, xchg)                                                      \
    do {                                                                      \
        if (xchg)                                                             \
            AV_SWAP64(b, a);                                                  \
        else                                                                  \
            AV_COPY64(b, a);                                                  \
    } while (0)

    XCHG(top_border_m1 + 8, src_y - 8, xchg);
    XCHG(top_border, src_y, xchg);
    XCHG(top_border + 8, src_y + 8, 1);
    if (mb_x < mb_width - 1)
        XCHG(top_border + 32, src_y + 16, 1);

    // only copy chroma for normal loop filter
    // or to initialize the top row to 127
    if (!simple || !mb_y) {
        XCHG(top_border_m1 + 16, src_cb - 8, xchg);
        XCHG(top_border_m1 + 24, src_cr - 8, xchg);
        XCHG(top_border + 16, src_cb, 1);
        XCHG(top_border + 24, src_cr, 1);
    }
}
1593
1594static av_always_inline
1595int check_dc_pred8x8_mode(int mode, int mb_x, int mb_y)
1596{
1597    if (!mb_x)
1598        return mb_y ? TOP_DC_PRED8x8 : DC_128_PRED8x8;
1599    else
1600        return mb_y ? mode : LEFT_DC_PRED8x8;
1601}
1602
1603static av_always_inline
1604int check_tm_pred8x8_mode(int mode, int mb_x, int mb_y, int vp7)
1605{
1606    if (!mb_x)
1607        return mb_y ? VERT_PRED8x8 : (vp7 ? DC_128_PRED8x8 : DC_129_PRED8x8);
1608    else
1609        return mb_y ? mode : HOR_PRED8x8;
1610}
1611
1612static av_always_inline
1613int check_intra_pred8x8_mode_emuedge(int mode, int mb_x, int mb_y, int vp7)
1614{
1615    switch (mode) {
1616    case DC_PRED8x8:
1617        return check_dc_pred8x8_mode(mode, mb_x, mb_y);
1618    case VERT_PRED8x8:
1619        return !mb_y ? (vp7 ? DC_128_PRED8x8 : DC_127_PRED8x8) : mode;
1620    case HOR_PRED8x8:
1621        return !mb_x ? (vp7 ? DC_128_PRED8x8 : DC_129_PRED8x8) : mode;
1622    case PLANE_PRED8x8: /* TM */
1623        return check_tm_pred8x8_mode(mode, mb_x, mb_y, vp7);
1624    }
1625    return mode;
1626}
1627
1628static av_always_inline
1629int check_tm_pred4x4_mode(int mode, int mb_x, int mb_y, int vp7)
1630{
1631    if (!mb_x) {
1632        return mb_y ? VERT_VP8_PRED : (vp7 ? DC_128_PRED : DC_129_PRED);
1633    } else {
1634        return mb_y ? mode : HOR_VP8_PRED;
1635    }
1636}
1637
/**
 * Fix up a 4x4 intra prediction mode for blocks on the frame edge.
 * Sets *copy_buf when the mode must run on an edge-emulated copy of the
 * source pixels instead of being substituted.
 */
static av_always_inline
int check_intra_pred4x4_mode_emuedge(int mode, int mb_x, int mb_y,
                                     int *copy_buf, int vp7)
{
    switch (mode) {
    case VERT_PRED:
        if (!mb_x && mb_y) {
            *copy_buf = 1;
            return mode;
        }
        /* fall-through */
    case DIAG_DOWN_LEFT_PRED:
    case VERT_LEFT_PRED:
        return !mb_y ? (vp7 ? DC_128_PRED : DC_127_PRED) : mode;
    case HOR_PRED:
        if (!mb_y) {
            *copy_buf = 1;
            return mode;
        }
        /* fall-through */
    case HOR_UP_PRED:
        return !mb_x ? (vp7 ? DC_128_PRED : DC_129_PRED) : mode;
    case TM_VP8_PRED:
        return check_tm_pred4x4_mode(mode, mb_x, mb_y, vp7);
    case DC_PRED: /* 4x4 DC doesn't use the same "H.264-style" exceptions
                   * as 16x16/8x8 DC */
    case DIAG_DOWN_RIGHT_PRED:
    case VERT_RIGHT_PRED:
    case HOR_DOWN_PRED:
        // these modes keep their meaning but need edge-emulated pixels
        // when a neighbor row/column is missing
        if (!mb_y || !mb_x)
            *copy_buf = 1;
        return mode;
    }
    return mode;
}
1673
/* Reconstruct one intra macroblock: run spatial prediction for luma
 * (whole-block or per-4x4) and chroma, and, in the 4x4 path, add each
 * subblock's residual immediately after predicting it. */
static av_always_inline
void intra_predict(VP8Context *s, VP8ThreadData *td, uint8_t *dst[3],
                   VP8Macroblock *mb, int mb_x, int mb_y, int is_vp7)
{
    int x, y, mode, nnz;
    uint32_t tr;

    /* for the first row, we need to run xchg_mb_border to init the top edge
     * to 127 otherwise, skip it if we aren't going to deblock */
    /* NOTE(review): since mb_y != 0 is required first, the `|| !mb_y` term
     * is always false here; the condition reduces to
     * mb_y && s->deblock_filter — confirm before simplifying. */
    if (mb_y && (s->deblock_filter || !mb_y) && td->thread_nr == 0)
        xchg_mb_border(s->top_border[mb_x + 1], dst[0], dst[1], dst[2],
                       s->linesize, s->uvlinesize, mb_x, mb_y, s->mb_width,
                       s->filter.simple, 1);

    if (mb->mode < MODE_I4x4) {
        /* single prediction mode for the whole 16x16 luma block */
        mode = check_intra_pred8x8_mode_emuedge(mb->mode, mb_x, mb_y, is_vp7);
        s->hpc.pred16x16[mode](dst[0], s->linesize);
    } else {
        uint8_t *ptr = dst[0];
        uint8_t *intra4x4 = mb->intra4x4_pred_mode_mb;
        /* border fill values: VP7 uses 128 everywhere, VP8 distinguishes
         * the top (127) and left (129) edges */
        const uint8_t lo = is_vp7 ? 128 : 127;
        const uint8_t hi = is_vp7 ? 128 : 129;
        uint8_t tr_top[4] = { lo, lo, lo, lo };

        // all blocks on the right edge of the macroblock use bottom edge
        // the top macroblock for their topright edge
        uint8_t *tr_right = ptr - s->linesize + 16;

        // if we're on the right edge of the frame, said edge is extended
        // from the top macroblock
        if (mb_y && mb_x == s->mb_width - 1) {
            tr       = tr_right[-1] * 0x01010101u;
            tr_right = (uint8_t *) &tr;
        }

        if (mb->skip)
            AV_ZERO128(td->non_zero_count_cache);

        /* 4x4 subblocks in raster order: predict, then add residual */
        for (y = 0; y < 4; y++) {
            uint8_t *topright = ptr + 4 - s->linesize;
            for (x = 0; x < 4; x++) {
                int copy = 0;
                ptrdiff_t linesize = s->linesize;
                uint8_t *dst = ptr + 4 * x;
                LOCAL_ALIGNED(4, uint8_t, copy_dst, [5 * 8]);

                if ((y == 0 || x == 3) && mb_y == 0) {
                    topright = tr_top;
                } else if (x == 3)
                    topright = tr_right;

                mode = check_intra_pred4x4_mode_emuedge(intra4x4[x], mb_x + x,
                                                        mb_y + y, &copy, is_vp7);
                if (copy) {
                    /* border block: build a small padded copy (top-left
                     * at [3], top row at [4..7], left column at
                     * [11,19,27,35], stride 8) and predict into that
                     * instead of the frame buffer */
                    dst      = copy_dst + 12;
                    linesize = 8;
                    if (!(mb_y + y)) {
                        copy_dst[3] = lo;
                        AV_WN32A(copy_dst + 4, lo * 0x01010101U);
                    } else {
                        AV_COPY32(copy_dst + 4, ptr + 4 * x - s->linesize);
                        if (!(mb_x + x)) {
                            copy_dst[3] = hi;
                        } else {
                            copy_dst[3] = ptr[4 * x - s->linesize - 1];
                        }
                    }
                    if (!(mb_x + x)) {
                        copy_dst[11] =
                        copy_dst[19] =
                        copy_dst[27] =
                        copy_dst[35] = hi;
                    } else {
                        copy_dst[11] = ptr[4 * x                   - 1];
                        copy_dst[19] = ptr[4 * x + s->linesize     - 1];
                        copy_dst[27] = ptr[4 * x + s->linesize * 2 - 1];
                        copy_dst[35] = ptr[4 * x + s->linesize * 3 - 1];
                    }
                }
                s->hpc.pred4x4[mode](dst, topright, linesize);
                if (copy) {
                    /* copy the predicted 4x4 block back into the frame */
                    AV_COPY32(ptr + 4 * x,                   copy_dst + 12);
                    AV_COPY32(ptr + 4 * x + s->linesize,     copy_dst + 20);
                    AV_COPY32(ptr + 4 * x + s->linesize * 2, copy_dst + 28);
                    AV_COPY32(ptr + 4 * x + s->linesize * 3, copy_dst + 36);
                }

                /* add residual; nnz == 1 takes the DC-only transform path */
                nnz = td->non_zero_count_cache[y][x];
                if (nnz) {
                    if (nnz == 1)
                        s->vp8dsp.vp8_idct_dc_add(ptr + 4 * x,
                                                  td->block[y][x], s->linesize);
                    else
                        s->vp8dsp.vp8_idct_add(ptr + 4 * x,
                                               td->block[y][x], s->linesize);
                }
                topright += 4;
            }

            ptr      += 4 * s->linesize;
            intra4x4 += 4;
        }
    }

    /* chroma: both planes use the same prediction mode */
    mode = check_intra_pred8x8_mode_emuedge(mb->chroma_pred_mode,
                                            mb_x, mb_y, is_vp7);
    s->hpc.pred8x8[mode](dst[1], s->uvlinesize);
    s->hpc.pred8x8[mode](dst[2], s->uvlinesize);

    /* NOTE(review): same always-false `|| !mb_y` term as above */
    if (mb_y && (s->deblock_filter || !mb_y) && td->thread_nr == 0)
        xchg_mb_border(s->top_border[mb_x + 1], dst[0], dst[1], dst[2],
                       s->linesize, s->uvlinesize, mb_x, mb_y, s->mb_width,
                       s->filter.simple, 0);
}
1788
/* Extra-pixel requirements of the MC filters, indexed by the 3-bit
 * subpel phase (0 = full-pel, no extra pixels needed). */
static const uint8_t subpel_idx[3][8] = {
    { 0, 1, 2, 1, 2, 1, 2, 1 }, // nr. of left extra pixels,
                                // also function pointer index
    { 0, 3, 5, 3, 5, 3, 5, 3 }, // nr. of extra pixels required
    { 0, 2, 3, 2, 3, 2, 3, 2 }, // nr. of right extra pixels
};
1795
1796/**
1797 * luma MC function
1798 *
1799 * @param s        VP8 decoding context
1800 * @param dst      target buffer for block data at block position
1801 * @param ref      reference picture buffer at origin (0, 0)
1802 * @param mv       motion vector (relative to block position) to get pixel data from
1803 * @param x_off    horizontal position of block from origin (0, 0)
1804 * @param y_off    vertical position of block from origin (0, 0)
1805 * @param block_w  width of block (16, 8 or 4)
1806 * @param block_h  height of block (always same as block_w)
1807 * @param width    width of src/dst plane data
1808 * @param height   height of src/dst plane data
1809 * @param linesize size of a single line of plane data, including padding
1810 * @param mc_func  motion compensation function pointers (bilinear or sixtap MC)
1811 */
1812static av_always_inline
1813void vp8_mc_luma(VP8Context *s, VP8ThreadData *td, uint8_t *dst,
1814                 ThreadFrame *ref, const VP56mv *mv,
1815                 int x_off, int y_off, int block_w, int block_h,
1816                 int width, int height, ptrdiff_t linesize,
1817                 vp8_mc_func mc_func[3][3])
1818{
1819    uint8_t *src = ref->f->data[0];
1820
1821    if (AV_RN32A(mv)) {
1822        ptrdiff_t src_linesize = linesize;
1823
1824        int mx = (mv->x * 2) & 7, mx_idx = subpel_idx[0][mx];
1825        int my = (mv->y * 2) & 7, my_idx = subpel_idx[0][my];
1826
1827        x_off += mv->x >> 2;
1828        y_off += mv->y >> 2;
1829
1830        // edge emulation
1831        ff_thread_await_progress(ref, (3 + y_off + block_h + subpel_idx[2][my]) >> 4, 0);
1832        src += y_off * linesize + x_off;
1833        if (x_off < mx_idx || x_off >= width  - block_w - subpel_idx[2][mx] ||
1834            y_off < my_idx || y_off >= height - block_h - subpel_idx[2][my]) {
1835            s->vdsp.emulated_edge_mc(td->edge_emu_buffer,
1836                                     src - my_idx * linesize - mx_idx,
1837                                     EDGE_EMU_LINESIZE, linesize,
1838                                     block_w + subpel_idx[1][mx],
1839                                     block_h + subpel_idx[1][my],
1840                                     x_off - mx_idx, y_off - my_idx,
1841                                     width, height);
1842            src = td->edge_emu_buffer + mx_idx + EDGE_EMU_LINESIZE * my_idx;
1843            src_linesize = EDGE_EMU_LINESIZE;
1844        }
1845        mc_func[my_idx][mx_idx](dst, linesize, src, src_linesize, block_h, mx, my);
1846    } else {
1847        ff_thread_await_progress(ref, (3 + y_off + block_h) >> 4, 0);
1848        mc_func[0][0](dst, linesize, src + y_off * linesize + x_off,
1849                      linesize, block_h, 0, 0);
1850    }
1851}
1852
1853/**
1854 * chroma MC function
1855 *
1856 * @param s        VP8 decoding context
1857 * @param dst1     target buffer for block data at block position (U plane)
1858 * @param dst2     target buffer for block data at block position (V plane)
1859 * @param ref      reference picture buffer at origin (0, 0)
1860 * @param mv       motion vector (relative to block position) to get pixel data from
1861 * @param x_off    horizontal position of block from origin (0, 0)
1862 * @param y_off    vertical position of block from origin (0, 0)
1863 * @param block_w  width of block (16, 8 or 4)
1864 * @param block_h  height of block (always same as block_w)
1865 * @param width    width of src/dst plane data
1866 * @param height   height of src/dst plane data
1867 * @param linesize size of a single line of plane data, including padding
1868 * @param mc_func  motion compensation function pointers (bilinear or sixtap MC)
1869 */
1870static av_always_inline
1871void vp8_mc_chroma(VP8Context *s, VP8ThreadData *td, uint8_t *dst1,
1872                   uint8_t *dst2, ThreadFrame *ref, const VP56mv *mv,
1873                   int x_off, int y_off, int block_w, int block_h,
1874                   int width, int height, ptrdiff_t linesize,
1875                   vp8_mc_func mc_func[3][3])
1876{
1877    uint8_t *src1 = ref->f->data[1], *src2 = ref->f->data[2];
1878
1879    if (AV_RN32A(mv)) {
1880        int mx = mv->x & 7, mx_idx = subpel_idx[0][mx];
1881        int my = mv->y & 7, my_idx = subpel_idx[0][my];
1882
1883        x_off += mv->x >> 3;
1884        y_off += mv->y >> 3;
1885
1886        // edge emulation
1887        src1 += y_off * linesize + x_off;
1888        src2 += y_off * linesize + x_off;
1889        ff_thread_await_progress(ref, (3 + y_off + block_h + subpel_idx[2][my]) >> 3, 0);
1890        if (x_off < mx_idx || x_off >= width  - block_w - subpel_idx[2][mx] ||
1891            y_off < my_idx || y_off >= height - block_h - subpel_idx[2][my]) {
1892            s->vdsp.emulated_edge_mc(td->edge_emu_buffer,
1893                                     src1 - my_idx * linesize - mx_idx,
1894                                     EDGE_EMU_LINESIZE, linesize,
1895                                     block_w + subpel_idx[1][mx],
1896                                     block_h + subpel_idx[1][my],
1897                                     x_off - mx_idx, y_off - my_idx, width, height);
1898            src1 = td->edge_emu_buffer + mx_idx + EDGE_EMU_LINESIZE * my_idx;
1899            mc_func[my_idx][mx_idx](dst1, linesize, src1, EDGE_EMU_LINESIZE, block_h, mx, my);
1900
1901            s->vdsp.emulated_edge_mc(td->edge_emu_buffer,
1902                                     src2 - my_idx * linesize - mx_idx,
1903                                     EDGE_EMU_LINESIZE, linesize,
1904                                     block_w + subpel_idx[1][mx],
1905                                     block_h + subpel_idx[1][my],
1906                                     x_off - mx_idx, y_off - my_idx, width, height);
1907            src2 = td->edge_emu_buffer + mx_idx + EDGE_EMU_LINESIZE * my_idx;
1908            mc_func[my_idx][mx_idx](dst2, linesize, src2, EDGE_EMU_LINESIZE, block_h, mx, my);
1909        } else {
1910            mc_func[my_idx][mx_idx](dst1, linesize, src1, linesize, block_h, mx, my);
1911            mc_func[my_idx][mx_idx](dst2, linesize, src2, linesize, block_h, mx, my);
1912        }
1913    } else {
1914        ff_thread_await_progress(ref, (3 + y_off + block_h) >> 3, 0);
1915        mc_func[0][0](dst1, linesize, src1 + y_off * linesize + x_off, linesize, block_h, 0, 0);
1916        mc_func[0][0](dst2, linesize, src2 + y_off * linesize + x_off, linesize, block_h, 0, 0);
1917    }
1918}
1919
1920static av_always_inline
1921void vp8_mc_part(VP8Context *s, VP8ThreadData *td, uint8_t *dst[3],
1922                 ThreadFrame *ref_frame, int x_off, int y_off,
1923                 int bx_off, int by_off, int block_w, int block_h,
1924                 int width, int height, VP56mv *mv)
1925{
1926    VP56mv uvmv = *mv;
1927
1928    /* Y */
1929    vp8_mc_luma(s, td, dst[0] + by_off * s->linesize + bx_off,
1930                ref_frame, mv, x_off + bx_off, y_off + by_off,
1931                block_w, block_h, width, height, s->linesize,
1932                s->put_pixels_tab[block_w == 8]);
1933
1934    /* U/V */
1935    if (s->profile == 3) {
1936        /* this block only applies VP8; it is safe to check
1937         * only the profile, as VP7 profile <= 1 */
1938        uvmv.x &= ~7;
1939        uvmv.y &= ~7;
1940    }
1941    x_off   >>= 1;
1942    y_off   >>= 1;
1943    bx_off  >>= 1;
1944    by_off  >>= 1;
1945    width   >>= 1;
1946    height  >>= 1;
1947    block_w >>= 1;
1948    block_h >>= 1;
1949    vp8_mc_chroma(s, td, dst[1] + by_off * s->uvlinesize + bx_off,
1950                  dst[2] + by_off * s->uvlinesize + bx_off, ref_frame,
1951                  &uvmv, x_off + bx_off, y_off + by_off,
1952                  block_w, block_h, width, height, s->uvlinesize,
1953                  s->put_pixels_tab[1 + (block_w == 4)]);
1954}
1955
1956/* Fetch pixels for estimated mv 4 macroblocks ahead.
1957 * Optimized for 64-byte cache lines. Inspired by ffh264 prefetch_motion. */
1958static av_always_inline
1959void prefetch_motion(VP8Context *s, VP8Macroblock *mb, int mb_x, int mb_y,
1960                     int mb_xy, int ref)
1961{
1962    /* Don't prefetch refs that haven't been used very often this frame. */
1963    if (s->ref_count[ref - 1] > (mb_xy >> 5)) {
1964        int x_off = mb_x << 4, y_off = mb_y << 4;
1965        int mx = (mb->mv.x >> 2) + x_off + 8;
1966        int my = (mb->mv.y >> 2) + y_off;
1967        uint8_t **src = s->framep[ref]->tf.f->data;
1968        int off = mx + (my + (mb_x & 3) * 4) * s->linesize + 64;
1969        /* For threading, a ff_thread_await_progress here might be useful, but
1970         * it actually slows down the decoder. Since a bad prefetch doesn't
1971         * generate bad decoder output, we don't run it here. */
1972        s->vdsp.prefetch(src[0] + off, s->linesize, 4);
1973        off = (mx >> 1) + ((my >> 1) + (mb_x & 7)) * s->uvlinesize + 64;
1974        s->vdsp.prefetch(src[1] + off, src[2] - src[1], 2);
1975    }
1976}
1977
1978/**
1979 * Apply motion vectors to prediction buffer, chapter 18.
1980 */
1981static av_always_inline
1982void inter_predict(VP8Context *s, VP8ThreadData *td, uint8_t *dst[3],
1983                   VP8Macroblock *mb, int mb_x, int mb_y)
1984{
1985    int x_off = mb_x << 4, y_off = mb_y << 4;
1986    int width = 16 * s->mb_width, height = 16 * s->mb_height;
1987    ThreadFrame *ref = &s->framep[mb->ref_frame]->tf;
1988    VP56mv *bmv = mb->bmv;
1989
1990    switch (mb->partitioning) {
1991    case VP8_SPLITMVMODE_NONE:
1992        vp8_mc_part(s, td, dst, ref, x_off, y_off,
1993                    0, 0, 16, 16, width, height, &mb->mv);
1994        break;
1995    case VP8_SPLITMVMODE_4x4: {
1996        int x, y;
1997        VP56mv uvmv;
1998
1999        /* Y */
2000        for (y = 0; y < 4; y++) {
2001            for (x = 0; x < 4; x++) {
2002                vp8_mc_luma(s, td, dst[0] + 4 * y * s->linesize + x * 4,
2003                            ref, &bmv[4 * y + x],
2004                            4 * x + x_off, 4 * y + y_off, 4, 4,
2005                            width, height, s->linesize,
2006                            s->put_pixels_tab[2]);
2007            }
2008        }
2009
2010        /* U/V */
2011        x_off  >>= 1;
2012        y_off  >>= 1;
2013        width  >>= 1;
2014        height >>= 1;
2015        for (y = 0; y < 2; y++) {
2016            for (x = 0; x < 2; x++) {
2017                uvmv.x = mb->bmv[2 * y       * 4 + 2 * x    ].x +
2018                         mb->bmv[2 * y       * 4 + 2 * x + 1].x +
2019                         mb->bmv[(2 * y + 1) * 4 + 2 * x    ].x +
2020                         mb->bmv[(2 * y + 1) * 4 + 2 * x + 1].x;
2021                uvmv.y = mb->bmv[2 * y       * 4 + 2 * x    ].y +
2022                         mb->bmv[2 * y       * 4 + 2 * x + 1].y +
2023                         mb->bmv[(2 * y + 1) * 4 + 2 * x    ].y +
2024                         mb->bmv[(2 * y + 1) * 4 + 2 * x + 1].y;
2025                uvmv.x = (uvmv.x + 2 + FF_SIGNBIT(uvmv.x)) >> 2;
2026                uvmv.y = (uvmv.y + 2 + FF_SIGNBIT(uvmv.y)) >> 2;
2027                if (s->profile == 3) {
2028                    uvmv.x &= ~7;
2029                    uvmv.y &= ~7;
2030                }
2031                vp8_mc_chroma(s, td, dst[1] + 4 * y * s->uvlinesize + x * 4,
2032                              dst[2] + 4 * y * s->uvlinesize + x * 4, ref,
2033                              &uvmv, 4 * x + x_off, 4 * y + y_off, 4, 4,
2034                              width, height, s->uvlinesize,
2035                              s->put_pixels_tab[2]);
2036            }
2037        }
2038        break;
2039    }
2040    case VP8_SPLITMVMODE_16x8:
2041        vp8_mc_part(s, td, dst, ref, x_off, y_off,
2042                    0, 0, 16, 8, width, height, &bmv[0]);
2043        vp8_mc_part(s, td, dst, ref, x_off, y_off,
2044                    0, 8, 16, 8, width, height, &bmv[1]);
2045        break;
2046    case VP8_SPLITMVMODE_8x16:
2047        vp8_mc_part(s, td, dst, ref, x_off, y_off,
2048                    0, 0, 8, 16, width, height, &bmv[0]);
2049        vp8_mc_part(s, td, dst, ref, x_off, y_off,
2050                    8, 0, 8, 16, width, height, &bmv[1]);
2051        break;
2052    case VP8_SPLITMVMODE_8x8:
2053        vp8_mc_part(s, td, dst, ref, x_off, y_off,
2054                    0, 0, 8, 8, width, height, &bmv[0]);
2055        vp8_mc_part(s, td, dst, ref, x_off, y_off,
2056                    8, 0, 8, 8, width, height, &bmv[1]);
2057        vp8_mc_part(s, td, dst, ref, x_off, y_off,
2058                    0, 8, 8, 8, width, height, &bmv[2]);
2059        vp8_mc_part(s, td, dst, ref, x_off, y_off,
2060                    8, 8, 8, 8, width, height, &bmv[3]);
2061        break;
2062    }
2063}
2064
/* Add the inverse-transformed residual of a whole macroblock to the
 * prediction in dst[].  Luma is skipped for MODE_I4x4, whose residuals
 * are added during prediction in intra_predict(). */
static av_always_inline
void idct_mb(VP8Context *s, VP8ThreadData *td, uint8_t *dst[3], VP8Macroblock *mb)
{
    int x, y, ch;

    if (mb->mode != MODE_I4x4) {
        uint8_t *y_dst = dst[0];
        for (y = 0; y < 4; y++) {
            /* four per-subblock nonzero counts packed into one
             * little-endian word, one byte per 4x4 block */
            uint32_t nnz4 = AV_RL32(td->non_zero_count_cache[y]);
            if (nnz4) {
                if (nnz4 & ~0x01010101) {
                    for (x = 0; x < 4; x++) {
                        if ((uint8_t) nnz4 == 1)
                            s->vp8dsp.vp8_idct_dc_add(y_dst + 4 * x,
                                                      td->block[y][x],
                                                      s->linesize);
                        else if ((uint8_t) nnz4 > 1)
                            s->vp8dsp.vp8_idct_add(y_dst + 4 * x,
                                                   td->block[y][x],
                                                   s->linesize);
                        nnz4 >>= 8;
                        if (!nnz4)
                            break;
                    }
                } else {
                    /* every count is 0 or 1 (at most a DC coefficient per
                     * subblock): use the batched DC-only transform */
                    s->vp8dsp.vp8_idct_dc_add4y(y_dst, td->block[y], s->linesize);
                }
            }
            y_dst += 4 * s->linesize;
        }
    }

    for (ch = 0; ch < 2; ch++) {
        /* same packed-count scheme for each 8x8 chroma plane */
        uint32_t nnz4 = AV_RL32(td->non_zero_count_cache[4 + ch]);
        if (nnz4) {
            uint8_t *ch_dst = dst[1 + ch];
            if (nnz4 & ~0x01010101) {
                for (y = 0; y < 2; y++) {
                    for (x = 0; x < 2; x++) {
                        if ((uint8_t) nnz4 == 1)
                            s->vp8dsp.vp8_idct_dc_add(ch_dst + 4 * x,
                                                      td->block[4 + ch][(y << 1) + x],
                                                      s->uvlinesize);
                        else if ((uint8_t) nnz4 > 1)
                            s->vp8dsp.vp8_idct_add(ch_dst + 4 * x,
                                                   td->block[4 + ch][(y << 1) + x],
                                                   s->uvlinesize);
                        nnz4 >>= 8;
                        if (!nnz4)
                            /* remaining counts are all zero: next plane */
                            goto chroma_idct_end;
                    }
                    ch_dst += 4 * s->uvlinesize;
                }
            } else {
                s->vp8dsp.vp8_idct_dc_add4uv(ch_dst, td->block[4 + ch], s->uvlinesize);
            }
        }
chroma_idct_end:
        ;
    }
}
2126
2127static av_always_inline
2128void filter_level_for_mb(VP8Context *s, VP8Macroblock *mb,
2129                         VP8FilterStrength *f, int is_vp7)
2130{
2131    int interior_limit, filter_level;
2132
2133    if (s->segmentation.enabled) {
2134        filter_level = s->segmentation.filter_level[mb->segment];
2135        if (!s->segmentation.absolute_vals)
2136            filter_level += s->filter.level;
2137    } else
2138        filter_level = s->filter.level;
2139
2140    if (s->lf_delta.enabled) {
2141        filter_level += s->lf_delta.ref[mb->ref_frame];
2142        filter_level += s->lf_delta.mode[mb->mode];
2143    }
2144
2145    filter_level = av_clip_uintp2(filter_level, 6);
2146
2147    interior_limit = filter_level;
2148    if (s->filter.sharpness) {
2149        interior_limit >>= (s->filter.sharpness + 3) >> 2;
2150        interior_limit = FFMIN(interior_limit, 9 - s->filter.sharpness);
2151    }
2152    interior_limit = FFMAX(interior_limit, 1);
2153
2154    f->filter_level = filter_level;
2155    f->inner_limit = interior_limit;
2156    f->inner_filter = is_vp7 || !mb->skip || mb->mode == MODE_I4x4 ||
2157                      mb->mode == VP8_MVMODE_SPLIT;
2158}
2159
/* Apply the normal (non-simple) loop filter to one macroblock.
 * The inner column (horizontal-filter) pass runs before the row passes
 * for VP8 and after them for VP7 — see the two expansions of
 * H_LOOP_FILTER_16Y_INNER below. */
static av_always_inline
void filter_mb(VP8Context *s, uint8_t *dst[3], VP8FilterStrength *f,
               int mb_x, int mb_y, int is_vp7)
{
    int mbedge_lim, bedge_lim_y, bedge_lim_uv, hev_thresh;
    int filter_level = f->filter_level;
    int inner_limit = f->inner_limit;
    int inner_filter = f->inner_filter;
    ptrdiff_t linesize   = s->linesize;
    ptrdiff_t uvlinesize = s->uvlinesize;
    /* high-edge-variance threshold, indexed [keyframe][filter_level] */
    static const uint8_t hev_thresh_lut[2][64] = {
        { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1,
          2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
          3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
          3, 3, 3, 3 },
        { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1,
          1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
          2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
          2, 2, 2, 2 }
    };

    if (!filter_level)
        return;

    /* edge limits: VP7 and VP8 derive them differently */
    if (is_vp7) {
        bedge_lim_y  = filter_level;
        bedge_lim_uv = filter_level * 2;
        mbedge_lim   = filter_level + 2;
    } else {
        bedge_lim_y  =
        bedge_lim_uv = filter_level * 2 + inner_limit;
        mbedge_lim   = bedge_lim_y + 4;
    }

    hev_thresh = hev_thresh_lut[s->keyframe][filter_level];

    /* left macroblock edge (skipped on the frame's left border) */
    if (mb_x) {
        s->vp8dsp.vp8_h_loop_filter16y(dst[0], linesize,
                                       mbedge_lim, inner_limit, hev_thresh);
        s->vp8dsp.vp8_h_loop_filter8uv(dst[1], dst[2], uvlinesize,
                                       mbedge_lim, inner_limit, hev_thresh);
    }

/* inner column edges at x = 4, 8, 12 (luma) and x = 4 (chroma) */
#define H_LOOP_FILTER_16Y_INNER(cond)                                         \
    if (cond && inner_filter) {                                               \
        s->vp8dsp.vp8_h_loop_filter16y_inner(dst[0] +  4, linesize,           \
                                             bedge_lim_y, inner_limit,        \
                                             hev_thresh);                     \
        s->vp8dsp.vp8_h_loop_filter16y_inner(dst[0] +  8, linesize,           \
                                             bedge_lim_y, inner_limit,        \
                                             hev_thresh);                     \
        s->vp8dsp.vp8_h_loop_filter16y_inner(dst[0] + 12, linesize,           \
                                             bedge_lim_y, inner_limit,        \
                                             hev_thresh);                     \
        s->vp8dsp.vp8_h_loop_filter8uv_inner(dst[1] +  4, dst[2] + 4,         \
                                             uvlinesize,  bedge_lim_uv,       \
                                             inner_limit, hev_thresh);        \
    }

    H_LOOP_FILTER_16Y_INNER(!is_vp7)

    /* top macroblock edge (skipped on the frame's top border) */
    if (mb_y) {
        s->vp8dsp.vp8_v_loop_filter16y(dst[0], linesize,
                                       mbedge_lim, inner_limit, hev_thresh);
        s->vp8dsp.vp8_v_loop_filter8uv(dst[1], dst[2], uvlinesize,
                                       mbedge_lim, inner_limit, hev_thresh);
    }

    /* inner row edges at y = 4, 8, 12 (luma) and y = 4 (chroma) */
    if (inner_filter) {
        s->vp8dsp.vp8_v_loop_filter16y_inner(dst[0] +  4 * linesize,
                                             linesize, bedge_lim_y,
                                             inner_limit, hev_thresh);
        s->vp8dsp.vp8_v_loop_filter16y_inner(dst[0] +  8 * linesize,
                                             linesize, bedge_lim_y,
                                             inner_limit, hev_thresh);
        s->vp8dsp.vp8_v_loop_filter16y_inner(dst[0] + 12 * linesize,
                                             linesize, bedge_lim_y,
                                             inner_limit, hev_thresh);
        s->vp8dsp.vp8_v_loop_filter8uv_inner(dst[1] +  4 * uvlinesize,
                                             dst[2] +  4 * uvlinesize,
                                             uvlinesize, bedge_lim_uv,
                                             inner_limit, hev_thresh);
    }

    H_LOOP_FILTER_16Y_INNER(is_vp7)
}
2246
2247static av_always_inline
2248void filter_mb_simple(VP8Context *s, uint8_t *dst, VP8FilterStrength *f,
2249                      int mb_x, int mb_y)
2250{
2251    int mbedge_lim, bedge_lim;
2252    int filter_level = f->filter_level;
2253    int inner_limit  = f->inner_limit;
2254    int inner_filter = f->inner_filter;
2255    ptrdiff_t linesize = s->linesize;
2256
2257    if (!filter_level)
2258        return;
2259
2260    bedge_lim  = 2 * filter_level + inner_limit;
2261    mbedge_lim = bedge_lim + 4;
2262
2263    if (mb_x)
2264        s->vp8dsp.vp8_h_loop_filter_simple(dst, linesize, mbedge_lim);
2265    if (inner_filter) {
2266        s->vp8dsp.vp8_h_loop_filter_simple(dst +  4, linesize, bedge_lim);
2267        s->vp8dsp.vp8_h_loop_filter_simple(dst +  8, linesize, bedge_lim);
2268        s->vp8dsp.vp8_h_loop_filter_simple(dst + 12, linesize, bedge_lim);
2269    }
2270
2271    if (mb_y)
2272        s->vp8dsp.vp8_v_loop_filter_simple(dst, linesize, mbedge_lim);
2273    if (inner_filter) {
2274        s->vp8dsp.vp8_v_loop_filter_simple(dst +  4 * linesize, linesize, bedge_lim);
2275        s->vp8dsp.vp8_v_loop_filter_simple(dst +  8 * linesize, linesize, bedge_lim);
2276        s->vp8dsp.vp8_v_loop_filter_simple(dst + 12 * linesize, linesize, bedge_lim);
2277    }
2278}
2279
/* allowed MV overshoot past the frame edge, in the same units as the
 * mv_min/mv_max bounds below (16 px << 2) */
#define MARGIN (16 << 2)
/* Decode every macroblock's mode and motion vectors for the whole frame
 * ahead of reconstruction.  Returns 0, or AVERROR_INVALIDDATA if the
 * range coder runs out of data. */
static av_always_inline
int vp78_decode_mv_mb_modes(AVCodecContext *avctx, VP8Frame *curframe,
                                    VP8Frame *prev_frame, int is_vp7)
{
    VP8Context *s = avctx->priv_data;
    int mb_x, mb_y;

    s->mv_bounds.mv_min.y = -MARGIN;
    s->mv_bounds.mv_max.y = ((s->mb_height - 1) << 6) + MARGIN;
    for (mb_y = 0; mb_y < s->mb_height; mb_y++) {
        /* macroblocks_base has one guard row/column, hence the +1s */
        VP8Macroblock *mb = s->macroblocks_base +
                            ((s->mb_width + 1) * (mb_y + 1) + 1);
        int mb_xy = mb_y * s->mb_width;

        /* reset the left-neighbour intra modes at each row start */
        AV_WN32A(s->intra4x4_pred_mode_left, DC_PRED * 0x01010101);

        s->mv_bounds.mv_min.x = -MARGIN;
        s->mv_bounds.mv_max.x = ((s->mb_width - 1) << 6) + MARGIN;

        for (mb_x = 0; mb_x < s->mb_width; mb_x++, mb_xy++, mb++) {
            if (vpX_rac_is_end(&s->c)) {
                return AVERROR_INVALIDDATA;
            }
            if (mb_y == 0)
                /* top row: seed the top-neighbour intra modes with DC */
                AV_WN32A((mb - s->mb_width - 1)->intra4x4_pred_mode_top,
                         DC_PRED * 0x01010101);
            decode_mb_mode(s, &s->mv_bounds, mb, mb_x, mb_y, curframe->seg_map->data + mb_xy,
                           prev_frame && prev_frame->seg_map ?
                           prev_frame->seg_map->data + mb_xy : NULL, 1, is_vp7);
            /* slide the MV bounds window one MB (64 = 16 px << 2) */
            s->mv_bounds.mv_min.x -= 64;
            s->mv_bounds.mv_max.x -= 64;
        }
        s->mv_bounds.mv_min.y -= 64;
        s->mv_bounds.mv_max.y -= 64;
    }
    return 0;
}
2318
/* VP7 entry point for the whole-frame mode/MV decode pass. */
static int vp7_decode_mv_mb_modes(AVCodecContext *avctx, VP8Frame *cur_frame,
                                   VP8Frame *prev_frame)
{
    return vp78_decode_mv_mb_modes(avctx, cur_frame, prev_frame, IS_VP7);
}
2324
/* VP8 entry point for the whole-frame mode/MV decode pass. */
static int vp8_decode_mv_mb_modes(AVCodecContext *avctx, VP8Frame *cur_frame,
                                   VP8Frame *prev_frame)
{
    return vp78_decode_mv_mb_modes(avctx, cur_frame, prev_frame, IS_VP8);
}
2330
#if HAVE_THREADS
/* Block until the other slice thread (otd) has decoded at least up to
 * macroblock (mb_x_check, mb_y_check).  Positions are packed as
 * (mb_y << 16) | mb_x so a single atomic int comparison suffices. */
#define check_thread_pos(td, otd, mb_x_check, mb_y_check)                     \
    do {                                                                      \
        int tmp = (mb_y_check << 16) | (mb_x_check & 0xFFFF);                 \
        if (atomic_load(&otd->thread_mb_pos) < tmp) {                         \
            pthread_mutex_lock(&otd->lock);                                   \
            atomic_store(&td->wait_mb_pos, tmp);                              \
            do {                                                              \
                if (atomic_load(&otd->thread_mb_pos) >= tmp)                  \
                    break;                                                    \
                pthread_cond_wait(&otd->cond, &otd->lock);                    \
            } while (1);                                                      \
            atomic_store(&td->wait_mb_pos, INT_MAX);                          \
            pthread_mutex_unlock(&otd->lock);                                 \
        }                                                                     \
    } while (0)

/* Publish this thread's decode position and wake any neighbouring thread
 * waiting for us to reach it.  Relies on avctx, num_jobs, prev_td and
 * next_td being in scope at the expansion site. */
#define update_pos(td, mb_y, mb_x)                                            \
    do {                                                                      \
        int pos              = (mb_y << 16) | (mb_x & 0xFFFF);                \
        int sliced_threading = (avctx->active_thread_type == FF_THREAD_SLICE) && \
                               (num_jobs > 1);                                \
        int is_null          = !next_td || !prev_td;                          \
        int pos_check        = (is_null) ? 1 :                                \
            (next_td != td && pos >= atomic_load(&next_td->wait_mb_pos)) ||   \
            (prev_td != td && pos >= atomic_load(&prev_td->wait_mb_pos));     \
        atomic_store(&td->thread_mb_pos, pos);                                \
        if (sliced_threading && pos_check) {                                  \
            pthread_mutex_lock(&td->lock);                                    \
            pthread_cond_broadcast(&td->cond);                                \
            pthread_mutex_unlock(&td->lock);                                  \
        }                                                                     \
    } while (0)
#else
/* single-threaded builds: both synchronization points compile away */
#define check_thread_pos(td, otd, mb_x_check, mb_y_check) while(0)
#define update_pos(td, mb_y, mb_x) while(0)
#endif
2368
/**
 * Decode one macroblock row without applying the loop filter.
 *
 * The row index is taken from the upper 16 bits of td->thread_mb_pos.
 * With sliced threading, adjacent rows are decoded by other threads, so
 * this row must trail the row above by a fixed macroblock margin
 * (enforced via check_thread_pos(); progress is published via
 * update_pos()).
 *
 * @return 0 on success, AVERROR_INVALIDDATA if the range coder ran out
 *         of input before the row was finished.
 */
static av_always_inline int decode_mb_row_no_filter(AVCodecContext *avctx, void *tdata,
                                        int jobnr, int threadnr, int is_vp7)
{
    VP8Context *s = avctx->priv_data;
    VP8ThreadData *prev_td, *next_td, *td = &s->thread_data[threadnr];
    int mb_y = atomic_load(&td->thread_mb_pos) >> 16;
    int mb_x, mb_xy = mb_y * s->mb_width;
    int num_jobs = s->num_jobs;
    VP8Frame *curframe = s->curframe, *prev_frame = s->prev_frame;
    // Coefficient partitions are assigned to rows round-robin
    // (num_coeff_partitions is a power of two, so the mask works).
    VP56RangeCoder *c  = &s->coeff_partition[mb_y & (s->num_coeff_partitions - 1)];
    VP8Macroblock *mb;
    uint8_t *dst[3] = {
        curframe->tf.f->data[0] + 16 * mb_y * s->linesize,
        curframe->tf.f->data[1] +  8 * mb_y * s->uvlinesize,
        curframe->tf.f->data[2] +  8 * mb_y * s->uvlinesize
    };

    if (vpX_rac_is_end(c))
         return AVERROR_INVALIDDATA;

    // Rows are distributed round-robin over jobs, so the rows above and
    // below belong to the neighbouring jobs; at frame edges sync with
    // ourselves (i.e. no waiting).
    if (mb_y == 0)
        prev_td = td;
    else
        prev_td = &s->thread_data[(jobnr + num_jobs - 1) % num_jobs];
    if (mb_y == s->mb_height - 1)
        next_td = td;
    else
        next_td = &s->thread_data[(jobnr + 1) % num_jobs];
    if (s->mb_layout == 1)
        mb = s->macroblocks_base + ((s->mb_width + 1) * (mb_y + 1) + 1);
    else {
        // Make sure the previous frame has read its segmentation map,
        // if we re-use the same map.
        if (prev_frame && s->segmentation.enabled &&
            !s->segmentation.update_map)
            ff_thread_await_progress(&prev_frame->tf, mb_y, 0);
        mb = s->macroblocks + (s->mb_height - mb_y - 1) * 2;
        memset(mb - 1, 0, sizeof(*mb)); // zero left macroblock
        AV_WN32A(s->intra4x4_pred_mode_left, DC_PRED * 0x01010101);
    }

    // VP8 resets the left-context non-zero counts on every row; VP7
    // carries them across rows except at the top of the frame.
    if (!is_vp7 || mb_y == 0)
        memset(td->left_nnz, 0, sizeof(td->left_nnz));

    td->mv_bounds.mv_min.x = -MARGIN;
    td->mv_bounds.mv_max.x = ((s->mb_width - 1) << 6) + MARGIN;

    for (mb_x = 0; mb_x < s->mb_width; mb_x++, mb_xy++, mb++) {
        if (vpX_rac_is_end(c))
            return AVERROR_INVALIDDATA;
        // Wait for previous thread to read mb_x+2, and reach mb_y-1.
        if (prev_td != td) {
            if (threadnr != 0) {
                check_thread_pos(td, prev_td,
                                 mb_x + (is_vp7 ? 2 : 1),
                                 mb_y - (is_vp7 ? 2 : 1));
            } else {
                // Thread 0's predecessor also runs the filter pass, whose
                // positions are offset by mb_width + 3 (see filter_mb_row).
                check_thread_pos(td, prev_td,
                                 mb_x + (is_vp7 ? 2 : 1) + s->mb_width + 3,
                                 mb_y - (is_vp7 ? 2 : 1));
            }
        }

        s->vdsp.prefetch(dst[0] + (mb_x & 3) * 4 * s->linesize + 64,
                         s->linesize, 4);
        s->vdsp.prefetch(dst[1] + (mb_x & 7) * s->uvlinesize + 64,
                         dst[2] - dst[1], 2);

        if (!s->mb_layout)
            decode_mb_mode(s, &td->mv_bounds, mb, mb_x, mb_y, curframe->seg_map->data + mb_xy,
                           prev_frame && prev_frame->seg_map ?
                           prev_frame->seg_map->data + mb_xy : NULL, 0, is_vp7);

        prefetch_motion(s, mb, mb_x, mb_y, mb_xy, VP56_FRAME_PREVIOUS);

        if (!mb->skip)
            decode_mb_coeffs(s, td, c, mb, s->top_nnz[mb_x], td->left_nnz, is_vp7);

        if (mb->mode <= MODE_I4x4)
            intra_predict(s, td, dst, mb, mb_x, mb_y, is_vp7);
        else
            inter_predict(s, td, dst, mb, mb_x, mb_y);

        prefetch_motion(s, mb, mb_x, mb_y, mb_xy, VP56_FRAME_GOLDEN);

        if (!mb->skip) {
            idct_mb(s, td, dst, mb);
        } else {
            AV_ZERO64(td->left_nnz);
            AV_WN64(s->top_nnz[mb_x], 0);   // array of 9, so unaligned

            /* Reset DC block predictors if they would exist
             * if the mb had coefficients */
            if (mb->mode != MODE_I4x4 && mb->mode != VP8_MVMODE_SPLIT) {
                td->left_nnz[8]     = 0;
                s->top_nnz[mb_x][8] = 0;
            }
        }

        if (s->deblock_filter)
            filter_level_for_mb(s, mb, &td->filter_strength[mb_x], is_vp7);

        if (s->deblock_filter && num_jobs != 1 && threadnr == num_jobs - 1) {
            if (s->filter.simple)
                backup_mb_border(s->top_border[mb_x + 1], dst[0],
                                 NULL, NULL, s->linesize, 0, 1);
            else
                backup_mb_border(s->top_border[mb_x + 1], dst[0],
                                 dst[1], dst[2], s->linesize, s->uvlinesize, 0);
        }

        prefetch_motion(s, mb, mb_x, mb_y, mb_xy, VP56_FRAME_GOLDEN2);

        dst[0]      += 16;
        dst[1]      += 8;
        dst[2]      += 8;
        td->mv_bounds.mv_min.x -= 64;
        td->mv_bounds.mv_max.x -= 64;

        // NOTE(review): mb_x < s->mb_width inside this loop, so the
        // mb_x == s->mb_width + 1 branch looks unreachable; the else
        // branch always publishes the plain position — confirm intent.
        if (mb_x == s->mb_width + 1) {
            update_pos(td, mb_y, s->mb_width + 3);
        } else {
            update_pos(td, mb_y, mb_x);
        }
    }
    return 0;
}
2496
2497static int vp7_decode_mb_row_no_filter(AVCodecContext *avctx, void *tdata,
2498                                        int jobnr, int threadnr)
2499{
2500    return decode_mb_row_no_filter(avctx, tdata, jobnr, threadnr, 1);
2501}
2502
2503static int vp8_decode_mb_row_no_filter(AVCodecContext *avctx, void *tdata,
2504                                        int jobnr, int threadnr)
2505{
2506    return decode_mb_row_no_filter(avctx, tdata, jobnr, threadnr, 0);
2507}
2508
/**
 * Apply the loop filter to one macroblock row, using the per-macroblock
 * filter strengths computed by decode_mb_row_no_filter().
 *
 * The row index comes from the upper 16 bits of td->thread_mb_pos.  With
 * sliced threading this pass runs interleaved with row decoding, so its
 * progress positions are offset by mb_width + 3 relative to the decode
 * positions of the same row.
 */
static av_always_inline void filter_mb_row(AVCodecContext *avctx, void *tdata,
                              int jobnr, int threadnr, int is_vp7)
{
    VP8Context *s = avctx->priv_data;
    VP8ThreadData *td = &s->thread_data[threadnr];
    int mb_x, mb_y = atomic_load(&td->thread_mb_pos) >> 16, num_jobs = s->num_jobs;
    AVFrame *curframe = s->curframe->tf.f;
    VP8Macroblock *mb;
    VP8ThreadData *prev_td, *next_td;
    uint8_t *dst[3] = {
        curframe->data[0] + 16 * mb_y * s->linesize,
        curframe->data[1] +  8 * mb_y * s->uvlinesize,
        curframe->data[2] +  8 * mb_y * s->uvlinesize
    };

    if (s->mb_layout == 1)
        mb = s->macroblocks_base + ((s->mb_width + 1) * (mb_y + 1) + 1);
    else
        mb = s->macroblocks + (s->mb_height - mb_y - 1) * 2;

    // Neighbouring rows belong to the neighbouring jobs (round-robin);
    // at frame edges sync with ourselves, i.e. never wait.
    if (mb_y == 0)
        prev_td = td;
    else
        prev_td = &s->thread_data[(jobnr + num_jobs - 1) % num_jobs];
    if (mb_y == s->mb_height - 1)
        next_td = td;
    else
        next_td = &s->thread_data[(jobnr + 1) % num_jobs];

    for (mb_x = 0; mb_x < s->mb_width; mb_x++, mb++) {
        VP8FilterStrength *f = &td->filter_strength[mb_x];
        // Wait for the row above to be filtered past mb_x + 1 (filter
        // positions carry the mb_width + 3 offset).
        if (prev_td != td)
            check_thread_pos(td, prev_td,
                             (mb_x + 1) + (s->mb_width + 3), mb_y - 1);
        // NOTE(review): thread 0 waits with offset positions in
        // decode_mb_row_no_filter(), which is presumably why it is
        // excluded from being waited on here — confirm.
        if (next_td != td)
            if (next_td != &s->thread_data[0])
                check_thread_pos(td, next_td, mb_x + 1, mb_y + 1);

        if (num_jobs == 1) {
            if (s->filter.simple)
                backup_mb_border(s->top_border[mb_x + 1], dst[0],
                                 NULL, NULL, s->linesize, 0, 1);
            else
                backup_mb_border(s->top_border[mb_x + 1], dst[0],
                                 dst[1], dst[2], s->linesize, s->uvlinesize, 0);
        }

        if (s->filter.simple)
            filter_mb_simple(s, dst[0], f, mb_x, mb_y);
        else
            filter_mb(s, dst, f, mb_x, mb_y, is_vp7);
        dst[0] += 16;
        dst[1] += 8;
        dst[2] += 8;

        update_pos(td, mb_y, (s->mb_width + 3) + mb_x);
    }
}
2567
2568static void vp7_filter_mb_row(AVCodecContext *avctx, void *tdata,
2569                              int jobnr, int threadnr)
2570{
2571    filter_mb_row(avctx, tdata, jobnr, threadnr, 1);
2572}
2573
2574static void vp8_filter_mb_row(AVCodecContext *avctx, void *tdata,
2575                              int jobnr, int threadnr)
2576{
2577    filter_mb_row(avctx, tdata, jobnr, threadnr, 0);
2578}
2579
/**
 * Worker run through avctx->execute2(): decode (and, when deblocking is
 * enabled, loop-filter) every num_jobs-th macroblock row, starting at row
 * jobnr.  The round-robin row assignment is what the
 * check_thread_pos()/update_pos() hand-shaking in the row functions
 * relies on.
 *
 * @return 0 on success, otherwise the error from the row decoder.
 */
static av_always_inline
int vp78_decode_mb_row_sliced(AVCodecContext *avctx, void *tdata, int jobnr,
                              int threadnr, int is_vp7)
{
    VP8Context *s = avctx->priv_data;
    VP8ThreadData *td = &s->thread_data[jobnr];
    VP8ThreadData *next_td = NULL, *prev_td = NULL;
    VP8Frame *curframe = s->curframe;
    int mb_y, num_jobs = s->num_jobs;
    int ret;

    td->thread_nr = threadnr;
    td->mv_bounds.mv_min.y   = -MARGIN - 64 * threadnr;
    td->mv_bounds.mv_max.y   = ((s->mb_height - 1) << 6) + MARGIN - 64 * threadnr;
    for (mb_y = jobnr; mb_y < s->mb_height; mb_y += num_jobs) {
        atomic_store(&td->thread_mb_pos, mb_y << 16);
        ret = s->decode_mb_row_no_filter(avctx, tdata, jobnr, threadnr);
        if (ret < 0) {
            // Publish an end-of-frame position so no other thread stays
            // blocked waiting on us, then bail out.
            update_pos(td, s->mb_height, INT_MAX & 0xFFFF);
            return ret;
        }
        if (s->deblock_filter)
            s->filter_mb_row(avctx, tdata, jobnr, threadnr);
        // Mark the whole row (decode + filter) finished.
        update_pos(td, mb_y, INT_MAX & 0xFFFF);

        td->mv_bounds.mv_min.y -= 64 * num_jobs;
        td->mv_bounds.mv_max.y -= 64 * num_jobs;

        if (avctx->active_thread_type == FF_THREAD_FRAME)
            ff_thread_report_progress(&curframe->tf, mb_y, 0);
    }

    return 0;
}
2614
2615static int vp7_decode_mb_row_sliced(AVCodecContext *avctx, void *tdata,
2616                                    int jobnr, int threadnr)
2617{
2618    return vp78_decode_mb_row_sliced(avctx, tdata, jobnr, threadnr, IS_VP7);
2619}
2620
2621static int vp8_decode_mb_row_sliced(AVCodecContext *avctx, void *tdata,
2622                                    int jobnr, int threadnr)
2623{
2624    return vp78_decode_mb_row_sliced(avctx, tdata, jobnr, threadnr, IS_VP8);
2625}
2626
/**
 * Decode one VP7/VP8 frame from a packet.
 *
 * Parses the frame header, manages the reference-frame set
 * (previous/golden/altref), allocates the output frame, and runs either
 * the hwaccel path or the software row decoders through execute2().
 *
 * @param rframe     frame to receive a reference to the decoded picture
 *                   (only when the frame is visible)
 * @param got_frame  set to 1 when *rframe was written
 * @return avpkt->size on success, a negative AVERROR code on failure.
 */
static av_always_inline
int vp78_decode_frame(AVCodecContext *avctx, AVFrame *rframe, int *got_frame,
                      const AVPacket *avpkt, int is_vp7)
{
    VP8Context *s = avctx->priv_data;
    int ret, i, referenced, num_jobs;
    enum AVDiscard skip_thresh;
    VP8Frame *av_uninit(curframe), *prev_frame;

    if (is_vp7)
        ret = vp7_decode_frame_header(s, avpkt->data, avpkt->size);
    else
        ret = vp8_decode_frame_header(s, avpkt->data, avpkt->size);

    if (ret < 0)
        goto err;

    if (s->actually_webp) {
        // avctx->pix_fmt already set in caller.
    } else if (!is_vp7 && s->pix_fmt == AV_PIX_FMT_NONE) {
        s->pix_fmt = get_pixel_format(s);
        if (s->pix_fmt < 0) {
            ret = AVERROR(EINVAL);
            goto err;
        }
        avctx->pix_fmt = s->pix_fmt;
    }

    prev_frame = s->framep[VP56_FRAME_CURRENT];

    // A frame is "referenced" if any later frame can use it for prediction.
    referenced = s->update_last || s->update_golden == VP56_FRAME_CURRENT ||
                 s->update_altref == VP56_FRAME_CURRENT;

    skip_thresh = !referenced ? AVDISCARD_NONREF
                              : !s->keyframe ? AVDISCARD_NONKEY
                                             : AVDISCARD_ALL;

    if (avctx->skip_frame >= skip_thresh) {
        s->invisible = 1;
        memcpy(&s->next_framep[0], &s->framep[0], sizeof(s->framep[0]) * 4);
        goto skip_decode;
    }
    s->deblock_filter = s->filter.level && avctx->skip_loop_filter < skip_thresh;

    // release no longer referenced frames
    for (i = 0; i < 5; i++)
        if (s->frames[i].tf.f->buf[0] &&
            &s->frames[i] != prev_frame &&
            &s->frames[i] != s->framep[VP56_FRAME_PREVIOUS] &&
            &s->frames[i] != s->framep[VP56_FRAME_GOLDEN]   &&
            &s->frames[i] != s->framep[VP56_FRAME_GOLDEN2])
            vp8_release_frame(s, &s->frames[i]);

    curframe = s->framep[VP56_FRAME_CURRENT] = vp8_find_free_buffer(s);

    if (!s->colorspace)
        avctx->colorspace = AVCOL_SPC_BT470BG;
    if (s->fullrange)
        avctx->color_range = AVCOL_RANGE_JPEG;
    else
        avctx->color_range = AVCOL_RANGE_MPEG;

    /* Given that arithmetic probabilities are updated every frame, it's quite
     * likely that the values we have on a random interframe are complete
     * junk if we didn't start decode on a keyframe. So just don't display
     * anything rather than junk. */
    if (!s->keyframe && (!s->framep[VP56_FRAME_PREVIOUS] ||
                         !s->framep[VP56_FRAME_GOLDEN]   ||
                         !s->framep[VP56_FRAME_GOLDEN2])) {
        av_log(avctx, AV_LOG_WARNING,
               "Discarding interframe without a prior keyframe!\n");
        ret = AVERROR_INVALIDDATA;
        goto err;
    }

    curframe->tf.f->key_frame = s->keyframe;
    curframe->tf.f->pict_type = s->keyframe ? AV_PICTURE_TYPE_I
                                            : AV_PICTURE_TYPE_P;
    if ((ret = vp8_alloc_frame(s, curframe, referenced)) < 0)
        goto err;

    // check if golden and altref are swapped
    if (s->update_altref != VP56_FRAME_NONE)
        s->next_framep[VP56_FRAME_GOLDEN2] = s->framep[s->update_altref];
    else
        s->next_framep[VP56_FRAME_GOLDEN2] = s->framep[VP56_FRAME_GOLDEN2];

    if (s->update_golden != VP56_FRAME_NONE)
        s->next_framep[VP56_FRAME_GOLDEN] = s->framep[s->update_golden];
    else
        s->next_framep[VP56_FRAME_GOLDEN] = s->framep[VP56_FRAME_GOLDEN];

    if (s->update_last)
        s->next_framep[VP56_FRAME_PREVIOUS] = curframe;
    else
        s->next_framep[VP56_FRAME_PREVIOUS] = s->framep[VP56_FRAME_PREVIOUS];

    s->next_framep[VP56_FRAME_CURRENT] = curframe;

    // For frame threading, the per-frame setup is done; let the next
    // decoding thread start.
    if (ffcodec(avctx->codec)->update_thread_context)
        ff_thread_finish_setup(avctx);

    if (avctx->hwaccel) {
        ret = avctx->hwaccel->start_frame(avctx, avpkt->data, avpkt->size);
        if (ret < 0)
            goto err;

        ret = avctx->hwaccel->decode_slice(avctx, avpkt->data, avpkt->size);
        if (ret < 0)
            goto err;

        ret = avctx->hwaccel->end_frame(avctx);
        if (ret < 0)
            goto err;

    } else {
        s->linesize   = curframe->tf.f->linesize[0];
        s->uvlinesize = curframe->tf.f->linesize[1];

        memset(s->top_nnz, 0, s->mb_width * sizeof(*s->top_nnz));
        /* Zero macroblock structures for top/top-left prediction
         * from outside the frame. */
        if (!s->mb_layout)
            memset(s->macroblocks + s->mb_height * 2 - 1, 0,
                   (s->mb_width + 1) * sizeof(*s->macroblocks));
        if (!s->mb_layout && s->keyframe)
            memset(s->intra4x4_pred_mode_top, DC_PRED, s->mb_width * 4);

        memset(s->ref_count, 0, sizeof(s->ref_count));

        if (s->mb_layout == 1) {
            // Make sure the previous frame has read its segmentation map,
            // if we re-use the same map.
            if (prev_frame && s->segmentation.enabled &&
                !s->segmentation.update_map)
                ff_thread_await_progress(&prev_frame->tf, 1, 0);
            if (is_vp7)
                ret = vp7_decode_mv_mb_modes(avctx, curframe, prev_frame);
            else
                ret = vp8_decode_mv_mb_modes(avctx, curframe, prev_frame);
            if (ret < 0)
                goto err;
        }

        // Frame threading uses a single job; slice threading uses at most
        // one job per coefficient partition.
        if (avctx->active_thread_type == FF_THREAD_FRAME)
            num_jobs = 1;
        else
            num_jobs = FFMIN(s->num_coeff_partitions, avctx->thread_count);
        s->num_jobs   = num_jobs;
        s->curframe   = curframe;
        s->prev_frame = prev_frame;
        s->mv_bounds.mv_min.y   = -MARGIN;
        s->mv_bounds.mv_max.y   = ((s->mb_height - 1) << 6) + MARGIN;
        for (i = 0; i < MAX_THREADS; i++) {
            VP8ThreadData *td = &s->thread_data[i];
            atomic_init(&td->thread_mb_pos, 0);
            atomic_init(&td->wait_mb_pos, INT_MAX);
        }
        if (is_vp7)
            avctx->execute2(avctx, vp7_decode_mb_row_sliced, s->thread_data, NULL,
                            num_jobs);
        else
            avctx->execute2(avctx, vp8_decode_mb_row_sliced, s->thread_data, NULL,
                            num_jobs);
    }

    ff_thread_report_progress(&curframe->tf, INT_MAX, 0);
    memcpy(&s->framep[0], &s->next_framep[0], sizeof(s->framep[0]) * 4);

skip_decode:
    // if future frames don't use the updated probabilities,
    // reset them to the values we saved
    if (!s->update_probabilities)
        s->prob[0] = s->prob[1];

    if (!s->invisible) {
        if ((ret = av_frame_ref(rframe, curframe->tf.f)) < 0)
            return ret;
        *got_frame = 1;
    }

    return avpkt->size;
err:
    memcpy(&s->next_framep[0], &s->framep[0], sizeof(s->framep[0]) * 4);
    return ret;
}
2813
2814int ff_vp8_decode_frame(AVCodecContext *avctx, AVFrame *frame,
2815                        int *got_frame, AVPacket *avpkt)
2816{
2817    return vp78_decode_frame(avctx, frame, got_frame, avpkt, IS_VP8);
2818}
2819
#if CONFIG_VP7_DECODER
/* VP7 frame-decode entry point. */
static int vp7_decode_frame(AVCodecContext *avctx, AVFrame *frame,
                            int *got_frame, AVPacket *avpkt)
{
    int ret = vp78_decode_frame(avctx, frame, got_frame, avpkt, IS_VP7);
    return ret;
}
#endif /* CONFIG_VP7_DECODER */
2827
2828av_cold int ff_vp8_decode_free(AVCodecContext *avctx)
2829{
2830    VP8Context *s = avctx->priv_data;
2831    int i;
2832
2833    vp8_decode_flush_impl(avctx, 1);
2834    for (i = 0; i < FF_ARRAY_ELEMS(s->frames); i++)
2835        av_frame_free(&s->frames[i].tf.f);
2836
2837    return 0;
2838}
2839
2840static av_cold int vp8_init_frames(VP8Context *s)
2841{
2842    int i;
2843    for (i = 0; i < FF_ARRAY_ELEMS(s->frames); i++) {
2844        s->frames[i].tf.f = av_frame_alloc();
2845        if (!s->frames[i].tf.f)
2846            return AVERROR(ENOMEM);
2847    }
2848    return 0;
2849}
2850
/**
 * Common VP7/VP8 decoder initialization: wires up the DSP contexts, the
 * per-codec row-decode/filter function pointers, and allocates the
 * reference-frame pool.
 *
 * @return 0 on success, a negative AVERROR code on allocation failure.
 */
static av_always_inline
int vp78_decode_init(AVCodecContext *avctx, int is_vp7)
{
    VP8Context *s = avctx->priv_data;
    int ret;

    s->avctx = avctx;
    s->vp7   = avctx->codec->id == AV_CODEC_ID_VP7;
    // Actual pix_fmt is decided once the first frame header is parsed.
    s->pix_fmt = AV_PIX_FMT_NONE;
    avctx->pix_fmt = AV_PIX_FMT_YUV420P;

    ff_videodsp_init(&s->vdsp, 8);

    ff_vp78dsp_init(&s->vp8dsp);
    if (CONFIG_VP7_DECODER && is_vp7) {
        ff_h264_pred_init(&s->hpc, AV_CODEC_ID_VP7, 8, 1);
        ff_vp7dsp_init(&s->vp8dsp);
        s->decode_mb_row_no_filter = vp7_decode_mb_row_no_filter;
        s->filter_mb_row           = vp7_filter_mb_row;
    } else if (CONFIG_VP8_DECODER && !is_vp7) {
        ff_h264_pred_init(&s->hpc, AV_CODEC_ID_VP8, 8, 1);
        ff_vp8dsp_init(&s->vp8dsp);
        s->decode_mb_row_no_filter = vp8_decode_mb_row_no_filter;
        s->filter_mb_row           = vp8_filter_mb_row;
    }

    /* does not change for VP8 */
    memcpy(s->prob[0].scan, ff_zigzag_scan, sizeof(s->prob[0].scan));

    if ((ret = vp8_init_frames(s)) < 0) {
        // Partial allocations are cleaned up by the full free path.
        ff_vp8_decode_free(avctx);
        return ret;
    }

    return 0;
}
2887
#if CONFIG_VP7_DECODER
/* VP7 init entry point. */
static int vp7_decode_init(AVCodecContext *avctx)
{
    int ret = vp78_decode_init(avctx, IS_VP7);
    return ret;
}
#endif /* CONFIG_VP7_DECODER */
2894
2895av_cold int ff_vp8_decode_init(AVCodecContext *avctx)
2896{
2897    return vp78_decode_init(avctx, IS_VP8);
2898}
2899
#if CONFIG_VP8_DECODER
#if HAVE_THREADS
/* Translate a frame pointer from the source context's frames[] array to
 * the corresponding slot in the destination context's array. */
#define REBASE(pic) ((pic) ? (pic) - &s_src->frames[0] + &s->frames[0] : NULL)

/**
 * Frame-threading callback: copy the decoding state a future frame needs
 * (probabilities, segmentation, loop-filter deltas, sign biases, and the
 * reference-frame set) from the previous thread's context.
 */
static int vp8_decode_update_thread_context(AVCodecContext *dst,
                                            const AVCodecContext *src)
{
    VP8Context *s = dst->priv_data, *s_src = src->priv_data;
    int i;

    // On a size change, drop our buffers; they will be reallocated for
    // the new dimensions on the next decode call.
    if (s->macroblocks_base &&
        (s_src->mb_width != s->mb_width || s_src->mb_height != s->mb_height)) {
        free_buffers(s);
        s->mb_width  = s_src->mb_width;
        s->mb_height = s_src->mb_height;
    }

    // If the source did not update its probabilities, take the saved set.
    s->pix_fmt      = s_src->pix_fmt;
    s->prob[0]      = s_src->prob[!s_src->update_probabilities];
    s->segmentation = s_src->segmentation;
    s->lf_delta     = s_src->lf_delta;
    memcpy(s->sign_bias, s_src->sign_bias, sizeof(s->sign_bias));

    for (i = 0; i < FF_ARRAY_ELEMS(s_src->frames); i++) {
        if (s_src->frames[i].tf.f->buf[0]) {
            int ret = vp8_ref_frame(s, &s->frames[i], &s_src->frames[i]);
            if (ret < 0)
                return ret;
        }
    }

    // Adopt the source's *next* frame set as our current one.
    s->framep[0] = REBASE(s_src->next_framep[0]);
    s->framep[1] = REBASE(s_src->next_framep[1]);
    s->framep[2] = REBASE(s_src->next_framep[2]);
    s->framep[3] = REBASE(s_src->next_framep[3]);

    return 0;
}
#endif /* HAVE_THREADS */
#endif /* CONFIG_VP8_DECODER */
2940
#if CONFIG_VP7_DECODER
/* Codec descriptor for the On2 VP7 decoder (no threading support). */
const FFCodec ff_vp7_decoder = {
    .p.name                = "vp7",
    .p.long_name           = NULL_IF_CONFIG_SMALL("On2 VP7"),
    .p.type                = AVMEDIA_TYPE_VIDEO,
    .p.id                  = AV_CODEC_ID_VP7,
    .priv_data_size        = sizeof(VP8Context),
    .init                  = vp7_decode_init,
    .close                 = ff_vp8_decode_free,
    FF_CODEC_DECODE_CB(vp7_decode_frame),
    .p.capabilities        = AV_CODEC_CAP_DR1,
    .caps_internal         = FF_CODEC_CAP_INIT_THREADSAFE,
    .flush                 = vp8_decode_flush,
};
#endif /* CONFIG_VP7_DECODER */
2956
#if CONFIG_VP8_DECODER
/* Codec descriptor for the On2 VP8 decoder: supports frame and slice
 * threading plus VAAPI/NVDEC hardware acceleration. */
const FFCodec ff_vp8_decoder = {
    .p.name                = "vp8",
    .p.long_name           = NULL_IF_CONFIG_SMALL("On2 VP8"),
    .p.type                = AVMEDIA_TYPE_VIDEO,
    .p.id                  = AV_CODEC_ID_VP8,
    .priv_data_size        = sizeof(VP8Context),
    .init                  = ff_vp8_decode_init,
    .close                 = ff_vp8_decode_free,
    FF_CODEC_DECODE_CB(ff_vp8_decode_frame),
    .p.capabilities        = AV_CODEC_CAP_DR1 | AV_CODEC_CAP_FRAME_THREADS |
                             AV_CODEC_CAP_SLICE_THREADS,
    .caps_internal         = FF_CODEC_CAP_INIT_THREADSAFE |
                             FF_CODEC_CAP_ALLOCATE_PROGRESS,
    .flush                 = vp8_decode_flush,
    .update_thread_context = ONLY_IF_THREADS_ENABLED(vp8_decode_update_thread_context),
    .hw_configs            = (const AVCodecHWConfigInternal *const []) {
#if CONFIG_VP8_VAAPI_HWACCEL
                               HWACCEL_VAAPI(vp8),
#endif
#if CONFIG_VP8_NVDEC_HWACCEL
                               HWACCEL_NVDEC(vp8),
#endif
                               NULL
                           },
};
#endif /* CONFIG_VP8_DECODER */
2984