/*
 * H.26L/H.264/AVC/JVT/14496-10/... decoder
 * Copyright (c) 2003 Michael Niedermayer <michaelni@gmx.at>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */
21cabdff1aSopenharmony_ci
/**
 * @file
 * H.264 / AVC / MPEG-4 part10 macroblock decoding
 */
26cabdff1aSopenharmony_ci
27cabdff1aSopenharmony_ci#include <stdint.h>
28cabdff1aSopenharmony_ci
29cabdff1aSopenharmony_ci#include "config.h"
30cabdff1aSopenharmony_ci
31cabdff1aSopenharmony_ci#include "libavutil/common.h"
32cabdff1aSopenharmony_ci#include "libavutil/intreadwrite.h"
33cabdff1aSopenharmony_ci#include "avcodec.h"
34cabdff1aSopenharmony_ci#include "h264dec.h"
35cabdff1aSopenharmony_ci#include "h264_ps.h"
36cabdff1aSopenharmony_ci#include "qpeldsp.h"
37cabdff1aSopenharmony_ci#include "threadframe.h"
38cabdff1aSopenharmony_ci
39cabdff1aSopenharmony_cistatic inline int get_lowest_part_list_y(H264SliceContext *sl,
40cabdff1aSopenharmony_ci                                         int n, int height, int y_offset, int list)
41cabdff1aSopenharmony_ci{
42cabdff1aSopenharmony_ci    int raw_my             = sl->mv_cache[list][scan8[n]][1];
43cabdff1aSopenharmony_ci    int filter_height_down = (raw_my & 3) ? 3 : 0;
44cabdff1aSopenharmony_ci    int full_my            = (raw_my >> 2) + y_offset;
45cabdff1aSopenharmony_ci    int bottom             = full_my + filter_height_down + height;
46cabdff1aSopenharmony_ci
47cabdff1aSopenharmony_ci    av_assert2(height >= 0);
48cabdff1aSopenharmony_ci
49cabdff1aSopenharmony_ci    return FFMAX(0, bottom);
50cabdff1aSopenharmony_ci}
51cabdff1aSopenharmony_ci
52cabdff1aSopenharmony_cistatic inline void get_lowest_part_y(const H264Context *h, H264SliceContext *sl,
53cabdff1aSopenharmony_ci                                     int16_t refs[2][48], int n,
54cabdff1aSopenharmony_ci                                     int height, int y_offset, int list0,
55cabdff1aSopenharmony_ci                                     int list1, int *nrefs)
56cabdff1aSopenharmony_ci{
57cabdff1aSopenharmony_ci    int my;
58cabdff1aSopenharmony_ci
59cabdff1aSopenharmony_ci    y_offset += 16 * (sl->mb_y >> MB_FIELD(sl));
60cabdff1aSopenharmony_ci
61cabdff1aSopenharmony_ci    if (list0) {
62cabdff1aSopenharmony_ci        int ref_n = sl->ref_cache[0][scan8[n]];
63cabdff1aSopenharmony_ci        H264Ref *ref = &sl->ref_list[0][ref_n];
64cabdff1aSopenharmony_ci
65cabdff1aSopenharmony_ci        // Error resilience puts the current picture in the ref list.
66cabdff1aSopenharmony_ci        // Don't try to wait on these as it will cause a deadlock.
67cabdff1aSopenharmony_ci        // Fields can wait on each other, though.
68cabdff1aSopenharmony_ci        if (ref->parent->tf.progress->data != h->cur_pic.tf.progress->data ||
69cabdff1aSopenharmony_ci            (ref->reference & 3) != h->picture_structure) {
70cabdff1aSopenharmony_ci            my = get_lowest_part_list_y(sl, n, height, y_offset, 0);
71cabdff1aSopenharmony_ci            if (refs[0][ref_n] < 0)
72cabdff1aSopenharmony_ci                nrefs[0] += 1;
73cabdff1aSopenharmony_ci            refs[0][ref_n] = FFMAX(refs[0][ref_n], my);
74cabdff1aSopenharmony_ci        }
75cabdff1aSopenharmony_ci    }
76cabdff1aSopenharmony_ci
77cabdff1aSopenharmony_ci    if (list1) {
78cabdff1aSopenharmony_ci        int ref_n    = sl->ref_cache[1][scan8[n]];
79cabdff1aSopenharmony_ci        H264Ref *ref = &sl->ref_list[1][ref_n];
80cabdff1aSopenharmony_ci
81cabdff1aSopenharmony_ci        if (ref->parent->tf.progress->data != h->cur_pic.tf.progress->data ||
82cabdff1aSopenharmony_ci            (ref->reference & 3) != h->picture_structure) {
83cabdff1aSopenharmony_ci            my = get_lowest_part_list_y(sl, n, height, y_offset, 1);
84cabdff1aSopenharmony_ci            if (refs[1][ref_n] < 0)
85cabdff1aSopenharmony_ci                nrefs[1] += 1;
86cabdff1aSopenharmony_ci            refs[1][ref_n] = FFMAX(refs[1][ref_n], my);
87cabdff1aSopenharmony_ci        }
88cabdff1aSopenharmony_ci    }
89cabdff1aSopenharmony_ci}
90cabdff1aSopenharmony_ci
91cabdff1aSopenharmony_ci/**
92cabdff1aSopenharmony_ci * Wait until all reference frames are available for MC operations.
93cabdff1aSopenharmony_ci *
94cabdff1aSopenharmony_ci * @param h the H.264 context
95cabdff1aSopenharmony_ci */
96cabdff1aSopenharmony_cistatic void await_references(const H264Context *h, H264SliceContext *sl)
97cabdff1aSopenharmony_ci{
98cabdff1aSopenharmony_ci    const int mb_xy   = sl->mb_xy;
99cabdff1aSopenharmony_ci    const int mb_type = h->cur_pic.mb_type[mb_xy];
100cabdff1aSopenharmony_ci    int16_t refs[2][48];
101cabdff1aSopenharmony_ci    int nrefs[2] = { 0 };
102cabdff1aSopenharmony_ci    int ref, list;
103cabdff1aSopenharmony_ci
104cabdff1aSopenharmony_ci    memset(refs, -1, sizeof(refs));
105cabdff1aSopenharmony_ci
106cabdff1aSopenharmony_ci    if (IS_16X16(mb_type)) {
107cabdff1aSopenharmony_ci        get_lowest_part_y(h, sl, refs, 0, 16, 0,
108cabdff1aSopenharmony_ci                          IS_DIR(mb_type, 0, 0), IS_DIR(mb_type, 0, 1), nrefs);
109cabdff1aSopenharmony_ci    } else if (IS_16X8(mb_type)) {
110cabdff1aSopenharmony_ci        get_lowest_part_y(h, sl, refs, 0, 8, 0,
111cabdff1aSopenharmony_ci                          IS_DIR(mb_type, 0, 0), IS_DIR(mb_type, 0, 1), nrefs);
112cabdff1aSopenharmony_ci        get_lowest_part_y(h, sl, refs, 8, 8, 8,
113cabdff1aSopenharmony_ci                          IS_DIR(mb_type, 1, 0), IS_DIR(mb_type, 1, 1), nrefs);
114cabdff1aSopenharmony_ci    } else if (IS_8X16(mb_type)) {
115cabdff1aSopenharmony_ci        get_lowest_part_y(h, sl, refs, 0, 16, 0,
116cabdff1aSopenharmony_ci                          IS_DIR(mb_type, 0, 0), IS_DIR(mb_type, 0, 1), nrefs);
117cabdff1aSopenharmony_ci        get_lowest_part_y(h, sl, refs, 4, 16, 0,
118cabdff1aSopenharmony_ci                          IS_DIR(mb_type, 1, 0), IS_DIR(mb_type, 1, 1), nrefs);
119cabdff1aSopenharmony_ci    } else {
120cabdff1aSopenharmony_ci        int i;
121cabdff1aSopenharmony_ci
122cabdff1aSopenharmony_ci        av_assert2(IS_8X8(mb_type));
123cabdff1aSopenharmony_ci
124cabdff1aSopenharmony_ci        for (i = 0; i < 4; i++) {
125cabdff1aSopenharmony_ci            const int sub_mb_type = sl->sub_mb_type[i];
126cabdff1aSopenharmony_ci            const int n           = 4 * i;
127cabdff1aSopenharmony_ci            int y_offset          = (i & 2) << 2;
128cabdff1aSopenharmony_ci
129cabdff1aSopenharmony_ci            if (IS_SUB_8X8(sub_mb_type)) {
130cabdff1aSopenharmony_ci                get_lowest_part_y(h, sl, refs, n, 8, y_offset,
131cabdff1aSopenharmony_ci                                  IS_DIR(sub_mb_type, 0, 0),
132cabdff1aSopenharmony_ci                                  IS_DIR(sub_mb_type, 0, 1),
133cabdff1aSopenharmony_ci                                  nrefs);
134cabdff1aSopenharmony_ci            } else if (IS_SUB_8X4(sub_mb_type)) {
135cabdff1aSopenharmony_ci                get_lowest_part_y(h, sl, refs, n, 4, y_offset,
136cabdff1aSopenharmony_ci                                  IS_DIR(sub_mb_type, 0, 0),
137cabdff1aSopenharmony_ci                                  IS_DIR(sub_mb_type, 0, 1),
138cabdff1aSopenharmony_ci                                  nrefs);
139cabdff1aSopenharmony_ci                get_lowest_part_y(h, sl, refs, n + 2, 4, y_offset + 4,
140cabdff1aSopenharmony_ci                                  IS_DIR(sub_mb_type, 0, 0),
141cabdff1aSopenharmony_ci                                  IS_DIR(sub_mb_type, 0, 1),
142cabdff1aSopenharmony_ci                                  nrefs);
143cabdff1aSopenharmony_ci            } else if (IS_SUB_4X8(sub_mb_type)) {
144cabdff1aSopenharmony_ci                get_lowest_part_y(h, sl, refs, n, 8, y_offset,
145cabdff1aSopenharmony_ci                                  IS_DIR(sub_mb_type, 0, 0),
146cabdff1aSopenharmony_ci                                  IS_DIR(sub_mb_type, 0, 1),
147cabdff1aSopenharmony_ci                                  nrefs);
148cabdff1aSopenharmony_ci                get_lowest_part_y(h, sl, refs, n + 1, 8, y_offset,
149cabdff1aSopenharmony_ci                                  IS_DIR(sub_mb_type, 0, 0),
150cabdff1aSopenharmony_ci                                  IS_DIR(sub_mb_type, 0, 1),
151cabdff1aSopenharmony_ci                                  nrefs);
152cabdff1aSopenharmony_ci            } else {
153cabdff1aSopenharmony_ci                int j;
154cabdff1aSopenharmony_ci                av_assert2(IS_SUB_4X4(sub_mb_type));
155cabdff1aSopenharmony_ci                for (j = 0; j < 4; j++) {
156cabdff1aSopenharmony_ci                    int sub_y_offset = y_offset + 2 * (j & 2);
157cabdff1aSopenharmony_ci                    get_lowest_part_y(h, sl, refs, n + j, 4, sub_y_offset,
158cabdff1aSopenharmony_ci                                      IS_DIR(sub_mb_type, 0, 0),
159cabdff1aSopenharmony_ci                                      IS_DIR(sub_mb_type, 0, 1),
160cabdff1aSopenharmony_ci                                      nrefs);
161cabdff1aSopenharmony_ci                }
162cabdff1aSopenharmony_ci            }
163cabdff1aSopenharmony_ci        }
164cabdff1aSopenharmony_ci    }
165cabdff1aSopenharmony_ci
166cabdff1aSopenharmony_ci    for (list = sl->list_count - 1; list >= 0; list--)
167cabdff1aSopenharmony_ci        for (ref = 0; ref < 48 && nrefs[list]; ref++) {
168cabdff1aSopenharmony_ci            int row = refs[list][ref];
169cabdff1aSopenharmony_ci            if (row >= 0) {
170cabdff1aSopenharmony_ci                H264Ref *ref_pic  = &sl->ref_list[list][ref];
171cabdff1aSopenharmony_ci                int ref_field         = ref_pic->reference - 1;
172cabdff1aSopenharmony_ci                int ref_field_picture = ref_pic->parent->field_picture;
173cabdff1aSopenharmony_ci                int pic_height        = 16 * h->mb_height >> ref_field_picture;
174cabdff1aSopenharmony_ci
175cabdff1aSopenharmony_ci                row <<= MB_MBAFF(sl);
176cabdff1aSopenharmony_ci                nrefs[list]--;
177cabdff1aSopenharmony_ci
178cabdff1aSopenharmony_ci                if (!FIELD_PICTURE(h) && ref_field_picture) { // frame referencing two fields
179cabdff1aSopenharmony_ci                    av_assert2((ref_pic->parent->reference & 3) == 3);
180cabdff1aSopenharmony_ci                    ff_thread_await_progress(&ref_pic->parent->tf,
181cabdff1aSopenharmony_ci                                             FFMIN((row >> 1) - !(row & 1),
182cabdff1aSopenharmony_ci                                                   pic_height - 1),
183cabdff1aSopenharmony_ci                                             1);
184cabdff1aSopenharmony_ci                    ff_thread_await_progress(&ref_pic->parent->tf,
185cabdff1aSopenharmony_ci                                             FFMIN((row >> 1), pic_height - 1),
186cabdff1aSopenharmony_ci                                             0);
187cabdff1aSopenharmony_ci                } else if (FIELD_PICTURE(h) && !ref_field_picture) { // field referencing one field of a frame
188cabdff1aSopenharmony_ci                    ff_thread_await_progress(&ref_pic->parent->tf,
189cabdff1aSopenharmony_ci                                             FFMIN(row * 2 + ref_field,
190cabdff1aSopenharmony_ci                                                   pic_height - 1),
191cabdff1aSopenharmony_ci                                             0);
192cabdff1aSopenharmony_ci                } else if (FIELD_PICTURE(h)) {
193cabdff1aSopenharmony_ci                    ff_thread_await_progress(&ref_pic->parent->tf,
194cabdff1aSopenharmony_ci                                             FFMIN(row, pic_height - 1),
195cabdff1aSopenharmony_ci                                             ref_field);
196cabdff1aSopenharmony_ci                } else {
197cabdff1aSopenharmony_ci                    ff_thread_await_progress(&ref_pic->parent->tf,
198cabdff1aSopenharmony_ci                                             FFMIN(row, pic_height - 1),
199cabdff1aSopenharmony_ci                                             0);
200cabdff1aSopenharmony_ci                }
201cabdff1aSopenharmony_ci            }
202cabdff1aSopenharmony_ci        }
203cabdff1aSopenharmony_ci}
204cabdff1aSopenharmony_ci
205cabdff1aSopenharmony_cistatic av_always_inline void mc_dir_part(const H264Context *h, H264SliceContext *sl,
206cabdff1aSopenharmony_ci                                         H264Ref *pic,
207cabdff1aSopenharmony_ci                                         int n, int square, int height,
208cabdff1aSopenharmony_ci                                         int delta, int list,
209cabdff1aSopenharmony_ci                                         uint8_t *dest_y, uint8_t *dest_cb,
210cabdff1aSopenharmony_ci                                         uint8_t *dest_cr,
211cabdff1aSopenharmony_ci                                         int src_x_offset, int src_y_offset,
212cabdff1aSopenharmony_ci                                         const qpel_mc_func *qpix_op,
213cabdff1aSopenharmony_ci                                         h264_chroma_mc_func chroma_op,
214cabdff1aSopenharmony_ci                                         int pixel_shift, int chroma_idc)
215cabdff1aSopenharmony_ci{
216cabdff1aSopenharmony_ci    const int mx      = sl->mv_cache[list][scan8[n]][0] + src_x_offset * 8;
217cabdff1aSopenharmony_ci    int my            = sl->mv_cache[list][scan8[n]][1] + src_y_offset * 8;
218cabdff1aSopenharmony_ci    const int luma_xy = (mx & 3) + ((my & 3) << 2);
219cabdff1aSopenharmony_ci    ptrdiff_t offset  = (mx >> 2) * (1 << pixel_shift) + (my >> 2) * sl->mb_linesize;
220cabdff1aSopenharmony_ci    uint8_t *src_y    = pic->data[0] + offset;
221cabdff1aSopenharmony_ci    uint8_t *src_cb, *src_cr;
222cabdff1aSopenharmony_ci    int extra_width  = 0;
223cabdff1aSopenharmony_ci    int extra_height = 0;
224cabdff1aSopenharmony_ci    int emu = 0;
225cabdff1aSopenharmony_ci    const int full_mx    = mx >> 2;
226cabdff1aSopenharmony_ci    const int full_my    = my >> 2;
227cabdff1aSopenharmony_ci    const int pic_width  = 16 * h->mb_width;
228cabdff1aSopenharmony_ci    const int pic_height = 16 * h->mb_height >> MB_FIELD(sl);
229cabdff1aSopenharmony_ci    int ysh;
230cabdff1aSopenharmony_ci
231cabdff1aSopenharmony_ci    if (mx & 7)
232cabdff1aSopenharmony_ci        extra_width -= 3;
233cabdff1aSopenharmony_ci    if (my & 7)
234cabdff1aSopenharmony_ci        extra_height -= 3;
235cabdff1aSopenharmony_ci
236cabdff1aSopenharmony_ci    if (full_mx                <          0 - extra_width  ||
237cabdff1aSopenharmony_ci        full_my                <          0 - extra_height ||
238cabdff1aSopenharmony_ci        full_mx + 16 /*FIXME*/ > pic_width  + extra_width  ||
239cabdff1aSopenharmony_ci        full_my + 16 /*FIXME*/ > pic_height + extra_height) {
240cabdff1aSopenharmony_ci        h->vdsp.emulated_edge_mc(sl->edge_emu_buffer,
241cabdff1aSopenharmony_ci                                 src_y - (2 << pixel_shift) - 2 * sl->mb_linesize,
242cabdff1aSopenharmony_ci                                 sl->mb_linesize, sl->mb_linesize,
243cabdff1aSopenharmony_ci                                 16 + 5, 16 + 5 /*FIXME*/, full_mx - 2,
244cabdff1aSopenharmony_ci                                 full_my - 2, pic_width, pic_height);
245cabdff1aSopenharmony_ci        src_y = sl->edge_emu_buffer + (2 << pixel_shift) + 2 * sl->mb_linesize;
246cabdff1aSopenharmony_ci        emu   = 1;
247cabdff1aSopenharmony_ci    }
248cabdff1aSopenharmony_ci
249cabdff1aSopenharmony_ci    qpix_op[luma_xy](dest_y, src_y, sl->mb_linesize); // FIXME try variable height perhaps?
250cabdff1aSopenharmony_ci    if (!square)
251cabdff1aSopenharmony_ci        qpix_op[luma_xy](dest_y + delta, src_y + delta, sl->mb_linesize);
252cabdff1aSopenharmony_ci
253cabdff1aSopenharmony_ci    if (CONFIG_GRAY && h->flags & AV_CODEC_FLAG_GRAY)
254cabdff1aSopenharmony_ci        return;
255cabdff1aSopenharmony_ci
256cabdff1aSopenharmony_ci    if (chroma_idc == 3 /* yuv444 */) {
257cabdff1aSopenharmony_ci        src_cb = pic->data[1] + offset;
258cabdff1aSopenharmony_ci        if (emu) {
259cabdff1aSopenharmony_ci            h->vdsp.emulated_edge_mc(sl->edge_emu_buffer,
260cabdff1aSopenharmony_ci                                     src_cb - (2 << pixel_shift) - 2 * sl->mb_linesize,
261cabdff1aSopenharmony_ci                                     sl->mb_linesize, sl->mb_linesize,
262cabdff1aSopenharmony_ci                                     16 + 5, 16 + 5 /*FIXME*/,
263cabdff1aSopenharmony_ci                                     full_mx - 2, full_my - 2,
264cabdff1aSopenharmony_ci                                     pic_width, pic_height);
265cabdff1aSopenharmony_ci            src_cb = sl->edge_emu_buffer + (2 << pixel_shift) + 2 * sl->mb_linesize;
266cabdff1aSopenharmony_ci        }
267cabdff1aSopenharmony_ci        qpix_op[luma_xy](dest_cb, src_cb, sl->mb_linesize); // FIXME try variable height perhaps?
268cabdff1aSopenharmony_ci        if (!square)
269cabdff1aSopenharmony_ci            qpix_op[luma_xy](dest_cb + delta, src_cb + delta, sl->mb_linesize);
270cabdff1aSopenharmony_ci
271cabdff1aSopenharmony_ci        src_cr = pic->data[2] + offset;
272cabdff1aSopenharmony_ci        if (emu) {
273cabdff1aSopenharmony_ci            h->vdsp.emulated_edge_mc(sl->edge_emu_buffer,
274cabdff1aSopenharmony_ci                                     src_cr - (2 << pixel_shift) - 2 * sl->mb_linesize,
275cabdff1aSopenharmony_ci                                     sl->mb_linesize, sl->mb_linesize,
276cabdff1aSopenharmony_ci                                     16 + 5, 16 + 5 /*FIXME*/,
277cabdff1aSopenharmony_ci                                     full_mx - 2, full_my - 2,
278cabdff1aSopenharmony_ci                                     pic_width, pic_height);
279cabdff1aSopenharmony_ci            src_cr = sl->edge_emu_buffer + (2 << pixel_shift) + 2 * sl->mb_linesize;
280cabdff1aSopenharmony_ci        }
281cabdff1aSopenharmony_ci        qpix_op[luma_xy](dest_cr, src_cr, sl->mb_linesize); // FIXME try variable height perhaps?
282cabdff1aSopenharmony_ci        if (!square)
283cabdff1aSopenharmony_ci            qpix_op[luma_xy](dest_cr + delta, src_cr + delta, sl->mb_linesize);
284cabdff1aSopenharmony_ci        return;
285cabdff1aSopenharmony_ci    }
286cabdff1aSopenharmony_ci
287cabdff1aSopenharmony_ci    ysh = 3 - (chroma_idc == 2 /* yuv422 */);
288cabdff1aSopenharmony_ci    if (chroma_idc == 1 /* yuv420 */ && MB_FIELD(sl)) {
289cabdff1aSopenharmony_ci        // chroma offset when predicting from a field of opposite parity
290cabdff1aSopenharmony_ci        my  += 2 * ((sl->mb_y & 1) - (pic->reference - 1));
291cabdff1aSopenharmony_ci        emu |= (my >> 3) < 0 || (my >> 3) + 8 >= (pic_height >> 1);
292cabdff1aSopenharmony_ci    }
293cabdff1aSopenharmony_ci
294cabdff1aSopenharmony_ci    src_cb = pic->data[1] + ((mx >> 3) * (1 << pixel_shift)) +
295cabdff1aSopenharmony_ci             (my >> ysh) * sl->mb_uvlinesize;
296cabdff1aSopenharmony_ci    src_cr = pic->data[2] + ((mx >> 3) * (1 << pixel_shift)) +
297cabdff1aSopenharmony_ci             (my >> ysh) * sl->mb_uvlinesize;
298cabdff1aSopenharmony_ci
299cabdff1aSopenharmony_ci    if (emu) {
300cabdff1aSopenharmony_ci        h->vdsp.emulated_edge_mc(sl->edge_emu_buffer, src_cb,
301cabdff1aSopenharmony_ci                                 sl->mb_uvlinesize, sl->mb_uvlinesize,
302cabdff1aSopenharmony_ci                                 9, 8 * chroma_idc + 1, (mx >> 3), (my >> ysh),
303cabdff1aSopenharmony_ci                                 pic_width >> 1, pic_height >> (chroma_idc == 1 /* yuv420 */));
304cabdff1aSopenharmony_ci        src_cb = sl->edge_emu_buffer;
305cabdff1aSopenharmony_ci    }
306cabdff1aSopenharmony_ci    chroma_op(dest_cb, src_cb, sl->mb_uvlinesize,
307cabdff1aSopenharmony_ci              height >> (chroma_idc == 1 /* yuv420 */),
308cabdff1aSopenharmony_ci              mx & 7, ((unsigned)my << (chroma_idc == 2 /* yuv422 */)) & 7);
309cabdff1aSopenharmony_ci
310cabdff1aSopenharmony_ci    if (emu) {
311cabdff1aSopenharmony_ci        h->vdsp.emulated_edge_mc(sl->edge_emu_buffer, src_cr,
312cabdff1aSopenharmony_ci                                 sl->mb_uvlinesize, sl->mb_uvlinesize,
313cabdff1aSopenharmony_ci                                 9, 8 * chroma_idc + 1, (mx >> 3), (my >> ysh),
314cabdff1aSopenharmony_ci                                 pic_width >> 1, pic_height >> (chroma_idc == 1 /* yuv420 */));
315cabdff1aSopenharmony_ci        src_cr = sl->edge_emu_buffer;
316cabdff1aSopenharmony_ci    }
317cabdff1aSopenharmony_ci    chroma_op(dest_cr, src_cr, sl->mb_uvlinesize, height >> (chroma_idc == 1 /* yuv420 */),
318cabdff1aSopenharmony_ci              mx & 7, ((unsigned)my << (chroma_idc == 2 /* yuv422 */)) & 7);
319cabdff1aSopenharmony_ci}
320cabdff1aSopenharmony_ci
321cabdff1aSopenharmony_cistatic av_always_inline void mc_part_std(const H264Context *h, H264SliceContext *sl,
322cabdff1aSopenharmony_ci                                         int n, int square,
323cabdff1aSopenharmony_ci                                         int height, int delta,
324cabdff1aSopenharmony_ci                                         uint8_t *dest_y, uint8_t *dest_cb,
325cabdff1aSopenharmony_ci                                         uint8_t *dest_cr,
326cabdff1aSopenharmony_ci                                         int x_offset, int y_offset,
327cabdff1aSopenharmony_ci                                         const qpel_mc_func *qpix_put,
328cabdff1aSopenharmony_ci                                         h264_chroma_mc_func chroma_put,
329cabdff1aSopenharmony_ci                                         const qpel_mc_func *qpix_avg,
330cabdff1aSopenharmony_ci                                         h264_chroma_mc_func chroma_avg,
331cabdff1aSopenharmony_ci                                         int list0, int list1,
332cabdff1aSopenharmony_ci                                         int pixel_shift, int chroma_idc)
333cabdff1aSopenharmony_ci{
334cabdff1aSopenharmony_ci    const qpel_mc_func *qpix_op   = qpix_put;
335cabdff1aSopenharmony_ci    h264_chroma_mc_func chroma_op = chroma_put;
336cabdff1aSopenharmony_ci
337cabdff1aSopenharmony_ci    dest_y += (2 * x_offset << pixel_shift) + 2 * y_offset * sl->mb_linesize;
338cabdff1aSopenharmony_ci    if (chroma_idc == 3 /* yuv444 */) {
339cabdff1aSopenharmony_ci        dest_cb += (2 * x_offset << pixel_shift) + 2 * y_offset * sl->mb_linesize;
340cabdff1aSopenharmony_ci        dest_cr += (2 * x_offset << pixel_shift) + 2 * y_offset * sl->mb_linesize;
341cabdff1aSopenharmony_ci    } else if (chroma_idc == 2 /* yuv422 */) {
342cabdff1aSopenharmony_ci        dest_cb += (x_offset << pixel_shift) + 2 * y_offset * sl->mb_uvlinesize;
343cabdff1aSopenharmony_ci        dest_cr += (x_offset << pixel_shift) + 2 * y_offset * sl->mb_uvlinesize;
344cabdff1aSopenharmony_ci    } else { /* yuv420 */
345cabdff1aSopenharmony_ci        dest_cb += (x_offset << pixel_shift) + y_offset * sl->mb_uvlinesize;
346cabdff1aSopenharmony_ci        dest_cr += (x_offset << pixel_shift) + y_offset * sl->mb_uvlinesize;
347cabdff1aSopenharmony_ci    }
348cabdff1aSopenharmony_ci    x_offset += 8 * sl->mb_x;
349cabdff1aSopenharmony_ci    y_offset += 8 * (sl->mb_y >> MB_FIELD(sl));
350cabdff1aSopenharmony_ci
351cabdff1aSopenharmony_ci    if (list0) {
352cabdff1aSopenharmony_ci        H264Ref *ref = &sl->ref_list[0][sl->ref_cache[0][scan8[n]]];
353cabdff1aSopenharmony_ci        mc_dir_part(h, sl, ref, n, square, height, delta, 0,
354cabdff1aSopenharmony_ci                    dest_y, dest_cb, dest_cr, x_offset, y_offset,
355cabdff1aSopenharmony_ci                    qpix_op, chroma_op, pixel_shift, chroma_idc);
356cabdff1aSopenharmony_ci
357cabdff1aSopenharmony_ci        qpix_op   = qpix_avg;
358cabdff1aSopenharmony_ci        chroma_op = chroma_avg;
359cabdff1aSopenharmony_ci    }
360cabdff1aSopenharmony_ci
361cabdff1aSopenharmony_ci    if (list1) {
362cabdff1aSopenharmony_ci        H264Ref *ref = &sl->ref_list[1][sl->ref_cache[1][scan8[n]]];
363cabdff1aSopenharmony_ci        mc_dir_part(h, sl, ref, n, square, height, delta, 1,
364cabdff1aSopenharmony_ci                    dest_y, dest_cb, dest_cr, x_offset, y_offset,
365cabdff1aSopenharmony_ci                    qpix_op, chroma_op, pixel_shift, chroma_idc);
366cabdff1aSopenharmony_ci    }
367cabdff1aSopenharmony_ci}
368cabdff1aSopenharmony_ci
369cabdff1aSopenharmony_cistatic av_always_inline void mc_part_weighted(const H264Context *h, H264SliceContext *sl,
370cabdff1aSopenharmony_ci                                              int n, int square,
371cabdff1aSopenharmony_ci                                              int height, int delta,
372cabdff1aSopenharmony_ci                                              uint8_t *dest_y, uint8_t *dest_cb,
373cabdff1aSopenharmony_ci                                              uint8_t *dest_cr,
374cabdff1aSopenharmony_ci                                              int x_offset, int y_offset,
375cabdff1aSopenharmony_ci                                              const qpel_mc_func *qpix_put,
376cabdff1aSopenharmony_ci                                              h264_chroma_mc_func chroma_put,
377cabdff1aSopenharmony_ci                                              h264_weight_func luma_weight_op,
378cabdff1aSopenharmony_ci                                              h264_weight_func chroma_weight_op,
379cabdff1aSopenharmony_ci                                              h264_biweight_func luma_weight_avg,
380cabdff1aSopenharmony_ci                                              h264_biweight_func chroma_weight_avg,
381cabdff1aSopenharmony_ci                                              int list0, int list1,
382cabdff1aSopenharmony_ci                                              int pixel_shift, int chroma_idc)
383cabdff1aSopenharmony_ci{
384cabdff1aSopenharmony_ci    int chroma_height;
385cabdff1aSopenharmony_ci
386cabdff1aSopenharmony_ci    dest_y += (2 * x_offset << pixel_shift) + 2 * y_offset * sl->mb_linesize;
387cabdff1aSopenharmony_ci    if (chroma_idc == 3 /* yuv444 */) {
388cabdff1aSopenharmony_ci        chroma_height     = height;
389cabdff1aSopenharmony_ci        chroma_weight_avg = luma_weight_avg;
390cabdff1aSopenharmony_ci        chroma_weight_op  = luma_weight_op;
391cabdff1aSopenharmony_ci        dest_cb += (2 * x_offset << pixel_shift) + 2 * y_offset * sl->mb_linesize;
392cabdff1aSopenharmony_ci        dest_cr += (2 * x_offset << pixel_shift) + 2 * y_offset * sl->mb_linesize;
393cabdff1aSopenharmony_ci    } else if (chroma_idc == 2 /* yuv422 */) {
394cabdff1aSopenharmony_ci        chroma_height = height;
395cabdff1aSopenharmony_ci        dest_cb      += (x_offset << pixel_shift) + 2 * y_offset * sl->mb_uvlinesize;
396cabdff1aSopenharmony_ci        dest_cr      += (x_offset << pixel_shift) + 2 * y_offset * sl->mb_uvlinesize;
397cabdff1aSopenharmony_ci    } else { /* yuv420 */
398cabdff1aSopenharmony_ci        chroma_height = height >> 1;
399cabdff1aSopenharmony_ci        dest_cb      += (x_offset << pixel_shift) + y_offset * sl->mb_uvlinesize;
400cabdff1aSopenharmony_ci        dest_cr      += (x_offset << pixel_shift) + y_offset * sl->mb_uvlinesize;
401cabdff1aSopenharmony_ci    }
402cabdff1aSopenharmony_ci    x_offset += 8 * sl->mb_x;
403cabdff1aSopenharmony_ci    y_offset += 8 * (sl->mb_y >> MB_FIELD(sl));
404cabdff1aSopenharmony_ci
405cabdff1aSopenharmony_ci    if (list0 && list1) {
406cabdff1aSopenharmony_ci        /* don't optimize for luma-only case, since B-frames usually
407cabdff1aSopenharmony_ci         * use implicit weights => chroma too. */
408cabdff1aSopenharmony_ci        uint8_t *tmp_cb = sl->bipred_scratchpad;
409cabdff1aSopenharmony_ci        uint8_t *tmp_cr = sl->bipred_scratchpad + (16 << pixel_shift);
410cabdff1aSopenharmony_ci        uint8_t *tmp_y  = sl->bipred_scratchpad + 16 * sl->mb_uvlinesize;
411cabdff1aSopenharmony_ci        int refn0       = sl->ref_cache[0][scan8[n]];
412cabdff1aSopenharmony_ci        int refn1       = sl->ref_cache[1][scan8[n]];
413cabdff1aSopenharmony_ci
414cabdff1aSopenharmony_ci        mc_dir_part(h, sl, &sl->ref_list[0][refn0], n, square, height, delta, 0,
415cabdff1aSopenharmony_ci                    dest_y, dest_cb, dest_cr,
416cabdff1aSopenharmony_ci                    x_offset, y_offset, qpix_put, chroma_put,
417cabdff1aSopenharmony_ci                    pixel_shift, chroma_idc);
418cabdff1aSopenharmony_ci        mc_dir_part(h, sl, &sl->ref_list[1][refn1], n, square, height, delta, 1,
419cabdff1aSopenharmony_ci                    tmp_y, tmp_cb, tmp_cr,
420cabdff1aSopenharmony_ci                    x_offset, y_offset, qpix_put, chroma_put,
421cabdff1aSopenharmony_ci                    pixel_shift, chroma_idc);
422cabdff1aSopenharmony_ci
423cabdff1aSopenharmony_ci        if (sl->pwt.use_weight == 2) {
424cabdff1aSopenharmony_ci            int weight0 = sl->pwt.implicit_weight[refn0][refn1][sl->mb_y & 1];
425cabdff1aSopenharmony_ci            int weight1 = 64 - weight0;
426cabdff1aSopenharmony_ci            luma_weight_avg(dest_y, tmp_y, sl->mb_linesize,
427cabdff1aSopenharmony_ci                            height, 5, weight0, weight1, 0);
428cabdff1aSopenharmony_ci            if (!CONFIG_GRAY || !(h->flags & AV_CODEC_FLAG_GRAY)) {
429cabdff1aSopenharmony_ci                chroma_weight_avg(dest_cb, tmp_cb, sl->mb_uvlinesize,
430cabdff1aSopenharmony_ci                                  chroma_height, 5, weight0, weight1, 0);
431cabdff1aSopenharmony_ci                chroma_weight_avg(dest_cr, tmp_cr, sl->mb_uvlinesize,
432cabdff1aSopenharmony_ci                                  chroma_height, 5, weight0, weight1, 0);
433cabdff1aSopenharmony_ci            }
434cabdff1aSopenharmony_ci        } else {
435cabdff1aSopenharmony_ci            luma_weight_avg(dest_y, tmp_y, sl->mb_linesize, height,
436cabdff1aSopenharmony_ci                            sl->pwt.luma_log2_weight_denom,
437cabdff1aSopenharmony_ci                            sl->pwt.luma_weight[refn0][0][0],
438cabdff1aSopenharmony_ci                            sl->pwt.luma_weight[refn1][1][0],
439cabdff1aSopenharmony_ci                            sl->pwt.luma_weight[refn0][0][1] +
440cabdff1aSopenharmony_ci                            sl->pwt.luma_weight[refn1][1][1]);
441cabdff1aSopenharmony_ci            if (!CONFIG_GRAY || !(h->flags & AV_CODEC_FLAG_GRAY)) {
442cabdff1aSopenharmony_ci                chroma_weight_avg(dest_cb, tmp_cb, sl->mb_uvlinesize, chroma_height,
443cabdff1aSopenharmony_ci                                  sl->pwt.chroma_log2_weight_denom,
444cabdff1aSopenharmony_ci                                  sl->pwt.chroma_weight[refn0][0][0][0],
445cabdff1aSopenharmony_ci                                  sl->pwt.chroma_weight[refn1][1][0][0],
446cabdff1aSopenharmony_ci                                  sl->pwt.chroma_weight[refn0][0][0][1] +
447cabdff1aSopenharmony_ci                                  sl->pwt.chroma_weight[refn1][1][0][1]);
448cabdff1aSopenharmony_ci                chroma_weight_avg(dest_cr, tmp_cr, sl->mb_uvlinesize, chroma_height,
449cabdff1aSopenharmony_ci                                  sl->pwt.chroma_log2_weight_denom,
450cabdff1aSopenharmony_ci                                  sl->pwt.chroma_weight[refn0][0][1][0],
451cabdff1aSopenharmony_ci                                  sl->pwt.chroma_weight[refn1][1][1][0],
452cabdff1aSopenharmony_ci                                  sl->pwt.chroma_weight[refn0][0][1][1] +
453cabdff1aSopenharmony_ci                                  sl->pwt.chroma_weight[refn1][1][1][1]);
454cabdff1aSopenharmony_ci            }
455cabdff1aSopenharmony_ci        }
456cabdff1aSopenharmony_ci    } else {
457cabdff1aSopenharmony_ci        int list     = list1 ? 1 : 0;
458cabdff1aSopenharmony_ci        int refn     = sl->ref_cache[list][scan8[n]];
459cabdff1aSopenharmony_ci        H264Ref *ref = &sl->ref_list[list][refn];
460cabdff1aSopenharmony_ci        mc_dir_part(h, sl, ref, n, square, height, delta, list,
461cabdff1aSopenharmony_ci                    dest_y, dest_cb, dest_cr, x_offset, y_offset,
462cabdff1aSopenharmony_ci                    qpix_put, chroma_put, pixel_shift, chroma_idc);
463cabdff1aSopenharmony_ci
464cabdff1aSopenharmony_ci        luma_weight_op(dest_y, sl->mb_linesize, height,
465cabdff1aSopenharmony_ci                       sl->pwt.luma_log2_weight_denom,
466cabdff1aSopenharmony_ci                       sl->pwt.luma_weight[refn][list][0],
467cabdff1aSopenharmony_ci                       sl->pwt.luma_weight[refn][list][1]);
468cabdff1aSopenharmony_ci        if (!CONFIG_GRAY || !(h->flags & AV_CODEC_FLAG_GRAY)) {
469cabdff1aSopenharmony_ci            if (sl->pwt.use_weight_chroma) {
470cabdff1aSopenharmony_ci                chroma_weight_op(dest_cb, sl->mb_uvlinesize, chroma_height,
471cabdff1aSopenharmony_ci                                 sl->pwt.chroma_log2_weight_denom,
472cabdff1aSopenharmony_ci                                 sl->pwt.chroma_weight[refn][list][0][0],
473cabdff1aSopenharmony_ci                                 sl->pwt.chroma_weight[refn][list][0][1]);
474cabdff1aSopenharmony_ci                chroma_weight_op(dest_cr, sl->mb_uvlinesize, chroma_height,
475cabdff1aSopenharmony_ci                                 sl->pwt.chroma_log2_weight_denom,
476cabdff1aSopenharmony_ci                                 sl->pwt.chroma_weight[refn][list][1][0],
477cabdff1aSopenharmony_ci                                 sl->pwt.chroma_weight[refn][list][1][1]);
478cabdff1aSopenharmony_ci            }
479cabdff1aSopenharmony_ci        }
480cabdff1aSopenharmony_ci    }
481cabdff1aSopenharmony_ci}
482cabdff1aSopenharmony_ci
483cabdff1aSopenharmony_cistatic av_always_inline void prefetch_motion(const H264Context *h, H264SliceContext *sl,
484cabdff1aSopenharmony_ci                                             int list, int pixel_shift,
485cabdff1aSopenharmony_ci                                             int chroma_idc)
486cabdff1aSopenharmony_ci{
487cabdff1aSopenharmony_ci    /* fetch pixels for estimated mv 4 macroblocks ahead
488cabdff1aSopenharmony_ci     * optimized for 64byte cache lines */
489cabdff1aSopenharmony_ci    const int refn = sl->ref_cache[list][scan8[0]];
490cabdff1aSopenharmony_ci    if (refn >= 0) {
491cabdff1aSopenharmony_ci        const int mx  = (sl->mv_cache[list][scan8[0]][0] >> 2) + 16 * sl->mb_x + 8;
492cabdff1aSopenharmony_ci        const int my  = (sl->mv_cache[list][scan8[0]][1] >> 2) + 16 * sl->mb_y;
493cabdff1aSopenharmony_ci        uint8_t **src = sl->ref_list[list][refn].data;
494cabdff1aSopenharmony_ci        int off       =  mx * (1<< pixel_shift) +
495cabdff1aSopenharmony_ci                        (my + (sl->mb_x & 3) * 4) * sl->mb_linesize +
496cabdff1aSopenharmony_ci                        (64 << pixel_shift);
497cabdff1aSopenharmony_ci        h->vdsp.prefetch(src[0] + off, sl->linesize, 4);
498cabdff1aSopenharmony_ci        if (chroma_idc == 3 /* yuv444 */) {
499cabdff1aSopenharmony_ci            h->vdsp.prefetch(src[1] + off, sl->linesize, 4);
500cabdff1aSopenharmony_ci            h->vdsp.prefetch(src[2] + off, sl->linesize, 4);
501cabdff1aSopenharmony_ci        } else {
502cabdff1aSopenharmony_ci            off= ((mx>>1)+64) * (1<<pixel_shift) + ((my>>1) + (sl->mb_x&7))*sl->uvlinesize;
503cabdff1aSopenharmony_ci            h->vdsp.prefetch(src[1] + off, src[2] - src[1], 2);
504cabdff1aSopenharmony_ci        }
505cabdff1aSopenharmony_ci    }
506cabdff1aSopenharmony_ci}
507cabdff1aSopenharmony_ci
508cabdff1aSopenharmony_cistatic av_always_inline void xchg_mb_border(const H264Context *h, H264SliceContext *sl,
509cabdff1aSopenharmony_ci                                            uint8_t *src_y,
510cabdff1aSopenharmony_ci                                            uint8_t *src_cb, uint8_t *src_cr,
511cabdff1aSopenharmony_ci                                            int linesize, int uvlinesize,
512cabdff1aSopenharmony_ci                                            int xchg, int chroma444,
513cabdff1aSopenharmony_ci                                            int simple, int pixel_shift)
514cabdff1aSopenharmony_ci{
515cabdff1aSopenharmony_ci    int deblock_topleft;
516cabdff1aSopenharmony_ci    int deblock_top;
517cabdff1aSopenharmony_ci    int top_idx = 1;
518cabdff1aSopenharmony_ci    uint8_t *top_border_m1;
519cabdff1aSopenharmony_ci    uint8_t *top_border;
520cabdff1aSopenharmony_ci
521cabdff1aSopenharmony_ci    if (!simple && FRAME_MBAFF(h)) {
522cabdff1aSopenharmony_ci        if (sl->mb_y & 1) {
523cabdff1aSopenharmony_ci            if (!MB_MBAFF(sl))
524cabdff1aSopenharmony_ci                return;
525cabdff1aSopenharmony_ci        } else {
526cabdff1aSopenharmony_ci            top_idx = MB_MBAFF(sl) ? 0 : 1;
527cabdff1aSopenharmony_ci        }
528cabdff1aSopenharmony_ci    }
529cabdff1aSopenharmony_ci
530cabdff1aSopenharmony_ci    if (sl->deblocking_filter == 2) {
531cabdff1aSopenharmony_ci        deblock_topleft = h->slice_table[sl->mb_xy - 1 - h->mb_stride] == sl->slice_num;
532cabdff1aSopenharmony_ci        deblock_top     = sl->top_type;
533cabdff1aSopenharmony_ci    } else {
534cabdff1aSopenharmony_ci        deblock_topleft = (sl->mb_x > 0);
535cabdff1aSopenharmony_ci        deblock_top     = (sl->mb_y > !!MB_FIELD(sl));
536cabdff1aSopenharmony_ci    }
537cabdff1aSopenharmony_ci
538cabdff1aSopenharmony_ci    src_y  -= linesize   + 1 + pixel_shift;
539cabdff1aSopenharmony_ci    src_cb -= uvlinesize + 1 + pixel_shift;
540cabdff1aSopenharmony_ci    src_cr -= uvlinesize + 1 + pixel_shift;
541cabdff1aSopenharmony_ci
542cabdff1aSopenharmony_ci    top_border_m1 = sl->top_borders[top_idx][sl->mb_x - 1];
543cabdff1aSopenharmony_ci    top_border    = sl->top_borders[top_idx][sl->mb_x];
544cabdff1aSopenharmony_ci
545cabdff1aSopenharmony_ci#define XCHG(a, b, xchg)                        \
546cabdff1aSopenharmony_ci    if (pixel_shift) {                          \
547cabdff1aSopenharmony_ci        if (xchg) {                             \
548cabdff1aSopenharmony_ci            AV_SWAP64(b + 0, a + 0);            \
549cabdff1aSopenharmony_ci            AV_SWAP64(b + 8, a + 8);            \
550cabdff1aSopenharmony_ci        } else {                                \
551cabdff1aSopenharmony_ci            AV_COPY128(b, a);                   \
552cabdff1aSopenharmony_ci        }                                       \
553cabdff1aSopenharmony_ci    } else if (xchg)                            \
554cabdff1aSopenharmony_ci        AV_SWAP64(b, a);                        \
555cabdff1aSopenharmony_ci    else                                        \
556cabdff1aSopenharmony_ci        AV_COPY64(b, a);
557cabdff1aSopenharmony_ci
558cabdff1aSopenharmony_ci    if (deblock_top) {
559cabdff1aSopenharmony_ci        if (deblock_topleft) {
560cabdff1aSopenharmony_ci            XCHG(top_border_m1 + (8 << pixel_shift),
561cabdff1aSopenharmony_ci                 src_y - (7 << pixel_shift), 1);
562cabdff1aSopenharmony_ci        }
563cabdff1aSopenharmony_ci        XCHG(top_border + (0 << pixel_shift), src_y + (1 << pixel_shift), xchg);
564cabdff1aSopenharmony_ci        XCHG(top_border + (8 << pixel_shift), src_y + (9 << pixel_shift), 1);
565cabdff1aSopenharmony_ci        if (sl->mb_x + 1 < h->mb_width) {
566cabdff1aSopenharmony_ci            XCHG(sl->top_borders[top_idx][sl->mb_x + 1],
567cabdff1aSopenharmony_ci                 src_y + (17 << pixel_shift), 1);
568cabdff1aSopenharmony_ci        }
569cabdff1aSopenharmony_ci        if (simple || !CONFIG_GRAY || !(h->flags & AV_CODEC_FLAG_GRAY)) {
570cabdff1aSopenharmony_ci            if (chroma444) {
571cabdff1aSopenharmony_ci                if (deblock_topleft) {
572cabdff1aSopenharmony_ci                    XCHG(top_border_m1 + (24 << pixel_shift), src_cb - (7 << pixel_shift), 1);
573cabdff1aSopenharmony_ci                    XCHG(top_border_m1 + (40 << pixel_shift), src_cr - (7 << pixel_shift), 1);
574cabdff1aSopenharmony_ci                }
575cabdff1aSopenharmony_ci                XCHG(top_border + (16 << pixel_shift), src_cb + (1 << pixel_shift), xchg);
576cabdff1aSopenharmony_ci                XCHG(top_border + (24 << pixel_shift), src_cb + (9 << pixel_shift), 1);
577cabdff1aSopenharmony_ci                XCHG(top_border + (32 << pixel_shift), src_cr + (1 << pixel_shift), xchg);
578cabdff1aSopenharmony_ci                XCHG(top_border + (40 << pixel_shift), src_cr + (9 << pixel_shift), 1);
579cabdff1aSopenharmony_ci                if (sl->mb_x + 1 < h->mb_width) {
580cabdff1aSopenharmony_ci                    XCHG(sl->top_borders[top_idx][sl->mb_x + 1] + (16 << pixel_shift), src_cb + (17 << pixel_shift), 1);
581cabdff1aSopenharmony_ci                    XCHG(sl->top_borders[top_idx][sl->mb_x + 1] + (32 << pixel_shift), src_cr + (17 << pixel_shift), 1);
582cabdff1aSopenharmony_ci                }
583cabdff1aSopenharmony_ci            } else {
584cabdff1aSopenharmony_ci                if (deblock_topleft) {
585cabdff1aSopenharmony_ci                    XCHG(top_border_m1 + (16 << pixel_shift), src_cb - (7 << pixel_shift), 1);
586cabdff1aSopenharmony_ci                    XCHG(top_border_m1 + (24 << pixel_shift), src_cr - (7 << pixel_shift), 1);
587cabdff1aSopenharmony_ci                }
588cabdff1aSopenharmony_ci                XCHG(top_border + (16 << pixel_shift), src_cb + 1 + pixel_shift, 1);
589cabdff1aSopenharmony_ci                XCHG(top_border + (24 << pixel_shift), src_cr + 1 + pixel_shift, 1);
590cabdff1aSopenharmony_ci            }
591cabdff1aSopenharmony_ci        }
592cabdff1aSopenharmony_ci    }
593cabdff1aSopenharmony_ci}
594cabdff1aSopenharmony_ci
595cabdff1aSopenharmony_cistatic av_always_inline int dctcoef_get(int16_t *mb, int high_bit_depth,
596cabdff1aSopenharmony_ci                                        int index)
597cabdff1aSopenharmony_ci{
598cabdff1aSopenharmony_ci    if (high_bit_depth) {
599cabdff1aSopenharmony_ci        return AV_RN32A(((int32_t *)mb) + index);
600cabdff1aSopenharmony_ci    } else
601cabdff1aSopenharmony_ci        return AV_RN16A(mb + index);
602cabdff1aSopenharmony_ci}
603cabdff1aSopenharmony_ci
604cabdff1aSopenharmony_cistatic av_always_inline void dctcoef_set(int16_t *mb, int high_bit_depth,
605cabdff1aSopenharmony_ci                                         int index, int value)
606cabdff1aSopenharmony_ci{
607cabdff1aSopenharmony_ci    if (high_bit_depth) {
608cabdff1aSopenharmony_ci        AV_WN32A(((int32_t *)mb) + index, value);
609cabdff1aSopenharmony_ci    } else
610cabdff1aSopenharmony_ci        AV_WN16A(mb + index, value);
611cabdff1aSopenharmony_ci}
612cabdff1aSopenharmony_ci
613cabdff1aSopenharmony_cistatic av_always_inline void hl_decode_mb_predict_luma(const H264Context *h,
614cabdff1aSopenharmony_ci                                                       H264SliceContext *sl,
615cabdff1aSopenharmony_ci                                                       int mb_type, int simple,
616cabdff1aSopenharmony_ci                                                       int transform_bypass,
617cabdff1aSopenharmony_ci                                                       int pixel_shift,
618cabdff1aSopenharmony_ci                                                       const int *block_offset,
619cabdff1aSopenharmony_ci                                                       int linesize,
620cabdff1aSopenharmony_ci                                                       uint8_t *dest_y, int p)
621cabdff1aSopenharmony_ci{
622cabdff1aSopenharmony_ci    void (*idct_add)(uint8_t *dst, int16_t *block, int stride);
623cabdff1aSopenharmony_ci    void (*idct_dc_add)(uint8_t *dst, int16_t *block, int stride);
624cabdff1aSopenharmony_ci    int i;
625cabdff1aSopenharmony_ci    int qscale = p == 0 ? sl->qscale : sl->chroma_qp[p - 1];
626cabdff1aSopenharmony_ci    block_offset += 16 * p;
627cabdff1aSopenharmony_ci    if (IS_INTRA4x4(mb_type)) {
628cabdff1aSopenharmony_ci        if (IS_8x8DCT(mb_type)) {
629cabdff1aSopenharmony_ci            if (transform_bypass) {
630cabdff1aSopenharmony_ci                idct_dc_add =
631cabdff1aSopenharmony_ci                idct_add    = h->h264dsp.h264_add_pixels8_clear;
632cabdff1aSopenharmony_ci            } else {
633cabdff1aSopenharmony_ci                idct_dc_add = h->h264dsp.h264_idct8_dc_add;
634cabdff1aSopenharmony_ci                idct_add    = h->h264dsp.h264_idct8_add;
635cabdff1aSopenharmony_ci            }
636cabdff1aSopenharmony_ci            for (i = 0; i < 16; i += 4) {
637cabdff1aSopenharmony_ci                uint8_t *const ptr = dest_y + block_offset[i];
638cabdff1aSopenharmony_ci                const int dir      = sl->intra4x4_pred_mode_cache[scan8[i]];
639cabdff1aSopenharmony_ci                if (transform_bypass && h->ps.sps->profile_idc == 244 && dir <= 1) {
640cabdff1aSopenharmony_ci                    if (h->x264_build < 151U) {
641cabdff1aSopenharmony_ci                        h->hpc.pred8x8l_add[dir](ptr, sl->mb + (i * 16 + p * 256 << pixel_shift), linesize);
642cabdff1aSopenharmony_ci                    } else
643cabdff1aSopenharmony_ci                        h->hpc.pred8x8l_filter_add[dir](ptr, sl->mb + (i * 16 + p * 256 << pixel_shift),
644cabdff1aSopenharmony_ci                                                        (sl-> topleft_samples_available << i) & 0x8000,
645cabdff1aSopenharmony_ci                                                        (sl->topright_samples_available << i) & 0x4000, linesize);
646cabdff1aSopenharmony_ci                } else {
647cabdff1aSopenharmony_ci                    const int nnz = sl->non_zero_count_cache[scan8[i + p * 16]];
648cabdff1aSopenharmony_ci                    h->hpc.pred8x8l[dir](ptr, (sl->topleft_samples_available << i) & 0x8000,
649cabdff1aSopenharmony_ci                                         (sl->topright_samples_available << i) & 0x4000, linesize);
650cabdff1aSopenharmony_ci                    if (nnz) {
651cabdff1aSopenharmony_ci                        if (nnz == 1 && dctcoef_get(sl->mb, pixel_shift, i * 16 + p * 256))
652cabdff1aSopenharmony_ci                            idct_dc_add(ptr, sl->mb + (i * 16 + p * 256 << pixel_shift), linesize);
653cabdff1aSopenharmony_ci                        else
654cabdff1aSopenharmony_ci                            idct_add(ptr, sl->mb + (i * 16 + p * 256 << pixel_shift), linesize);
655cabdff1aSopenharmony_ci                    }
656cabdff1aSopenharmony_ci                }
657cabdff1aSopenharmony_ci            }
658cabdff1aSopenharmony_ci        } else {
659cabdff1aSopenharmony_ci            if (transform_bypass) {
660cabdff1aSopenharmony_ci                idct_dc_add  =
661cabdff1aSopenharmony_ci                idct_add     = h->h264dsp.h264_add_pixels4_clear;
662cabdff1aSopenharmony_ci            } else {
663cabdff1aSopenharmony_ci                idct_dc_add = h->h264dsp.h264_idct_dc_add;
664cabdff1aSopenharmony_ci                idct_add    = h->h264dsp.h264_idct_add;
665cabdff1aSopenharmony_ci            }
666cabdff1aSopenharmony_ci            for (i = 0; i < 16; i++) {
667cabdff1aSopenharmony_ci                uint8_t *const ptr = dest_y + block_offset[i];
668cabdff1aSopenharmony_ci                const int dir      = sl->intra4x4_pred_mode_cache[scan8[i]];
669cabdff1aSopenharmony_ci
670cabdff1aSopenharmony_ci                if (transform_bypass && h->ps.sps->profile_idc == 244 && dir <= 1) {
671cabdff1aSopenharmony_ci                    h->hpc.pred4x4_add[dir](ptr, sl->mb + (i * 16 + p * 256 << pixel_shift), linesize);
672cabdff1aSopenharmony_ci                } else {
673cabdff1aSopenharmony_ci                    uint8_t *topright;
674cabdff1aSopenharmony_ci                    int nnz, tr;
675cabdff1aSopenharmony_ci                    uint64_t tr_high;
676cabdff1aSopenharmony_ci                    if (dir == DIAG_DOWN_LEFT_PRED || dir == VERT_LEFT_PRED) {
677cabdff1aSopenharmony_ci                        const int topright_avail = (sl->topright_samples_available << i) & 0x8000;
678cabdff1aSopenharmony_ci                        av_assert2(sl->mb_y || linesize <= block_offset[i]);
679cabdff1aSopenharmony_ci                        if (!topright_avail) {
680cabdff1aSopenharmony_ci                            if (pixel_shift) {
681cabdff1aSopenharmony_ci                                tr_high  = ((uint16_t *)ptr)[3 - linesize / 2] * 0x0001000100010001ULL;
682cabdff1aSopenharmony_ci                                topright = (uint8_t *)&tr_high;
683cabdff1aSopenharmony_ci                            } else {
684cabdff1aSopenharmony_ci                                tr       = ptr[3 - linesize] * 0x01010101u;
685cabdff1aSopenharmony_ci                                topright = (uint8_t *)&tr;
686cabdff1aSopenharmony_ci                            }
687cabdff1aSopenharmony_ci                        } else
688cabdff1aSopenharmony_ci                            topright = ptr + (4 << pixel_shift) - linesize;
689cabdff1aSopenharmony_ci                    } else
690cabdff1aSopenharmony_ci                        topright = NULL;
691cabdff1aSopenharmony_ci
692cabdff1aSopenharmony_ci                    h->hpc.pred4x4[dir](ptr, topright, linesize);
693cabdff1aSopenharmony_ci                    nnz = sl->non_zero_count_cache[scan8[i + p * 16]];
694cabdff1aSopenharmony_ci                    if (nnz) {
695cabdff1aSopenharmony_ci                        if (nnz == 1 && dctcoef_get(sl->mb, pixel_shift, i * 16 + p * 256))
696cabdff1aSopenharmony_ci                            idct_dc_add(ptr, sl->mb + (i * 16 + p * 256 << pixel_shift), linesize);
697cabdff1aSopenharmony_ci                        else
698cabdff1aSopenharmony_ci                            idct_add(ptr, sl->mb + (i * 16 + p * 256 << pixel_shift), linesize);
699cabdff1aSopenharmony_ci                    }
700cabdff1aSopenharmony_ci                }
701cabdff1aSopenharmony_ci            }
702cabdff1aSopenharmony_ci        }
703cabdff1aSopenharmony_ci    } else {
704cabdff1aSopenharmony_ci        h->hpc.pred16x16[sl->intra16x16_pred_mode](dest_y, linesize);
705cabdff1aSopenharmony_ci        if (sl->non_zero_count_cache[scan8[LUMA_DC_BLOCK_INDEX + p]]) {
706cabdff1aSopenharmony_ci            if (!transform_bypass)
707cabdff1aSopenharmony_ci                h->h264dsp.h264_luma_dc_dequant_idct(sl->mb + (p * 256 << pixel_shift),
708cabdff1aSopenharmony_ci                                                     sl->mb_luma_dc[p],
709cabdff1aSopenharmony_ci                                                     h->ps.pps->dequant4_coeff[p][qscale][0]);
710cabdff1aSopenharmony_ci            else {
711cabdff1aSopenharmony_ci                static const uint8_t dc_mapping[16] = {
712cabdff1aSopenharmony_ci                     0 * 16,  1 * 16,  4 * 16,  5 * 16,
713cabdff1aSopenharmony_ci                     2 * 16,  3 * 16,  6 * 16,  7 * 16,
714cabdff1aSopenharmony_ci                     8 * 16,  9 * 16, 12 * 16, 13 * 16,
715cabdff1aSopenharmony_ci                    10 * 16, 11 * 16, 14 * 16, 15 * 16
716cabdff1aSopenharmony_ci                };
717cabdff1aSopenharmony_ci                for (i = 0; i < 16; i++)
718cabdff1aSopenharmony_ci                    dctcoef_set(sl->mb + (p * 256 << pixel_shift),
719cabdff1aSopenharmony_ci                                pixel_shift, dc_mapping[i],
720cabdff1aSopenharmony_ci                                dctcoef_get(sl->mb_luma_dc[p],
721cabdff1aSopenharmony_ci                                            pixel_shift, i));
722cabdff1aSopenharmony_ci            }
723cabdff1aSopenharmony_ci        }
724cabdff1aSopenharmony_ci    }
725cabdff1aSopenharmony_ci}
726cabdff1aSopenharmony_ci
727cabdff1aSopenharmony_cistatic av_always_inline void hl_decode_mb_idct_luma(const H264Context *h, H264SliceContext *sl,
728cabdff1aSopenharmony_ci                                                    int mb_type, int simple,
729cabdff1aSopenharmony_ci                                                    int transform_bypass,
730cabdff1aSopenharmony_ci                                                    int pixel_shift,
731cabdff1aSopenharmony_ci                                                    const int *block_offset,
732cabdff1aSopenharmony_ci                                                    int linesize,
733cabdff1aSopenharmony_ci                                                    uint8_t *dest_y, int p)
734cabdff1aSopenharmony_ci{
735cabdff1aSopenharmony_ci    void (*idct_add)(uint8_t *dst, int16_t *block, int stride);
736cabdff1aSopenharmony_ci    int i;
737cabdff1aSopenharmony_ci    block_offset += 16 * p;
738cabdff1aSopenharmony_ci    if (!IS_INTRA4x4(mb_type)) {
739cabdff1aSopenharmony_ci        if (IS_INTRA16x16(mb_type)) {
740cabdff1aSopenharmony_ci            if (transform_bypass) {
741cabdff1aSopenharmony_ci                if (h->ps.sps->profile_idc == 244 &&
742cabdff1aSopenharmony_ci                    (sl->intra16x16_pred_mode == VERT_PRED8x8 ||
743cabdff1aSopenharmony_ci                     sl->intra16x16_pred_mode == HOR_PRED8x8)) {
744cabdff1aSopenharmony_ci                    h->hpc.pred16x16_add[sl->intra16x16_pred_mode](dest_y, block_offset,
745cabdff1aSopenharmony_ci                                                                   sl->mb + (p * 256 << pixel_shift),
746cabdff1aSopenharmony_ci                                                                   linesize);
747cabdff1aSopenharmony_ci                } else {
748cabdff1aSopenharmony_ci                    for (i = 0; i < 16; i++)
749cabdff1aSopenharmony_ci                        if (sl->non_zero_count_cache[scan8[i + p * 16]] ||
750cabdff1aSopenharmony_ci                            dctcoef_get(sl->mb, pixel_shift, i * 16 + p * 256))
751cabdff1aSopenharmony_ci                            h->h264dsp.h264_add_pixels4_clear(dest_y + block_offset[i],
752cabdff1aSopenharmony_ci                                                              sl->mb + (i * 16 + p * 256 << pixel_shift),
753cabdff1aSopenharmony_ci                                                              linesize);
754cabdff1aSopenharmony_ci                }
755cabdff1aSopenharmony_ci            } else {
756cabdff1aSopenharmony_ci                h->h264dsp.h264_idct_add16intra(dest_y, block_offset,
757cabdff1aSopenharmony_ci                                                sl->mb + (p * 256 << pixel_shift),
758cabdff1aSopenharmony_ci                                                linesize,
759cabdff1aSopenharmony_ci                                                sl->non_zero_count_cache + p * 5 * 8);
760cabdff1aSopenharmony_ci            }
761cabdff1aSopenharmony_ci        } else if (sl->cbp & 15) {
762cabdff1aSopenharmony_ci            if (transform_bypass) {
763cabdff1aSopenharmony_ci                const int di = IS_8x8DCT(mb_type) ? 4 : 1;
764cabdff1aSopenharmony_ci                idct_add = IS_8x8DCT(mb_type) ? h->h264dsp.h264_add_pixels8_clear
765cabdff1aSopenharmony_ci                    : h->h264dsp.h264_add_pixels4_clear;
766cabdff1aSopenharmony_ci                for (i = 0; i < 16; i += di)
767cabdff1aSopenharmony_ci                    if (sl->non_zero_count_cache[scan8[i + p * 16]])
768cabdff1aSopenharmony_ci                        idct_add(dest_y + block_offset[i],
769cabdff1aSopenharmony_ci                                 sl->mb + (i * 16 + p * 256 << pixel_shift),
770cabdff1aSopenharmony_ci                                 linesize);
771cabdff1aSopenharmony_ci            } else {
772cabdff1aSopenharmony_ci                if (IS_8x8DCT(mb_type))
773cabdff1aSopenharmony_ci                    h->h264dsp.h264_idct8_add4(dest_y, block_offset,
774cabdff1aSopenharmony_ci                                               sl->mb + (p * 256 << pixel_shift),
775cabdff1aSopenharmony_ci                                               linesize,
776cabdff1aSopenharmony_ci                                               sl->non_zero_count_cache + p * 5 * 8);
777cabdff1aSopenharmony_ci                else
778cabdff1aSopenharmony_ci                    h->h264dsp.h264_idct_add16(dest_y, block_offset,
779cabdff1aSopenharmony_ci                                               sl->mb + (p * 256 << pixel_shift),
780cabdff1aSopenharmony_ci                                               linesize,
781cabdff1aSopenharmony_ci                                               sl->non_zero_count_cache + p * 5 * 8);
782cabdff1aSopenharmony_ci            }
783cabdff1aSopenharmony_ci        }
784cabdff1aSopenharmony_ci    }
785cabdff1aSopenharmony_ci}
786cabdff1aSopenharmony_ci
787cabdff1aSopenharmony_ci#define BITS   8
788cabdff1aSopenharmony_ci#define SIMPLE 1
789cabdff1aSopenharmony_ci#include "h264_mb_template.c"
790cabdff1aSopenharmony_ci
791cabdff1aSopenharmony_ci#undef  BITS
792cabdff1aSopenharmony_ci#define BITS   16
793cabdff1aSopenharmony_ci#include "h264_mb_template.c"
794cabdff1aSopenharmony_ci
795cabdff1aSopenharmony_ci#undef  SIMPLE
796cabdff1aSopenharmony_ci#define SIMPLE 0
797cabdff1aSopenharmony_ci#include "h264_mb_template.c"
798cabdff1aSopenharmony_ci
799cabdff1aSopenharmony_civoid ff_h264_hl_decode_mb(const H264Context *h, H264SliceContext *sl)
800cabdff1aSopenharmony_ci{
801cabdff1aSopenharmony_ci    const int mb_xy   = sl->mb_xy;
802cabdff1aSopenharmony_ci    const int mb_type = h->cur_pic.mb_type[mb_xy];
803cabdff1aSopenharmony_ci    int is_complex    = CONFIG_SMALL || sl->is_complex ||
804cabdff1aSopenharmony_ci                        IS_INTRA_PCM(mb_type) || sl->qscale == 0;
805cabdff1aSopenharmony_ci
806cabdff1aSopenharmony_ci    if (CHROMA444(h)) {
807cabdff1aSopenharmony_ci        if (is_complex || h->pixel_shift)
808cabdff1aSopenharmony_ci            hl_decode_mb_444_complex(h, sl);
809cabdff1aSopenharmony_ci        else
810cabdff1aSopenharmony_ci            hl_decode_mb_444_simple_8(h, sl);
811cabdff1aSopenharmony_ci    } else if (is_complex) {
812cabdff1aSopenharmony_ci        hl_decode_mb_complex(h, sl);
813cabdff1aSopenharmony_ci    } else if (h->pixel_shift) {
814cabdff1aSopenharmony_ci        hl_decode_mb_simple_16(h, sl);
815cabdff1aSopenharmony_ci    } else
816cabdff1aSopenharmony_ci        hl_decode_mb_simple_8(h, sl);
817cabdff1aSopenharmony_ci}
818