1/* 2 * This file is part of FFmpeg. 3 * 4 * FFmpeg is free software; you can redistribute it and/or 5 * modify it under the terms of the GNU Lesser General Public 6 * License as published by the Free Software Foundation; either 7 * version 2.1 of the License, or (at your option) any later version. 8 * 9 * FFmpeg is distributed in the hope that it will be useful, 10 * but WITHOUT ANY WARRANTY; without even the implied warranty of 11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 12 * Lesser General Public License for more details. 13 * 14 * You should have received a copy of the GNU Lesser General Public 15 * License along with FFmpeg; if not, write to the Free Software 16 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA 17 */ 18 19#include <stdint.h> 20 21#include "libavutil/attributes.h" 22#include "libavutil/cpu.h" 23#include "libavutil/aarch64/cpu.h" 24#include "libavutil/intreadwrite.h" 25#include "libavcodec/vc1dsp.h" 26 27#include "config.h" 28 29void ff_vc1_inv_trans_8x8_neon(int16_t *block); 30void ff_vc1_inv_trans_8x4_neon(uint8_t *dest, ptrdiff_t stride, int16_t *block); 31void ff_vc1_inv_trans_4x8_neon(uint8_t *dest, ptrdiff_t stride, int16_t *block); 32void ff_vc1_inv_trans_4x4_neon(uint8_t *dest, ptrdiff_t stride, int16_t *block); 33 34void ff_vc1_inv_trans_8x8_dc_neon(uint8_t *dest, ptrdiff_t stride, int16_t *block); 35void ff_vc1_inv_trans_8x4_dc_neon(uint8_t *dest, ptrdiff_t stride, int16_t *block); 36void ff_vc1_inv_trans_4x8_dc_neon(uint8_t *dest, ptrdiff_t stride, int16_t *block); 37void ff_vc1_inv_trans_4x4_dc_neon(uint8_t *dest, ptrdiff_t stride, int16_t *block); 38 39void ff_vc1_v_loop_filter4_neon(uint8_t *src, ptrdiff_t stride, int pq); 40void ff_vc1_h_loop_filter4_neon(uint8_t *src, ptrdiff_t stride, int pq); 41void ff_vc1_v_loop_filter8_neon(uint8_t *src, ptrdiff_t stride, int pq); 42void ff_vc1_h_loop_filter8_neon(uint8_t *src, ptrdiff_t stride, int pq); 43void ff_vc1_v_loop_filter16_neon(uint8_t *src, ptrdiff_t stride, int pq); 44void ff_vc1_h_loop_filter16_neon(uint8_t *src, ptrdiff_t stride, int pq); 45 46void ff_put_vc1_chroma_mc8_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride, 47 int h, int x, int y); 48void ff_avg_vc1_chroma_mc8_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride, 49 int h, int x, int y); 50void ff_put_vc1_chroma_mc4_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride, 51 int h, int x, int y); 52void ff_avg_vc1_chroma_mc4_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride, 53 int h, int x, int y); 54 55int ff_vc1_unescape_buffer_helper_neon(const uint8_t *src, int size, uint8_t *dst); 56 57static int vc1_unescape_buffer_neon(const uint8_t *src, int size, uint8_t *dst) 58{ 59 /* Dealing with starting and stopping, and removing escape bytes, are 60 * comparatively less time-sensitive, so are more clearly expressed using 61 * a C wrapper around the assembly inner loop. Note that we assume a 62 * little-endian machine that supports unaligned loads. */ 63 int dsize = 0; 64 while (size >= 4) 65 { 66 int found = 0; 67 while (!found && (((uintptr_t) dst) & 7) && size >= 4) 68 { 69 found = (AV_RL32(src) &~ 0x03000000) == 0x00030000; 70 if (!found) 71 { 72 *dst++ = *src++; 73 --size; 74 ++dsize; 75 } 76 } 77 if (!found) 78 { 79 int skip = size - ff_vc1_unescape_buffer_helper_neon(src, size, dst); 80 dst += skip; 81 src += skip; 82 size -= skip; 83 dsize += skip; 84 while (!found && size >= 4) 85 { 86 found = (AV_RL32(src) &~ 0x03000000) == 0x00030000; 87 if (!found) 88 { 89 *dst++ = *src++; 90 --size; 91 ++dsize; 92 } 93 } 94 } 95 if (found) 96 { 97 *dst++ = *src++; 98 *dst++ = *src++; 99 ++src; 100 size -= 3; 101 dsize += 2; 102 } 103 } 104 while (size > 0) 105 { 106 *dst++ = *src++; 107 --size; 108 ++dsize; 109 } 110 return dsize; 111} 112 113av_cold void ff_vc1dsp_init_aarch64(VC1DSPContext *dsp) 114{ 115 int cpu_flags = av_get_cpu_flags(); 116 117 if (have_neon(cpu_flags)) { 118 dsp->vc1_inv_trans_8x8 = ff_vc1_inv_trans_8x8_neon; 119 dsp->vc1_inv_trans_8x4 = ff_vc1_inv_trans_8x4_neon; 120 dsp->vc1_inv_trans_4x8 = ff_vc1_inv_trans_4x8_neon; 121 dsp->vc1_inv_trans_4x4 = ff_vc1_inv_trans_4x4_neon; 122 dsp->vc1_inv_trans_8x8_dc = ff_vc1_inv_trans_8x8_dc_neon; 123 dsp->vc1_inv_trans_8x4_dc = ff_vc1_inv_trans_8x4_dc_neon; 124 dsp->vc1_inv_trans_4x8_dc = ff_vc1_inv_trans_4x8_dc_neon; 125 dsp->vc1_inv_trans_4x4_dc = ff_vc1_inv_trans_4x4_dc_neon; 126 127 dsp->vc1_v_loop_filter4 = ff_vc1_v_loop_filter4_neon; 128 dsp->vc1_h_loop_filter4 = ff_vc1_h_loop_filter4_neon; 129 dsp->vc1_v_loop_filter8 = ff_vc1_v_loop_filter8_neon; 130 dsp->vc1_h_loop_filter8 = ff_vc1_h_loop_filter8_neon; 131 dsp->vc1_v_loop_filter16 = ff_vc1_v_loop_filter16_neon; 132 dsp->vc1_h_loop_filter16 = ff_vc1_h_loop_filter16_neon; 133 134 dsp->put_no_rnd_vc1_chroma_pixels_tab[0] = ff_put_vc1_chroma_mc8_neon; 135 dsp->avg_no_rnd_vc1_chroma_pixels_tab[0] = ff_avg_vc1_chroma_mc8_neon; 136 dsp->put_no_rnd_vc1_chroma_pixels_tab[1] = ff_put_vc1_chroma_mc4_neon; 137 dsp->avg_no_rnd_vc1_chroma_pixels_tab[1] = ff_avg_vc1_chroma_mc4_neon; 138 139 dsp->vc1_unescape_buffer = vc1_unescape_buffer_neon; 140 } 141} 142