1cabdff1aSopenharmony_ci/* 2cabdff1aSopenharmony_ci * x86 optimized discrete wavelet transform 3cabdff1aSopenharmony_ci * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at> 4cabdff1aSopenharmony_ci * Copyright (c) 2010 David Conrad 5cabdff1aSopenharmony_ci * 6cabdff1aSopenharmony_ci * This file is part of FFmpeg. 7cabdff1aSopenharmony_ci * 8cabdff1aSopenharmony_ci * FFmpeg is free software; you can redistribute it and/or 9cabdff1aSopenharmony_ci * modify it under the terms of the GNU Lesser General Public 10cabdff1aSopenharmony_ci * License as published by the Free Software Foundation; either 11cabdff1aSopenharmony_ci * version 2.1 of the License, or (at your option) any later version. 12cabdff1aSopenharmony_ci * 13cabdff1aSopenharmony_ci * FFmpeg is distributed in the hope that it will be useful, 14cabdff1aSopenharmony_ci * but WITHOUT ANY WARRANTY; without even the implied warranty of 15cabdff1aSopenharmony_ci * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 16cabdff1aSopenharmony_ci * Lesser General Public License for more details. 17cabdff1aSopenharmony_ci * 18cabdff1aSopenharmony_ci * You should have received a copy of the GNU Lesser General Public 19cabdff1aSopenharmony_ci * License along with FFmpeg; if not, write to the Free Software 20cabdff1aSopenharmony_ci * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA 21cabdff1aSopenharmony_ci */ 22cabdff1aSopenharmony_ci 23cabdff1aSopenharmony_ci#include "libavutil/x86/asm.h" 24cabdff1aSopenharmony_ci#include "libavutil/x86/cpu.h" 25cabdff1aSopenharmony_ci#include "libavcodec/dirac_dwt.h" 26cabdff1aSopenharmony_ci 27cabdff1aSopenharmony_ci#define COMPOSE_VERTICAL(ext, align) \ 28cabdff1aSopenharmony_civoid ff_vertical_compose53iL0##ext(int16_t *b0, int16_t *b1, int16_t *b2, int width); \ 29cabdff1aSopenharmony_civoid ff_vertical_compose_dirac53iH0##ext(int16_t *b0, int16_t *b1, int16_t *b2, int width); \ 30cabdff1aSopenharmony_civoid ff_vertical_compose_dd137iL0##ext(int16_t *b0, int16_t *b1, int16_t *b2, int16_t *b3, int16_t *b4, int width); \ 31cabdff1aSopenharmony_civoid ff_vertical_compose_dd97iH0##ext(int16_t *b0, int16_t *b1, int16_t *b2, int16_t *b3, int16_t *b4, int width); \ 32cabdff1aSopenharmony_civoid ff_vertical_compose_haar##ext(int16_t *b0, int16_t *b1, int width); \ 33cabdff1aSopenharmony_civoid ff_horizontal_compose_haar0i##ext(int16_t *b, int16_t *tmp, int w);\ 34cabdff1aSopenharmony_civoid ff_horizontal_compose_haar1i##ext(int16_t *b, int16_t *tmp, int w);\ 35cabdff1aSopenharmony_ci\ 36cabdff1aSopenharmony_cistatic void vertical_compose53iL0##ext(uint8_t *_b0, uint8_t *_b1, uint8_t *_b2, int width) \ 37cabdff1aSopenharmony_ci{ \ 38cabdff1aSopenharmony_ci int i, width_align = width&~(align-1); \ 39cabdff1aSopenharmony_ci int16_t *b0 = (int16_t *)_b0; \ 40cabdff1aSopenharmony_ci int16_t *b1 = (int16_t *)_b1; \ 41cabdff1aSopenharmony_ci int16_t *b2 = (int16_t *)_b2; \ 42cabdff1aSopenharmony_ci\ 43cabdff1aSopenharmony_ci for(i=width_align; i<width; i++) \ 44cabdff1aSopenharmony_ci b1[i] = COMPOSE_53iL0(b0[i], b1[i], b2[i]); \ 45cabdff1aSopenharmony_ci\ 46cabdff1aSopenharmony_ci ff_vertical_compose53iL0##ext(b0, b1, b2, width_align); \ 47cabdff1aSopenharmony_ci} \ 48cabdff1aSopenharmony_ci\ 49cabdff1aSopenharmony_cistatic void vertical_compose_dirac53iH0##ext(uint8_t *_b0, uint8_t *_b1, uint8_t *_b2, int width) \ 50cabdff1aSopenharmony_ci{ \ 51cabdff1aSopenharmony_ci int i, width_align = width&~(align-1); \ 52cabdff1aSopenharmony_ci int16_t *b0 = (int16_t *)_b0; \ 53cabdff1aSopenharmony_ci int16_t *b1 = (int16_t *)_b1; \ 54cabdff1aSopenharmony_ci int16_t *b2 = (int16_t *)_b2; \ 55cabdff1aSopenharmony_ci\ 56cabdff1aSopenharmony_ci for(i=width_align; i<width; i++) \ 57cabdff1aSopenharmony_ci b1[i] = COMPOSE_DIRAC53iH0(b0[i], b1[i], b2[i]); \ 58cabdff1aSopenharmony_ci\ 59cabdff1aSopenharmony_ci ff_vertical_compose_dirac53iH0##ext(b0, b1, b2, width_align); \ 60cabdff1aSopenharmony_ci} \ 61cabdff1aSopenharmony_ci\ 62cabdff1aSopenharmony_cistatic void vertical_compose_dd137iL0##ext(uint8_t *_b0, uint8_t *_b1, uint8_t *_b2, \ 63cabdff1aSopenharmony_ci uint8_t *_b3, uint8_t *_b4, int width) \ 64cabdff1aSopenharmony_ci{ \ 65cabdff1aSopenharmony_ci int i, width_align = width&~(align-1); \ 66cabdff1aSopenharmony_ci int16_t *b0 = (int16_t *)_b0; \ 67cabdff1aSopenharmony_ci int16_t *b1 = (int16_t *)_b1; \ 68cabdff1aSopenharmony_ci int16_t *b2 = (int16_t *)_b2; \ 69cabdff1aSopenharmony_ci int16_t *b3 = (int16_t *)_b3; \ 70cabdff1aSopenharmony_ci int16_t *b4 = (int16_t *)_b4; \ 71cabdff1aSopenharmony_ci\ 72cabdff1aSopenharmony_ci for(i=width_align; i<width; i++) \ 73cabdff1aSopenharmony_ci b2[i] = COMPOSE_DD137iL0(b0[i], b1[i], b2[i], b3[i], b4[i]); \ 74cabdff1aSopenharmony_ci\ 75cabdff1aSopenharmony_ci ff_vertical_compose_dd137iL0##ext(b0, b1, b2, b3, b4, width_align); \ 76cabdff1aSopenharmony_ci} \ 77cabdff1aSopenharmony_ci\ 78cabdff1aSopenharmony_cistatic void vertical_compose_dd97iH0##ext(uint8_t *_b0, uint8_t *_b1, uint8_t *_b2, \ 79cabdff1aSopenharmony_ci uint8_t *_b3, uint8_t *_b4, int width) \ 80cabdff1aSopenharmony_ci{ \ 81cabdff1aSopenharmony_ci int i, width_align = width&~(align-1); \ 82cabdff1aSopenharmony_ci int16_t *b0 = (int16_t *)_b0; \ 83cabdff1aSopenharmony_ci int16_t *b1 = (int16_t *)_b1; \ 84cabdff1aSopenharmony_ci int16_t *b2 = (int16_t *)_b2; \ 85cabdff1aSopenharmony_ci int16_t *b3 = (int16_t *)_b3; \ 86cabdff1aSopenharmony_ci int16_t *b4 = (int16_t *)_b4; \ 87cabdff1aSopenharmony_ci\ 88cabdff1aSopenharmony_ci for(i=width_align; i<width; i++) \ 89cabdff1aSopenharmony_ci b2[i] = COMPOSE_DD97iH0(b0[i], b1[i], b2[i], b3[i], b4[i]); \ 90cabdff1aSopenharmony_ci\ 91cabdff1aSopenharmony_ci ff_vertical_compose_dd97iH0##ext(b0, b1, b2, b3, b4, width_align); \ 92cabdff1aSopenharmony_ci} \ 93cabdff1aSopenharmony_cistatic void vertical_compose_haar##ext(uint8_t *_b0, uint8_t *_b1, int width) \ 94cabdff1aSopenharmony_ci{ \ 95cabdff1aSopenharmony_ci int i, width_align = width&~(align-1); \ 96cabdff1aSopenharmony_ci int16_t *b0 = (int16_t *)_b0; \ 97cabdff1aSopenharmony_ci int16_t *b1 = (int16_t *)_b1; \ 98cabdff1aSopenharmony_ci\ 99cabdff1aSopenharmony_ci for(i=width_align; i<width; i++) { \ 100cabdff1aSopenharmony_ci b0[i] = COMPOSE_HAARiL0(b0[i], b1[i]); \ 101cabdff1aSopenharmony_ci b1[i] = COMPOSE_HAARiH0(b1[i], b0[i]); \ 102cabdff1aSopenharmony_ci } \ 103cabdff1aSopenharmony_ci\ 104cabdff1aSopenharmony_ci ff_vertical_compose_haar##ext(b0, b1, width_align); \ 105cabdff1aSopenharmony_ci} \ 106cabdff1aSopenharmony_cistatic void horizontal_compose_haar0i##ext(uint8_t *_b, uint8_t *_tmp, int w)\ 107cabdff1aSopenharmony_ci{\ 108cabdff1aSopenharmony_ci int w2= w>>1;\ 109cabdff1aSopenharmony_ci int x= w2 - (w2&(align-1));\ 110cabdff1aSopenharmony_ci int16_t *b = (int16_t *)_b; \ 111cabdff1aSopenharmony_ci int16_t *tmp = (int16_t *)_tmp; \ 112cabdff1aSopenharmony_ci\ 113cabdff1aSopenharmony_ci ff_horizontal_compose_haar0i##ext(b, tmp, w);\ 114cabdff1aSopenharmony_ci\ 115cabdff1aSopenharmony_ci for (; x < w2; x++) {\ 116cabdff1aSopenharmony_ci b[2*x ] = tmp[x];\ 117cabdff1aSopenharmony_ci b[2*x+1] = COMPOSE_HAARiH0(b[x+w2], tmp[x]);\ 118cabdff1aSopenharmony_ci }\ 119cabdff1aSopenharmony_ci}\ 120cabdff1aSopenharmony_cistatic void horizontal_compose_haar1i##ext(uint8_t *_b, uint8_t *_tmp, int w)\ 121cabdff1aSopenharmony_ci{\ 122cabdff1aSopenharmony_ci int w2= w>>1;\ 123cabdff1aSopenharmony_ci int x= w2 - (w2&(align-1));\ 124cabdff1aSopenharmony_ci int16_t *b = (int16_t *)_b; \ 125cabdff1aSopenharmony_ci int16_t *tmp = (int16_t *)_tmp; \ 126cabdff1aSopenharmony_ci\ 127cabdff1aSopenharmony_ci ff_horizontal_compose_haar1i##ext(b, tmp, w);\ 128cabdff1aSopenharmony_ci\ 129cabdff1aSopenharmony_ci for (; x < w2; x++) {\ 130cabdff1aSopenharmony_ci b[2*x ] = (tmp[x] + 1)>>1;\ 131cabdff1aSopenharmony_ci b[2*x+1] = (COMPOSE_HAARiH0(b[x+w2], tmp[x]) + 1)>>1;\ 132cabdff1aSopenharmony_ci }\ 133cabdff1aSopenharmony_ci}\ 134cabdff1aSopenharmony_ci\ 135cabdff1aSopenharmony_ci 136cabdff1aSopenharmony_ci#if HAVE_X86ASM 137cabdff1aSopenharmony_ciCOMPOSE_VERTICAL(_sse2, 8) 138cabdff1aSopenharmony_ci 139cabdff1aSopenharmony_ci 140cabdff1aSopenharmony_civoid ff_horizontal_compose_dd97i_ssse3(int16_t *_b, int16_t *_tmp, int w); 141cabdff1aSopenharmony_ci 142cabdff1aSopenharmony_cistatic void horizontal_compose_dd97i_ssse3(uint8_t *_b, uint8_t *_tmp, int w) 143cabdff1aSopenharmony_ci{ 144cabdff1aSopenharmony_ci int w2= w>>1; 145cabdff1aSopenharmony_ci int x= w2 - (w2&7); 146cabdff1aSopenharmony_ci int16_t *b = (int16_t *)_b; 147cabdff1aSopenharmony_ci int16_t *tmp = (int16_t *)_tmp; 148cabdff1aSopenharmony_ci 149cabdff1aSopenharmony_ci ff_horizontal_compose_dd97i_ssse3(b, tmp, w); 150cabdff1aSopenharmony_ci 151cabdff1aSopenharmony_ci for (; x < w2; x++) { 152cabdff1aSopenharmony_ci b[2*x ] = (tmp[x] + 1)>>1; 153cabdff1aSopenharmony_ci b[2*x+1] = (COMPOSE_DD97iH0(tmp[x-1], tmp[x], b[x+w2], tmp[x+1], tmp[x+2]) + 1)>>1; 154cabdff1aSopenharmony_ci } 155cabdff1aSopenharmony_ci} 156cabdff1aSopenharmony_ci#endif 157cabdff1aSopenharmony_ci 158cabdff1aSopenharmony_civoid ff_spatial_idwt_init_x86(DWTContext *d, enum dwt_type type) 159cabdff1aSopenharmony_ci{ 160cabdff1aSopenharmony_ci#if HAVE_X86ASM 161cabdff1aSopenharmony_ci int mm_flags = av_get_cpu_flags(); 162cabdff1aSopenharmony_ci 163cabdff1aSopenharmony_ci if (!(mm_flags & AV_CPU_FLAG_SSE2)) 164cabdff1aSopenharmony_ci return; 165cabdff1aSopenharmony_ci 166cabdff1aSopenharmony_ci switch (type) { 167cabdff1aSopenharmony_ci case DWT_DIRAC_DD9_7: 168cabdff1aSopenharmony_ci d->vertical_compose_l0 = (void*)vertical_compose53iL0_sse2; 169cabdff1aSopenharmony_ci d->vertical_compose_h0 = (void*)vertical_compose_dd97iH0_sse2; 170cabdff1aSopenharmony_ci break; 171cabdff1aSopenharmony_ci case DWT_DIRAC_LEGALL5_3: 172cabdff1aSopenharmony_ci d->vertical_compose_l0 = (void*)vertical_compose53iL0_sse2; 173cabdff1aSopenharmony_ci d->vertical_compose_h0 = (void*)vertical_compose_dirac53iH0_sse2; 174cabdff1aSopenharmony_ci break; 175cabdff1aSopenharmony_ci case DWT_DIRAC_DD13_7: 176cabdff1aSopenharmony_ci d->vertical_compose_l0 = (void*)vertical_compose_dd137iL0_sse2; 177cabdff1aSopenharmony_ci d->vertical_compose_h0 = (void*)vertical_compose_dd97iH0_sse2; 178cabdff1aSopenharmony_ci break; 179cabdff1aSopenharmony_ci case DWT_DIRAC_HAAR0: 180cabdff1aSopenharmony_ci d->vertical_compose = (void*)vertical_compose_haar_sse2; 181cabdff1aSopenharmony_ci d->horizontal_compose = horizontal_compose_haar0i_sse2; 182cabdff1aSopenharmony_ci break; 183cabdff1aSopenharmony_ci case DWT_DIRAC_HAAR1: 184cabdff1aSopenharmony_ci d->vertical_compose = (void*)vertical_compose_haar_sse2; 185cabdff1aSopenharmony_ci d->horizontal_compose = horizontal_compose_haar1i_sse2; 186cabdff1aSopenharmony_ci break; 187cabdff1aSopenharmony_ci } 188cabdff1aSopenharmony_ci 189cabdff1aSopenharmony_ci if (!(mm_flags & AV_CPU_FLAG_SSSE3)) 190cabdff1aSopenharmony_ci return; 191cabdff1aSopenharmony_ci 192cabdff1aSopenharmony_ci switch (type) { 193cabdff1aSopenharmony_ci case DWT_DIRAC_DD9_7: 194cabdff1aSopenharmony_ci d->horizontal_compose = horizontal_compose_dd97i_ssse3; 195cabdff1aSopenharmony_ci break; 196cabdff1aSopenharmony_ci } 197cabdff1aSopenharmony_ci#endif // HAVE_X86ASM 198cabdff1aSopenharmony_ci} 199