;******************************************************************************
;* x86 optimized discrete wavelet transform
;* Copyright (c) 2010 David Conrad
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "libavutil/x86/x86util.asm"

SECTION_RODATA
; Interleaved word pairs {9, -1}: pmaddwd of this against interleaved (a, b)
; words yields the 32-bit value 9*a - b in each dword lane.
pw_1991: times 4 dw 9,-1

; Word-splat constants defined outside this file.
cextern pw_1
cextern pw_2
cextern pw_8
cextern pw_16

SECTION .text

; %1 -= (%2 + %3 + 2)>>2     %4 is pw_2
; Note: %2 is used as the accumulator and is therefore clobbered.
%macro COMPOSE_53iL0 4
    paddw   %2, %3
    paddw   %2, %4
    psraw   %2, 2
    psubw   %1, %2
%endm

; m1 = %1 + (-m0 + 9*m1 + 9*%2 -%3 + 8)>>4
; if %4 is supplied, %1 is loaded unaligned from there
; m2: clobbered  m3: pw_8  m4: pw_1991
; m0 is clobbered as well (it accumulates m0 + %3 - 8).
%macro COMPOSE_DD97iH0 3-4
    paddw   m0, %3              ; m0 = outer taps: m0 + %3
    paddw   m1, %2              ; m1 = inner taps: m1 + %2
    psubw   m0, m3              ; fold the +8 rounding term in as -(m0 - 8)
    mova    m2, m1
    punpcklwd m1, m0            ; interleave (inner, outer) pairs, low half
    punpckhwd m2, m0            ; ... and high half
    pmaddwd m1, m4              ; 9*inner - (outer - 8), 32-bit lanes
    pmaddwd m2, m4
%if %0 > 3
    movu    %1, %4              ; optional unaligned load of the value to add
%endif
    psrad   m1, 4
    psrad   m2, 4
    packssdw m1, m2             ; back to 16-bit lanes (signed saturation)
    paddw   m1, %1
%endm

; Emits the vertical lifting steps with the function-name suffix %1.
; Every routine walks its rows from the end towards the start: widthq is
; decremented by one vector (mmsize/2 16-bit elements) per iteration and the
; flags produced by that `sub` drive the closing `jg`, since none of the
; instructions in between write EFLAGS.  All row accesses use mova, so the
; row pointers must be mmsize-aligned.
; NOTE(review): several cglobal lines declare fewer vector registers than the
; bodies touch (53iL0: 1 declared, m0-m2 used; dirac53iH0: 1 declared, m0-m1
; used; haar: 3 declared, m0-m3 used).  Presumably harmless for these low
; register numbers - confirm against the x86inc cglobal semantics.
%macro COMPOSE_VERTICAL 1
; void vertical_compose53iL0(IDWTELEM *b0, IDWTELEM *b1, IDWTELEM *b2,
;                                  int width)
; b1[i] -= (b0[i] + b2[i] + 2) >> 2
cglobal vertical_compose53iL0_%1, 4,4,1, b0, b1, b2, width
    mova    m2, [pw_2]
%if ARCH_X86_64
    mov     widthd, widthd      ; zero-extend the 32-bit width to 64 bits
%endif
.loop:
    sub     widthq, mmsize/2
    mova    m1, [b0q+2*widthq]
    mova    m0, [b1q+2*widthq]
    COMPOSE_53iL0 m0, m1, [b2q+2*widthq], m2
    mova    [b1q+2*widthq], m0
    jg      .loop
    REP_RET

; void vertical_compose_dirac53iH0(IDWTELEM *b0, IDWTELEM *b1, IDWTELEM *b2,
;                                  int width)
; b1[i] += (b0[i] + b2[i] + 1) >> 1
cglobal vertical_compose_dirac53iH0_%1, 4,4,1, b0, b1, b2, width
    mova    m1, [pw_1]
%if ARCH_X86_64
    mov     widthd, widthd      ; zero-extend the 32-bit width to 64 bits
%endif
.loop:
    sub     widthq, mmsize/2
    mova    m0, [b0q+2*widthq]
    paddw   m0, [b2q+2*widthq]
    paddw   m0, m1
    psraw   m0, 1
    paddw   m0, [b1q+2*widthq]
    mova    [b1q+2*widthq], m0
    jg      .loop
    REP_RET

; void vertical_compose_dd97iH0(IDWTELEM *b0, IDWTELEM *b1, IDWTELEM *b2,
;                               IDWTELEM *b3, IDWTELEM *b4, int width)
; b2[i] += (-b0[i] + 9*b1[i] + 9*b3[i] - b4[i] + 8) >> 4
cglobal vertical_compose_dd97iH0_%1, 6,6,5, b0, b1, b2, b3, b4, width
    mova    m3, [pw_8]
    mova    m4, [pw_1991]
%if ARCH_X86_64
    mov     widthd, widthd      ; zero-extend the 32-bit width to 64 bits
%endif
.loop:
    sub     widthq, mmsize/2
    mova    m0, [b0q+2*widthq]
    mova    m1, [b1q+2*widthq]
    COMPOSE_DD97iH0 [b2q+2*widthq], [b3q+2*widthq], [b4q+2*widthq]
    mova    [b2q+2*widthq], m1
    jg      .loop
    REP_RET

; void vertical_compose_dd137iL0(IDWTELEM *b0, IDWTELEM *b1, IDWTELEM *b2,
;                                IDWTELEM *b3, IDWTELEM *b4, int width)
; b2[i] -= (-b0[i] + 9*b1[i] + 9*b3[i] - b4[i] + 16) >> 5
cglobal vertical_compose_dd137iL0_%1, 6,6,6, b0, b1, b2, b3, b4, width
    mova    m3, [pw_16]
    mova    m4, [pw_1991]
%if ARCH_X86_64
    mov     widthd, widthd      ; zero-extend the 32-bit width to 64 bits
%endif
.loop:
    sub     widthq, mmsize/2
    mova    m0, [b0q+2*widthq]
    mova    m1, [b1q+2*widthq]
    mova    m5, [b2q+2*widthq]
    paddw   m0, [b4q+2*widthq]  ; outer taps: b0 + b4
    paddw   m1, [b3q+2*widthq]  ; inner taps: b1 + b3
    psubw   m0, m3              ; fold the +16 rounding term in as -(outer - 16)
    mova    m2, m1
    punpcklwd m1, m0
    punpckhwd m2, m0
    pmaddwd m1, m4              ; 9*inner - (outer - 16), 32-bit lanes
    pmaddwd m2, m4
    psrad   m1, 5
    psrad   m2, 5
    packssdw m1, m2
    psubw   m5, m1
    mova    [b2q+2*widthq], m5
    jg      .loop
    REP_RET

; void vertical_compose_haar(IDWTELEM *b0, IDWTELEM *b1, int width)
; b0[i] -= (b1[i] + 1) >> 1;  b1[i] += b0[i]   (using the updated b0)
cglobal vertical_compose_haar_%1, 3,4,3, b0, b1, width
    mova    m3, [pw_1]
%if ARCH_X86_64
    mov     widthd, widthd      ; zero-extend the 32-bit width to 64 bits
%endif
.loop:
    sub     widthq, mmsize/2
    mova    m1, [b1q+2*widthq]
    mova    m0, [b0q+2*widthq]
    mova    m2, m1              ; keep the unmodified b1 for the second step
    paddw   m1, m3
    psraw   m1, 1
    psubw   m0, m1
    mova    [b0q+2*widthq], m0
    paddw   m2, m0
    mova    [b1q+2*widthq], m2
    jg      .loop
    REP_RET
%endmacro

; extend the left and right edges of the tmp array by %1 and %2 respectively
; %3 is a 16-bit scratch register.  tmp[0] is replicated into tmp[-1..-%1] and
; tmp[w2-1] into tmp[w2..w2+%2-1]; expects tmpq and w2q to be set up.
%macro EDGE_EXTENSION 3
    mov     %3, [tmpq]
%assign %%i 1
%rep %1
    mov     [tmpq-2*%%i], %3
    %assign %%i %%i+1
%endrep
    mov     %3, [tmpq+2*w2q-2]
%assign %%i 0
%rep %2
    mov     [tmpq+2*w2q+2*%%i], %3
    %assign %%i %%i+1
%endrep
%endmacro


; %1: function-name suffix
; %2: 1 to round-shift the output right by one bit, 0 to leave it unshifted
%macro HAAR_HORIZONTAL 2
; void horizontal_compose_haar<%2>i(IDWTELEM *b, IDWTELEM *tmp, int width)
; With w2 = width/2 and the second half of the row starting at b + w2:
;   tmp[x]    = b[x] - ((b[w2+x] + 1) >> 1)
;   b[2x]     = tmp[x]              (then (v + 1) >> 1 when %2 == 1)
;   b[2x+1]   = b[w2+x] + tmp[x]    (then (v + 1) >> 1 when %2 == 1)
cglobal horizontal_compose_haar%2i_%1, 3,6,4, b, tmp, w, x, w2, b_w2
    mov    w2d, wd
    xor     xq, xq
    shr    w2d, 1
    lea  b_w2q, [bq+wq]         ; b + w bytes == &b[w/2] for 16-bit elements
    mova    m3, [pw_1]
.lowpass_loop:
    movu    m1, [b_w2q + 2*xq]  ; second half is not necessarily aligned
    mova    m0, [bq    + 2*xq]
    paddw   m1, m3
    psraw   m1, 1
    psubw   m0, m1
    mova    [tmpq + 2*xq], m0
    add     xq, mmsize/2
    cmp     xq, w2q
    jl      .lowpass_loop

    ; only whole vectors are written back below; skip if there is not even one
    ; (the remaining tail is presumably finished by the caller - confirm)
    xor     xq, xq
    and    w2q, ~(mmsize/2 - 1)
    cmp    w2q, mmsize/2
    jl      .end

.highpass_loop:
    movu    m1, [b_w2q + 2*xq]
    mova    m0, [tmpq  + 2*xq]
    paddw   m1, m0

    ; shift and interleave
%if %2 == 1
    paddw   m0, m3
    paddw   m1, m3
    psraw   m0, 1
    psraw   m1, 1
%endif
    mova    m2, m0
    punpcklwd m0, m1
    punpckhwd m2, m1
    mova    [bq+4*xq], m0
    mova    [bq+4*xq+mmsize], m2

    add     xq, mmsize/2
    cmp     xq, w2q
    jl      .highpass_loop
.end:
    REP_RET
%endmacro


INIT_XMM
; void horizontal_compose_dd97i(IDWTELEM *b, IDWTELEM *tmp, int width)
; With w2 = width/2 and h[] = b + w2 (second half of the row):
;   tmp[x]  = b[x] - ((h[x-1] + h[x] + 2) >> 2)             with h[-1] = h[0]
;   hp[x]   = h[x] + ((-tmp[x-1] + 9*tmp[x] + 9*tmp[x+1] - tmp[x+2] + 8) >> 4)
;   b[2x]   = (tmp[x] + 1) >> 1
;   b[2x+1] = (hp[x]  + 1) >> 1
; tmp is edge-extended by one element on the left and two on the right, so it
; must have writable room there.
cglobal horizontal_compose_dd97i_ssse3, 3,6,8, b, tmp, w, x, w2, b_w2
    mov    w2d, wd
    xor     xd, xd
    shr    w2d, 1
    lea  b_w2q, [bq+wq]         ; b + w bytes == &b[w/2] for 16-bit elements
    movu    m4, [bq+wq]
    mova    m7, [pw_2]
    pslldq  m4, 14              ; top word of m4 = h[0], used as h[-1]
.lowpass_loop:
    movu    m1, [b_w2q + 2*xq]
    mova    m0, [bq    + 2*xq]
    mova    m2, m1
    palignr m1, m4, 14          ; m1 = h[x-1 .. x+6] (last word of previous vector first)
    mova    m4, m2              ; remember this vector for the next iteration
    COMPOSE_53iL0 m0, m1, m2, m7
    mova    [tmpq + 2*xq], m0
    add     xd, mmsize/2
    cmp     xd, w2d
    jl      .lowpass_loop

    EDGE_EXTENSION 1, 2, xw
    ; leave the last up to 7 (sse) or 3 (mmx) values for C
    xor     xd, xd
    and    w2d, ~(mmsize/2 - 1)
    cmp    w2d, mmsize/2
    jl      .end

    mova    m7, [tmpq-mmsize]   ; only its top word (tmp[-1]) is consumed
    mova    m0, [tmpq]
    mova    m5, [pw_1]
    mova    m3, [pw_8]
    mova    m4, [pw_1991]
.highpass_loop:
    mova    m6, m0              ; m6 = tmp[x   .. x+7]
    palignr m0, m7, 14          ; m0 = tmp[x-1 .. x+6]
    mova    m7, [tmpq + 2*xq + 16] ; next vector of tmp
    mova    m1, m7
    mova    m2, m7
    palignr m1, m6, 2           ; m1 = tmp[x+1 .. x+8]
    palignr m2, m6, 4           ; m2 = tmp[x+2 .. x+9]
    COMPOSE_DD97iH0 m0, m6, m2, [b_w2q + 2*xq]
    mova    m0, m7              ; next iteration: current vector ...
    mova    m7, m6              ; ... and the previous one

    ; shift and interleave
    paddw   m6, m5
    paddw   m1, m5
    psraw   m6, 1
    psraw   m1, 1
    mova    m2, m6
    punpcklwd m6, m1
    punpckhwd m2, m1
    mova    [bq+4*xq], m6
    mova    [bq+4*xq+mmsize], m2

    add     xd, mmsize/2
    cmp     xd, w2d
    jl      .highpass_loop
.end:
    REP_RET


INIT_XMM
COMPOSE_VERTICAL sse2
HAAR_HORIZONTAL sse2, 0
HAAR_HORIZONTAL sse2, 1