;******************************************************************************
;* Copyright (c) 2010 David Conrad
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "libavutil/x86/x86util.asm"

SECTION_RODATA
pw_7: times 8 dw 7
convert_to_unsigned_10bit: times 4 dd 0x200
clip_10bit: times 8 dw 0x3ff

cextern pw_3
cextern pw_16
cextern pw_32
cextern pb_80

SECTION .text

; UNPACK_ADD out_lo, out_hi, mem_a, mem_b, mov_suffix_a, mov_suffix_b
;
; Loads one vector of bytes from each of mem_a (%3) and mem_b (%4), widens
; both to 16-bit words and adds them element-wise:
;   %1 = low_half(%3)  + low_half(%4)
;   %2 = high_half(%3) + high_half(%4)
; %5 / %6 select the load flavour for %3 / %4 (a = mova, u = movu).
; m7 is used as the zero vector for the unpacks and must be cleared by the
; caller. m4 and m5 are clobbered.
%macro UNPACK_ADD 6
    mov%5     %1, %3
    mov%6     m5, %4
    mova      m4, %1
    mova      %2, m5
    punpcklbw %1, m7
    punpcklbw m5, m7
    punpckhbw m4, m7
    punpckhbw %2, m7
    paddw     %1, m5
    paddw     %2, m4
%endmacro

; HPEL_FILTER suffix
;
; Emits the vertical and horizontal 8-tap half-pel filters. Both compute, per
; output byte (s[k] = sample at tap offset k):
;   (21*(s[0]+s[1]) + 3*(s[-2]+s[3]) - 7*(s[-1]+s[2]) - (s[-3]+s[4]) + pw_16) >> 5
; and pack the result to unsigned bytes with saturation (packuswb).
; The factor 21 comes from multiplying the 7*(s[0]+s[1]) term by pw_3 after
; s[-2]+s[3] has been added to it. pw_3 and pw_16 are defined elsewhere
; (cextern).
%macro HPEL_FILTER 1
; dirac_hpel_filter_v_sse2(uint8_t *dst, uint8_t *src, int stride, int width);
;
; Taps are rows: s[k] = src[k*stride]. Each iteration filters mmsize columns
; using aligned loads and an aligned store.
cglobal dirac_hpel_filter_v_%1, 4,6,8, dst, src, stride, width, src0, stridex3
%if ARCH_X86_64
    ; stride is a 32-bit int; sign-extend it before it is used as a 64-bit
    ; address offset (same treatment as put_signed_rect_clamped and
    ; add_rect_clamped below).
    movsxd  strideq, strided
%endif
    mov     src0q, srcq
    lea     stridex3q, [3*strideq]
    sub     src0q, stridex3q            ; src0 = src - 3*stride (row s[-3])
    pxor    m7, m7                      ; zero vector for UNPACK_ADD
.loop:
    ; 7*(src[0] + src[1])
    UNPACK_ADD m0, m1, [srcq], [srcq + strideq], a,a
    pmullw  m0, [pw_7]
    pmullw  m1, [pw_7]

    ; 3*( ... + src[-2] + src[3])
    UNPACK_ADD m2, m3, [src0q + strideq], [srcq + stridex3q], a,a
    paddw   m0, m2
    paddw   m1, m3
    pmullw  m0, [pw_3]
    pmullw  m1, [pw_3]

    ; ... - 7*(src[-1] + src[2])
    UNPACK_ADD m2, m3, [src0q + strideq*2], [srcq + strideq*2], a,a
    pmullw  m2, [pw_7]
    pmullw  m3, [pw_7]
    psubw   m0, m2
    psubw   m1, m3

    ; ... - (src[-3] + src[4])
    UNPACK_ADD m2, m3, [src0q], [srcq + strideq*4], a,a
    psubw   m0, m2
    psubw   m1, m3

    paddw   m0, [pw_16]
    paddw   m1, [pw_16]
    psraw   m0, 5
    psraw   m1, 5
    packuswb m0, m1
    mova    [dstq], m0
    add     dstq, mmsize
    add     srcq, mmsize
    add     src0q, mmsize
    sub     widthd, mmsize
    jg      .loop
    RET

; dirac_hpel_filter_h_sse2(uint8_t *dst, uint8_t *src, int width);
;
; Taps are neighbouring bytes: s[k] = src[x + k]. width is turned into the
; byte offset of the last mmsize-sized block, (width-1) & ~(mmsize-1), and the
; loop walks blocks from that offset down to 0. Loads are unaligned, the store
; is aligned.
cglobal dirac_hpel_filter_h_%1, 3,3,8, dst, src, width
    dec     widthd
    pxor    m7, m7                      ; zero vector for UNPACK_ADD
    and     widthd, ~(mmsize-1)
.loop:
    ; 7*(src[0] + src[1])
    UNPACK_ADD m0, m1, [srcq + widthq], [srcq + widthq + 1], u,u
    pmullw  m0, [pw_7]
    pmullw  m1, [pw_7]

    ; 3*( ... + src[-2] + src[3])
    UNPACK_ADD m2, m3, [srcq + widthq - 2], [srcq + widthq + 3], u,u
    paddw   m0, m2
    paddw   m1, m3
    pmullw  m0, [pw_3]
    pmullw  m1, [pw_3]

    ; ... - 7*(src[-1] + src[2])
    UNPACK_ADD m2, m3, [srcq + widthq - 1], [srcq + widthq + 2], u,u
    pmullw  m2, [pw_7]
    pmullw  m3, [pw_7]
    psubw   m0, m2
    psubw   m1, m3

    ; ... - (src[-3] + src[4])
    UNPACK_ADD m2, m3, [srcq + widthq - 3], [srcq + widthq + 4], u,u
    psubw   m0, m2
    psubw   m1, m3

    paddw   m0, [pw_16]
    paddw   m1, [pw_16]
    psraw   m0, 5
    psraw   m1, 5
    packuswb m0, m1
    mova    [dstq + widthq], m0
    sub     widthd, mmsize
    jge     .loop
    RET
%endmacro

; PUT_RECT suffix
;
; Emits put_signed_rect_clamped_<suffix>: packs signed 16-bit samples to signed
; bytes with saturation (packsswb) and adds pb_80 to every byte (pb_80 is
; defined elsewhere; presumably 0x80 per byte, i.e. a signed->unsigned bias).
; Width is rounded up to a multiple of mmsize, two rows are written per outer
; iteration, and each row is walked from its right end towards offset 0.
%macro PUT_RECT 1
; void put_rect_clamped(uint8_t *dst, int dst_stride, int16_t *src, int src_stride, int width, int height)
cglobal put_signed_rect_clamped_%1, 5,9,3, dst, dst_stride, src, src_stride, w, dst2, src2
    mova    m0, [pb_80]
    add     wd, (mmsize-1)
    and     wd, ~(mmsize-1)

%if ARCH_X86_64
    movsxd   dst_strideq, dst_strided
    movsxd   src_strideq, src_strided
    mov   r7d, r5m                      ; height (6th argument)
    mov   r8d, wd                       ; keep the rounded width for each row pair
    %define wspill r8d
    %define hd r7d
%else
    mov    r4m, wd                      ; spill the rounded width to its argument slot
    %define wspill r4m
    %define hd r5mp
%endif

.loopy:
    lea     src2q, [srcq+src_strideq]   ; second row of the pair
    lea     dst2q, [dstq+dst_strideq]
.loopx:
    ; The flags set by this sub are the ones tested by "jg .loopx" below;
    ; none of the SIMD instructions or stores in between modify EFLAGS.
    sub      wd, mmsize
    mova     m1, [srcq +2*wq]
    mova     m2, [src2q+2*wq]
    packsswb m1, [srcq +2*wq+mmsize]
    packsswb m2, [src2q+2*wq+mmsize]
    paddb    m1, m0
    paddb    m2, m0
    mova    [dstq +wq], m1
    mova    [dst2q+wq], m2
    jg      .loopx

    lea       srcq, [srcq+src_strideq*2]
    lea       dstq, [dstq+dst_strideq*2]
    sub       hd, 2
    mov       wd, wspill                ; mov leaves the flags from "sub hd, 2" intact
    jg      .loopy
    RET
%endm

; ADD_RECT suffix
;
; Emits add_rect_clamped_<suffix>:
;   dst[x] = clip_uint8(((src[x] + pw_32) >> 6) + idwt[x])
; with 16-bit arithmetic and an unsigned-saturating pack (packuswb). pw_32 is
; defined elsewhere (cextern). Width is rounded up to a multiple of mmsize and
; one row is processed per outer iteration. Inner (column) and outer (row)
; loops share the single label .loop.
%macro ADD_RECT 1
; void add_rect_clamped(uint8_t *dst, uint16_t *src, int stride, int16_t *idwt, int idwt_stride, int width, int height)
cglobal add_rect_clamped_%1, 7,9,3, dst, src, stride, idwt, idwt_stride, w, h
    mova    m0, [pw_32]
    add     wd, (mmsize-1)
    and     wd, ~(mmsize-1)

%if ARCH_X86_64
    movsxd   strideq, strided
    movsxd   idwt_strideq, idwt_strided
    mov   r8d, wd                       ; keep the rounded width for each row
    %define wspill r8d
%else
    mov    r5m, wd                      ; spill the rounded width to its argument slot
    %define wspill r5m
%endif

.loop:
    ; Flags from this sub are consumed by the first "jg .loop" below.
    sub         wd, mmsize
    movu        m1, [srcq +2*wq] ; FIXME: ensure alignment
    paddw       m1, m0
    psraw       m1, 6
    movu        m2, [srcq +2*wq+mmsize] ; FIXME: ensure alignment
    paddw       m2, m0
    psraw       m2, 6
    paddw       m1, [idwtq+2*wq]
    paddw       m2, [idwtq+2*wq+mmsize]
    packuswb    m1, m2
    mova        [dstq +wq], m1
    jg          .loop

    ; Next row: src and idwt advance by 2*stride bytes, dst by stride bytes.
    lea       srcq, [srcq + 2*strideq]
    add       dstq, strideq
    lea      idwtq, [idwtq+ 2*idwt_strideq]
    sub         hd, 1
    mov         wd, wspill              ; mov leaves the flags from "sub hd, 1" intact
    jg          .loop
    RET
%endm

; ADD_OBMC block_width, suffix
;
; Emits add_dirac_obmc<block_width>_<suffix>: for each of yblen rows,
;   dst[x] += src[x] * obmc_weight[x]      for x in [0, block_width)
; with bytes widened to 16-bit words. The inner %rep is unrolled
; block_width/mmsize times. Per row, src advances by stride bytes, dst by
; 2*stride bytes and obmc_weight by a fixed 32 bytes.
%macro ADD_OBMC 2
; void add_obmc(uint16_t *dst, uint8_t *src, int stride, uint8_t *obmc_weight, int yblen)
cglobal add_dirac_obmc%1_%2, 6,6,5, dst, src, stride, obmc, yblen
%if ARCH_X86_64
    ; stride is a 32-bit int; sign-extend it before it is used as a 64-bit
    ; address offset (same treatment as put_signed_rect_clamped and
    ; add_rect_clamped above).
    movsxd      strideq, strided
%endif
    pxor        m4, m4                  ; zero vector for the byte->word unpacks
.loop:
%assign i 0
%rep %1 / mmsize
    mova        m0, [srcq+i]
    mova        m1, m0
    punpcklbw   m0, m4
    punpckhbw   m1, m4
    mova        m2, [obmcq+i]
    mova        m3, m2
    punpcklbw   m2, m4
    punpckhbw   m3, m4
    pmullw      m0, m2
    pmullw      m1, m3
    movu        m2, [dstq+2*i]
    movu        m3, [dstq+2*i+mmsize]
    paddw       m0, m2
    paddw       m1, m3
    movu        [dstq+2*i], m0
    movu        [dstq+2*i+mmsize], m1
%assign i i+mmsize
%endrep
    lea         srcq, [srcq+strideq]
    lea         dstq, [dstq+2*strideq]
    add         obmcq, 32
    sub         yblend, 1
    jg          .loop
    RET
%endm

INIT_MMX
ADD_OBMC 8, mmx

INIT_XMM
PUT_RECT sse2
ADD_RECT sse2

HPEL_FILTER sse2
ADD_OBMC 32, sse2
ADD_OBMC 16, sse2

INIT_XMM sse4

; void dequant_subband_32(uint8_t *src, uint8_t *dst, ptrdiff_t stride, const int qf, const int qs, int tot_v, int tot_h)
;
; Per 32-bit coefficient c read from src:
;   out = sign(c) * ((|c| * qf + qs) >> 2)       (out = 0 when c == 0)
; Four coefficients are handled per inner iteration. src is consumed
; contiguously (4*tot_h bytes per row), dst advances by stride bytes per row.
cglobal dequant_subband_32, 7, 7, 4, src, dst, stride, qf, qs, tot_v, tot_h
    movd   m2, qfd
    movd   m3, qsd
    SPLATD m2                           ; broadcast qf to every dword lane
    SPLATD m3                           ; broadcast qs to every dword lane
    ; r3 and r4 are the qf/qs argument registers; their values now live in
    ; m2/m3, so the registers are reused for the row base and the column count.
    mov    r4d, tot_hd
    mov    r3, dstq

    .loop_v:
    mov    tot_hq, r4
    mov    dstq,   r3

    .loop_h:
    movu   m0, [srcq]

    pabsd  m1, m0
    pmulld m1, m2
    paddd  m1, m3
    psrld  m1,  2
    psignd m1, m0                       ; restore the sign of the input; zero stays zero

    movu   [dstq], m1

    add    srcq, mmsize
    add    dstq, mmsize
    sub    tot_hq, 4
    jg     .loop_h
    ; tot_hq is <= 0 here; step src back by the coefficients read past the row
    ; end when tot_h is not a multiple of 4.
    lea    srcq, [srcq + 4*tot_hq]

    add    r3, strideq
    dec    tot_vd
    jg     .loop_v

    RET

INIT_XMM sse4
; void put_signed_rect_clamped_10(uint8_t *dst, int dst_stride, const uint8_t *src, int src_stride, int width, int height)
;
; Reads rows of 32-bit samples from src, adds convert_to_unsigned_10bit
; (0x200) to each, packs pairs of vectors to unsigned 16-bit words and clamps
; them to [0, clip_10bit] = [0, 0x3ff] before storing to dst.
; w is converted to a byte count (w*4), src is moved to the end of the row and
; w is negated so the column index counts up towards zero.
%if ARCH_X86_64
cglobal put_signed_rect_clamped_10, 6, 8, 5, dst, dst_stride, src, src_stride, w, h, t1, t2
    ; Both strides are 32-bit ints; sign-extend them before they are added to
    ; 64-bit pointers (same treatment as put_signed_rect_clamped above).
    movsxd   dst_strideq, dst_strided
    movsxd   src_strideq, src_strided
%else
cglobal put_signed_rect_clamped_10, 5, 7, 5, dst, dst_stride, src, src_stride, w, t1, t2
    %define  hd  r5mp
%endif
    shl      wd, 2
    add    srcq, wq
    neg      wq
    mov     t2q, dstq                   ; t2 = start of the current dst row
    mov     t1q, wq                     ; t1 = negative row size in bytes
    pxor     m2, m2                     ; lower clamp bound (0)
    mova     m3, [clip_10bit]
    mova     m4, [convert_to_unsigned_10bit]

    .loop_h:
    mov    dstq, t2q
    mov      wq, t1q

    .loop_w:
    movu     m0, [srcq+wq+0*mmsize]
    movu     m1, [srcq+wq+1*mmsize]

    paddd    m0, m4
    paddd    m1, m4
    packusdw m0, m0, m1
    CLIPW    m0, m2, m3 ; packusdw saturates so it's fine

    movu     [dstq], m0

    add      dstq, 1*mmsize
    add      wq,   2*mmsize
    jl       .loop_w

    add    srcq, src_strideq
    add     t2q, dst_strideq
    sub      hd, 1
    jg       .loop_h

    RET