;******************************************************************************
;* x86 optimizations for PNG decoding
;*
;* Copyright (c) 2008 Loren Merritt <lorenm@u.washington.edu>
;* Copyright (c) 2012 Ronald S. Bultje <rsbultje@gmail.com>
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "libavutil/x86/x86util.asm"

SECTION_RODATA

cextern pw_255

SECTION .text

;------------------------------------------------------------------------------
; add_bytes_l2(dst, src1, src2, w)
;
; Byte-wise wrapping add of two buffers:
;     dst[i] = src1[i] + src2[i]   for 0 <= i < w
;
; Arguments (x86inc names):
;   dst, src1, src2 - buffer pointers
;   wa              - 4th argument: byte count, a signed 32-bit value
;                     (sign-extended to register width on x86-64)
; Scratch registers:
;   w               - copy of the full byte count
;   i               - running byte offset
;   wa              - reused as the end offset of the current loop and as
;                     the byte temporary (wab) of the scalar loop
;
; Three stages run back to back, each continuing at offset i:
;   1. 2*mmsize bytes per iteration
;   2. 8 bytes per iteration (only when mmsize == 16)
;   3. one byte per iteration for whatever is left
; All loops are entered through their bottom test, so a count that is zero
; or negative stores nothing.
;
; NOTE(review): stage 1 uses paddb with a memory operand on src1. In the
;   non-VEX SSE2 encoding that requires src1 to be 16-byte aligned; confirm
;   that every caller guarantees this.
; NOTE(review): the mmx instantiation necessarily leaves MMX state dirty;
;   presumably the callers issue emms - not visible in this file. The sse2
;   instantiation touches no MMX register.
;
; %1 = nr. of xmm registers used
;------------------------------------------------------------------------------
%macro ADD_BYTES_FN 1
cglobal add_bytes_l2, 4, 6, %1, dst, src1, src2, wa, w, i
%if ARCH_X86_64
    movsxd             waq, wad
%endif
    xor                 iq, iq

    ; vector loop: 2*mmsize bytes per iteration
    mov                 wq, waq                 ; w keeps the untouched count
    and                waq, ~(mmsize*2-1)       ; wa = count rounded down to 2*mmsize
    jmp .end_v
.loop_v:
    movu                m0, [src2q+iq]
    movu                m1, [src2q+iq+mmsize]
    paddb               m0, [src1q+iq]
    paddb               m1, [src1q+iq+mmsize]
    movu  [dstq+iq       ], m0
    movu  [dstq+iq+mmsize], m1
    add                 iq, mmsize*2
.end_v:
    cmp                 iq, waq
    jl .loop_v

%if mmsize == 16
    ; 8-byte loop: mops up whole qwords the 32-byte loop left behind.
    ; Done in the low halves of m0/m1 (XMM here) rather than in mm0, so this
    ; function never enters MMX state and needs no emms afterwards.
    mov                waq, wq
    and                waq, ~7                  ; wa = count rounded down to 8
    jmp .end_l
.loop_l:
    movq                m0, [src1q+iq]
    movq                m1, [src2q+iq]
    paddb               m0, m1
    movq         [dstq+iq], m0
    add                 iq, 8
.end_l:
    cmp                 iq, waq
    jl .loop_l
%endif

    ; scalar loop for leftover
    jmp .end_s
.loop_s:
    mov                wab, [src1q+iq]
    add                wab, [src2q+iq]
    mov          [dstq+iq], wab
    inc                 iq
.end_s:
    cmp                 iq, wq
    jl .loop_s
    REP_RET
%endmacro

%if ARCH_X86_32
INIT_MMX mmx
ADD_BYTES_FN 0
%endif

INIT_XMM sse2
ADD_BYTES_FN 2

;------------------------------------------------------------------------------
; add_png_paeth_prediction(dst, src, top, w, bpp)
;
; For every byte position x of the row this stores
;     dst[x] = (src[x] + pred) & 0xff
; where pred is picked from
;     a = dst[x - bpp]   (left, i.e. the value this routine wrote earlier)
;     b = top[x]         (above)
;     c = top[x - bpp]   (upper-left)
; using the distances pa = |b - c|, pb = |a - c|, pc = |a + b - 2c|:
;     pa <= min(pb, pc) -> a,   else pb <= pc -> b,   else c
;
; Arguments (x86inc names):
;   dst, src, top - row pointers; dst[-bpp..-1] and top[-bpp..-1] are read
;   w             - row length in bytes (signed 32-bit)
;   bpp           - distance in bytes between horizontally adjacent samples
;                   (signed 32-bit)
; Scratch registers:
;   end           - dst + w - (mmsize/2 - 1), the inner-loop bound
;   cntr          - index of the mmsize/2-byte slice currently processed
;
; mmsize/2 bytes are handled per step, widened to 16-bit words. When bpp is
; larger than that, the row is walked once per mmsize/2-byte slice of the
; pixel, from the highest slice down to slice 0.
;
; NOTE(review): stores are mmsize/2 bytes wide at a stride of bpp, so a store
;   can run past the current sample when bpp is smaller than mmsize/2 -
;   confirm the buffer padding callers provide.
; NOTE(review): both instantiations below are MMX code and there is no emms
;   here; presumably the callers handle it.
;
; %1 = nr. of xmm registers used
;------------------------------------------------------------------------------
%macro ADD_PAETH_PRED_FN 1
cglobal add_png_paeth_prediction, 5, 7, %1, dst, src, top, w, bpp, end, cntr
%if ARCH_X86_64
    movsxd            bppq, bppd
    movsxd              wq, wd
%endif
    lea               endq, [dstq+wq-(mmsize/2-1)]
    sub               topq, dstq                ; top/src become offsets from dst,
    sub               srcq, dstq                ; so one pointer walks all three rows
    sub               dstq, bppq                ; dst -> sample left of the first one
    pxor                m7, m7                  ; m7 = 0 for byte->word unpacking

    PUSH              dstq                      ; row start (dst - bpp), reloaded per slice
    lea              cntrq, [bppq-1]
    shr              cntrq, 2 + mmsize/16       ; cntr = index of the last slice
.bpp_loop:
    lea               dstq, [dstq+cntrq*(mmsize/2)]
    movh                m0, [dstq]              ; m0 = a (left)
    movh                m1, [topq+dstq]         ; m1 = c (upper-left)
    punpcklbw           m0, m7
    punpcklbw           m1, m7
    add               dstq, bppq                ; dst -> current sample
.loop:
    mova                m2, m1                  ; m2 = c (previous step's b)
    movh                m1, [topq+dstq]         ; m1 = b (above)
    mova                m3, m2
    punpcklbw           m1, m7
    mova                m4, m2
    psubw               m3, m1                  ; m3 = c - b
    psubw               m4, m0                  ; m4 = c - a
    mova                m5, m3
    paddw               m5, m4                  ; m5 = 2c - a - b
%if cpuflag(ssse3)
    pabsw               m3, m3                  ; m3 = pa
    pabsw               m4, m4                  ; m4 = pb
    pabsw               m5, m5                  ; m5 = pc
%else ; !cpuflag(ssse3)
    ; abs(x) as max(x, 0 - x); m7 doubles as a temporary and is re-zeroed
    psubw               m7, m5
    pmaxsw              m5, m7                  ; m5 = pc
    pxor                m6, m6
    pxor                m7, m7
    psubw               m6, m3
    psubw               m7, m4
    pmaxsw              m3, m6                  ; m3 = pa
    pmaxsw              m4, m7                  ; m4 = pb
    pxor                m7, m7
%endif ; cpuflag(ssse3)
    mova                m6, m4
    pminsw              m6, m5                  ; m6 = min(pb, pc)
    pcmpgtw             m3, m6                  ; m3 = mask: pa > min(pb, pc)
    pcmpgtw             m4, m5                  ; m4 = mask: pb > pc
    mova                m6, m4
    pand                m4, m3                  ; m4 = mask: pick c
    pandn               m6, m3                  ; m6 = mask: pick b
    pandn               m3, m0                  ; m3 = a where a is picked, else 0
    movh                m0, [srcq+dstq]         ; m0 = src bytes to be un-filtered
    pand                m6, m1                  ; m6 = b where b is picked, else 0
    pand                m2, m4                  ; m2 = c where c is picked, else 0
    punpcklbw           m0, m7
    paddw               m0, m6
    paddw               m3, m2
    paddw               m0, m3                  ; m0 = src + pred
    pand                m0, [pw_255]            ; presumably words of 255: wrap to 8 bits
    mova                m3, m0                  ; m0 stays unpacked as the next step's a
    packuswb            m3, m3
    movh            [dstq], m3
    add               dstq, bppq
    ; NOTE(review): jl is a signed comparison, applied here to addresses
    cmp               dstq, endq
    jl .loop

    mov               dstq, [rsp]               ; back to the saved row start
    dec              cntrq
    jge .bpp_loop                               ; continue while cntr >= 0
    POP               dstq
    RET
%endmacro

INIT_MMX mmxext
ADD_PAETH_PRED_FN 0

INIT_MMX ssse3
ADD_PAETH_PRED_FN 0