;******************************************************************************
;* SIMD optimized MPEG-4 Parametric Stereo decoding functions
;*
;* Copyright (C) 2015 James Almer
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "libavutil/x86/x86util.asm"

SECTION_RODATA

; Sign-bit mask for the odd lanes (+, -, +, -); used with xorps to negate
; lanes 1 and 3 in ps_hybrid_analysis.
ps_p1m1p1m1: dd 0, 0x80000000, 0, 0x80000000

SECTION .text

;*************************************************************************
;void ff_ps_add_squares_<opt>(float *dst, const float (*src)[2], int n);
;*************************************************************************
; dst[i] += src[i][0]^2 + src[i][1]^2
;
; %1 is the XMM register count handed to cglobal.
; n is scaled to a byte count (8 bytes per src element) and used as a
; negative index counting up to zero.  Each iteration consumes 2*mmsize
; bytes of src and produces mmsize bytes of dst; the loop body runs at
; least once.  src and dst are accessed with movaps, so both must be
; 16-byte aligned.
%macro PS_ADD_SQUARES 1
cglobal ps_add_squares, 3, 3, %1, dst, src, n
    shl    nd, 3
    add  srcq, nq
    neg    nq

align 16
.loop:
    movaps m0, [srcq+nq]
    movaps m1, [srcq+nq+mmsize]
    mulps  m0, m0
    mulps  m1, m1
    HADDPS m0, m1, m2           ; x86util macro; m2 is passed as scratch
    addps  m0, [dstq]
    movaps [dstq], m0
    add  dstq, mmsize
    add    nq, mmsize*2
    jl .loop
    REP_RET
%endmacro

; NOTE(review): the register counts below look swapped -- only the non-sse3
; expansion of HADDPS is expected to need the m2 scratch register.  Left
; unchanged; confirm against x86util.asm before touching.
INIT_XMM sse
PS_ADD_SQUARES 2
INIT_XMM sse3
PS_ADD_SQUARES 3

;*******************************************************************
;void ff_ps_mul_pair_single_sse(float (*dst)[2], float (*src0)[2],
;                                   float *src1, int n);
;*******************************************************************
; dst[i][0..1] = src0[i][0..1] * src1[i]
;
; In the code the prototype's src0/src1 are named src1/src2.  Each loaded
; scalar is duplicated (unpcklps/unpckhps with itself) so that it multiplies
; both halves of one pair.  Four pairs are handled per iteration.  The pair
; input is read with unaligned loads; the scalar input and dst use aligned
; accesses.
INIT_XMM sse
cglobal ps_mul_pair_single, 4, 4, 4, dst, src1, src2, n
    shl      nd, 3
    add   src1q, nq
    add    dstq, nq
    neg      nq

align 16
.loop:
    movu     m0, [src1q+nq]
    movu     m1, [src1q+nq+mmsize]
    mova     m2, [src2q]
    mova     m3, m2
    unpcklps m2, m2             ; s0 s0 s1 s1
    unpckhps m3, m3             ; s2 s2 s3 s3
    mulps    m0, m2
    mulps    m1, m3
    mova [dstq+nq], m0
    mova [dstq+nq+mmsize], m1
    add   src2q, mmsize
    add      nq, mmsize*2
    jl .loop
    REP_RET

;***********************************************************************
;void ff_ps_stereo_interpolate_sse3(float (*l)[2], float (*r)[2],
;                                   float h[2][4], float h_step[2][4],
;                                   int len);
;***********************************************************************
; Only the first 16 bytes of h and h_step are read here.  With
; h0..h3 = h[0][0..3] (incremented by h_step[0][0..3] before every sample):
;   l'[n] = l[n]*h0 + r[n]*h2
;   r'[n] = l[n]*h1 + r[n]*h3
; applied to both floats of each pair, in place, one pair per iteration.
INIT_XMM sse3
cglobal ps_stereo_interpolate, 5, 5, 6, l, r, h, h_step, n
    movaps   m0, [hq]
    movaps   m1, [h_stepq]
    unpcklps m4, m0, m0         ; h0 h0 h1 h1
    unpckhps m0, m0             ; h2 h2 h3 h3
    unpcklps m5, m1, m1         ; matching step values
    unpckhps m1, m1
    shl      nd, 3
    add      lq, nq
    add      rq, nq
    neg      nq

align 16
.loop:
    addps    m4, m5
    addps    m0, m1
    movddup  m2, [lq+nq]        ; l pair duplicated into both halves
    movddup  m3, [rq+nq]        ; r pair duplicated into both halves
    mulps    m2, m4
    mulps    m3, m0
    addps    m2, m3
    movsd  [lq+nq], m2          ; low half  -> new l
    movhps [rq+nq], m2          ; high half -> new r
    add      nq, 8
    jl .loop
    REP_RET

;***************************************************************************
;void ps_stereo_interpolate_ipdopd_sse3(float (*l)[2], float (*r)[2],
;                                       float h[2][4], float h_step[2][4],
;                                       int len);
;***************************************************************************
; Variant of ps_stereo_interpolate that also uses the second row of h and
; h_step.  Row 0 coefficients multiply the l/r pairs as loaded; row 1
; coefficients multiply the pairs with their two floats swapped, and those
; products are combined with addsubps (subtract in even lanes, add in odd
; lanes).  Both rows are incremented by their steps before every sample.
; On x86-64 the step rows are cached in m8/m9; elsewhere they stay memory
; operands.
INIT_XMM sse3
cglobal ps_stereo_interpolate_ipdopd, 5, 5, 10, l, r, h, h_step, n
    movaps   m0, [hq]
    movaps   m1, [hq+mmsize]
%if ARCH_X86_64
    movaps   m8, [h_stepq]
    movaps   m9, [h_stepq+mmsize]
    %define  H_STEP0 m8
    %define  H_STEP1 m9
%else
    %define  H_STEP0 [h_stepq]
    %define  H_STEP1 [h_stepq+mmsize]
%endif
    shl      nd, 3
    add      lq, nq
    add      rq, nq
    neg      nq

align 16
.loop:
    addps    m0, H_STEP0
    addps    m1, H_STEP1
    movddup  m2, [lq+nq]
    movddup  m3, [rq+nq]
    shufps   m4, m2, m2, q2301  ; l pair with its two floats swapped
    shufps   m5, m3, m3, q2301  ; r pair with its two floats swapped
    unpcklps m6, m0, m0
    unpckhps m7, m0, m0
    mulps    m2, m6
    mulps    m3, m7
    unpcklps m6, m1, m1
    unpckhps m7, m1, m1
    mulps    m4, m6
    mulps    m5, m7
    addps    m2, m3
    addsubps m2, m4
    addsubps m2, m5
    movsd  [lq+nq], m2
    movhps [rq+nq], m2
    add      nq, 8
    jl .loop
    REP_RET

;**********************************************************
;void ps_hybrid_analysis_ileave_sse(float (*out)[32][2],
;                                   float in[2][38][64],
;                                   int i, int len)
;**********************************************************
; Interleaves the two planes of in into pairs in out, for columns i..63:
;   out[col][row][p] = in[p][row][col],  row = 0..31
; (in is advanced by i floats and its second plane is read at +38*64*4 bytes;
; out is advanced by i*32*2*4 bytes.)
;
; The len argument is not loaded: the len register is overwritten with the
; constant 32 << 3 (256 bytes), which serves both as the byte size of one
; in row / one out column block and as the inner-loop byte counter.
;
; The count of remaining columns (64 - i) selects the entry point:
; .loop4 handles one column, .loop8 two, .loop16 four at a time.
INIT_XMM sse
cglobal ps_hybrid_analysis_ileave, 3, 7, 5, out, in, i, len, in0, in1, tmp
    movsxdifnidn        iq, id
    mov               lend, 32 << 3
    lea                inq, [inq+iq*4]
    mov               tmpd, id
    shl               tmpd, 8
    add               outq, tmpq
    mov               tmpd, 64
    sub               tmpd, id
    mov                 id, tmpd        ; i = number of columns left

    test                id, 1
    jne .loop4
    test                id, 2
    jne .loop8

align 16
.loop16:
    mov               in0q, inq
    mov               in1q, 38*64*4
    add               in1q, in0q        ; in1 = same position, second plane
    mov               tmpd, lend

.inner_loop16:
    movaps              m0, [in0q]
    movaps              m1, [in1q]
    movaps              m2, [in0q+lenq]
    movaps              m3, [in1q+lenq]
    TRANSPOSE4x4PS 0, 1, 2, 3, 4
    movaps          [outq], m0
    movaps     [outq+lenq], m1
    movaps   [outq+lenq*2], m2
    movaps [outq+3*32*2*4], m3
    lea               in0q, [in0q+lenq*2]
    lea               in1q, [in1q+lenq*2]
    add               outq, mmsize
    sub               tmpd, mmsize
    jg .inner_loop16
    add                inq, 16
    add               outq, 3*32*2*4    ; skip the three column blocks just written
    sub                 id, 4
    jg .loop16
    RET

align 16
.loop8:
    mov               in0q, inq
    mov               in1q, 38*64*4
    add               in1q, in0q
    mov               tmpd, lend

.inner_loop8:
    movlps              m0, [in0q]
    movlps              m1, [in1q]
    movhps              m0, [in0q+lenq]
    movhps              m1, [in1q+lenq]
    SBUTTERFLYPS 0, 1, 2
    SBUTTERFLYPD 0, 1, 2
    movaps          [outq], m0
    movaps     [outq+lenq], m1
    lea               in0q, [in0q+lenq*2]
    lea               in1q, [in1q+lenq*2]
    add               outq, mmsize
    sub               tmpd, mmsize
    jg .inner_loop8
    add                inq, 8
    add               outq, lenq
    sub                 id, 2
    jg .loop16
    RET

align 16
.loop4:
    mov               in0q, inq
    mov               in1q, 38*64*4
    add               in1q, in0q
    mov               tmpd, lend

.inner_loop4:
    movss               m0, [in0q]
    movss               m1, [in1q]
    movss               m2, [in0q+lenq]
    movss               m3, [in1q+lenq]
    movlhps             m0, m1
    movlhps             m2, m3
    shufps              m0, m2, q2020
    movaps          [outq], m0
    lea               in0q, [in0q+lenq*2]
    lea               in1q, [in1q+lenq*2]
    add               outq, mmsize
    sub               tmpd, mmsize
    jg .inner_loop4
    add                inq, 4
    sub                 id, 1
    test                id, 2
    jne .loop8
    cmp                 id, 4
    jge .loop16
    RET

;***********************************************************
;void ps_hybrid_synthesis_deint_sse4(float out[2][38][64],
;                                    float (*in)[32][2],
;                                    int i, int len)
;***********************************************************
; Inverse of ps_hybrid_analysis_ileave: splits the pairs of in into the two
; planes of out, for columns i..63:
;   out[p][row][col] = in[col][row][p],  row = 0..31
;
; As in the analysis routine, the len argument is not loaded; the len
; register holds the constant 32 << 3.  Column dispatch (.loop4 / .loop8 /
; .loop16) follows the same scheme.  MOVH selects the instruction used for
; 8-byte stores.
%macro HYBRID_SYNTHESIS_DEINT 0
cglobal ps_hybrid_synthesis_deint, 3, 7, 5, out, in, i, len, out0, out1, tmp
%if cpuflag(sse4)
%define MOVH movsd
%else
%define MOVH movlps
%endif
    movsxdifnidn        iq, id
    mov               lend, 32 << 3
    lea               outq, [outq+iq*4]
    mov               tmpd, id
    shl               tmpd, 8
    add                inq, tmpq
    mov               tmpd, 64
    sub               tmpd, id
    mov                 id, tmpd        ; i = number of columns left

    test                id, 1
    jne .loop4
    test                id, 2
    jne .loop8

align 16
.loop16:
    mov              out0q, outq
    mov              out1q, 38*64*4
    add              out1q, out0q       ; out1 = same position, second plane
    mov               tmpd, lend

.inner_loop16:
    movaps              m0, [inq]
    movaps              m1, [inq+lenq]
    movaps              m2, [inq+lenq*2]
    movaps              m3, [inq+3*32*2*4]
    TRANSPOSE4x4PS 0, 1, 2, 3, 4
    movaps         [out0q], m0
    movaps         [out1q], m1
    movaps    [out0q+lenq], m2
    movaps    [out1q+lenq], m3
    lea              out0q, [out0q+lenq*2]
    lea              out1q, [out1q+lenq*2]
    add                inq, mmsize
    sub               tmpd, mmsize
    jg .inner_loop16
    add               outq, 16
    add                inq, 3*32*2*4    ; skip the three column blocks just read
    sub                 id, 4
    jg .loop16
    RET

align 16
.loop8:
    mov              out0q, outq
    mov              out1q, 38*64*4
    add              out1q, out0q
    mov               tmpd, lend

.inner_loop8:
    movaps              m0, [inq]
    movaps              m1, [inq+lenq]
    SBUTTERFLYPS 0, 1, 2
    SBUTTERFLYPD 0, 1, 2
    MOVH           [out0q], m0
    MOVH           [out1q], m1
    movhps    [out0q+lenq], m0
    movhps    [out1q+lenq], m1
    lea              out0q, [out0q+lenq*2]
    lea              out1q, [out1q+lenq*2]
    add                inq, mmsize
    sub               tmpd, mmsize
    jg .inner_loop8
    add               outq, 8
    add                inq, lenq
    sub                 id, 2
    jg .loop16
    RET

align 16
.loop4:
    mov              out0q, outq
    mov              out1q, 38*64*4
    add              out1q, out0q
    mov               tmpd, lend

.inner_loop4:
    movaps              m0, [inq]
    movss          [out0q], m0
%if cpuflag(sse4)
    extractps      [out1q], m0, 1
    extractps [out0q+lenq], m0, 2
    extractps [out1q+lenq], m0, 3
%else
    movhlps             m1, m0
    movss     [out0q+lenq], m1
    shufps              m0, m0, 0xb1
    movss          [out1q], m0
    movhlps             m1, m0
    movss     [out1q+lenq], m1
%endif
    lea              out0q, [out0q+lenq*2]
    lea              out1q, [out1q+lenq*2]
    add                inq, mmsize
    sub               tmpd, mmsize
    jg .inner_loop4
    add               outq, 4
    sub                 id, 1
    test                id, 2
    jne .loop8
    cmp                 id, 4
    jge .loop16
    RET
%endmacro

INIT_XMM sse
HYBRID_SYNTHESIS_DEINT
INIT_XMM sse4
HYBRID_SYNTHESIS_DEINT

;*******************************************************************
;void ff_ps_hybrid_analysis_<opt>(float (*out)[2], float (*in)[2],
;                                 const float (*filter)[8][2],
;                                 ptrdiff_t stride, int n);
;*******************************************************************
; PS_HYBRID_ANALYSIS_LOOP work_a, work_b, idx
;
; Loads 16 bytes of in at offset mmsize*idx and 16 bytes at offset
; mmsize*(5-idx)+8, combines the two (addsubps on sse3; xorps with the m7
; sign mask followed by subps otherwise), and multiplies both results by the
; filter vector at filterq+nq+mmsize*idx with the floats of each pair
; swapped.  For idx 0 the products are left in work_a/work_b (the caller
; passes m0/m3); for idx != 0 they are added into m0/m3.
; Clobbers m1, m2, m4; the non-sse3 path expects m7 = ps_p1m1p1m1.
%macro PS_HYBRID_ANALYSIS_LOOP 3
    movu     %1, [inq+mmsize*%3]
    movu     m1, [inq+mmsize*(5-%3)+8]
%if cpuflag(sse3)
    pshufd   %2, %1, q2301
    pshufd   m4, m1, q0123
    pshufd   m1, m1, q1032
    pshufd   m2, [filterq+nq+mmsize*%3], q2301
    addsubps %2, m4
    addsubps %1, m1
%else
    mova     m2, [filterq+nq+mmsize*%3]
    mova     %2, %1
    mova     m4, m1
    shufps   %2, %2, q2301
    shufps   m4, m4, q0123
    shufps   m1, m1, q1032
    shufps   m2, m2, q2301
    xorps    m4, m7
    xorps    m1, m7
    subps    %2, m4
    subps    %1, m1
%endif
    mulps    %2, m2
    mulps    %1, m2
%if %3
    addps    m3, %2
    addps    m0, %1
%endif
%endmacro

; Produces n output pairs.  For each one, 64 bytes of filter are consumed
; (n is scaled by 64 and used as a negative index counting up to zero), the
; same in block is read, and out advances by stride pairs (stride is scaled
; by 8 bytes).  Three PS_HYBRID_ANALYSIS_LOOP passes accumulate into m0/m3,
; the accumulators are reduced to a single pair, the pair at in+6*8 scaled
; by the float at filter offset 8*6 is added, and the low 8 bytes are stored
; with MOVH.  The loop body runs at least once.
%macro PS_HYBRID_ANALYSIS 0
cglobal ps_hybrid_analysis, 5, 5, 8, out, in, filter, stride, n
%if cpuflag(sse3)
%define MOVH movsd
%else
%define MOVH movlps
%endif
    shl strideq, 3
    shl      nd, 6
    add filterq, nq
    neg      nq
    mova     m7, [ps_p1m1p1m1]

align 16
.loop:
    PS_HYBRID_ANALYSIS_LOOP m0, m3, 0
    PS_HYBRID_ANALYSIS_LOOP m5, m6, 1
    PS_HYBRID_ANALYSIS_LOOP m5, m6, 2

%if cpuflag(sse3)
    pshufd   m3, m3, q2301
    xorps    m0, m7
    hsubps   m3, m0
    pshufd   m1, m3, q0020
    pshufd   m3, m3, q0031
    addps    m1, m3
    movsd    m2, [inq+6*8]
%else
    mova     m1, m3
    mova     m2, m0
    shufps   m1, m1, q2301
    shufps   m2, m2, q2301
    subps    m1, m3
    addps    m2, m0
    unpcklps m3, m1, m2
    unpckhps m1, m2
    addps    m1, m3
    movu     m2, [inq+6*8] ; faster than movlps and no risk of overread
%endif
    movss    m3, [filterq+nq+8*6]
    SPLATD   m3
    mulps    m2, m3
    addps    m1, m2
    MOVH [outq], m1
    add    outq, strideq
    add      nq, 64
    jl .loop
    REP_RET
%endmacro

INIT_XMM sse
PS_HYBRID_ANALYSIS
INIT_XMM sse3
PS_HYBRID_ANALYSIS