;*****************************************************************************
;* x86util.asm
;*****************************************************************************
;* Copyright (C) 2008-2010 x264 project
;*
;* Authors: Loren Merritt <lorenm@u.washington.edu>
;*          Holger Lubitz <holger@lubitz.org>
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

; Symbol prefixes and the mmxext cpuflag alias; they are defined ahead of the
; x86inc.asm include that follows.
%define private_prefix  ff
%define public_prefix   avpriv
%define cpuflags_mmxext cpuflags_mmx2

%include "libavutil/x86/x86inc.asm"

; expands to [base],...,[base+7*stride]
; For the list to really be eight consecutive rows, base3 has to be
; base + 3*stride and stride3 has to be 3*stride.
%define PASS8ROWS(base, base3, stride, stride3) \
    [base],           [base  + stride],   [base  + 2*stride], [base3], \
    [base3 + stride], [base3 + 2*stride], [base3 + stride3],  [base3 + stride*4]

; Interleave low src0 with low src1 and store in src0,
; interleave high src0 with high src1 and store in src1.
; %1 - types
; %2 - index of the register with src0
; %3 - index of the register with src1
; %4 - index of the register for intermediate results
; example for %1 - wd: input: src0: x0 x1 x2 x3 z0 z1 z2 z3
;                             src1: y0 y1 y2 y3 q0 q1 q2 q3
;                     output: src0: x0 y0 x1 y1 x2 y2 x3 y3
;                             src1: z0 q0 z1 q1 z2 q2 z3 q3
; The high half is produced in m%4; the closing SWAP exchanges the register
; names %3 and %4 so that the result is addressed as m%3 afterwards.
%macro SBUTTERFLY 4
%ifidn %1, dqqq
    ; 128-bit lane exchange between two 256-bit registers
    vperm2i128  m%4, m%2, m%3, q0301
    vinserti128 m%2, m%2, xm%3, 1
%elif avx_enabled == 0
    ; two-operand encoding: keep a copy of src0 for the high interleave
    mova        m%4, m%2
    punpckl%1   m%2, m%3
    punpckh%1   m%4, m%3
%else
    ; three-operand encoding: no copy required
    punpckh%1   m%4, m%2, m%3
    punpckl%1   m%2, m%3
%endif
    SWAP %3, %4
%endmacro

; Interleave variant that always uses the three-operand punpck forms.
; %1 - types, %2/%3 - source register indices, %4 - scratch register index.
; The low interleave is written to m%4, the high interleave to m%2, and the
; register names are then permuted with SWAP %2, %4, %3.
%macro SBUTTERFLY2 4
    punpckl%1   m%4, m%2, m%3
    punpckh%1   m%2, m%2, m%3
    SWAP %2, %4, %3
%endmacro

; Packed-single counterpart of SBUTTERFLY2 (unpcklps/unpckhps).
; %1/%2 - source register indices, %3 - scratch register index.
%macro SBUTTERFLYPS 3
    unpcklps    m%3, m%1, m%2
    unpckhps    m%1, m%1, m%2
    SWAP %1, %3, %2
%endmacro

; 64-bit half exchange done with movlhps/movhlps.
; %1/%2 - source register indices, %3 - scratch register index.
; m%3 receives low(m%1):low(m%2) and takes over the name %1;
; m%2 is updated in place with the two high halves.
%macro SBUTTERFLYPD 3
    movlhps     m%3, m%1, m%2
    movhlps     m%2, m%2, m%1
    SWAP %1, %3
%endmacro

; 4x4 byte transpose: two bw interleaves followed by two wd interleaves.
; %1-%4 - row register indices, %5 - scratch register index.
%macro TRANSPOSE4x4B 5
    SBUTTERFLY bw, %1, %2, %5
    SBUTTERFLY bw, %3, %4, %5
    SBUTTERFLY wd, %1, %3, %5
    SBUTTERFLY wd, %2, %4, %5
    SWAP %2, %3
%endmacro

; 4x4 word transpose: two wd interleaves followed by two dq interleaves.
; %1-%4 - row register indices, %5 - scratch register index.
%macro TRANSPOSE4x4W 5
    SBUTTERFLY wd, %1, %2, %5
    SBUTTERFLY wd, %3, %4, %5
    SBUTTERFLY dq, %1, %3, %5
    SBUTTERFLY dq, %2, %4, %5
    SWAP %2, %3
%endmacro

; Three interleave stages (bw, wd, dq) over four registers; unlike
; TRANSPOSE4x4B there is no final SWAP.
; %1-%4 - row register indices, %5 - scratch register index.
%macro TRANSPOSE2x4x4B 5
    SBUTTERFLY bw, %1, %2, %5
    SBUTTERFLY bw, %3, %4, %5
    SBUTTERFLY wd, %1, %3, %5
    SBUTTERFLY wd, %2, %4, %5
    SBUTTERFLY dq, %1, %2, %5
    SBUTTERFLY dq, %3, %4, %5
%endmacro

; Three interleave stages (wd, dq, qdq) over four registers; unlike
; TRANSPOSE4x4W there is no final SWAP.
; %1-%4 - row register indices, %5 - scratch register index.
%macro TRANSPOSE2x4x4W 5
    SBUTTERFLY wd,  %1, %2, %5
    SBUTTERFLY wd,  %3, %4, %5
    SBUTTERFLY dq,  %1, %3, %5
    SBUTTERFLY dq,  %2, %4, %5
    SBUTTERFLY qdq, %1, %2, %5
    SBUTTERFLY qdq, %3, %4, %5
%endmacro

; 4x4 dword transpose: two dq interleaves followed by two qdq interleaves.
; %1-%4 - row register indices, %5 - scratch register index.
%macro TRANSPOSE4x4D 5
    SBUTTERFLY dq,  %1, %2, %5
    SBUTTERFLY dq,  %3, %4, %5
    SBUTTERFLY qdq, %1, %3, %5
    SBUTTERFLY qdq, %2, %4, %5
    SWAP %2, %3
%endmacro

; identical behavior to TRANSPOSE4x4D, but using SSE1 float ops
%macro TRANSPOSE4x4PS 5
    SBUTTERFLYPS %1, %2, %5
    SBUTTERFLYPS %3, %4, %5
    SBUTTERFLYPD %1, %3, %5
    SBUTTERFLYPD %2, %4, %5
    SWAP %2, %3
%endmacro

; dq + qdq interleave network over eight registers.
; %1-%8 - register indices.
; x86-64: %9 is the index of a scratch register.
; x86-32: %9 and %10 are operands used as spill slots, and passing an
;         11th argument suppresses the final reload (see notes below).
%macro TRANSPOSE8x4D 9-11
%if ARCH_X86_64
    SBUTTERFLY dq,  %1, %2, %9
    SBUTTERFLY dq,  %3, %4, %9
    SBUTTERFLY dq,  %5, %6, %9
    SBUTTERFLY dq,  %7, %8, %9
    SBUTTERFLY qdq, %1, %3, %9
    SBUTTERFLY qdq, %2, %4, %9
    SBUTTERFLY qdq, %5, %7, %9
    SBUTTERFLY qdq, %6, %8, %9
    SWAP %2, %5
    SWAP %4, %7
%else
; in:  m0..m7
; out: m0..m7, unless %11 in which case m2 is in %9
; spills into %9 and %10
    movdqa  %9, m%7                 ; free m%7 for use as scratch
    SBUTTERFLY dq,  %1, %2, %7
    movdqa  %10, m%2                ; free m%2 for use as scratch
    movdqa  m%7, %9
    SBUTTERFLY dq,  %3, %4, %2
    SBUTTERFLY dq,  %5, %6, %2
    SBUTTERFLY dq,  %7, %8, %2
    SBUTTERFLY qdq, %1, %3, %2
    movdqa  %9, m%3                 ; free m%3 for use as scratch
    movdqa  m%2, %10
    SBUTTERFLY qdq, %2, %4, %3
    SBUTTERFLY qdq, %5, %7, %3
    SBUTTERFLY qdq, %6, %8, %3
    SWAP %2, %5
    SWAP %4, %7
%if %0<11
    movdqa  m%3, %9
%endif
%endif
%endmacro

; 8x8 word transpose built from wd, dq and qdq interleave stages.
; %1-%8 - register indices.
; x86-64: %9 is the index of a scratch register.
; x86-32: %9 and %10 are operands used as spill slots, and passing an
;         11th argument skips the initial store and the final reload
;         (see notes below).
%macro TRANSPOSE8x8W 9-11
%if ARCH_X86_64
    SBUTTERFLY wd,  %1, %2, %9
    SBUTTERFLY wd,  %3, %4, %9
    SBUTTERFLY wd,  %5, %6, %9
    SBUTTERFLY wd,  %7, %8, %9
    SBUTTERFLY dq,  %1, %3, %9
    SBUTTERFLY dq,  %2, %4, %9
    SBUTTERFLY dq,  %5, %7, %9
    SBUTTERFLY dq,  %6, %8, %9
    SBUTTERFLY qdq, %1, %5, %9
    SBUTTERFLY qdq, %2, %6, %9
    SBUTTERFLY qdq, %3, %7, %9
    SBUTTERFLY qdq, %4, %8, %9
    SWAP %2, %5
    SWAP %4, %7
%else
; in:  m0..m7, unless %11 in which case m6 is in %9
; out: m0..m7, unless %11 in which case m4 is in %10
; spills into %9 and %10
%if %0<11
    movdqa  %9, m%7
%endif
    SBUTTERFLY wd,  %1, %2, %7
    movdqa  %10, m%2
    movdqa  m%7, %9
    SBUTTERFLY wd,  %3, %4, %2
    SBUTTERFLY wd,  %5, %6, %2
    SBUTTERFLY wd,  %7, %8, %2
    SBUTTERFLY dq,  %1, %3, %2
    movdqa  %9, m%3
    movdqa  m%2, %10
    SBUTTERFLY dq,  %2, %4, %3
    SBUTTERFLY dq,  %5, %7, %3
    SBUTTERFLY dq,  %6, %8, %3
    SBUTTERFLY qdq, %1, %5, %3
    SBUTTERFLY qdq, %2, %6, %3
    movdqa  %10, m%2
    movdqa  m%3, %9
    SBUTTERFLY qdq, %3, %7, %2
    SBUTTERFLY qdq, %4, %8, %2
    SWAP %2, %5
    SWAP %4, %7
%if %0<11
    movdqa  m%5, %10
%endif
%endif
%endmacro

; 16x16 word transpose: a dqqq lane-exchange stage followed by wd, dq and qdq
; interleave stages on both groups of eight registers.
; %1-%16 - register indices, %17/%18 - operands used as spill slots,
; optional %19 - skips the initial store and the final reload.
%macro TRANSPOSE16x16W 18-19
; in:  m0..m15, unless %19 in which case m6 is in %17
; out: m0..m15, unless %19 in which case m4 is in %18
; spills into %17 and %18
%if %0 < 19
    mova    %17, m%7
%endif

    ; stage 1: exchange 128-bit lanes between register k and register k+8
    SBUTTERFLY dqqq, %1,  %9, %7
    SBUTTERFLY dqqq, %2, %10, %7
    SBUTTERFLY dqqq, %3, %11, %7
    SBUTTERFLY dqqq, %4, %12, %7
    SBUTTERFLY dqqq, %5, %13, %7
    SBUTTERFLY dqqq, %6, %14, %7
    mova    %18, m%14
    mova    m%7, %17
    SBUTTERFLY dqqq, %7, %15, %14
    SBUTTERFLY dqqq, %8, %16, %14

    ; stage 2: word interleave
    SBUTTERFLY wd,  %1,  %2, %14
    SBUTTERFLY wd,  %3,  %4, %14
    SBUTTERFLY wd,  %5,  %6, %14
    SBUTTERFLY wd,  %7,  %8, %14
    SBUTTERFLY wd,  %9, %10, %14
    SBUTTERFLY wd, %11, %12, %14
    mova    %17, m%12
    mova    m%14, %18
    SBUTTERFLY wd, %13, %14, %12
    SBUTTERFLY wd, %15, %16, %12

    ; stage 3: dword interleave
    SBUTTERFLY dq,  %1,  %3, %12
    SBUTTERFLY dq,  %2,  %4, %12
    SBUTTERFLY dq,  %5,  %7, %12
    SBUTTERFLY dq,  %6,  %8, %12
    SBUTTERFLY dq,  %9, %11, %12
    mova    %18, m%11
    mova    m%12, %17
    SBUTTERFLY dq, %10, %12, %11
    SBUTTERFLY dq, %13, %15, %11
    SBUTTERFLY dq, %14, %16, %11

    ; stage 4: qword interleave, first group of eight
    SBUTTERFLY qdq, %1, %5, %11
    SBUTTERFLY qdq, %2, %6, %11
    SBUTTERFLY qdq, %3, %7, %11
    SBUTTERFLY qdq, %4, %8, %11

    SWAP %2, %5
    SWAP %4, %7

    ; stage 4: qword interleave, second group of eight
    SBUTTERFLY qdq,  %9, %13, %11
    SBUTTERFLY qdq, %10, %14, %11
    mova    m%11, %18
    mova    %18, m%5
    SBUTTERFLY qdq, %11, %15, %5
    SBUTTERFLY qdq, %12, %16, %5

%if %0 < 19
    mova    m%5, %18
%endif

    SWAP %10, %13
    SWAP %12, %15
%endmacro

; Byte transpose over eight registers: adjacent pairs are byte-interleaved,
; the four results are word-transposed (with %2 as scratch), and MOVHL then
; fills each even-numbered argument from the preceding odd one.
; %1-%8 - register indices. Rejected at assembly time when mmsize == 8.
; MOVHL is not defined in this part of the file.
%macro TRANSPOSE_8X8B 8
    %if mmsize == 8
        %error "This macro does not support mmsize == 8"
    %endif
    punpcklbw   m%1, m%2
    punpcklbw   m%3, m%4
    punpcklbw   m%5, m%6
    punpcklbw   m%7, m%8
    TRANSPOSE4x4W %1, %3, %5, %7, %2
    MOVHL       m%2, m%1
    MOVHL       m%4, m%3
    MOVHL       m%6, m%5
    MOVHL       m%8, m%7
%endmacro

; PABSW macro assumes %1 != %2, while ABS1/2 macros work in-place
; %1 - dst, %2 - src. In the plain-MMX fallback the source register is
; modified and the result is handed to %1 through SWAP.
%macro PABSW 2
%if cpuflag(ssse3)
    pabsw       %1, %2
%else
    pxor        %1, %1              ; both fallbacks start from dst = 0
%if cpuflag(mmxext)
    psubw       %1, %2              ; dst = -src
    pmaxsw      %1, %2              ; dst = max(-src, src)
%else
    pcmpgtw     %1, %2              ; dst = mask of negative words
    pxor        %2, %1
    psubw       %2, %1              ; conditional negate of src
    SWAP        %1, %2
%endif
%endif
%endmacro

; %1 - src/dst, %2 - sign operand.
; NOTE(review): the fallback (xor, then subtract) only negates where %2 is an
; all-ones word and leaves %1 unchanged where %2 is zero, which is not what
; psignw does for a zero word - presumably callers prepare %2 differently
; for the two paths; confirm against the call sites.
%macro PSIGNW 2
%if cpuflag(ssse3)
    psignw      %1, %2
%else
    pxor        %1, %2
    psubw       %1, %2
%endif
%endmacro

; In-place absolute value of packed words.
%macro ABS1 2 ; a, tmp (tmp is untouched on the SSSE3 path)
%if cpuflag(ssse3)
    pabsw       %1, %1
%else
    pxor        %2, %2              ; both fallbacks start from tmp = 0
%if cpuflag(mmxext)
    psubw       %2, %1              ; tmp = -a
    pmaxsw      %1, %2              ; a = max(a, -a)
%else
    pcmpgtw     %2, %1              ; tmp = mask of negative words
    pxor        %1, %2
    psubw       %1, %2              ; conditional negate
%endif
%endif
%endmacro

; In-place absolute value of packed words for two registers at once.
%macro ABS2 4 ; a, b, tmp0, tmp1 (tmps are untouched on the SSSE3 path)
%if cpuflag(ssse3)
    pabsw       %1, %1
    pabsw       %2, %2
%else
    pxor        %3, %3              ; both fallbacks start from zeroed tmps
    pxor        %4, %4
%if cpuflag(mmxext)
    psubw       %3, %1
    psubw       %4, %2
    pmaxsw      %1, %3
    pmaxsw      %2, %4
%else
    pcmpgtw     %3, %1
    pcmpgtw     %4, %2
    pxor        %1, %3
    pxor        %2, %4
    psubw       %1, %3
    psubw       %2, %4
%endif
%endif
%endmacro

; In-place absolute value of packed bytes. The fallback takes the unsigned
; minimum of the value and its negation.
%macro ABSB 2 ; source mmreg, temp mmreg (unused for SSSE3)
%if cpuflag(ssse3)
    pabsb       %1, %1
%else
    pxor        %2, %2
    psubb       %2, %1
    pminub      %1, %2
%endif
%endmacro

; ABSB applied to two registers at once.
%macro ABSB2 4 ; src1, src2, tmp1, tmp2 (tmp1/2 unused for SSSE3)
%if cpuflag(ssse3)
    pabsb       %1, %1
    pabsb       %2, %2
%else
    pxor        %3, %3
    pxor        %4, %4
    psubb       %3, %1
    psubb       %4, %2
    pminub      %1, %3
    pminub      %2, %4
%endif
%endmacro

; In-place absolute value of packed dwords for two registers, always using
; the sign-mask sequence (compare, xor, subtract).
; %1, %2 - src/dst, %3, %4 - temporaries.
%macro ABSD2 4
    pxor        %3, %3
    pxor        %4, %4
    pcmpgtd     %3, %1
    pcmpgtd     %4, %2
    pxor        %1, %3
    pxor        %2, %4
    psubd       %1, %3
    psubd       %2, %4
%endmacro

; ABS2 applied to two register pairs that share the same temporaries.
; %1-%4 - src/dst, %5, %6 - temporaries.
%macro ABS4 6
    ABS2 %1, %2, %5, %6
    ABS2 %3, %4, %5, %6
%endmacro

; Load the byte at [%2] and replicate it across %1.
; %1 - dst, %2 - address, %3 - pshufb control (only read on the SSSE3 path).
; The dword load starts three bytes early, so the wanted byte is the top
; byte of the loaded dword.
%macro SPLATB_LOAD 3
    movd        %1, [%2-3]          ; to avoid crossing a cacheline
%if cpuflag(ssse3)
    pshufb      %1, %3
%else
    punpcklbw   %1, %1
    SPLATW      %1, %1, 3
%endif
%endmacro

; Replicate the low byte of a general-purpose register across %1.
; %1 - dst, %2 - GPR name (accessed as %2d), %3 - pshufb control (only read
; on the SSSE3 path).
%macro SPLATB_REG 3
    movd        %1, %2d
%if cpuflag(ssse3)
    pshufb      %1, %3
%else
    punpcklbw   %1, %1
    SPLATW      %1, %1, 0
%endif
%endmacro

; Horizontal add of packed dwords.
; For a 32-byte %1 both names are temporarily redefined to their xmm
; counterparts (and undefined again at the end), so the remaining steps
; operate on 16-byte registers.
%macro HADDD 2 ; sum junk
%if sizeof%1 == 32
%define %2 xmm%2
    vextracti128 %2, %1, 1
%define %1 xmm%1
    paddd       %1, %2
%endif
%if mmsize >= 16
%if cpuflag(xop) && sizeof%1 == 16
    vphadddq    %1, %1
%endif
    movhlps     %2, %1
    paddd       %1, %2
%endif
%if notcpuflag(xop) || sizeof%1 != 16
%if cpuflag(mmxext)
    PSHUFLW     %2, %1, q0032
%else ; mmx
    mova        %2, %1
    psrlq       %2, 32
%endif
    paddd       %1, %2
%endif
%undef %1
%undef %2
%endmacro

; Horizontal add of packed words. Without XOP the words are first pair-summed
; into dwords by multiplying with pw_1 (a constant defined outside this part
; of the file) and the rest is delegated to HADDD.
%macro HADDW 2 ; reg, tmp
%if cpuflag(xop) && sizeof%1 == 16
    vphaddwq    %1, %1
    movhlps     %2, %1
    paddd       %1, %2
%else
    pmaddwd     %1, [pw_1]
    HADDD       %1, %2
%endif
%endmacro

; Pairwise horizontal add of packed floats; the fallback gathers the even and
; the odd elements with two shuffles and adds them.
%macro HADDPS 3 ; dst, src, tmp
%if cpuflag(sse3)
    haddps      %1, %1, %2
%else
    movaps      %3, %1
    shufps      %1, %2, q2020
    shufps      %3, %2, q3131
    addps       %1, %3
%endif
%endmacro

; palignr with a shift/or fallback for CPUs without SSSE3.
; 4 arguments: src1/dst, src2, imm, tmp
; 5 arguments: dst, src1, src2, imm, tmp
; tmp is only used by the fallback.
%macro PALIGNR 4-5
%if cpuflag(ssse3)
%if %0==5
    palignr     %1, %2, %3, %4
%else
    palignr     %1, %2, %3
%endif
%else ; [dst,] src1, src2, imm, tmp
    %define %%dst %1
%if %0==5
%ifnidn %1, %2
    mova        %%dst, %2
%endif
    ; drop the explicit dst so that both forms share the code below
    %rotate 1
%endif
%ifnidn %4, %2
    mova        %4, %2
%endif
%if mmsize==8
    psllq       %%dst, (8-%3)*8
    psrlq       %4, %3*8
%else
    pslldq      %%dst, 16-%3
    psrldq      %4, %3
%endif
    por         %%dst, %4
%endif
%endmacro

; Packed unsigned byte average.
; %1 - src/dst, %2 - src; %3 (tmp) and %4 are only used by the plain-MMX
; fallback, which computes (a|b) - (((a^b) & %4) >> 1) and hands the result
; to %1 through SWAP.
; NOTE(review): %4 is presumably a mask that clears the low bit of every byte
; before the 64-bit shift; confirm against the call sites.
; Nothing is emitted when none of the three cpuflags is set.
%macro PAVGB 2-4
%if cpuflag(mmxext)
    pavgb       %1, %2
%elif cpuflag(3dnow)
    pavgusb     %1, %2
%elif cpuflag(mmx)
    movu        %3, %2
    por         %3, %1
    pxor        %1, %2
    pand        %1, %4
    psrlq       %1, 1
    psubb       %3, %1
    SWAP        %1, %3
%endif
%endmacro

; Forwards its whole argument list to pshufw for 8-byte registers and to
; pshuflw otherwise.
%macro PSHUFLW 1+
    %if mmsize == 8
        pshufw  %1
    %else
        pshuflw %1
    %endif
%endmacro

; Swap the two dwords of a 64-bit register.
; %1 - dst, %2 - src.
; Nothing is emitted when none of the three cpuflags is set.
%macro PSWAPD 2
%if cpuflag(mmxext)
    pshufw      %1, %2, q1032
%elif cpuflag(3dnowext)
    pswapd      %1, %2
%elif cpuflag(3dnow)
    movq        %1, %2
    psrlq       %1, 32
    punpckldq   %1, %2
%endif
%endmacro

; Split two registers of bytes into masked and shifted-down words.
; When %5 is a number it is the index of a register holding the mask;
; otherwise %5 is an operand the mask is first loaded from (into m%1).
%macro DEINTB 5 ; mask, reg1, mask, reg2, optional src to fill masks from
%ifnum %5
    pand        m%3, m%5, m%4       ; src .. y6 .. y4
    pand        m%1, m%5, m%2       ; dst .. y6 .. y4
%else
    mova        m%1, %5
    pand        m%3, m%1, m%4       ; src .. y6 .. y4
    pand        m%1, m%1, m%2       ; dst .. y6 .. y4
%endif
    psrlw       m%2, 8              ; dst .. y7 .. y5
    psrlw       m%4, 8              ; src .. y7 .. y5
%endmacro

; Sum/difference butterfly: afterwards %2 names the sum b+a and m%3 holds b-a,
; where a is the old m%2 and b the old m%3.
; %1 - element size suffix, %2/%3 - register indices,
; optional %4 - scratch register index. Without %4 the difference is derived
; as 2*b - (a+b).
%macro SUMSUB_BA 3-4
%if %0==3
    padd%1      m%2, m%3
    padd%1      m%3, m%3
    psub%1      m%3, m%2
%elif avx_enabled == 0
    mova        m%4, m%2
    padd%1      m%2, m%3
    psub%1      m%3, m%4
%else
    padd%1      m%4, m%2, m%3
    psub%1      m%3, m%2
    SWAP %2, %4
%endif
%endmacro

; Two SUMSUB_BA butterflies, on (%2, %3) and (%4, %5).
; %1 - element size suffix, optional %6 - scratch register index.
; The scratch-free form interleaves the two dependency chains.
%macro SUMSUB_BADC 5-6
%if %0==6
    SUMSUB_BA %1, %2, %3, %6
    SUMSUB_BA %1, %4, %5, %6
%else
    padd%1      m%2, m%3
    padd%1      m%4, m%5
    padd%1      m%3, m%3
    padd%1      m%5, m%5
    psub%1      m%3, m%2
    psub%1      m%5, m%4
%endif
%endmacro

; m%2 = 2*a + b, m%4 = a - 2*b, with a = old m%2 and b = %3.
; %1 - element size suffix. %3 is a register index when numeric, otherwise
; an operand that is used as written.
%macro SUMSUB2_AB 4
%ifnum %3
    psub%1      m%4, m%2, m%3
    psub%1      m%4, m%3
    padd%1      m%2, m%2
    padd%1      m%2, m%3
%else
    mova        m%4, m%2
    padd%1      m%2, m%2
    padd%1      m%2, %3
    psub%1      m%4, %3
    psub%1      m%4, %3
%endif
%endmacro

; Afterwards %2 names a + 2*b and m%3 holds b - 2*a, with a = old m%2 and
; b = old m%3.
; %1 - element size suffix, %4 - scratch register index.
%macro SUMSUB2_BA 4
%if avx_enabled == 0
    mova        m%4, m%2
    padd%1      m%2, m%3
    padd%1      m%2, m%3
    psub%1      m%3, m%4
    psub%1      m%3, m%4
%else
    padd%1      m%4, m%2, m%3
    padd%1      m%4, m%3
    psub%1      m%3, m%2
    psub%1      m%3, m%2
    SWAP %2, %4
%endif
%endmacro

; Afterwards %2 names (a>>1) - b and %3 names (b>>1) + a, with a = old m%2 and
; b = old m%3 (arithmetic shifts).
; %1 - element size suffix. %4/%5 are scratch register indices when %4 is
; numeric, otherwise operands that receive copies of the inputs.
%macro SUMSUBD2_AB 5
%ifnum %4
    psra%1      m%5, m%2, 1         ; m%5 = a>>1
    psra%1      m%4, m%3, 1         ; m%4 = b>>1
    padd%1      m%4, m%2            ; m%4 = (b>>1) + a
    psub%1      m%5, m%3            ; m%5 = (a>>1) - b
    SWAP %2, %5
    SWAP %3, %4
%else
    mova        %5, m%2
    mova        %4, m%3
    psra%1      m%3, 1              ; %3: %3>>1
    psra%1      m%2, 1              ; %2: %2>>1
    padd%1      m%3, %5             ; %3: %3>>1+%2
    psub%1      m%2, %4             ; %2: %2>>1-%3
%endif
%endmacro

; One pass of a 4-point word transform assembled from the butterflies above.
; %1-%4 - register indices. %5 is a scratch register index when numeric;
; otherwise it is an address used (as [%5]) to spill one register.
%macro DCT4_1D 5
%ifnum %5
    SUMSUB_BADC w, %4, %1, %3, %2, %5
    SUMSUB_BA   w, %3, %4, %5
    SUMSUB2_AB  w, %1, %2, %5
    SWAP %1, %3, %4, %5, %2
%else
    SUMSUB_BADC w, %4, %1, %3, %2
    SUMSUB_BA   w, %3, %4
    mova        [%5], m%2
    SUMSUB2_AB  w, %1, [%5], %2
    SWAP %1, %3, %4, %2
%endif
%endmacro

; One pass of the matching 4-point inverse transform.
; %1 - element size suffix, %2-%5 - register indices.
; When %6 is numeric, %6 and %7 are scratch register indices; otherwise %6 is
; an address providing two spill slots ([%6] and [%6+16] for words,
; [%6] and [%6+32] for any other element size).
%macro IDCT4_1D 6-7
%ifnum %6
    SUMSUBD2_AB %1, %3, %5, %7, %6
    ; %3: %3>>1-%5 %5: %3+%5>>1
    SUMSUB_BA   %1, %4, %2, %7
    ; %4: %2+%4 %2: %2-%4
    SUMSUB_BADC %1, %5, %4, %3, %2, %7
    ; %5: %2+%4 + (%3+%5>>1)
    ; %4: %2+%4 - (%3+%5>>1)
    ; %3: %2-%4 + (%3>>1-%5)
    ; %2: %2-%4 - (%3>>1-%5)
%else
%ifidn %1, w
    SUMSUBD2_AB %1, %3, %5, [%6], [%6+16]
%else
    SUMSUBD2_AB %1, %3, %5, [%6], [%6+32]
%endif
    SUMSUB_BA   %1, %4, %2
    SUMSUB_BADC %1, %5, %4, %3, %2
%endif
    SWAP %2, %5, %4
    ; %2: %2+%4 + (%3+%5>>1) row0
    ; %3: %2-%4 + (%3>>1-%5) row1
    ; %4: %2-%4 - (%3>>1-%5) row2
    ; %5: %2+%4 - (%3+%5>>1) row3
%endmacro


; Load two byte rows and produce their difference as words in %1.
; %1 - dst, %2 - tmp, %3 - register used as the high byte when widening or
; the literal "none", %4/%5 - the two source operands.
; With "none" both rows are widened against the second row's bytes, whose
; contribution to the high byte cancels in the subtraction.
%macro LOAD_DIFF 5
    movh        %1, %4
%ifidn %3, none
    movh        %2, %5
    punpcklbw   %1, %2
    punpcklbw   %2, %2
%else
    punpcklbw   %1, %3
    movh        %2, %5
    punpcklbw   %2, %3
%endif
    psubw       %1, %2
%endmacro

; Store four registers to [%5+%6]: the low 8 bytes of m%1..m%4 go to offsets
; 0..24 and the high 8 bytes to offsets 32..56.
; %1-%4 - register indices, %5 + %6 - address expression.
%macro STORE_DCT 6
    movq    [%5+%6+ 0], m%1
    movq    [%5+%6+ 8], m%2
    movq    [%5+%6+16], m%3
    movq    [%5+%6+24], m%4
    movhps  [%5+%6+32], m%1
    movhps  [%5+%6+40], m%2
    movhps  [%5+%6+48], m%3
    movhps  [%5+%6+56], m%4
%endmacro

; LOAD_DIFF over four rows. %8/%9 default to r0/r2; r1 and r3 are used as the
; row strides and r4/r5 as the fourth-row offsets. A non-zero %10 advances
; both pointers by four rows afterwards.
%macro LOAD_DIFF_8x4P 7-10 r0,r2,0 ; 4x dest, 2x temp, 2x pointer, increment?
    LOAD_DIFF m%1, m%5, m%7, [%8],      [%9]
    LOAD_DIFF m%2, m%6, m%7, [%8+r1],   [%9+r3]
    LOAD_DIFF m%3, m%5, m%7, [%8+2*r1], [%9+2*r3]
    LOAD_DIFF m%4, m%6, m%7, [%8+r4],   [%9+r5]
%if %10
    lea         %8, [%8+4*r1]
    lea         %9, [%9+4*r3]
%endif
%endmacro

; Shift two word registers right by 6, add a widened byte row to each with
; signed saturation, and pack both into %2 as unsigned bytes.
; %1, %2 - word registers, %3 - tmp, %4 - register supplying the high byte
; when widening (presumably zero; confirm), %5, %6 - byte row operands.
; A seventh argument is accepted but not used.
%macro DIFFx2 6-7
    movh        %3, %5
    punpcklbw   %3, %4
    psraw       %1, 6
    paddsw      %1, %3
    movh        %3, %6
    punpcklbw   %3, %4
    psraw       %2, 6
    paddsw      %2, %3
    packuswb    %2, %1
%endmacro

; Read-modify-write of one byte row: %4 is loaded and widened, %1 >> 6 is
; added with signed saturation, and the packed bytes are written back to %4.
; %1 - word register, %2 - tmp, %3 - register supplying the high byte when
; widening (presumably zero; confirm), %4 - memory operand.
%macro STORE_DIFF 4
    movh        %2, %4
    punpcklbw   %2, %3
    psraw       %1, 6
    paddsw      %1, %2
    packuswb    %1, %1
    movh        %4, %1
%endmacro

; Two-row variant of STORE_DIFF with a caller-chosen shift; rows are at [%7]
; and [%7+%8]. Note that %1 and %2 are shifted in place.
%macro STORE_DIFFx2 8 ; add1, add2, reg1, reg2, zero, shift, source, stride
    movh        %3, [%7]
    movh        %4, [%7+%8]
    psraw       %1, %6
    psraw       %2, %6
    punpcklbw   %3, %5
    punpcklbw   %4, %5
    paddw       %3, %1
    paddw       %4, %2
    packuswb    %3, %5
    packuswb    %4, %5
    movh        [%7], %3
    movh        [%7+%8], %4
%endmacro

; Unsigned byte minimum. Without mmxext it is computed as
; dst - saturating_sub(dst, src), which needs the third argument as scratch.
%macro PMINUB 3 ; dst, src, ignored
%if cpuflag(mmxext)
    pminub      %1, %2
%else ; dst, src, tmp
    mova        %3, %1
    psubusb     %3, %2
    psubb       %1, %3
%endif
%endmacro

; Replicate one 16-bit word of %2 across %1.
; %3 selects the word (default 0); (%3)*0x55 repeats that 2-bit index in all
; four fields of the shuffle immediate.
%macro SPLATW 2-3 0
%if cpuflag(avx2) && %3 == 0
    vpbroadcastw %1, %2
%elif mmsize == 16
    pshuflw     %1, %2, (%3)*0x55
    punpcklqdq  %1, %1
%elif cpuflag(mmxext)
    pshufw      %1, %2, (%3)*0x55
%else
    ; Plain MMX: two self-unpacks, choosing the high or low half by bit 1 and
    ; then bit 0 of the word index.
    %ifnidn %1, %2
        mova        %1, %2
    %endif
    %if %3 & 2
        punpckhwd   %1, %1
    %else
        punpcklwd   %1, %1
    %endif
    %if %3 & 1
        punpckhwd   %1, %1
    %else
        punpcklwd   %1, %1
    %endif
%endif
%endmacro

; Replicate the lowest dword of %1 across the whole register, in place.
; Emits nothing when mmsize != 8 and neither sse2 nor sse is enabled.
%macro SPLATD 1
%if mmsize == 8
    punpckldq   %1, %1
%elif cpuflag(sse2)
    pshufd      %1, %1, 0
%elif cpuflag(sse)
    shufps      %1, %1, 0
%endif
%endmacro

; Clamp unsigned bytes: dst = min(max(dst, min), max).
%macro CLIPUB 3 ;(dst, min, max)
    pmaxub      %1, %2
    pminub      %1, %3
%endmacro

; Clamp signed words: dst = min(max(dst, min), max).
%macro CLIPW 3 ;(dst, min, max)
    pmaxsw      %1, %2
    pminsw      %1, %3
%endmacro

; Signed dword minimum.
; The sse2 path goes through float: dst is converted with cvtdq2ps and %2 is
; consumed by minps as packed single-precision floats, so integers that float
; cannot represent exactly are rounded by the conversions.
; The last path is a pure integer select and needs %3 as scratch.
%macro PMINSD 3 ; dst, src, tmp/unused
%if cpuflag(sse4)
    pminsd      %1, %2
%elif cpuflag(sse2)
    cvtdq2ps    %1, %1
    minps       %1, %2
    cvtps2dq    %1, %1
%else
    mova        %3, %2
    pcmpgtd     %3, %1      ; mask = src > dst
    pxor        %1, %2
    pand        %1, %3
    pxor        %1, %2      ; mask set: keeps dst, mask clear: takes src
%endif
%endmacro

; Signed dword maximum; the fallback builds a compare mask in %3 and merges.
%macro PMAXSD 3 ; dst, src, tmp/unused
%if cpuflag(sse4)
    pmaxsd      %1, %2
%else
    mova        %3, %1
    pcmpgtd     %3, %2      ; mask = dst > src
    pand        %1, %3
    pandn       %3, %2
    por         %1, %3
%endif
%endmacro

; Clamp signed dwords to [min, max]. The operand format depends on the
; instruction set, see the per-branch notes; the optional fourth argument is
; only needed as scratch by the last branch.
%macro CLIPD 3-4
%if cpuflag(sse4); src/dst, min, max, unused
    pminsd      %1, %3
    pmaxsd      %1, %2
%elif cpuflag(sse2) ; src/dst, min (float), max (float), unused
    cvtdq2ps    %1, %1
    minps       %1, %3
    maxps       %1, %2
    cvtps2dq    %1, %1
%else ; src/dst, min, max, tmp
    PMINSD      %1, %3, %4
    PMAXSD      %1, %2, %4
%endif
%endmacro

; Broadcast one single-precision float to every element of dst.
; `%ifnum sizeof%2` tells a register source from a memory source.
%macro VBROADCASTSS 2 ; dst xmm/ymm, src m32/xmm
%if cpuflag(avx2)
    vbroadcastss %1, %2
%elif cpuflag(avx)
    %ifnum sizeof%2         ; avx1 register
        shufps      xmm%1, xmm%2, xmm%2, q0000
        %if sizeof%1 >= 32  ; mmsize>=32
            vinsertf128 %1, %1, xmm%1, 1
        %endif
    %else                   ; avx1 memory
        vbroadcastss %1, %2
    %endif
%else
    %ifnum sizeof%2         ; sse register
        shufps      %1, %2, %2, q0000
    %else                   ; sse memory
        movss       %1, %2
        shufps      %1, %1, 0
    %endif
%endif
%endmacro

; Broadcast one double-precision float to every element of dst.
%macro VBROADCASTSD 2 ; dst xmm/ymm, src m64
%if cpuflag(avx) && mmsize == 32
    vbroadcastsd %1, %2
%elif cpuflag(sse3)
    movddup     %1, %2
%else ; sse2
    movsd       %1, %2
    movlhps     %1, %1
%endif
%endmacro

; Broadcast one dword to every element of dst.
; A ymm destination on AVX1 is rejected at assembly time.
%macro VPBROADCASTD 2 ; dst xmm/ymm, src m32/xmm
%if cpuflag(avx2)
    vpbroadcastd %1, %2
%elif cpuflag(avx) && sizeof%1 >= 32
    %error vpbroadcastd not possible with ymm on avx1. try vbroadcastss
%else
    %ifnum sizeof%2         ; sse2 register
        pshufd      %1, %2, q0000
    %else                   ; sse memory
        movd        %1, %2
        pshufd      %1, %1, 0
    %endif
%endif
%endmacro

; Load a 128-bit value; it is duplicated into every 128-bit lane when the
; vector size is larger than 16 bytes, otherwise this is a plain mova.
%macro VBROADCASTI128 2 ; dst xmm/ymm, src : 128bits val
%if mmsize > 16
    vbroadcasti128 %1, %2
%else
    mova        %1, %2
%endif
%endmacro

; Emit 16 data bytes from eight word indices.
; An index >= 0x80 is emitted twice unchanged; any other index i becomes the
; byte pair 2*i, 2*i+1.
%macro SHUFFLE_MASK_W 8
    %rep 8
        %if %1>=0x80
            db %1, %1
        %else
            db %1*2
            db %1*2+1
        %endif
        %rotate 1
    %endrep
%endmacro

; Sign-extend the low words of src to dwords.
; The fallback duplicates each word into both halves of a dword and then
; shifts right arithmetically by 16.
%macro PMOVSXWD 2; dst, src
%if cpuflag(sse4)
    pmovsxwd    %1, %2
%else
    %ifnidn %1, %2
        mova        %1, %2
    %endif
    punpcklwd   %1, %1
    psrad       %1, 16
%endif
%endmacro

; Wrapper for non-FMA version of fmaddps
; %1 = %2 * %3 + %4. %5 is scratch and is only written when no FMA is
; available and %1 is the same operand as %4.
%macro FMULADD_PS 5
    %if cpuflag(fma3) || cpuflag(fma4)
        fmaddps     %1, %2, %3, %4
    %elifidn %1, %4
        mulps       %5, %2, %3
        addps       %1, %4, %5
    %else
        mulps       %1, %2, %3
        addps       %1, %4
    %endif
%endmacro

; Shift the register left by %2 bytes (expressed as a bit count of 8*%2 for
; 8-byte registers).
%macro LSHIFT 2
%if mmsize > 8
    pslldq      %1, %2
%else
    psllq       %1, 8*(%2)
%endif
%endmacro

; Shift the register right by %2 bytes (expressed as a bit count of 8*%2 for
; 8-byte registers).
%macro RSHIFT 2
%if mmsize > 8
    psrldq      %1, %2
%else
    psrlq       %1, 8*(%2)
%endif
%endmacro

; Bring the high quadword of src down into the low quadword of dst.
%macro MOVHL 2 ; dst, src
%ifidn %1, %2
    punpckhqdq  %1, %2
%elif cpuflag(avx)
    punpckhqdq  %1, %2, %2
%elif cpuflag(sse4)
    pshufd      %1, %2, q3232 ; pshufd is slow on some older CPUs, so only use it on more modern ones
%else
    movhlps     %1, %2        ; may cause an int/float domain transition and has a dependency on dst
%endif
%endmacro

; Horizontal Sum of Packed Single precision floats
; The resulting sum is in all elements.
%macro HSUMPS 2 ; dst/src, tmp
%if cpuflag(avx)
    %if sizeof%1>=32  ; avx
        ; immediate 0x01 swaps the two 128-bit lanes, so after the add both
        ; lanes hold the lane-wise sum
        vperm2f128  %2, %1, %1, (0)*16+(1)
        addps       %1, %2
    %endif
    shufps      %2, %1, %1, q1032   ; swap the 64-bit halves of each lane
    addps       %1, %2
    shufps      %2, %1, %1, q0321   ; rotate the elements by one position
    addps       %1, %2
%else  ; this form is a bit faster than the short avx-like emulation.
    movaps      %2, %1
    shufps      %1, %1, q1032
    addps       %1, %2
    movaps      %2, %1
    shufps      %1, %1, q0321
    addps       %1, %2
    ; all %1 members should be equal for as long as float a+b==b+a
%endif
%endmacro

; Emulate blendvps if not available
;
; src_b is destroyed when using emulation with logical operands
; SSE41 blendv instruction is hard coded to use xmm0 as mask
;
; The emulation computes dst ^ ((dst ^ src_b) & mask), i.e. it selects bit by
; bit, whereas blendvps only looks at the sign bit of each dword of the mask.
; Both agree only when every mask element is all-ones or all-zeros.
%macro BLENDVPS 3 ; dst/src_a, src_b, mask
%if cpuflag(avx)
    blendvps    %1, %1, %2, %3
%elif cpuflag(sse4)
    %ifnidn %3,xmm0
        %error sse41 blendvps uses xmm0 as default 3d operand, you used %3
    %endif
    blendvps    %1, %2, %3
%else
    xorps       %2, %1
    andps       %2, %3
    xorps       %1, %2
%endif
%endmacro

; Emulate pblendvb if not available
;
; src_b is destroyed when using emulation with logical operands
; SSE41 blendv instruction is hard coded to use xmm0 as mask
;
; The emulation computes dst ^ ((dst ^ src_b) & mask), i.e. it selects bit by
; bit, whereas pblendvb only looks at the top bit of each mask byte.
; Both agree only when every mask byte is 0x00 or 0xff.
%macro PBLENDVB 3 ; dst/src_a, src_b, mask
%if cpuflag(avx)
    ; the integer byte blend on ymm registers needs avx2
    %if notcpuflag(avx2) && sizeof%1 >= 32
        %error pblendvb not possible with ymm on avx1, try blendvps.
    %endif
    pblendvb    %1, %1, %2, %3
%elif cpuflag(sse4)
    %ifnidn %3,xmm0
        %error sse41 pblendvb uses xmm0 as default 3d operand, you used %3
    %endif
    pblendvb    %1, %2, %3
%else
    pxor        %2, %1
    pand        %2, %3
    pxor        %1, %2
%endif
%endmacro