1cabdff1aSopenharmony_ci/* 2cabdff1aSopenharmony_ci * MMX and SSE2 optimized snow DSP utils 3cabdff1aSopenharmony_ci * Copyright (c) 2005-2006 Robert Edele <yartrebo@earthlink.net> 4cabdff1aSopenharmony_ci * 5cabdff1aSopenharmony_ci * This file is part of FFmpeg. 6cabdff1aSopenharmony_ci * 7cabdff1aSopenharmony_ci * FFmpeg is free software; you can redistribute it and/or 8cabdff1aSopenharmony_ci * modify it under the terms of the GNU Lesser General Public 9cabdff1aSopenharmony_ci * License as published by the Free Software Foundation; either 10cabdff1aSopenharmony_ci * version 2.1 of the License, or (at your option) any later version. 11cabdff1aSopenharmony_ci * 12cabdff1aSopenharmony_ci * FFmpeg is distributed in the hope that it will be useful, 13cabdff1aSopenharmony_ci * but WITHOUT ANY WARRANTY; without even the implied warranty of 14cabdff1aSopenharmony_ci * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 15cabdff1aSopenharmony_ci * Lesser General Public License for more details. 16cabdff1aSopenharmony_ci * 17cabdff1aSopenharmony_ci * You should have received a copy of the GNU Lesser General Public 18cabdff1aSopenharmony_ci * License along with FFmpeg; if not, write to the Free Software 19cabdff1aSopenharmony_ci * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA 20cabdff1aSopenharmony_ci */ 21cabdff1aSopenharmony_ci 22cabdff1aSopenharmony_ci#include <stdint.h> 23cabdff1aSopenharmony_ci#include "config.h" 24cabdff1aSopenharmony_ci#include "libavutil/attributes.h" 25cabdff1aSopenharmony_ci#include "libavutil/cpu.h" 26cabdff1aSopenharmony_ci#include "libavutil/x86/asm.h" 27cabdff1aSopenharmony_ci#include "libavcodec/snow.h" 28cabdff1aSopenharmony_ci#include "libavcodec/snow_dwt.h" 29cabdff1aSopenharmony_ci 30cabdff1aSopenharmony_ci#if HAVE_INLINE_ASM 31cabdff1aSopenharmony_ci 32cabdff1aSopenharmony_cistatic void ff_snow_horizontal_compose97i_sse2(IDWTELEM *b, IDWTELEM *temp, int width){ 33cabdff1aSopenharmony_ci const int w2= (width+1)>>1; 34cabdff1aSopenharmony_ci const int w_l= (width>>1); 35cabdff1aSopenharmony_ci const int w_r= w2 - 1; 36cabdff1aSopenharmony_ci int i; 37cabdff1aSopenharmony_ci 38cabdff1aSopenharmony_ci { // Lift 0 39cabdff1aSopenharmony_ci IDWTELEM * const ref = b + w2 - 1; 40cabdff1aSopenharmony_ci IDWTELEM b_0 = b[0]; //By allowing the first entry in b[0] to be calculated twice 41cabdff1aSopenharmony_ci // (the first time erroneously), we allow the SSE2 code to run an extra pass. 42cabdff1aSopenharmony_ci // The savings in code and time are well worth having to store this value and 43cabdff1aSopenharmony_ci // calculate b[0] correctly afterwards. 44cabdff1aSopenharmony_ci 45cabdff1aSopenharmony_ci i = 0; 46cabdff1aSopenharmony_ci __asm__ volatile( 47cabdff1aSopenharmony_ci "pcmpeqd %%xmm7, %%xmm7 \n\t" 48cabdff1aSopenharmony_ci "pcmpeqd %%xmm3, %%xmm3 \n\t" 49cabdff1aSopenharmony_ci "psllw $1, %%xmm3 \n\t" 50cabdff1aSopenharmony_ci "paddw %%xmm7, %%xmm3 \n\t" 51cabdff1aSopenharmony_ci "psllw $13, %%xmm3 \n\t" 52cabdff1aSopenharmony_ci ::); 53cabdff1aSopenharmony_ci for(; i<w_l-15; i+=16){ 54cabdff1aSopenharmony_ci __asm__ volatile( 55cabdff1aSopenharmony_ci "movdqu (%1), %%xmm1 \n\t" 56cabdff1aSopenharmony_ci "movdqu 16(%1), %%xmm5 \n\t" 57cabdff1aSopenharmony_ci "movdqu 2(%1), %%xmm2 \n\t" 58cabdff1aSopenharmony_ci "movdqu 18(%1), %%xmm6 \n\t" 59cabdff1aSopenharmony_ci "paddw %%xmm1, %%xmm2 \n\t" 60cabdff1aSopenharmony_ci "paddw %%xmm5, %%xmm6 \n\t" 61cabdff1aSopenharmony_ci "paddw %%xmm7, %%xmm2 \n\t" 62cabdff1aSopenharmony_ci "paddw %%xmm7, %%xmm6 \n\t" 63cabdff1aSopenharmony_ci "pmulhw %%xmm3, %%xmm2 \n\t" 64cabdff1aSopenharmony_ci "pmulhw %%xmm3, %%xmm6 \n\t" 65cabdff1aSopenharmony_ci "paddw (%0), %%xmm2 \n\t" 66cabdff1aSopenharmony_ci "paddw 16(%0), %%xmm6 \n\t" 67cabdff1aSopenharmony_ci "movdqa %%xmm2, (%0) \n\t" 68cabdff1aSopenharmony_ci "movdqa %%xmm6, 16(%0) \n\t" 69cabdff1aSopenharmony_ci :: "r"(&b[i]), "r"(&ref[i]) 70cabdff1aSopenharmony_ci : "memory" 71cabdff1aSopenharmony_ci ); 72cabdff1aSopenharmony_ci } 73cabdff1aSopenharmony_ci snow_horizontal_compose_lift_lead_out(i, b, b, ref, width, w_l, 0, W_DM, W_DO, W_DS); 74cabdff1aSopenharmony_ci b[0] = b_0 - ((W_DM * 2 * ref[1]+W_DO)>>W_DS); 75cabdff1aSopenharmony_ci } 76cabdff1aSopenharmony_ci 77cabdff1aSopenharmony_ci { // Lift 1 78cabdff1aSopenharmony_ci IDWTELEM * const dst = b+w2; 79cabdff1aSopenharmony_ci 80cabdff1aSopenharmony_ci i = 0; 81cabdff1aSopenharmony_ci for(; (((x86_reg)&dst[i]) & 0x1F) && i<w_r; i++){ 82cabdff1aSopenharmony_ci dst[i] = dst[i] - (b[i] + b[i + 1]); 83cabdff1aSopenharmony_ci } 84cabdff1aSopenharmony_ci for(; i<w_r-15; i+=16){ 85cabdff1aSopenharmony_ci __asm__ volatile( 86cabdff1aSopenharmony_ci "movdqu (%1), %%xmm1 \n\t" 87cabdff1aSopenharmony_ci "movdqu 16(%1), %%xmm5 \n\t" 88cabdff1aSopenharmony_ci "movdqu 2(%1), %%xmm2 \n\t" 89cabdff1aSopenharmony_ci "movdqu 18(%1), %%xmm6 \n\t" 90cabdff1aSopenharmony_ci "paddw %%xmm1, %%xmm2 \n\t" 91cabdff1aSopenharmony_ci "paddw %%xmm5, %%xmm6 \n\t" 92cabdff1aSopenharmony_ci "movdqa (%0), %%xmm0 \n\t" 93cabdff1aSopenharmony_ci "movdqa 16(%0), %%xmm4 \n\t" 94cabdff1aSopenharmony_ci "psubw %%xmm2, %%xmm0 \n\t" 95cabdff1aSopenharmony_ci "psubw %%xmm6, %%xmm4 \n\t" 96cabdff1aSopenharmony_ci "movdqa %%xmm0, (%0) \n\t" 97cabdff1aSopenharmony_ci "movdqa %%xmm4, 16(%0) \n\t" 98cabdff1aSopenharmony_ci :: "r"(&dst[i]), "r"(&b[i]) 99cabdff1aSopenharmony_ci : "memory" 100cabdff1aSopenharmony_ci ); 101cabdff1aSopenharmony_ci } 102cabdff1aSopenharmony_ci snow_horizontal_compose_lift_lead_out(i, dst, dst, b, width, w_r, 1, W_CM, W_CO, W_CS); 103cabdff1aSopenharmony_ci } 104cabdff1aSopenharmony_ci 105cabdff1aSopenharmony_ci { // Lift 2 106cabdff1aSopenharmony_ci IDWTELEM * const ref = b+w2 - 1; 107cabdff1aSopenharmony_ci IDWTELEM b_0 = b[0]; 108cabdff1aSopenharmony_ci 109cabdff1aSopenharmony_ci i = 0; 110cabdff1aSopenharmony_ci __asm__ volatile( 111cabdff1aSopenharmony_ci "psllw $15, %%xmm7 \n\t" 112cabdff1aSopenharmony_ci "pcmpeqw %%xmm6, %%xmm6 \n\t" 113cabdff1aSopenharmony_ci "psrlw $13, %%xmm6 \n\t" 114cabdff1aSopenharmony_ci "paddw %%xmm7, %%xmm6 \n\t" 115cabdff1aSopenharmony_ci ::); 116cabdff1aSopenharmony_ci for(; i<w_l-15; i+=16){ 117cabdff1aSopenharmony_ci __asm__ volatile( 118cabdff1aSopenharmony_ci "movdqu (%1), %%xmm0 \n\t" 119cabdff1aSopenharmony_ci "movdqu 16(%1), %%xmm4 \n\t" 120cabdff1aSopenharmony_ci "movdqu 2(%1), %%xmm1 \n\t" 121cabdff1aSopenharmony_ci "movdqu 18(%1), %%xmm5 \n\t" //FIXME try aligned reads and shifts 122cabdff1aSopenharmony_ci "paddw %%xmm6, %%xmm0 \n\t" 123cabdff1aSopenharmony_ci "paddw %%xmm6, %%xmm4 \n\t" 124cabdff1aSopenharmony_ci "paddw %%xmm7, %%xmm1 \n\t" 125cabdff1aSopenharmony_ci "paddw %%xmm7, %%xmm5 \n\t" 126cabdff1aSopenharmony_ci "pavgw %%xmm1, %%xmm0 \n\t" 127cabdff1aSopenharmony_ci "pavgw %%xmm5, %%xmm4 \n\t" 128cabdff1aSopenharmony_ci "psubw %%xmm7, %%xmm0 \n\t" 129cabdff1aSopenharmony_ci "psubw %%xmm7, %%xmm4 \n\t" 130cabdff1aSopenharmony_ci "psraw $1, %%xmm0 \n\t" 131cabdff1aSopenharmony_ci "psraw $1, %%xmm4 \n\t" 132cabdff1aSopenharmony_ci "movdqa (%0), %%xmm1 \n\t" 133cabdff1aSopenharmony_ci "movdqa 16(%0), %%xmm5 \n\t" 134cabdff1aSopenharmony_ci "paddw %%xmm1, %%xmm0 \n\t" 135cabdff1aSopenharmony_ci "paddw %%xmm5, %%xmm4 \n\t" 136cabdff1aSopenharmony_ci "psraw $2, %%xmm0 \n\t" 137cabdff1aSopenharmony_ci "psraw $2, %%xmm4 \n\t" 138cabdff1aSopenharmony_ci "paddw %%xmm1, %%xmm0 \n\t" 139cabdff1aSopenharmony_ci "paddw %%xmm5, %%xmm4 \n\t" 140cabdff1aSopenharmony_ci "movdqa %%xmm0, (%0) \n\t" 141cabdff1aSopenharmony_ci "movdqa %%xmm4, 16(%0) \n\t" 142cabdff1aSopenharmony_ci :: "r"(&b[i]), "r"(&ref[i]) 143cabdff1aSopenharmony_ci : "memory" 144cabdff1aSopenharmony_ci ); 145cabdff1aSopenharmony_ci } 146cabdff1aSopenharmony_ci snow_horizontal_compose_liftS_lead_out(i, b, b, ref, width, w_l); 147cabdff1aSopenharmony_ci b[0] = b_0 + ((2 * ref[1] + W_BO-1 + 4 * b_0) >> W_BS); 148cabdff1aSopenharmony_ci } 149cabdff1aSopenharmony_ci 150cabdff1aSopenharmony_ci { // Lift 3 151cabdff1aSopenharmony_ci IDWTELEM * const src = b+w2; 152cabdff1aSopenharmony_ci 153cabdff1aSopenharmony_ci i = 0; 154cabdff1aSopenharmony_ci for(; (((x86_reg)&temp[i]) & 0x1F) && i<w_r; i++){ 155cabdff1aSopenharmony_ci temp[i] = src[i] - ((-W_AM*(b[i] + b[i+1]))>>W_AS); 156cabdff1aSopenharmony_ci } 157cabdff1aSopenharmony_ci for(; i<w_r-7; i+=8){ 158cabdff1aSopenharmony_ci __asm__ volatile( 159cabdff1aSopenharmony_ci "movdqu 2(%1), %%xmm2 \n\t" 160cabdff1aSopenharmony_ci "movdqu 18(%1), %%xmm6 \n\t" 161cabdff1aSopenharmony_ci "paddw (%1), %%xmm2 \n\t" 162cabdff1aSopenharmony_ci "paddw 16(%1), %%xmm6 \n\t" 163cabdff1aSopenharmony_ci "movdqu (%0), %%xmm0 \n\t" 164cabdff1aSopenharmony_ci "movdqu 16(%0), %%xmm4 \n\t" 165cabdff1aSopenharmony_ci "paddw %%xmm2, %%xmm0 \n\t" 166cabdff1aSopenharmony_ci "paddw %%xmm6, %%xmm4 \n\t" 167cabdff1aSopenharmony_ci "psraw $1, %%xmm2 \n\t" 168cabdff1aSopenharmony_ci "psraw $1, %%xmm6 \n\t" 169cabdff1aSopenharmony_ci "paddw %%xmm0, %%xmm2 \n\t" 170cabdff1aSopenharmony_ci "paddw %%xmm4, %%xmm6 \n\t" 171cabdff1aSopenharmony_ci "movdqa %%xmm2, (%2) \n\t" 172cabdff1aSopenharmony_ci "movdqa %%xmm6, 16(%2) \n\t" 173cabdff1aSopenharmony_ci :: "r"(&src[i]), "r"(&b[i]), "r"(&temp[i]) 174cabdff1aSopenharmony_ci : "memory" 175cabdff1aSopenharmony_ci ); 176cabdff1aSopenharmony_ci } 177cabdff1aSopenharmony_ci snow_horizontal_compose_lift_lead_out(i, temp, src, b, width, w_r, 1, -W_AM, W_AO+1, W_AS); 178cabdff1aSopenharmony_ci } 179cabdff1aSopenharmony_ci 180cabdff1aSopenharmony_ci { 181cabdff1aSopenharmony_ci snow_interleave_line_header(&i, width, b, temp); 182cabdff1aSopenharmony_ci 183cabdff1aSopenharmony_ci for (; (i & 0x3E) != 0x3E; i-=2){ 184cabdff1aSopenharmony_ci b[i+1] = temp[i>>1]; 185cabdff1aSopenharmony_ci b[i] = b[i>>1]; 186cabdff1aSopenharmony_ci } 187cabdff1aSopenharmony_ci for (i-=62; i>=0; i-=64){ 188cabdff1aSopenharmony_ci __asm__ volatile( 189cabdff1aSopenharmony_ci "movdqa (%1), %%xmm0 \n\t" 190cabdff1aSopenharmony_ci "movdqa 16(%1), %%xmm2 \n\t" 191cabdff1aSopenharmony_ci "movdqa 32(%1), %%xmm4 \n\t" 192cabdff1aSopenharmony_ci "movdqa 48(%1), %%xmm6 \n\t" 193cabdff1aSopenharmony_ci "movdqa (%1), %%xmm1 \n\t" 194cabdff1aSopenharmony_ci "movdqa 16(%1), %%xmm3 \n\t" 195cabdff1aSopenharmony_ci "movdqa 32(%1), %%xmm5 \n\t" 196cabdff1aSopenharmony_ci "movdqa 48(%1), %%xmm7 \n\t" 197cabdff1aSopenharmony_ci "punpcklwd (%2), %%xmm0 \n\t" 198cabdff1aSopenharmony_ci "punpcklwd 16(%2), %%xmm2 \n\t" 199cabdff1aSopenharmony_ci "punpcklwd 32(%2), %%xmm4 \n\t" 200cabdff1aSopenharmony_ci "punpcklwd 48(%2), %%xmm6 \n\t" 201cabdff1aSopenharmony_ci "movdqa %%xmm0, (%0) \n\t" 202cabdff1aSopenharmony_ci "movdqa %%xmm2, 32(%0) \n\t" 203cabdff1aSopenharmony_ci "movdqa %%xmm4, 64(%0) \n\t" 204cabdff1aSopenharmony_ci "movdqa %%xmm6, 96(%0) \n\t" 205cabdff1aSopenharmony_ci "punpckhwd (%2), %%xmm1 \n\t" 206cabdff1aSopenharmony_ci "punpckhwd 16(%2), %%xmm3 \n\t" 207cabdff1aSopenharmony_ci "punpckhwd 32(%2), %%xmm5 \n\t" 208cabdff1aSopenharmony_ci "punpckhwd 48(%2), %%xmm7 \n\t" 209cabdff1aSopenharmony_ci "movdqa %%xmm1, 16(%0) \n\t" 210cabdff1aSopenharmony_ci "movdqa %%xmm3, 48(%0) \n\t" 211cabdff1aSopenharmony_ci "movdqa %%xmm5, 80(%0) \n\t" 212cabdff1aSopenharmony_ci "movdqa %%xmm7, 112(%0) \n\t" 213cabdff1aSopenharmony_ci :: "r"(&(b)[i]), "r"(&(b)[i>>1]), "r"(&(temp)[i>>1]) 214cabdff1aSopenharmony_ci : "memory" 215cabdff1aSopenharmony_ci ); 216cabdff1aSopenharmony_ci } 217cabdff1aSopenharmony_ci } 218cabdff1aSopenharmony_ci} 219cabdff1aSopenharmony_ci 220cabdff1aSopenharmony_cistatic void ff_snow_horizontal_compose97i_mmx(IDWTELEM *b, IDWTELEM *temp, int width){ 221cabdff1aSopenharmony_ci const int w2= (width+1)>>1; 222cabdff1aSopenharmony_ci const int w_l= (width>>1); 223cabdff1aSopenharmony_ci const int w_r= w2 - 1; 224cabdff1aSopenharmony_ci int i; 225cabdff1aSopenharmony_ci 226cabdff1aSopenharmony_ci { // Lift 0 227cabdff1aSopenharmony_ci IDWTELEM * const ref = b + w2 - 1; 228cabdff1aSopenharmony_ci 229cabdff1aSopenharmony_ci i = 1; 230cabdff1aSopenharmony_ci b[0] = b[0] - ((W_DM * 2 * ref[1]+W_DO)>>W_DS); 231cabdff1aSopenharmony_ci __asm__ volatile( 232cabdff1aSopenharmony_ci "pcmpeqw %%mm7, %%mm7 \n\t" 233cabdff1aSopenharmony_ci "pcmpeqw %%mm3, %%mm3 \n\t" 234cabdff1aSopenharmony_ci "psllw $1, %%mm3 \n\t" 235cabdff1aSopenharmony_ci "paddw %%mm7, %%mm3 \n\t" 236cabdff1aSopenharmony_ci "psllw $13, %%mm3 \n\t" 237cabdff1aSopenharmony_ci ::); 238cabdff1aSopenharmony_ci for(; i<w_l-7; i+=8){ 239cabdff1aSopenharmony_ci __asm__ volatile( 240cabdff1aSopenharmony_ci "movq (%1), %%mm2 \n\t" 241cabdff1aSopenharmony_ci "movq 8(%1), %%mm6 \n\t" 242cabdff1aSopenharmony_ci "paddw 2(%1), %%mm2 \n\t" 243cabdff1aSopenharmony_ci "paddw 10(%1), %%mm6 \n\t" 244cabdff1aSopenharmony_ci "paddw %%mm7, %%mm2 \n\t" 245cabdff1aSopenharmony_ci "paddw %%mm7, %%mm6 \n\t" 246cabdff1aSopenharmony_ci "pmulhw %%mm3, %%mm2 \n\t" 247cabdff1aSopenharmony_ci "pmulhw %%mm3, %%mm6 \n\t" 248cabdff1aSopenharmony_ci "paddw (%0), %%mm2 \n\t" 249cabdff1aSopenharmony_ci "paddw 8(%0), %%mm6 \n\t" 250cabdff1aSopenharmony_ci "movq %%mm2, (%0) \n\t" 251cabdff1aSopenharmony_ci "movq %%mm6, 8(%0) \n\t" 252cabdff1aSopenharmony_ci :: "r"(&b[i]), "r"(&ref[i]) 253cabdff1aSopenharmony_ci : "memory" 254cabdff1aSopenharmony_ci ); 255cabdff1aSopenharmony_ci } 256cabdff1aSopenharmony_ci snow_horizontal_compose_lift_lead_out(i, b, b, ref, width, w_l, 0, W_DM, W_DO, W_DS); 257cabdff1aSopenharmony_ci } 258cabdff1aSopenharmony_ci 259cabdff1aSopenharmony_ci { // Lift 1 260cabdff1aSopenharmony_ci IDWTELEM * const dst = b+w2; 261cabdff1aSopenharmony_ci 262cabdff1aSopenharmony_ci i = 0; 263cabdff1aSopenharmony_ci for(; i<w_r-7; i+=8){ 264cabdff1aSopenharmony_ci __asm__ volatile( 265cabdff1aSopenharmony_ci "movq (%1), %%mm2 \n\t" 266cabdff1aSopenharmony_ci "movq 8(%1), %%mm6 \n\t" 267cabdff1aSopenharmony_ci "paddw 2(%1), %%mm2 \n\t" 268cabdff1aSopenharmony_ci "paddw 10(%1), %%mm6 \n\t" 269cabdff1aSopenharmony_ci "movq (%0), %%mm0 \n\t" 270cabdff1aSopenharmony_ci "movq 8(%0), %%mm4 \n\t" 271cabdff1aSopenharmony_ci "psubw %%mm2, %%mm0 \n\t" 272cabdff1aSopenharmony_ci "psubw %%mm6, %%mm4 \n\t" 273cabdff1aSopenharmony_ci "movq %%mm0, (%0) \n\t" 274cabdff1aSopenharmony_ci "movq %%mm4, 8(%0) \n\t" 275cabdff1aSopenharmony_ci :: "r"(&dst[i]), "r"(&b[i]) 276cabdff1aSopenharmony_ci : "memory" 277cabdff1aSopenharmony_ci ); 278cabdff1aSopenharmony_ci } 279cabdff1aSopenharmony_ci snow_horizontal_compose_lift_lead_out(i, dst, dst, b, width, w_r, 1, W_CM, W_CO, W_CS); 280cabdff1aSopenharmony_ci } 281cabdff1aSopenharmony_ci 282cabdff1aSopenharmony_ci { // Lift 2 283cabdff1aSopenharmony_ci IDWTELEM * const ref = b+w2 - 1; 284cabdff1aSopenharmony_ci 285cabdff1aSopenharmony_ci i = 1; 286cabdff1aSopenharmony_ci b[0] = b[0] + (((2 * ref[1] + W_BO) + 4 * b[0]) >> W_BS); 287cabdff1aSopenharmony_ci __asm__ volatile( 288cabdff1aSopenharmony_ci "psllw $15, %%mm7 \n\t" 289cabdff1aSopenharmony_ci "pcmpeqw %%mm6, %%mm6 \n\t" 290cabdff1aSopenharmony_ci "psrlw $13, %%mm6 \n\t" 291cabdff1aSopenharmony_ci "paddw %%mm7, %%mm6 \n\t" 292cabdff1aSopenharmony_ci ::); 293cabdff1aSopenharmony_ci for(; i<w_l-7; i+=8){ 294cabdff1aSopenharmony_ci __asm__ volatile( 295cabdff1aSopenharmony_ci "movq (%1), %%mm0 \n\t" 296cabdff1aSopenharmony_ci "movq 8(%1), %%mm4 \n\t" 297cabdff1aSopenharmony_ci "movq 2(%1), %%mm1 \n\t" 298cabdff1aSopenharmony_ci "movq 10(%1), %%mm5 \n\t" 299cabdff1aSopenharmony_ci "paddw %%mm6, %%mm0 \n\t" 300cabdff1aSopenharmony_ci "paddw %%mm6, %%mm4 \n\t" 301cabdff1aSopenharmony_ci "paddw %%mm7, %%mm1 \n\t" 302cabdff1aSopenharmony_ci "paddw %%mm7, %%mm5 \n\t" 303cabdff1aSopenharmony_ci "pavgw %%mm1, %%mm0 \n\t" 304cabdff1aSopenharmony_ci "pavgw %%mm5, %%mm4 \n\t" 305cabdff1aSopenharmony_ci "psubw %%mm7, %%mm0 \n\t" 306cabdff1aSopenharmony_ci "psubw %%mm7, %%mm4 \n\t" 307cabdff1aSopenharmony_ci "psraw $1, %%mm0 \n\t" 308cabdff1aSopenharmony_ci "psraw $1, %%mm4 \n\t" 309cabdff1aSopenharmony_ci "movq (%0), %%mm1 \n\t" 310cabdff1aSopenharmony_ci "movq 8(%0), %%mm5 \n\t" 311cabdff1aSopenharmony_ci "paddw %%mm1, %%mm0 \n\t" 312cabdff1aSopenharmony_ci "paddw %%mm5, %%mm4 \n\t" 313cabdff1aSopenharmony_ci "psraw $2, %%mm0 \n\t" 314cabdff1aSopenharmony_ci "psraw $2, %%mm4 \n\t" 315cabdff1aSopenharmony_ci "paddw %%mm1, %%mm0 \n\t" 316cabdff1aSopenharmony_ci "paddw %%mm5, %%mm4 \n\t" 317cabdff1aSopenharmony_ci "movq %%mm0, (%0) \n\t" 318cabdff1aSopenharmony_ci "movq %%mm4, 8(%0) \n\t" 319cabdff1aSopenharmony_ci :: "r"(&b[i]), "r"(&ref[i]) 320cabdff1aSopenharmony_ci : "memory" 321cabdff1aSopenharmony_ci ); 322cabdff1aSopenharmony_ci } 323cabdff1aSopenharmony_ci snow_horizontal_compose_liftS_lead_out(i, b, b, ref, width, w_l); 324cabdff1aSopenharmony_ci } 325cabdff1aSopenharmony_ci 326cabdff1aSopenharmony_ci { // Lift 3 327cabdff1aSopenharmony_ci IDWTELEM * const src = b+w2; 328cabdff1aSopenharmony_ci i = 0; 329cabdff1aSopenharmony_ci 330cabdff1aSopenharmony_ci for(; i<w_r-7; i+=8){ 331cabdff1aSopenharmony_ci __asm__ volatile( 332cabdff1aSopenharmony_ci "movq 2(%1), %%mm2 \n\t" 333cabdff1aSopenharmony_ci "movq 10(%1), %%mm6 \n\t" 334cabdff1aSopenharmony_ci "paddw (%1), %%mm2 \n\t" 335cabdff1aSopenharmony_ci "paddw 8(%1), %%mm6 \n\t" 336cabdff1aSopenharmony_ci "movq (%0), %%mm0 \n\t" 337cabdff1aSopenharmony_ci "movq 8(%0), %%mm4 \n\t" 338cabdff1aSopenharmony_ci "paddw %%mm2, %%mm0 \n\t" 339cabdff1aSopenharmony_ci "paddw %%mm6, %%mm4 \n\t" 340cabdff1aSopenharmony_ci "psraw $1, %%mm2 \n\t" 341cabdff1aSopenharmony_ci "psraw $1, %%mm6 \n\t" 342cabdff1aSopenharmony_ci "paddw %%mm0, %%mm2 \n\t" 343cabdff1aSopenharmony_ci "paddw %%mm4, %%mm6 \n\t" 344cabdff1aSopenharmony_ci "movq %%mm2, (%2) \n\t" 345cabdff1aSopenharmony_ci "movq %%mm6, 8(%2) \n\t" 346cabdff1aSopenharmony_ci :: "r"(&src[i]), "r"(&b[i]), "r"(&temp[i]) 347cabdff1aSopenharmony_ci : "memory" 348cabdff1aSopenharmony_ci ); 349cabdff1aSopenharmony_ci } 350cabdff1aSopenharmony_ci snow_horizontal_compose_lift_lead_out(i, temp, src, b, width, w_r, 1, -W_AM, W_AO+1, W_AS); 351cabdff1aSopenharmony_ci } 352cabdff1aSopenharmony_ci 353cabdff1aSopenharmony_ci { 354cabdff1aSopenharmony_ci snow_interleave_line_header(&i, width, b, temp); 355cabdff1aSopenharmony_ci 356cabdff1aSopenharmony_ci for (; (i & 0x1E) != 0x1E; i-=2){ 357cabdff1aSopenharmony_ci b[i+1] = temp[i>>1]; 358cabdff1aSopenharmony_ci b[i] = b[i>>1]; 359cabdff1aSopenharmony_ci } 360cabdff1aSopenharmony_ci for (i-=30; i>=0; i-=32){ 361cabdff1aSopenharmony_ci __asm__ volatile( 362cabdff1aSopenharmony_ci "movq (%1), %%mm0 \n\t" 363cabdff1aSopenharmony_ci "movq 8(%1), %%mm2 \n\t" 364cabdff1aSopenharmony_ci "movq 16(%1), %%mm4 \n\t" 365cabdff1aSopenharmony_ci "movq 24(%1), %%mm6 \n\t" 366cabdff1aSopenharmony_ci "movq (%1), %%mm1 \n\t" 367cabdff1aSopenharmony_ci "movq 8(%1), %%mm3 \n\t" 368cabdff1aSopenharmony_ci "movq 16(%1), %%mm5 \n\t" 369cabdff1aSopenharmony_ci "movq 24(%1), %%mm7 \n\t" 370cabdff1aSopenharmony_ci "punpcklwd (%2), %%mm0 \n\t" 371cabdff1aSopenharmony_ci "punpcklwd 8(%2), %%mm2 \n\t" 372cabdff1aSopenharmony_ci "punpcklwd 16(%2), %%mm4 \n\t" 373cabdff1aSopenharmony_ci "punpcklwd 24(%2), %%mm6 \n\t" 374cabdff1aSopenharmony_ci "movq %%mm0, (%0) \n\t" 375cabdff1aSopenharmony_ci "movq %%mm2, 16(%0) \n\t" 376cabdff1aSopenharmony_ci "movq %%mm4, 32(%0) \n\t" 377cabdff1aSopenharmony_ci "movq %%mm6, 48(%0) \n\t" 378cabdff1aSopenharmony_ci "punpckhwd (%2), %%mm1 \n\t" 379cabdff1aSopenharmony_ci "punpckhwd 8(%2), %%mm3 \n\t" 380cabdff1aSopenharmony_ci "punpckhwd 16(%2), %%mm5 \n\t" 381cabdff1aSopenharmony_ci "punpckhwd 24(%2), %%mm7 \n\t" 382cabdff1aSopenharmony_ci "movq %%mm1, 8(%0) \n\t" 383cabdff1aSopenharmony_ci "movq %%mm3, 24(%0) \n\t" 384cabdff1aSopenharmony_ci "movq %%mm5, 40(%0) \n\t" 385cabdff1aSopenharmony_ci "movq %%mm7, 56(%0) \n\t" 386cabdff1aSopenharmony_ci :: "r"(&b[i]), "r"(&b[i>>1]), "r"(&temp[i>>1]) 387cabdff1aSopenharmony_ci : "memory" 388cabdff1aSopenharmony_ci ); 389cabdff1aSopenharmony_ci } 390cabdff1aSopenharmony_ci } 391cabdff1aSopenharmony_ci} 392cabdff1aSopenharmony_ci 393cabdff1aSopenharmony_ci#if HAVE_7REGS 394cabdff1aSopenharmony_ci#define snow_vertical_compose_sse2_load_add(op,r,t0,t1,t2,t3)\ 395cabdff1aSopenharmony_ci ""op" ("r",%%"FF_REG_d"), %%"t0" \n\t"\ 396cabdff1aSopenharmony_ci ""op" 16("r",%%"FF_REG_d"), %%"t1" \n\t"\ 397cabdff1aSopenharmony_ci ""op" 32("r",%%"FF_REG_d"), %%"t2" \n\t"\ 398cabdff1aSopenharmony_ci ""op" 48("r",%%"FF_REG_d"), %%"t3" \n\t" 399cabdff1aSopenharmony_ci 400cabdff1aSopenharmony_ci#define snow_vertical_compose_sse2_load(r,t0,t1,t2,t3)\ 401cabdff1aSopenharmony_ci snow_vertical_compose_sse2_load_add("movdqa",r,t0,t1,t2,t3) 402cabdff1aSopenharmony_ci 403cabdff1aSopenharmony_ci#define snow_vertical_compose_sse2_add(r,t0,t1,t2,t3)\ 404cabdff1aSopenharmony_ci snow_vertical_compose_sse2_load_add("paddw",r,t0,t1,t2,t3) 405cabdff1aSopenharmony_ci 406cabdff1aSopenharmony_ci#define snow_vertical_compose_r2r_sub(s0,s1,s2,s3,t0,t1,t2,t3)\ 407cabdff1aSopenharmony_ci "psubw %%"s0", %%"t0" \n\t"\ 408cabdff1aSopenharmony_ci "psubw %%"s1", %%"t1" \n\t"\ 409cabdff1aSopenharmony_ci "psubw %%"s2", %%"t2" \n\t"\ 410cabdff1aSopenharmony_ci "psubw %%"s3", %%"t3" \n\t" 411cabdff1aSopenharmony_ci 412cabdff1aSopenharmony_ci#define snow_vertical_compose_sse2_store(w,s0,s1,s2,s3)\ 413cabdff1aSopenharmony_ci "movdqa %%"s0", ("w",%%"FF_REG_d") \n\t"\ 414cabdff1aSopenharmony_ci "movdqa %%"s1", 16("w",%%"FF_REG_d") \n\t"\ 415cabdff1aSopenharmony_ci "movdqa %%"s2", 32("w",%%"FF_REG_d") \n\t"\ 416cabdff1aSopenharmony_ci "movdqa %%"s3", 48("w",%%"FF_REG_d") \n\t" 417cabdff1aSopenharmony_ci 418cabdff1aSopenharmony_ci#define snow_vertical_compose_sra(n,t0,t1,t2,t3)\ 419cabdff1aSopenharmony_ci "psraw $"n", %%"t0" \n\t"\ 420cabdff1aSopenharmony_ci "psraw $"n", %%"t1" \n\t"\ 421cabdff1aSopenharmony_ci "psraw $"n", %%"t2" \n\t"\ 422cabdff1aSopenharmony_ci "psraw $"n", %%"t3" \n\t" 423cabdff1aSopenharmony_ci 424cabdff1aSopenharmony_ci#define snow_vertical_compose_r2r_add(s0,s1,s2,s3,t0,t1,t2,t3)\ 425cabdff1aSopenharmony_ci "paddw %%"s0", %%"t0" \n\t"\ 426cabdff1aSopenharmony_ci "paddw %%"s1", %%"t1" \n\t"\ 427cabdff1aSopenharmony_ci "paddw %%"s2", %%"t2" \n\t"\ 428cabdff1aSopenharmony_ci "paddw %%"s3", %%"t3" \n\t" 429cabdff1aSopenharmony_ci 430cabdff1aSopenharmony_ci#define snow_vertical_compose_r2r_pmulhw(s0,s1,s2,s3,t0,t1,t2,t3)\ 431cabdff1aSopenharmony_ci "pmulhw %%"s0", %%"t0" \n\t"\ 432cabdff1aSopenharmony_ci "pmulhw %%"s1", %%"t1" \n\t"\ 433cabdff1aSopenharmony_ci "pmulhw %%"s2", %%"t2" \n\t"\ 434cabdff1aSopenharmony_ci "pmulhw %%"s3", %%"t3" \n\t" 435cabdff1aSopenharmony_ci 436cabdff1aSopenharmony_ci#define snow_vertical_compose_sse2_move(s0,s1,s2,s3,t0,t1,t2,t3)\ 437cabdff1aSopenharmony_ci "movdqa %%"s0", %%"t0" \n\t"\ 438cabdff1aSopenharmony_ci "movdqa %%"s1", %%"t1" \n\t"\ 439cabdff1aSopenharmony_ci "movdqa %%"s2", %%"t2" \n\t"\ 440cabdff1aSopenharmony_ci "movdqa %%"s3", %%"t3" \n\t" 441cabdff1aSopenharmony_ci 442cabdff1aSopenharmony_cistatic void ff_snow_vertical_compose97i_sse2(IDWTELEM *b0, IDWTELEM *b1, IDWTELEM *b2, IDWTELEM *b3, IDWTELEM *b4, IDWTELEM *b5, int width){ 443cabdff1aSopenharmony_ci x86_reg i = width; 444cabdff1aSopenharmony_ci 445cabdff1aSopenharmony_ci while(i & 0x1F) 446cabdff1aSopenharmony_ci { 447cabdff1aSopenharmony_ci i--; 448cabdff1aSopenharmony_ci b4[i] -= (W_DM*(b3[i] + b5[i])+W_DO)>>W_DS; 449cabdff1aSopenharmony_ci b3[i] -= (W_CM*(b2[i] + b4[i])+W_CO)>>W_CS; 450cabdff1aSopenharmony_ci b2[i] += (W_BM*(b1[i] + b3[i])+4*b2[i]+W_BO)>>W_BS; 451cabdff1aSopenharmony_ci b1[i] += (W_AM*(b0[i] + b2[i])+W_AO)>>W_AS; 452cabdff1aSopenharmony_ci } 453cabdff1aSopenharmony_ci i+=i; 454cabdff1aSopenharmony_ci 455cabdff1aSopenharmony_ci __asm__ volatile ( 456cabdff1aSopenharmony_ci "jmp 2f \n\t" 457cabdff1aSopenharmony_ci "1: \n\t" 458cabdff1aSopenharmony_ci snow_vertical_compose_sse2_load("%4","xmm0","xmm2","xmm4","xmm6") 459cabdff1aSopenharmony_ci snow_vertical_compose_sse2_add("%6","xmm0","xmm2","xmm4","xmm6") 460cabdff1aSopenharmony_ci 461cabdff1aSopenharmony_ci 462cabdff1aSopenharmony_ci "pcmpeqw %%xmm0, %%xmm0 \n\t" 463cabdff1aSopenharmony_ci "pcmpeqw %%xmm2, %%xmm2 \n\t" 464cabdff1aSopenharmony_ci "paddw %%xmm2, %%xmm2 \n\t" 465cabdff1aSopenharmony_ci "paddw %%xmm0, %%xmm2 \n\t" 466cabdff1aSopenharmony_ci "psllw $13, %%xmm2 \n\t" 467cabdff1aSopenharmony_ci snow_vertical_compose_r2r_add("xmm0","xmm0","xmm0","xmm0","xmm1","xmm3","xmm5","xmm7") 468cabdff1aSopenharmony_ci snow_vertical_compose_r2r_pmulhw("xmm2","xmm2","xmm2","xmm2","xmm1","xmm3","xmm5","xmm7") 469cabdff1aSopenharmony_ci snow_vertical_compose_sse2_add("%5","xmm1","xmm3","xmm5","xmm7") 470cabdff1aSopenharmony_ci snow_vertical_compose_sse2_store("%5","xmm1","xmm3","xmm5","xmm7") 471cabdff1aSopenharmony_ci snow_vertical_compose_sse2_load("%4","xmm0","xmm2","xmm4","xmm6") 472cabdff1aSopenharmony_ci snow_vertical_compose_sse2_add("%3","xmm1","xmm3","xmm5","xmm7") 473cabdff1aSopenharmony_ci snow_vertical_compose_r2r_sub("xmm1","xmm3","xmm5","xmm7","xmm0","xmm2","xmm4","xmm6") 474cabdff1aSopenharmony_ci snow_vertical_compose_sse2_store("%4","xmm0","xmm2","xmm4","xmm6") 475cabdff1aSopenharmony_ci 476cabdff1aSopenharmony_ci "pcmpeqw %%xmm7, %%xmm7 \n\t" 477cabdff1aSopenharmony_ci "pcmpeqw %%xmm5, %%xmm5 \n\t" 478cabdff1aSopenharmony_ci "psllw $15, %%xmm7 \n\t" 479cabdff1aSopenharmony_ci "psrlw $13, %%xmm5 \n\t" 480cabdff1aSopenharmony_ci "paddw %%xmm7, %%xmm5 \n\t" 481cabdff1aSopenharmony_ci snow_vertical_compose_r2r_add("xmm5","xmm5","xmm5","xmm5","xmm0","xmm2","xmm4","xmm6") 482cabdff1aSopenharmony_ci "movq (%2,%%"FF_REG_d"), %%xmm1 \n\t" 483cabdff1aSopenharmony_ci "movq 8(%2,%%"FF_REG_d"), %%xmm3 \n\t" 484cabdff1aSopenharmony_ci "paddw %%xmm7, %%xmm1 \n\t" 485cabdff1aSopenharmony_ci "paddw %%xmm7, %%xmm3 \n\t" 486cabdff1aSopenharmony_ci "pavgw %%xmm1, %%xmm0 \n\t" 487cabdff1aSopenharmony_ci "pavgw %%xmm3, %%xmm2 \n\t" 488cabdff1aSopenharmony_ci "movq 16(%2,%%"FF_REG_d"), %%xmm1 \n\t" 489cabdff1aSopenharmony_ci "movq 24(%2,%%"FF_REG_d"), %%xmm3 \n\t" 490cabdff1aSopenharmony_ci "paddw %%xmm7, %%xmm1 \n\t" 491cabdff1aSopenharmony_ci "paddw %%xmm7, %%xmm3 \n\t" 492cabdff1aSopenharmony_ci "pavgw %%xmm1, %%xmm4 \n\t" 493cabdff1aSopenharmony_ci "pavgw %%xmm3, %%xmm6 \n\t" 494cabdff1aSopenharmony_ci snow_vertical_compose_r2r_sub("xmm7","xmm7","xmm7","xmm7","xmm0","xmm2","xmm4","xmm6") 495cabdff1aSopenharmony_ci snow_vertical_compose_sra("1","xmm0","xmm2","xmm4","xmm6") 496cabdff1aSopenharmony_ci snow_vertical_compose_sse2_add("%3","xmm0","xmm2","xmm4","xmm6") 497cabdff1aSopenharmony_ci 498cabdff1aSopenharmony_ci snow_vertical_compose_sra("2","xmm0","xmm2","xmm4","xmm6") 499cabdff1aSopenharmony_ci snow_vertical_compose_sse2_add("%3","xmm0","xmm2","xmm4","xmm6") 500cabdff1aSopenharmony_ci snow_vertical_compose_sse2_store("%3","xmm0","xmm2","xmm4","xmm6") 501cabdff1aSopenharmony_ci snow_vertical_compose_sse2_add("%1","xmm0","xmm2","xmm4","xmm6") 502cabdff1aSopenharmony_ci snow_vertical_compose_sse2_move("xmm0","xmm2","xmm4","xmm6","xmm1","xmm3","xmm5","xmm7") 503cabdff1aSopenharmony_ci snow_vertical_compose_sra("1","xmm0","xmm2","xmm4","xmm6") 504cabdff1aSopenharmony_ci snow_vertical_compose_r2r_add("xmm1","xmm3","xmm5","xmm7","xmm0","xmm2","xmm4","xmm6") 505cabdff1aSopenharmony_ci snow_vertical_compose_sse2_add("%2","xmm0","xmm2","xmm4","xmm6") 506cabdff1aSopenharmony_ci snow_vertical_compose_sse2_store("%2","xmm0","xmm2","xmm4","xmm6") 507cabdff1aSopenharmony_ci 508cabdff1aSopenharmony_ci "2: \n\t" 509cabdff1aSopenharmony_ci "sub $64, %%"FF_REG_d" \n\t" 510cabdff1aSopenharmony_ci "jge 1b \n\t" 511cabdff1aSopenharmony_ci :"+d"(i) 512cabdff1aSopenharmony_ci :"r"(b0),"r"(b1),"r"(b2),"r"(b3),"r"(b4),"r"(b5)); 513cabdff1aSopenharmony_ci} 514cabdff1aSopenharmony_ci 515cabdff1aSopenharmony_ci#define snow_vertical_compose_mmx_load_add(op,r,t0,t1,t2,t3)\ 516cabdff1aSopenharmony_ci ""op" ("r",%%"FF_REG_d"), %%"t0" \n\t"\ 517cabdff1aSopenharmony_ci ""op" 8("r",%%"FF_REG_d"), %%"t1" \n\t"\ 518cabdff1aSopenharmony_ci ""op" 16("r",%%"FF_REG_d"), %%"t2" \n\t"\ 519cabdff1aSopenharmony_ci ""op" 24("r",%%"FF_REG_d"), %%"t3" \n\t" 520cabdff1aSopenharmony_ci 521cabdff1aSopenharmony_ci#define snow_vertical_compose_mmx_load(r,t0,t1,t2,t3)\ 522cabdff1aSopenharmony_ci snow_vertical_compose_mmx_load_add("movq",r,t0,t1,t2,t3) 523cabdff1aSopenharmony_ci 524cabdff1aSopenharmony_ci#define snow_vertical_compose_mmx_add(r,t0,t1,t2,t3)\ 525cabdff1aSopenharmony_ci snow_vertical_compose_mmx_load_add("paddw",r,t0,t1,t2,t3) 526cabdff1aSopenharmony_ci 527cabdff1aSopenharmony_ci#define snow_vertical_compose_mmx_store(w,s0,s1,s2,s3)\ 528cabdff1aSopenharmony_ci "movq %%"s0", ("w",%%"FF_REG_d") \n\t"\ 529cabdff1aSopenharmony_ci "movq %%"s1", 8("w",%%"FF_REG_d") \n\t"\ 530cabdff1aSopenharmony_ci "movq %%"s2", 16("w",%%"FF_REG_d") \n\t"\ 531cabdff1aSopenharmony_ci "movq %%"s3", 24("w",%%"FF_REG_d") \n\t" 532cabdff1aSopenharmony_ci 533cabdff1aSopenharmony_ci#define snow_vertical_compose_mmx_move(s0,s1,s2,s3,t0,t1,t2,t3)\ 534cabdff1aSopenharmony_ci "movq %%"s0", %%"t0" \n\t"\ 535cabdff1aSopenharmony_ci "movq %%"s1", %%"t1" \n\t"\ 536cabdff1aSopenharmony_ci "movq %%"s2", %%"t2" \n\t"\ 537cabdff1aSopenharmony_ci "movq %%"s3", %%"t3" \n\t" 538cabdff1aSopenharmony_ci 539cabdff1aSopenharmony_ci 540cabdff1aSopenharmony_cistatic void ff_snow_vertical_compose97i_mmx(IDWTELEM *b0, IDWTELEM *b1, IDWTELEM *b2, IDWTELEM *b3, IDWTELEM *b4, IDWTELEM *b5, int width){ 541cabdff1aSopenharmony_ci x86_reg i = width; 542cabdff1aSopenharmony_ci while(i & 15) 543cabdff1aSopenharmony_ci { 544cabdff1aSopenharmony_ci i--; 545cabdff1aSopenharmony_ci b4[i] -= (W_DM*(b3[i] + b5[i])+W_DO)>>W_DS; 546cabdff1aSopenharmony_ci b3[i] -= (W_CM*(b2[i] + b4[i])+W_CO)>>W_CS; 547cabdff1aSopenharmony_ci b2[i] += (W_BM*(b1[i] + b3[i])+4*b2[i]+W_BO)>>W_BS; 548cabdff1aSopenharmony_ci b1[i] += (W_AM*(b0[i] + b2[i])+W_AO)>>W_AS; 549cabdff1aSopenharmony_ci } 550cabdff1aSopenharmony_ci i+=i; 551cabdff1aSopenharmony_ci __asm__ volatile( 552cabdff1aSopenharmony_ci "jmp 2f \n\t" 553cabdff1aSopenharmony_ci "1: \n\t" 554cabdff1aSopenharmony_ci 555cabdff1aSopenharmony_ci snow_vertical_compose_mmx_load("%4","mm1","mm3","mm5","mm7") 556cabdff1aSopenharmony_ci snow_vertical_compose_mmx_add("%6","mm1","mm3","mm5","mm7") 557cabdff1aSopenharmony_ci "pcmpeqw %%mm0, %%mm0 \n\t" 558cabdff1aSopenharmony_ci "pcmpeqw %%mm2, %%mm2 \n\t" 559cabdff1aSopenharmony_ci "paddw %%mm2, %%mm2 \n\t" 560cabdff1aSopenharmony_ci "paddw %%mm0, %%mm2 \n\t" 561cabdff1aSopenharmony_ci "psllw $13, %%mm2 \n\t" 562cabdff1aSopenharmony_ci snow_vertical_compose_r2r_add("mm0","mm0","mm0","mm0","mm1","mm3","mm5","mm7") 563cabdff1aSopenharmony_ci snow_vertical_compose_r2r_pmulhw("mm2","mm2","mm2","mm2","mm1","mm3","mm5","mm7") 564cabdff1aSopenharmony_ci snow_vertical_compose_mmx_add("%5","mm1","mm3","mm5","mm7") 565cabdff1aSopenharmony_ci snow_vertical_compose_mmx_store("%5","mm1","mm3","mm5","mm7") 566cabdff1aSopenharmony_ci snow_vertical_compose_mmx_load("%4","mm0","mm2","mm4","mm6") 567cabdff1aSopenharmony_ci snow_vertical_compose_mmx_add("%3","mm1","mm3","mm5","mm7") 568cabdff1aSopenharmony_ci snow_vertical_compose_r2r_sub("mm1","mm3","mm5","mm7","mm0","mm2","mm4","mm6") 569cabdff1aSopenharmony_ci snow_vertical_compose_mmx_store("%4","mm0","mm2","mm4","mm6") 570cabdff1aSopenharmony_ci "pcmpeqw %%mm7, %%mm7 \n\t" 571cabdff1aSopenharmony_ci "pcmpeqw %%mm5, %%mm5 \n\t" 572cabdff1aSopenharmony_ci "psllw $15, %%mm7 \n\t" 573cabdff1aSopenharmony_ci "psrlw $13, %%mm5 \n\t" 574cabdff1aSopenharmony_ci "paddw %%mm7, %%mm5 \n\t" 575cabdff1aSopenharmony_ci snow_vertical_compose_r2r_add("mm5","mm5","mm5","mm5","mm0","mm2","mm4","mm6") 576cabdff1aSopenharmony_ci "movq (%2,%%"FF_REG_d"), %%mm1 \n\t" 577cabdff1aSopenharmony_ci "movq 8(%2,%%"FF_REG_d"), %%mm3 \n\t" 578cabdff1aSopenharmony_ci "paddw %%mm7, %%mm1 \n\t" 579cabdff1aSopenharmony_ci "paddw %%mm7, %%mm3 \n\t" 580cabdff1aSopenharmony_ci "pavgw %%mm1, %%mm0 \n\t" 581cabdff1aSopenharmony_ci "pavgw %%mm3, %%mm2 \n\t" 582cabdff1aSopenharmony_ci "movq 16(%2,%%"FF_REG_d"), %%mm1 \n\t" 583cabdff1aSopenharmony_ci "movq 24(%2,%%"FF_REG_d"), %%mm3 \n\t" 584cabdff1aSopenharmony_ci "paddw %%mm7, %%mm1 \n\t" 585cabdff1aSopenharmony_ci "paddw %%mm7, %%mm3 \n\t" 586cabdff1aSopenharmony_ci "pavgw %%mm1, %%mm4 \n\t" 587cabdff1aSopenharmony_ci "pavgw %%mm3, %%mm6 \n\t" 588cabdff1aSopenharmony_ci snow_vertical_compose_r2r_sub("mm7","mm7","mm7","mm7","mm0","mm2","mm4","mm6") 589cabdff1aSopenharmony_ci snow_vertical_compose_sra("1","mm0","mm2","mm4","mm6") 590cabdff1aSopenharmony_ci snow_vertical_compose_mmx_add("%3","mm0","mm2","mm4","mm6") 591cabdff1aSopenharmony_ci 592cabdff1aSopenharmony_ci snow_vertical_compose_sra("2","mm0","mm2","mm4","mm6") 593cabdff1aSopenharmony_ci snow_vertical_compose_mmx_add("%3","mm0","mm2","mm4","mm6") 594cabdff1aSopenharmony_ci snow_vertical_compose_mmx_store("%3","mm0","mm2","mm4","mm6") 595cabdff1aSopenharmony_ci snow_vertical_compose_mmx_add("%1","mm0","mm2","mm4","mm6") 596cabdff1aSopenharmony_ci snow_vertical_compose_mmx_move("mm0","mm2","mm4","mm6","mm1","mm3","mm5","mm7") 597cabdff1aSopenharmony_ci snow_vertical_compose_sra("1","mm0","mm2","mm4","mm6") 598cabdff1aSopenharmony_ci snow_vertical_compose_r2r_add("mm1","mm3","mm5","mm7","mm0","mm2","mm4","mm6") 599cabdff1aSopenharmony_ci snow_vertical_compose_mmx_add("%2","mm0","mm2","mm4","mm6") 600cabdff1aSopenharmony_ci snow_vertical_compose_mmx_store("%2","mm0","mm2","mm4","mm6") 601cabdff1aSopenharmony_ci 602cabdff1aSopenharmony_ci "2: \n\t" 603cabdff1aSopenharmony_ci "sub $32, %%"FF_REG_d" \n\t" 604cabdff1aSopenharmony_ci "jge 1b \n\t" 605cabdff1aSopenharmony_ci :"+d"(i) 606cabdff1aSopenharmony_ci :"r"(b0),"r"(b1),"r"(b2),"r"(b3),"r"(b4),"r"(b5)); 607cabdff1aSopenharmony_ci} 608cabdff1aSopenharmony_ci#endif //HAVE_7REGS 609cabdff1aSopenharmony_ci 610cabdff1aSopenharmony_ci#if HAVE_6REGS 611cabdff1aSopenharmony_ci#define snow_inner_add_yblock_sse2_header \ 612cabdff1aSopenharmony_ci IDWTELEM * * dst_array = sb->line + src_y;\ 613cabdff1aSopenharmony_ci x86_reg tmp;\ 614cabdff1aSopenharmony_ci __asm__ volatile(\ 615cabdff1aSopenharmony_ci "mov %7, %%"FF_REG_c" \n\t"\ 616cabdff1aSopenharmony_ci "mov %6, %2 \n\t"\ 617cabdff1aSopenharmony_ci "mov %4, %%"FF_REG_S" \n\t"\ 618cabdff1aSopenharmony_ci "pxor %%xmm7, %%xmm7 \n\t" /* 0 */\ 619cabdff1aSopenharmony_ci "pcmpeqd %%xmm3, %%xmm3 \n\t"\ 620cabdff1aSopenharmony_ci "psllw $15, %%xmm3 \n\t"\ 621cabdff1aSopenharmony_ci "psrlw $12, %%xmm3 \n\t" /* FRAC_BITS >> 1 */\ 622cabdff1aSopenharmony_ci "1: \n\t"\ 623cabdff1aSopenharmony_ci "mov %1, %%"FF_REG_D" \n\t"\ 624cabdff1aSopenharmony_ci "mov (%%"FF_REG_D"), %%"FF_REG_D" \n\t"\ 625cabdff1aSopenharmony_ci "add %3, %%"FF_REG_D" \n\t" 626cabdff1aSopenharmony_ci 627cabdff1aSopenharmony_ci#define snow_inner_add_yblock_sse2_start_8(out_reg1, out_reg2, ptr_offset, s_offset)\ 628cabdff1aSopenharmony_ci "mov "FF_PTR_SIZE"*"ptr_offset"(%%"FF_REG_a"), %%"FF_REG_d"; \n\t"\ 629cabdff1aSopenharmony_ci "movq (%%"FF_REG_d"), %%"out_reg1" \n\t"\ 630cabdff1aSopenharmony_ci "movq (%%"FF_REG_d", %%"FF_REG_c"), %%"out_reg2" \n\t"\ 631cabdff1aSopenharmony_ci "punpcklbw %%xmm7, %%"out_reg1" \n\t"\ 632cabdff1aSopenharmony_ci "punpcklbw %%xmm7, %%"out_reg2" \n\t"\ 633cabdff1aSopenharmony_ci "movq "s_offset"(%%"FF_REG_S"), %%xmm0 \n\t"\ 634cabdff1aSopenharmony_ci "movq "s_offset"+16(%%"FF_REG_S"), %%xmm4 \n\t"\ 635cabdff1aSopenharmony_ci "punpcklbw %%xmm7, %%xmm0 \n\t"\ 636cabdff1aSopenharmony_ci "punpcklbw %%xmm7, %%xmm4 \n\t"\ 637cabdff1aSopenharmony_ci "pmullw %%xmm0, %%"out_reg1" \n\t"\ 638cabdff1aSopenharmony_ci "pmullw %%xmm4, %%"out_reg2" \n\t" 639cabdff1aSopenharmony_ci 640cabdff1aSopenharmony_ci#define snow_inner_add_yblock_sse2_start_16(out_reg1, out_reg2, ptr_offset, s_offset)\ 641cabdff1aSopenharmony_ci "mov "FF_PTR_SIZE"*"ptr_offset"(%%"FF_REG_a"), %%"FF_REG_d"; \n\t"\ 642cabdff1aSopenharmony_ci "movq (%%"FF_REG_d"), %%"out_reg1" \n\t"\ 643cabdff1aSopenharmony_ci "movq 8(%%"FF_REG_d"), %%"out_reg2" \n\t"\ 644cabdff1aSopenharmony_ci "punpcklbw %%xmm7, %%"out_reg1" \n\t"\ 645cabdff1aSopenharmony_ci "punpcklbw %%xmm7, %%"out_reg2" \n\t"\ 646cabdff1aSopenharmony_ci "movq "s_offset"(%%"FF_REG_S"), %%xmm0 \n\t"\ 647cabdff1aSopenharmony_ci "movq "s_offset"+8(%%"FF_REG_S"), %%xmm4 \n\t"\ 648cabdff1aSopenharmony_ci "punpcklbw %%xmm7, %%xmm0 \n\t"\ 649cabdff1aSopenharmony_ci "punpcklbw %%xmm7, %%xmm4 \n\t"\ 650cabdff1aSopenharmony_ci "pmullw %%xmm0, %%"out_reg1" \n\t"\ 651cabdff1aSopenharmony_ci "pmullw %%xmm4, %%"out_reg2" \n\t" 652cabdff1aSopenharmony_ci 653cabdff1aSopenharmony_ci#define snow_inner_add_yblock_sse2_accum_8(ptr_offset, s_offset) \ 654cabdff1aSopenharmony_ci snow_inner_add_yblock_sse2_start_8("xmm2", "xmm6", ptr_offset, s_offset)\ 655cabdff1aSopenharmony_ci "paddusw %%xmm2, %%xmm1 \n\t"\ 656cabdff1aSopenharmony_ci "paddusw %%xmm6, %%xmm5 \n\t" 657cabdff1aSopenharmony_ci 658cabdff1aSopenharmony_ci#define snow_inner_add_yblock_sse2_accum_16(ptr_offset, s_offset) \ 659cabdff1aSopenharmony_ci snow_inner_add_yblock_sse2_start_16("xmm2", "xmm6", ptr_offset, s_offset)\ 660cabdff1aSopenharmony_ci "paddusw %%xmm2, %%xmm1 \n\t"\ 661cabdff1aSopenharmony_ci "paddusw %%xmm6, %%xmm5 \n\t" 662cabdff1aSopenharmony_ci 663cabdff1aSopenharmony_ci#define snow_inner_add_yblock_sse2_end_common1\ 664cabdff1aSopenharmony_ci "add $32, %%"FF_REG_S" \n\t"\ 665cabdff1aSopenharmony_ci "add %%"FF_REG_c", %0 \n\t"\ 666cabdff1aSopenharmony_ci "add %%"FF_REG_c", "FF_PTR_SIZE"*3(%%"FF_REG_a"); \n\t"\ 667cabdff1aSopenharmony_ci "add %%"FF_REG_c", "FF_PTR_SIZE"*2(%%"FF_REG_a"); \n\t"\ 668cabdff1aSopenharmony_ci "add %%"FF_REG_c", "FF_PTR_SIZE"*1(%%"FF_REG_a"); \n\t"\ 669cabdff1aSopenharmony_ci "add %%"FF_REG_c", (%%"FF_REG_a") \n\t" 670cabdff1aSopenharmony_ci 671cabdff1aSopenharmony_ci#define snow_inner_add_yblock_sse2_end_common2\ 672cabdff1aSopenharmony_ci "jnz 1b \n\t"\ 673cabdff1aSopenharmony_ci :"+m"(dst8),"+m"(dst_array),"=&r"(tmp)\ 674cabdff1aSopenharmony_ci :\ 675cabdff1aSopenharmony_ci "rm"((x86_reg)(src_x<<1)),"m"(obmc),"a"(block),"m"(b_h),"m"(src_stride):\ 676cabdff1aSopenharmony_ci XMM_CLOBBERS("%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", )\ 677cabdff1aSopenharmony_ci "%"FF_REG_c"","%"FF_REG_S"","%"FF_REG_D"","%"FF_REG_d""); 678cabdff1aSopenharmony_ci 679cabdff1aSopenharmony_ci#define snow_inner_add_yblock_sse2_end_8\ 680cabdff1aSopenharmony_ci "sal $1, %%"FF_REG_c" \n\t"\ 681cabdff1aSopenharmony_ci "add"FF_OPSIZE" $"FF_PTR_SIZE"*2, %1 \n\t"\ 682cabdff1aSopenharmony_ci snow_inner_add_yblock_sse2_end_common1\ 683cabdff1aSopenharmony_ci "sar $1, %%"FF_REG_c" \n\t"\ 684cabdff1aSopenharmony_ci "sub $2, %2 \n\t"\ 685cabdff1aSopenharmony_ci snow_inner_add_yblock_sse2_end_common2 686cabdff1aSopenharmony_ci 687cabdff1aSopenharmony_ci#define snow_inner_add_yblock_sse2_end_16\ 688cabdff1aSopenharmony_ci "add"FF_OPSIZE" $"FF_PTR_SIZE"*1, %1 \n\t"\ 689cabdff1aSopenharmony_ci snow_inner_add_yblock_sse2_end_common1\ 690cabdff1aSopenharmony_ci "dec %2 \n\t"\ 691cabdff1aSopenharmony_ci snow_inner_add_yblock_sse2_end_common2 692cabdff1aSopenharmony_ci 693cabdff1aSopenharmony_cistatic void inner_add_yblock_bw_8_obmc_16_bh_even_sse2(const uint8_t *obmc, const x86_reg obmc_stride, uint8_t * * block, int b_w, x86_reg b_h, 694cabdff1aSopenharmony_ci int src_x, int src_y, x86_reg src_stride, slice_buffer * sb, int add, uint8_t * dst8){ 695cabdff1aSopenharmony_cisnow_inner_add_yblock_sse2_header 696cabdff1aSopenharmony_cisnow_inner_add_yblock_sse2_start_8("xmm1", "xmm5", "3", "0") 697cabdff1aSopenharmony_cisnow_inner_add_yblock_sse2_accum_8("2", "8") 698cabdff1aSopenharmony_cisnow_inner_add_yblock_sse2_accum_8("1", "128") 699cabdff1aSopenharmony_cisnow_inner_add_yblock_sse2_accum_8("0", "136") 700cabdff1aSopenharmony_ci 701cabdff1aSopenharmony_ci "mov %0, %%"FF_REG_d" \n\t" 702cabdff1aSopenharmony_ci "movdqa (%%"FF_REG_D"), %%xmm0 \n\t" 703cabdff1aSopenharmony_ci "movdqa %%xmm1, %%xmm2 \n\t" 704cabdff1aSopenharmony_ci 705cabdff1aSopenharmony_ci "punpckhwd %%xmm7, %%xmm1 \n\t" 706cabdff1aSopenharmony_ci "punpcklwd %%xmm7, %%xmm2 \n\t" 707cabdff1aSopenharmony_ci "paddd %%xmm2, %%xmm0 \n\t" 708cabdff1aSopenharmony_ci "movdqa 16(%%"FF_REG_D"), %%xmm2\n\t" 709cabdff1aSopenharmony_ci "paddd %%xmm1, %%xmm2 \n\t" 710cabdff1aSopenharmony_ci "paddd %%xmm3, %%xmm0 \n\t" 711cabdff1aSopenharmony_ci "paddd %%xmm3, %%xmm2 \n\t" 712cabdff1aSopenharmony_ci 713cabdff1aSopenharmony_ci "mov %1, %%"FF_REG_D" \n\t" 714cabdff1aSopenharmony_ci "mov "FF_PTR_SIZE"(%%"FF_REG_D"), %%"FF_REG_D"; \n\t" 715cabdff1aSopenharmony_ci "add %3, %%"FF_REG_D" \n\t" 716cabdff1aSopenharmony_ci 717cabdff1aSopenharmony_ci "movdqa (%%"FF_REG_D"), %%xmm4 \n\t" 718cabdff1aSopenharmony_ci "movdqa %%xmm5, %%xmm6 \n\t" 719cabdff1aSopenharmony_ci "punpckhwd %%xmm7, %%xmm5 \n\t" 720cabdff1aSopenharmony_ci "punpcklwd %%xmm7, %%xmm6 \n\t" 721cabdff1aSopenharmony_ci "paddd %%xmm6, %%xmm4 \n\t" 722cabdff1aSopenharmony_ci "movdqa 16(%%"FF_REG_D"), %%xmm6\n\t" 723cabdff1aSopenharmony_ci "paddd %%xmm5, %%xmm6 \n\t" 724cabdff1aSopenharmony_ci "paddd %%xmm3, %%xmm4 \n\t" 725cabdff1aSopenharmony_ci "paddd %%xmm3, %%xmm6 \n\t" 726cabdff1aSopenharmony_ci 727cabdff1aSopenharmony_ci "psrad $8, %%xmm0 \n\t" /* FRAC_BITS. */ 728cabdff1aSopenharmony_ci "psrad $8, %%xmm2 \n\t" /* FRAC_BITS. */ 729cabdff1aSopenharmony_ci "packssdw %%xmm2, %%xmm0 \n\t" 730cabdff1aSopenharmony_ci "packuswb %%xmm7, %%xmm0 \n\t" 731cabdff1aSopenharmony_ci "movq %%xmm0, (%%"FF_REG_d") \n\t" 732cabdff1aSopenharmony_ci 733cabdff1aSopenharmony_ci "psrad $8, %%xmm4 \n\t" /* FRAC_BITS. */ 734cabdff1aSopenharmony_ci "psrad $8, %%xmm6 \n\t" /* FRAC_BITS. */ 735cabdff1aSopenharmony_ci "packssdw %%xmm6, %%xmm4 \n\t" 736cabdff1aSopenharmony_ci "packuswb %%xmm7, %%xmm4 \n\t" 737cabdff1aSopenharmony_ci "movq %%xmm4, (%%"FF_REG_d",%%"FF_REG_c"); \n\t" 738cabdff1aSopenharmony_cisnow_inner_add_yblock_sse2_end_8 739cabdff1aSopenharmony_ci} 740cabdff1aSopenharmony_ci 741cabdff1aSopenharmony_cistatic void inner_add_yblock_bw_16_obmc_32_sse2(const uint8_t *obmc, const x86_reg obmc_stride, uint8_t * * block, int b_w, x86_reg b_h, 742cabdff1aSopenharmony_ci int src_x, int src_y, x86_reg src_stride, slice_buffer * sb, int add, uint8_t * dst8){ 743cabdff1aSopenharmony_cisnow_inner_add_yblock_sse2_header 744cabdff1aSopenharmony_cisnow_inner_add_yblock_sse2_start_16("xmm1", "xmm5", "3", "0") 745cabdff1aSopenharmony_cisnow_inner_add_yblock_sse2_accum_16("2", "16") 746cabdff1aSopenharmony_cisnow_inner_add_yblock_sse2_accum_16("1", "512") 747cabdff1aSopenharmony_cisnow_inner_add_yblock_sse2_accum_16("0", "528") 748cabdff1aSopenharmony_ci 749cabdff1aSopenharmony_ci "mov %0, %%"FF_REG_d" \n\t" 750cabdff1aSopenharmony_ci "psrlw $4, %%xmm1 \n\t" 751cabdff1aSopenharmony_ci "psrlw $4, %%xmm5 \n\t" 752cabdff1aSopenharmony_ci "paddw (%%"FF_REG_D"), %%xmm1 \n\t" 753cabdff1aSopenharmony_ci "paddw 16(%%"FF_REG_D"), %%xmm5 \n\t" 754cabdff1aSopenharmony_ci "paddw %%xmm3, %%xmm1 \n\t" 755cabdff1aSopenharmony_ci "paddw %%xmm3, %%xmm5 \n\t" 756cabdff1aSopenharmony_ci "psraw $4, %%xmm1 \n\t" /* FRAC_BITS. */ 757cabdff1aSopenharmony_ci "psraw $4, %%xmm5 \n\t" /* FRAC_BITS. */ 758cabdff1aSopenharmony_ci "packuswb %%xmm5, %%xmm1 \n\t" 759cabdff1aSopenharmony_ci 760cabdff1aSopenharmony_ci "movdqu %%xmm1, (%%"FF_REG_d") \n\t" 761cabdff1aSopenharmony_ci 762cabdff1aSopenharmony_cisnow_inner_add_yblock_sse2_end_16 763cabdff1aSopenharmony_ci} 764cabdff1aSopenharmony_ci 765cabdff1aSopenharmony_ci#define snow_inner_add_yblock_mmx_header \ 766cabdff1aSopenharmony_ci IDWTELEM * * dst_array = sb->line + src_y;\ 767cabdff1aSopenharmony_ci x86_reg tmp;\ 768cabdff1aSopenharmony_ci __asm__ volatile(\ 769cabdff1aSopenharmony_ci "mov %7, %%"FF_REG_c" \n\t"\ 770cabdff1aSopenharmony_ci "mov %6, %2 \n\t"\ 771cabdff1aSopenharmony_ci "mov %4, %%"FF_REG_S" \n\t"\ 772cabdff1aSopenharmony_ci "pxor %%mm7, %%mm7 \n\t" /* 0 */\ 773cabdff1aSopenharmony_ci "pcmpeqd %%mm3, %%mm3 \n\t"\ 774cabdff1aSopenharmony_ci "psllw $15, %%mm3 \n\t"\ 775cabdff1aSopenharmony_ci "psrlw $12, %%mm3 \n\t" /* FRAC_BITS >> 1 */\ 776cabdff1aSopenharmony_ci "1: \n\t"\ 777cabdff1aSopenharmony_ci "mov %1, %%"FF_REG_D" \n\t"\ 778cabdff1aSopenharmony_ci "mov (%%"FF_REG_D"), %%"FF_REG_D" \n\t"\ 779cabdff1aSopenharmony_ci "add %3, %%"FF_REG_D" \n\t" 780cabdff1aSopenharmony_ci 781cabdff1aSopenharmony_ci#define snow_inner_add_yblock_mmx_start(out_reg1, out_reg2, ptr_offset, s_offset, d_offset)\ 782cabdff1aSopenharmony_ci "mov "FF_PTR_SIZE"*"ptr_offset"(%%"FF_REG_a"), %%"FF_REG_d"; \n\t"\ 783cabdff1aSopenharmony_ci "movd "d_offset"(%%"FF_REG_d"), %%"out_reg1" \n\t"\ 784cabdff1aSopenharmony_ci "movd "d_offset"+4(%%"FF_REG_d"), %%"out_reg2" \n\t"\ 785cabdff1aSopenharmony_ci "punpcklbw %%mm7, %%"out_reg1" \n\t"\ 786cabdff1aSopenharmony_ci "punpcklbw %%mm7, %%"out_reg2" \n\t"\ 787cabdff1aSopenharmony_ci "movd "s_offset"(%%"FF_REG_S"), %%mm0 \n\t"\ 788cabdff1aSopenharmony_ci "movd "s_offset"+4(%%"FF_REG_S"), %%mm4 \n\t"\ 789cabdff1aSopenharmony_ci "punpcklbw %%mm7, %%mm0 \n\t"\ 790cabdff1aSopenharmony_ci "punpcklbw %%mm7, %%mm4 \n\t"\ 791cabdff1aSopenharmony_ci "pmullw %%mm0, %%"out_reg1" \n\t"\ 792cabdff1aSopenharmony_ci "pmullw %%mm4, %%"out_reg2" \n\t" 793cabdff1aSopenharmony_ci 794cabdff1aSopenharmony_ci#define snow_inner_add_yblock_mmx_accum(ptr_offset, s_offset, d_offset) \ 795cabdff1aSopenharmony_ci snow_inner_add_yblock_mmx_start("mm2", "mm6", ptr_offset, s_offset, d_offset)\ 796cabdff1aSopenharmony_ci "paddusw %%mm2, %%mm1 \n\t"\ 797cabdff1aSopenharmony_ci "paddusw %%mm6, %%mm5 \n\t" 798cabdff1aSopenharmony_ci 799cabdff1aSopenharmony_ci#define snow_inner_add_yblock_mmx_mix(read_offset, write_offset)\ 800cabdff1aSopenharmony_ci "mov %0, %%"FF_REG_d" \n\t"\ 801cabdff1aSopenharmony_ci "psrlw $4, %%mm1 \n\t"\ 802cabdff1aSopenharmony_ci "psrlw $4, %%mm5 \n\t"\ 803cabdff1aSopenharmony_ci "paddw "read_offset"(%%"FF_REG_D"), %%mm1 \n\t"\ 804cabdff1aSopenharmony_ci "paddw "read_offset"+8(%%"FF_REG_D"), %%mm5 \n\t"\ 805cabdff1aSopenharmony_ci "paddw %%mm3, %%mm1 \n\t"\ 806cabdff1aSopenharmony_ci "paddw %%mm3, %%mm5 \n\t"\ 807cabdff1aSopenharmony_ci "psraw $4, %%mm1 \n\t"\ 808cabdff1aSopenharmony_ci "psraw $4, %%mm5 \n\t"\ 809cabdff1aSopenharmony_ci "packuswb %%mm5, %%mm1 \n\t"\ 810cabdff1aSopenharmony_ci "movq %%mm1, "write_offset"(%%"FF_REG_d") \n\t" 811cabdff1aSopenharmony_ci 812cabdff1aSopenharmony_ci#define snow_inner_add_yblock_mmx_end(s_step)\ 813cabdff1aSopenharmony_ci "add $"s_step", %%"FF_REG_S" \n\t"\ 814cabdff1aSopenharmony_ci "add %%"FF_REG_c", "FF_PTR_SIZE"*3(%%"FF_REG_a"); \n\t"\ 815cabdff1aSopenharmony_ci "add %%"FF_REG_c", "FF_PTR_SIZE"*2(%%"FF_REG_a"); \n\t"\ 816cabdff1aSopenharmony_ci "add %%"FF_REG_c", "FF_PTR_SIZE"*1(%%"FF_REG_a"); \n\t"\ 817cabdff1aSopenharmony_ci "add %%"FF_REG_c", (%%"FF_REG_a") \n\t"\ 818cabdff1aSopenharmony_ci "add"FF_OPSIZE " $"FF_PTR_SIZE"*1, %1 \n\t"\ 819cabdff1aSopenharmony_ci "add %%"FF_REG_c", %0 \n\t"\ 820cabdff1aSopenharmony_ci "dec %2 \n\t"\ 821cabdff1aSopenharmony_ci "jnz 1b \n\t"\ 822cabdff1aSopenharmony_ci :"+m"(dst8),"+m"(dst_array),"=&r"(tmp)\ 823cabdff1aSopenharmony_ci :\ 824cabdff1aSopenharmony_ci "rm"((x86_reg)(src_x<<1)),"m"(obmc),"a"(block),"m"(b_h),"m"(src_stride):\ 825cabdff1aSopenharmony_ci "%"FF_REG_c"","%"FF_REG_S"","%"FF_REG_D"","%"FF_REG_d""); 826cabdff1aSopenharmony_ci 827cabdff1aSopenharmony_cistatic void inner_add_yblock_bw_8_obmc_16_mmx(const uint8_t *obmc, const x86_reg obmc_stride, uint8_t * * block, int b_w, x86_reg b_h, 828cabdff1aSopenharmony_ci int src_x, int src_y, x86_reg src_stride, slice_buffer * sb, int add, uint8_t * dst8){ 829cabdff1aSopenharmony_cisnow_inner_add_yblock_mmx_header 830cabdff1aSopenharmony_cisnow_inner_add_yblock_mmx_start("mm1", "mm5", "3", "0", "0") 831cabdff1aSopenharmony_cisnow_inner_add_yblock_mmx_accum("2", "8", "0") 832cabdff1aSopenharmony_cisnow_inner_add_yblock_mmx_accum("1", "128", "0") 833cabdff1aSopenharmony_cisnow_inner_add_yblock_mmx_accum("0", "136", "0") 834cabdff1aSopenharmony_cisnow_inner_add_yblock_mmx_mix("0", "0") 835cabdff1aSopenharmony_cisnow_inner_add_yblock_mmx_end("16") 836cabdff1aSopenharmony_ci} 837cabdff1aSopenharmony_ci 838cabdff1aSopenharmony_cistatic void inner_add_yblock_bw_16_obmc_32_mmx(const uint8_t *obmc, const x86_reg obmc_stride, uint8_t * * block, int b_w, x86_reg b_h, 839cabdff1aSopenharmony_ci int src_x, int src_y, x86_reg src_stride, slice_buffer * sb, int add, uint8_t * dst8){ 840cabdff1aSopenharmony_cisnow_inner_add_yblock_mmx_header 841cabdff1aSopenharmony_cisnow_inner_add_yblock_mmx_start("mm1", "mm5", "3", "0", "0") 842cabdff1aSopenharmony_cisnow_inner_add_yblock_mmx_accum("2", "16", "0") 843cabdff1aSopenharmony_cisnow_inner_add_yblock_mmx_accum("1", "512", "0") 844cabdff1aSopenharmony_cisnow_inner_add_yblock_mmx_accum("0", "528", "0") 845cabdff1aSopenharmony_cisnow_inner_add_yblock_mmx_mix("0", "0") 846cabdff1aSopenharmony_ci 847cabdff1aSopenharmony_cisnow_inner_add_yblock_mmx_start("mm1", "mm5", "3", "8", "8") 848cabdff1aSopenharmony_cisnow_inner_add_yblock_mmx_accum("2", "24", "8") 849cabdff1aSopenharmony_cisnow_inner_add_yblock_mmx_accum("1", "520", "8") 850cabdff1aSopenharmony_cisnow_inner_add_yblock_mmx_accum("0", "536", "8") 851cabdff1aSopenharmony_cisnow_inner_add_yblock_mmx_mix("16", "8") 852cabdff1aSopenharmony_cisnow_inner_add_yblock_mmx_end("32") 853cabdff1aSopenharmony_ci} 854cabdff1aSopenharmony_ci 855cabdff1aSopenharmony_cistatic void ff_snow_inner_add_yblock_sse2(const uint8_t *obmc, const int obmc_stride, uint8_t * * block, int b_w, int b_h, 856cabdff1aSopenharmony_ci int src_x, int src_y, int src_stride, slice_buffer * sb, int add, uint8_t * dst8){ 857cabdff1aSopenharmony_ci 858cabdff1aSopenharmony_ci if (b_w == 16) 859cabdff1aSopenharmony_ci inner_add_yblock_bw_16_obmc_32_sse2(obmc, obmc_stride, block, b_w, b_h, src_x,src_y, src_stride, sb, add, dst8); 860cabdff1aSopenharmony_ci else if (b_w == 8 && obmc_stride == 16) { 861cabdff1aSopenharmony_ci if (!(b_h & 1)) 862cabdff1aSopenharmony_ci inner_add_yblock_bw_8_obmc_16_bh_even_sse2(obmc, obmc_stride, block, b_w, b_h, src_x,src_y, src_stride, sb, add, dst8); 863cabdff1aSopenharmony_ci else 864cabdff1aSopenharmony_ci inner_add_yblock_bw_8_obmc_16_mmx(obmc, obmc_stride, block, b_w, b_h, src_x,src_y, src_stride, sb, add, dst8); 865cabdff1aSopenharmony_ci } else 866cabdff1aSopenharmony_ci ff_snow_inner_add_yblock(obmc, obmc_stride, block, b_w, b_h, src_x,src_y, src_stride, sb, add, dst8); 867cabdff1aSopenharmony_ci} 868cabdff1aSopenharmony_ci 869cabdff1aSopenharmony_cistatic void ff_snow_inner_add_yblock_mmx(const uint8_t *obmc, const int obmc_stride, uint8_t * * block, int b_w, int b_h, 870cabdff1aSopenharmony_ci int src_x, int src_y, int src_stride, slice_buffer * sb, int add, uint8_t * dst8){ 871cabdff1aSopenharmony_ci if (b_w == 16) 872cabdff1aSopenharmony_ci inner_add_yblock_bw_16_obmc_32_mmx(obmc, obmc_stride, block, b_w, b_h, src_x,src_y, src_stride, sb, add, dst8); 873cabdff1aSopenharmony_ci else if (b_w == 8 && obmc_stride == 16) 874cabdff1aSopenharmony_ci inner_add_yblock_bw_8_obmc_16_mmx(obmc, obmc_stride, block, b_w, b_h, src_x,src_y, src_stride, sb, add, dst8); 875cabdff1aSopenharmony_ci else 876cabdff1aSopenharmony_ci ff_snow_inner_add_yblock(obmc, obmc_stride, block, b_w, b_h, src_x,src_y, src_stride, sb, add, dst8); 877cabdff1aSopenharmony_ci} 878cabdff1aSopenharmony_ci#endif /* HAVE_6REGS */ 879cabdff1aSopenharmony_ci 880cabdff1aSopenharmony_ci#endif /* HAVE_INLINE_ASM */ 881cabdff1aSopenharmony_ci 882cabdff1aSopenharmony_ciav_cold void ff_dwt_init_x86(SnowDWTContext *c) 883cabdff1aSopenharmony_ci{ 884cabdff1aSopenharmony_ci#if HAVE_INLINE_ASM 885cabdff1aSopenharmony_ci int mm_flags = av_get_cpu_flags(); 886cabdff1aSopenharmony_ci 887cabdff1aSopenharmony_ci if (mm_flags & AV_CPU_FLAG_MMX) { 888cabdff1aSopenharmony_ci if(mm_flags & AV_CPU_FLAG_SSE2 & 0){ 889cabdff1aSopenharmony_ci c->horizontal_compose97i = ff_snow_horizontal_compose97i_sse2; 890cabdff1aSopenharmony_ci#if HAVE_7REGS 891cabdff1aSopenharmony_ci c->vertical_compose97i = ff_snow_vertical_compose97i_sse2; 892cabdff1aSopenharmony_ci#endif 893cabdff1aSopenharmony_ci#if HAVE_6REGS 894cabdff1aSopenharmony_ci c->inner_add_yblock = ff_snow_inner_add_yblock_sse2; 895cabdff1aSopenharmony_ci#endif 896cabdff1aSopenharmony_ci } 897cabdff1aSopenharmony_ci else{ 898cabdff1aSopenharmony_ci if (mm_flags & AV_CPU_FLAG_MMXEXT) { 899cabdff1aSopenharmony_ci c->horizontal_compose97i = ff_snow_horizontal_compose97i_mmx; 900cabdff1aSopenharmony_ci#if HAVE_7REGS 901cabdff1aSopenharmony_ci c->vertical_compose97i = ff_snow_vertical_compose97i_mmx; 902cabdff1aSopenharmony_ci#endif 903cabdff1aSopenharmony_ci } 904cabdff1aSopenharmony_ci#if HAVE_6REGS 905cabdff1aSopenharmony_ci c->inner_add_yblock = ff_snow_inner_add_yblock_mmx; 906cabdff1aSopenharmony_ci#endif 907cabdff1aSopenharmony_ci } 908cabdff1aSopenharmony_ci } 909cabdff1aSopenharmony_ci#endif /* HAVE_INLINE_ASM */ 910cabdff1aSopenharmony_ci} 911