;
; jfdctint.asm - accurate integer FDCT (MMX)
;
; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
; Copyright (C) 2016, 2020, D. R. Commander.
;
; Based on the x86 SIMD extension for IJG JPEG library
; Copyright (C) 1999-2006, MIYASAKA Masaru.
; For conditions of distribution and use, see copyright notice in jsimdext.inc
;
; This file should be assembled with NASM (Netwide Assembler),
; can *not* be assembled with Microsoft's MASM or any compatible
; assembler (including Borland's Turbo Assembler).
; NASM is available from http://nasm.sourceforge.net/ or
; http://sourceforge.net/project/showfiles.php?group_id=6208
;
; This file contains a slower but more accurate integer implementation of the
; forward DCT (Discrete Cosine Transform). The following code is based
; directly on the IJG's original jfdctint.c; see the jfdctint.c for
; more details.

%include "jsimdext.inc"
%include "jdct.inc"

; --------------------------------------------------------------------------

; Fixed-point parameters.  Multiplier constants carry CONST_BITS fractional
; bits; the output of pass 1 is kept scaled up by 2^PASS1_BITS and that extra
; scaling is removed again at the end of pass 2.

%define CONST_BITS  13
%define PASS1_BITS  2

; Right-shift amounts applied to the 32-bit products in pass 1 and pass 2.
%define DESCALE_P1  (CONST_BITS - PASS1_BITS)
%define DESCALE_P2  (CONST_BITS + PASS1_BITS)

%if CONST_BITS == 13
F_0_298 equ  2446  ; FIX(0.298631336)
F_0_390 equ  3196  ; FIX(0.390180644)
F_0_541 equ  4433  ; FIX(0.541196100)
F_0_765 equ  6270  ; FIX(0.765366865)
F_0_899 equ  7373  ; FIX(0.899976223)
F_1_175 equ  9633  ; FIX(1.175875602)
F_1_501 equ 12299  ; FIX(1.501321110)
F_1_847 equ 15137  ; FIX(1.847759065)
F_1_961 equ 16069  ; FIX(1.961570560)
F_2_053 equ 16819  ; FIX(2.053119869)
F_2_562 equ 20995  ; FIX(2.562915447)
F_3_072 equ 25172  ; FIX(3.072711026)
%else
; NASM cannot do compile-time arithmetic on floating-point constants.
; The literals below are the same constants expressed with 30 fractional
; bits; DESCALE rounds them down to CONST_BITS fractional bits.
%define DESCALE(x, n)  (((x) + (1 << ((n) - 1))) >> (n))
F_0_298 equ DESCALE( 320652955, 30 - CONST_BITS)  ; FIX(0.298631336)
F_0_390 equ DESCALE( 418953276, 30 - CONST_BITS)  ; FIX(0.390180644)
F_0_541 equ DESCALE( 581104887, 30 - CONST_BITS)  ; FIX(0.541196100)
F_0_765 equ DESCALE( 821806413, 30 - CONST_BITS)  ; FIX(0.765366865)
F_0_899 equ DESCALE( 966342111, 30 - CONST_BITS)  ; FIX(0.899976223)
F_1_175 equ DESCALE(1262586813, 30 - CONST_BITS)  ; FIX(1.175875602)
F_1_501 equ DESCALE(1612031267, 30 - CONST_BITS)  ; FIX(1.501321110)
F_1_847 equ DESCALE(1984016188, 30 - CONST_BITS)  ; FIX(1.847759065)
F_1_961 equ DESCALE(2106220350, 30 - CONST_BITS)  ; FIX(1.961570560)
F_2_053 equ DESCALE(2204520673, 30 - CONST_BITS)  ; FIX(2.053119869)
F_2_562 equ DESCALE(2751909506, 30 - CONST_BITS)  ; FIX(2.562915447)
F_3_072 equ DESCALE(3299298341, 30 - CONST_BITS)  ; FIX(3.072711026)
%endif

; --------------------------------------------------------------------------
    SECTION     SEG_CONST

    alignz      32
    GLOBAL_DATA(jconst_fdct_islow_mmx)

EXTN(jconst_fdct_islow_mmx):

; Each PW_* entry is one 64-bit operand for pmaddwd: the word pair (a, b) is
; repeated twice, so that for an input of interleaved words (x0 y0 x1 y1)
; pmaddwd yields the two dwords (x0*a + y0*b, x1*a + y1*b).
; PD_DESCALE_Px / PW_DESCALE_P2X are the rounding terms added before the
; corresponding arithmetic right shift.

PW_F130_F054   times 2 dw  (F_0_541 + F_0_765),  F_0_541
PW_F054_MF130  times 2 dw  F_0_541, (F_0_541 - F_1_847)
PW_MF078_F117  times 2 dw  (F_1_175 - F_1_961),  F_1_175
PW_F117_F078   times 2 dw  F_1_175, (F_1_175 - F_0_390)
PW_MF060_MF089 times 2 dw  (F_0_298 - F_0_899), -F_0_899
PW_MF089_F060  times 2 dw -F_0_899, (F_1_501 - F_0_899)
PW_MF050_MF256 times 2 dw  (F_2_053 - F_2_562), -F_2_562
PW_MF256_F050  times 2 dw -F_2_562, (F_3_072 - F_2_562)
PD_DESCALE_P1  times 2 dd  1 << (DESCALE_P1 - 1)
PD_DESCALE_P2  times 2 dd  1 << (DESCALE_P2 - 1)
PW_DESCALE_P2X times 4 dw  1 << (PASS1_BITS - 1)

    alignz      32

; --------------------------------------------------------------------------
    SECTION     SEG_TEXT
    BITS        32
;
; Perform the forward DCT on one block of samples.
;
; GLOBAL(void)
; jsimd_fdct_islow_mmx(DCTELEM *data)
;
; The block addressed by 'data' is transformed in place: pass 1 walks the
; block four rows at a time, pass 2 walks it four columns at a time.  Each
; pass transposes the 4x4 tiles it loads so that the butterflies can be done
; on whole MMX registers.
;
; Registers: eax = frame base (address of the saved ebp), ebx = GOT pointer
; (PIC builds), ecx = loop counter, edx = current position inside 'data'.
; mm0-mm7 are all clobbered; emms is executed before returning.
;

; 'b' is the frame base captured right after "push ebp": [b+0] is the saved
; ebp, [b+4] the return address and [b+8] the first stack argument.
%define data(b)       (b) + 8           ; DCTELEM *data

%define original_ebp  ebp + 0
; wk[] is a scratch area of WK_NUM mmwords located directly below the
; 8-byte-aligned ebp established in the prologue.
%define wk(i)         ebp - (WK_NUM - (i)) * SIZEOF_MMWORD  ; mmword wk[WK_NUM]
%define WK_NUM        2

    align       32
    GLOBAL_FUNCTION(jsimd_fdct_islow_mmx)

EXTN(jsimd_fdct_islow_mmx):
    push        ebp
    mov         eax, esp                    ; eax = frame base (points at saved ebp)
    sub         esp, byte 4                 ; room for the saved frame base
    and         esp, byte (-SIZEOF_MMWORD)  ; align to 64 bits
    mov         [esp], eax                  ; remember frame base for the epilogue
    mov         ebp, esp                    ; ebp = aligned ebp
    lea         esp, [wk(0)]                ; reserve wk[WK_NUM] below ebp
    pushpic     ebx
;   push        ecx                     ; need not be preserved
;   push        edx                     ; need not be preserved
;   push        esi                     ; unused
;   push        edi                     ; unused

    get_GOT     ebx                     ; get GOT address

    ; ---- Pass 1: process rows.
    ;
    ; Two iterations (DCTSIZE/4), each handling four rows of the block.
    ; Results are left scaled up by 2^PASS1_BITS.

    mov         edx, POINTER [data(eax)]  ; (DCTELEM *)
    mov         ecx, DCTSIZE/4
    alignx      16, 7
.rowloop:

    movq        mm0, MMWORD [MMBLOCK(2,0,edx,SIZEOF_DCTELEM)]
    movq        mm1, MMWORD [MMBLOCK(3,0,edx,SIZEOF_DCTELEM)]
    movq        mm2, MMWORD [MMBLOCK(2,1,edx,SIZEOF_DCTELEM)]
    movq        mm3, MMWORD [MMBLOCK(3,1,edx,SIZEOF_DCTELEM)]

    ; mm0=(20 21 22 23), mm2=(24 25 26 27)
    ; mm1=(30 31 32 33), mm3=(34 35 36 37)

    movq        mm4, mm0                ; transpose coefficients(phase 1)
    punpcklwd   mm0, mm1                ; mm0=(20 30 21 31)
    punpckhwd   mm4, mm1                ; mm4=(22 32 23 33)
    movq        mm5, mm2                ; transpose coefficients(phase 1)
    punpcklwd   mm2, mm3                ; mm2=(24 34 25 35)
    punpckhwd   mm5, mm3                ; mm5=(26 36 27 37)

    movq        mm6, MMWORD [MMBLOCK(0,0,edx,SIZEOF_DCTELEM)]
    movq        mm7, MMWORD [MMBLOCK(1,0,edx,SIZEOF_DCTELEM)]
    movq        mm1, MMWORD [MMBLOCK(0,1,edx,SIZEOF_DCTELEM)]
    movq        mm3, MMWORD [MMBLOCK(1,1,edx,SIZEOF_DCTELEM)]

    ; mm6=(00 01 02 03), mm1=(04 05 06 07)
    ; mm7=(10 11 12 13), mm3=(14 15 16 17)

    ; All eight registers are live; spill two partial transposes to wk[].
    movq        MMWORD [wk(0)], mm4     ; wk(0)=(22 32 23 33)
    movq        MMWORD [wk(1)], mm2     ; wk(1)=(24 34 25 35)

    movq        mm4, mm6                ; transpose coefficients(phase 1)
    punpcklwd   mm6, mm7                ; mm6=(00 10 01 11)
    punpckhwd   mm4, mm7                ; mm4=(02 12 03 13)
    movq        mm2, mm1                ; transpose coefficients(phase 1)
    punpcklwd   mm1, mm3                ; mm1=(04 14 05 15)
    punpckhwd   mm2, mm3                ; mm2=(06 16 07 17)

    movq        mm7, mm6                ; transpose coefficients(phase 2)
    punpckldq   mm6, mm0                ; mm6=(00 10 20 30)=data0
    punpckhdq   mm7, mm0                ; mm7=(01 11 21 31)=data1
    movq        mm3, mm2                ; transpose coefficients(phase 2)
    punpckldq   mm2, mm5                ; mm2=(06 16 26 36)=data6
    punpckhdq   mm3, mm5                ; mm3=(07 17 27 37)=data7

    movq        mm0, mm7
    movq        mm5, mm6
    psubw       mm7, mm2                ; mm7=data1-data6=tmp6
    psubw       mm6, mm3                ; mm6=data0-data7=tmp7
    paddw       mm0, mm2                ; mm0=data1+data6=tmp1
    paddw       mm5, mm3                ; mm5=data0+data7=tmp0

    ; Swap the spilled transposes back in and park tmp6/tmp7 for the odd part.
    movq        mm2, MMWORD [wk(0)]     ; mm2=(22 32 23 33)
    movq        mm3, MMWORD [wk(1)]     ; mm3=(24 34 25 35)
    movq        MMWORD [wk(0)], mm7     ; wk(0)=tmp6
    movq        MMWORD [wk(1)], mm6     ; wk(1)=tmp7

    movq        mm7, mm4                ; transpose coefficients(phase 2)
    punpckldq   mm4, mm2                ; mm4=(02 12 22 32)=data2
    punpckhdq   mm7, mm2                ; mm7=(03 13 23 33)=data3
    movq        mm6, mm1                ; transpose coefficients(phase 2)
    punpckldq   mm1, mm3                ; mm1=(04 14 24 34)=data4
    punpckhdq   mm6, mm3                ; mm6=(05 15 25 35)=data5

    movq        mm2, mm7
    movq        mm3, mm4
    paddw       mm7, mm1                ; mm7=data3+data4=tmp3
    paddw       mm4, mm6                ; mm4=data2+data5=tmp2
    psubw       mm2, mm1                ; mm2=data3-data4=tmp4
    psubw       mm3, mm6                ; mm3=data2-data5=tmp5

    ; -- Even part

    movq        mm1, mm5
    movq        mm6, mm0
    paddw       mm5, mm7                ; mm5=tmp10
    paddw       mm0, mm4                ; mm0=tmp11
    psubw       mm1, mm7                ; mm1=tmp13
    psubw       mm6, mm4                ; mm6=tmp12

    movq        mm7, mm5
    paddw       mm5, mm0                ; mm5=tmp10+tmp11
    psubw       mm7, mm0                ; mm7=tmp10-tmp11

    ; data0/data4 need no multiplication, only the 2^PASS1_BITS up-scaling.
    psllw       mm5, PASS1_BITS         ; mm5=data0
    psllw       mm7, PASS1_BITS         ; mm7=data4

    movq        MMWORD [MMBLOCK(0,0,edx,SIZEOF_DCTELEM)], mm5
    movq        MMWORD [MMBLOCK(0,1,edx,SIZEOF_DCTELEM)], mm7

    ; (Original)
    ; z1 = (tmp12 + tmp13) * 0.541196100;
    ; data2 = z1 + tmp13 * 0.765366865;
    ; data6 = z1 + tmp12 * -1.847759065;
    ;
    ; (This implementation)
    ; data2 = tmp13 * (0.541196100 + 0.765366865) + tmp12 * 0.541196100;
    ; data6 = tmp13 * 0.541196100 + tmp12 * (0.541196100 - 1.847759065);

    ; Interleave tmp13/tmp12 so that each pmaddwd produces one output dword.
    movq        mm4, mm1                ; mm1=tmp13
    movq        mm0, mm1
    punpcklwd   mm4, mm6                ; mm6=tmp12
    punpckhwd   mm0, mm6
    movq        mm1, mm4
    movq        mm6, mm0
    pmaddwd     mm4, [GOTOFF(ebx,PW_F130_F054)]   ; mm4=data2L
    pmaddwd     mm0, [GOTOFF(ebx,PW_F130_F054)]   ; mm0=data2H
    pmaddwd     mm1, [GOTOFF(ebx,PW_F054_MF130)]  ; mm1=data6L
    pmaddwd     mm6, [GOTOFF(ebx,PW_F054_MF130)]  ; mm6=data6H

    ; Round, drop (CONST_BITS - PASS1_BITS) fraction bits, pack to words.
    paddd       mm4, [GOTOFF(ebx,PD_DESCALE_P1)]
    paddd       mm0, [GOTOFF(ebx,PD_DESCALE_P1)]
    psrad       mm4, DESCALE_P1
    psrad       mm0, DESCALE_P1
    paddd       mm1, [GOTOFF(ebx,PD_DESCALE_P1)]
    paddd       mm6, [GOTOFF(ebx,PD_DESCALE_P1)]
    psrad       mm1, DESCALE_P1
    psrad       mm6, DESCALE_P1

    packssdw    mm4, mm0                ; mm4=data2
    packssdw    mm1, mm6                ; mm1=data6

    movq        MMWORD [MMBLOCK(2,0,edx,SIZEOF_DCTELEM)], mm4
    movq        MMWORD [MMBLOCK(2,1,edx,SIZEOF_DCTELEM)], mm1

    ; -- Odd part

    movq        mm5, MMWORD [wk(0)]     ; mm5=tmp6
    movq        mm7, MMWORD [wk(1)]     ; mm7=tmp7

    movq        mm0, mm2                ; mm2=tmp4
    movq        mm6, mm3                ; mm3=tmp5
    paddw       mm0, mm5                ; mm0=z3
    paddw       mm6, mm7                ; mm6=z4

    ; (Original)
    ; z5 = (z3 + z4) * 1.175875602;
    ; z3 = z3 * -1.961570560;  z4 = z4 * -0.390180644;
    ; z3 += z5;  z4 += z5;
    ;
    ; (This implementation)
    ; z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602;
    ; z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644);

    movq        mm4, mm0
    movq        mm1, mm0
    punpcklwd   mm4, mm6
    punpckhwd   mm1, mm6
    movq        mm0, mm4
    movq        mm6, mm1
    pmaddwd     mm4, [GOTOFF(ebx,PW_MF078_F117)]  ; mm4=z3L
    pmaddwd     mm1, [GOTOFF(ebx,PW_MF078_F117)]  ; mm1=z3H
    pmaddwd     mm0, [GOTOFF(ebx,PW_F117_F078)]   ; mm0=z4L
    pmaddwd     mm6, [GOTOFF(ebx,PW_F117_F078)]   ; mm6=z4H

    ; z3 goes to wk[]; z4 stays in mm0/mm6 until data5 has been formed.
    movq        MMWORD [wk(0)], mm4     ; wk(0)=z3L
    movq        MMWORD [wk(1)], mm1     ; wk(1)=z3H

    ; (Original)
    ; z1 = tmp4 + tmp7;  z2 = tmp5 + tmp6;
    ; tmp4 = tmp4 * 0.298631336;  tmp5 = tmp5 * 2.053119869;
    ; tmp6 = tmp6 * 3.072711026;  tmp7 = tmp7 * 1.501321110;
    ; z1 = z1 * -0.899976223;  z2 = z2 * -2.562915447;
    ; data7 = tmp4 + z1 + z3;  data5 = tmp5 + z2 + z4;
    ; data3 = tmp6 + z2 + z3;  data1 = tmp7 + z1 + z4;
    ;
    ; (This implementation)
    ; tmp4 = tmp4 * (0.298631336 - 0.899976223) + tmp7 * -0.899976223;
    ; tmp5 = tmp5 * (2.053119869 - 2.562915447) + tmp6 * -2.562915447;
    ; tmp6 = tmp5 * -2.562915447 + tmp6 * (3.072711026 - 2.562915447);
    ; tmp7 = tmp4 * -0.899976223 + tmp7 * (1.501321110 - 0.899976223);
    ; data7 = tmp4 + z3;  data5 = tmp5 + z4;
    ; data3 = tmp6 + z3;  data1 = tmp7 + z4;

    movq        mm4, mm2
    movq        mm1, mm2
    punpcklwd   mm4, mm7
    punpckhwd   mm1, mm7
    movq        mm2, mm4
    movq        mm7, mm1
    pmaddwd     mm4, [GOTOFF(ebx,PW_MF060_MF089)]  ; mm4=tmp4L
    pmaddwd     mm1, [GOTOFF(ebx,PW_MF060_MF089)]  ; mm1=tmp4H
    pmaddwd     mm2, [GOTOFF(ebx,PW_MF089_F060)]   ; mm2=tmp7L
    pmaddwd     mm7, [GOTOFF(ebx,PW_MF089_F060)]   ; mm7=tmp7H

    paddd       mm4, MMWORD [wk(0)]     ; mm4=data7L
    paddd       mm1, MMWORD [wk(1)]     ; mm1=data7H
    paddd       mm2, mm0                ; mm2=data1L
    paddd       mm7, mm6                ; mm7=data1H

    paddd       mm4, [GOTOFF(ebx,PD_DESCALE_P1)]
    paddd       mm1, [GOTOFF(ebx,PD_DESCALE_P1)]
    psrad       mm4, DESCALE_P1
    psrad       mm1, DESCALE_P1
    paddd       mm2, [GOTOFF(ebx,PD_DESCALE_P1)]
    paddd       mm7, [GOTOFF(ebx,PD_DESCALE_P1)]
    psrad       mm2, DESCALE_P1
    psrad       mm7, DESCALE_P1

    packssdw    mm4, mm1                ; mm4=data7
    packssdw    mm2, mm7                ; mm2=data1

    movq        MMWORD [MMBLOCK(3,1,edx,SIZEOF_DCTELEM)], mm4
    movq        MMWORD [MMBLOCK(1,0,edx,SIZEOF_DCTELEM)], mm2

    movq        mm1, mm3
    movq        mm7, mm3
    punpcklwd   mm1, mm5
    punpckhwd   mm7, mm5
    movq        mm3, mm1
    movq        mm5, mm7
    pmaddwd     mm1, [GOTOFF(ebx,PW_MF050_MF256)]  ; mm1=tmp5L
    pmaddwd     mm7, [GOTOFF(ebx,PW_MF050_MF256)]  ; mm7=tmp5H
    pmaddwd     mm3, [GOTOFF(ebx,PW_MF256_F050)]   ; mm3=tmp6L
    pmaddwd     mm5, [GOTOFF(ebx,PW_MF256_F050)]   ; mm5=tmp6H

    paddd       mm1, mm0                ; mm1=data5L
    paddd       mm7, mm6                ; mm7=data5H
    paddd       mm3, MMWORD [wk(0)]     ; mm3=data3L
    paddd       mm5, MMWORD [wk(1)]     ; mm5=data3H

    paddd       mm1, [GOTOFF(ebx,PD_DESCALE_P1)]
    paddd       mm7, [GOTOFF(ebx,PD_DESCALE_P1)]
    psrad       mm1, DESCALE_P1
    psrad       mm7, DESCALE_P1
    paddd       mm3, [GOTOFF(ebx,PD_DESCALE_P1)]
    paddd       mm5, [GOTOFF(ebx,PD_DESCALE_P1)]
    psrad       mm3, DESCALE_P1
    psrad       mm5, DESCALE_P1

    packssdw    mm1, mm7                ; mm1=data5
    packssdw    mm3, mm5                ; mm3=data3

    movq        MMWORD [MMBLOCK(1,1,edx,SIZEOF_DCTELEM)], mm1
    movq        MMWORD [MMBLOCK(3,0,edx,SIZEOF_DCTELEM)], mm3

    ; Advance to the next group of four rows.
    add         edx, byte 4*DCTSIZE*SIZEOF_DCTELEM
    dec         ecx
    jnz         near .rowloop

    ; ---- Pass 2: process columns.
    ;
    ; Two iterations (DCTSIZE/4), each handling four columns of the block.
    ; The 2^PASS1_BITS scaling introduced by pass 1 is removed here.

    mov         edx, POINTER [data(eax)]  ; (DCTELEM *)
    mov         ecx, DCTSIZE/4
    alignx      16, 7
.columnloop:

    movq        mm0, MMWORD [MMBLOCK(2,0,edx,SIZEOF_DCTELEM)]
    movq        mm1, MMWORD [MMBLOCK(3,0,edx,SIZEOF_DCTELEM)]
    movq        mm2, MMWORD [MMBLOCK(6,0,edx,SIZEOF_DCTELEM)]
    movq        mm3, MMWORD [MMBLOCK(7,0,edx,SIZEOF_DCTELEM)]

    ; mm0=(02 12 22 32), mm2=(42 52 62 72)
    ; mm1=(03 13 23 33), mm3=(43 53 63 73)

    movq        mm4, mm0                ; transpose coefficients(phase 1)
    punpcklwd   mm0, mm1                ; mm0=(02 03 12 13)
    punpckhwd   mm4, mm1                ; mm4=(22 23 32 33)
    movq        mm5, mm2                ; transpose coefficients(phase 1)
    punpcklwd   mm2, mm3                ; mm2=(42 43 52 53)
    punpckhwd   mm5, mm3                ; mm5=(62 63 72 73)

    movq        mm6, MMWORD [MMBLOCK(0,0,edx,SIZEOF_DCTELEM)]
    movq        mm7, MMWORD [MMBLOCK(1,0,edx,SIZEOF_DCTELEM)]
    movq        mm1, MMWORD [MMBLOCK(4,0,edx,SIZEOF_DCTELEM)]
    movq        mm3, MMWORD [MMBLOCK(5,0,edx,SIZEOF_DCTELEM)]

    ; mm6=(00 10 20 30), mm1=(40 50 60 70)
    ; mm7=(01 11 21 31), mm3=(41 51 61 71)

    ; All eight registers are live; spill two partial transposes to wk[].
    movq        MMWORD [wk(0)], mm4     ; wk(0)=(22 23 32 33)
    movq        MMWORD [wk(1)], mm2     ; wk(1)=(42 43 52 53)

    movq        mm4, mm6                ; transpose coefficients(phase 1)
    punpcklwd   mm6, mm7                ; mm6=(00 01 10 11)
    punpckhwd   mm4, mm7                ; mm4=(20 21 30 31)
    movq        mm2, mm1                ; transpose coefficients(phase 1)
    punpcklwd   mm1, mm3                ; mm1=(40 41 50 51)
    punpckhwd   mm2, mm3                ; mm2=(60 61 70 71)

    movq        mm7, mm6                ; transpose coefficients(phase 2)
    punpckldq   mm6, mm0                ; mm6=(00 01 02 03)=data0
    punpckhdq   mm7, mm0                ; mm7=(10 11 12 13)=data1
    movq        mm3, mm2                ; transpose coefficients(phase 2)
    punpckldq   mm2, mm5                ; mm2=(60 61 62 63)=data6
    punpckhdq   mm3, mm5                ; mm3=(70 71 72 73)=data7

    movq        mm0, mm7
    movq        mm5, mm6
    psubw       mm7, mm2                ; mm7=data1-data6=tmp6
    psubw       mm6, mm3                ; mm6=data0-data7=tmp7
    paddw       mm0, mm2                ; mm0=data1+data6=tmp1
    paddw       mm5, mm3                ; mm5=data0+data7=tmp0

    ; Swap the spilled transposes back in and park tmp6/tmp7 for the odd part.
    movq        mm2, MMWORD [wk(0)]     ; mm2=(22 23 32 33)
    movq        mm3, MMWORD [wk(1)]     ; mm3=(42 43 52 53)
    movq        MMWORD [wk(0)], mm7     ; wk(0)=tmp6
    movq        MMWORD [wk(1)], mm6     ; wk(1)=tmp7

    movq        mm7, mm4                ; transpose coefficients(phase 2)
    punpckldq   mm4, mm2                ; mm4=(20 21 22 23)=data2
    punpckhdq   mm7, mm2                ; mm7=(30 31 32 33)=data3
    movq        mm6, mm1                ; transpose coefficients(phase 2)
    punpckldq   mm1, mm3                ; mm1=(40 41 42 43)=data4
    punpckhdq   mm6, mm3                ; mm6=(50 51 52 53)=data5

    movq        mm2, mm7
    movq        mm3, mm4
    paddw       mm7, mm1                ; mm7=data3+data4=tmp3
    paddw       mm4, mm6                ; mm4=data2+data5=tmp2
    psubw       mm2, mm1                ; mm2=data3-data4=tmp4
    psubw       mm3, mm6                ; mm3=data2-data5=tmp5

    ; -- Even part

    movq        mm1, mm5
    movq        mm6, mm0
    paddw       mm5, mm7                ; mm5=tmp10
    paddw       mm0, mm4                ; mm0=tmp11
    psubw       mm1, mm7                ; mm1=tmp13
    psubw       mm6, mm4                ; mm6=tmp12

    movq        mm7, mm5
    paddw       mm5, mm0                ; mm5=tmp10+tmp11
    psubw       mm7, mm0                ; mm7=tmp10-tmp11

    ; data0/data4 need no multiplication: round and shift out PASS1_BITS.
    paddw       mm5, [GOTOFF(ebx,PW_DESCALE_P2X)]
    paddw       mm7, [GOTOFF(ebx,PW_DESCALE_P2X)]
    psraw       mm5, PASS1_BITS         ; mm5=data0
    psraw       mm7, PASS1_BITS         ; mm7=data4

    movq        MMWORD [MMBLOCK(0,0,edx,SIZEOF_DCTELEM)], mm5
    movq        MMWORD [MMBLOCK(4,0,edx,SIZEOF_DCTELEM)], mm7

    ; (Original)
    ; z1 = (tmp12 + tmp13) * 0.541196100;
    ; data2 = z1 + tmp13 * 0.765366865;
    ; data6 = z1 + tmp12 * -1.847759065;
    ;
    ; (This implementation)
    ; data2 = tmp13 * (0.541196100 + 0.765366865) + tmp12 * 0.541196100;
    ; data6 = tmp13 * 0.541196100 + tmp12 * (0.541196100 - 1.847759065);

    ; Interleave tmp13/tmp12 so that each pmaddwd produces one output dword.
    movq        mm4, mm1                ; mm1=tmp13
    movq        mm0, mm1
    punpcklwd   mm4, mm6                ; mm6=tmp12
    punpckhwd   mm0, mm6
    movq        mm1, mm4
    movq        mm6, mm0
    pmaddwd     mm4, [GOTOFF(ebx,PW_F130_F054)]   ; mm4=data2L
    pmaddwd     mm0, [GOTOFF(ebx,PW_F130_F054)]   ; mm0=data2H
    pmaddwd     mm1, [GOTOFF(ebx,PW_F054_MF130)]  ; mm1=data6L
    pmaddwd     mm6, [GOTOFF(ebx,PW_F054_MF130)]  ; mm6=data6H

    ; Round, drop (CONST_BITS + PASS1_BITS) fraction bits, pack to words.
    paddd       mm4, [GOTOFF(ebx,PD_DESCALE_P2)]
    paddd       mm0, [GOTOFF(ebx,PD_DESCALE_P2)]
    psrad       mm4, DESCALE_P2
    psrad       mm0, DESCALE_P2
    paddd       mm1, [GOTOFF(ebx,PD_DESCALE_P2)]
    paddd       mm6, [GOTOFF(ebx,PD_DESCALE_P2)]
    psrad       mm1, DESCALE_P2
    psrad       mm6, DESCALE_P2

    packssdw    mm4, mm0                ; mm4=data2
    packssdw    mm1, mm6                ; mm1=data6

    movq        MMWORD [MMBLOCK(2,0,edx,SIZEOF_DCTELEM)], mm4
    movq        MMWORD [MMBLOCK(6,0,edx,SIZEOF_DCTELEM)], mm1

    ; -- Odd part

    movq        mm5, MMWORD [wk(0)]     ; mm5=tmp6
    movq        mm7, MMWORD [wk(1)]     ; mm7=tmp7

    movq        mm0, mm2                ; mm2=tmp4
    movq        mm6, mm3                ; mm3=tmp5
    paddw       mm0, mm5                ; mm0=z3
    paddw       mm6, mm7                ; mm6=z4

    ; (Original)
    ; z5 = (z3 + z4) * 1.175875602;
    ; z3 = z3 * -1.961570560;  z4 = z4 * -0.390180644;
    ; z3 += z5;  z4 += z5;
    ;
    ; (This implementation)
    ; z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602;
    ; z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644);

    movq        mm4, mm0
    movq        mm1, mm0
    punpcklwd   mm4, mm6
    punpckhwd   mm1, mm6
    movq        mm0, mm4
    movq        mm6, mm1
    pmaddwd     mm4, [GOTOFF(ebx,PW_MF078_F117)]  ; mm4=z3L
    pmaddwd     mm1, [GOTOFF(ebx,PW_MF078_F117)]  ; mm1=z3H
    pmaddwd     mm0, [GOTOFF(ebx,PW_F117_F078)]   ; mm0=z4L
    pmaddwd     mm6, [GOTOFF(ebx,PW_F117_F078)]   ; mm6=z4H

    ; z3 goes to wk[]; z4 stays in mm0/mm6 until data5 has been formed.
    movq        MMWORD [wk(0)], mm4     ; wk(0)=z3L
    movq        MMWORD [wk(1)], mm1     ; wk(1)=z3H

    ; (Original)
    ; z1 = tmp4 + tmp7;  z2 = tmp5 + tmp6;
    ; tmp4 = tmp4 * 0.298631336;  tmp5 = tmp5 * 2.053119869;
    ; tmp6 = tmp6 * 3.072711026;  tmp7 = tmp7 * 1.501321110;
    ; z1 = z1 * -0.899976223;  z2 = z2 * -2.562915447;
    ; data7 = tmp4 + z1 + z3;  data5 = tmp5 + z2 + z4;
    ; data3 = tmp6 + z2 + z3;  data1 = tmp7 + z1 + z4;
    ;
    ; (This implementation)
    ; tmp4 = tmp4 * (0.298631336 - 0.899976223) + tmp7 * -0.899976223;
    ; tmp5 = tmp5 * (2.053119869 - 2.562915447) + tmp6 * -2.562915447;
    ; tmp6 = tmp5 * -2.562915447 + tmp6 * (3.072711026 - 2.562915447);
    ; tmp7 = tmp4 * -0.899976223 + tmp7 * (1.501321110 - 0.899976223);
    ; data7 = tmp4 + z3;  data5 = tmp5 + z4;
    ; data3 = tmp6 + z3;  data1 = tmp7 + z4;

    movq        mm4, mm2
    movq        mm1, mm2
    punpcklwd   mm4, mm7
    punpckhwd   mm1, mm7
    movq        mm2, mm4
    movq        mm7, mm1
    pmaddwd     mm4, [GOTOFF(ebx,PW_MF060_MF089)]  ; mm4=tmp4L
    pmaddwd     mm1, [GOTOFF(ebx,PW_MF060_MF089)]  ; mm1=tmp4H
    pmaddwd     mm2, [GOTOFF(ebx,PW_MF089_F060)]   ; mm2=tmp7L
    pmaddwd     mm7, [GOTOFF(ebx,PW_MF089_F060)]   ; mm7=tmp7H

    paddd       mm4, MMWORD [wk(0)]     ; mm4=data7L
    paddd       mm1, MMWORD [wk(1)]     ; mm1=data7H
    paddd       mm2, mm0                ; mm2=data1L
    paddd       mm7, mm6                ; mm7=data1H

    paddd       mm4, [GOTOFF(ebx,PD_DESCALE_P2)]
    paddd       mm1, [GOTOFF(ebx,PD_DESCALE_P2)]
    psrad       mm4, DESCALE_P2
    psrad       mm1, DESCALE_P2
    paddd       mm2, [GOTOFF(ebx,PD_DESCALE_P2)]
    paddd       mm7, [GOTOFF(ebx,PD_DESCALE_P2)]
    psrad       mm2, DESCALE_P2
    psrad       mm7, DESCALE_P2

    packssdw    mm4, mm1                ; mm4=data7
    packssdw    mm2, mm7                ; mm2=data1

    movq        MMWORD [MMBLOCK(7,0,edx,SIZEOF_DCTELEM)], mm4
    movq        MMWORD [MMBLOCK(1,0,edx,SIZEOF_DCTELEM)], mm2

    movq        mm1, mm3
    movq        mm7, mm3
    punpcklwd   mm1, mm5
    punpckhwd   mm7, mm5
    movq        mm3, mm1
    movq        mm5, mm7
    pmaddwd     mm1, [GOTOFF(ebx,PW_MF050_MF256)]  ; mm1=tmp5L
    pmaddwd     mm7, [GOTOFF(ebx,PW_MF050_MF256)]  ; mm7=tmp5H
    pmaddwd     mm3, [GOTOFF(ebx,PW_MF256_F050)]   ; mm3=tmp6L
    pmaddwd     mm5, [GOTOFF(ebx,PW_MF256_F050)]   ; mm5=tmp6H

    paddd       mm1, mm0                ; mm1=data5L
    paddd       mm7, mm6                ; mm7=data5H
    paddd       mm3, MMWORD [wk(0)]     ; mm3=data3L
    paddd       mm5, MMWORD [wk(1)]     ; mm5=data3H

    paddd       mm1, [GOTOFF(ebx,PD_DESCALE_P2)]
    paddd       mm7, [GOTOFF(ebx,PD_DESCALE_P2)]
    psrad       mm1, DESCALE_P2
    psrad       mm7, DESCALE_P2
    paddd       mm3, [GOTOFF(ebx,PD_DESCALE_P2)]
    paddd       mm5, [GOTOFF(ebx,PD_DESCALE_P2)]
    psrad       mm3, DESCALE_P2
    psrad       mm5, DESCALE_P2

    packssdw    mm1, mm7                ; mm1=data5
    packssdw    mm3, mm5                ; mm3=data3

    movq        MMWORD [MMBLOCK(5,0,edx,SIZEOF_DCTELEM)], mm1
    movq        MMWORD [MMBLOCK(3,0,edx,SIZEOF_DCTELEM)], mm3

    ; Advance to the next group of four columns.
    add         edx, byte 4*SIZEOF_DCTELEM
    dec         ecx
    jnz         near .columnloop

    emms                                ; empty MMX state

;   pop         edi                     ; unused
;   pop         esi                     ; unused
;   pop         edx                     ; need not be preserved
;   pop         ecx                     ; need not be preserved
    poppic      ebx
    mov         esp, ebp                ; esp <- aligned ebp
    pop         esp                     ; esp <- frame base stored in the prologue
    pop         ebp
    ret

; For some reason, the OS X linker does not honor the request to align the
; segment unless we do this.
    align       32