;******************************************************************************
;* MMX/SSE2-optimized functions for the VP3 decoder
;* Copyright (c) 2007 Aurelien Jacobs <aurel@gnuage.org>
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "libavutil/x86/x86util.asm"

; MMX-optimized functions cribbed from the original VP3 source code.
SECTION_RODATA

; IDCT cosine table: row k-1 holds round(65536 * cos(k*pi/16)) for k = 1..7,
; each replicated across 8 words. C(k) in the IDCT macros addresses row k-1.
; Entries above 32767 (k = 1..5) are read as negative by pmulhw, which is why
; the IDCT code computes "c*i - i" and then adds i back with paddw; the k = 6
; and k = 7 entries fit in a signed word and need no correction.
vp3_idct_data: times 8 dw 64277
               times 8 dw 60547
               times 8 dw 54491
               times 8 dw 46341
               times 8 dw 36410
               times 8 dw 25080
               times 8 dw 12785

; 8-byte constants: only referenced by the 64-bit (MMX) loop-filter code.
pb_7:  times 8 db 0x07
pb_1F: times 8 db 0x1f
pb_81: times 8 db 0x81

cextern pb_1
cextern pb_3
cextern pb_80
cextern pb_FE

cextern pw_8

SECTION .text

; this is off by one or two for some cases when filter_limit is greater than 63
; in:  p0 in mm6, p1 in mm4, p2 in mm2, p3 in mm1
; out: p1 in mm4, p2 in mm3
;
; Also reads 8 bytes at [r2+516] as the filter limit ("flim"), so r2 must
; still hold the caller's third argument when the macro is expanded.
; Clobbers m0-m7.
%macro VP3_LOOP_FILTER 0
    movq          m7, m6
    pand          m6, [pb_7]    ; p0&7
    psrlw         m7, 3
    pand          m7, [pb_1F]   ; p0>>3
    movq          m3, m2        ; p2
    pxor          m2, m4
    pand          m2, [pb_1]    ; (p2^p1)&1
    movq          m5, m2
    paddb         m2, m2
    paddb         m2, m5        ; 3*(p2^p1)&1
    paddb         m2, m6        ; extra bits lost in shifts
    pcmpeqb       m0, m0
    pxor          m1, m0        ; 255 - p3
    pavgb         m1, m2        ; (256 - p3 + extrabits) >> 1
    pxor          m0, m4        ; 255 - p1
    pavgb         m0, m3        ; (256 + p2-p1) >> 1
    paddb         m1, [pb_3]
    pavgb         m1, m0        ; 128+2+(   p2-p1  - p3) >> 2
    pavgb         m1, m0        ; 128+1+(3*(p2-p1) - p3) >> 3
    paddusb       m7, m1        ; d+128+1
    movq          m6, [pb_81]
    psubusb       m6, m7        ; m6 = magnitude of d where d is negative, else 0
    psubusb       m7, [pb_81]   ; m7 = magnitude of d where d is positive, else 0

    movq          m5, [r2+516]  ; flim
    pminub        m6, m5
    pminub        m7, m5
    movq          m0, m6
    movq          m1, m7
    paddb         m6, m6
    paddb         m7, m7
    pminub        m6, m5
    pminub        m7, m5
    psubb         m6, m0        ; min(2*min(x,flim), flim) - min(x,flim)
    psubb         m7, m1
    paddusb       m4, m7        ; p1 += positive part, -= negative part
    psubusb       m4, m6
    psubusb       m3, m7        ; p2 gets the opposite adjustment
    paddusb       m3, m6
%endmacro

; Store the four 16-bit words of %1 to four consecutive rows, one word per
; row, at byte offset -1 from r0 (rows r0, r0+r1, r0+r1*2, r0+r3).
; Expects r1 = stride and r3 = 3*stride. Clobbers r2 and shifts %1.
%macro STORE_4_WORDS 1
    movd         r2d, %1
    mov  [r0     -1], r2w
    psrlq         %1, 32
    shr           r2, 16
    mov  [r0+r1  -1], r2w
    movd         r2d, %1
    mov  [r0+r1*2-1], r2w
    shr           r2, 16
    mov  [r0+r3  -1], r2w
%endmacro

;------------------------------------------------------------------------------
; vp3_v_loop_filter
; in:  r0 = pointer to the first row below the edge, r1 = stride,
;      r2 = table whose 8 bytes at offset 516 hold the filter limit
; Filters 8 pixels across a horizontal edge: reads the rows at r0-2*stride,
; r0-stride, r0 and r0+stride, and rewrites the rows at r0-stride and r0.
;------------------------------------------------------------------------------
INIT_MMX mmxext
cglobal vp3_v_loop_filter, 3, 4
    mov           r3, r1
    neg           r1            ; r1 = -stride, r3 = +stride
    movq          m6, [r0+r1*2]
    movq          m4, [r0+r1  ]
    movq          m2, [r0     ]
    movq          m1, [r0+r3  ]

    VP3_LOOP_FILTER

    movq     [r0+r1], m4
    movq     [r0   ], m3
    RET

;------------------------------------------------------------------------------
; vp3_h_loop_filter
; in:  r0 = pointer to the first column right of the edge, r1 = stride,
;      r2 = table whose 8 bytes at offset 516 hold the filter limit
; Filters 8 rows across a vertical edge: loads 4 bytes starting at column -2
; from each of 8 rows, transposes them into the p0..p3 layout expected by
; VP3_LOOP_FILTER, then writes the two middle columns (-1 and 0) back.
; r2 is consumed by the filter before STORE_4_WORDS reuses it as scratch.
;------------------------------------------------------------------------------
cglobal vp3_h_loop_filter, 3, 4
    lea           r3, [r1*3]

    movd          m6, [r0     -2]
    movd          m4, [r0+r1  -2]
    movd          m2, [r0+r1*2-2]
    movd          m1, [r0+r3  -2]
    lea           r0, [r0+r1*4  ]
    punpcklbw     m6, [r0     -2] ; interleave row n with row n+4
    punpcklbw     m4, [r0+r1  -2]
    punpcklbw     m2, [r0+r1*2-2]
    punpcklbw     m1, [r0+r3  -2]
    sub           r0, r3
    sub           r0, r1          ; r0 back to the first row (-= 4*stride)

    TRANSPOSE4x4B  6, 4, 2, 1, 0
    VP3_LOOP_FILTER
    SBUTTERFLY    bw, 4, 3, 5

    STORE_4_WORDS m4
    lea           r0, [r0+r1*4  ]
    STORE_4_WORDS m3
    RET

; Byte-wise average rounding down, for two register pairs at once:
;   m4 = (m0 & m1) + (((m0 ^ m1) & m6) >> 1)
;   m5 = (m2 & m3) + (((m2 ^ m3) & m6) >> 1)
; m6 must hold pb_FE so the 64-bit shift cannot leak a bit into the byte
; below. Clobbers m1 and m3; m0 and m2 are preserved.
%macro PAVGB_NO_RND 0
    mova          m4, m0
    mova          m5, m2
    pand          m4, m1
    pand          m5, m3
    pxor          m1, m0
    pxor          m3, m2
    pand          m1, m6
    pand          m3, m6
    psrlq         m1, 1
    psrlq         m3, 1
    paddb         m4, m1
    paddb         m5, m3
%endmacro

;------------------------------------------------------------------------------
; put_vp_no_rnd_pixels8_l2(dst, src1, src2, stride, h)
; Writes the round-down average of src1 and src2 to dst, 8 bytes per row.
; The same stride is applied to all three pointers. Four rows are handled per
; iteration and the loop exits only when h reaches exactly 0, so h must be a
; positive multiple of 4.
;------------------------------------------------------------------------------
INIT_MMX mmx
cglobal put_vp_no_rnd_pixels8_l2, 5, 6, 0, dst, src1, src2, stride, h, stride3
    mova          m6, [pb_FE]
    lea     stride3q,[strideq+strideq*2]
.loop:
    mova          m0, [src1q]
    mova          m1, [src2q]
    mova          m2, [src1q+strideq]
    mova          m3, [src2q+strideq]
    PAVGB_NO_RND
    mova       [dstq], m4
    mova [dstq+strideq], m5

    mova          m0, [src1q+strideq*2]
    mova          m1, [src2q+strideq*2]
    mova          m2, [src1q+stride3q]
    mova          m3, [src2q+stride3q]
    PAVGB_NO_RND
    mova [dstq+strideq*2], m4
    mova [dstq+stride3q],  m5

    lea        src1q, [src1q+strideq*4]
    lea        src2q, [src2q+strideq*4]
    lea         dstq, [dstq+strideq*4]
    sub           hd, 4
    jnz .loop
    RET

; from original comments: The Macro does IDct on 4 1-D Dcts
;
; Shared first stage of the 64-bit row/column IDCT. Relies on the I(x), J(x)
; and C(x) accessors being %defined by the expanding macro.
; On exit: m0 = C., m1 = H., m2 = R2, m4 = E, m5 = B.., m6 = F., m7 = G,
; and D. is parked in I(2) (I(1) was used as scratch for C.).
%macro BeginIDCT 0
    movq          m2, I(3)
    movq          m6, C(3)
    movq          m4, m2
    movq          m7, J(5)
    pmulhw        m4, m6        ; r4 = c3*i3 - i3
    movq          m1, C(5)
    pmulhw        m6, m7        ; r6 = c3*i5 - i5
    movq          m5, m1
    pmulhw        m1, m2        ; r1 = c5*i3 - i3
    movq          m3, I(1)
    pmulhw        m5, m7        ; r5 = c5*i5 - i5
    movq          m0, C(1)
    paddw         m4, m2        ; r4 = c3*i3
    paddw         m6, m7        ; r6 = c3*i5
    paddw         m2, m1        ; r2 = c5*i3
    movq          m1, J(7)
    paddw         m7, m5        ; r7 = c5*i5
    movq          m5, m0        ; r5 = c1
    pmulhw        m0, m3        ; r0 = c1*i1 - i1
    paddsw        m4, m7        ; r4 = C = c3*i3 + c5*i5
    pmulhw        m5, m1        ; r5 = c1*i7 - i7
    movq          m7, C(7)
    psubsw        m6, m2        ; r6 = D = c3*i5 - c5*i3
    paddw         m0, m3        ; r0 = c1*i1
    pmulhw        m3, m7        ; r3 = c7*i1
    movq          m2, I(2)
    pmulhw        m7, m1        ; r7 = c7*i7
    paddw         m5, m1        ; r5 = c1*i7
    movq          m1, m2        ; r1 = i2
    pmulhw        m2, C(2)      ; r2 = c2*i2 - i2
    psubsw        m3, m5        ; r3 = B = c7*i1 - c1*i7
    movq          m5, J(6)
    paddsw        m0, m7        ; r0 = A = c1*i1 + c7*i7
    movq          m7, m5        ; r7 = i6
    psubsw        m0, m4        ; r0 = A - C
    pmulhw        m5, C(2)      ; r5 = c2*i6 - i6
    paddw         m2, m1        ; r2 = c2*i2
    pmulhw        m1, C(6)      ; r1 = c6*i2
    paddsw        m4, m4        ; r4 = C + C
    paddsw        m4, m0        ; r4 = C. = A + C
    psubsw        m3, m6        ; r3 = B - D
    paddw         m5, m7        ; r5 = c2*i6
    paddsw        m6, m6        ; r6 = D + D
    pmulhw        m7, C(6)      ; r7 = c6*i6
    paddsw        m6, m3        ; r6 = D. = B + D
    movq        I(1), m4        ; save C. at I(1)
    psubsw        m1, m5        ; r1 = H = c6*i2 - c2*i6
    movq          m4, C(4)
    movq          m5, m3        ; r5 = B - D
    pmulhw        m3, m4        ; r3 = (c4 - 1) * (B - D)
    paddsw        m7, m2        ; r7 = G = c6*i6 + c2*i2
    movq        I(2), m6        ; save D. at I(2)
    movq          m2, m0        ; r2 = A - C
    movq          m6, I(0)
    pmulhw        m0, m4        ; r0 = (c4 - 1) * (A - C)
    paddw         m5, m3        ; r5 = B. = c4 * (B - D)
    movq          m3, J(4)
    psubsw        m5, m1        ; r5 = B.. = B. - H
    paddw         m2, m0        ; r2 = A. = c4 * (A - C)
    psubsw        m6, m3        ; r6 = i0 - i4
    movq          m0, m6
    pmulhw        m6, m4        ; r6 = (c4 - 1) * (i0 - i4)
    paddsw        m3, m3        ; r3 = i4 + i4
    paddsw        m1, m1        ; r1 = H + H
    paddsw        m3, m0        ; r3 = i0 + i4
    paddsw        m1, m5        ; r1 = H. = B. + H
    pmulhw        m4, m3        ; r4 = (c4 - 1) * (i0 + i4)
    paddsw        m6, m0        ; r6 = F = c4 * (i0 - i4)
    psubsw        m6, m2        ; r6 = F. = F - A.
    paddsw        m2, m2        ; r2 = A. + A.
    movq          m0, I(1)      ; r0 = C.
    paddsw        m2, m6        ; r2 = A.. = F + A.
    paddw         m4, m3        ; r4 = E = c4 * (i0 + i4)
    psubsw        m2, m1        ; r2 = R2 = A.. - H.
%endmacro

; RowIDCT gets ready to transpose
; On exit R0, R2..R7 are in m0, m2..m7 and R1 is stored at I(1), which is the
; layout the Transpose macro expects. No rounding or shift is applied here.
%macro RowIDCT 0
    BeginIDCT
    movq          m3, I(2)      ; r3 = D.
    psubsw        m4, m7        ; r4 = E. = E - G
    paddsw        m1, m1        ; r1 = H. + H.
    paddsw        m7, m7        ; r7 = G + G
    paddsw        m1, m2        ; r1 = R1 = A.. + H.
    paddsw        m7, m4        ; r7 = G. = E + G
    psubsw        m4, m3        ; r4 = R4 = E. - D.
    paddsw        m3, m3
    psubsw        m6, m5        ; r6 = R6 = F. - B..
    paddsw        m5, m5
    paddsw        m3, m4        ; r3 = R3 = E. + D.
    paddsw        m5, m6        ; r5 = R5 = F. + B..
    psubsw        m7, m0        ; r7 = R7 = G. - C.
    paddsw        m0, m0
    movq        I(1), m1        ; save R1
    paddsw        m0, m7        ; r0 = R0 = G. + C.
%endmacro

; Column IDCT normalizes and stores final results
; Each output has OC_8 added and is arithmetically shifted right by 4 before
; being written back through I(0..3) / J(4..7).
%macro ColumnIDCT 0
    BeginIDCT
    paddsw        m2, OC_8      ; adjust R2 (and R1) for shift
    paddsw        m1, m1        ; r1 = H. + H.
    paddsw        m1, m2        ; r1 = R1 = A.. + H.
    psraw         m2, 4         ; r2 = NR2
    psubsw        m4, m7        ; r4 = E. = E - G
    psraw         m1, 4         ; r1 = NR1
    movq          m3, I(2)      ; r3 = D.
    paddsw        m7, m7        ; r7 = G + G
    movq        I(2), m2        ; store NR2 at I2
    paddsw        m7, m4        ; r7 = G. = E + G
    movq        I(1), m1        ; store NR1 at I1
    psubsw        m4, m3        ; r4 = R4 = E. - D.
    paddsw        m4, OC_8      ; adjust R4 (and R3) for shift
    paddsw        m3, m3        ; r3 = D. + D.
    paddsw        m3, m4        ; r3 = R3 = E. + D.
    psraw         m4, 4         ; r4 = NR4
    psubsw        m6, m5        ; r6 = R6 = F. - B..
    psraw         m3, 4         ; r3 = NR3
    paddsw        m6, OC_8      ; adjust R6 (and R5) for shift
    paddsw        m5, m5        ; r5 = B.. + B..
    paddsw        m5, m6        ; r5 = R5 = F. + B..
    psraw         m6, 4         ; r6 = NR6
    movq        J(4), m4        ; store NR4 at J4
    psraw         m5, 4         ; r5 = NR5
    movq        I(3), m3        ; store NR3 at I3
    psubsw        m7, m0        ; r7 = R7 = G. - C.
    paddsw        m7, OC_8      ; adjust R7 (and R0) for shift
    paddsw        m0, m0        ; r0 = C. + C.
    paddsw        m0, m7        ; r0 = R0 = G. + C.
    psraw         m7, 4         ; r7 = NR7
    movq        J(6), m6        ; store NR6 at J6
    psraw         m0, 4         ; r0 = NR0
    movq        J(5), m5        ; store NR5 at J5
    movq        J(7), m7        ; store NR7 at J7
    movq        I(0), m0        ; store NR0 at I0
%endmacro

; Following macro does two 4x4 transposes in place.
;
; At entry (we assume):
;
;   r0 = a3 a2 a1 a0
;   I(1) = b3 b2 b1 b0
;   r2 = c3 c2 c1 c0
;   r3 = d3 d2 d1 d0
;
;   r4 = e3 e2 e1 e0
;   r5 = f3 f2 f1 f0
;   r6 = g3 g2 g1 g0
;   r7 = h3 h2 h1 h0
;
; At exit, we have:
;
;   I(0) = d0 c0 b0 a0
;   I(1) = d1 c1 b1 a1
;   I(2) = d2 c2 b2 a2
;   I(3) = d3 c3 b3 a3
;
;   J(4) = h0 g0 f0 e0
;   J(5) = h1 g1 f1 e1
;   J(6) = h2 g2 f2 e2
;   J(7) = h3 g3 f3 e3
;
;  I(0) I(1) I(2) I(3)  is the transpose of r0 I(1) r2 r3.
;  J(4) J(5) J(6) J(7)  is the transpose of r4 r5 r6 r7.
;
;  Since r1 is free at entry, we calculate the Js first.
%macro Transpose 0
    movq          m1, m4        ; r1 = e3 e2 e1 e0
    punpcklwd     m4, m5        ; r4 = f1 e1 f0 e0
    movq        I(0), m0        ; save a3 a2 a1 a0
    punpckhwd     m1, m5        ; r1 = f3 e3 f2 e2
    movq          m0, m6        ; r0 = g3 g2 g1 g0
    punpcklwd     m6, m7        ; r6 = h1 g1 h0 g0
    movq          m5, m4        ; r5 = f1 e1 f0 e0
    punpckldq     m4, m6        ; r4 = h0 g0 f0 e0 = R4
    punpckhdq     m5, m6        ; r5 = h1 g1 f1 e1 = R5
    movq          m6, m1        ; r6 = f3 e3 f2 e2
    movq        J(4), m4
    punpckhwd     m0, m7        ; r0 = h3 g3 h2 g2
    movq        J(5), m5
    punpckhdq     m6, m0        ; r6 = h3 g3 f3 e3 = R7
    movq          m4, I(0)      ; r4 = a3 a2 a1 a0
    punpckldq     m1, m0        ; r1 = h2 g2 f2 e2 = R6
    movq          m5, I(1)      ; r5 = b3 b2 b1 b0
    movq          m0, m4        ; r0 = a3 a2 a1 a0
    movq        J(7), m6
    punpcklwd     m0, m5        ; r0 = b1 a1 b0 a0
    movq        J(6), m1
    punpckhwd     m4, m5        ; r4 = b3 a3 b2 a2
    movq          m5, m2        ; r5 = c3 c2 c1 c0
    punpcklwd     m2, m3        ; r2 = d1 c1 d0 c0
    movq          m1, m0        ; r1 = b1 a1 b0 a0
    punpckldq     m0, m2        ; r0 = d0 c0 b0 a0 = R0
    punpckhdq     m1, m2        ; r1 = d1 c1 b1 a1 = R1
    movq          m2, m4        ; r2 = b3 a3 b2 a2
    movq        I(0), m0
    punpckhwd     m5, m3        ; r5 = d3 c3 d2 c2
    movq        I(1), m1
    punpckhdq     m4, m5        ; r4 = d3 c3 b3 a3 = R3
    punpckldq     m2, m5        ; r2 = d2 c2 b2 a2 = R2
    movq        I(3), m4
    movq        I(2), m2
%endmacro

; One 1-D IDCT pass over eight 8-word vectors using 128-bit registers.
; Relies on I(x), C(x), ADD(x) and SHIFT(x) being %defined by the caller:
; ADD/SHIFT are empty for the first pass and add [pw_8] / shift right by 4
; for the second (see VP3_IDCT). I(1) and I(2) are overwritten as scratch.
; On exit m0..m7 hold outputs 0..7.
%macro VP3_1D_IDCT_SSE2 0
    movdqa        m2, I(3)      ; xmm2 = i3
    movdqa        m6, C(3)      ; xmm6 = c3
    movdqa        m4, m2        ; xmm4 = i3
    movdqa        m7, I(5)      ; xmm7 = i5
    pmulhw        m4, m6        ; xmm4 = c3 * i3 - i3
    movdqa        m1, C(5)      ; xmm1 = c5
    pmulhw        m6, m7        ; xmm6 = c3 * i5 - i5
    movdqa        m5, m1        ; xmm5 = c5
    pmulhw        m1, m2        ; xmm1 = c5 * i3 - i3
    movdqa        m3, I(1)      ; xmm3 = i1
    pmulhw        m5, m7        ; xmm5 = c5 * i5 - i5
    movdqa        m0, C(1)      ; xmm0 = c1
    paddw         m4, m2        ; xmm4 = c3 * i3
    paddw         m6, m7        ; xmm6 = c3 * i5
    paddw         m2, m1        ; xmm2 = c5 * i3
    movdqa        m1, I(7)      ; xmm1 = i7
    paddw         m7, m5        ; xmm7 = c5 * i5
    movdqa        m5, m0        ; xmm5 = c1
    pmulhw        m0, m3        ; xmm0 = c1 * i1 - i1
    paddsw        m4, m7        ; xmm4 = c3 * i3 + c5 * i5 = C
    pmulhw        m5, m1        ; xmm5 = c1 * i7 - i7
    movdqa        m7, C(7)      ; xmm7 = c7
    psubsw        m6, m2        ; xmm6 = c3 * i5 - c5 * i3 = D
    paddw         m0, m3        ; xmm0 = c1 * i1
    pmulhw        m3, m7        ; xmm3 = c7 * i1
    movdqa        m2, I(2)      ; xmm2 = i2
    pmulhw        m7, m1        ; xmm7 = c7 * i7
    paddw         m5, m1        ; xmm5 = c1 * i7
    movdqa        m1, m2        ; xmm1 = i2
    pmulhw        m2, C(2)      ; xmm2 = i2 * c2 -i2
    psubsw        m3, m5        ; xmm3 = c7 * i1 - c1 * i7 = B
    movdqa        m5, I(6)      ; xmm5 = i6
    paddsw        m0, m7        ; xmm0 = c1 * i1 + c7 * i7 = A
    movdqa        m7, m5        ; xmm7 = i6
    psubsw        m0, m4        ; xmm0 = A - C
    pmulhw        m5, C(2)      ; xmm5 = c2 * i6 - i6
    paddw         m2, m1        ; xmm2 = i2 * c2
    pmulhw        m1, C(6)      ; xmm1 = c6 * i2
    paddsw        m4, m4        ; xmm4 = C + C
    paddsw        m4, m0        ; xmm4 = A + C = C.
    psubsw        m3, m6        ; xmm3 = B - D
    paddw         m5, m7        ; xmm5 = c2 * i6
    paddsw        m6, m6        ; xmm6 = D + D
    pmulhw        m7, C(6)      ; xmm7 = c6 * i6
    paddsw        m6, m3        ; xmm6 = B + D = D.
    movdqa      I(1), m4        ; Save C. at I(1)
    psubsw        m1, m5        ; xmm1 = c6 * i2 - c2 * i6 = H
    movdqa        m4, C(4)      ; xmm4 = C4
    movdqa        m5, m3        ; xmm5 = B - D
    pmulhw        m3, m4        ; xmm3 = ( c4 -1 ) * ( B - D )
    paddsw        m7, m2        ; xmm7 = c2 * i2 + c6 * i6 = G
    movdqa      I(2), m6        ; save D. at I(2)
    movdqa        m2, m0        ; xmm2 = A - C
    movdqa        m6, I(0)      ; xmm6 = i0
    pmulhw        m0, m4        ; xmm0 = ( c4 - 1 ) * ( A - C )
    paddw         m5, m3        ; xmm5 = c4 * ( B - D ) = B.
    movdqa        m3, I(4)      ; xmm3 = i4
    psubsw        m5, m1        ; xmm5 = B. - H = B..
    paddw         m2, m0        ; xmm2 = c4 * ( A - C) = A.
    psubsw        m6, m3        ; xmm6 = i0 - i4
    movdqa        m0, m6        ; xmm0 = i0 - i4
    pmulhw        m6, m4        ; xmm6 = (c4 - 1) * (i0 - i4)
    paddsw        m3, m3        ; xmm3 = i4 + i4
    paddsw        m1, m1        ; xmm1 = H + H
    paddsw        m3, m0        ; xmm3 = i0 + i4
    paddsw        m1, m5        ; xmm1 = B. + H = H.
    pmulhw        m4, m3        ; xmm4 = ( c4 - 1 ) * ( i0 + i4 )
    paddw         m6, m0        ; xmm6 = c4 * ( i0 - i4 ) = F
    psubsw        m6, m2        ; xmm6 = F - A. = F.
    paddsw        m2, m2        ; xmm2 = A. + A.
    movdqa        m0, I(1)      ; Load C. from I(1)
    paddsw        m2, m6        ; xmm2 = F + A. = A..
    paddw         m4, m3        ; xmm4 = c4 * ( i0 + i4 ) = E
    psubsw        m2, m1        ; xmm2 = A.. - H. = R2
    ADD(m2)                     ; Adjust R2 and R1 before shifting
    paddsw        m1, m1        ; xmm1 = H. + H.
    paddsw        m1, m2        ; xmm1 = A.. + H. = R1
    SHIFT(m2)                   ; xmm2 = op2
    psubsw        m4, m7        ; xmm4 = E - G = E.
    SHIFT(m1)                   ; xmm1 = op1
    movdqa        m3, I(2)      ; Load D. from I(2)
    paddsw        m7, m7        ; xmm7 = G + G
    paddsw        m7, m4        ; xmm7 = E + G = G.
    psubsw        m4, m3        ; xmm4 = E. - D. = R4
    ADD(m4)                     ; Adjust R4 and R3 before shifting
    paddsw        m3, m3        ; xmm3 = D. + D.
    paddsw        m3, m4        ; xmm3 = E. + D. = R3
    SHIFT(m4)                   ; xmm4 = op4
    psubsw        m6, m5        ; xmm6 = F. - B..= R6
    SHIFT(m3)                   ; xmm3 = op3
    ADD(m6)                     ; Adjust R6 and R5 before shifting
    paddsw        m5, m5        ; xmm5 = B.. + B..
    paddsw        m5, m6        ; xmm5 = F. + B.. = R5
    SHIFT(m6)                   ; xmm6 = op6
    SHIFT(m5)                   ; xmm5 = op5
    psubsw        m7, m0        ; xmm7 = G. - C. = R7
    ADD(m7)                     ; Adjust R7 and R0 before shifting
    paddsw        m0, m0        ; xmm0 = C. + C.
    paddsw        m0, m7        ; xmm0 = G. + C.
    SHIFT(m7)                   ; xmm7 = op7
    SHIFT(m0)                   ; xmm0 = op0
%endmacro

; Store eight registers (given by number) to output rows O(0)..O(7).
%macro PUT_BLOCK 8
    movdqa      O(0), m%1
    movdqa      O(1), m%2
    movdqa      O(2), m%3
    movdqa      O(3), m%4
    movdqa      O(4), m%5
    movdqa      O(5), m%6
    movdqa      O(6), m%7
    movdqa      O(7), m%8
%endmacro

; Full 2-D 8x8 IDCT, in place, on the block of 64 words addressed by %1.
; The block is accessed with aligned 16-byte moves in the 128-bit path, so it
; must be 16-byte aligned there.
;
; mmsize == 16: 1-D pass, 8x8 word transpose, store, then a second 1-D pass
;               with +8 rounding and >>4, store.
; mmsize == 8:  two row passes (left/right 4 columns) each followed by a pair
;               of 4x4 transposes, then two column passes with rounding.
;               NOTE(review): nothing visible in this file expands
;               vp3_idct_funcs with 64-bit registers, so this branch and the
;               BeginIDCT/RowIDCT/ColumnIDCT/Transpose macros appear unused
;               here - confirm before removing.
%macro VP3_IDCT 1
%if mmsize == 16
%define I(x) [%1+16*x]
%define O(x) [%1+16*x]
%define C(x) [vp3_idct_data+16*(x-1)]
%define SHIFT(x)
%define ADD(x)
        VP3_1D_IDCT_SSE2
%if ARCH_X86_64
        TRANSPOSE8x8W 0, 1, 2, 3, 4, 5, 6, 7, 8
%else
        ; no ninth register on x86-32: spill through the block itself
        TRANSPOSE8x8W 0, 1, 2, 3, 4, 5, 6, 7, [%1], [%1+16]
%endif
        PUT_BLOCK 0, 1, 2, 3, 4, 5, 6, 7

%define SHIFT(x) psraw  x, 4
%define ADD(x)   paddsw x, [pw_8]
        VP3_1D_IDCT_SSE2
        PUT_BLOCK 0, 1, 2, 3, 4, 5, 6, 7
%else ; mmsize == 8
    ; The register notes below are carried over from the original VP3 source
    ; and do not match this macro's operands (the block is %1 and the
    ; constants are addressed absolutely through vp3_idct_data):
    ; eax = quantized input
    ; ebx = dequantizer matrix
    ; ecx = IDCT constants
    ;  M(I) = ecx + MaskOffset(0) + I * 8
    ;  C(I) = ecx + CosineOffset(32) + (I-1) * 8
    ; edx = output
    ; r0..r7 = mm0..mm7
%define OC_8 [pw_8]
%define C(x) [vp3_idct_data+16*(x-1)]

    ; at this point, function has completed dequantization + dezigzag +
    ; partial transposition; now do the idct itself
%define I(x) [%1+16*x]
%define J(x) [%1+16*x]
    RowIDCT
    Transpose

%define I(x) [%1+16*x+8]
%define J(x) [%1+16*x+8]
    RowIDCT
    Transpose

%define I(x) [%1+16* x]
%define J(x) [%1+16*(x-4)+8]
    ColumnIDCT

%define I(x) [%1+16* x   +64]
%define J(x) [%1+16*(x-4)+72]
    ColumnIDCT
%endif ; mmsize == 16/8
%endmacro

; Emits vp3_idct_put and vp3_idct_add for the current INIT_* register size.
; Both take r0 = destination pixels, r1 = destination stride, r2 = block of
; 64 coefficients (128 bytes). Both run the IDCT in place on the block and
; zero the whole block before returning.
%macro vp3_idct_funcs 0
;------------------------------------------------------------------------------
; vp3_idct_put: IDCT, pack to bytes with signed saturation, add [pb_80] to
; every byte, and store 8 rows of 8 bytes at r0.
; Nine vector registers are requested because the x86-64 transpose uses m8.
;------------------------------------------------------------------------------
cglobal vp3_idct_put, 3, 4, 9
    VP3_IDCT      r2

    mova          m4, [pb_80]
    lea           r3, [r1*3]
%assign %%i 0
%rep 16/mmsize
    mova          m0, [r2+mmsize*0+%%i]
    mova          m1, [r2+mmsize*2+%%i]
    mova          m2, [r2+mmsize*4+%%i]
    mova          m3, [r2+mmsize*6+%%i]
    packsswb      m0, [r2+mmsize*1+%%i]
    packsswb      m1, [r2+mmsize*3+%%i]
    packsswb      m2, [r2+mmsize*5+%%i]
    packsswb      m3, [r2+mmsize*7+%%i]
    paddb         m0, m4
    paddb         m1, m4
    paddb         m2, m4
    paddb         m3, m4
    movq   [r0     ], m0
    movhps [r0+r1  ], m0
    movq   [r0+r1*2], m1
    movhps [r0+r3  ], m1
%if %%i == 0
    lea           r0, [r0+r1*4]
%endif
    movq   [r0     ], m2
    movhps [r0+r1  ], m2
    movq   [r0+r1*2], m3
    movhps [r0+r3  ], m3
%assign %%i %%i+8
%endrep

    ; clear the coefficient block
    pxor          m0, m0
%assign %%offset 0
%rep 128/mmsize
    mova [r2+%%offset], m0
%assign %%offset %%offset+mmsize
%endrep
    RET

;------------------------------------------------------------------------------
; vp3_idct_add: IDCT, then add the result to the 8x8 bytes already at r0
; (widened to words), pack back with unsigned saturation and store.
; Processes 4 rows per iteration, two iterations.
;------------------------------------------------------------------------------
cglobal vp3_idct_add, 3, 4, 9
    VP3_IDCT      r2

    lea           r3, [r1*3]
    pxor          m4, m4        ; zero: used for byte->word unpack and block clear
%assign %%i 0
%rep 2
    movq          m0, [r0]
    movq          m1, [r0+r1]
    movq          m2, [r0+r1*2]
    movq          m3, [r0+r3]
    punpcklbw     m0, m4
    punpcklbw     m1, m4
    punpcklbw     m2, m4
    punpcklbw     m3, m4
    paddsw        m0, [r2+ 0+%%i]
    paddsw        m1, [r2+16+%%i]
    paddsw        m2, [r2+32+%%i]
    paddsw        m3, [r2+48+%%i]
    packuswb      m0, m1
    packuswb      m2, m3
    movq   [r0     ], m0
    movhps [r0+r1  ], m0
    movq   [r0+r1*2], m2
    movhps [r0+r3  ], m2
%if %%i == 0
    lea           r0, [r0+r1*4]
%endif
%assign %%i %%i+64
%endrep
    ; clear the coefficient block
%assign %%i 0
%rep 128/mmsize
    mova    [r2+%%i], m4
%assign %%i %%i+mmsize
%endrep
    RET
%endmacro

INIT_XMM sse2
vp3_idct_funcs

; Apply a signed DC offset to 4 rows of 8 pixels at r0 using unsigned
; saturating arithmetic: add m0, then subtract m1.
; Expects r1 = stride, r2 = 3*stride. Clobbers m2-m5.
%macro DC_ADD 0
    movq          m2, [r0     ]
    movq          m3, [r0+r1  ]
    paddusb       m2, m0
    movq          m4, [r0+r1*2]
    paddusb       m3, m0
    movq          m5, [r0+r2  ]
    paddusb       m4, m0
    paddusb       m5, m0
    psubusb       m2, m1
    psubusb       m3, m1
    movq   [r0     ], m2
    psubusb       m4, m1
    movq   [r0+r1  ], m3
    psubusb       m5, m1
    movq   [r0+r1*2], m4
    movq   [r0+r2  ], m5
%endmacro

;------------------------------------------------------------------------------
; vp3_idct_dc_add
; in:  r0 = destination pixels, r1 = stride, r2 = coefficient block
; Reads the first coefficient, clears it in the block, computes
; dc = (coeff + 15) >> 5 (arithmetic shift) and adds dc to all 64 pixels of
; the 8x8 block with saturation to 0..255.
;------------------------------------------------------------------------------
INIT_MMX mmxext
cglobal vp3_idct_dc_add, 3, 4
    movsx         r3, word [r2]
    mov    word [r2], 0
    lea           r2, [r1*3]    ; r2 is free now: reuse it as 3*stride
    add           r3, 15
    sar           r3, 5
    movd          m0, r3d
    pshufw        m0, m0, 0x0   ; broadcast dc to all four words
    pxor          m1, m1
    psubw         m1, m0        ; m1 = -dc
    packuswb      m0, m0        ; m0 = dc clamped to 0..255 (0 if dc < 0)
    packuswb      m1, m1        ; m1 = -dc clamped to 0..255 (0 if dc > 0)
    DC_ADD
    lea           r0, [r0+r1*4]
    DC_ADD
    RET