/*
 * (C) Copyright IBM Corporation 2004
 * All Rights Reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * on the rights to use, copy, modify, merge, publish, distribute, sub
 * license, and/or sell copies of the Software, and to permit persons to whom
 * the Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.  IN NO EVENT SHALL
 * IBM AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
 * USE OR OTHER DEALINGS IN THE SOFTWARE.
 */

/**
 * \file read_rgba_span_x86.S
 * Optimized routines to transfer pixel data from the framebuffer to a
 * buffer in main memory.
 *
 * Syntax: GNU as, AT&T operand order, 32-bit x86, run through the C
 * preprocessor.  All routines read their arguments from the stack.
 *
 * \author Ian Romanick <idr@us.ibm.com>
 */
/* Control flow enforcement support */
#ifdef HAVE_CET_H
#include <cet.h>
#else
#define _CET_ENDBR
#endif

        .file   "read_rgba_span_x86.S"
#if !defined(__MINGW32__) && !defined(__APPLE__) /* this one cries for assyntax.h */
/* Kevin F. Quinn 2nd July 2006
 * Replaced data segment constants with text-segment instructions.
 */

/* LOAD_MASK(mvins, m1, m2)
 *
 * Builds the two byte-select masks on the stack and loads them with the
 * move instruction \c mvins:
 *   m1 <- 0xff00ff00 repeated (bytes 1 and 3 of every 32-bit pixel)
 *   m2 <- 0x00ff0000 repeated (byte 2 of every 32-bit pixel)
 * Four dwords are pushed per mask so that a 16-byte load (movdqu) is as
 * valid as an 8-byte one (movq).  The 32 bytes pushed are released again
 * before the macro ends, so %esp is unchanged on exit.  Flags are
 * clobbered by the final addl.
 */
#define LOAD_MASK(mvins,m1,m2) \
        pushl   $0xff00ff00 ;\
        pushl   $0xff00ff00 ;\
        pushl   $0xff00ff00 ;\
        pushl   $0xff00ff00 ;\
        mvins   (%esp), m1 ;\
        pushl   $0x00ff0000 ;\
        pushl   $0x00ff0000 ;\
        pushl   $0x00ff0000 ;\
        pushl   $0x00ff0000 ;\
        mvins   (%esp), m2 ;\
        addl    $32, %esp

/* I implemented these as macros because they appear in several places,
 * and I've tweaked them a number of times.  I got tired of changing every
 * place they appear. :)
 */

/* DO_ONE_PIXEL()
 *
 * Converts one 32-bit pixel with integer instructions only.
 * In:      %ebx = source pointer, %ecx = destination pointer
 * Out:     %ebx and %ecx are both advanced by 4
 * Clobbers: %eax, flags
 */
#define DO_ONE_PIXEL() \
        movl    (%ebx), %eax ; \
        addl    $4, %ebx ; \
        bswap   %eax          /* ARGB -> BGRA */ ; \
        rorl    $8, %eax      /* BGRA -> ABGR */ ; \
        movl    %eax, (%ecx)  /* ABGR -> R, G, B, A */ ; \
        addl    $4, %ecx

/* DO_ONE_LAST_PIXEL()
 *
 * Same conversion as DO_ONE_PIXEL(), but for the final pixel of a span:
 * neither pointer is advanced.
 * Clobbers: %eax, flags
 */
#define DO_ONE_LAST_PIXEL() \
        movl    (%ebx), %eax ; \
        bswap   %eax          /* ARGB -> BGRA */ ; \
        rorl    $8, %eax      /* BGRA -> ABGR */ ; \
        movl    %eax, (%ecx)  /* ABGR -> R, G, B, A */ ;


/**
 * MMX optimized version of the BGRA8888_REV to RGBA copy routine.
 *
 * \warning
 * This function assumes that the caller will issue the EMMS instruction
 * at the correct places.
*/

/* Stack arguments as read below (offsets are after the %ebx push):
 *    8(%esp)  source pointer
 *   12(%esp)  destination pointer
 *   16(%esp)  number of pixels to copy (signed; <= 0 copies nothing)
 *
 * Register roles:
 *   %ebx  source cursor (saved on entry, restored on exit)
 *   %ecx  destination cursor
 *   %edx  pixels still to convert after the alignment pixel
 *   %eax  scratch / pair-loop counter
 *   %mm1  mask 0xff00ff00 (bytes that stay in place)
 *   %mm2  mask 0x00ff0000 (byte lane that is swapped with byte 0)
 *   %mm0, %mm3, %mm4  per-iteration scratch
 */

.globl _generic_read_RGBA_span_BGRA8888_REV_MMX
.hidden _generic_read_RGBA_span_BGRA8888_REV_MMX
        .type   _generic_read_RGBA_span_BGRA8888_REV_MMX, @function
_generic_read_RGBA_span_BGRA8888_REV_MMX:
        _CET_ENDBR
        pushl   %ebx

#ifdef USE_INNER_EMMS
        emms
#endif
        LOAD_MASK(movq,%mm1,%mm2)

        movl    8(%esp), %ebx           /* source pointer */
        movl    16(%esp), %edx          /* number of pixels to copy */
        movl    12(%esp), %ecx          /* destination pointer */

        testl   %edx, %edx
        jle     .Lmmx_done              /* Bail if there's nothing to do. */

        /* %eax = 1 when the source is 4 bytes short of an 8-byte boundary;
         * in that case convert one pixel first so the movq loads below
         * are 8-byte aligned.
         */
        movl    %ebx, %eax

        negl    %eax
        sarl    $2, %eax
        andl    $1, %eax
        je      .Lmmx_aligned

        subl    %eax, %edx
        DO_ONE_PIXEL()
.Lmmx_aligned:

        /* Would it be faster to unroll this loop once and process 4 pixels
         * per pass, instead of just two?
         */

        movl    %edx, %eax
        shrl    %eax                    /* %eax = number of pixel pairs; ZF feeds the jne below */
        jmp     .Lmmx_pair_test
.Lmmx_pair_loop:
        movq    (%ebx), %mm0
        addl    $8, %ebx

        /* These 9 instructions do what PSHUFB (if there were such an
         * instruction) could do in 1. :(
         *
         * Net effect: bytes 0 and 2 of each 32-bit pixel are exchanged,
         * bytes 1 and 3 are kept.
         */

        movq    %mm0, %mm3
        movq    %mm0, %mm4

        pand    %mm2, %mm3              /* isolate byte 2 ...            */
        psllq   $16, %mm4               /* move byte 0 up to byte 2 ...  */
        psrlq   $16, %mm3               /* ... and drop it to byte 0     */
        pand    %mm2, %mm4              /* ... and discard everything else */

        pand    %mm1, %mm0              /* keep bytes 1 and 3 */
        por     %mm4, %mm3
        por     %mm3, %mm0

        movq    %mm0, (%ecx)
        addl    $8, %ecx
        subl    $1, %eax
.Lmmx_pair_test:
        jne     .Lmmx_pair_loop

        /* At this point there are either 1 or 0 pixels remaining to be
         * converted.  Convert the last pixel, if needed.
         */

        testl   $1, %edx
        je      .Lmmx_done

        DO_ONE_LAST_PIXEL()

.Lmmx_done:
        /* Every path, including the early bail-out, reaches here after
         * LOAD_MASK has written MMX registers, so the optional closing
         * EMMS lives at the common exit.
         */
#ifdef USE_INNER_EMMS
        emms
#endif
        popl    %ebx
        ret
        .size   _generic_read_RGBA_span_BGRA8888_REV_MMX, .-_generic_read_RGBA_span_BGRA8888_REV_MMX


/**
 * SSE optimized version of the BGRA8888_REV to RGBA copy routine.  SSE
 * instructions are only actually used to read data from the framebuffer.
 * In practice, the speed-up is pretty small.
 *
 * \todo
 * Do some more testing and determine if there's any reason to have this
 * function in addition to the MMX version.
 *
 * \warning
 * This function assumes that the caller will issue the EMMS instruction
 * at the correct places.
*/

/* Stack arguments as read below (offsets are after the three pushes):
 *   16(%esp)  source pointer
 *   20(%esp)  destination pointer
 *   24(%esp)  number of pixels to copy (signed; <= 0 copies nothing)
 *
 * Register roles:
 *   %ebx  source cursor          (saved/restored)
 *   %ecx  destination cursor
 *   %esi  pixels converted before the 16-byte aligned loop (saved/restored)
 *   %edx  pixels left after those head pixels
 *   %ebp  copy of %esp taken before the scratch slot is carved out
 *         (saved/restored)
 *   %eax  scratch / quad-loop counter
 *   %mm1  mask 0xff00ff00, %mm2 mask 0x00ff0000
 *   (%esp) inside the main loop: 16-byte aligned bounce buffer used to
 *          move data from %xmm0 into MMX registers
 */

.globl _generic_read_RGBA_span_BGRA8888_REV_SSE
.hidden _generic_read_RGBA_span_BGRA8888_REV_SSE
        .type   _generic_read_RGBA_span_BGRA8888_REV_SSE, @function
_generic_read_RGBA_span_BGRA8888_REV_SSE:
        _CET_ENDBR
        pushl   %esi
        pushl   %ebx
        pushl   %ebp

#ifdef USE_INNER_EMMS
        emms
#endif

        LOAD_MASK(movq,%mm1,%mm2)

        movl    16(%esp), %ebx          /* source pointer */
        movl    24(%esp), %edx          /* number of pixels to copy */
        movl    20(%esp), %ecx          /* destination pointer */

        testl   %edx, %edx
        jle     .Lsse_done              /* Bail if there's nothing to do. */

        /* Reserve a 16-byte aligned scratch slot; %ebp remembers the
         * original stack pointer so it can be restored after the loop.
         */
        movl    %esp, %ebp
        subl    $16, %esp
        andl    $0xfffffff0, %esp

        /* %esi = min(pixels needed to reach a 16-byte source boundary,
         *            pixel count); %edx = what is left after those.
         */
        movl    %ebx, %eax
        movl    %edx, %esi

        negl    %eax
        andl    $15, %eax
        sarl    $2, %eax
        cmpl    %edx, %eax
        cmovle  %eax, %esi

        subl    %esi, %edx

        testl   $1, %esi
        je      .Lsse_head_pair

        DO_ONE_PIXEL()
.Lsse_head_pair:

        testl   $2, %esi
        je      .Lsse_aligned

        movq    (%ebx), %mm0
        addl    $8, %ebx

        movq    %mm0, %mm3
        movq    %mm0, %mm4

        pand    %mm2, %mm3
        psllq   $16, %mm4
        psrlq   $16, %mm3
        pand    %mm2, %mm4

        pand    %mm1, %mm0
        por     %mm4, %mm3
        por     %mm3, %mm0

        movq    %mm0, (%ecx)
        addl    $8, %ecx
.Lsse_aligned:

        movl    %edx, %eax
        shrl    $2, %eax                /* %eax = groups of 4 pixels; ZF feeds the jne below */
        jmp     .Lsse_quad_test
.Lsse_quad_loop:
        movaps  (%ebx), %xmm0
        addl    $16, %ebx

        /* This would be so much better if we could just move directly from
         * an SSE register to an MMX register.  Unfortunately, that
         * functionality wasn't introduced until SSE2 with the MOVDQ2Q
         * instruction.
         */

        movaps  %xmm0, (%esp)
        movq    (%esp), %mm0
        movq    8(%esp), %mm5

        /* Two interleaved copies of the MMX byte swap: %mm0/%mm3/%mm4
         * handle the low pixel pair, %mm5/%mm6/%mm7 the high pair.
         */
        movq    %mm0, %mm3
        movq    %mm0, %mm4
        movq    %mm5, %mm6
        movq    %mm5, %mm7

        pand    %mm2, %mm3
        pand    %mm2, %mm6

        psllq   $16, %mm4
        psllq   $16, %mm7

        psrlq   $16, %mm3
        psrlq   $16, %mm6

        pand    %mm2, %mm4
        pand    %mm2, %mm7

        pand    %mm1, %mm0
        pand    %mm1, %mm5

        por     %mm4, %mm3
        por     %mm7, %mm6

        por     %mm3, %mm0
        por     %mm6, %mm5

        movq    %mm0, (%ecx)
        movq    %mm5, 8(%ecx)
        addl    $16, %ecx

        subl    $1, %eax
.Lsse_quad_test:
        jne     .Lsse_quad_loop

        movl    %ebp, %esp              /* release the scratch slot */

        /* At this point there are either [0, 3] pixels remaining to be
         * converted.
         */

        testl   $2, %edx
        je      .Lsse_tail_single

        movq    (%ebx), %mm0
        addl    $8, %ebx

        movq    %mm0, %mm3
        movq    %mm0, %mm4

        pand    %mm2, %mm3
        psllq   $16, %mm4
        psrlq   $16, %mm3
        pand    %mm2, %mm4

        pand    %mm1, %mm0
        por     %mm4, %mm3
        por     %mm3, %mm0

        movq    %mm0, (%ecx)
        addl    $8, %ecx
.Lsse_tail_single:

        testl   $1, %edx
        je      .Lsse_done

        DO_ONE_LAST_PIXEL()
.Lsse_done:
        /* The optional closing EMMS sits at the common exit so that it
         * runs after the 2-pixel MMX tail above and also on the early
         * bail-out, which happens after LOAD_MASK has written MMX
         * registers.
         */
#ifdef USE_INNER_EMMS
        emms
#endif
        popl    %ebp
        popl    %ebx
        popl    %esi
        ret
        .size   _generic_read_RGBA_span_BGRA8888_REV_SSE, .-_generic_read_RGBA_span_BGRA8888_REV_SSE


/**
 * SSE2 optimized version of the BGRA8888_REV to RGBA copy routine.
*/

/* Stack arguments as read below (offsets are after the two pushes):
 *   12(%esp)  source pointer
 *   16(%esp)  destination pointer
 *   20(%esp)  number of pixels to copy (signed; <= 0 copies nothing)
 *
 * Register roles:
 *   %ebx   source cursor          (saved/restored)
 *   %ecx   destination cursor
 *   %esi   pixels converted before the 16-byte aligned loop (saved/restored)
 *   %edx   pixels left after those head pixels
 *   %eax   scratch / quad-loop counter
 *   %xmm1  mask 0xff00ff00 (bytes that stay in place)
 *   %xmm2  mask 0x00ff0000 (byte lane that is swapped with byte 0)
 *   %xmm0, %xmm3, %xmm4  per-iteration scratch
 *
 * Only XMM registers are used, so no EMMS is required here.
 */

        .text
.globl _generic_read_RGBA_span_BGRA8888_REV_SSE2
.hidden _generic_read_RGBA_span_BGRA8888_REV_SSE2
        .type   _generic_read_RGBA_span_BGRA8888_REV_SSE2, @function
_generic_read_RGBA_span_BGRA8888_REV_SSE2:
        _CET_ENDBR
        pushl   %esi
        pushl   %ebx

        LOAD_MASK(movdqu,%xmm1,%xmm2)

        movl    12(%esp), %ebx          /* source pointer */
        movl    20(%esp), %edx          /* number of pixels to copy */
        movl    16(%esp), %ecx          /* destination pointer */

        movl    %ebx, %eax
        movl    %edx, %esi

        testl   %edx, %edx
        jle     .Lsse2_done             /* Bail if there's nothing to do. */

        /* If the source pointer isn't a multiple of 16 we have to process
         * a few pixels the "slow" way to get the address aligned for
         * the SSE fetch instructions.
         */

        negl    %eax
        andl    $15, %eax
        sarl    $2, %eax                /* %eax = pixels up to the next 16-byte boundary */

        cmpl    %edx, %eax
        cmovbe  %eax, %esi              /* %esi = min(head pixels, pixel count) */
        subl    %esi, %edx

        testl   $1, %esi
        je      .Lsse2_head_pair

        DO_ONE_PIXEL()
.Lsse2_head_pair:
        testl   $2, %esi
        je      .Lsse2_aligned

        movq    (%ebx), %xmm0
        addl    $8, %ebx

        /* Byte swap: bytes 1 and 3 of each pixel are kept, bytes 0 and 2
         * trade places via the two 2-byte register shifts.
         */
        movdqa  %xmm0, %xmm3
        movdqa  %xmm0, %xmm4
        andps   %xmm1, %xmm0

        andps   %xmm2, %xmm3
        pslldq  $2, %xmm4
        psrldq  $2, %xmm3
        andps   %xmm2, %xmm4

        orps    %xmm4, %xmm3
        orps    %xmm3, %xmm0

        movq    %xmm0, (%ecx)
        addl    $8, %ecx
.Lsse2_aligned:

        /* Would it be worth having a specialized version of this loop for
         * the case where the destination is 16-byte aligned?  That version
         * would be identical except that it could use movdqa instead of
         * movdqu.
         */

        movl    %edx, %eax
        shrl    $2, %eax                /* %eax = groups of 4 pixels; ZF feeds the jne below */
        jmp     .Lsse2_quad_test
.Lsse2_quad_loop:
        movdqa  (%ebx), %xmm0
        addl    $16, %ebx

        movdqa  %xmm0, %xmm3
        movdqa  %xmm0, %xmm4
        andps   %xmm1, %xmm0

        andps   %xmm2, %xmm3
        pslldq  $2, %xmm4
        psrldq  $2, %xmm3
        andps   %xmm2, %xmm4

        orps    %xmm4, %xmm3
        orps    %xmm3, %xmm0

        movdqu  %xmm0, (%ecx)
        addl    $16, %ecx
        subl    $1, %eax
.Lsse2_quad_test:
        jne     .Lsse2_quad_loop


        /* There may be up to 3 pixels remaining to be copied.  Take care
         * of them now.  We do the 2 pixel case first because the data
         * will be aligned.
         */

        testl   $2, %edx
        je      .Lsse2_tail_single

        movq    (%ebx), %xmm0
        addl    $8, %ebx

        movdqa  %xmm0, %xmm3
        movdqa  %xmm0, %xmm4
        andps   %xmm1, %xmm0

        andps   %xmm2, %xmm3
        pslldq  $2, %xmm4
        psrldq  $2, %xmm3
        andps   %xmm2, %xmm4

        orps    %xmm4, %xmm3
        orps    %xmm3, %xmm0

        movq    %xmm0, (%ecx)
        addl    $8, %ecx
.Lsse2_tail_single:

        testl   $1, %edx
        je      .Lsse2_done

        DO_ONE_LAST_PIXEL()
.Lsse2_done:

        popl    %ebx
        popl    %esi
        ret
        .size   _generic_read_RGBA_span_BGRA8888_REV_SSE2, .-_generic_read_RGBA_span_BGRA8888_REV_SSE2



#define MASK_565_L      0x07e0f800
#define MASK_565_H      0x0000001f
/* Setting SCALE_ADJUST to 5 gives a perfect match with the
 * classic C implementation in Mesa.  Setting SCALE_ADJUST
 * to 0 is slightly faster but at a small cost to accuracy.
*/
#define SCALE_ADJUST    5
#if SCALE_ADJUST == 5
#define PRESCALE_L 0x00100001
#define PRESCALE_H 0x00000200
#define SCALE_L    0x40C620E8
#define SCALE_H    0x0000839d
#elif SCALE_ADJUST == 0
#define PRESCALE_L 0x00200001
#define PRESCALE_H 0x00000800
#define SCALE_L    0x01040108
#define SCALE_H    0x00000108
#else
#error SCALE_ADJUST must either be 5 or 0.
#endif
#define ALPHA_L    0x00000000
#define ALPHA_H    0x00ff0000

/**
 * MMX optimized version of the RGB565 to RGBA copy routine.
 *
 * Stack arguments as read below:
 *    4(%esp)  source pointer (16-bit RGB565 pixels)
 *    8(%esp)  destination pointer (32-bit pixels)
 *   12(%esp)  number of pixels to copy (signed; negative copies nothing)
 *
 * Register roles:
 *   %eax  source cursor
 *   %edx  destination cursor
 *   %ecx  quad-loop counter, then the reloaded pixel count for the tail
 *   %mm5  per-word component masks   (MASK_565_H:MASK_565_L)
 *   %mm6  per-word pre-scale factors (PRESCALE_H:PRESCALE_L)
 *   %mm7  per-word scale factors     (SCALE_H:SCALE_L)
 *   %mm3  alpha word 0x00ff in word 3 (ALPHA_H:ALPHA_L)
 *   %mm4  raw source pixels; %mm0, %mm2 per-pixel scratch
 *
 * NOTE(review): pshufw and pmulhuw are not part of the base MMX
 * instruction set; callers presumably check for the extension that
 * provides them -- confirm at the call site.
 */

        .text
        .globl _generic_read_RGBA_span_RGB565_MMX
        .hidden _generic_read_RGBA_span_RGB565_MMX
        .type   _generic_read_RGBA_span_RGB565_MMX, @function

_generic_read_RGBA_span_RGB565_MMX:
        _CET_ENDBR
#ifdef USE_INNER_EMMS
        emms
#endif

        movl    4(%esp), %eax           /* source pointer */
        movl    8(%esp), %edx           /* destination pointer */
        movl    12(%esp), %ecx          /* number of pixels to copy */

        /* Build the four 64-bit constants on the stack (high dword is
         * pushed first), load each one, then release all 32 bytes.
         */
        pushl   $MASK_565_H
        pushl   $MASK_565_L
        movq    (%esp), %mm5
        pushl   $PRESCALE_H
        pushl   $PRESCALE_L
        movq    (%esp), %mm6
        pushl   $SCALE_H
        pushl   $SCALE_L
        movq    (%esp), %mm7
        pushl   $ALPHA_H
        pushl   $ALPHA_L
        movq    (%esp), %mm3
        addl    $32,%esp

        /* %ecx = number of 4-pixel groups.  The sign test uses SF only:
         * OF is architecturally undefined after a shift by more than one
         * bit, so a signed "less than" jump would depend on an undefined
         * flag.  ZF from the same shift feeds the jne at the loop test.
         */
        sarl    $2, %ecx
        js      .Lrgb565_done           /* Bail early if the count is negative. */
        jmp     .Lrgb565_quad_test

.Lrgb565_quad_loop:
        /* Fetch 4 RGB565 pixels into %mm4.  Distribute the first and
         * second pixels into the four words of %mm0 and %mm2.
         */

        movq    (%eax), %mm4
        addl    $8, %eax

        pshufw  $0x00, %mm4, %mm0
        pshufw  $0x55, %mm4, %mm2


        /* Mask the pixels so that each word of each register contains only
         * one color component.
         */

        pand    %mm5, %mm0
        pand    %mm5, %mm2


        /* Adjust the component values so that they are as small as possible,
         * but large enough so that we can multiply them by an unsigned 16-bit
         * number and get a value as large as 0x00ff0000.
         */

        pmullw  %mm6, %mm0
        pmullw  %mm6, %mm2
#if SCALE_ADJUST > 0
        psrlw   $SCALE_ADJUST, %mm0
        psrlw   $SCALE_ADJUST, %mm2
#endif

        /* Scale the input component values to be on the range
         * [0, 0x00ff0000].  This is the real magic of the whole routine.
         */

        pmulhuw %mm7, %mm0
        pmulhuw %mm7, %mm2


        /* Always set the alpha value to 0xff.
         */

        por     %mm3, %mm0
        por     %mm3, %mm2


        /* Pack the 16-bit values to 8-bit values and store the converted
         * pixel data.
         */

        packuswb        %mm2, %mm0
        movq    %mm0, (%edx)
        addl    $8, %edx

        /* Same sequence for the third and fourth pixels of the group. */
        pshufw  $0xaa, %mm4, %mm0
        pshufw  $0xff, %mm4, %mm2

        pand    %mm5, %mm0
        pand    %mm5, %mm2
        pmullw  %mm6, %mm0
        pmullw  %mm6, %mm2
#if SCALE_ADJUST > 0
        psrlw   $SCALE_ADJUST, %mm0
        psrlw   $SCALE_ADJUST, %mm2
#endif
        pmulhuw %mm7, %mm0
        pmulhuw %mm7, %mm2

        por     %mm3, %mm0
        por     %mm3, %mm2

        packuswb        %mm2, %mm0

        movq    %mm0, (%edx)
        addl    $8, %edx

        subl    $1, %ecx
.Lrgb565_quad_test:
        jne     .Lrgb565_quad_loop


        /* At this point there can be at most 3 pixels left to process.  If
         * there is either 2 or 3 left, process 2.
         */

        movl    12(%esp), %ecx          /* reload the original pixel count */
        testl   $0x02, %ecx
        je      .Lrgb565_tail_single

        movd    (%eax), %mm4
        addl    $4, %eax

        pshufw  $0x00, %mm4, %mm0
        pshufw  $0x55, %mm4, %mm2

        pand    %mm5, %mm0
        pand    %mm5, %mm2
        pmullw  %mm6, %mm0
        pmullw  %mm6, %mm2
#if SCALE_ADJUST > 0
        psrlw   $SCALE_ADJUST, %mm0
        psrlw   $SCALE_ADJUST, %mm2
#endif
        pmulhuw %mm7, %mm0
        pmulhuw %mm7, %mm2

        por     %mm3, %mm0
        por     %mm3, %mm2

        packuswb        %mm2, %mm0

        movq    %mm0, (%edx)
        addl    $8, %edx

.Lrgb565_tail_single:
        /* At this point there can be at most 1 pixel left to process.
         * Process it if needed.
         */

        testl   $0x01, %ecx
        je      .Lrgb565_done

        movzwl  (%eax), %ecx            /* %ecx is free now; load the one 16-bit pixel */
        movd    %ecx, %mm4

        pshufw  $0x00, %mm4, %mm0

        pand    %mm5, %mm0
        pmullw  %mm6, %mm0
#if SCALE_ADJUST > 0
        psrlw   $SCALE_ADJUST, %mm0
#endif
        pmulhuw %mm7, %mm0

        por     %mm3, %mm0

        packuswb        %mm0, %mm0

        movd    %mm0, (%edx)

.Lrgb565_done:
#ifdef USE_INNER_EMMS
        emms
#endif
        ret
        .size   _generic_read_RGBA_span_RGB565_MMX, .-_generic_read_RGBA_span_RGB565_MMX
#endif /* !defined(__MINGW32__) && !defined(__APPLE__) */

#if defined (__ELF__) && defined (__linux__)
        .section .note.GNU-stack,"",%progbits
#endif