mesa/x86/read_rgba_span_x86.S

bf215546Sopenharmony_ci/*
bf215546Sopenharmony_ci * (C) Copyright IBM Corporation 2004
bf215546Sopenharmony_ci * All Rights Reserved.
bf215546Sopenharmony_ci *
bf215546Sopenharmony_ci * Permission is hereby granted, free of charge, to any person obtaining a
bf215546Sopenharmony_ci * copy of this software and associated documentation files (the "Software"),
bf215546Sopenharmony_ci * to deal in the Software without restriction, including without limitation
bf215546Sopenharmony_ci * on the rights to use, copy, modify, merge, publish, distribute, sub
bf215546Sopenharmony_ci * license, and/or sell copies of the Software, and to permit persons to whom
bf215546Sopenharmony_ci * the Software is furnished to do so, subject to the following conditions:
bf215546Sopenharmony_ci *
bf215546Sopenharmony_ci * The above copyright notice and this permission notice (including the next
bf215546Sopenharmony_ci * paragraph) shall be included in all copies or substantial portions of the
bf215546Sopenharmony_ci * Software.
bf215546Sopenharmony_ci *
bf215546Sopenharmony_ci * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
bf215546Sopenharmony_ci * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
bf215546Sopenharmony_ci * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.  IN NO EVENT SHALL
bf215546Sopenharmony_ci * IBM AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
bf215546Sopenharmony_ci * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
bf215546Sopenharmony_ci * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
bf215546Sopenharmony_ci * USE OR OTHER DEALINGS IN THE SOFTWARE.
bf215546Sopenharmony_ci */
bf215546Sopenharmony_ci
bf215546Sopenharmony_ci/**
bf215546Sopenharmony_ci * \file read_rgba_span_x86.S
bf215546Sopenharmony_ci * Optimized routines to transfer pixel data from the framebuffer to a
bf215546Sopenharmony_ci * buffer in main memory.
bf215546Sopenharmony_ci *
bf215546Sopenharmony_ci * \author Ian Romanick <idr@us.ibm.com>
bf215546Sopenharmony_ci */
/* Control-flow enforcement (CET) support.
 *
 * Every function entry point in this file starts with _CET_ENDBR.  When
 * the build provides <cet.h> (signalled by HAVE_CET_H) the macro comes
 * from that header; otherwise it is defined empty here so the entry
 * points assemble unchanged.
 */
#ifndef HAVE_CET_H
#define _CET_ENDBR
#else
#include <cet.h>
#endif
bf215546Sopenharmony_ci
bf215546Sopenharmony_ci	.file	"read_rgba_span_x86.S"
bf215546Sopenharmony_ci#if !defined(__MINGW32__) && !defined(__APPLE__) /* this one cries for assyntax.h */
/* Kevin F. Quinn 2nd July 2006
 * Replaced data segment constants with text-segment instructions.
 */

/* PUSH_IMM32_X4(imm): push the 32-bit immediate `imm' four times, leaving
 * a 16-byte constant made of four identical dwords at (%esp).
 */
#define	PUSH_IMM32_X4(imm) \
	pushl	$imm ;\
	pushl	$imm ;\
	pushl	$imm ;\
	pushl	$imm

/* LOAD_MASK(mvins, m1, m2): load the two byte-select masks used by the
 * BGRA8888_REV routines.
 *
 *   mvins  load instruction used to fetch each mask from the stack
 *          (the callers in this file pass movq or movdqu)
 *   m1     register that receives the 0xff00ff00 pattern
 *   m2     register that receives the 0x00ff0000 pattern
 *
 * The masks are built on the stack with pushes, so no data-segment
 * constant is referenced.  32 bytes are pushed in total and released
 * again by the final addl, so %esp is unchanged on exit.  Flags are
 * clobbered by that addl.
 */
#define	LOAD_MASK(mvins,m1,m2) \
	PUSH_IMM32_X4(0xff00ff00) ;\
	mvins	(%esp), m1	;\
	PUSH_IMM32_X4(0x00ff0000) ;\
	mvins	(%esp), m2	;\
	addl	$32, %esp
bf215546Sopenharmony_ci
/* Scalar single-pixel conversion helpers.  They are macros because the
 * same sequence is needed in several routines and has been tuned more
 * than once; keeping one copy avoids editing every call site.
 *
 * Both helpers read one 32-bit pixel from (%ebx), reorder its bytes and
 * write the result to (%ecx).  %eax is clobbered.
 */

/* Convert one pixel and leave the source/destination pointers where they
 * are.  Intended for the final pixel of a span.
 */
#define DO_ONE_LAST_PIXEL() \
	movl	(%ebx), %eax ; \
	bswap	%eax          /* ARGB -> BGRA */ ; \
	rorl	$8, %eax      /* BGRA -> ABGR */ ; \
	movl	%eax, (%ecx)  /* ABGR -> R, G, B, A */

/* Convert one pixel, then step both %ebx and %ecx forward by 4 bytes so
 * the next pixel can be processed.
 */
#define DO_ONE_PIXEL() \
	DO_ONE_LAST_PIXEL() ; \
	addl	$4, %ebx ; \
	addl	$4, %ecx
bf215546Sopenharmony_ci
bf215546Sopenharmony_ci
/**
 * MMX optimized version of the BGRA8888_REV to RGBA copy routine.
 *
 * Arguments are read from the stack (offsets below are relative to %esp
 * after the %ebx push):
 *    8(%esp)  source pointer
 *   12(%esp)  destination pointer
 *   16(%esp)  number of pixels to copy (treated as signed; <= 0 is a no-op)
 *
 * Register use:
 *   %ebx            source cursor (callee-saved, pushed/popped here)
 *   %ecx            destination cursor
 *   %edx            pixels left after the optional alignment pixel
 *   %eax            scratch / two-pixel iteration counter
 *   %mm1            0xff00ff00 in each dword: bytes that stay in place
 *   %mm2            0x00ff0000 in each dword: selects the bytes to swap
 *   %mm0,%mm3,%mm4  scratch
 *
 * NOTE(review): the alignment step only inspects bit 2 of the source
 * address, so the source is presumably expected to be 4-byte aligned.
 *
 * \warning
 * Unless USE_INNER_EMMS is defined, this function assumes that the caller
 * will issue the EMMS instruction at the correct places.  With
 * USE_INNER_EMMS defined, EMMS is issued on entry and on every exit path.
 */

.globl _generic_read_RGBA_span_BGRA8888_REV_MMX
.hidden _generic_read_RGBA_span_BGRA8888_REV_MMX
	.type	_generic_read_RGBA_span_BGRA8888_REV_MMX, @function
_generic_read_RGBA_span_BGRA8888_REV_MMX:
	_CET_ENDBR
	pushl	%ebx

#ifdef USE_INNER_EMMS
	emms
#endif
	LOAD_MASK(movq,%mm1,%mm2)

	movl	8(%esp), %ebx	/* source pointer */
	movl	12(%esp), %ecx	/* destination pointer */
	movl	16(%esp), %edx	/* number of pixels to copy */

	testl	%edx, %edx
	jle	.Lmmx_done	/* Bail if there's nothing to do. */

	/* %eax = 1 if bit 2 of the source address is set, i.e. one pixel
	 * must be converted the scalar way before the 8-byte loads start
	 * on an 8-byte boundary; 0 otherwise.
	 */

	movl	%ebx, %eax
	negl	%eax
	sarl	$2, %eax
	andl	$1, %eax
	je	.Lmmx_aligned

	subl	%eax, %edx
	DO_ONE_PIXEL()
.Lmmx_aligned:

	/* Would it be faster to unroll this loop once and process 4 pixels
	 * per pass, instead of just two?
	 */

	movl	%edx, %eax
	shrl	%eax		/* %eax = number of pixel pairs; sets ZF */
	jmp	.Lmmx_pair_test
.Lmmx_pair_loop:
	movq	(%ebx), %mm0
	addl	$8, %ebx

	/* These 9 instructions do what PSHUFB (if there were such an
	 * instruction) could do in 1. :(
	 *
	 * %mm3 collects byte 2 of each dword moved down to byte 0, %mm4
	 * collects byte 0 moved up to byte 2, and %mm0 keeps bytes 1 and 3.
	 */

	movq	%mm0, %mm3
	movq	%mm0, %mm4

	pand	%mm2, %mm3
	psllq	$16, %mm4
	psrlq	$16, %mm3
	pand	%mm2, %mm4

	pand	%mm1, %mm0
	por	%mm4, %mm3
	por	%mm3, %mm0

	movq	%mm0, (%ecx)
	addl	$8, %ecx
	subl	$1, %eax
.Lmmx_pair_test:
	/* ZF comes from the shrl on entry and from the subl afterwards. */
	jne	.Lmmx_pair_loop

	/* At this point there are either 1 or 0 pixels remaining to be
	 * converted.  Convert the last pixel, if needed.
	 */

	testl	$1, %edx
	je	.Lmmx_done

	DO_ONE_LAST_PIXEL()

.Lmmx_done:
	/* Every path, including the early bail above, reaches this point
	 * after LOAD_MASK has written MMX registers, so the optional EMMS
	 * belongs here rather than in front of the scalar tail.
	 */
#ifdef USE_INNER_EMMS
	emms
#endif
	popl	%ebx
	ret
	.size	_generic_read_RGBA_span_BGRA8888_REV_MMX, .-_generic_read_RGBA_span_BGRA8888_REV_MMX
bf215546Sopenharmony_ci
bf215546Sopenharmony_ci
/* Convert the two pixels at (%ebx) with MMX and store them at (%ecx),
 * advancing both pointers by 8 bytes.  Expects %mm1 = 0xff00ff00 and
 * %mm2 = 0x00ff0000 in each dword; clobbers %mm0, %mm3, %mm4 and flags.
 */
#define SSE_CONVERT_PIXEL_PAIR() \
	movq	(%ebx), %mm0 ; \
	addl	$8, %ebx ; \
	movq	%mm0, %mm3 ; \
	movq	%mm0, %mm4 ; \
	pand	%mm2, %mm3 ; \
	psllq	$16, %mm4 ; \
	psrlq	$16, %mm3 ; \
	pand	%mm2, %mm4 ; \
	pand	%mm1, %mm0 ; \
	por	%mm4, %mm3 ; \
	por	%mm3, %mm0 ; \
	movq	%mm0, (%ecx) ; \
	addl	$8, %ecx

/**
 * SSE optimized version of the BGRA8888_REV to RGBA copy routine.  SSE
 * instructions are only actually used to read data from the framebuffer.
 * In practice, the speed-up is pretty small.
 *
 * Arguments are read from the stack (offsets below are relative to %esp
 * after the three register pushes):
 *   16(%esp)  source pointer
 *   20(%esp)  destination pointer
 *   24(%esp)  number of pixels to copy (treated as signed; <= 0 is a no-op)
 *
 * Register use:
 *   %ebx       source cursor (callee-saved)
 *   %ecx       destination cursor
 *   %edx       pixels left after the alignment prologue
 *   %esi       pixels handled by the alignment prologue (callee-saved)
 *   %ebp       copy of %esp while the aligned scratch slot is live
 *   %eax       scratch / four-pixel iteration counter
 *   %mm1,%mm2  byte-select masks loaded by LOAD_MASK
 *   %mm0,%mm3-%mm7, %xmm0  scratch
 *
 * NOTE(review): the prologue counts whole pixels up to the next 16-byte
 * boundary, so the source is presumably expected to be 4-byte aligned;
 * otherwise the movaps load would not be aligned.
 *
 * \todo
 * Do some more testing and determine if there's any reason to have this
 * function in addition to the MMX version.
 *
 * \warning
 * Unless USE_INNER_EMMS is defined, this function assumes that the caller
 * will issue the EMMS instruction at the correct places.  With
 * USE_INNER_EMMS defined, EMMS is issued on entry and on every exit path.
 */

.globl _generic_read_RGBA_span_BGRA8888_REV_SSE
.hidden _generic_read_RGBA_span_BGRA8888_REV_SSE
	.type	_generic_read_RGBA_span_BGRA8888_REV_SSE, @function
_generic_read_RGBA_span_BGRA8888_REV_SSE:
	_CET_ENDBR
	pushl	%esi
	pushl	%ebx
	pushl	%ebp

#ifdef USE_INNER_EMMS
	emms
#endif

	LOAD_MASK(movq,%mm1,%mm2)

	movl	16(%esp), %ebx	/* source pointer */
	movl	20(%esp), %ecx	/* destination pointer */
	movl	24(%esp), %edx	/* number of pixels to copy */

	testl	%edx, %edx
	jle	.Lsse_done	/* Bail if there's nothing to do. */

	/* Reserve a 16-byte aligned scratch slot on the stack.  It is the
	 * bounce buffer between %xmm0 and the MMX registers in the main
	 * loop.  %ebp remembers the original %esp.
	 */

	movl	%esp, %ebp
	subl	$16, %esp
	andl	$0xfffffff0, %esp

	/* %esi = min(pixels up to the next 16-byte source boundary, count).
	 * %esi starts out as the count and is only replaced when the
	 * alignment distance in %eax is not larger.
	 */

	movl	%ebx, %eax
	movl	%edx, %esi

	negl	%eax
	andl	$15, %eax
	sarl	$2, %eax
	cmpl	%edx, %eax
	cmovle	%eax, %esi

	subl	%esi, %edx

	testl	$1, %esi
	je	.Lsse_head_pair

	DO_ONE_PIXEL()
.Lsse_head_pair:

	testl	$2, %esi
	je	.Lsse_aligned

	SSE_CONVERT_PIXEL_PAIR()
.Lsse_aligned:

	movl	%edx, %eax
	shrl	$2, %eax	/* %eax = number of 4-pixel groups; sets ZF */
	jmp	.Lsse_quad_test
.Lsse_quad_loop:
	movaps	(%ebx), %xmm0
	addl	$16, %ebx

	/* This would be so much better if we could just move directly from
	 * an SSE register to an MMX register.  Unfortunately, that
	 * functionality wasn't introduced until SSE2 with the MOVDQ2Q
	 * instruction.
	 */

	movaps	%xmm0, (%esp)
	movq	(%esp), %mm0
	movq	8(%esp), %mm5

	/* Two independent copies of the two-pixel conversion, interleaved:
	 * %mm0/%mm3/%mm4 handle the low half, %mm5/%mm6/%mm7 the high half.
	 */

	movq	%mm0, %mm3
	movq	%mm0, %mm4
	movq	%mm5, %mm6
	movq	%mm5, %mm7

	pand	%mm2, %mm3
	pand	%mm2, %mm6

	psllq	$16, %mm4
	psllq	$16, %mm7

	psrlq	$16, %mm3
	psrlq	$16, %mm6

	pand	%mm2, %mm4
	pand	%mm2, %mm7

	pand	%mm1, %mm0
	pand	%mm1, %mm5

	por	%mm4, %mm3
	por	%mm7, %mm6

	por	%mm3, %mm0
	por	%mm6, %mm5

	movq	%mm0, (%ecx)
	movq	%mm5, 8(%ecx)
	addl	$16, %ecx

	subl	$1, %eax
.Lsse_quad_test:
	/* ZF comes from the shrl on entry and from the subl afterwards. */
	jne	.Lsse_quad_loop

	movl	%ebp, %esp	/* release the scratch slot */

	/* At this point there are between 0 and 3 pixels remaining to be
	 * converted.
	 */

	testl	$2, %edx
	je	.Lsse_tail_single

	SSE_CONVERT_PIXEL_PAIR()
.Lsse_tail_single:

	testl	$1, %edx
	je	.Lsse_done

	DO_ONE_LAST_PIXEL()
.Lsse_done:
	/* The two-pixel tail above still uses MMX registers, and the early
	 * bail is taken after LOAD_MASK has written them, so the optional
	 * EMMS is issued here where every path passes through it.
	 */
#ifdef USE_INNER_EMMS
	emms
#endif
	popl	%ebp
	popl	%ebx
	popl	%esi
	ret
	.size	_generic_read_RGBA_span_BGRA8888_REV_SSE, .-_generic_read_RGBA_span_BGRA8888_REV_SSE
bf215546Sopenharmony_ci
bf215546Sopenharmony_ci
/* Swap bytes 0 and 2 of every dword held in %xmm0, keeping bytes 1 and 3.
 * Expects %xmm1 = 0xff00ff00 and %xmm2 = 0x00ff0000 in each dword;
 * clobbers %xmm3 and %xmm4.  The byte shifts move data by 16 bits.
 */
#define SSE2_SWAP_BYTES_0_2_XMM0() \
	movdqa	%xmm0, %xmm3 ; \
	movdqa	%xmm0, %xmm4 ; \
	andps	%xmm1, %xmm0 ; \
	andps	%xmm2, %xmm3 ; \
	pslldq	$2, %xmm4 ; \
	psrldq	$2, %xmm3 ; \
	andps	%xmm2, %xmm4 ; \
	orps	%xmm4, %xmm3 ; \
	orps	%xmm3, %xmm0

/**
 * SSE2 optimized version of the BGRA8888_REV to RGBA copy routine.
 *
 * Arguments are read from the stack (offsets below are relative to %esp
 * after the two register pushes):
 *   12(%esp)  source pointer
 *   16(%esp)  destination pointer
 *   20(%esp)  number of pixels to copy (treated as signed; <= 0 is a no-op)
 *
 * Register use:
 *   %ebx   source cursor (callee-saved)
 *   %ecx   destination cursor
 *   %edx   pixels left after the alignment prologue
 *   %esi   pixels handled by the alignment prologue (callee-saved)
 *   %eax   scratch / four-pixel iteration counter
 *   %xmm1, %xmm2        byte-select masks loaded by LOAD_MASK
 *   %xmm0, %xmm3, %xmm4 scratch
 */

	.text
.globl _generic_read_RGBA_span_BGRA8888_REV_SSE2
.hidden _generic_read_RGBA_span_BGRA8888_REV_SSE2
	.type	_generic_read_RGBA_span_BGRA8888_REV_SSE2, @function
_generic_read_RGBA_span_BGRA8888_REV_SSE2:
	_CET_ENDBR
	pushl	%esi
	pushl	%ebx

	LOAD_MASK(movdqu,%xmm1,%xmm2)

	movl	12(%esp), %ebx	/* source pointer */
	movl	16(%esp), %ecx	/* destination pointer */
	movl	20(%esp), %edx	/* number of pixels to copy */

	movl	%edx, %esi
	movl	%ebx, %eax

	testl	%edx, %edx
	jle	.Lsse2_done	/* Bail if there's nothing to do. */

	/* If the source pointer isn't a multiple of 16 we have to process
	 * a few pixels the "slow" way to get the address aligned for
	 * the SSE fetch instructions.
	 *
	 * %eax = pixels up to the next 16-byte boundary; %esi becomes the
	 * smaller of that and the pixel count.
	 */

	negl	%eax
	andl	$15, %eax
	sarl	$2, %eax

	cmpl	%edx, %eax
	cmovbe	%eax, %esi
	subl	%esi, %edx

	testl	$1, %esi
	je	.Lsse2_head_pair

	DO_ONE_PIXEL()
.Lsse2_head_pair:
	testl	$2, %esi
	je	.Lsse2_aligned

	movq	(%ebx), %xmm0
	addl	$8, %ebx

	SSE2_SWAP_BYTES_0_2_XMM0()

	movq	%xmm0, (%ecx)
	addl	$8, %ecx
.Lsse2_aligned:

	/* Would it be worth having a specialized version of this loop for
	 * the case where the destination is 16-byte aligned?  That version
	 * would be identical except that it could use movdqa instead of
	 * movdqu.
	 */

	movl	%edx, %eax
	shrl	$2, %eax	/* %eax = number of 4-pixel groups; sets ZF */
	jmp	.Lsse2_quad_test
.Lsse2_quad_loop:
	movdqa	(%ebx), %xmm0
	addl	$16, %ebx

	SSE2_SWAP_BYTES_0_2_XMM0()

	movdqu	%xmm0, (%ecx)
	addl	$16, %ecx
	subl	$1, %eax
.Lsse2_quad_test:
	/* ZF comes from the shrl on entry and from the subl afterwards. */
	jne	.Lsse2_quad_loop


	/* There may be up to 3 pixels remaining to be copied.  Take care
	 * of them now.  We do the 2 pixel case first because the data
	 * will be aligned.
	 */

	testl	$2, %edx
	je	.Lsse2_tail_single

	movq	(%ebx), %xmm0
	addl	$8, %ebx

	SSE2_SWAP_BYTES_0_2_XMM0()

	movq	%xmm0, (%ecx)
	addl	$8, %ecx
.Lsse2_tail_single:

	testl	$1, %edx
	je	.Lsse2_done

	DO_ONE_LAST_PIXEL()
.Lsse2_done:

	popl	%ebx
	popl	%esi
	ret
	.size	_generic_read_RGBA_span_BGRA8888_REV_SSE2, .-_generic_read_RGBA_span_BGRA8888_REV_SSE2
bf215546Sopenharmony_ci
bf215546Sopenharmony_ci
bf215546Sopenharmony_ci
/* Constants for the RGB565 to RGBA conversion.
 *
 * Each pair describes one 64-bit value made of four 16-bit words: the _L
 * half holds words 0 and 1, the _H half holds words 2 and 3.  The routine
 * that uses them pushes _H and then _L so the pair can be fetched with a
 * single movq.
 */

/* Per-word bit masks: 0xf800, 0x07e0, 0x001f, 0x0000. */
#define MASK_565_L	0x07e0f800
#define MASK_565_H	0x0000001f

/* Setting SCALE_ADJUST to 5 gives a perfect match with the
 * classic C implementation in Mesa.  Setting SCALE_ADJUST
 * to 0 is slightly faster but at a small cost to accuracy.
 *
 * PRESCALE_* are the per-word multipliers applied right after masking;
 * SCALE_* are the per-word multipliers of the following high-half
 * multiply.  The two sets must be chosen together.
 */
#define SCALE_ADJUST	5

#if SCALE_ADJUST == 0
#define PRESCALE_L 0x00200001
#define PRESCALE_H 0x00000800
#define SCALE_L 0x01040108
#define SCALE_H 0x00000108
#elif SCALE_ADJUST == 5
#define PRESCALE_L 0x00100001
#define PRESCALE_H 0x00000200
#define SCALE_L 0x40C620E8
#define SCALE_H 0x0000839d
#else
#error SCALE_ADJUST must either be 5 or 0.
#endif

/* Only word 3 is non-zero (0x00ff); it is OR-ed into each converted pixel. */
#define ALPHA_L 0x00000000
#define ALPHA_H 0x00ff0000
bf215546Sopenharmony_ci
/**
 * MMX optimized version of the RGB565 to RGBA copy routine.
 *
 * Arguments are read from the stack (offsets relative to %esp on entry):
 *    4(%esp)  source pointer (16-bit pixels)
 *    8(%esp)  destination pointer (4 bytes written per pixel)
 *   12(%esp)  number of pixels to copy (treated as signed; negative
 *             counts return without touching the destination)
 *
 * Register use (no callee-saved register is touched):
 *   %eax  source cursor
 *   %edx  destination cursor
 *   %ecx  four-pixel iteration counter, later the reloaded pixel count
 *   %mm5  per-word component masks      (MASK_565_H:MASK_565_L)
 *   %mm6  per-word pre-scale factors    (PRESCALE_H:PRESCALE_L)
 *   %mm7  per-word scale factors        (SCALE_H:SCALE_L)
 *   %mm3  alpha word                    (ALPHA_H:ALPHA_L)
 *   %mm4  raw source pixels; %mm0, %mm2 scratch
 *
 * When USE_INNER_EMMS is defined, EMMS is issued on entry and on every
 * exit path.
 */

	.text
	.globl	_generic_read_RGBA_span_RGB565_MMX
	.hidden	_generic_read_RGBA_span_RGB565_MMX
	.type	_generic_read_RGBA_span_RGB565_MMX, @function

_generic_read_RGBA_span_RGB565_MMX:
	_CET_ENDBR
#ifdef USE_INNER_EMMS
	emms
#endif

	movl	4(%esp), %eax	/* source pointer */
	movl	8(%esp), %edx	/* destination pointer */
	movl	12(%esp), %ecx	/* number of pixels to copy */

	/* Build the four 64-bit constants on the stack (high dword pushed
	 * first) and load each one as soon as it is complete.  The 32
	 * bytes are released in one go afterwards.
	 */

	pushl	$MASK_565_H
	pushl	$MASK_565_L
	movq	(%esp), %mm5
	pushl	$PRESCALE_H
	pushl	$PRESCALE_L
	movq	(%esp), %mm6
	pushl	$SCALE_H
	pushl	$SCALE_L
	movq	(%esp), %mm7
	pushl	$ALPHA_H
	pushl	$ALPHA_L
	movq	(%esp), %mm3
	addl	$32,%esp

	/* %ecx = count / 4.  The sign test uses JS, not JL: JL also reads
	 * OF, which is undefined after a shift by more than one bit.  SF
	 * and ZF are defined, and ZF is still live at the loop test below
	 * because neither jump modifies flags.
	 */

	sarl	$2, %ecx
	js	.L565_done	/* Bail early if the count is negative. */
	jmp	.L565_quad_test

.L565_quad_loop:
	/* Fetch 4 RGB565 pixels into %mm4.  Distribute the first and
	 * second pixels into the four words of %mm0 and %mm2.
	 */

	movq	(%eax), %mm4
	addl	$8, %eax

	pshufw	$0x00, %mm4, %mm0
	pshufw	$0x55, %mm4, %mm2


	/* Mask the pixels so that each word of each register contains only
	 * one color component.
	 */

	pand	%mm5, %mm0
	pand	%mm5, %mm2


	/* Adjust the component values so that they are as small as possible,
	 * but large enough so that we can multiply them by an unsigned 16-bit
	 * number and get a value as large as 0x00ff0000.
	 */

	pmullw	%mm6, %mm0
	pmullw	%mm6, %mm2
#if SCALE_ADJUST > 0
	psrlw	$SCALE_ADJUST, %mm0
	psrlw	$SCALE_ADJUST, %mm2
#endif

	/* Scale the input component values to be on the range
	 * [0, 0x00ff0000].  This is the real magic of the whole routine.
	 */

	pmulhuw	%mm7, %mm0
	pmulhuw	%mm7, %mm2


	/* Always set the alpha value to 0xff.
	 */

	por	%mm3, %mm0
	por	%mm3, %mm2


	/* Pack the 16-bit values to 8-bit values and store the converted
	 * pixel data.
	 */

	packuswb	%mm2, %mm0
	movq	%mm0, (%edx)
	addl	$8, %edx

	/* Same sequence for the third and fourth pixels of the group. */

	pshufw	$0xaa, %mm4, %mm0
	pshufw	$0xff, %mm4, %mm2

	pand	%mm5, %mm0
	pand	%mm5, %mm2
	pmullw	%mm6, %mm0
	pmullw	%mm6, %mm2
#if SCALE_ADJUST > 0
	psrlw	$SCALE_ADJUST, %mm0
	psrlw	$SCALE_ADJUST, %mm2
#endif
	pmulhuw	%mm7, %mm0
	pmulhuw	%mm7, %mm2

	por	%mm3, %mm0
	por	%mm3, %mm2

	packuswb	%mm2, %mm0

	movq	%mm0, (%edx)
	addl	$8, %edx

	subl	$1, %ecx
.L565_quad_test:
	/* ZF comes from the sarl on entry and from the subl afterwards. */
	jne	.L565_quad_loop


	/* At this point there can be at most 3 pixels left to process.  If
	 * there are either 2 or 3 left, process 2.  The original count is
	 * reloaded from the stack because %ecx was used as loop counter.
	 */

	movl	12(%esp), %ecx
	testl	$0x02, %ecx
	je	.L565_tail_single

	movd	(%eax), %mm4
	addl	$4, %eax

	pshufw	$0x00, %mm4, %mm0
	pshufw	$0x55, %mm4, %mm2

	pand	%mm5, %mm0
	pand	%mm5, %mm2
	pmullw	%mm6, %mm0
	pmullw	%mm6, %mm2
#if SCALE_ADJUST > 0
	psrlw	$SCALE_ADJUST, %mm0
	psrlw	$SCALE_ADJUST, %mm2
#endif
	pmulhuw	%mm7, %mm0
	pmulhuw	%mm7, %mm2

	por	%mm3, %mm0
	por	%mm3, %mm2

	packuswb	%mm2, %mm0

	movq	%mm0, (%edx)
	addl	$8, %edx

.L565_tail_single:
	/* At this point there can be at most 1 pixel left to process.
	 * Process it if needed.
	 */

	testl	$0x01, %ecx
	je	.L565_done

	/* Load exactly 16 bits so nothing past the last pixel is read. */
	movzwl	(%eax), %ecx
	movd	%ecx, %mm4

	pshufw	$0x00, %mm4, %mm0

	pand	%mm5, %mm0
	pmullw	%mm6, %mm0
#if SCALE_ADJUST > 0
	psrlw	$SCALE_ADJUST, %mm0
#endif
	pmulhuw	%mm7, %mm0

	por	%mm3, %mm0

	packuswb	%mm0, %mm0

	movd	%mm0, (%edx)

.L565_done:
#ifdef USE_INNER_EMMS
	emms
#endif
	ret
	.size	_generic_read_RGBA_span_RGB565_MMX, .-_generic_read_RGBA_span_RGB565_MMX
bf215546Sopenharmony_ci#endif /* !defined(__MINGW32__) && !defined(__APPLE__) */
bf215546Sopenharmony_ci
/* On Linux/ELF targets emit an empty .note.GNU-stack section (no flags),
 * the conventional marker that this object does not need an executable
 * stack.
 */
#if defined (__linux__) && defined (__ELF__)
	.section .note.GNU-stack,"",%progbits
#endif