1bf215546Sopenharmony_ci/*
2bf215546Sopenharmony_ci * (C) Copyright IBM Corporation 2004
3bf215546Sopenharmony_ci * All Rights Reserved.
4bf215546Sopenharmony_ci *
5bf215546Sopenharmony_ci * Permission is hereby granted, free of charge, to any person obtaining a
6bf215546Sopenharmony_ci * copy of this software and associated documentation files (the "Software"),
7bf215546Sopenharmony_ci * to deal in the Software without restriction, including without limitation
8bf215546Sopenharmony_ci * on the rights to use, copy, modify, merge, publish, distribute, sub
9bf215546Sopenharmony_ci * license, and/or sell copies of the Software, and to permit persons to whom
10bf215546Sopenharmony_ci * the Software is furnished to do so, subject to the following conditions:
11bf215546Sopenharmony_ci *
12bf215546Sopenharmony_ci * The above copyright notice and this permission notice (including the next
13bf215546Sopenharmony_ci * paragraph) shall be included in all copies or substantial portions of the
14bf215546Sopenharmony_ci * Software.
15bf215546Sopenharmony_ci *
16bf215546Sopenharmony_ci * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17bf215546Sopenharmony_ci * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18bf215546Sopenharmony_ci * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.  IN NO EVENT SHALL
19bf215546Sopenharmony_ci * IBM AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
20bf215546Sopenharmony_ci * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
21bf215546Sopenharmony_ci * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
22bf215546Sopenharmony_ci * USE OR OTHER DEALINGS IN THE SOFTWARE.
23bf215546Sopenharmony_ci */
24bf215546Sopenharmony_ci
25bf215546Sopenharmony_ci/**
26bf215546Sopenharmony_ci * \file read_rgba_span_x86.S
27bf215546Sopenharmony_ci * Optimized routines to transfer pixel data from the framebuffer to a
28bf215546Sopenharmony_ci * buffer in main memory.
29bf215546Sopenharmony_ci *
30bf215546Sopenharmony_ci * \author Ian Romanick <idr@us.ibm.com>
31bf215546Sopenharmony_ci */
32bf215546Sopenharmony_ci/* Control flow enforcement support */
33bf215546Sopenharmony_ci#ifdef HAVE_CET_H
34bf215546Sopenharmony_ci#include <cet.h>
35bf215546Sopenharmony_ci#else
36bf215546Sopenharmony_ci#define _CET_ENDBR
37bf215546Sopenharmony_ci#endif
38bf215546Sopenharmony_ci
39bf215546Sopenharmony_ci	.file	"read_rgba_span_x86.S"
40bf215546Sopenharmony_ci#if !defined(__MINGW32__) && !defined(__APPLE__) /* this one cries for assyntax.h */
41bf215546Sopenharmony_ci/* Kevin F. Quinn 2nd July 2006
42bf215546Sopenharmony_ci * Replaced data segment constants with text-segment instructions.
43bf215546Sopenharmony_ci */
/*
 * LOAD_MASK(mvins, m1, m2)
 *
 * Materialize the two per-pixel byte masks on the stack and load them with
 * the move instruction given as "mvins":
 *
 *    m1 <- 0xff00ff00 in every dword (selects bytes 1 and 3 of each pixel)
 *    m2 <- 0x00ff0000 in every dword (selects byte 2 of each pixel)
 *
 * Sixteen bytes of each pattern are pushed so the same macro serves both
 * 64-bit (movq) and 128-bit (movdqu) destinations.  The 32 bytes of stack
 * are released again before the macro ends; EFLAGS are overwritten by the
 * final addl.
 */
#define	LOAD_MASK(mvins,m1,m2) \
	pushl	$0xff00ff00 ;\
	pushl	$0xff00ff00 ;\
	pushl	$0xff00ff00 ;\
	pushl	$0xff00ff00 ;\
	pushl	$0x00ff0000 ;\
	pushl	$0x00ff0000 ;\
	pushl	$0x00ff0000 ;\
	pushl	$0x00ff0000 ;\
	mvins	(%esp), m2	;\
	mvins	16(%esp), m1	;\
	addl	$32, %esp
56bf215546Sopenharmony_ci
/* The single-pixel conversions are macros because they are needed in
 * several places below and have been retuned a number of times; keeping
 * one copy avoids having to edit every use.
 */

/* Load the 32-bit pixel at (%ebx), swap its bytes 0 and 2 and store the
 * result at (%ecx).  %eax is used as scratch; neither pointer is changed.
 */
#define SWIZZLE_ONE_PIXEL() \
	movl	(%ebx), %eax ; \
	bswap	%eax          /* ARGB -> BGRA */ ; \
	rorl	$8, %eax      /* BGRA -> ABGR */ ; \
	movl	%eax, (%ecx)  /* ABGR -> R, G, B, A */

/* Convert one pixel, then advance the source (%ebx) and destination
 * (%ecx) pointers by one pixel each.
 */
#define DO_ONE_PIXEL() \
	SWIZZLE_ONE_PIXEL() ; \
	addl	$4, %ebx ; \
	addl	$4, %ecx

/* Convert the final pixel of a span; the pointers are left where they are. */
#define DO_ONE_LAST_PIXEL() \
	SWIZZLE_ONE_PIXEL() ;
75bf215546Sopenharmony_ci
76bf215546Sopenharmony_ci
/**
 * MMX optimized version of the BGRA8888_REV to RGBA copy routine.
 *
 * Arguments, as read from the stack on entry:
 *    4(%esp)  source pointer
 *    8(%esp)  destination pointer
 *   12(%esp)  number of pixels to copy; nothing is copied when it is <= 0
 *
 * %ebx is saved and restored.  %eax, %ecx, %edx and EFLAGS are overwritten,
 * and so are %mm0-%mm4 whenever the pixel count is positive.
 *
 * \warning
 * Unless USE_INNER_EMMS is defined, this function assumes that the caller
 * will issue the EMMS instruction at the correct places.
 */

.globl _generic_read_RGBA_span_BGRA8888_REV_MMX
.hidden _generic_read_RGBA_span_BGRA8888_REV_MMX
	.type	_generic_read_RGBA_span_BGRA8888_REV_MMX, @function
_generic_read_RGBA_span_BGRA8888_REV_MMX:
	_CET_ENDBR
	pushl	%ebx

#ifdef USE_INNER_EMMS
	emms
#endif

	movl	8(%esp), %ebx	/* source pointer */
	movl	16(%esp), %edx	/* number of pixels to copy */
	movl	12(%esp), %ecx	/* destination pointer */

	/* Check the count before any MMX register is written.  That way the
	 * early exit neither pays for the mask setup nor returns with MMX
	 * state live (it would skip the trailing EMMS below).
	 */

	testl	%edx, %edx
	jle	.L20		/* Bail if there's nothing to do. */

	LOAD_MASK(movq,%mm1,%mm2)

	/* When (-source >> 2) is odd, i.e. a 4-byte aligned source that is
	 * not 8-byte aligned, convert one pixel with integer code first so
	 * that the 64-bit loads in the loop start on an 8-byte boundary.
	 */

	movl	%ebx, %eax

	negl	%eax
	sarl	$2, %eax
	andl	$1, %eax
	je	.L17

	subl	%eax, %edx
	DO_ONE_PIXEL()
.L17:

	/* Would it be faster to unroll this loop once and process 4 pixels
	 * per pass, instead of just two?
	 */

	/* %eax = number of pixel pairs.  The loop is entered at its test
	 * (.L18) with ZF as left by the shift, so zero pairs skips it.
	 */

	movl	%edx, %eax
	shrl	%eax
	jmp	.L18
.L19:
	movq	(%ebx), %mm0
	addl	$8, %ebx

	/* These 9 instructions swap bytes 0 and 2 of both pixels in %mm0,
	 * which a single byte-shuffle instruction could do in 1. :(
	 */

	movq	%mm0, %mm3
	movq	%mm0, %mm4

	pand	%mm2, %mm3
	psllq	$16, %mm4
	psrlq	$16, %mm3
	pand	%mm2, %mm4

	pand	%mm1, %mm0
	por	%mm4, %mm3
	por	%mm3, %mm0

	movq	%mm0, (%ecx)
	addl	$8, %ecx
	subl	$1, %eax
.L18:
	jne	.L19

#ifdef USE_INNER_EMMS
	emms
#endif

	/* At this point there are either 1 or 0 pixels remaining to be
	 * converted.  Convert the last pixel, if needed.  Only integer
	 * registers are used from here on.
	 */

	testl	$1, %edx
	je	.L20

	DO_ONE_LAST_PIXEL()

.L20:
	popl	%ebx
	ret
	.size	_generic_read_RGBA_span_BGRA8888_REV_MMX, .-_generic_read_RGBA_span_BGRA8888_REV_MMX
165bf215546Sopenharmony_ci
166bf215546Sopenharmony_ci
/**
 * SSE optimized version of the BGRA8888_REV to RGBA copy routine.  SSE
 * instructions are only actually used to read data from the framebuffer.
 * In practice, the speed-up is pretty small.
 *
 * Arguments, as read from the stack on entry:
 *    4(%esp)  source pointer
 *    8(%esp)  destination pointer
 *   12(%esp)  number of pixels to copy; nothing is copied when it is <= 0
 *
 * %esi, %ebx and %ebp are saved and restored.  %eax, %ecx, %edx and EFLAGS
 * are overwritten, and so are %mm0-%mm7 and %xmm0 whenever the pixel count
 * is positive.  A 16-byte aligned scratch slot is carved out of the stack
 * while the main loop runs.
 *
 * \todo
 * Do some more testing and determine if there's any reason to have this
 * function in addition to the MMX version.
 *
 * \warning
 * Unless USE_INNER_EMMS is defined, this function assumes that the caller
 * will issue the EMMS instruction at the correct places.
 */

.globl _generic_read_RGBA_span_BGRA8888_REV_SSE
.hidden _generic_read_RGBA_span_BGRA8888_REV_SSE
	.type	_generic_read_RGBA_span_BGRA8888_REV_SSE, @function
_generic_read_RGBA_span_BGRA8888_REV_SSE:
	_CET_ENDBR
	pushl	%esi
	pushl	%ebx
	pushl	%ebp

#ifdef USE_INNER_EMMS
	emms
#endif

	movl	16(%esp), %ebx	/* source pointer */
	movl	24(%esp), %edx	/* number of pixels to copy */
	movl	20(%esp), %ecx	/* destination pointer */

	/* Check the count before any MMX register is written, so the early
	 * exit never returns with MMX state live.
	 */

	testl	%edx, %edx
	jle	.L35		/* Bail if there's nothing to do. */

	LOAD_MASK(movq,%mm1,%mm2)

	/* Reserve a 16-byte aligned scratch slot at (%esp); %ebp remembers
	 * the original stack pointer.
	 */

	movl	%esp, %ebp
	subl	$16, %esp
	andl	$0xfffffff0, %esp

	/* %esi = min(pixels needed to reach a 16-byte aligned source,
	 * pixel count); those are converted ahead of the aligned loop.
	 */

	movl	%ebx, %eax
	movl	%edx, %esi

	negl	%eax
	andl	$15, %eax
	sarl	$2, %eax
	cmpl	%edx, %eax
	cmovle	%eax, %esi

	subl	%esi, %edx

	testl	$1, %esi
	je	.L32

	DO_ONE_PIXEL()
.L32:

	testl	$2, %esi
	je	.L31

	movq	(%ebx), %mm0
	addl	$8, %ebx

	movq	%mm0, %mm3
	movq	%mm0, %mm4

	pand	%mm2, %mm3
	psllq	$16, %mm4
	psrlq	$16, %mm3
	pand	%mm2, %mm4

	pand	%mm1, %mm0
	por	%mm4, %mm3
	por	%mm3, %mm0

	movq	%mm0, (%ecx)
	addl	$8, %ecx
.L31:

	/* %eax = number of 4-pixel groups.  The loop is entered at its
	 * test (.L33) with ZF as left by the shift.
	 */

	movl	%edx, %eax
	shrl	$2, %eax
	jmp	.L33
.L34:
	movaps	(%ebx), %xmm0
	addl	$16, %ebx

	/* This would be so much better if we could just move directly from
	 * an SSE register to an MMX register.  Unfortunately, that
	 * functionality wasn't introduced until SSE2 with the MOVDQ2Q
	 * instruction.
	 */

	movaps	%xmm0, (%esp)
	movq	(%esp), %mm0
	movq	8(%esp), %mm5

	movq	%mm0, %mm3
	movq	%mm0, %mm4
	movq	%mm5, %mm6
	movq	%mm5, %mm7

	pand	%mm2, %mm3
	pand	%mm2, %mm6

	psllq	$16, %mm4
	psllq	$16, %mm7

	psrlq	$16, %mm3
	psrlq	$16, %mm6

	pand	%mm2, %mm4
	pand	%mm2, %mm7

	pand	%mm1, %mm0
	pand	%mm1, %mm5

	por	%mm4, %mm3
	por	%mm7, %mm6

	por	%mm3, %mm0
	por	%mm6, %mm5

	movq	%mm0, (%ecx)
	movq	%mm5, 8(%ecx)
	addl	$16, %ecx

	subl	$1, %eax
.L33:
	jne	.L34

	movl	%ebp, %esp

	/* At this point there are between 0 and 3 pixels remaining to be
	 * converted.
	 */

	testl	$2, %edx
	je	.L36

	movq	(%ebx), %mm0
	addl	$8, %ebx

	movq	%mm0, %mm3
	movq	%mm0, %mm4

	pand	%mm2, %mm3
	psllq	$16, %mm4
	psrlq	$16, %mm3
	pand	%mm2, %mm4

	pand	%mm1, %mm0
	por	%mm4, %mm3
	por	%mm3, %mm0

	movq	%mm0, (%ecx)
	addl	$8, %ecx
.L36:

	/* The two-pixel tail above is the last use of the MMX registers,
	 * so the inner EMMS has to come after it, not before.
	 */

#ifdef USE_INNER_EMMS
	emms
#endif

	testl	$1, %edx
	je	.L35

	DO_ONE_LAST_PIXEL()
.L35:
	popl	%ebp
	popl	%ebx
	popl	%esi
	ret
	.size	_generic_read_RGBA_span_BGRA8888_REV_SSE, .-_generic_read_RGBA_span_BGRA8888_REV_SSE
338bf215546Sopenharmony_ci
339bf215546Sopenharmony_ci
/**
 * SSE2 optimized version of the BGRA8888_REV to RGBA copy routine.
 *
 * Arguments, as read from the stack on entry:
 *    4(%esp)  source pointer
 *    8(%esp)  destination pointer
 *   12(%esp)  number of pixels to copy; nothing is copied when it is <= 0
 *
 * %esi and %ebx are saved and restored.  %eax, %ecx, %edx, %xmm0-%xmm4 and
 * EFLAGS are overwritten.  Only XMM registers are used for the vector
 * work, so there is no EMMS handling in this routine.
 */

/* Swap bytes 0 and 2 of every 32-bit pixel held in %xmm0.  Expects the
 * 0xff00ff00 mask in %xmm1 and the 0x00ff0000 mask in %xmm2; uses %xmm3
 * and %xmm4 as scratch.  The byte shifts act on the whole register, but
 * whatever crosses a pixel boundary is discarded by the masks.
 */
#define SSE2_SWIZZLE_XMM0() \
	movdqa	%xmm0, %xmm3 ; \
	movdqa	%xmm0, %xmm4 ; \
	andps	%xmm1, %xmm0 ; \
	andps	%xmm2, %xmm3 ; \
	pslldq	$2, %xmm4 ; \
	psrldq	$2, %xmm3 ; \
	andps	%xmm2, %xmm4 ; \
	orps	%xmm4, %xmm3 ; \
	orps	%xmm3, %xmm0

	.text
.globl _generic_read_RGBA_span_BGRA8888_REV_SSE2
.hidden _generic_read_RGBA_span_BGRA8888_REV_SSE2
	.type	_generic_read_RGBA_span_BGRA8888_REV_SSE2, @function
_generic_read_RGBA_span_BGRA8888_REV_SSE2:
	_CET_ENDBR
	pushl	%esi
	pushl	%ebx

	LOAD_MASK(movdqu,%xmm1,%xmm2)

	movl	12(%esp), %ebx	/* source pointer */
	movl	16(%esp), %ecx	/* destination pointer */
	movl	20(%esp), %edx	/* number of pixels to copy */

	movl	%ebx, %eax
	movl	%edx, %esi

	testl	%edx, %edx
	jle	.Lsse2_done	/* Nothing to copy. */

	/* The aligned loads in the main loop need a source address that is
	 * a multiple of 16.  Work out how many leading pixels (0 to 3) must
	 * be handled the "slow" way to get there, capped at the pixel
	 * count, and keep that number in %esi.
	 */

	negl	%eax
	andl	$15, %eax
	sarl	$2, %eax

	cmpl	%edx, %eax
	cmovbe	%eax, %esi
	subl	%esi, %edx

	/* Leading single pixel. */
	testl	$1, %esi
	je	.Lsse2_head_pair

	DO_ONE_PIXEL()
.Lsse2_head_pair:
	/* Leading pair of pixels. */
	testl	$2, %esi
	je	.Lsse2_aligned

	movq	(%ebx), %xmm0
	addl	$8, %ebx

	SSE2_SWIZZLE_XMM0()

	movq	%xmm0, (%ecx)
	addl	$8, %ecx
.Lsse2_aligned:

	/* Would it be worth having a specialized version of this loop for
	 * the case where the destination is 16-byte aligned?  That version
	 * would be identical except that it could use movdqa instead of
	 * movdqu for the store.
	 */

	/* %eax = number of 4-pixel groups.  The loop is entered at its
	 * test with ZF as left by the shift, so zero groups skips it.
	 */

	movl	%edx, %eax
	shrl	$2, %eax
	jmp	.Lsse2_loop_test
.Lsse2_loop:
	movdqa	(%ebx), %xmm0
	addl	$16, %ebx

	SSE2_SWIZZLE_XMM0()

	movdqu	%xmm0, (%ecx)
	addl	$16, %ecx
	subl	$1, %eax
.Lsse2_loop_test:
	jne	.Lsse2_loop


	/* There may be up to 3 pixels remaining to be copied.  Take care
	 * of them now.  We do the 2 pixel case first because the data
	 * will be aligned.
	 */

	testl	$2, %edx
	je	.Lsse2_tail_single

	movq	(%ebx), %xmm0
	addl	$8, %ebx

	SSE2_SWIZZLE_XMM0()

	movq	%xmm0, (%ecx)
	addl	$8, %ecx
.Lsse2_tail_single:

	testl	$1, %edx
	je	.Lsse2_done

	DO_ONE_LAST_PIXEL()
.Lsse2_done:

	popl	%ebx
	popl	%esi
	ret
	.size	_generic_read_RGBA_span_BGRA8888_REV_SSE2, .-_generic_read_RGBA_span_BGRA8888_REV_SSE2
474bf215546Sopenharmony_ci
475bf215546Sopenharmony_ci
476bf215546Sopenharmony_ci
/* Constants for the RGB565 conversion.  Each _L/_H pair is pushed high
 * dword first and then loaded as one 64-bit MMX value, so it holds four
 * 16-bit words; word 0 handles red, word 1 green, word 2 blue and word 3
 * alpha.
 *
 * MASK_565 isolates one colour field per word: 0xf800, 0x07e0, 0x001f, 0.
 */
#define MASK_565_L	0x07e0f800
#define MASK_565_H	0x0000001f
/* Setting SCALE_ADJUST to 5 gives a perfect match with the
 * classic C implementation in Mesa.  Setting SCALE_ADJUST
 * to 0 is slightly faster but at a small cost to accuracy.
 *
 * PRESCALE is the per-word multiplier applied with pmullw (followed by a
 * right shift of SCALE_ADJUST bits); SCALE is the per-word multiplier
 * applied with pmulhuw, which keeps the high 16 bits of each product.
 */
#define SCALE_ADJUST	5
#if SCALE_ADJUST == 5
#define PRESCALE_L 0x00100001
#define PRESCALE_H 0x00000200
#define SCALE_L 0x40C620E8
#define SCALE_H 0x0000839d
#elif SCALE_ADJUST == 0
#define PRESCALE_L 0x00200001
#define PRESCALE_H 0x00000800
#define SCALE_L 0x01040108
#define SCALE_H 0x00000108
#else
#error SCALE_ADJUST must either be 5 or 0.
#endif
/* ALPHA puts 0x00ff in word 3 only; it is ORed in to force alpha to 0xff. */
#define ALPHA_L 0x00000000
#define ALPHA_H 0x00ff0000

/**
 * MMX optimized version of the RGB565 to RGBA copy routine.
 *
 * Arguments, as read from the stack on entry:
 *    4(%esp)  source pointer (16-bit RGB565 pixels)
 *    8(%esp)  destination pointer (4 bytes written per pixel)
 *   12(%esp)  number of pixels to copy; nothing is copied when negative
 *
 * No registers are saved.  %eax, %ecx, %edx, %mm0, %mm2-%mm7 and EFLAGS
 * are overwritten.  EMMS is issued on entry and exit only when
 * USE_INNER_EMMS is defined.
 *
 * NOTE(review): pshufw and pmulhuw are not part of the original MMX
 * instruction set; presumably the caller only selects this routine on
 * CPUs that provide them -- confirm against the dispatch code.
 */

	.text
	.globl	_generic_read_RGBA_span_RGB565_MMX
        .hidden _generic_read_RGBA_span_RGB565_MMX
	.type	_generic_read_RGBA_span_RGB565_MMX, @function

_generic_read_RGBA_span_RGB565_MMX:
	_CET_ENDBR
#ifdef USE_INNER_EMMS
	emms
#endif

	movl	4(%esp), %eax	/* source pointer */
	movl	8(%esp), %edx	/* destination pointer */
	movl	12(%esp), %ecx	/* number of pixels to copy */

	/* Build the four 64-bit constants on the stack and load them:
	 * %mm5 = field mask, %mm6 = prescale, %mm7 = scale, %mm3 = alpha.
	 */

	pushl	$MASK_565_H
	pushl	$MASK_565_L
	movq	(%esp), %mm5
	pushl	$PRESCALE_H
	pushl	$PRESCALE_L
	movq	(%esp), %mm6
	pushl	$SCALE_H
	pushl	$SCALE_L
	movq	(%esp), %mm7
	pushl	$ALPHA_H
	pushl	$ALPHA_L
	movq	(%esp), %mm3
	addl	$32,%esp

	/* %ecx = number of 4-pixel groups.  Test the sign flag directly:
	 * OF is undefined after a shift by more than one bit, so the
	 * signed "less than" condition (SF != OF) must not be used here.
	 * ZF from the shift is still valid at .L02 because neither jump
	 * changes the flags.
	 */

	sarl	$2, %ecx
	js	.L01		/* Bail early if the count is negative. */
	jmp	.L02

.L03:
	/* Fetch 4 RGB565 pixels into %mm4.  Distribute the first and
	 * second pixels into the four words of %mm0 and %mm2.
	 */

	movq	(%eax), %mm4
	addl	$8, %eax

	pshufw	$0x00, %mm4, %mm0
	pshufw	$0x55, %mm4, %mm2


	/* Mask the pixels so that each word of each register contains only
	 * one color component.
	 */

	pand	%mm5, %mm0
	pand	%mm5, %mm2


	/* Adjust the component values so that they are as small as possible,
	 * but large enough so that we can multiply them by an unsigned 16-bit
	 * number and get a value as large as 0x00ff0000.
	 */

	pmullw	%mm6, %mm0
	pmullw	%mm6, %mm2
#if SCALE_ADJUST > 0
	psrlw	$SCALE_ADJUST, %mm0
	psrlw	$SCALE_ADJUST, %mm2
#endif

	/* Scale the input component values to be on the range
	 * [0, 0x00ff0000].  This is the real magic of the whole routine.
	 */

	pmulhuw	%mm7, %mm0
	pmulhuw	%mm7, %mm2


	/* Always set the alpha value to 0xff.
	 */

	por	%mm3, %mm0
	por	%mm3, %mm2


	/* Pack the 16-bit values to 8-bit values and store the converted
	 * pixel data.
	 */

	packuswb	%mm2, %mm0
	movq	%mm0, (%edx)
	addl	$8, %edx

	/* Same steps for the third and fourth pixels of the group. */

	pshufw	$0xaa, %mm4, %mm0
	pshufw	$0xff, %mm4, %mm2

	pand	%mm5, %mm0
	pand	%mm5, %mm2
	pmullw	%mm6, %mm0
	pmullw	%mm6, %mm2
#if SCALE_ADJUST > 0
	psrlw	$SCALE_ADJUST, %mm0
	psrlw	$SCALE_ADJUST, %mm2
#endif
	pmulhuw	%mm7, %mm0
	pmulhuw	%mm7, %mm2

	por	%mm3, %mm0
	por	%mm3, %mm2

	packuswb	%mm2, %mm0

	movq	%mm0, (%edx)
	addl	$8, %edx

	subl	$1, %ecx
.L02:
	jne	.L03


	/* At this point there can be at most 3 pixels left to process.  If
	 * there are either 2 or 3 left, process 2.  The original count is
	 * reloaded from the stack because %ecx was used as the loop counter.
	 */

	movl	12(%esp), %ecx
	testl	$0x02, %ecx
	je	.L04

	movd	(%eax), %mm4
	addl	$4, %eax

	pshufw	$0x00, %mm4, %mm0
	pshufw	$0x55, %mm4, %mm2

	pand	%mm5, %mm0
	pand	%mm5, %mm2
	pmullw	%mm6, %mm0
	pmullw	%mm6, %mm2
#if SCALE_ADJUST > 0
	psrlw	$SCALE_ADJUST, %mm0
	psrlw	$SCALE_ADJUST, %mm2
#endif
	pmulhuw	%mm7, %mm0
	pmulhuw	%mm7, %mm2

	por	%mm3, %mm0
	por	%mm3, %mm2

	packuswb	%mm2, %mm0

	movq	%mm0, (%edx)
	addl	$8, %edx

.L04:
	/* At this point there can be at most 1 pixel left to process.
	 * Process it if needed.
	 */

	testl	$0x01, %ecx
	je	.L01

	/* Load exactly 16 bits so nothing past the last pixel is read. */
	movzwl	(%eax), %ecx
	movd	%ecx, %mm4

	pshufw	$0x00, %mm4, %mm0

	pand	%mm5, %mm0
	pmullw	%mm6, %mm0
#if SCALE_ADJUST > 0
	psrlw	$SCALE_ADJUST, %mm0
#endif
	pmulhuw	%mm7, %mm0

	por	%mm3, %mm0

	packuswb	%mm0, %mm0

	movd	%mm0, (%edx)

.L01:
#ifdef USE_INNER_EMMS
	emms
#endif
	ret
	.size	_generic_read_RGBA_span_RGB565_MMX, .-_generic_read_RGBA_span_RGB565_MMX
683bf215546Sopenharmony_ci#endif /* !defined(__MINGW32__) && !defined(__APPLE__) */
684bf215546Sopenharmony_ci
685bf215546Sopenharmony_ci#if defined (__ELF__) && defined (__linux__)
686bf215546Sopenharmony_ci	.section .note.GNU-stack,"",%progbits
687bf215546Sopenharmony_ci#endif
688