1 /*
2  * (C) Copyright IBM Corporation 2004
3  * All Rights Reserved.
4  *
5  * Permission is hereby granted, free of charge, to any person obtaining a
6  * copy of this software and associated documentation files (the "Software"),
7  * to deal in the Software without restriction, including without limitation
8  * on the rights to use, copy, modify, merge, publish, distribute, sub
9  * license, and/or sell copies of the Software, and to permit persons to whom
10  * the Software is furnished to do so, subject to the following conditions:
11  *
12  * The above copyright notice and this permission notice (including the next
13  * paragraph) shall be included in all copies or substantial portions of the
14  * Software.
15  *
16  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18  * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.  IN NO EVENT SHALL
19  * IBM AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
20  * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
21  * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
22  * USE OR OTHER DEALINGS IN THE SOFTWARE.
23  */
24 
25 /**
26  * \file read_rgba_span_x86.S
27  * Optimized routines to transfer pixel data from the framebuffer to a
28  * buffer in main memory.
29  *
30  * \author Ian Romanick <idr@us.ibm.com>
31  */
32 /* Control flow enforcement support */
33 #ifdef HAVE_CET_H
34 #include <cet.h>
35 #else
36 #define _CET_ENDBR
37 #endif
38 
39 	.file	"read_rgba_span_x86.S"
40 #if !defined(__MINGW32__) && !defined(__APPLE__) /* this one cries for assyntax.h */
41 /* Kevin F. Quinn 2nd July 2006
42  * Replaced data segment constants with text-segment instructions.
43  */
44 #define	LOAD_MASK(mvins,m1,m2) \
45    	pushl	$0xff00ff00 ;\
46    	pushl	$0xff00ff00 ;\
47    	pushl	$0xff00ff00 ;\
48    	pushl	$0xff00ff00 ;\
49 	mvins	(%esp), m1	;\
50    	pushl	$0x00ff0000 ;\
51    	pushl	$0x00ff0000 ;\
52    	pushl	$0x00ff0000 ;\
53    	pushl	$0x00ff0000 ;\
54 	mvins	(%esp), m2	;\
55 	addl	$32, %esp
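
/* For reference, after LOAD_MASK(movq,%mm1,%mm2) the registers hold:
 *   m1 = 0xff00ff00ff00ff00  -- keeps the A and G bytes of each pixel in place
 *   m2 = 0x00ff000000ff0000  -- isolates the byte that has to cross 16 bits
 *                               (R before the shift, B after it)
 * The movdqu form used by the SSE2 routine loads the same byte pattern into a
 * full 128-bit register.
 */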
56 
57 /* I implemented these as macros because they appear in several places,
58  * and I've tweaked them a number of times.  I got tired of changing every
59  * place they appear. :)
60  */
61 
62 #define DO_ONE_PIXEL() \
63 	movl	(%ebx), %eax ; \
64 	addl	$4, %ebx ; \
65 	bswap	%eax          /* ARGB -> BGRA */ ; \
66 	rorl	$8, %eax      /* BGRA -> ABGR */ ; \
67 	movl	%eax, (%ecx)  /* ABGR -> R, G, B, A */ ; \
68 	addl	$4, %ecx
69 
70 #define DO_ONE_LAST_PIXEL() \
71 	movl	(%ebx), %eax ; \
72 	bswap	%eax          /* ARGB -> BGRA */ ; \
73 	rorl	$8, %eax      /* BGRA -> ABGR */ ; \
74 	movl	%eax, (%ecx)  /* ABGR -> R, G, B, A */ ;
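
/* Per pixel, the two macros above are roughly equivalent to the following C
 * sketch (assuming a little-endian 32-bit load of the B, G, R, A bytes):
 *
 *     uint32_t t = __builtin_bswap32(x);   // 0xAARRGGBB -> 0xBBGGRRAA
 *     out = (t >> 8) | (t << 24);          // rotate right 8 -> 0xAABBGGRR
 *
 * which, stored back to memory, yields the bytes R, G, B, A.
 */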
75 
76 
77 /**
78  * MMX optimized version of the BGRA8888_REV to RGBA copy routine.
79  *
80  * \warning
81  * This function assumes that the caller will issue the EMMS instruction
82  * at the correct places.
83  */
84 
85 .globl _generic_read_RGBA_span_BGRA8888_REV_MMX
86 .hidden _generic_read_RGBA_span_BGRA8888_REV_MMX
87 	.type	_generic_read_RGBA_span_BGRA8888_REV_MMX, @function
88 _generic_read_RGBA_span_BGRA8888_REV_MMX:
89 	_CET_ENDBR
90 	pushl	%ebx
91 
92 #ifdef USE_INNER_EMMS
93 	emms
94 #endif
95 	LOAD_MASK(movq,%mm1,%mm2)
96 
97 	movl	8(%esp), %ebx	/* source pointer */
98 	movl	16(%esp), %edx	/* number of pixels to copy */
99 	movl	12(%esp), %ecx	/* destination pointer */
100 
101 	testl	%edx, %edx
102 	jle	.L20		/* Bail if there's nothing to do. */
103 
	/* If the source starts on an odd pixel (4-byte but not 8-byte aligned),
	 * convert one pixel up front so the movq loads in the loop below are
	 * 8-byte aligned.
	 */
	movl	%ebx, %eax
105 
106 	negl	%eax
107 	sarl	$2, %eax
108 	andl	$1, %eax
109 	je	.L17
110 
111 	subl	%eax, %edx
112 	DO_ONE_PIXEL()
113 .L17:
114 
115 	/* Would it be faster to unroll this loop once and process 4 pixels
116 	 * per pass, instead of just two?
117 	 */
118 
119 	movl	%edx, %eax
120 	shrl	%eax
121 	jmp	.L18
122 .L19:
123 	movq	(%ebx), %mm0
124 	addl	$8, %ebx
125 
	/* These 9 instructions do what a single PSHUFB could do, but PSHUFB
	 * is an SSSE3 instruction that can't be assumed here. :(
	 */
129 
130 	movq	%mm0, %mm3
131 	movq	%mm0, %mm4
132 
133 	pand	%mm2, %mm3
134 	psllq	$16, %mm4
135 	psrlq	$16, %mm3
136 	pand	%mm2, %mm4
137 
138 	pand	%mm1, %mm0
139 	por	%mm4, %mm3
140 	por	%mm3, %mm0
141 
142 	movq	%mm0, (%ecx)
143 	addl	$8, %ecx
144 	subl	$1, %eax
145 .L18:
146 	jne	.L19
147 
148 #ifdef USE_INNER_EMMS
149 	emms
150 #endif
151 
152 	/* At this point there are either 1 or 0 pixels remaining to be
153 	 * converted.  Convert the last pixel, if needed.
154 	 */
155 
156 	testl	$1, %edx
157 	je	.L20
158 
159 	DO_ONE_LAST_PIXEL()
160 
161 .L20:
162 	popl	%ebx
163 	ret
164 	.size	_generic_read_RGBA_span_BGRA8888_REV_MMX, .-_generic_read_RGBA_span_BGRA8888_REV_MMX
165 
166 
167 /**
168  * SSE optimized version of the BGRA8888_REV to RGBA copy routine.  SSE
169  * instructions are only actually used to read data from the framebuffer.
170  * In practice, the speed-up is pretty small.
171  *
172  * \todo
173  * Do some more testing and determine if there's any reason to have this
174  * function in addition to the MMX version.
175  *
176  * \warning
177  * This function assumes that the caller will issue the EMMS instruction
178  * at the correct places.
179  */
180 
181 .globl _generic_read_RGBA_span_BGRA8888_REV_SSE
182 .hidden _generic_read_RGBA_span_BGRA8888_REV_SSE
183 	.type	_generic_read_RGBA_span_BGRA8888_REV_SSE, @function
184 _generic_read_RGBA_span_BGRA8888_REV_SSE:
185 	_CET_ENDBR
186 	pushl	%esi
187 	pushl	%ebx
188 	pushl	%ebp
189 
190 #ifdef USE_INNER_EMMS
191 	emms
192 #endif
193 
194 	LOAD_MASK(movq,%mm1,%mm2)
195 
196 	movl	16(%esp), %ebx	/* source pointer */
197 	movl	24(%esp), %edx	/* number of pixels to copy */
198 	movl	20(%esp), %ecx	/* destination pointer */
199 
200 	testl	%edx, %edx
201 	jle	.L35		/* Bail if there's nothing to do. */
202 
	movl	%esp, %ebp		/* save %esp so it can be restored later */
	subl	$16, %esp		/* carve out a 16-byte, 16-byte-aligned   */
	andl	$0xfffffff0, %esp	/* scratch slot for the XMM spill below   */
206 
	movl	%ebx, %eax
	movl	%edx, %esi

	negl	%eax
	andl	$15, %eax	/* bytes until the source is 16-byte aligned */
	sarl	$2, %eax	/* ...converted to whole pixels */
	cmpl	%edx, %eax
	cmovle	%eax, %esi	/* %esi = min(alignment pixels, pixel count) */

	subl	%esi, %edx	/* %edx = pixels left for the aligned loop */
217 
218 	testl	$1, %esi
219 	je	.L32
220 
221 	DO_ONE_PIXEL()
222 .L32:
223 
224 	testl	$2, %esi
225 	je	.L31
226 
227 	movq	(%ebx), %mm0
228 	addl	$8, %ebx
229 
230 	movq	%mm0, %mm3
231 	movq	%mm0, %mm4
232 
233 	pand	%mm2, %mm3
234 	psllq	$16, %mm4
235 	psrlq	$16, %mm3
236 	pand	%mm2, %mm4
237 
238 	pand	%mm1, %mm0
239 	por	%mm4, %mm3
240 	por	%mm3, %mm0
241 
242 	movq	%mm0, (%ecx)
243 	addl	$8, %ecx
244 .L31:
245 
246 	movl	%edx, %eax
247 	shrl	$2, %eax
248 	jmp	.L33
249 .L34:
250 	movaps	(%ebx), %xmm0
251 	addl	$16, %ebx
252 
253 	/* This would be so much better if we could just move directly from
254 	 * an SSE register to an MMX register.  Unfortunately, that
255 	 * functionality wasn't introduced until SSE2 with the MOVDQ2Q
256 	 * instruction.
257 	 */
258 
259 	movaps	%xmm0, (%esp)
260 	movq	(%esp), %mm0
261 	movq	8(%esp), %mm5
262 
263 	movq	%mm0, %mm3
264 	movq	%mm0, %mm4
265 	movq	%mm5, %mm6
266 	movq	%mm5, %mm7
267 
268 	pand	%mm2, %mm3
269 	pand	%mm2, %mm6
270 
271 	psllq	$16, %mm4
272 	psllq	$16, %mm7
273 
274 	psrlq	$16, %mm3
275 	psrlq	$16, %mm6
276 
277 	pand	%mm2, %mm4
278 	pand	%mm2, %mm7
279 
280 	pand	%mm1, %mm0
281 	pand	%mm1, %mm5
282 
283 	por	%mm4, %mm3
284 	por	%mm7, %mm6
285 
286 	por	%mm3, %mm0
287 	por	%mm6, %mm5
288 
289 	movq	%mm0, (%ecx)
290 	movq	%mm5, 8(%ecx)
291 	addl	$16, %ecx
292 
293 	subl	$1, %eax
294 .L33:
295 	jne	.L34
296 
297 #ifdef USE_INNER_EMMS
298 	emms
299 #endif
300 	movl	%ebp, %esp
301 
302 	/* At this point there are either [0, 3] pixels remaining to be
303 	 * converted.
304 	 */
305 
306 	testl	$2, %edx
307 	je	.L36
308 
309 	movq	(%ebx), %mm0
310 	addl	$8, %ebx
311 
312 	movq	%mm0, %mm3
313 	movq	%mm0, %mm4
314 
315 	pand	%mm2, %mm3
316 	psllq	$16, %mm4
317 	psrlq	$16, %mm3
318 	pand	%mm2, %mm4
319 
320 	pand	%mm1, %mm0
321 	por	%mm4, %mm3
322 	por	%mm3, %mm0
323 
324 	movq	%mm0, (%ecx)
325 	addl	$8, %ecx
326 .L36:
327 
328 	testl	$1, %edx
329 	je	.L35
330 
331 	DO_ONE_LAST_PIXEL()
332 .L35:
333 	popl	%ebp
334 	popl	%ebx
335 	popl	%esi
336 	ret
337 	.size	_generic_read_RGBA_span_BGRA8888_REV_SSE, .-_generic_read_RGBA_span_BGRA8888_REV_SSE
338 
339 
340 /**
341  * SSE2 optimized version of the BGRA8888_REV to RGBA copy routine.
342  */
343 
344 	.text
345 .globl _generic_read_RGBA_span_BGRA8888_REV_SSE2
346 .hidden _generic_read_RGBA_span_BGRA8888_REV_SSE2
347 	.type	_generic_read_RGBA_span_BGRA8888_REV_SSE2, @function
348 _generic_read_RGBA_span_BGRA8888_REV_SSE2:
349 	_CET_ENDBR
350 	pushl	%esi
351 	pushl	%ebx
352 
353 	LOAD_MASK(movdqu,%xmm1,%xmm2)
354 
355 	movl	12(%esp), %ebx	/* source pointer */
356 	movl	20(%esp), %edx	/* number of pixels to copy */
357 	movl	16(%esp), %ecx	/* destination pointer */
358 
359 	movl	%ebx, %eax
360 	movl	%edx, %esi
361 
362 	testl	%edx, %edx
363 	jle	.L46		/* Bail if there's nothing to do. */
364 
	/* If the source pointer isn't a multiple of 16 we have to process
	 * a few pixels the "slow" way to get the address aligned for
	 * the SSE fetch instructions.
	 */
369 
370 	negl	%eax
371 	andl	$15, %eax
372 	sarl	$2, %eax
373 
374 	cmpl	%edx, %eax
375 	cmovbe	%eax, %esi
376 	subl	%esi, %edx
377 
378 	testl	$1, %esi
379 	je	.L41
380 
381 	DO_ONE_PIXEL()
382 .L41:
383 	testl	$2, %esi
384 	je	.L40
385 
386 	movq	(%ebx), %xmm0
387 	addl	$8, %ebx
388 
389 	movdqa	%xmm0, %xmm3
390 	movdqa	%xmm0, %xmm4
391 	andps	%xmm1, %xmm0
392 
393 	andps	%xmm2, %xmm3
394 	pslldq	$2, %xmm4
395 	psrldq	$2, %xmm3
396 	andps	%xmm2, %xmm4
397 
398 	orps	%xmm4, %xmm3
399 	orps	%xmm3, %xmm0
400 
401 	movq	%xmm0, (%ecx)
402 	addl	$8, %ecx
403 .L40:
404 
	/* Would it be worth having a specialized version of this loop for
	 * the case where the destination is 16-byte aligned?  That version
	 * would be identical except that it could use movdqa instead of
	 * movdqu.
	 */
410 
411 	movl	%edx, %eax
412 	shrl	$2, %eax
413 	jmp	.L42
414 .L43:
415 	movdqa	(%ebx), %xmm0
416 	addl	$16, %ebx
417 
418 	movdqa	%xmm0, %xmm3
419 	movdqa	%xmm0, %xmm4
420 	andps	%xmm1, %xmm0
421 
422 	andps	%xmm2, %xmm3
423 	pslldq	$2, %xmm4
424 	psrldq	$2, %xmm3
425 	andps	%xmm2, %xmm4
426 
427 	orps	%xmm4, %xmm3
428 	orps	%xmm3, %xmm0
429 
430 	movdqu	%xmm0, (%ecx)
431 	addl	$16, %ecx
432 	subl	$1, %eax
433 .L42:
434 	jne	.L43
435 
436 
	/* There may be up to 3 pixels remaining to be copied.  Take care
	 * of them now.  We do the 2 pixel case first because the data
	 * will be aligned.
	 */
441 
442 	testl	$2, %edx
443 	je	.L47
444 
445 	movq	(%ebx), %xmm0
446 	addl	$8, %ebx
447 
448 	movdqa	%xmm0, %xmm3
449 	movdqa	%xmm0, %xmm4
450 	andps	%xmm1, %xmm0
451 
452 	andps	%xmm2, %xmm3
453 	pslldq	$2, %xmm4
454 	psrldq	$2, %xmm3
455 	andps	%xmm2, %xmm4
456 
457 	orps	%xmm4, %xmm3
458 	orps	%xmm3, %xmm0
459 
460 	movq	%xmm0, (%ecx)
461 	addl	$8, %ecx
462 .L47:
463 
464 	testl	$1, %edx
465 	je	.L46
466 
467 	DO_ONE_LAST_PIXEL()
468 .L46:
469 
470 	popl	%ebx
471 	popl	%esi
472 	ret
473 	.size	_generic_read_RGBA_span_BGRA8888_REV_SSE2, .-_generic_read_RGBA_span_BGRA8888_REV_SSE2
474 
475 
476 
477 #define MASK_565_L	0x07e0f800
478 #define MASK_565_H	0x0000001f
479 /* Setting SCALE_ADJUST to 5 gives a perfect match with the
480  * classic C implementation in Mesa.  Setting SCALE_ADJUST
481  * to 0 is slightly faster but at a small cost to accuracy.
482  */
483 #define SCALE_ADJUST	5
484 #if SCALE_ADJUST == 5
485 #define PRESCALE_L 0x00100001
486 #define PRESCALE_H 0x00000200
487 #define SCALE_L 0x40C620E8
488 #define SCALE_H 0x0000839d
489 #elif SCALE_ADJUST == 0
490 #define PRESCALE_L 0x00200001
491 #define PRESCALE_H 0x00000800
492 #define SCALE_L 0x01040108
493 #define SCALE_H 0x00000108
494 #else
495 #error SCALE_ADJUST must either be 5 or 0.
496 #endif
497 #define ALPHA_L 0x00000000
498 #define ALPHA_H 0x00ff0000
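
/* How the constants above are used: each 16-bit pixel is broadcast into all
 * four words of an MMX register, masked down to one color field per word
 * (R, G, B, and an empty alpha slot), pre-scaled with PMULLW plus the
 * SCALE_ADJUST shift, and finally scaled to 8 bits with PMULHUW.  The net
 * effect is, approximately, the familiar expansion
 *
 *     r8 = (r5 * 255) / 31;
 *     g8 = (g6 * 255) / 63;
 *     b8 = (b5 * 255) / 31;
 *     a8 = 0xff;
 *
 * (a sketch of the intent, not a bit-exact restatement of the constants).
 */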
499 
500 /**
501  * MMX optimized version of the RGB565 to RGBA copy routine.
502  */
503 
504 	.text
.globl _generic_read_RGBA_span_RGB565_MMX
.hidden _generic_read_RGBA_span_RGB565_MMX
	.type	_generic_read_RGBA_span_RGB565_MMX, @function
508 
509 _generic_read_RGBA_span_RGB565_MMX:
510 	_CET_ENDBR
511 #ifdef USE_INNER_EMMS
512 	emms
513 #endif
514 
515 	movl	4(%esp), %eax	/* source pointer */
516 	movl	8(%esp), %edx	/* destination pointer */
517 	movl	12(%esp), %ecx	/* number of pixels to copy */
518 
519 	pushl	$MASK_565_H
520 	pushl	$MASK_565_L
521 	movq	(%esp), %mm5
522 	pushl	$PRESCALE_H
523 	pushl	$PRESCALE_L
524 	movq	(%esp), %mm6
525 	pushl	$SCALE_H
526 	pushl	$SCALE_L
527 	movq	(%esp), %mm7
528 	pushl	$ALPHA_H
529 	pushl	$ALPHA_L
530 	movq	(%esp), %mm3
531 	addl	$32,%esp
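
	/* Register roles for the conversion below, viewed as four 16-bit
	 * words (low to high, one lane each for R, G, B and A):
	 *   %mm5  field masks  { 0xf800, 0x07e0, 0x001f, 0x0000 }
	 *   %mm6  prescale     (PRESCALE_L / PRESCALE_H, split the same way)
	 *   %mm7  scale        (SCALE_L / SCALE_H)
	 *   %mm3  alpha        { 0x0000, 0x0000, 0x0000, 0x00ff }
	 */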
532 
533 	sarl	$2, %ecx
534 	jl	.L01		/* Bail early if the count is negative. */
535 	jmp	.L02
536 
537 .L03:
	/* Fetch 4 RGB565 pixels into %mm4.  Broadcast the first pixel into
	 * all four words of %mm0 and the second pixel into %mm2.
	 */
541 
542 	movq	(%eax), %mm4
543 	addl	$8, %eax
544 
545 	pshufw	$0x00, %mm4, %mm0
546 	pshufw	$0x55, %mm4, %mm2
547 
548 
549 	/* Mask the pixels so that each word of each register contains only
550 	 * one color component.
551 	 */
552 
553 	pand	%mm5, %mm0
554 	pand	%mm5, %mm2
555 
556 
557 	/* Adjust the component values so that they are as small as possible,
558 	 * but large enough so that we can multiply them by an unsigned 16-bit
559 	 * number and get a value as large as 0x00ff0000.
560  	 */
561 
562 	pmullw	%mm6, %mm0
563 	pmullw	%mm6, %mm2
564 #if SCALE_ADJUST > 0
565 	psrlw	$SCALE_ADJUST, %mm0
566 	psrlw	$SCALE_ADJUST, %mm2
567 #endif
568 
	/* Scale the input component values to be on the range
	 * [0, 0x00ff0000].  This is the real magic of the whole routine.
	 */
572 
573 	pmulhuw	%mm7, %mm0
574 	pmulhuw	%mm7, %mm2
575 
576 
577 	/* Always set the alpha value to 0xff.
578 	 */
579 
580  	por %mm3, %mm0
581  	por %mm3, %mm2
582 
583 
584 	/* Pack the 16-bit values to 8-bit values and store the converted
585 	 * pixel data.
586 	 */
587 
588 	packuswb	%mm2, %mm0
589 	movq	%mm0, (%edx)
590 	addl	$8, %edx
591 
592 	pshufw	$0xaa, %mm4, %mm0
593 	pshufw	$0xff, %mm4, %mm2
594 
595 	pand	%mm5, %mm0
596 	pand	%mm5, %mm2
597 	pmullw	%mm6, %mm0
598 	pmullw	%mm6, %mm2
599 #if SCALE_ADJUST > 0
600 	psrlw	$SCALE_ADJUST, %mm0
601 	psrlw	$SCALE_ADJUST, %mm2
602 #endif
603 	pmulhuw	%mm7, %mm0
604 	pmulhuw	%mm7, %mm2
605 
606  	por %mm3, %mm0
607  	por %mm3, %mm2
608 
609 	packuswb	%mm2, %mm0
610 
611 	movq	%mm0, (%edx)
612 	addl	$8, %edx
613 
614 	subl	$1, %ecx
615 .L02:
616 	jne	.L03
617 
618 
619 	/* At this point there can be at most 3 pixels left to process.  If
620 	 * there is either 2 or 3 left, process 2.
621          */
622 
623 	movl	12(%esp), %ecx
624 	testl	$0x02, %ecx
625 	je	.L04
626 
627 	movd	(%eax), %mm4
628 	addl	$4, %eax
629 
630 	pshufw	$0x00, %mm4, %mm0
631 	pshufw	$0x55, %mm4, %mm2
632 
633 	pand	%mm5, %mm0
634 	pand	%mm5, %mm2
635 	pmullw	%mm6, %mm0
636 	pmullw	%mm6, %mm2
637 #if SCALE_ADJUST > 0
638 	psrlw	$SCALE_ADJUST, %mm0
639 	psrlw	$SCALE_ADJUST, %mm2
640 #endif
641 	pmulhuw	%mm7, %mm0
642 	pmulhuw	%mm7, %mm2
643 
644  	por %mm3, %mm0
645  	por %mm3, %mm2
646 
647 	packuswb	%mm2, %mm0
648 
649 	movq	%mm0, (%edx)
650 	addl	$8, %edx
651 
652 .L04:
653 	/* At this point there can be at most 1 pixel left to process.
654 	 * Process it if needed.
655          */
656 
657 	testl	$0x01, %ecx
658 	je	.L01
659 
660 	movzwl	(%eax), %ecx
661 	movd	%ecx, %mm4
662 
663 	pshufw	$0x00, %mm4, %mm0
664 
665 	pand	%mm5, %mm0
666 	pmullw	%mm6, %mm0
667 #if SCALE_ADJUST > 0
668 	psrlw	$SCALE_ADJUST, %mm0
669 #endif
670 	pmulhuw	%mm7, %mm0
671 
672  	por %mm3, %mm0
673 
674 	packuswb	%mm0, %mm0
675 
676 	movd	%mm0, (%edx)
677 
678 .L01:
679 #ifdef USE_INNER_EMMS
680 	emms
681 #endif
682 	ret
683 #endif /* !defined(__MINGW32__) && !defined(__APPLE__) */
684 
685 #if defined (__ELF__) && defined (__linux__)
686 	.section .note.GNU-stack,"",%progbits
687 #endif
688