1bf215546Sopenharmony_ci/*
2bf215546Sopenharmony_ci * Copyright 2003 VMware, Inc.
3bf215546Sopenharmony_ci * All Rights Reserved.
4bf215546Sopenharmony_ci *
5bf215546Sopenharmony_ci * Permission is hereby granted, free of charge, to any person obtaining a
6bf215546Sopenharmony_ci * copy of this software and associated documentation files (the "Software"),
7bf215546Sopenharmony_ci * to deal in the Software without restriction, including without limitation
8bf215546Sopenharmony_ci * on the rights to use, copy, modify, merge, publish, distribute, sub
9bf215546Sopenharmony_ci * license, and/or sell copies of the Software, and to permit persons to whom
10bf215546Sopenharmony_ci * the Software is furnished to do so, subject to the following conditions:
11bf215546Sopenharmony_ci *
12bf215546Sopenharmony_ci * The above copyright notice and this permission notice (including the next
13bf215546Sopenharmony_ci * paragraph) shall be included in all copies or substantial portions of the
14bf215546Sopenharmony_ci * Software.
15bf215546Sopenharmony_ci *
16bf215546Sopenharmony_ci * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17bf215546Sopenharmony_ci * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18bf215546Sopenharmony_ci * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.  IN NO EVENT SHALL
19bf215546Sopenharmony_ci * VMWARE AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
20bf215546Sopenharmony_ci * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
21bf215546Sopenharmony_ci * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
22bf215546Sopenharmony_ci * USE OR OTHER DEALINGS IN THE SOFTWARE.
23bf215546Sopenharmony_ci *
24bf215546Sopenharmony_ci * Authors:
25bf215546Sopenharmony_ci *    Keith Whitwell <keithw@vmware.com>
26bf215546Sopenharmony_ci */
27bf215546Sopenharmony_ci
28bf215546Sopenharmony_ci
29bf215546Sopenharmony_ci#include "pipe/p_config.h"
30bf215546Sopenharmony_ci#include "pipe/p_compiler.h"
31bf215546Sopenharmony_ci#include "util/u_memory.h"
32bf215546Sopenharmony_ci#include "util/u_math.h"
33bf215546Sopenharmony_ci#include "util/format/u_format.h"
34bf215546Sopenharmony_ci
35bf215546Sopenharmony_ci#include "translate.h"
36bf215546Sopenharmony_ci
37bf215546Sopenharmony_ci
38bf215546Sopenharmony_ci#if (defined(PIPE_ARCH_X86) || defined(PIPE_ARCH_X86_64)) && !defined(EMBEDDED_DEVICE)
39bf215546Sopenharmony_ci
40bf215546Sopenharmony_ci#include "rtasm/rtasm_cpu.h"
41bf215546Sopenharmony_ci#include "rtasm/rtasm_x86sse.h"
42bf215546Sopenharmony_ci
43bf215546Sopenharmony_ci
/* Channel indices, used as arguments to SHUF() when building shuffles. */
#define X    0
#define Y    1
#define Z    2
#define W    3
48bf215546Sopenharmony_ci
49bf215546Sopenharmony_ci
/* Description of one source buffer: where it starts, its stride and the
 * largest index associated with it.
 */
struct translate_buffer {
   const void *base_ptr;   /* start of the buffer contents (read-only) */
   uintptr_t stride;       /* presumably bytes between elements -- confirm at users */
   unsigned max_index;     /* presumably the highest valid index -- confirm at users */
};
56bf215546Sopenharmony_ci
/* A (buffer, instance divisor) pairing together with its current
 * source pointer.
 */
struct translate_buffer_variant {
   unsigned buffer_index;       /* index of the underlying buffer */
   unsigned instance_divisor;
   void *ptr;                   /* refreshed once per vertex or once per instance */
};
63bf215546Sopenharmony_ci
64bf215546Sopenharmony_ci
/* Sentinel buffer index.  NOTE(review): presumably marks an element that is
 * sourced from the instance id rather than a real buffer -- confirm at the
 * use site.
 */
#define ELEMENT_BUFFER_INSTANCE_ID  1001

/* Number of rows in the float and unsigned constant tables. */
#define NUM_FLOAT_CONSTS 9
#define NUM_UNSIGNED_CONSTS 1

/* Constant ids.  Ids below NUM_FLOAT_CONSTS index the float table; ids from
 * NUM_FLOAT_CONSTS upwards index the unsigned table (after subtracting
 * NUM_FLOAT_CONSTS).
 */
enum
{
   CONST_IDENTITY = 0,
   CONST_INV_127 = 1,
   CONST_INV_255 = 2,
   CONST_INV_32767 = 3,
   CONST_INV_65535 = 4,
   CONST_INV_2147483647 = 5,
   CONST_INV_4294967295 = 6,
   CONST_255 = 7,
   CONST_2147483648 = 8,
   /* float consts end */
   CONST_2147483647_INT = NUM_FLOAT_CONSTS,
};
84bf215546Sopenharmony_ci
85bf215546Sopenharmony_ci#define C(v) {(float)(v), (float)(v), (float)(v), (float)(v)}
86bf215546Sopenharmony_cistatic float consts[NUM_FLOAT_CONSTS][4] = {
87bf215546Sopenharmony_ci   {0, 0, 0, 1},
88bf215546Sopenharmony_ci   C(1.0 / 127.0),
89bf215546Sopenharmony_ci   C(1.0 / 255.0),
90bf215546Sopenharmony_ci   C(1.0 / 32767.0),
91bf215546Sopenharmony_ci   C(1.0 / 65535.0),
92bf215546Sopenharmony_ci   C(1.0 / 2147483647.0),
93bf215546Sopenharmony_ci   C(1.0 / 4294967295.0),
94bf215546Sopenharmony_ci   C(255.0),
95bf215546Sopenharmony_ci   C(2147483648.0),
96bf215546Sopenharmony_ci};
97bf215546Sopenharmony_ci
98bf215546Sopenharmony_ci#undef C
99bf215546Sopenharmony_ci
100bf215546Sopenharmony_cistatic unsigned uconsts[NUM_UNSIGNED_CONSTS][4] = {
101bf215546Sopenharmony_ci   {0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff},
102bf215546Sopenharmony_ci};
103bf215546Sopenharmony_ci
104bf215546Sopenharmony_cistruct translate_sse
105bf215546Sopenharmony_ci{
106bf215546Sopenharmony_ci   struct translate translate;
107bf215546Sopenharmony_ci
108bf215546Sopenharmony_ci   struct x86_function linear_func;
109bf215546Sopenharmony_ci   struct x86_function elt_func;
110bf215546Sopenharmony_ci   struct x86_function elt16_func;
111bf215546Sopenharmony_ci   struct x86_function elt8_func;
112bf215546Sopenharmony_ci   struct x86_function *func;
113bf215546Sopenharmony_ci
114bf215546Sopenharmony_ci   alignas(16) float consts[NUM_FLOAT_CONSTS][4];
115bf215546Sopenharmony_ci   alignas(16) float uconsts[NUM_UNSIGNED_CONSTS][4];
116bf215546Sopenharmony_ci   int8_t reg_to_const[16];
117bf215546Sopenharmony_ci   int8_t const_to_reg[NUM_FLOAT_CONSTS + NUM_UNSIGNED_CONSTS];
118bf215546Sopenharmony_ci
119bf215546Sopenharmony_ci   struct translate_buffer buffer[TRANSLATE_MAX_ATTRIBS];
120bf215546Sopenharmony_ci   unsigned nr_buffers;
121bf215546Sopenharmony_ci
122bf215546Sopenharmony_ci   /* Multiple buffer variants can map to a single buffer. */
123bf215546Sopenharmony_ci   struct translate_buffer_variant buffer_variant[TRANSLATE_MAX_ATTRIBS];
124bf215546Sopenharmony_ci   unsigned nr_buffer_variants;
125bf215546Sopenharmony_ci
126bf215546Sopenharmony_ci   /* Multiple elements can map to a single buffer variant. */
127bf215546Sopenharmony_ci   unsigned element_to_buffer_variant[TRANSLATE_MAX_ATTRIBS];
128bf215546Sopenharmony_ci
129bf215546Sopenharmony_ci   boolean use_instancing;
130bf215546Sopenharmony_ci   unsigned instance_id;
131bf215546Sopenharmony_ci   unsigned start_instance;
132bf215546Sopenharmony_ci
133bf215546Sopenharmony_ci   /* these are actually known values, but putting them in a struct
134bf215546Sopenharmony_ci    * like this is helpful to keep them in sync across the file.
135bf215546Sopenharmony_ci    */
136bf215546Sopenharmony_ci   struct x86_reg tmp_EAX;
137bf215546Sopenharmony_ci   struct x86_reg tmp2_EDX;
138bf215546Sopenharmony_ci   struct x86_reg src_ECX;
139bf215546Sopenharmony_ci   struct x86_reg idx_ESI;      /* either start+i or &elt[i] */
140bf215546Sopenharmony_ci   struct x86_reg machine_EDI;
141bf215546Sopenharmony_ci   struct x86_reg outbuf_EBX;
142bf215546Sopenharmony_ci   struct x86_reg count_EBP;    /* decrements to zero */
143bf215546Sopenharmony_ci};
144bf215546Sopenharmony_ci
145bf215546Sopenharmony_ci
/* Return the distance in bytes from a to b (b - a), narrowed to int. */
static int
get_offset(const void *a, const void *b)
{
   const char *from = (const char *) a;
   const char *to = (const char *) b;

   return to - from;
}
151bf215546Sopenharmony_ci
152bf215546Sopenharmony_ci
153bf215546Sopenharmony_cistatic struct x86_reg
154bf215546Sopenharmony_ciget_const(struct translate_sse *p, unsigned id)
155bf215546Sopenharmony_ci{
156bf215546Sopenharmony_ci   struct x86_reg reg;
157bf215546Sopenharmony_ci   unsigned i;
158bf215546Sopenharmony_ci
159bf215546Sopenharmony_ci   if (p->const_to_reg[id] >= 0)
160bf215546Sopenharmony_ci      return x86_make_reg(file_XMM, p->const_to_reg[id]);
161bf215546Sopenharmony_ci
162bf215546Sopenharmony_ci   for (i = 2; i < 8; ++i) {
163bf215546Sopenharmony_ci      if (p->reg_to_const[i] < 0)
164bf215546Sopenharmony_ci         break;
165bf215546Sopenharmony_ci   }
166bf215546Sopenharmony_ci
167bf215546Sopenharmony_ci   /* TODO: be smarter here */
168bf215546Sopenharmony_ci   if (i == 8)
169bf215546Sopenharmony_ci      --i;
170bf215546Sopenharmony_ci
171bf215546Sopenharmony_ci   reg = x86_make_reg(file_XMM, i);
172bf215546Sopenharmony_ci
173bf215546Sopenharmony_ci   if (p->reg_to_const[i] >= 0)
174bf215546Sopenharmony_ci      p->const_to_reg[p->reg_to_const[i]] = -1;
175bf215546Sopenharmony_ci
176bf215546Sopenharmony_ci   p->reg_to_const[i] = id;
177bf215546Sopenharmony_ci   p->const_to_reg[id] = i;
178bf215546Sopenharmony_ci
179bf215546Sopenharmony_ci   /* TODO: this should happen outside the loop, if possible */
180bf215546Sopenharmony_ci   const void *c;
181bf215546Sopenharmony_ci   if (id < NUM_FLOAT_CONSTS)
182bf215546Sopenharmony_ci      c = &p->consts[id][0];
183bf215546Sopenharmony_ci   else
184bf215546Sopenharmony_ci      c = &p->uconsts[id - NUM_FLOAT_CONSTS][0];
185bf215546Sopenharmony_ci   sse_movaps(p->func, reg,
186bf215546Sopenharmony_ci              x86_make_disp(p->machine_EDI, get_offset(p, c)));
187bf215546Sopenharmony_ci
188bf215546Sopenharmony_ci   return reg;
189bf215546Sopenharmony_ci}
190bf215546Sopenharmony_ci
191bf215546Sopenharmony_ci
192bf215546Sopenharmony_ci/* load the data in a SSE2 register, padding with zeros */
193bf215546Sopenharmony_cistatic boolean
194bf215546Sopenharmony_ciemit_load_sse2(struct translate_sse *p,
195bf215546Sopenharmony_ci               struct x86_reg data, struct x86_reg src, unsigned size)
196bf215546Sopenharmony_ci{
197bf215546Sopenharmony_ci   struct x86_reg tmpXMM = x86_make_reg(file_XMM, 1);
198bf215546Sopenharmony_ci   struct x86_reg tmp = p->tmp_EAX;
199bf215546Sopenharmony_ci   switch (size) {
200bf215546Sopenharmony_ci   case 1:
201bf215546Sopenharmony_ci      x86_movzx8(p->func, tmp, src);
202bf215546Sopenharmony_ci      sse2_movd(p->func, data, tmp);
203bf215546Sopenharmony_ci      break;
204bf215546Sopenharmony_ci   case 2:
205bf215546Sopenharmony_ci      x86_movzx16(p->func, tmp, src);
206bf215546Sopenharmony_ci      sse2_movd(p->func, data, tmp);
207bf215546Sopenharmony_ci      break;
208bf215546Sopenharmony_ci   case 3:
209bf215546Sopenharmony_ci      x86_movzx8(p->func, tmp, x86_make_disp(src, 2));
210bf215546Sopenharmony_ci      x86_shl_imm(p->func, tmp, 16);
211bf215546Sopenharmony_ci      x86_mov16(p->func, tmp, src);
212bf215546Sopenharmony_ci      sse2_movd(p->func, data, tmp);
213bf215546Sopenharmony_ci      break;
214bf215546Sopenharmony_ci   case 4:
215bf215546Sopenharmony_ci      sse2_movd(p->func, data, src);
216bf215546Sopenharmony_ci      break;
217bf215546Sopenharmony_ci   case 6:
218bf215546Sopenharmony_ci      sse2_movd(p->func, data, src);
219bf215546Sopenharmony_ci      x86_movzx16(p->func, tmp, x86_make_disp(src, 4));
220bf215546Sopenharmony_ci      sse2_movd(p->func, tmpXMM, tmp);
221bf215546Sopenharmony_ci      sse2_punpckldq(p->func, data, tmpXMM);
222bf215546Sopenharmony_ci      break;
223bf215546Sopenharmony_ci   case 8:
224bf215546Sopenharmony_ci      sse2_movq(p->func, data, src);
225bf215546Sopenharmony_ci      break;
226bf215546Sopenharmony_ci   case 12:
227bf215546Sopenharmony_ci      sse2_movq(p->func, data, src);
228bf215546Sopenharmony_ci      sse2_movd(p->func, tmpXMM, x86_make_disp(src, 8));
229bf215546Sopenharmony_ci      sse2_punpcklqdq(p->func, data, tmpXMM);
230bf215546Sopenharmony_ci      break;
231bf215546Sopenharmony_ci   case 16:
232bf215546Sopenharmony_ci      sse2_movdqu(p->func, data, src);
233bf215546Sopenharmony_ci      break;
234bf215546Sopenharmony_ci   default:
235bf215546Sopenharmony_ci      return FALSE;
236bf215546Sopenharmony_ci   }
237bf215546Sopenharmony_ci   return TRUE;
238bf215546Sopenharmony_ci}
239bf215546Sopenharmony_ci
240bf215546Sopenharmony_ci
241bf215546Sopenharmony_ci/* this value can be passed for the out_chans argument */
242bf215546Sopenharmony_ci#define CHANNELS_0001 5
243bf215546Sopenharmony_ci
244bf215546Sopenharmony_ci
245bf215546Sopenharmony_ci/* this function will load #chans float values, and will
246bf215546Sopenharmony_ci * pad the register with zeroes at least up to out_chans.
247bf215546Sopenharmony_ci *
248bf215546Sopenharmony_ci * If out_chans is set to CHANNELS_0001, then the fourth
249bf215546Sopenharmony_ci * value will be padded with 1. Only pass this value if
250bf215546Sopenharmony_ci * chans < 4 or results are undefined.
251bf215546Sopenharmony_ci */
252bf215546Sopenharmony_cistatic void
253bf215546Sopenharmony_ciemit_load_float32(struct translate_sse *p, struct x86_reg data,
254bf215546Sopenharmony_ci                  struct x86_reg arg0, unsigned out_chans, unsigned chans)
255bf215546Sopenharmony_ci{
256bf215546Sopenharmony_ci   switch (chans) {
257bf215546Sopenharmony_ci   case 1:
258bf215546Sopenharmony_ci      /* a 0 0 0
259bf215546Sopenharmony_ci       * a 0 0 1
260bf215546Sopenharmony_ci       */
261bf215546Sopenharmony_ci      sse_movss(p->func, data, arg0);
262bf215546Sopenharmony_ci      if (out_chans == CHANNELS_0001)
263bf215546Sopenharmony_ci         sse_orps(p->func, data, get_const(p, CONST_IDENTITY));
264bf215546Sopenharmony_ci      break;
265bf215546Sopenharmony_ci   case 2:
266bf215546Sopenharmony_ci      /* 0 0 0 1
267bf215546Sopenharmony_ci       * a b 0 1
268bf215546Sopenharmony_ci       */
269bf215546Sopenharmony_ci      if (out_chans == CHANNELS_0001)
270bf215546Sopenharmony_ci         sse_shufps(p->func, data, get_const(p, CONST_IDENTITY),
271bf215546Sopenharmony_ci                    SHUF(X, Y, Z, W));
272bf215546Sopenharmony_ci      else if (out_chans > 2)
273bf215546Sopenharmony_ci         sse_movlhps(p->func, data, get_const(p, CONST_IDENTITY));
274bf215546Sopenharmony_ci      sse_movlps(p->func, data, arg0);
275bf215546Sopenharmony_ci      break;
276bf215546Sopenharmony_ci   case 3:
277bf215546Sopenharmony_ci      /* Have to jump through some hoops:
278bf215546Sopenharmony_ci       *
279bf215546Sopenharmony_ci       * c 0 0 0
280bf215546Sopenharmony_ci       * c 0 0 1 if out_chans == CHANNELS_0001
281bf215546Sopenharmony_ci       * 0 0 c 0/1
282bf215546Sopenharmony_ci       * a b c 0/1
283bf215546Sopenharmony_ci       */
284bf215546Sopenharmony_ci      sse_movss(p->func, data, x86_make_disp(arg0, 8));
285bf215546Sopenharmony_ci      if (out_chans == CHANNELS_0001)
286bf215546Sopenharmony_ci         sse_shufps(p->func, data, get_const(p, CONST_IDENTITY),
287bf215546Sopenharmony_ci                    SHUF(X, Y, Z, W));
288bf215546Sopenharmony_ci      sse_shufps(p->func, data, data, SHUF(Y, Z, X, W));
289bf215546Sopenharmony_ci      sse_movlps(p->func, data, arg0);
290bf215546Sopenharmony_ci      break;
291bf215546Sopenharmony_ci   case 4:
292bf215546Sopenharmony_ci      sse_movups(p->func, data, arg0);
293bf215546Sopenharmony_ci      break;
294bf215546Sopenharmony_ci   }
295bf215546Sopenharmony_ci}
296bf215546Sopenharmony_ci
297bf215546Sopenharmony_ci/* this function behaves like emit_load_float32, but loads
298bf215546Sopenharmony_ci   64-bit floating point numbers, converting them to 32-bit
299bf215546Sopenharmony_ci  ones */
300bf215546Sopenharmony_cistatic void
301bf215546Sopenharmony_ciemit_load_float64to32(struct translate_sse *p, struct x86_reg data,
302bf215546Sopenharmony_ci                      struct x86_reg arg0, unsigned out_chans, unsigned chans)
303bf215546Sopenharmony_ci{
304bf215546Sopenharmony_ci   struct x86_reg tmpXMM = x86_make_reg(file_XMM, 1);
305bf215546Sopenharmony_ci   switch (chans) {
306bf215546Sopenharmony_ci   case 1:
307bf215546Sopenharmony_ci      sse2_movsd(p->func, data, arg0);
308bf215546Sopenharmony_ci      if (out_chans > 1)
309bf215546Sopenharmony_ci         sse2_cvtpd2ps(p->func, data, data);
310bf215546Sopenharmony_ci      else
311bf215546Sopenharmony_ci         sse2_cvtsd2ss(p->func, data, data);
312bf215546Sopenharmony_ci      if (out_chans == CHANNELS_0001)
313bf215546Sopenharmony_ci         sse_shufps(p->func, data, get_const(p, CONST_IDENTITY),
314bf215546Sopenharmony_ci                    SHUF(X, Y, Z, W));
315bf215546Sopenharmony_ci      break;
316bf215546Sopenharmony_ci   case 2:
317bf215546Sopenharmony_ci      sse2_movupd(p->func, data, arg0);
318bf215546Sopenharmony_ci      sse2_cvtpd2ps(p->func, data, data);
319bf215546Sopenharmony_ci      if (out_chans == CHANNELS_0001)
320bf215546Sopenharmony_ci         sse_shufps(p->func, data, get_const(p, CONST_IDENTITY),
321bf215546Sopenharmony_ci                    SHUF(X, Y, Z, W));
322bf215546Sopenharmony_ci      else if (out_chans > 2)
323bf215546Sopenharmony_ci         sse_movlhps(p->func, data, get_const(p, CONST_IDENTITY));
324bf215546Sopenharmony_ci      break;
325bf215546Sopenharmony_ci   case 3:
326bf215546Sopenharmony_ci      sse2_movupd(p->func, data, arg0);
327bf215546Sopenharmony_ci      sse2_cvtpd2ps(p->func, data, data);
328bf215546Sopenharmony_ci      sse2_movsd(p->func, tmpXMM, x86_make_disp(arg0, 16));
329bf215546Sopenharmony_ci      if (out_chans > 3)
330bf215546Sopenharmony_ci         sse2_cvtpd2ps(p->func, tmpXMM, tmpXMM);
331bf215546Sopenharmony_ci      else
332bf215546Sopenharmony_ci         sse2_cvtsd2ss(p->func, tmpXMM, tmpXMM);
333bf215546Sopenharmony_ci      sse_movlhps(p->func, data, tmpXMM);
334bf215546Sopenharmony_ci      if (out_chans == CHANNELS_0001)
335bf215546Sopenharmony_ci         sse_orps(p->func, data, get_const(p, CONST_IDENTITY));
336bf215546Sopenharmony_ci      break;
337bf215546Sopenharmony_ci   case 4:
338bf215546Sopenharmony_ci      sse2_movupd(p->func, data, arg0);
339bf215546Sopenharmony_ci      sse2_cvtpd2ps(p->func, data, data);
340bf215546Sopenharmony_ci      sse2_movupd(p->func, tmpXMM, x86_make_disp(arg0, 16));
341bf215546Sopenharmony_ci      sse2_cvtpd2ps(p->func, tmpXMM, tmpXMM);
342bf215546Sopenharmony_ci      sse_movlhps(p->func, data, tmpXMM);
343bf215546Sopenharmony_ci      break;
344bf215546Sopenharmony_ci   }
345bf215546Sopenharmony_ci}
346bf215546Sopenharmony_ci
347bf215546Sopenharmony_ci
348bf215546Sopenharmony_cistatic void
349bf215546Sopenharmony_ciemit_mov64(struct translate_sse *p, struct x86_reg dst_gpr,
350bf215546Sopenharmony_ci           struct x86_reg dst_xmm, struct x86_reg src_gpr,
351bf215546Sopenharmony_ci           struct x86_reg src_xmm)
352bf215546Sopenharmony_ci{
353bf215546Sopenharmony_ci   if (x86_target(p->func) != X86_32)
354bf215546Sopenharmony_ci      x64_mov64(p->func, dst_gpr, src_gpr);
355bf215546Sopenharmony_ci   else {
356bf215546Sopenharmony_ci      /* TODO: when/on which CPUs is SSE2 actually better than SSE? */
357bf215546Sopenharmony_ci      if (x86_target_caps(p->func) & X86_SSE2)
358bf215546Sopenharmony_ci         sse2_movq(p->func, dst_xmm, src_xmm);
359bf215546Sopenharmony_ci      else
360bf215546Sopenharmony_ci         sse_movlps(p->func, dst_xmm, src_xmm);
361bf215546Sopenharmony_ci   }
362bf215546Sopenharmony_ci}
363bf215546Sopenharmony_ci
364bf215546Sopenharmony_ci
365bf215546Sopenharmony_cistatic void
366bf215546Sopenharmony_ciemit_load64(struct translate_sse *p, struct x86_reg dst_gpr,
367bf215546Sopenharmony_ci            struct x86_reg dst_xmm, struct x86_reg src)
368bf215546Sopenharmony_ci{
369bf215546Sopenharmony_ci   emit_mov64(p, dst_gpr, dst_xmm, src, src);
370bf215546Sopenharmony_ci}
371bf215546Sopenharmony_ci
372bf215546Sopenharmony_ci
373bf215546Sopenharmony_cistatic void
374bf215546Sopenharmony_ciemit_store64(struct translate_sse *p, struct x86_reg dst,
375bf215546Sopenharmony_ci             struct x86_reg src_gpr, struct x86_reg src_xmm)
376bf215546Sopenharmony_ci{
377bf215546Sopenharmony_ci   emit_mov64(p, dst, dst, src_gpr, src_xmm);
378bf215546Sopenharmony_ci}
379bf215546Sopenharmony_ci
380bf215546Sopenharmony_ci
381bf215546Sopenharmony_cistatic void
382bf215546Sopenharmony_ciemit_mov128(struct translate_sse *p, struct x86_reg dst, struct x86_reg src)
383bf215546Sopenharmony_ci{
384bf215546Sopenharmony_ci   if (x86_target_caps(p->func) & X86_SSE2)
385bf215546Sopenharmony_ci      sse2_movdqu(p->func, dst, src);
386bf215546Sopenharmony_ci   else
387bf215546Sopenharmony_ci      sse_movups(p->func, dst, src);
388bf215546Sopenharmony_ci}
389bf215546Sopenharmony_ci
390bf215546Sopenharmony_ci
391bf215546Sopenharmony_ci/* TODO: this uses unaligned accesses liberally, which is great on Nehalem,
392bf215546Sopenharmony_ci * but may or may not be good on older processors
393bf215546Sopenharmony_ci * TODO: may perhaps want to use non-temporal stores here if possible
394bf215546Sopenharmony_ci */
395bf215546Sopenharmony_cistatic void
396bf215546Sopenharmony_ciemit_memcpy(struct translate_sse *p, struct x86_reg dst, struct x86_reg src,
397bf215546Sopenharmony_ci            unsigned size)
398bf215546Sopenharmony_ci{
399bf215546Sopenharmony_ci   struct x86_reg dataXMM = x86_make_reg(file_XMM, 0);
400bf215546Sopenharmony_ci   struct x86_reg dataXMM2 = x86_make_reg(file_XMM, 1);
401bf215546Sopenharmony_ci   struct x86_reg dataGPR = p->tmp_EAX;
402bf215546Sopenharmony_ci   struct x86_reg dataGPR2 = p->tmp2_EDX;
403bf215546Sopenharmony_ci
404bf215546Sopenharmony_ci   if (size < 8) {
405bf215546Sopenharmony_ci      switch (size) {
406bf215546Sopenharmony_ci      case 1:
407bf215546Sopenharmony_ci         x86_mov8(p->func, dataGPR, src);
408bf215546Sopenharmony_ci         x86_mov8(p->func, dst, dataGPR);
409bf215546Sopenharmony_ci         break;
410bf215546Sopenharmony_ci      case 2:
411bf215546Sopenharmony_ci         x86_mov16(p->func, dataGPR, src);
412bf215546Sopenharmony_ci         x86_mov16(p->func, dst, dataGPR);
413bf215546Sopenharmony_ci         break;
414bf215546Sopenharmony_ci      case 3:
415bf215546Sopenharmony_ci         x86_mov16(p->func, dataGPR, src);
416bf215546Sopenharmony_ci         x86_mov8(p->func, dataGPR2, x86_make_disp(src, 2));
417bf215546Sopenharmony_ci         x86_mov16(p->func, dst, dataGPR);
418bf215546Sopenharmony_ci         x86_mov8(p->func, x86_make_disp(dst, 2), dataGPR2);
419bf215546Sopenharmony_ci         break;
420bf215546Sopenharmony_ci      case 4:
421bf215546Sopenharmony_ci         x86_mov(p->func, dataGPR, src);
422bf215546Sopenharmony_ci         x86_mov(p->func, dst, dataGPR);
423bf215546Sopenharmony_ci         break;
424bf215546Sopenharmony_ci      case 6:
425bf215546Sopenharmony_ci         x86_mov(p->func, dataGPR, src);
426bf215546Sopenharmony_ci         x86_mov16(p->func, dataGPR2, x86_make_disp(src, 4));
427bf215546Sopenharmony_ci         x86_mov(p->func, dst, dataGPR);
428bf215546Sopenharmony_ci         x86_mov16(p->func, x86_make_disp(dst, 4), dataGPR2);
429bf215546Sopenharmony_ci         break;
430bf215546Sopenharmony_ci      }
431bf215546Sopenharmony_ci   }
432bf215546Sopenharmony_ci   else if (!(x86_target_caps(p->func) & X86_SSE)) {
433bf215546Sopenharmony_ci      unsigned i = 0;
434bf215546Sopenharmony_ci      assert((size & 3) == 0);
435bf215546Sopenharmony_ci      for (i = 0; i < size; i += 4) {
436bf215546Sopenharmony_ci         x86_mov(p->func, dataGPR, x86_make_disp(src, i));
437bf215546Sopenharmony_ci         x86_mov(p->func, x86_make_disp(dst, i), dataGPR);
438bf215546Sopenharmony_ci      }
439bf215546Sopenharmony_ci   }
440bf215546Sopenharmony_ci   else {
441bf215546Sopenharmony_ci      switch (size) {
442bf215546Sopenharmony_ci      case 8:
443bf215546Sopenharmony_ci         emit_load64(p, dataGPR, dataXMM, src);
444bf215546Sopenharmony_ci         emit_store64(p, dst, dataGPR, dataXMM);
445bf215546Sopenharmony_ci         break;
446bf215546Sopenharmony_ci      case 12:
447bf215546Sopenharmony_ci         emit_load64(p, dataGPR2, dataXMM, src);
448bf215546Sopenharmony_ci         x86_mov(p->func, dataGPR, x86_make_disp(src, 8));
449bf215546Sopenharmony_ci         emit_store64(p, dst, dataGPR2, dataXMM);
450bf215546Sopenharmony_ci         x86_mov(p->func, x86_make_disp(dst, 8), dataGPR);
451bf215546Sopenharmony_ci         break;
452bf215546Sopenharmony_ci      case 16:
453bf215546Sopenharmony_ci         emit_mov128(p, dataXMM, src);
454bf215546Sopenharmony_ci         emit_mov128(p, dst, dataXMM);
455bf215546Sopenharmony_ci         break;
456bf215546Sopenharmony_ci      case 24:
457bf215546Sopenharmony_ci         emit_mov128(p, dataXMM, src);
458bf215546Sopenharmony_ci         emit_load64(p, dataGPR, dataXMM2, x86_make_disp(src, 16));
459bf215546Sopenharmony_ci         emit_mov128(p, dst, dataXMM);
460bf215546Sopenharmony_ci         emit_store64(p, x86_make_disp(dst, 16), dataGPR, dataXMM2);
461bf215546Sopenharmony_ci         break;
462bf215546Sopenharmony_ci      case 32:
463bf215546Sopenharmony_ci         emit_mov128(p, dataXMM, src);
464bf215546Sopenharmony_ci         emit_mov128(p, dataXMM2, x86_make_disp(src, 16));
465bf215546Sopenharmony_ci         emit_mov128(p, dst, dataXMM);
466bf215546Sopenharmony_ci         emit_mov128(p, x86_make_disp(dst, 16), dataXMM2);
467bf215546Sopenharmony_ci         break;
468bf215546Sopenharmony_ci      default:
469bf215546Sopenharmony_ci         assert(0);
470bf215546Sopenharmony_ci      }
471bf215546Sopenharmony_ci   }
472bf215546Sopenharmony_ci}
473bf215546Sopenharmony_ci
474bf215546Sopenharmony_cistatic boolean
475bf215546Sopenharmony_citranslate_attr_convert(struct translate_sse *p,
476bf215546Sopenharmony_ci                       const struct translate_element *a,
477bf215546Sopenharmony_ci                       struct x86_reg src, struct x86_reg dst)
478bf215546Sopenharmony_ci{
479bf215546Sopenharmony_ci   const struct util_format_description *input_desc =
480bf215546Sopenharmony_ci      util_format_description(a->input_format);
481bf215546Sopenharmony_ci   const struct util_format_description *output_desc =
482bf215546Sopenharmony_ci      util_format_description(a->output_format);
483bf215546Sopenharmony_ci   unsigned i;
484bf215546Sopenharmony_ci   boolean id_swizzle = TRUE;
485bf215546Sopenharmony_ci   unsigned swizzle[4] =
486bf215546Sopenharmony_ci      { PIPE_SWIZZLE_NONE, PIPE_SWIZZLE_NONE,
487bf215546Sopenharmony_ci        PIPE_SWIZZLE_NONE, PIPE_SWIZZLE_NONE };
488bf215546Sopenharmony_ci   unsigned needed_chans = 0;
489bf215546Sopenharmony_ci   unsigned imms[2] = { 0, 0x3f800000 };
490bf215546Sopenharmony_ci
491bf215546Sopenharmony_ci   if (a->output_format == PIPE_FORMAT_NONE
492bf215546Sopenharmony_ci       || a->input_format == PIPE_FORMAT_NONE)
493bf215546Sopenharmony_ci      return FALSE;
494bf215546Sopenharmony_ci
495bf215546Sopenharmony_ci   if (input_desc->channel[0].size & 7)
496bf215546Sopenharmony_ci      return FALSE;
497bf215546Sopenharmony_ci
498bf215546Sopenharmony_ci   if (input_desc->colorspace != output_desc->colorspace)
499bf215546Sopenharmony_ci      return FALSE;
500bf215546Sopenharmony_ci
501bf215546Sopenharmony_ci   for (i = 1; i < input_desc->nr_channels; ++i) {
502bf215546Sopenharmony_ci      if (memcmp
503bf215546Sopenharmony_ci          (&input_desc->channel[i], &input_desc->channel[0],
504bf215546Sopenharmony_ci           sizeof(input_desc->channel[0])))
505bf215546Sopenharmony_ci         return FALSE;
506bf215546Sopenharmony_ci   }
507bf215546Sopenharmony_ci
508bf215546Sopenharmony_ci   for (i = 1; i < output_desc->nr_channels; ++i) {
509bf215546Sopenharmony_ci      if (memcmp
510bf215546Sopenharmony_ci          (&output_desc->channel[i], &output_desc->channel[0],
511bf215546Sopenharmony_ci           sizeof(output_desc->channel[0]))) {
512bf215546Sopenharmony_ci         return FALSE;
513bf215546Sopenharmony_ci      }
514bf215546Sopenharmony_ci   }
515bf215546Sopenharmony_ci
516bf215546Sopenharmony_ci   for (i = 0; i < output_desc->nr_channels; ++i) {
517bf215546Sopenharmony_ci      if (output_desc->swizzle[i] < 4)
518bf215546Sopenharmony_ci         swizzle[output_desc->swizzle[i]] = input_desc->swizzle[i];
519bf215546Sopenharmony_ci   }
520bf215546Sopenharmony_ci
521bf215546Sopenharmony_ci   if ((x86_target_caps(p->func) & X86_SSE) &&
522bf215546Sopenharmony_ci       (0 || a->output_format == PIPE_FORMAT_R32_FLOAT
523bf215546Sopenharmony_ci        || a->output_format == PIPE_FORMAT_R32G32_FLOAT
524bf215546Sopenharmony_ci        || a->output_format == PIPE_FORMAT_R32G32B32_FLOAT
525bf215546Sopenharmony_ci        || a->output_format == PIPE_FORMAT_R32G32B32A32_FLOAT)) {
526bf215546Sopenharmony_ci      struct x86_reg dataXMM = x86_make_reg(file_XMM, 0);
527bf215546Sopenharmony_ci      struct x86_reg auxXMM;
528bf215546Sopenharmony_ci
529bf215546Sopenharmony_ci      for (i = 0; i < output_desc->nr_channels; ++i) {
530bf215546Sopenharmony_ci         if (swizzle[i] == PIPE_SWIZZLE_0
531bf215546Sopenharmony_ci             && i >= input_desc->nr_channels)
532bf215546Sopenharmony_ci            swizzle[i] = i;
533bf215546Sopenharmony_ci      }
534bf215546Sopenharmony_ci
535bf215546Sopenharmony_ci      for (i = 0; i < output_desc->nr_channels; ++i) {
536bf215546Sopenharmony_ci         if (swizzle[i] < 4)
537bf215546Sopenharmony_ci            needed_chans = MAX2(needed_chans, swizzle[i] + 1);
538bf215546Sopenharmony_ci         if (swizzle[i] < PIPE_SWIZZLE_0 && swizzle[i] != i)
539bf215546Sopenharmony_ci            id_swizzle = FALSE;
540bf215546Sopenharmony_ci      }
541bf215546Sopenharmony_ci
542bf215546Sopenharmony_ci      if (needed_chans > 0) {
543bf215546Sopenharmony_ci         switch (input_desc->channel[0].type) {
544bf215546Sopenharmony_ci         case UTIL_FORMAT_TYPE_UNSIGNED:
545bf215546Sopenharmony_ci            if (!(x86_target_caps(p->func) & X86_SSE2))
546bf215546Sopenharmony_ci               return FALSE;
547bf215546Sopenharmony_ci            emit_load_sse2(p, dataXMM, src,
548bf215546Sopenharmony_ci                           input_desc->channel[0].size *
549bf215546Sopenharmony_ci                           input_desc->nr_channels >> 3);
550bf215546Sopenharmony_ci
551bf215546Sopenharmony_ci            /* TODO: add support for SSE4.1 pmovzx */
552bf215546Sopenharmony_ci            switch (input_desc->channel[0].size) {
553bf215546Sopenharmony_ci            case 8:
554bf215546Sopenharmony_ci               /* TODO: this may be inefficient due to get_identity() being
555bf215546Sopenharmony_ci                *  used both as a float and integer register.
556bf215546Sopenharmony_ci                */
557bf215546Sopenharmony_ci               sse2_punpcklbw(p->func, dataXMM, get_const(p, CONST_IDENTITY));
558bf215546Sopenharmony_ci               sse2_punpcklbw(p->func, dataXMM, get_const(p, CONST_IDENTITY));
559bf215546Sopenharmony_ci               break;
560bf215546Sopenharmony_ci            case 16:
561bf215546Sopenharmony_ci               sse2_punpcklwd(p->func, dataXMM, get_const(p, CONST_IDENTITY));
562bf215546Sopenharmony_ci               break;
563bf215546Sopenharmony_ci            case 32:           /* we lose precision here */
564bf215546Sopenharmony_ci               /* No unsigned conversion (except in AVX512F), so we check if
565bf215546Sopenharmony_ci                * it's negative, and stick the high bit as a separate float
566bf215546Sopenharmony_ci                * value in an aux register: */
567bf215546Sopenharmony_ci               auxXMM = x86_make_reg(file_XMM, 1);
568bf215546Sopenharmony_ci               /* aux = 0 */
569bf215546Sopenharmony_ci               sse_xorps(p->func, auxXMM, auxXMM);
570bf215546Sopenharmony_ci               /* aux = aux > data ? 0xffffffff : 0 */
571bf215546Sopenharmony_ci               sse2_pcmpgtd(p->func, auxXMM, dataXMM);
572bf215546Sopenharmony_ci               /* data = data & 0x7fffffff */
573bf215546Sopenharmony_ci               sse_andps(p->func, dataXMM, get_const(p, CONST_2147483647_INT));
574bf215546Sopenharmony_ci               /* aux = aux & 2147483648.0 */
575bf215546Sopenharmony_ci               sse_andps(p->func, auxXMM, get_const(p, CONST_2147483648));
576bf215546Sopenharmony_ci               break;
577bf215546Sopenharmony_ci            default:
578bf215546Sopenharmony_ci               return FALSE;
579bf215546Sopenharmony_ci            }
580bf215546Sopenharmony_ci            sse2_cvtdq2ps(p->func, dataXMM, dataXMM);
581bf215546Sopenharmony_ci            if (input_desc->channel[0].size == 32)
582bf215546Sopenharmony_ci               /* add in the high bit's worth of float that we AND'd away */
583bf215546Sopenharmony_ci               sse_addps(p->func, dataXMM, auxXMM);
584bf215546Sopenharmony_ci            if (input_desc->channel[0].normalized) {
585bf215546Sopenharmony_ci               struct x86_reg factor;
586bf215546Sopenharmony_ci               switch (input_desc->channel[0].size) {
587bf215546Sopenharmony_ci               case 8:
588bf215546Sopenharmony_ci                  factor = get_const(p, CONST_INV_255);
589bf215546Sopenharmony_ci                  break;
590bf215546Sopenharmony_ci               case 16:
591bf215546Sopenharmony_ci                  factor = get_const(p, CONST_INV_65535);
592bf215546Sopenharmony_ci                  break;
593bf215546Sopenharmony_ci               case 32:
594bf215546Sopenharmony_ci                  factor = get_const(p, CONST_INV_4294967295);
595bf215546Sopenharmony_ci                  break;
596bf215546Sopenharmony_ci               default:
597bf215546Sopenharmony_ci                  assert(0);
598bf215546Sopenharmony_ci                  factor.disp = 0;
599bf215546Sopenharmony_ci                  factor.file = 0;
600bf215546Sopenharmony_ci                  factor.idx = 0;
601bf215546Sopenharmony_ci                  factor.mod = 0;
602bf215546Sopenharmony_ci                  break;
603bf215546Sopenharmony_ci               }
604bf215546Sopenharmony_ci               sse_mulps(p->func, dataXMM, factor);
605bf215546Sopenharmony_ci            }
606bf215546Sopenharmony_ci            break;
607bf215546Sopenharmony_ci         case UTIL_FORMAT_TYPE_SIGNED:
608bf215546Sopenharmony_ci            if (!(x86_target_caps(p->func) & X86_SSE2))
609bf215546Sopenharmony_ci               return FALSE;
610bf215546Sopenharmony_ci            emit_load_sse2(p, dataXMM, src,
611bf215546Sopenharmony_ci                           input_desc->channel[0].size *
612bf215546Sopenharmony_ci                           input_desc->nr_channels >> 3);
613bf215546Sopenharmony_ci
614bf215546Sopenharmony_ci            /* TODO: add support for SSE4.1 pmovsx */
615bf215546Sopenharmony_ci            switch (input_desc->channel[0].size) {
616bf215546Sopenharmony_ci            case 8:
617bf215546Sopenharmony_ci               sse2_punpcklbw(p->func, dataXMM, dataXMM);
618bf215546Sopenharmony_ci               sse2_punpcklbw(p->func, dataXMM, dataXMM);
619bf215546Sopenharmony_ci               sse2_psrad_imm(p->func, dataXMM, 24);
620bf215546Sopenharmony_ci               break;
621bf215546Sopenharmony_ci            case 16:
622bf215546Sopenharmony_ci               sse2_punpcklwd(p->func, dataXMM, dataXMM);
623bf215546Sopenharmony_ci               sse2_psrad_imm(p->func, dataXMM, 16);
624bf215546Sopenharmony_ci               break;
625bf215546Sopenharmony_ci            case 32:           /* we lose precision here */
626bf215546Sopenharmony_ci               break;
627bf215546Sopenharmony_ci            default:
628bf215546Sopenharmony_ci               return FALSE;
629bf215546Sopenharmony_ci            }
630bf215546Sopenharmony_ci            sse2_cvtdq2ps(p->func, dataXMM, dataXMM);
631bf215546Sopenharmony_ci            if (input_desc->channel[0].normalized) {
632bf215546Sopenharmony_ci               struct x86_reg factor;
633bf215546Sopenharmony_ci               switch (input_desc->channel[0].size) {
634bf215546Sopenharmony_ci               case 8:
635bf215546Sopenharmony_ci                  factor = get_const(p, CONST_INV_127);
636bf215546Sopenharmony_ci                  break;
637bf215546Sopenharmony_ci               case 16:
638bf215546Sopenharmony_ci                  factor = get_const(p, CONST_INV_32767);
639bf215546Sopenharmony_ci                  break;
640bf215546Sopenharmony_ci               case 32:
641bf215546Sopenharmony_ci                  factor = get_const(p, CONST_INV_2147483647);
642bf215546Sopenharmony_ci                  break;
643bf215546Sopenharmony_ci               default:
644bf215546Sopenharmony_ci                  assert(0);
645bf215546Sopenharmony_ci                  factor.disp = 0;
646bf215546Sopenharmony_ci                  factor.file = 0;
647bf215546Sopenharmony_ci                  factor.idx = 0;
648bf215546Sopenharmony_ci                  factor.mod = 0;
649bf215546Sopenharmony_ci                  break;
650bf215546Sopenharmony_ci               }
651bf215546Sopenharmony_ci               sse_mulps(p->func, dataXMM, factor);
652bf215546Sopenharmony_ci            }
653bf215546Sopenharmony_ci            break;
654bf215546Sopenharmony_ci
655bf215546Sopenharmony_ci            break;
656bf215546Sopenharmony_ci         case UTIL_FORMAT_TYPE_FLOAT:
657bf215546Sopenharmony_ci            if (input_desc->channel[0].size != 32
658bf215546Sopenharmony_ci                && input_desc->channel[0].size != 64) {
659bf215546Sopenharmony_ci               return FALSE;
660bf215546Sopenharmony_ci            }
661bf215546Sopenharmony_ci            if (swizzle[3] == PIPE_SWIZZLE_1
662bf215546Sopenharmony_ci                && input_desc->nr_channels <= 3) {
663bf215546Sopenharmony_ci               swizzle[3] = PIPE_SWIZZLE_W;
664bf215546Sopenharmony_ci               needed_chans = CHANNELS_0001;
665bf215546Sopenharmony_ci            }
666bf215546Sopenharmony_ci            switch (input_desc->channel[0].size) {
667bf215546Sopenharmony_ci            case 32:
668bf215546Sopenharmony_ci               emit_load_float32(p, dataXMM, src, needed_chans,
669bf215546Sopenharmony_ci                                 input_desc->nr_channels);
670bf215546Sopenharmony_ci               break;
671bf215546Sopenharmony_ci            case 64:           /* we lose precision here */
672bf215546Sopenharmony_ci               if (!(x86_target_caps(p->func) & X86_SSE2))
673bf215546Sopenharmony_ci                  return FALSE;
674bf215546Sopenharmony_ci               emit_load_float64to32(p, dataXMM, src, needed_chans,
675bf215546Sopenharmony_ci                                     input_desc->nr_channels);
676bf215546Sopenharmony_ci               break;
677bf215546Sopenharmony_ci            default:
678bf215546Sopenharmony_ci               return FALSE;
679bf215546Sopenharmony_ci            }
680bf215546Sopenharmony_ci            break;
681bf215546Sopenharmony_ci         default:
682bf215546Sopenharmony_ci            return FALSE;
683bf215546Sopenharmony_ci         }
684bf215546Sopenharmony_ci
685bf215546Sopenharmony_ci         if (!id_swizzle) {
686bf215546Sopenharmony_ci            sse_shufps(p->func, dataXMM, dataXMM,
687bf215546Sopenharmony_ci                       SHUF(swizzle[0], swizzle[1], swizzle[2], swizzle[3]));
688bf215546Sopenharmony_ci         }
689bf215546Sopenharmony_ci      }
690bf215546Sopenharmony_ci
691bf215546Sopenharmony_ci      if (output_desc->nr_channels >= 4
692bf215546Sopenharmony_ci          && swizzle[0] < PIPE_SWIZZLE_0
693bf215546Sopenharmony_ci          && swizzle[1] < PIPE_SWIZZLE_0
694bf215546Sopenharmony_ci          && swizzle[2] < PIPE_SWIZZLE_0
695bf215546Sopenharmony_ci          && swizzle[3] < PIPE_SWIZZLE_0) {
696bf215546Sopenharmony_ci         sse_movups(p->func, dst, dataXMM);
697bf215546Sopenharmony_ci      }
698bf215546Sopenharmony_ci      else {
699bf215546Sopenharmony_ci         if (output_desc->nr_channels >= 2
700bf215546Sopenharmony_ci             && swizzle[0] < PIPE_SWIZZLE_0
701bf215546Sopenharmony_ci             && swizzle[1] < PIPE_SWIZZLE_0) {
702bf215546Sopenharmony_ci            sse_movlps(p->func, dst, dataXMM);
703bf215546Sopenharmony_ci         }
704bf215546Sopenharmony_ci         else {
705bf215546Sopenharmony_ci            if (swizzle[0] < PIPE_SWIZZLE_0) {
706bf215546Sopenharmony_ci               sse_movss(p->func, dst, dataXMM);
707bf215546Sopenharmony_ci            }
708bf215546Sopenharmony_ci            else {
709bf215546Sopenharmony_ci               x86_mov_imm(p->func, dst,
710bf215546Sopenharmony_ci                           imms[swizzle[0] - PIPE_SWIZZLE_0]);
711bf215546Sopenharmony_ci            }
712bf215546Sopenharmony_ci
713bf215546Sopenharmony_ci            if (output_desc->nr_channels >= 2) {
714bf215546Sopenharmony_ci               if (swizzle[1] < PIPE_SWIZZLE_0) {
715bf215546Sopenharmony_ci                  sse_shufps(p->func, dataXMM, dataXMM, SHUF(1, 1, 2, 3));
716bf215546Sopenharmony_ci                  sse_movss(p->func, x86_make_disp(dst, 4), dataXMM);
717bf215546Sopenharmony_ci               }
718bf215546Sopenharmony_ci               else {
719bf215546Sopenharmony_ci                  x86_mov_imm(p->func, x86_make_disp(dst, 4),
720bf215546Sopenharmony_ci                              imms[swizzle[1] - PIPE_SWIZZLE_0]);
721bf215546Sopenharmony_ci               }
722bf215546Sopenharmony_ci            }
723bf215546Sopenharmony_ci         }
724bf215546Sopenharmony_ci
725bf215546Sopenharmony_ci         if (output_desc->nr_channels >= 3) {
726bf215546Sopenharmony_ci            if (output_desc->nr_channels >= 4
727bf215546Sopenharmony_ci                && swizzle[2] < PIPE_SWIZZLE_0
728bf215546Sopenharmony_ci                && swizzle[3] < PIPE_SWIZZLE_0) {
729bf215546Sopenharmony_ci               sse_movhps(p->func, x86_make_disp(dst, 8), dataXMM);
730bf215546Sopenharmony_ci            }
731bf215546Sopenharmony_ci            else {
732bf215546Sopenharmony_ci               if (swizzle[2] < PIPE_SWIZZLE_0) {
733bf215546Sopenharmony_ci                  sse_shufps(p->func, dataXMM, dataXMM, SHUF(2, 2, 2, 3));
734bf215546Sopenharmony_ci                  sse_movss(p->func, x86_make_disp(dst, 8), dataXMM);
735bf215546Sopenharmony_ci               }
736bf215546Sopenharmony_ci               else {
737bf215546Sopenharmony_ci                  x86_mov_imm(p->func, x86_make_disp(dst, 8),
738bf215546Sopenharmony_ci                              imms[swizzle[2] - PIPE_SWIZZLE_0]);
739bf215546Sopenharmony_ci               }
740bf215546Sopenharmony_ci
741bf215546Sopenharmony_ci               if (output_desc->nr_channels >= 4) {
742bf215546Sopenharmony_ci                  if (swizzle[3] < PIPE_SWIZZLE_0) {
743bf215546Sopenharmony_ci                     sse_shufps(p->func, dataXMM, dataXMM, SHUF(3, 3, 3, 3));
744bf215546Sopenharmony_ci                     sse_movss(p->func, x86_make_disp(dst, 12), dataXMM);
745bf215546Sopenharmony_ci                  }
746bf215546Sopenharmony_ci                  else {
747bf215546Sopenharmony_ci                     x86_mov_imm(p->func, x86_make_disp(dst, 12),
748bf215546Sopenharmony_ci                                 imms[swizzle[3] - PIPE_SWIZZLE_0]);
749bf215546Sopenharmony_ci                  }
750bf215546Sopenharmony_ci               }
751bf215546Sopenharmony_ci            }
752bf215546Sopenharmony_ci         }
753bf215546Sopenharmony_ci      }
754bf215546Sopenharmony_ci      return TRUE;
755bf215546Sopenharmony_ci   }
756bf215546Sopenharmony_ci   else if ((x86_target_caps(p->func) & X86_SSE2)
757bf215546Sopenharmony_ci            && input_desc->channel[0].size == 8
758bf215546Sopenharmony_ci            && output_desc->channel[0].size == 16
759bf215546Sopenharmony_ci            && output_desc->channel[0].normalized ==
760bf215546Sopenharmony_ci            input_desc->channel[0].normalized &&
761bf215546Sopenharmony_ci            (0 || (input_desc->channel[0].type == UTIL_FORMAT_TYPE_UNSIGNED
762bf215546Sopenharmony_ci                   && output_desc->channel[0].type == UTIL_FORMAT_TYPE_UNSIGNED)
763bf215546Sopenharmony_ci             || (input_desc->channel[0].type == UTIL_FORMAT_TYPE_UNSIGNED
764bf215546Sopenharmony_ci                 && output_desc->channel[0].type == UTIL_FORMAT_TYPE_SIGNED)
765bf215546Sopenharmony_ci             || (input_desc->channel[0].type == UTIL_FORMAT_TYPE_SIGNED
766bf215546Sopenharmony_ci                 && output_desc->channel[0].type == UTIL_FORMAT_TYPE_SIGNED))) {
767bf215546Sopenharmony_ci      struct x86_reg dataXMM = x86_make_reg(file_XMM, 0);
768bf215546Sopenharmony_ci      struct x86_reg tmpXMM = x86_make_reg(file_XMM, 1);
769bf215546Sopenharmony_ci      struct x86_reg tmp = p->tmp_EAX;
770bf215546Sopenharmony_ci      unsigned imms[2] = { 0, 1 };
771bf215546Sopenharmony_ci
772bf215546Sopenharmony_ci      for (i = 0; i < output_desc->nr_channels; ++i) {
773bf215546Sopenharmony_ci         if (swizzle[i] == PIPE_SWIZZLE_0
774bf215546Sopenharmony_ci             && i >= input_desc->nr_channels) {
775bf215546Sopenharmony_ci            swizzle[i] = i;
776bf215546Sopenharmony_ci         }
777bf215546Sopenharmony_ci      }
778bf215546Sopenharmony_ci
779bf215546Sopenharmony_ci      for (i = 0; i < output_desc->nr_channels; ++i) {
780bf215546Sopenharmony_ci         if (swizzle[i] < 4)
781bf215546Sopenharmony_ci            needed_chans = MAX2(needed_chans, swizzle[i] + 1);
782bf215546Sopenharmony_ci         if (swizzle[i] < PIPE_SWIZZLE_0 && swizzle[i] != i)
783bf215546Sopenharmony_ci            id_swizzle = FALSE;
784bf215546Sopenharmony_ci      }
785bf215546Sopenharmony_ci
786bf215546Sopenharmony_ci      if (needed_chans > 0) {
787bf215546Sopenharmony_ci         emit_load_sse2(p, dataXMM, src,
788bf215546Sopenharmony_ci                        input_desc->channel[0].size *
789bf215546Sopenharmony_ci                        input_desc->nr_channels >> 3);
790bf215546Sopenharmony_ci
791bf215546Sopenharmony_ci         switch (input_desc->channel[0].type) {
792bf215546Sopenharmony_ci         case UTIL_FORMAT_TYPE_UNSIGNED:
793bf215546Sopenharmony_ci            if (input_desc->channel[0].normalized) {
794bf215546Sopenharmony_ci               sse2_punpcklbw(p->func, dataXMM, dataXMM);
795bf215546Sopenharmony_ci               if (output_desc->channel[0].type == UTIL_FORMAT_TYPE_SIGNED)
796bf215546Sopenharmony_ci                  sse2_psrlw_imm(p->func, dataXMM, 1);
797bf215546Sopenharmony_ci            }
798bf215546Sopenharmony_ci            else
799bf215546Sopenharmony_ci               sse2_punpcklbw(p->func, dataXMM, get_const(p, CONST_IDENTITY));
800bf215546Sopenharmony_ci            break;
801bf215546Sopenharmony_ci         case UTIL_FORMAT_TYPE_SIGNED:
802bf215546Sopenharmony_ci            if (input_desc->channel[0].normalized) {
803bf215546Sopenharmony_ci               sse2_movq(p->func, tmpXMM, get_const(p, CONST_IDENTITY));
804bf215546Sopenharmony_ci               sse2_punpcklbw(p->func, tmpXMM, dataXMM);
805bf215546Sopenharmony_ci               sse2_psllw_imm(p->func, dataXMM, 9);
806bf215546Sopenharmony_ci               sse2_psrlw_imm(p->func, dataXMM, 8);
807bf215546Sopenharmony_ci               sse2_por(p->func, tmpXMM, dataXMM);
808bf215546Sopenharmony_ci               sse2_psrlw_imm(p->func, dataXMM, 7);
809bf215546Sopenharmony_ci               sse2_por(p->func, tmpXMM, dataXMM);
810bf215546Sopenharmony_ci               {
811bf215546Sopenharmony_ci                  struct x86_reg t = dataXMM;
812bf215546Sopenharmony_ci                  dataXMM = tmpXMM;
813bf215546Sopenharmony_ci                  tmpXMM = t;
814bf215546Sopenharmony_ci               }
815bf215546Sopenharmony_ci            }
816bf215546Sopenharmony_ci            else {
817bf215546Sopenharmony_ci               sse2_punpcklbw(p->func, dataXMM, dataXMM);
818bf215546Sopenharmony_ci               sse2_psraw_imm(p->func, dataXMM, 8);
819bf215546Sopenharmony_ci            }
820bf215546Sopenharmony_ci            break;
821bf215546Sopenharmony_ci         default:
822bf215546Sopenharmony_ci            assert(0);
823bf215546Sopenharmony_ci         }
824bf215546Sopenharmony_ci
825bf215546Sopenharmony_ci         if (output_desc->channel[0].normalized)
826bf215546Sopenharmony_ci            imms[1] =
827bf215546Sopenharmony_ci               (output_desc->channel[0].type ==
828bf215546Sopenharmony_ci                UTIL_FORMAT_TYPE_UNSIGNED) ? 0xffff : 0x7ffff;
829bf215546Sopenharmony_ci
830bf215546Sopenharmony_ci         if (!id_swizzle)
831bf215546Sopenharmony_ci            sse2_pshuflw(p->func, dataXMM, dataXMM,
832bf215546Sopenharmony_ci                         (swizzle[0] & 3) | ((swizzle[1] & 3) << 2) |
833bf215546Sopenharmony_ci                         ((swizzle[2] & 3) << 4) | ((swizzle[3] & 3) << 6));
834bf215546Sopenharmony_ci      }
835bf215546Sopenharmony_ci
836bf215546Sopenharmony_ci      if (output_desc->nr_channels >= 4
837bf215546Sopenharmony_ci          && swizzle[0] < PIPE_SWIZZLE_0
838bf215546Sopenharmony_ci          && swizzle[1] < PIPE_SWIZZLE_0
839bf215546Sopenharmony_ci          && swizzle[2] < PIPE_SWIZZLE_0
840bf215546Sopenharmony_ci          && swizzle[3] < PIPE_SWIZZLE_0) {
841bf215546Sopenharmony_ci         sse2_movq(p->func, dst, dataXMM);
842bf215546Sopenharmony_ci      }
843bf215546Sopenharmony_ci      else {
844bf215546Sopenharmony_ci         if (swizzle[0] < PIPE_SWIZZLE_0) {
845bf215546Sopenharmony_ci            if (output_desc->nr_channels >= 2
846bf215546Sopenharmony_ci                && swizzle[1] < PIPE_SWIZZLE_0) {
847bf215546Sopenharmony_ci               sse2_movd(p->func, dst, dataXMM);
848bf215546Sopenharmony_ci            }
849bf215546Sopenharmony_ci            else {
850bf215546Sopenharmony_ci               sse2_movd(p->func, tmp, dataXMM);
851bf215546Sopenharmony_ci               x86_mov16(p->func, dst, tmp);
852bf215546Sopenharmony_ci               if (output_desc->nr_channels >= 2)
853bf215546Sopenharmony_ci                  x86_mov16_imm(p->func, x86_make_disp(dst, 2),
854bf215546Sopenharmony_ci                                imms[swizzle[1] - PIPE_SWIZZLE_0]);
855bf215546Sopenharmony_ci            }
856bf215546Sopenharmony_ci         }
857bf215546Sopenharmony_ci         else {
858bf215546Sopenharmony_ci            if (output_desc->nr_channels >= 2
859bf215546Sopenharmony_ci                && swizzle[1] >= PIPE_SWIZZLE_0) {
860bf215546Sopenharmony_ci               x86_mov_imm(p->func, dst,
861bf215546Sopenharmony_ci                           (imms[swizzle[1] - PIPE_SWIZZLE_0] << 16) |
862bf215546Sopenharmony_ci                           imms[swizzle[0] - PIPE_SWIZZLE_0]);
863bf215546Sopenharmony_ci            }
864bf215546Sopenharmony_ci            else {
865bf215546Sopenharmony_ci               x86_mov16_imm(p->func, dst,
866bf215546Sopenharmony_ci                             imms[swizzle[0] - PIPE_SWIZZLE_0]);
867bf215546Sopenharmony_ci               if (output_desc->nr_channels >= 2) {
868bf215546Sopenharmony_ci                  sse2_movd(p->func, tmp, dataXMM);
869bf215546Sopenharmony_ci                  x86_shr_imm(p->func, tmp, 16);
870bf215546Sopenharmony_ci                  x86_mov16(p->func, x86_make_disp(dst, 2), tmp);
871bf215546Sopenharmony_ci               }
872bf215546Sopenharmony_ci            }
873bf215546Sopenharmony_ci         }
874bf215546Sopenharmony_ci
875bf215546Sopenharmony_ci         if (output_desc->nr_channels >= 3) {
876bf215546Sopenharmony_ci            if (swizzle[2] < PIPE_SWIZZLE_0) {
877bf215546Sopenharmony_ci               if (output_desc->nr_channels >= 4
878bf215546Sopenharmony_ci                   && swizzle[3] < PIPE_SWIZZLE_0) {
879bf215546Sopenharmony_ci                  sse2_psrlq_imm(p->func, dataXMM, 32);
880bf215546Sopenharmony_ci                  sse2_movd(p->func, x86_make_disp(dst, 4), dataXMM);
881bf215546Sopenharmony_ci               }
882bf215546Sopenharmony_ci               else {
883bf215546Sopenharmony_ci                  sse2_psrlq_imm(p->func, dataXMM, 32);
884bf215546Sopenharmony_ci                  sse2_movd(p->func, tmp, dataXMM);
885bf215546Sopenharmony_ci                  x86_mov16(p->func, x86_make_disp(dst, 4), tmp);
886bf215546Sopenharmony_ci                  if (output_desc->nr_channels >= 4) {
887bf215546Sopenharmony_ci                     x86_mov16_imm(p->func, x86_make_disp(dst, 6),
888bf215546Sopenharmony_ci                                   imms[swizzle[3] - PIPE_SWIZZLE_0]);
889bf215546Sopenharmony_ci                  }
890bf215546Sopenharmony_ci               }
891bf215546Sopenharmony_ci            }
892bf215546Sopenharmony_ci            else {
893bf215546Sopenharmony_ci               if (output_desc->nr_channels >= 4
894bf215546Sopenharmony_ci                   && swizzle[3] >= PIPE_SWIZZLE_0) {
895bf215546Sopenharmony_ci                  x86_mov_imm(p->func, x86_make_disp(dst, 4),
896bf215546Sopenharmony_ci                              (imms[swizzle[3] - PIPE_SWIZZLE_0] << 16)
897bf215546Sopenharmony_ci                              | imms[swizzle[2] - PIPE_SWIZZLE_0]);
898bf215546Sopenharmony_ci               }
899bf215546Sopenharmony_ci               else {
900bf215546Sopenharmony_ci                  x86_mov16_imm(p->func, x86_make_disp(dst, 4),
901bf215546Sopenharmony_ci                                imms[swizzle[2] - PIPE_SWIZZLE_0]);
902bf215546Sopenharmony_ci
903bf215546Sopenharmony_ci                  if (output_desc->nr_channels >= 4) {
904bf215546Sopenharmony_ci                     sse2_psrlq_imm(p->func, dataXMM, 48);
905bf215546Sopenharmony_ci                     sse2_movd(p->func, tmp, dataXMM);
906bf215546Sopenharmony_ci                     x86_mov16(p->func, x86_make_disp(dst, 6), tmp);
907bf215546Sopenharmony_ci                  }
908bf215546Sopenharmony_ci               }
909bf215546Sopenharmony_ci            }
910bf215546Sopenharmony_ci         }
911bf215546Sopenharmony_ci      }
912bf215546Sopenharmony_ci      return TRUE;
913bf215546Sopenharmony_ci   }
914bf215546Sopenharmony_ci   else if (!memcmp(&output_desc->channel[0], &input_desc->channel[0],
915bf215546Sopenharmony_ci                    sizeof(output_desc->channel[0]))) {
916bf215546Sopenharmony_ci      struct x86_reg tmp = p->tmp_EAX;
917bf215546Sopenharmony_ci      unsigned i;
918bf215546Sopenharmony_ci
919bf215546Sopenharmony_ci      if (input_desc->channel[0].size == 8 && input_desc->nr_channels == 4
920bf215546Sopenharmony_ci          && output_desc->nr_channels == 4
921bf215546Sopenharmony_ci          && swizzle[0] == PIPE_SWIZZLE_W
922bf215546Sopenharmony_ci          && swizzle[1] == PIPE_SWIZZLE_Z
923bf215546Sopenharmony_ci          && swizzle[2] == PIPE_SWIZZLE_Y
924bf215546Sopenharmony_ci          && swizzle[3] == PIPE_SWIZZLE_X) {
925bf215546Sopenharmony_ci         /* TODO: support movbe */
926bf215546Sopenharmony_ci         x86_mov(p->func, tmp, src);
927bf215546Sopenharmony_ci         x86_bswap(p->func, tmp);
928bf215546Sopenharmony_ci         x86_mov(p->func, dst, tmp);
929bf215546Sopenharmony_ci         return TRUE;
930bf215546Sopenharmony_ci      }
931bf215546Sopenharmony_ci
932bf215546Sopenharmony_ci      for (i = 0; i < output_desc->nr_channels; ++i) {
933bf215546Sopenharmony_ci         switch (output_desc->channel[0].size) {
934bf215546Sopenharmony_ci         case 8:
935bf215546Sopenharmony_ci            if (swizzle[i] >= PIPE_SWIZZLE_0) {
936bf215546Sopenharmony_ci               unsigned v = 0;
937bf215546Sopenharmony_ci               if (swizzle[i] == PIPE_SWIZZLE_1) {
938bf215546Sopenharmony_ci                  switch (output_desc->channel[0].type) {
939bf215546Sopenharmony_ci                  case UTIL_FORMAT_TYPE_UNSIGNED:
940bf215546Sopenharmony_ci                     v = output_desc->channel[0].normalized ? 0xff : 1;
941bf215546Sopenharmony_ci                     break;
942bf215546Sopenharmony_ci                  case UTIL_FORMAT_TYPE_SIGNED:
943bf215546Sopenharmony_ci                     v = output_desc->channel[0].normalized ? 0x7f : 1;
944bf215546Sopenharmony_ci                     break;
945bf215546Sopenharmony_ci                  default:
946bf215546Sopenharmony_ci                     return FALSE;
947bf215546Sopenharmony_ci                  }
948bf215546Sopenharmony_ci               }
949bf215546Sopenharmony_ci               x86_mov8_imm(p->func, x86_make_disp(dst, i * 1), v);
950bf215546Sopenharmony_ci            }
951bf215546Sopenharmony_ci            else {
952bf215546Sopenharmony_ci               x86_mov8(p->func, tmp, x86_make_disp(src, swizzle[i] * 1));
953bf215546Sopenharmony_ci               x86_mov8(p->func, x86_make_disp(dst, i * 1), tmp);
954bf215546Sopenharmony_ci            }
955bf215546Sopenharmony_ci            break;
956bf215546Sopenharmony_ci         case 16:
957bf215546Sopenharmony_ci            if (swizzle[i] >= PIPE_SWIZZLE_0) {
958bf215546Sopenharmony_ci               unsigned v = 0;
959bf215546Sopenharmony_ci               if (swizzle[i] == PIPE_SWIZZLE_1) {
960bf215546Sopenharmony_ci                  switch (output_desc->channel[1].type) {
961bf215546Sopenharmony_ci                  case UTIL_FORMAT_TYPE_UNSIGNED:
962bf215546Sopenharmony_ci                     v = output_desc->channel[1].normalized ? 0xffff : 1;
963bf215546Sopenharmony_ci                     break;
964bf215546Sopenharmony_ci                  case UTIL_FORMAT_TYPE_SIGNED:
965bf215546Sopenharmony_ci                     v = output_desc->channel[1].normalized ? 0x7fff : 1;
966bf215546Sopenharmony_ci                     break;
967bf215546Sopenharmony_ci                  case UTIL_FORMAT_TYPE_FLOAT:
968bf215546Sopenharmony_ci                     v = 0x3c00;
969bf215546Sopenharmony_ci                     break;
970bf215546Sopenharmony_ci                  default:
971bf215546Sopenharmony_ci                     return FALSE;
972bf215546Sopenharmony_ci                  }
973bf215546Sopenharmony_ci               }
974bf215546Sopenharmony_ci               x86_mov16_imm(p->func, x86_make_disp(dst, i * 2), v);
975bf215546Sopenharmony_ci            }
976bf215546Sopenharmony_ci            else if (swizzle[i] == PIPE_SWIZZLE_0) {
977bf215546Sopenharmony_ci               x86_mov16_imm(p->func, x86_make_disp(dst, i * 2), 0);
978bf215546Sopenharmony_ci            }
979bf215546Sopenharmony_ci            else {
980bf215546Sopenharmony_ci               x86_mov16(p->func, tmp, x86_make_disp(src, swizzle[i] * 2));
981bf215546Sopenharmony_ci               x86_mov16(p->func, x86_make_disp(dst, i * 2), tmp);
982bf215546Sopenharmony_ci            }
983bf215546Sopenharmony_ci            break;
984bf215546Sopenharmony_ci         case 32:
985bf215546Sopenharmony_ci            if (swizzle[i] >= PIPE_SWIZZLE_0) {
986bf215546Sopenharmony_ci               unsigned v = 0;
987bf215546Sopenharmony_ci               if (swizzle[i] == PIPE_SWIZZLE_1) {
988bf215546Sopenharmony_ci                  switch (output_desc->channel[1].type) {
989bf215546Sopenharmony_ci                  case UTIL_FORMAT_TYPE_UNSIGNED:
990bf215546Sopenharmony_ci                     v = output_desc->channel[1].normalized ? 0xffffffff : 1;
991bf215546Sopenharmony_ci                     break;
992bf215546Sopenharmony_ci                  case UTIL_FORMAT_TYPE_SIGNED:
993bf215546Sopenharmony_ci                     v = output_desc->channel[1].normalized ? 0x7fffffff : 1;
994bf215546Sopenharmony_ci                     break;
995bf215546Sopenharmony_ci                  case UTIL_FORMAT_TYPE_FLOAT:
996bf215546Sopenharmony_ci                     v = 0x3f800000;
997bf215546Sopenharmony_ci                     break;
998bf215546Sopenharmony_ci                  default:
999bf215546Sopenharmony_ci                     return FALSE;
1000bf215546Sopenharmony_ci                  }
1001bf215546Sopenharmony_ci               }
1002bf215546Sopenharmony_ci               x86_mov_imm(p->func, x86_make_disp(dst, i * 4), v);
1003bf215546Sopenharmony_ci            }
1004bf215546Sopenharmony_ci            else {
1005bf215546Sopenharmony_ci               x86_mov(p->func, tmp, x86_make_disp(src, swizzle[i] * 4));
1006bf215546Sopenharmony_ci               x86_mov(p->func, x86_make_disp(dst, i * 4), tmp);
1007bf215546Sopenharmony_ci            }
1008bf215546Sopenharmony_ci            break;
1009bf215546Sopenharmony_ci         case 64:
1010bf215546Sopenharmony_ci            if (swizzle[i] >= PIPE_SWIZZLE_0) {
1011bf215546Sopenharmony_ci               unsigned l = 0;
1012bf215546Sopenharmony_ci               unsigned h = 0;
1013bf215546Sopenharmony_ci               if (swizzle[i] == PIPE_SWIZZLE_1) {
1014bf215546Sopenharmony_ci                  switch (output_desc->channel[1].type) {
1015bf215546Sopenharmony_ci                  case UTIL_FORMAT_TYPE_UNSIGNED:
1016bf215546Sopenharmony_ci                     h = output_desc->channel[1].normalized ? 0xffffffff : 0;
1017bf215546Sopenharmony_ci                     l = output_desc->channel[1].normalized ? 0xffffffff : 1;
1018bf215546Sopenharmony_ci                     break;
1019bf215546Sopenharmony_ci                  case UTIL_FORMAT_TYPE_SIGNED:
1020bf215546Sopenharmony_ci                     h = output_desc->channel[1].normalized ? 0x7fffffff : 0;
1021bf215546Sopenharmony_ci                     l = output_desc->channel[1].normalized ? 0xffffffff : 1;
1022bf215546Sopenharmony_ci                     break;
1023bf215546Sopenharmony_ci                  case UTIL_FORMAT_TYPE_FLOAT:
1024bf215546Sopenharmony_ci                     h = 0x3ff00000;
1025bf215546Sopenharmony_ci                     l = 0;
1026bf215546Sopenharmony_ci                     break;
1027bf215546Sopenharmony_ci                  default:
1028bf215546Sopenharmony_ci                     return FALSE;
1029bf215546Sopenharmony_ci                  }
1030bf215546Sopenharmony_ci               }
1031bf215546Sopenharmony_ci               x86_mov_imm(p->func, x86_make_disp(dst, i * 8), l);
1032bf215546Sopenharmony_ci               x86_mov_imm(p->func, x86_make_disp(dst, i * 8 + 4), h);
1033bf215546Sopenharmony_ci            }
1034bf215546Sopenharmony_ci            else {
1035bf215546Sopenharmony_ci               if (x86_target_caps(p->func) & X86_SSE) {
1036bf215546Sopenharmony_ci                  struct x86_reg tmpXMM = x86_make_reg(file_XMM, 0);
1037bf215546Sopenharmony_ci                  emit_load64(p, tmp, tmpXMM,
1038bf215546Sopenharmony_ci                              x86_make_disp(src, swizzle[i] * 8));
1039bf215546Sopenharmony_ci                  emit_store64(p, x86_make_disp(dst, i * 8), tmp, tmpXMM);
1040bf215546Sopenharmony_ci               }
1041bf215546Sopenharmony_ci               else {
1042bf215546Sopenharmony_ci                  x86_mov(p->func, tmp, x86_make_disp(src, swizzle[i] * 8));
1043bf215546Sopenharmony_ci                  x86_mov(p->func, x86_make_disp(dst, i * 8), tmp);
1044bf215546Sopenharmony_ci                  x86_mov(p->func, tmp,
1045bf215546Sopenharmony_ci                          x86_make_disp(src, swizzle[i] * 8 + 4));
1046bf215546Sopenharmony_ci                  x86_mov(p->func, x86_make_disp(dst, i * 8 + 4), tmp);
1047bf215546Sopenharmony_ci               }
1048bf215546Sopenharmony_ci            }
1049bf215546Sopenharmony_ci            break;
1050bf215546Sopenharmony_ci         default:
1051bf215546Sopenharmony_ci            return FALSE;
1052bf215546Sopenharmony_ci         }
1053bf215546Sopenharmony_ci      }
1054bf215546Sopenharmony_ci      return TRUE;
1055bf215546Sopenharmony_ci   }
1056bf215546Sopenharmony_ci   /* special case for draw's EMIT_4UB (RGBA) and EMIT_4UB_BGRA */
1057bf215546Sopenharmony_ci   else if ((x86_target_caps(p->func) & X86_SSE2) &&
1058bf215546Sopenharmony_ci            a->input_format == PIPE_FORMAT_R32G32B32A32_FLOAT &&
1059bf215546Sopenharmony_ci            (0 || a->output_format == PIPE_FORMAT_B8G8R8A8_UNORM
1060bf215546Sopenharmony_ci             || a-> output_format == PIPE_FORMAT_R8G8B8A8_UNORM)) {
1061bf215546Sopenharmony_ci      struct x86_reg dataXMM = x86_make_reg(file_XMM, 0);
1062bf215546Sopenharmony_ci
1063bf215546Sopenharmony_ci      /* load */
1064bf215546Sopenharmony_ci      sse_movups(p->func, dataXMM, src);
1065bf215546Sopenharmony_ci
1066bf215546Sopenharmony_ci      if (a->output_format == PIPE_FORMAT_B8G8R8A8_UNORM) {
1067bf215546Sopenharmony_ci         sse_shufps(p->func, dataXMM, dataXMM, SHUF(2, 1, 0, 3));
1068bf215546Sopenharmony_ci      }
1069bf215546Sopenharmony_ci
1070bf215546Sopenharmony_ci      /* scale by 255.0 */
1071bf215546Sopenharmony_ci      sse_mulps(p->func, dataXMM, get_const(p, CONST_255));
1072bf215546Sopenharmony_ci
1073bf215546Sopenharmony_ci      /* pack and emit */
1074bf215546Sopenharmony_ci      sse2_cvtps2dq(p->func, dataXMM, dataXMM);
1075bf215546Sopenharmony_ci      sse2_packssdw(p->func, dataXMM, dataXMM);
1076bf215546Sopenharmony_ci      sse2_packuswb(p->func, dataXMM, dataXMM);
1077bf215546Sopenharmony_ci      sse2_movd(p->func, dst, dataXMM);
1078bf215546Sopenharmony_ci
1079bf215546Sopenharmony_ci      return TRUE;
1080bf215546Sopenharmony_ci   }
1081bf215546Sopenharmony_ci
1082bf215546Sopenharmony_ci   return FALSE;
1083bf215546Sopenharmony_ci}
1084bf215546Sopenharmony_ci
1085bf215546Sopenharmony_ci
1086bf215546Sopenharmony_cistatic boolean
1087bf215546Sopenharmony_citranslate_attr(struct translate_sse *p,
1088bf215546Sopenharmony_ci               const struct translate_element *a,
1089bf215546Sopenharmony_ci               struct x86_reg src, struct x86_reg dst)
1090bf215546Sopenharmony_ci{
1091bf215546Sopenharmony_ci   if (a->input_format == a->output_format) {
1092bf215546Sopenharmony_ci      emit_memcpy(p, dst, src, util_format_get_stride(a->input_format, 1));
1093bf215546Sopenharmony_ci      return TRUE;
1094bf215546Sopenharmony_ci   }
1095bf215546Sopenharmony_ci
1096bf215546Sopenharmony_ci   return translate_attr_convert(p, a, src, dst);
1097bf215546Sopenharmony_ci}
1098bf215546Sopenharmony_ci
1099bf215546Sopenharmony_ci
1100bf215546Sopenharmony_cistatic boolean
1101bf215546Sopenharmony_ciinit_inputs(struct translate_sse *p, unsigned index_size)
1102bf215546Sopenharmony_ci{
1103bf215546Sopenharmony_ci   unsigned i;
1104bf215546Sopenharmony_ci   struct x86_reg instance_id =
1105bf215546Sopenharmony_ci      x86_make_disp(p->machine_EDI, get_offset(p, &p->instance_id));
1106bf215546Sopenharmony_ci   struct x86_reg start_instance =
1107bf215546Sopenharmony_ci      x86_make_disp(p->machine_EDI, get_offset(p, &p->start_instance));
1108bf215546Sopenharmony_ci
1109bf215546Sopenharmony_ci   for (i = 0; i < p->nr_buffer_variants; i++) {
1110bf215546Sopenharmony_ci      struct translate_buffer_variant *variant = &p->buffer_variant[i];
1111bf215546Sopenharmony_ci      struct translate_buffer *buffer = &p->buffer[variant->buffer_index];
1112bf215546Sopenharmony_ci
1113bf215546Sopenharmony_ci      if (!index_size || variant->instance_divisor) {
1114bf215546Sopenharmony_ci         struct x86_reg buf_max_index =
1115bf215546Sopenharmony_ci            x86_make_disp(p->machine_EDI, get_offset(p, &buffer->max_index));
1116bf215546Sopenharmony_ci         struct x86_reg buf_stride =
1117bf215546Sopenharmony_ci            x86_make_disp(p->machine_EDI, get_offset(p, &buffer->stride));
1118bf215546Sopenharmony_ci         struct x86_reg buf_ptr =
1119bf215546Sopenharmony_ci            x86_make_disp(p->machine_EDI, get_offset(p, &variant->ptr));
1120bf215546Sopenharmony_ci         struct x86_reg buf_base_ptr =
1121bf215546Sopenharmony_ci            x86_make_disp(p->machine_EDI, get_offset(p, &buffer->base_ptr));
1122bf215546Sopenharmony_ci         struct x86_reg elt = p->idx_ESI;
1123bf215546Sopenharmony_ci         struct x86_reg tmp_EAX = p->tmp_EAX;
1124bf215546Sopenharmony_ci
1125bf215546Sopenharmony_ci         /* Calculate pointer to first attrib:
1126bf215546Sopenharmony_ci          *   base_ptr + stride * index, where index depends on instance divisor
1127bf215546Sopenharmony_ci          */
1128bf215546Sopenharmony_ci         if (variant->instance_divisor) {
1129bf215546Sopenharmony_ci            struct x86_reg tmp_EDX = p->tmp2_EDX;
1130bf215546Sopenharmony_ci
1131bf215546Sopenharmony_ci            /* Start with instance = instance_id
1132bf215546Sopenharmony_ci             * which is true if divisor is 1.
1133bf215546Sopenharmony_ci             */
1134bf215546Sopenharmony_ci            x86_mov(p->func, tmp_EAX, instance_id);
1135bf215546Sopenharmony_ci
1136bf215546Sopenharmony_ci            if (variant->instance_divisor != 1) {
1137bf215546Sopenharmony_ci               struct x86_reg tmp_ECX = p->src_ECX;
1138bf215546Sopenharmony_ci
1139bf215546Sopenharmony_ci               /* TODO: Add x86_shr() to rtasm and use it whenever
1140bf215546Sopenharmony_ci                *       instance divisor is power of two.
1141bf215546Sopenharmony_ci                */
1142bf215546Sopenharmony_ci               x86_xor(p->func, tmp_EDX, tmp_EDX);
1143bf215546Sopenharmony_ci               x86_mov_reg_imm(p->func, tmp_ECX, variant->instance_divisor);
1144bf215546Sopenharmony_ci               x86_div(p->func, tmp_ECX);       /* EAX = EDX:EAX / ECX */
1145bf215546Sopenharmony_ci            }
1146bf215546Sopenharmony_ci
1147bf215546Sopenharmony_ci            /* instance = (instance_id / divisor) + start_instance
1148bf215546Sopenharmony_ci             */
1149bf215546Sopenharmony_ci            x86_mov(p->func, tmp_EDX, start_instance);
1150bf215546Sopenharmony_ci            x86_add(p->func, tmp_EAX, tmp_EDX);
1151bf215546Sopenharmony_ci
1152bf215546Sopenharmony_ci            /* XXX we need to clamp the index here too, but to a
1153bf215546Sopenharmony_ci             * per-array max value, not the draw->pt.max_index value
1154bf215546Sopenharmony_ci             * that's being given to us via translate->set_buffer().
1155bf215546Sopenharmony_ci             */
1156bf215546Sopenharmony_ci         }
1157bf215546Sopenharmony_ci         else {
1158bf215546Sopenharmony_ci            x86_mov(p->func, tmp_EAX, elt);
1159bf215546Sopenharmony_ci
1160bf215546Sopenharmony_ci            /* Clamp to max_index
1161bf215546Sopenharmony_ci             */
1162bf215546Sopenharmony_ci            x86_cmp(p->func, tmp_EAX, buf_max_index);
1163bf215546Sopenharmony_ci            x86_cmovcc(p->func, tmp_EAX, buf_max_index, cc_AE);
1164bf215546Sopenharmony_ci         }
1165bf215546Sopenharmony_ci
1166bf215546Sopenharmony_ci         x86_mov(p->func, p->tmp2_EDX, buf_stride);
1167bf215546Sopenharmony_ci         x64_rexw(p->func);
1168bf215546Sopenharmony_ci         x86_imul(p->func, tmp_EAX, p->tmp2_EDX);
1169bf215546Sopenharmony_ci         x64_rexw(p->func);
1170bf215546Sopenharmony_ci         x86_add(p->func, tmp_EAX, buf_base_ptr);
1171bf215546Sopenharmony_ci
1172bf215546Sopenharmony_ci         x86_cmp(p->func, p->count_EBP, p->tmp_EAX);
1173bf215546Sopenharmony_ci
1174bf215546Sopenharmony_ci         /* In the linear case, keep the buffer pointer instead of the
1175bf215546Sopenharmony_ci          * index number.
1176bf215546Sopenharmony_ci          */
1177bf215546Sopenharmony_ci         if (!index_size && p->nr_buffer_variants == 1) {
1178bf215546Sopenharmony_ci            x64_rexw(p->func);
1179bf215546Sopenharmony_ci            x86_mov(p->func, elt, tmp_EAX);
1180bf215546Sopenharmony_ci         }
1181bf215546Sopenharmony_ci         else {
1182bf215546Sopenharmony_ci            x64_rexw(p->func);
1183bf215546Sopenharmony_ci            x86_mov(p->func, buf_ptr, tmp_EAX);
1184bf215546Sopenharmony_ci         }
1185bf215546Sopenharmony_ci      }
1186bf215546Sopenharmony_ci   }
1187bf215546Sopenharmony_ci
1188bf215546Sopenharmony_ci   return TRUE;
1189bf215546Sopenharmony_ci}
1190bf215546Sopenharmony_ci
1191bf215546Sopenharmony_ci
1192bf215546Sopenharmony_cistatic struct x86_reg
1193bf215546Sopenharmony_ciget_buffer_ptr(struct translate_sse *p,
1194bf215546Sopenharmony_ci               unsigned index_size, unsigned var_idx, struct x86_reg elt)
1195bf215546Sopenharmony_ci{
1196bf215546Sopenharmony_ci   if (var_idx == ELEMENT_BUFFER_INSTANCE_ID) {
1197bf215546Sopenharmony_ci      return x86_make_disp(p->machine_EDI, get_offset(p, &p->instance_id));
1198bf215546Sopenharmony_ci   }
1199bf215546Sopenharmony_ci   if (!index_size && p->nr_buffer_variants == 1) {
1200bf215546Sopenharmony_ci      return p->idx_ESI;
1201bf215546Sopenharmony_ci   }
1202bf215546Sopenharmony_ci   else if (!index_size || p->buffer_variant[var_idx].instance_divisor) {
1203bf215546Sopenharmony_ci      struct x86_reg ptr = p->src_ECX;
1204bf215546Sopenharmony_ci      struct x86_reg buf_ptr =
1205bf215546Sopenharmony_ci         x86_make_disp(p->machine_EDI,
1206bf215546Sopenharmony_ci                       get_offset(p, &p->buffer_variant[var_idx].ptr));
1207bf215546Sopenharmony_ci
1208bf215546Sopenharmony_ci      x64_rexw(p->func);
1209bf215546Sopenharmony_ci      x86_mov(p->func, ptr, buf_ptr);
1210bf215546Sopenharmony_ci      return ptr;
1211bf215546Sopenharmony_ci   }
1212bf215546Sopenharmony_ci   else {
1213bf215546Sopenharmony_ci      struct x86_reg ptr = p->src_ECX;
1214bf215546Sopenharmony_ci      const struct translate_buffer_variant *variant =
1215bf215546Sopenharmony_ci         &p->buffer_variant[var_idx];
1216bf215546Sopenharmony_ci      struct x86_reg buf_stride =
1217bf215546Sopenharmony_ci         x86_make_disp(p->machine_EDI,
1218bf215546Sopenharmony_ci                       get_offset(p, &p->buffer[variant->buffer_index].stride));
1219bf215546Sopenharmony_ci      struct x86_reg buf_base_ptr =
1220bf215546Sopenharmony_ci         x86_make_disp(p->machine_EDI,
1221bf215546Sopenharmony_ci                  get_offset(p, &p->buffer[variant->buffer_index].base_ptr));
1222bf215546Sopenharmony_ci      struct x86_reg buf_max_index =
1223bf215546Sopenharmony_ci         x86_make_disp(p->machine_EDI,
1224bf215546Sopenharmony_ci                  get_offset(p, &p->buffer[variant->buffer_index].max_index));
1225bf215546Sopenharmony_ci
1226bf215546Sopenharmony_ci      /* Calculate pointer to current attrib:
1227bf215546Sopenharmony_ci       */
1228bf215546Sopenharmony_ci      switch (index_size) {
1229bf215546Sopenharmony_ci      case 1:
1230bf215546Sopenharmony_ci         x86_movzx8(p->func, ptr, elt);
1231bf215546Sopenharmony_ci         break;
1232bf215546Sopenharmony_ci      case 2:
1233bf215546Sopenharmony_ci         x86_movzx16(p->func, ptr, elt);
1234bf215546Sopenharmony_ci         break;
1235bf215546Sopenharmony_ci      case 4:
1236bf215546Sopenharmony_ci         x86_mov(p->func, ptr, elt);
1237bf215546Sopenharmony_ci         break;
1238bf215546Sopenharmony_ci      }
1239bf215546Sopenharmony_ci
1240bf215546Sopenharmony_ci      /* Clamp to max_index
1241bf215546Sopenharmony_ci       */
1242bf215546Sopenharmony_ci      x86_cmp(p->func, ptr, buf_max_index);
1243bf215546Sopenharmony_ci      x86_cmovcc(p->func, ptr, buf_max_index, cc_AE);
1244bf215546Sopenharmony_ci
1245bf215546Sopenharmony_ci      x86_mov(p->func, p->tmp2_EDX, buf_stride);
1246bf215546Sopenharmony_ci      x64_rexw(p->func);
1247bf215546Sopenharmony_ci      x86_imul(p->func, ptr, p->tmp2_EDX);
1248bf215546Sopenharmony_ci      x64_rexw(p->func);
1249bf215546Sopenharmony_ci      x86_add(p->func, ptr, buf_base_ptr);
1250bf215546Sopenharmony_ci      return ptr;
1251bf215546Sopenharmony_ci   }
1252bf215546Sopenharmony_ci}
1253bf215546Sopenharmony_ci
1254bf215546Sopenharmony_ci
1255bf215546Sopenharmony_cistatic boolean
1256bf215546Sopenharmony_ciincr_inputs(struct translate_sse *p, unsigned index_size)
1257bf215546Sopenharmony_ci{
1258bf215546Sopenharmony_ci   if (!index_size && p->nr_buffer_variants == 1) {
1259bf215546Sopenharmony_ci      const unsigned buffer_index = p->buffer_variant[0].buffer_index;
1260bf215546Sopenharmony_ci      struct x86_reg stride =
1261bf215546Sopenharmony_ci         x86_make_disp(p->machine_EDI,
1262bf215546Sopenharmony_ci                       get_offset(p, &p->buffer[buffer_index].stride));
1263bf215546Sopenharmony_ci
1264bf215546Sopenharmony_ci      if (p->buffer_variant[0].instance_divisor == 0) {
1265bf215546Sopenharmony_ci         x64_rexw(p->func);
1266bf215546Sopenharmony_ci         x86_add(p->func, p->idx_ESI, stride);
1267bf215546Sopenharmony_ci         sse_prefetchnta(p->func, x86_make_disp(p->idx_ESI, 192));
1268bf215546Sopenharmony_ci      }
1269bf215546Sopenharmony_ci   }
1270bf215546Sopenharmony_ci   else if (!index_size) {
1271bf215546Sopenharmony_ci      unsigned i;
1272bf215546Sopenharmony_ci
1273bf215546Sopenharmony_ci      /* Is this worthwhile??
1274bf215546Sopenharmony_ci       */
1275bf215546Sopenharmony_ci      for (i = 0; i < p->nr_buffer_variants; i++) {
1276bf215546Sopenharmony_ci         struct translate_buffer_variant *variant = &p->buffer_variant[i];
1277bf215546Sopenharmony_ci         struct x86_reg buf_ptr = x86_make_disp(p->machine_EDI,
1278bf215546Sopenharmony_ci                                                get_offset(p, &variant->ptr));
1279bf215546Sopenharmony_ci      struct x86_reg buf_stride =
1280bf215546Sopenharmony_ci         x86_make_disp(p->machine_EDI,
1281bf215546Sopenharmony_ci                       get_offset(p, &p->buffer[variant->buffer_index].stride));
1282bf215546Sopenharmony_ci
1283bf215546Sopenharmony_ci         if (variant->instance_divisor == 0) {
1284bf215546Sopenharmony_ci            x86_mov(p->func, p->tmp_EAX, buf_stride);
1285bf215546Sopenharmony_ci            x64_rexw(p->func);
1286bf215546Sopenharmony_ci            x86_add(p->func, p->tmp_EAX, buf_ptr);
1287bf215546Sopenharmony_ci            if (i == 0)
1288bf215546Sopenharmony_ci               sse_prefetchnta(p->func, x86_make_disp(p->tmp_EAX, 192));
1289bf215546Sopenharmony_ci            x64_rexw(p->func);
1290bf215546Sopenharmony_ci            x86_mov(p->func, buf_ptr, p->tmp_EAX);
1291bf215546Sopenharmony_ci         }
1292bf215546Sopenharmony_ci      }
1293bf215546Sopenharmony_ci   }
1294bf215546Sopenharmony_ci   else {
1295bf215546Sopenharmony_ci      x64_rexw(p->func);
1296bf215546Sopenharmony_ci      x86_lea(p->func, p->idx_ESI, x86_make_disp(p->idx_ESI, index_size));
1297bf215546Sopenharmony_ci   }
1298bf215546Sopenharmony_ci
1299bf215546Sopenharmony_ci   return TRUE;
1300bf215546Sopenharmony_ci}
1301bf215546Sopenharmony_ci
1302bf215546Sopenharmony_ci
1303bf215546Sopenharmony_ci/* Build run( struct translate *machine,
1304bf215546Sopenharmony_ci *            unsigned start,
1305bf215546Sopenharmony_ci *            unsigned count,
1306bf215546Sopenharmony_ci *            void *output_buffer )
1307bf215546Sopenharmony_ci * or
1308bf215546Sopenharmony_ci *  run_elts( struct translate *machine,
1309bf215546Sopenharmony_ci *            unsigned *elts,
1310bf215546Sopenharmony_ci *            unsigned count,
1311bf215546Sopenharmony_ci *            void *output_buffer )
1312bf215546Sopenharmony_ci *
1313bf215546Sopenharmony_ci *  Lots of hardcoding
1314bf215546Sopenharmony_ci *
1315bf215546Sopenharmony_ci * EAX -- pointer to current output vertex
1316bf215546Sopenharmony_ci * ECX -- pointer to current attribute
1317bf215546Sopenharmony_ci *
1318bf215546Sopenharmony_ci */
1319bf215546Sopenharmony_cistatic boolean
1320bf215546Sopenharmony_cibuild_vertex_emit(struct translate_sse *p,
1321bf215546Sopenharmony_ci                  struct x86_function *func, unsigned index_size)
1322bf215546Sopenharmony_ci{
1323bf215546Sopenharmony_ci   int fixup, label;
1324bf215546Sopenharmony_ci   unsigned j;
1325bf215546Sopenharmony_ci
1326bf215546Sopenharmony_ci   memset(p->reg_to_const, 0xff, sizeof(p->reg_to_const));
1327bf215546Sopenharmony_ci   memset(p->const_to_reg, 0xff, sizeof(p->const_to_reg));
1328bf215546Sopenharmony_ci
1329bf215546Sopenharmony_ci   p->tmp_EAX = x86_make_reg(file_REG32, reg_AX);
1330bf215546Sopenharmony_ci   p->idx_ESI = x86_make_reg(file_REG32, reg_SI);
1331bf215546Sopenharmony_ci   p->outbuf_EBX = x86_make_reg(file_REG32, reg_BX);
1332bf215546Sopenharmony_ci   p->machine_EDI = x86_make_reg(file_REG32, reg_DI);
1333bf215546Sopenharmony_ci   p->count_EBP = x86_make_reg(file_REG32, reg_BP);
1334bf215546Sopenharmony_ci   p->tmp2_EDX = x86_make_reg(file_REG32, reg_DX);
1335bf215546Sopenharmony_ci   p->src_ECX = x86_make_reg(file_REG32, reg_CX);
1336bf215546Sopenharmony_ci
1337bf215546Sopenharmony_ci   p->func = func;
1338bf215546Sopenharmony_ci
1339bf215546Sopenharmony_ci   x86_init_func(p->func);
1340bf215546Sopenharmony_ci
1341bf215546Sopenharmony_ci   if (x86_target(p->func) == X86_64_WIN64_ABI) {
1342bf215546Sopenharmony_ci      /* the ABI guarantees a 16-byte aligned 32-byte "shadow space"
1343bf215546Sopenharmony_ci       * above the return address
1344bf215546Sopenharmony_ci       */
1345bf215546Sopenharmony_ci      sse2_movdqa(p->func, x86_make_disp(x86_make_reg(file_REG32, reg_SP), 8),
1346bf215546Sopenharmony_ci                  x86_make_reg(file_XMM, 6));
1347bf215546Sopenharmony_ci      sse2_movdqa(p->func,
1348bf215546Sopenharmony_ci                  x86_make_disp(x86_make_reg(file_REG32, reg_SP), 24),
1349bf215546Sopenharmony_ci                  x86_make_reg(file_XMM, 7));
1350bf215546Sopenharmony_ci   }
1351bf215546Sopenharmony_ci
1352bf215546Sopenharmony_ci   x86_push(p->func, p->outbuf_EBX);
1353bf215546Sopenharmony_ci   x86_push(p->func, p->count_EBP);
1354bf215546Sopenharmony_ci
1355bf215546Sopenharmony_ci   /* on non-Win64 x86-64, these are already in the right registers */
1356bf215546Sopenharmony_ci   if (x86_target(p->func) != X86_64_STD_ABI) {
1357bf215546Sopenharmony_ci      x86_push(p->func, p->machine_EDI);
1358bf215546Sopenharmony_ci      x86_push(p->func, p->idx_ESI);
1359bf215546Sopenharmony_ci
1360bf215546Sopenharmony_ci      if (x86_target(p->func) != X86_32) {
1361bf215546Sopenharmony_ci         x64_mov64(p->func, p->machine_EDI, x86_fn_arg(p->func, 1));
1362bf215546Sopenharmony_ci         x64_mov64(p->func, p->idx_ESI, x86_fn_arg(p->func, 2));
1363bf215546Sopenharmony_ci      }
1364bf215546Sopenharmony_ci      else {
1365bf215546Sopenharmony_ci         x86_mov(p->func, p->machine_EDI, x86_fn_arg(p->func, 1));
1366bf215546Sopenharmony_ci         x86_mov(p->func, p->idx_ESI, x86_fn_arg(p->func, 2));
1367bf215546Sopenharmony_ci      }
1368bf215546Sopenharmony_ci   }
1369bf215546Sopenharmony_ci
1370bf215546Sopenharmony_ci   x86_mov(p->func, p->count_EBP, x86_fn_arg(p->func, 3));
1371bf215546Sopenharmony_ci
1372bf215546Sopenharmony_ci   if (x86_target(p->func) != X86_32)
1373bf215546Sopenharmony_ci      x64_mov64(p->func, p->outbuf_EBX, x86_fn_arg(p->func, 6));
1374bf215546Sopenharmony_ci   else
1375bf215546Sopenharmony_ci      x86_mov(p->func, p->outbuf_EBX, x86_fn_arg(p->func, 6));
1376bf215546Sopenharmony_ci
1377bf215546Sopenharmony_ci   /* Load instance ID.
1378bf215546Sopenharmony_ci    */
1379bf215546Sopenharmony_ci   if (p->use_instancing) {
1380bf215546Sopenharmony_ci      x86_mov(p->func, p->tmp2_EDX, x86_fn_arg(p->func, 4));
1381bf215546Sopenharmony_ci      x86_mov(p->func,
1382bf215546Sopenharmony_ci              x86_make_disp(p->machine_EDI,
1383bf215546Sopenharmony_ci                            get_offset(p, &p->start_instance)), p->tmp2_EDX);
1384bf215546Sopenharmony_ci
1385bf215546Sopenharmony_ci      x86_mov(p->func, p->tmp_EAX, x86_fn_arg(p->func, 5));
1386bf215546Sopenharmony_ci      x86_mov(p->func,
1387bf215546Sopenharmony_ci              x86_make_disp(p->machine_EDI, get_offset(p, &p->instance_id)),
1388bf215546Sopenharmony_ci              p->tmp_EAX);
1389bf215546Sopenharmony_ci   }
1390bf215546Sopenharmony_ci
1391bf215546Sopenharmony_ci   /* Get vertex count, compare to zero
1392bf215546Sopenharmony_ci    */
1393bf215546Sopenharmony_ci   x86_xor(p->func, p->tmp_EAX, p->tmp_EAX);
1394bf215546Sopenharmony_ci   x86_cmp(p->func, p->count_EBP, p->tmp_EAX);
1395bf215546Sopenharmony_ci   fixup = x86_jcc_forward(p->func, cc_E);
1396bf215546Sopenharmony_ci
1397bf215546Sopenharmony_ci   /* always load, needed or not:
1398bf215546Sopenharmony_ci    */
1399bf215546Sopenharmony_ci   init_inputs(p, index_size);
1400bf215546Sopenharmony_ci
1401bf215546Sopenharmony_ci   /* Note address for loop jump
1402bf215546Sopenharmony_ci    */
1403bf215546Sopenharmony_ci   label = x86_get_label(p->func);
1404bf215546Sopenharmony_ci   {
1405bf215546Sopenharmony_ci      struct x86_reg elt = !index_size ? p->idx_ESI : x86_deref(p->idx_ESI);
1406bf215546Sopenharmony_ci      int last_variant = -1;
1407bf215546Sopenharmony_ci      struct x86_reg vb;
1408bf215546Sopenharmony_ci
1409bf215546Sopenharmony_ci      for (j = 0; j < p->translate.key.nr_elements; j++) {
1410bf215546Sopenharmony_ci         const struct translate_element *a = &p->translate.key.element[j];
1411bf215546Sopenharmony_ci         unsigned variant = p->element_to_buffer_variant[j];
1412bf215546Sopenharmony_ci
1413bf215546Sopenharmony_ci         /* Figure out source pointer address:
1414bf215546Sopenharmony_ci          */
1415bf215546Sopenharmony_ci         if (variant != last_variant) {
1416bf215546Sopenharmony_ci            last_variant = variant;
1417bf215546Sopenharmony_ci            vb = get_buffer_ptr(p, index_size, variant, elt);
1418bf215546Sopenharmony_ci         }
1419bf215546Sopenharmony_ci
1420bf215546Sopenharmony_ci         if (!translate_attr(p, a,
1421bf215546Sopenharmony_ci                             x86_make_disp(vb, a->input_offset),
1422bf215546Sopenharmony_ci                             x86_make_disp(p->outbuf_EBX, a->output_offset)))
1423bf215546Sopenharmony_ci            return FALSE;
1424bf215546Sopenharmony_ci      }
1425bf215546Sopenharmony_ci
1426bf215546Sopenharmony_ci      /* Next output vertex:
1427bf215546Sopenharmony_ci       */
1428bf215546Sopenharmony_ci      x64_rexw(p->func);
1429bf215546Sopenharmony_ci      x86_lea(p->func, p->outbuf_EBX,
1430bf215546Sopenharmony_ci              x86_make_disp(p->outbuf_EBX, p->translate.key.output_stride));
1431bf215546Sopenharmony_ci
1432bf215546Sopenharmony_ci      /* Incr index
1433bf215546Sopenharmony_ci       */
1434bf215546Sopenharmony_ci      incr_inputs(p, index_size);
1435bf215546Sopenharmony_ci   }
1436bf215546Sopenharmony_ci
1437bf215546Sopenharmony_ci   /* decr count, loop if not zero
1438bf215546Sopenharmony_ci    */
1439bf215546Sopenharmony_ci   x86_dec(p->func, p->count_EBP);
1440bf215546Sopenharmony_ci   x86_jcc(p->func, cc_NZ, label);
1441bf215546Sopenharmony_ci
1442bf215546Sopenharmony_ci   /* Exit mmx state?
1443bf215546Sopenharmony_ci    */
1444bf215546Sopenharmony_ci   if (p->func->need_emms)
1445bf215546Sopenharmony_ci      mmx_emms(p->func);
1446bf215546Sopenharmony_ci
1447bf215546Sopenharmony_ci   /* Land forward jump here:
1448bf215546Sopenharmony_ci    */
1449bf215546Sopenharmony_ci   x86_fixup_fwd_jump(p->func, fixup);
1450bf215546Sopenharmony_ci
1451bf215546Sopenharmony_ci   /* Pop regs and return
1452bf215546Sopenharmony_ci    */
1453bf215546Sopenharmony_ci   if (x86_target(p->func) != X86_64_STD_ABI) {
1454bf215546Sopenharmony_ci      x86_pop(p->func, p->idx_ESI);
1455bf215546Sopenharmony_ci      x86_pop(p->func, p->machine_EDI);
1456bf215546Sopenharmony_ci   }
1457bf215546Sopenharmony_ci
1458bf215546Sopenharmony_ci   x86_pop(p->func, p->count_EBP);
1459bf215546Sopenharmony_ci   x86_pop(p->func, p->outbuf_EBX);
1460bf215546Sopenharmony_ci
1461bf215546Sopenharmony_ci   if (x86_target(p->func) == X86_64_WIN64_ABI) {
1462bf215546Sopenharmony_ci      sse2_movdqa(p->func, x86_make_reg(file_XMM, 6),
1463bf215546Sopenharmony_ci                  x86_make_disp(x86_make_reg(file_REG32, reg_SP), 8));
1464bf215546Sopenharmony_ci      sse2_movdqa(p->func, x86_make_reg(file_XMM, 7),
1465bf215546Sopenharmony_ci                  x86_make_disp(x86_make_reg(file_REG32, reg_SP), 24));
1466bf215546Sopenharmony_ci   }
1467bf215546Sopenharmony_ci   x86_ret(p->func);
1468bf215546Sopenharmony_ci
1469bf215546Sopenharmony_ci   return TRUE;
1470bf215546Sopenharmony_ci}
1471bf215546Sopenharmony_ci
1472bf215546Sopenharmony_ci
1473bf215546Sopenharmony_cistatic void
1474bf215546Sopenharmony_citranslate_sse_set_buffer(struct translate *translate,
1475bf215546Sopenharmony_ci                         unsigned buf,
1476bf215546Sopenharmony_ci                         const void *ptr, unsigned stride, unsigned max_index)
1477bf215546Sopenharmony_ci{
1478bf215546Sopenharmony_ci   struct translate_sse *p = (struct translate_sse *) translate;
1479bf215546Sopenharmony_ci
1480bf215546Sopenharmony_ci   if (buf < p->nr_buffers) {
1481bf215546Sopenharmony_ci      p->buffer[buf].base_ptr = (char *) ptr;
1482bf215546Sopenharmony_ci      p->buffer[buf].stride = stride;
1483bf215546Sopenharmony_ci      p->buffer[buf].max_index = max_index;
1484bf215546Sopenharmony_ci   }
1485bf215546Sopenharmony_ci
1486bf215546Sopenharmony_ci   if (0)
1487bf215546Sopenharmony_ci      debug_printf("%s %d/%d: %p %d\n",
1488bf215546Sopenharmony_ci                   __FUNCTION__, buf, p->nr_buffers, ptr, stride);
1489bf215546Sopenharmony_ci}
1490bf215546Sopenharmony_ci
1491bf215546Sopenharmony_ci
1492bf215546Sopenharmony_cistatic void
1493bf215546Sopenharmony_citranslate_sse_release(struct translate *translate)
1494bf215546Sopenharmony_ci{
1495bf215546Sopenharmony_ci   struct translate_sse *p = (struct translate_sse *) translate;
1496bf215546Sopenharmony_ci
1497bf215546Sopenharmony_ci   x86_release_func(&p->elt8_func);
1498bf215546Sopenharmony_ci   x86_release_func(&p->elt16_func);
1499bf215546Sopenharmony_ci   x86_release_func(&p->elt_func);
1500bf215546Sopenharmony_ci   x86_release_func(&p->linear_func);
1501bf215546Sopenharmony_ci
1502bf215546Sopenharmony_ci   os_free_aligned(p);
1503bf215546Sopenharmony_ci}
1504bf215546Sopenharmony_ci
1505bf215546Sopenharmony_ci
1506bf215546Sopenharmony_cistruct translate *
1507bf215546Sopenharmony_citranslate_sse2_create(const struct translate_key *key)
1508bf215546Sopenharmony_ci{
1509bf215546Sopenharmony_ci   struct translate_sse *p = NULL;
1510bf215546Sopenharmony_ci   unsigned i;
1511bf215546Sopenharmony_ci
1512bf215546Sopenharmony_ci   /* this is misnamed, it actually refers to whether rtasm is enabled or not */
1513bf215546Sopenharmony_ci   if (!rtasm_cpu_has_sse())
1514bf215546Sopenharmony_ci      goto fail;
1515bf215546Sopenharmony_ci
1516bf215546Sopenharmony_ci   p = os_malloc_aligned(sizeof(struct translate_sse), 16);
1517bf215546Sopenharmony_ci   if (!p)
1518bf215546Sopenharmony_ci      goto fail;
1519bf215546Sopenharmony_ci
1520bf215546Sopenharmony_ci   memset(p, 0, sizeof(*p));
1521bf215546Sopenharmony_ci   memcpy(p->consts, consts, sizeof(consts));
1522bf215546Sopenharmony_ci   memcpy(p->uconsts, uconsts, sizeof(uconsts));
1523bf215546Sopenharmony_ci
1524bf215546Sopenharmony_ci   p->translate.key = *key;
1525bf215546Sopenharmony_ci   p->translate.release = translate_sse_release;
1526bf215546Sopenharmony_ci   p->translate.set_buffer = translate_sse_set_buffer;
1527bf215546Sopenharmony_ci
1528bf215546Sopenharmony_ci   assert(key->nr_elements <= TRANSLATE_MAX_ATTRIBS);
1529bf215546Sopenharmony_ci
1530bf215546Sopenharmony_ci   for (i = 0; i < key->nr_elements; i++) {
1531bf215546Sopenharmony_ci      if (key->element[i].type == TRANSLATE_ELEMENT_NORMAL) {
1532bf215546Sopenharmony_ci         unsigned j;
1533bf215546Sopenharmony_ci
1534bf215546Sopenharmony_ci         p->nr_buffers =
1535bf215546Sopenharmony_ci            MAX2(p->nr_buffers, key->element[i].input_buffer + 1);
1536bf215546Sopenharmony_ci
1537bf215546Sopenharmony_ci         if (key->element[i].instance_divisor) {
1538bf215546Sopenharmony_ci            p->use_instancing = TRUE;
1539bf215546Sopenharmony_ci         }
1540bf215546Sopenharmony_ci
1541bf215546Sopenharmony_ci         /*
1542bf215546Sopenharmony_ci          * Map vertex element to vertex buffer variant.
1543bf215546Sopenharmony_ci          */
1544bf215546Sopenharmony_ci         for (j = 0; j < p->nr_buffer_variants; j++) {
1545bf215546Sopenharmony_ci            if (p->buffer_variant[j].buffer_index ==
1546bf215546Sopenharmony_ci                key->element[i].input_buffer
1547bf215546Sopenharmony_ci                && p->buffer_variant[j].instance_divisor ==
1548bf215546Sopenharmony_ci                key->element[i].instance_divisor) {
1549bf215546Sopenharmony_ci               break;
1550bf215546Sopenharmony_ci            }
1551bf215546Sopenharmony_ci         }
1552bf215546Sopenharmony_ci         if (j == p->nr_buffer_variants) {
1553bf215546Sopenharmony_ci            p->buffer_variant[j].buffer_index = key->element[i].input_buffer;
1554bf215546Sopenharmony_ci            p->buffer_variant[j].instance_divisor =
1555bf215546Sopenharmony_ci               key->element[i].instance_divisor;
1556bf215546Sopenharmony_ci            p->nr_buffer_variants++;
1557bf215546Sopenharmony_ci         }
1558bf215546Sopenharmony_ci         p->element_to_buffer_variant[i] = j;
1559bf215546Sopenharmony_ci      }
1560bf215546Sopenharmony_ci      else {
1561bf215546Sopenharmony_ci         assert(key->element[i].type == TRANSLATE_ELEMENT_INSTANCE_ID);
1562bf215546Sopenharmony_ci
1563bf215546Sopenharmony_ci         p->element_to_buffer_variant[i] = ELEMENT_BUFFER_INSTANCE_ID;
1564bf215546Sopenharmony_ci      }
1565bf215546Sopenharmony_ci   }
1566bf215546Sopenharmony_ci
1567bf215546Sopenharmony_ci   if (0)
1568bf215546Sopenharmony_ci      debug_printf("nr_buffers: %d\n", p->nr_buffers);
1569bf215546Sopenharmony_ci
1570bf215546Sopenharmony_ci   if (!build_vertex_emit(p, &p->linear_func, 0))
1571bf215546Sopenharmony_ci      goto fail;
1572bf215546Sopenharmony_ci
1573bf215546Sopenharmony_ci   if (!build_vertex_emit(p, &p->elt_func, 4))
1574bf215546Sopenharmony_ci      goto fail;
1575bf215546Sopenharmony_ci
1576bf215546Sopenharmony_ci   if (!build_vertex_emit(p, &p->elt16_func, 2))
1577bf215546Sopenharmony_ci      goto fail;
1578bf215546Sopenharmony_ci
1579bf215546Sopenharmony_ci   if (!build_vertex_emit(p, &p->elt8_func, 1))
1580bf215546Sopenharmony_ci      goto fail;
1581bf215546Sopenharmony_ci
1582bf215546Sopenharmony_ci   p->translate.run = (run_func) x86_get_func(&p->linear_func);
1583bf215546Sopenharmony_ci   if (p->translate.run == NULL)
1584bf215546Sopenharmony_ci      goto fail;
1585bf215546Sopenharmony_ci
1586bf215546Sopenharmony_ci   p->translate.run_elts = (run_elts_func) x86_get_func(&p->elt_func);
1587bf215546Sopenharmony_ci   if (p->translate.run_elts == NULL)
1588bf215546Sopenharmony_ci      goto fail;
1589bf215546Sopenharmony_ci
1590bf215546Sopenharmony_ci   p->translate.run_elts16 = (run_elts16_func) x86_get_func(&p->elt16_func);
1591bf215546Sopenharmony_ci   if (p->translate.run_elts16 == NULL)
1592bf215546Sopenharmony_ci      goto fail;
1593bf215546Sopenharmony_ci
1594bf215546Sopenharmony_ci   p->translate.run_elts8 = (run_elts8_func) x86_get_func(&p->elt8_func);
1595bf215546Sopenharmony_ci   if (p->translate.run_elts8 == NULL)
1596bf215546Sopenharmony_ci      goto fail;
1597bf215546Sopenharmony_ci
1598bf215546Sopenharmony_ci   return &p->translate;
1599bf215546Sopenharmony_ci
1600bf215546Sopenharmony_ci fail:
1601bf215546Sopenharmony_ci   if (p)
1602bf215546Sopenharmony_ci      translate_sse_release(&p->translate);
1603bf215546Sopenharmony_ci
1604bf215546Sopenharmony_ci   return NULL;
1605bf215546Sopenharmony_ci}
1606bf215546Sopenharmony_ci
1607bf215546Sopenharmony_ci
1608bf215546Sopenharmony_ci#else
1609bf215546Sopenharmony_ci
/**
 * Stand-in compiled on the other side of the preprocessor conditional:
 * no translate object is ever created.
 *
 * \return always NULL
 */
struct translate *
translate_sse2_create(const struct translate_key *key)
{
   (void) key;

   return NULL;
}
1615bf215546Sopenharmony_ci
1616bf215546Sopenharmony_ci#endif
1617