1bf215546Sopenharmony_ci/* 2bf215546Sopenharmony_ci * Copyright 2003 VMware, Inc. 3bf215546Sopenharmony_ci * All Rights Reserved. 4bf215546Sopenharmony_ci * 5bf215546Sopenharmony_ci * Permission is hereby granted, free of charge, to any person obtaining a 6bf215546Sopenharmony_ci * copy of this software and associated documentation files (the "Software"), 7bf215546Sopenharmony_ci * to deal in the Software without restriction, including without limitation 8bf215546Sopenharmony_ci * on the rights to use, copy, modify, merge, publish, distribute, sub 9bf215546Sopenharmony_ci * license, and/or sell copies of the Software, and to permit persons to whom 10bf215546Sopenharmony_ci * the Software is furnished to do so, subject to the following conditions: 11bf215546Sopenharmony_ci * 12bf215546Sopenharmony_ci * The above copyright notice and this permission notice (including the next 13bf215546Sopenharmony_ci * paragraph) shall be included in all copies or substantial portions of the 14bf215546Sopenharmony_ci * Software. 15bf215546Sopenharmony_ci * 16bf215546Sopenharmony_ci * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 17bf215546Sopenharmony_ci * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 18bf215546Sopenharmony_ci * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL 19bf215546Sopenharmony_ci * VMWARE AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM, 20bf215546Sopenharmony_ci * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR 21bf215546Sopenharmony_ci * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE 22bf215546Sopenharmony_ci * USE OR OTHER DEALINGS IN THE SOFTWARE. 23bf215546Sopenharmony_ci * 24bf215546Sopenharmony_ci * Authors: 25bf215546Sopenharmony_ci * Keith Whitwell <keithw@vmware.com> 26bf215546Sopenharmony_ci */ 27bf215546Sopenharmony_ci 28bf215546Sopenharmony_ci 29bf215546Sopenharmony_ci#include "pipe/p_config.h" 30bf215546Sopenharmony_ci#include "pipe/p_compiler.h" 31bf215546Sopenharmony_ci#include "util/u_memory.h" 32bf215546Sopenharmony_ci#include "util/u_math.h" 33bf215546Sopenharmony_ci#include "util/format/u_format.h" 34bf215546Sopenharmony_ci 35bf215546Sopenharmony_ci#include "translate.h" 36bf215546Sopenharmony_ci 37bf215546Sopenharmony_ci 38bf215546Sopenharmony_ci#if (defined(PIPE_ARCH_X86) || defined(PIPE_ARCH_X86_64)) && !defined(EMBEDDED_DEVICE) 39bf215546Sopenharmony_ci 40bf215546Sopenharmony_ci#include "rtasm/rtasm_cpu.h" 41bf215546Sopenharmony_ci#include "rtasm/rtasm_x86sse.h" 42bf215546Sopenharmony_ci 43bf215546Sopenharmony_ci 44bf215546Sopenharmony_ci#define X 0 45bf215546Sopenharmony_ci#define Y 1 46bf215546Sopenharmony_ci#define Z 2 47bf215546Sopenharmony_ci#define W 3 48bf215546Sopenharmony_ci 49bf215546Sopenharmony_ci 50bf215546Sopenharmony_cistruct translate_buffer 51bf215546Sopenharmony_ci{ 52bf215546Sopenharmony_ci const void *base_ptr; 53bf215546Sopenharmony_ci uintptr_t stride; 54bf215546Sopenharmony_ci unsigned max_index; 55bf215546Sopenharmony_ci}; 56bf215546Sopenharmony_ci 57bf215546Sopenharmony_cistruct translate_buffer_variant 58bf215546Sopenharmony_ci{ 59bf215546Sopenharmony_ci unsigned buffer_index; 60bf215546Sopenharmony_ci unsigned instance_divisor; 61bf215546Sopenharmony_ci void *ptr; /* updated either per vertex or per instance */ 62bf215546Sopenharmony_ci}; 63bf215546Sopenharmony_ci 64bf215546Sopenharmony_ci 65bf215546Sopenharmony_ci#define ELEMENT_BUFFER_INSTANCE_ID 1001 66bf215546Sopenharmony_ci 67bf215546Sopenharmony_ci#define NUM_FLOAT_CONSTS 9 68bf215546Sopenharmony_ci#define NUM_UNSIGNED_CONSTS 1 69bf215546Sopenharmony_ci 70bf215546Sopenharmony_cienum 71bf215546Sopenharmony_ci{ 72bf215546Sopenharmony_ci CONST_IDENTITY, 73bf215546Sopenharmony_ci CONST_INV_127, 74bf215546Sopenharmony_ci CONST_INV_255, 75bf215546Sopenharmony_ci CONST_INV_32767, 76bf215546Sopenharmony_ci CONST_INV_65535, 77bf215546Sopenharmony_ci CONST_INV_2147483647, 78bf215546Sopenharmony_ci CONST_INV_4294967295, 79bf215546Sopenharmony_ci CONST_255, 80bf215546Sopenharmony_ci CONST_2147483648, 81bf215546Sopenharmony_ci /* float consts end */ 82bf215546Sopenharmony_ci CONST_2147483647_INT, 83bf215546Sopenharmony_ci}; 84bf215546Sopenharmony_ci 85bf215546Sopenharmony_ci#define C(v) {(float)(v), (float)(v), (float)(v), (float)(v)} 86bf215546Sopenharmony_cistatic float consts[NUM_FLOAT_CONSTS][4] = { 87bf215546Sopenharmony_ci {0, 0, 0, 1}, 88bf215546Sopenharmony_ci C(1.0 / 127.0), 89bf215546Sopenharmony_ci C(1.0 / 255.0), 90bf215546Sopenharmony_ci C(1.0 / 32767.0), 91bf215546Sopenharmony_ci C(1.0 / 65535.0), 92bf215546Sopenharmony_ci C(1.0 / 2147483647.0), 93bf215546Sopenharmony_ci C(1.0 / 4294967295.0), 94bf215546Sopenharmony_ci C(255.0), 95bf215546Sopenharmony_ci C(2147483648.0), 96bf215546Sopenharmony_ci}; 97bf215546Sopenharmony_ci 98bf215546Sopenharmony_ci#undef C 99bf215546Sopenharmony_ci 100bf215546Sopenharmony_cistatic unsigned uconsts[NUM_UNSIGNED_CONSTS][4] = { 101bf215546Sopenharmony_ci {0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff}, 102bf215546Sopenharmony_ci}; 103bf215546Sopenharmony_ci 104bf215546Sopenharmony_cistruct translate_sse 105bf215546Sopenharmony_ci{ 106bf215546Sopenharmony_ci struct translate translate; 107bf215546Sopenharmony_ci 108bf215546Sopenharmony_ci struct x86_function linear_func; 109bf215546Sopenharmony_ci struct x86_function elt_func; 110bf215546Sopenharmony_ci struct x86_function elt16_func; 111bf215546Sopenharmony_ci struct x86_function elt8_func; 112bf215546Sopenharmony_ci struct x86_function *func; 113bf215546Sopenharmony_ci 114bf215546Sopenharmony_ci alignas(16) float consts[NUM_FLOAT_CONSTS][4]; 115bf215546Sopenharmony_ci alignas(16) float uconsts[NUM_UNSIGNED_CONSTS][4]; 116bf215546Sopenharmony_ci int8_t reg_to_const[16]; 117bf215546Sopenharmony_ci int8_t const_to_reg[NUM_FLOAT_CONSTS + NUM_UNSIGNED_CONSTS]; 118bf215546Sopenharmony_ci 119bf215546Sopenharmony_ci struct translate_buffer buffer[TRANSLATE_MAX_ATTRIBS]; 120bf215546Sopenharmony_ci unsigned nr_buffers; 121bf215546Sopenharmony_ci 122bf215546Sopenharmony_ci /* Multiple buffer variants can map to a single buffer. */ 123bf215546Sopenharmony_ci struct translate_buffer_variant buffer_variant[TRANSLATE_MAX_ATTRIBS]; 124bf215546Sopenharmony_ci unsigned nr_buffer_variants; 125bf215546Sopenharmony_ci 126bf215546Sopenharmony_ci /* Multiple elements can map to a single buffer variant. */ 127bf215546Sopenharmony_ci unsigned element_to_buffer_variant[TRANSLATE_MAX_ATTRIBS]; 128bf215546Sopenharmony_ci 129bf215546Sopenharmony_ci boolean use_instancing; 130bf215546Sopenharmony_ci unsigned instance_id; 131bf215546Sopenharmony_ci unsigned start_instance; 132bf215546Sopenharmony_ci 133bf215546Sopenharmony_ci /* these are actually known values, but putting them in a struct 134bf215546Sopenharmony_ci * like this is helpful to keep them in sync across the file. 135bf215546Sopenharmony_ci */ 136bf215546Sopenharmony_ci struct x86_reg tmp_EAX; 137bf215546Sopenharmony_ci struct x86_reg tmp2_EDX; 138bf215546Sopenharmony_ci struct x86_reg src_ECX; 139bf215546Sopenharmony_ci struct x86_reg idx_ESI; /* either start+i or &elt[i] */ 140bf215546Sopenharmony_ci struct x86_reg machine_EDI; 141bf215546Sopenharmony_ci struct x86_reg outbuf_EBX; 142bf215546Sopenharmony_ci struct x86_reg count_EBP; /* decrements to zero */ 143bf215546Sopenharmony_ci}; 144bf215546Sopenharmony_ci 145bf215546Sopenharmony_ci 146bf215546Sopenharmony_cistatic int 147bf215546Sopenharmony_ciget_offset(const void *a, const void *b) 148bf215546Sopenharmony_ci{ 149bf215546Sopenharmony_ci return (const char *) b - (const char *) a; 150bf215546Sopenharmony_ci} 151bf215546Sopenharmony_ci 152bf215546Sopenharmony_ci 153bf215546Sopenharmony_cistatic struct x86_reg 154bf215546Sopenharmony_ciget_const(struct translate_sse *p, unsigned id) 155bf215546Sopenharmony_ci{ 156bf215546Sopenharmony_ci struct x86_reg reg; 157bf215546Sopenharmony_ci unsigned i; 158bf215546Sopenharmony_ci 159bf215546Sopenharmony_ci if (p->const_to_reg[id] >= 0) 160bf215546Sopenharmony_ci return x86_make_reg(file_XMM, p->const_to_reg[id]); 161bf215546Sopenharmony_ci 162bf215546Sopenharmony_ci for (i = 2; i < 8; ++i) { 163bf215546Sopenharmony_ci if (p->reg_to_const[i] < 0) 164bf215546Sopenharmony_ci break; 165bf215546Sopenharmony_ci } 166bf215546Sopenharmony_ci 167bf215546Sopenharmony_ci /* TODO: be smarter here */ 168bf215546Sopenharmony_ci if (i == 8) 169bf215546Sopenharmony_ci --i; 170bf215546Sopenharmony_ci 171bf215546Sopenharmony_ci reg = x86_make_reg(file_XMM, i); 172bf215546Sopenharmony_ci 173bf215546Sopenharmony_ci if (p->reg_to_const[i] >= 0) 174bf215546Sopenharmony_ci p->const_to_reg[p->reg_to_const[i]] = -1; 175bf215546Sopenharmony_ci 176bf215546Sopenharmony_ci p->reg_to_const[i] = id; 177bf215546Sopenharmony_ci p->const_to_reg[id] = i; 178bf215546Sopenharmony_ci 179bf215546Sopenharmony_ci /* TODO: this should happen outside the loop, if possible */ 180bf215546Sopenharmony_ci const void *c; 181bf215546Sopenharmony_ci if (id < NUM_FLOAT_CONSTS) 182bf215546Sopenharmony_ci c = &p->consts[id][0]; 183bf215546Sopenharmony_ci else 184bf215546Sopenharmony_ci c = &p->uconsts[id - NUM_FLOAT_CONSTS][0]; 185bf215546Sopenharmony_ci sse_movaps(p->func, reg, 186bf215546Sopenharmony_ci x86_make_disp(p->machine_EDI, get_offset(p, c))); 187bf215546Sopenharmony_ci 188bf215546Sopenharmony_ci return reg; 189bf215546Sopenharmony_ci} 190bf215546Sopenharmony_ci 191bf215546Sopenharmony_ci 192bf215546Sopenharmony_ci/* load the data in a SSE2 register, padding with zeros */ 193bf215546Sopenharmony_cistatic boolean 194bf215546Sopenharmony_ciemit_load_sse2(struct translate_sse *p, 195bf215546Sopenharmony_ci struct x86_reg data, struct x86_reg src, unsigned size) 196bf215546Sopenharmony_ci{ 197bf215546Sopenharmony_ci struct x86_reg tmpXMM = x86_make_reg(file_XMM, 1); 198bf215546Sopenharmony_ci struct x86_reg tmp = p->tmp_EAX; 199bf215546Sopenharmony_ci switch (size) { 200bf215546Sopenharmony_ci case 1: 201bf215546Sopenharmony_ci x86_movzx8(p->func, tmp, src); 202bf215546Sopenharmony_ci sse2_movd(p->func, data, tmp); 203bf215546Sopenharmony_ci break; 204bf215546Sopenharmony_ci case 2: 205bf215546Sopenharmony_ci x86_movzx16(p->func, tmp, src); 206bf215546Sopenharmony_ci sse2_movd(p->func, data, tmp); 207bf215546Sopenharmony_ci break; 208bf215546Sopenharmony_ci case 3: 209bf215546Sopenharmony_ci x86_movzx8(p->func, tmp, x86_make_disp(src, 2)); 210bf215546Sopenharmony_ci x86_shl_imm(p->func, tmp, 16); 211bf215546Sopenharmony_ci x86_mov16(p->func, tmp, src); 212bf215546Sopenharmony_ci sse2_movd(p->func, data, tmp); 213bf215546Sopenharmony_ci break; 214bf215546Sopenharmony_ci case 4: 215bf215546Sopenharmony_ci sse2_movd(p->func, data, src); 216bf215546Sopenharmony_ci break; 217bf215546Sopenharmony_ci case 6: 218bf215546Sopenharmony_ci sse2_movd(p->func, data, src); 219bf215546Sopenharmony_ci x86_movzx16(p->func, tmp, x86_make_disp(src, 4)); 220bf215546Sopenharmony_ci sse2_movd(p->func, tmpXMM, tmp); 221bf215546Sopenharmony_ci sse2_punpckldq(p->func, data, tmpXMM); 222bf215546Sopenharmony_ci break; 223bf215546Sopenharmony_ci case 8: 224bf215546Sopenharmony_ci sse2_movq(p->func, data, src); 225bf215546Sopenharmony_ci break; 226bf215546Sopenharmony_ci case 12: 227bf215546Sopenharmony_ci sse2_movq(p->func, data, src); 228bf215546Sopenharmony_ci sse2_movd(p->func, tmpXMM, x86_make_disp(src, 8)); 229bf215546Sopenharmony_ci sse2_punpcklqdq(p->func, data, tmpXMM); 230bf215546Sopenharmony_ci break; 231bf215546Sopenharmony_ci case 16: 232bf215546Sopenharmony_ci sse2_movdqu(p->func, data, src); 233bf215546Sopenharmony_ci break; 234bf215546Sopenharmony_ci default: 235bf215546Sopenharmony_ci return FALSE; 236bf215546Sopenharmony_ci } 237bf215546Sopenharmony_ci return TRUE; 238bf215546Sopenharmony_ci} 239bf215546Sopenharmony_ci 240bf215546Sopenharmony_ci 241bf215546Sopenharmony_ci/* this value can be passed for the out_chans argument */ 242bf215546Sopenharmony_ci#define CHANNELS_0001 5 243bf215546Sopenharmony_ci 244bf215546Sopenharmony_ci 245bf215546Sopenharmony_ci/* this function will load #chans float values, and will 246bf215546Sopenharmony_ci * pad the register with zeroes at least up to out_chans. 247bf215546Sopenharmony_ci * 248bf215546Sopenharmony_ci * If out_chans is set to CHANNELS_0001, then the fourth 249bf215546Sopenharmony_ci * value will be padded with 1. Only pass this value if 250bf215546Sopenharmony_ci * chans < 4 or results are undefined. 251bf215546Sopenharmony_ci */ 252bf215546Sopenharmony_cistatic void 253bf215546Sopenharmony_ciemit_load_float32(struct translate_sse *p, struct x86_reg data, 254bf215546Sopenharmony_ci struct x86_reg arg0, unsigned out_chans, unsigned chans) 255bf215546Sopenharmony_ci{ 256bf215546Sopenharmony_ci switch (chans) { 257bf215546Sopenharmony_ci case 1: 258bf215546Sopenharmony_ci /* a 0 0 0 259bf215546Sopenharmony_ci * a 0 0 1 260bf215546Sopenharmony_ci */ 261bf215546Sopenharmony_ci sse_movss(p->func, data, arg0); 262bf215546Sopenharmony_ci if (out_chans == CHANNELS_0001) 263bf215546Sopenharmony_ci sse_orps(p->func, data, get_const(p, CONST_IDENTITY)); 264bf215546Sopenharmony_ci break; 265bf215546Sopenharmony_ci case 2: 266bf215546Sopenharmony_ci /* 0 0 0 1 267bf215546Sopenharmony_ci * a b 0 1 268bf215546Sopenharmony_ci */ 269bf215546Sopenharmony_ci if (out_chans == CHANNELS_0001) 270bf215546Sopenharmony_ci sse_shufps(p->func, data, get_const(p, CONST_IDENTITY), 271bf215546Sopenharmony_ci SHUF(X, Y, Z, W)); 272bf215546Sopenharmony_ci else if (out_chans > 2) 273bf215546Sopenharmony_ci sse_movlhps(p->func, data, get_const(p, CONST_IDENTITY)); 274bf215546Sopenharmony_ci sse_movlps(p->func, data, arg0); 275bf215546Sopenharmony_ci break; 276bf215546Sopenharmony_ci case 3: 277bf215546Sopenharmony_ci /* Have to jump through some hoops: 278bf215546Sopenharmony_ci * 279bf215546Sopenharmony_ci * c 0 0 0 280bf215546Sopenharmony_ci * c 0 0 1 if out_chans == CHANNELS_0001 281bf215546Sopenharmony_ci * 0 0 c 0/1 282bf215546Sopenharmony_ci * a b c 0/1 283bf215546Sopenharmony_ci */ 284bf215546Sopenharmony_ci sse_movss(p->func, data, x86_make_disp(arg0, 8)); 285bf215546Sopenharmony_ci if (out_chans == CHANNELS_0001) 286bf215546Sopenharmony_ci sse_shufps(p->func, data, get_const(p, CONST_IDENTITY), 287bf215546Sopenharmony_ci SHUF(X, Y, Z, W)); 288bf215546Sopenharmony_ci sse_shufps(p->func, data, data, SHUF(Y, Z, X, W)); 289bf215546Sopenharmony_ci sse_movlps(p->func, data, arg0); 290bf215546Sopenharmony_ci break; 291bf215546Sopenharmony_ci case 4: 292bf215546Sopenharmony_ci sse_movups(p->func, data, arg0); 293bf215546Sopenharmony_ci break; 294bf215546Sopenharmony_ci } 295bf215546Sopenharmony_ci} 296bf215546Sopenharmony_ci 297bf215546Sopenharmony_ci/* this function behaves like emit_load_float32, but loads 298bf215546Sopenharmony_ci 64-bit floating point numbers, converting them to 32-bit 299bf215546Sopenharmony_ci ones */ 300bf215546Sopenharmony_cistatic void 301bf215546Sopenharmony_ciemit_load_float64to32(struct translate_sse *p, struct x86_reg data, 302bf215546Sopenharmony_ci struct x86_reg arg0, unsigned out_chans, unsigned chans) 303bf215546Sopenharmony_ci{ 304bf215546Sopenharmony_ci struct x86_reg tmpXMM = x86_make_reg(file_XMM, 1); 305bf215546Sopenharmony_ci switch (chans) { 306bf215546Sopenharmony_ci case 1: 307bf215546Sopenharmony_ci sse2_movsd(p->func, data, arg0); 308bf215546Sopenharmony_ci if (out_chans > 1) 309bf215546Sopenharmony_ci sse2_cvtpd2ps(p->func, data, data); 310bf215546Sopenharmony_ci else 311bf215546Sopenharmony_ci sse2_cvtsd2ss(p->func, data, data); 312bf215546Sopenharmony_ci if (out_chans == CHANNELS_0001) 313bf215546Sopenharmony_ci sse_shufps(p->func, data, get_const(p, CONST_IDENTITY), 314bf215546Sopenharmony_ci SHUF(X, Y, Z, W)); 315bf215546Sopenharmony_ci break; 316bf215546Sopenharmony_ci case 2: 317bf215546Sopenharmony_ci sse2_movupd(p->func, data, arg0); 318bf215546Sopenharmony_ci sse2_cvtpd2ps(p->func, data, data); 319bf215546Sopenharmony_ci if (out_chans == CHANNELS_0001) 320bf215546Sopenharmony_ci sse_shufps(p->func, data, get_const(p, CONST_IDENTITY), 321bf215546Sopenharmony_ci SHUF(X, Y, Z, W)); 322bf215546Sopenharmony_ci else if (out_chans > 2) 323bf215546Sopenharmony_ci sse_movlhps(p->func, data, get_const(p, CONST_IDENTITY)); 324bf215546Sopenharmony_ci break; 325bf215546Sopenharmony_ci case 3: 326bf215546Sopenharmony_ci sse2_movupd(p->func, data, arg0); 327bf215546Sopenharmony_ci sse2_cvtpd2ps(p->func, data, data); 328bf215546Sopenharmony_ci sse2_movsd(p->func, tmpXMM, x86_make_disp(arg0, 16)); 329bf215546Sopenharmony_ci if (out_chans > 3) 330bf215546Sopenharmony_ci sse2_cvtpd2ps(p->func, tmpXMM, tmpXMM); 331bf215546Sopenharmony_ci else 332bf215546Sopenharmony_ci sse2_cvtsd2ss(p->func, tmpXMM, tmpXMM); 333bf215546Sopenharmony_ci sse_movlhps(p->func, data, tmpXMM); 334bf215546Sopenharmony_ci if (out_chans == CHANNELS_0001) 335bf215546Sopenharmony_ci sse_orps(p->func, data, get_const(p, CONST_IDENTITY)); 336bf215546Sopenharmony_ci break; 337bf215546Sopenharmony_ci case 4: 338bf215546Sopenharmony_ci sse2_movupd(p->func, data, arg0); 339bf215546Sopenharmony_ci sse2_cvtpd2ps(p->func, data, data); 340bf215546Sopenharmony_ci sse2_movupd(p->func, tmpXMM, x86_make_disp(arg0, 16)); 341bf215546Sopenharmony_ci sse2_cvtpd2ps(p->func, tmpXMM, tmpXMM); 342bf215546Sopenharmony_ci sse_movlhps(p->func, data, tmpXMM); 343bf215546Sopenharmony_ci break; 344bf215546Sopenharmony_ci } 345bf215546Sopenharmony_ci} 346bf215546Sopenharmony_ci 347bf215546Sopenharmony_ci 348bf215546Sopenharmony_cistatic void 349bf215546Sopenharmony_ciemit_mov64(struct translate_sse *p, struct x86_reg dst_gpr, 350bf215546Sopenharmony_ci struct x86_reg dst_xmm, struct x86_reg src_gpr, 351bf215546Sopenharmony_ci struct x86_reg src_xmm) 352bf215546Sopenharmony_ci{ 353bf215546Sopenharmony_ci if (x86_target(p->func) != X86_32) 354bf215546Sopenharmony_ci x64_mov64(p->func, dst_gpr, src_gpr); 355bf215546Sopenharmony_ci else { 356bf215546Sopenharmony_ci /* TODO: when/on which CPUs is SSE2 actually better than SSE? */ 357bf215546Sopenharmony_ci if (x86_target_caps(p->func) & X86_SSE2) 358bf215546Sopenharmony_ci sse2_movq(p->func, dst_xmm, src_xmm); 359bf215546Sopenharmony_ci else 360bf215546Sopenharmony_ci sse_movlps(p->func, dst_xmm, src_xmm); 361bf215546Sopenharmony_ci } 362bf215546Sopenharmony_ci} 363bf215546Sopenharmony_ci 364bf215546Sopenharmony_ci 365bf215546Sopenharmony_cistatic void 366bf215546Sopenharmony_ciemit_load64(struct translate_sse *p, struct x86_reg dst_gpr, 367bf215546Sopenharmony_ci struct x86_reg dst_xmm, struct x86_reg src) 368bf215546Sopenharmony_ci{ 369bf215546Sopenharmony_ci emit_mov64(p, dst_gpr, dst_xmm, src, src); 370bf215546Sopenharmony_ci} 371bf215546Sopenharmony_ci 372bf215546Sopenharmony_ci 373bf215546Sopenharmony_cistatic void 374bf215546Sopenharmony_ciemit_store64(struct translate_sse *p, struct x86_reg dst, 375bf215546Sopenharmony_ci struct x86_reg src_gpr, struct x86_reg src_xmm) 376bf215546Sopenharmony_ci{ 377bf215546Sopenharmony_ci emit_mov64(p, dst, dst, src_gpr, src_xmm); 378bf215546Sopenharmony_ci} 379bf215546Sopenharmony_ci 380bf215546Sopenharmony_ci 381bf215546Sopenharmony_cistatic void 382bf215546Sopenharmony_ciemit_mov128(struct translate_sse *p, struct x86_reg dst, struct x86_reg src) 383bf215546Sopenharmony_ci{ 384bf215546Sopenharmony_ci if (x86_target_caps(p->func) & X86_SSE2) 385bf215546Sopenharmony_ci sse2_movdqu(p->func, dst, src); 386bf215546Sopenharmony_ci else 387bf215546Sopenharmony_ci sse_movups(p->func, dst, src); 388bf215546Sopenharmony_ci} 389bf215546Sopenharmony_ci 390bf215546Sopenharmony_ci 391bf215546Sopenharmony_ci/* TODO: this uses unaligned accesses liberally, which is great on Nehalem, 392bf215546Sopenharmony_ci * but may or may not be good on older processors 393bf215546Sopenharmony_ci * TODO: may perhaps want to use non-temporal stores here if possible 394bf215546Sopenharmony_ci */ 395bf215546Sopenharmony_cistatic void 396bf215546Sopenharmony_ciemit_memcpy(struct translate_sse *p, struct x86_reg dst, struct x86_reg src, 397bf215546Sopenharmony_ci unsigned size) 398bf215546Sopenharmony_ci{ 399bf215546Sopenharmony_ci struct x86_reg dataXMM = x86_make_reg(file_XMM, 0); 400bf215546Sopenharmony_ci struct x86_reg dataXMM2 = x86_make_reg(file_XMM, 1); 401bf215546Sopenharmony_ci struct x86_reg dataGPR = p->tmp_EAX; 402bf215546Sopenharmony_ci struct x86_reg dataGPR2 = p->tmp2_EDX; 403bf215546Sopenharmony_ci 404bf215546Sopenharmony_ci if (size < 8) { 405bf215546Sopenharmony_ci switch (size) { 406bf215546Sopenharmony_ci case 1: 407bf215546Sopenharmony_ci x86_mov8(p->func, dataGPR, src); 408bf215546Sopenharmony_ci x86_mov8(p->func, dst, dataGPR); 409bf215546Sopenharmony_ci break; 410bf215546Sopenharmony_ci case 2: 411bf215546Sopenharmony_ci x86_mov16(p->func, dataGPR, src); 412bf215546Sopenharmony_ci x86_mov16(p->func, dst, dataGPR); 413bf215546Sopenharmony_ci break; 414bf215546Sopenharmony_ci case 3: 415bf215546Sopenharmony_ci x86_mov16(p->func, dataGPR, src); 416bf215546Sopenharmony_ci x86_mov8(p->func, dataGPR2, x86_make_disp(src, 2)); 417bf215546Sopenharmony_ci x86_mov16(p->func, dst, dataGPR); 418bf215546Sopenharmony_ci x86_mov8(p->func, x86_make_disp(dst, 2), dataGPR2); 419bf215546Sopenharmony_ci break; 420bf215546Sopenharmony_ci case 4: 421bf215546Sopenharmony_ci x86_mov(p->func, dataGPR, src); 422bf215546Sopenharmony_ci x86_mov(p->func, dst, dataGPR); 423bf215546Sopenharmony_ci break; 424bf215546Sopenharmony_ci case 6: 425bf215546Sopenharmony_ci x86_mov(p->func, dataGPR, src); 426bf215546Sopenharmony_ci x86_mov16(p->func, dataGPR2, x86_make_disp(src, 4)); 427bf215546Sopenharmony_ci x86_mov(p->func, dst, dataGPR); 428bf215546Sopenharmony_ci x86_mov16(p->func, x86_make_disp(dst, 4), dataGPR2); 429bf215546Sopenharmony_ci break; 430bf215546Sopenharmony_ci } 431bf215546Sopenharmony_ci } 432bf215546Sopenharmony_ci else if (!(x86_target_caps(p->func) & X86_SSE)) { 433bf215546Sopenharmony_ci unsigned i = 0; 434bf215546Sopenharmony_ci assert((size & 3) == 0); 435bf215546Sopenharmony_ci for (i = 0; i < size; i += 4) { 436bf215546Sopenharmony_ci x86_mov(p->func, dataGPR, x86_make_disp(src, i)); 437bf215546Sopenharmony_ci x86_mov(p->func, x86_make_disp(dst, i), dataGPR); 438bf215546Sopenharmony_ci } 439bf215546Sopenharmony_ci } 440bf215546Sopenharmony_ci else { 441bf215546Sopenharmony_ci switch (size) { 442bf215546Sopenharmony_ci case 8: 443bf215546Sopenharmony_ci emit_load64(p, dataGPR, dataXMM, src); 444bf215546Sopenharmony_ci emit_store64(p, dst, dataGPR, dataXMM); 445bf215546Sopenharmony_ci break; 446bf215546Sopenharmony_ci case 12: 447bf215546Sopenharmony_ci emit_load64(p, dataGPR2, dataXMM, src); 448bf215546Sopenharmony_ci x86_mov(p->func, dataGPR, x86_make_disp(src, 8)); 449bf215546Sopenharmony_ci emit_store64(p, dst, dataGPR2, dataXMM); 450bf215546Sopenharmony_ci x86_mov(p->func, x86_make_disp(dst, 8), dataGPR); 451bf215546Sopenharmony_ci break; 452bf215546Sopenharmony_ci case 16: 453bf215546Sopenharmony_ci emit_mov128(p, dataXMM, src); 454bf215546Sopenharmony_ci emit_mov128(p, dst, dataXMM); 455bf215546Sopenharmony_ci break; 456bf215546Sopenharmony_ci case 24: 457bf215546Sopenharmony_ci emit_mov128(p, dataXMM, src); 458bf215546Sopenharmony_ci emit_load64(p, dataGPR, dataXMM2, x86_make_disp(src, 16)); 459bf215546Sopenharmony_ci emit_mov128(p, dst, dataXMM); 460bf215546Sopenharmony_ci emit_store64(p, x86_make_disp(dst, 16), dataGPR, dataXMM2); 461bf215546Sopenharmony_ci break; 462bf215546Sopenharmony_ci case 32: 463bf215546Sopenharmony_ci emit_mov128(p, dataXMM, src); 464bf215546Sopenharmony_ci emit_mov128(p, dataXMM2, x86_make_disp(src, 16)); 465bf215546Sopenharmony_ci emit_mov128(p, dst, dataXMM); 466bf215546Sopenharmony_ci emit_mov128(p, x86_make_disp(dst, 16), dataXMM2); 467bf215546Sopenharmony_ci break; 468bf215546Sopenharmony_ci default: 469bf215546Sopenharmony_ci assert(0); 470bf215546Sopenharmony_ci } 471bf215546Sopenharmony_ci } 472bf215546Sopenharmony_ci} 473bf215546Sopenharmony_ci 474bf215546Sopenharmony_cistatic boolean 475bf215546Sopenharmony_citranslate_attr_convert(struct translate_sse *p, 476bf215546Sopenharmony_ci const struct translate_element *a, 477bf215546Sopenharmony_ci struct x86_reg src, struct x86_reg dst) 478bf215546Sopenharmony_ci{ 479bf215546Sopenharmony_ci const struct util_format_description *input_desc = 480bf215546Sopenharmony_ci util_format_description(a->input_format); 481bf215546Sopenharmony_ci const struct util_format_description *output_desc = 482bf215546Sopenharmony_ci util_format_description(a->output_format); 483bf215546Sopenharmony_ci unsigned i; 484bf215546Sopenharmony_ci boolean id_swizzle = TRUE; 485bf215546Sopenharmony_ci unsigned swizzle[4] = 486bf215546Sopenharmony_ci { PIPE_SWIZZLE_NONE, PIPE_SWIZZLE_NONE, 487bf215546Sopenharmony_ci PIPE_SWIZZLE_NONE, PIPE_SWIZZLE_NONE }; 488bf215546Sopenharmony_ci unsigned needed_chans = 0; 489bf215546Sopenharmony_ci unsigned imms[2] = { 0, 0x3f800000 }; 490bf215546Sopenharmony_ci 491bf215546Sopenharmony_ci if (a->output_format == PIPE_FORMAT_NONE 492bf215546Sopenharmony_ci || a->input_format == PIPE_FORMAT_NONE) 493bf215546Sopenharmony_ci return FALSE; 494bf215546Sopenharmony_ci 495bf215546Sopenharmony_ci if (input_desc->channel[0].size & 7) 496bf215546Sopenharmony_ci return FALSE; 497bf215546Sopenharmony_ci 498bf215546Sopenharmony_ci if (input_desc->colorspace != output_desc->colorspace) 499bf215546Sopenharmony_ci return FALSE; 500bf215546Sopenharmony_ci 501bf215546Sopenharmony_ci for (i = 1; i < input_desc->nr_channels; ++i) { 502bf215546Sopenharmony_ci if (memcmp 503bf215546Sopenharmony_ci (&input_desc->channel[i], &input_desc->channel[0], 504bf215546Sopenharmony_ci sizeof(input_desc->channel[0]))) 505bf215546Sopenharmony_ci return FALSE; 506bf215546Sopenharmony_ci } 507bf215546Sopenharmony_ci 508bf215546Sopenharmony_ci for (i = 1; i < output_desc->nr_channels; ++i) { 509bf215546Sopenharmony_ci if (memcmp 510bf215546Sopenharmony_ci (&output_desc->channel[i], &output_desc->channel[0], 511bf215546Sopenharmony_ci sizeof(output_desc->channel[0]))) { 512bf215546Sopenharmony_ci return FALSE; 513bf215546Sopenharmony_ci } 514bf215546Sopenharmony_ci } 515bf215546Sopenharmony_ci 516bf215546Sopenharmony_ci for (i = 0; i < output_desc->nr_channels; ++i) { 517bf215546Sopenharmony_ci if (output_desc->swizzle[i] < 4) 518bf215546Sopenharmony_ci swizzle[output_desc->swizzle[i]] = input_desc->swizzle[i]; 519bf215546Sopenharmony_ci } 520bf215546Sopenharmony_ci 521bf215546Sopenharmony_ci if ((x86_target_caps(p->func) & X86_SSE) && 522bf215546Sopenharmony_ci (0 || a->output_format == PIPE_FORMAT_R32_FLOAT 523bf215546Sopenharmony_ci || a->output_format == PIPE_FORMAT_R32G32_FLOAT 524bf215546Sopenharmony_ci || a->output_format == PIPE_FORMAT_R32G32B32_FLOAT 525bf215546Sopenharmony_ci || a->output_format == PIPE_FORMAT_R32G32B32A32_FLOAT)) { 526bf215546Sopenharmony_ci struct x86_reg dataXMM = x86_make_reg(file_XMM, 0); 527bf215546Sopenharmony_ci struct x86_reg auxXMM; 528bf215546Sopenharmony_ci 529bf215546Sopenharmony_ci for (i = 0; i < output_desc->nr_channels; ++i) { 530bf215546Sopenharmony_ci if (swizzle[i] == PIPE_SWIZZLE_0 531bf215546Sopenharmony_ci && i >= input_desc->nr_channels) 532bf215546Sopenharmony_ci swizzle[i] = i; 533bf215546Sopenharmony_ci } 534bf215546Sopenharmony_ci 535bf215546Sopenharmony_ci for (i = 0; i < output_desc->nr_channels; ++i) { 536bf215546Sopenharmony_ci if (swizzle[i] < 4) 537bf215546Sopenharmony_ci needed_chans = MAX2(needed_chans, swizzle[i] + 1); 538bf215546Sopenharmony_ci if (swizzle[i] < PIPE_SWIZZLE_0 && swizzle[i] != i) 539bf215546Sopenharmony_ci id_swizzle = FALSE; 540bf215546Sopenharmony_ci } 541bf215546Sopenharmony_ci 542bf215546Sopenharmony_ci if (needed_chans > 0) { 543bf215546Sopenharmony_ci switch (input_desc->channel[0].type) { 544bf215546Sopenharmony_ci case UTIL_FORMAT_TYPE_UNSIGNED: 545bf215546Sopenharmony_ci if (!(x86_target_caps(p->func) & X86_SSE2)) 546bf215546Sopenharmony_ci return FALSE; 547bf215546Sopenharmony_ci emit_load_sse2(p, dataXMM, src, 548bf215546Sopenharmony_ci input_desc->channel[0].size * 549bf215546Sopenharmony_ci input_desc->nr_channels >> 3); 550bf215546Sopenharmony_ci 551bf215546Sopenharmony_ci /* TODO: add support for SSE4.1 pmovzx */ 552bf215546Sopenharmony_ci switch (input_desc->channel[0].size) { 553bf215546Sopenharmony_ci case 8: 554bf215546Sopenharmony_ci /* TODO: this may be inefficient due to get_identity() being 555bf215546Sopenharmony_ci * used both as a float and integer register. 556bf215546Sopenharmony_ci */ 557bf215546Sopenharmony_ci sse2_punpcklbw(p->func, dataXMM, get_const(p, CONST_IDENTITY)); 558bf215546Sopenharmony_ci sse2_punpcklbw(p->func, dataXMM, get_const(p, CONST_IDENTITY)); 559bf215546Sopenharmony_ci break; 560bf215546Sopenharmony_ci case 16: 561bf215546Sopenharmony_ci sse2_punpcklwd(p->func, dataXMM, get_const(p, CONST_IDENTITY)); 562bf215546Sopenharmony_ci break; 563bf215546Sopenharmony_ci case 32: /* we lose precision here */ 564bf215546Sopenharmony_ci /* No unsigned conversion (except in AVX512F), so we check if 565bf215546Sopenharmony_ci * it's negative, and stick the high bit as a separate float 566bf215546Sopenharmony_ci * value in an aux register: */ 567bf215546Sopenharmony_ci auxXMM = x86_make_reg(file_XMM, 1); 568bf215546Sopenharmony_ci /* aux = 0 */ 569bf215546Sopenharmony_ci sse_xorps(p->func, auxXMM, auxXMM); 570bf215546Sopenharmony_ci /* aux = aux > data ? 0xffffffff : 0 */ 571bf215546Sopenharmony_ci sse2_pcmpgtd(p->func, auxXMM, dataXMM); 572bf215546Sopenharmony_ci /* data = data & 0x7fffffff */ 573bf215546Sopenharmony_ci sse_andps(p->func, dataXMM, get_const(p, CONST_2147483647_INT)); 574bf215546Sopenharmony_ci /* aux = aux & 2147483648.0 */ 575bf215546Sopenharmony_ci sse_andps(p->func, auxXMM, get_const(p, CONST_2147483648)); 576bf215546Sopenharmony_ci break; 577bf215546Sopenharmony_ci default: 578bf215546Sopenharmony_ci return FALSE; 579bf215546Sopenharmony_ci } 580bf215546Sopenharmony_ci sse2_cvtdq2ps(p->func, dataXMM, dataXMM); 581bf215546Sopenharmony_ci if (input_desc->channel[0].size == 32) 582bf215546Sopenharmony_ci /* add in the high bit's worth of float that we AND'd away */ 583bf215546Sopenharmony_ci sse_addps(p->func, dataXMM, auxXMM); 584bf215546Sopenharmony_ci if (input_desc->channel[0].normalized) { 585bf215546Sopenharmony_ci struct x86_reg factor; 586bf215546Sopenharmony_ci switch (input_desc->channel[0].size) { 587bf215546Sopenharmony_ci case 8: 588bf215546Sopenharmony_ci factor = get_const(p, CONST_INV_255); 589bf215546Sopenharmony_ci break; 590bf215546Sopenharmony_ci case 16: 591bf215546Sopenharmony_ci factor = get_const(p, CONST_INV_65535); 592bf215546Sopenharmony_ci break; 593bf215546Sopenharmony_ci case 32: 594bf215546Sopenharmony_ci factor = get_const(p, CONST_INV_4294967295); 595bf215546Sopenharmony_ci break; 596bf215546Sopenharmony_ci default: 597bf215546Sopenharmony_ci assert(0); 598bf215546Sopenharmony_ci factor.disp = 0; 599bf215546Sopenharmony_ci factor.file = 0; 600bf215546Sopenharmony_ci factor.idx = 0; 601bf215546Sopenharmony_ci factor.mod = 0; 602bf215546Sopenharmony_ci break; 603bf215546Sopenharmony_ci } 604bf215546Sopenharmony_ci sse_mulps(p->func, dataXMM, factor); 605bf215546Sopenharmony_ci } 606bf215546Sopenharmony_ci break; 607bf215546Sopenharmony_ci case UTIL_FORMAT_TYPE_SIGNED: 608bf215546Sopenharmony_ci if (!(x86_target_caps(p->func) & X86_SSE2)) 609bf215546Sopenharmony_ci return FALSE; 610bf215546Sopenharmony_ci emit_load_sse2(p, dataXMM, src, 611bf215546Sopenharmony_ci input_desc->channel[0].size * 612bf215546Sopenharmony_ci input_desc->nr_channels >> 3); 613bf215546Sopenharmony_ci 614bf215546Sopenharmony_ci /* TODO: add support for SSE4.1 pmovsx */ 615bf215546Sopenharmony_ci switch (input_desc->channel[0].size) { 616bf215546Sopenharmony_ci case 8: 617bf215546Sopenharmony_ci sse2_punpcklbw(p->func, dataXMM, dataXMM); 618bf215546Sopenharmony_ci sse2_punpcklbw(p->func, dataXMM, dataXMM); 619bf215546Sopenharmony_ci sse2_psrad_imm(p->func, dataXMM, 24); 620bf215546Sopenharmony_ci break; 621bf215546Sopenharmony_ci case 16: 622bf215546Sopenharmony_ci sse2_punpcklwd(p->func, dataXMM, dataXMM); 623bf215546Sopenharmony_ci sse2_psrad_imm(p->func, dataXMM, 16); 624bf215546Sopenharmony_ci break; 625bf215546Sopenharmony_ci case 32: /* we lose precision here */ 626bf215546Sopenharmony_ci break; 627bf215546Sopenharmony_ci default: 628bf215546Sopenharmony_ci return FALSE; 629bf215546Sopenharmony_ci } 630bf215546Sopenharmony_ci sse2_cvtdq2ps(p->func, dataXMM, dataXMM); 631bf215546Sopenharmony_ci if (input_desc->channel[0].normalized) { 632bf215546Sopenharmony_ci struct x86_reg factor; 633bf215546Sopenharmony_ci switch (input_desc->channel[0].size) { 634bf215546Sopenharmony_ci case 8: 635bf215546Sopenharmony_ci factor = get_const(p, CONST_INV_127); 636bf215546Sopenharmony_ci break; 637bf215546Sopenharmony_ci case 16: 638bf215546Sopenharmony_ci factor = get_const(p, CONST_INV_32767); 639bf215546Sopenharmony_ci break; 640bf215546Sopenharmony_ci case 32: 641bf215546Sopenharmony_ci factor = get_const(p, CONST_INV_2147483647); 642bf215546Sopenharmony_ci break; 643bf215546Sopenharmony_ci default: 644bf215546Sopenharmony_ci assert(0); 645bf215546Sopenharmony_ci factor.disp = 0; 646bf215546Sopenharmony_ci factor.file = 0; 647bf215546Sopenharmony_ci factor.idx = 0; 648bf215546Sopenharmony_ci factor.mod = 0; 649bf215546Sopenharmony_ci break; 650bf215546Sopenharmony_ci } 651bf215546Sopenharmony_ci sse_mulps(p->func, dataXMM, factor); 652bf215546Sopenharmony_ci } 653bf215546Sopenharmony_ci break; 654bf215546Sopenharmony_ci 655bf215546Sopenharmony_ci break; 656bf215546Sopenharmony_ci case UTIL_FORMAT_TYPE_FLOAT: 657bf215546Sopenharmony_ci if (input_desc->channel[0].size != 32 658bf215546Sopenharmony_ci && input_desc->channel[0].size != 64) { 659bf215546Sopenharmony_ci return FALSE; 660bf215546Sopenharmony_ci } 661bf215546Sopenharmony_ci if (swizzle[3] == PIPE_SWIZZLE_1 662bf215546Sopenharmony_ci && input_desc->nr_channels <= 3) { 663bf215546Sopenharmony_ci swizzle[3] = PIPE_SWIZZLE_W; 664bf215546Sopenharmony_ci needed_chans = CHANNELS_0001; 665bf215546Sopenharmony_ci } 666bf215546Sopenharmony_ci switch (input_desc->channel[0].size) { 667bf215546Sopenharmony_ci case 32: 668bf215546Sopenharmony_ci emit_load_float32(p, dataXMM, src, needed_chans, 669bf215546Sopenharmony_ci input_desc->nr_channels); 670bf215546Sopenharmony_ci break; 671bf215546Sopenharmony_ci case 64: /* we lose precision here */ 672bf215546Sopenharmony_ci if (!(x86_target_caps(p->func) & X86_SSE2)) 673bf215546Sopenharmony_ci return FALSE; 674bf215546Sopenharmony_ci emit_load_float64to32(p, dataXMM, src, needed_chans, 675bf215546Sopenharmony_ci input_desc->nr_channels); 676bf215546Sopenharmony_ci break; 677bf215546Sopenharmony_ci default: 678bf215546Sopenharmony_ci return FALSE; 679bf215546Sopenharmony_ci } 680bf215546Sopenharmony_ci break; 681bf215546Sopenharmony_ci default: 682bf215546Sopenharmony_ci return FALSE; 683bf215546Sopenharmony_ci } 684bf215546Sopenharmony_ci 685bf215546Sopenharmony_ci if (!id_swizzle) { 686bf215546Sopenharmony_ci sse_shufps(p->func, dataXMM, dataXMM, 687bf215546Sopenharmony_ci SHUF(swizzle[0], swizzle[1], swizzle[2], swizzle[3])); 688bf215546Sopenharmony_ci } 689bf215546Sopenharmony_ci } 690bf215546Sopenharmony_ci 691bf215546Sopenharmony_ci if (output_desc->nr_channels >= 4 692bf215546Sopenharmony_ci && swizzle[0] < PIPE_SWIZZLE_0 693bf215546Sopenharmony_ci && swizzle[1] < PIPE_SWIZZLE_0 694bf215546Sopenharmony_ci && swizzle[2] < PIPE_SWIZZLE_0 695bf215546Sopenharmony_ci && swizzle[3] < PIPE_SWIZZLE_0) { 696bf215546Sopenharmony_ci sse_movups(p->func, dst, dataXMM); 697bf215546Sopenharmony_ci } 698bf215546Sopenharmony_ci else { 699bf215546Sopenharmony_ci if (output_desc->nr_channels >= 2 700bf215546Sopenharmony_ci && swizzle[0] < PIPE_SWIZZLE_0 701bf215546Sopenharmony_ci && swizzle[1] < PIPE_SWIZZLE_0) { 702bf215546Sopenharmony_ci sse_movlps(p->func, dst, dataXMM); 703bf215546Sopenharmony_ci } 704bf215546Sopenharmony_ci else { 705bf215546Sopenharmony_ci if (swizzle[0] < PIPE_SWIZZLE_0) { 706bf215546Sopenharmony_ci sse_movss(p->func, dst, dataXMM); 707bf215546Sopenharmony_ci } 708bf215546Sopenharmony_ci else { 709bf215546Sopenharmony_ci x86_mov_imm(p->func, dst, 710bf215546Sopenharmony_ci imms[swizzle[0] - PIPE_SWIZZLE_0]); 711bf215546Sopenharmony_ci } 712bf215546Sopenharmony_ci 713bf215546Sopenharmony_ci if (output_desc->nr_channels >= 2) { 714bf215546Sopenharmony_ci if (swizzle[1] < PIPE_SWIZZLE_0) { 715bf215546Sopenharmony_ci sse_shufps(p->func, dataXMM, dataXMM, SHUF(1, 1, 2, 3)); 716bf215546Sopenharmony_ci sse_movss(p->func, x86_make_disp(dst, 4), dataXMM); 717bf215546Sopenharmony_ci } 718bf215546Sopenharmony_ci else { 719bf215546Sopenharmony_ci x86_mov_imm(p->func, x86_make_disp(dst, 4), 720bf215546Sopenharmony_ci imms[swizzle[1] - PIPE_SWIZZLE_0]); 721bf215546Sopenharmony_ci } 722bf215546Sopenharmony_ci } 723bf215546Sopenharmony_ci } 724bf215546Sopenharmony_ci 725bf215546Sopenharmony_ci if (output_desc->nr_channels >= 3) { 726bf215546Sopenharmony_ci if (output_desc->nr_channels >= 4 727bf215546Sopenharmony_ci && swizzle[2] < PIPE_SWIZZLE_0 728bf215546Sopenharmony_ci && swizzle[3] < PIPE_SWIZZLE_0) { 729bf215546Sopenharmony_ci sse_movhps(p->func, x86_make_disp(dst, 8), dataXMM); 730bf215546Sopenharmony_ci } 731bf215546Sopenharmony_ci else { 732bf215546Sopenharmony_ci if (swizzle[2] < PIPE_SWIZZLE_0) { 733bf215546Sopenharmony_ci sse_shufps(p->func, dataXMM, dataXMM, SHUF(2, 2, 2, 3)); 734bf215546Sopenharmony_ci sse_movss(p->func, x86_make_disp(dst, 8), dataXMM); 735bf215546Sopenharmony_ci } 736bf215546Sopenharmony_ci else { 737bf215546Sopenharmony_ci x86_mov_imm(p->func, x86_make_disp(dst, 8), 738bf215546Sopenharmony_ci imms[swizzle[2] - PIPE_SWIZZLE_0]); 739bf215546Sopenharmony_ci } 740bf215546Sopenharmony_ci 741bf215546Sopenharmony_ci if (output_desc->nr_channels >= 4) { 742bf215546Sopenharmony_ci if (swizzle[3] < PIPE_SWIZZLE_0) { 743bf215546Sopenharmony_ci sse_shufps(p->func, dataXMM, dataXMM, SHUF(3, 3, 3, 3)); 744bf215546Sopenharmony_ci sse_movss(p->func, x86_make_disp(dst, 12), dataXMM); 745bf215546Sopenharmony_ci } 746bf215546Sopenharmony_ci else { 747bf215546Sopenharmony_ci x86_mov_imm(p->func, x86_make_disp(dst, 12), 748bf215546Sopenharmony_ci imms[swizzle[3] - PIPE_SWIZZLE_0]); 749bf215546Sopenharmony_ci } 750bf215546Sopenharmony_ci } 751bf215546Sopenharmony_ci } 752bf215546Sopenharmony_ci } 753bf215546Sopenharmony_ci } 754bf215546Sopenharmony_ci return TRUE; 755bf215546Sopenharmony_ci } 756bf215546Sopenharmony_ci else if ((x86_target_caps(p->func) & X86_SSE2) 757bf215546Sopenharmony_ci && input_desc->channel[0].size == 8 758bf215546Sopenharmony_ci && output_desc->channel[0].size == 16 759bf215546Sopenharmony_ci && output_desc->channel[0].normalized == 760bf215546Sopenharmony_ci input_desc->channel[0].normalized && 761bf215546Sopenharmony_ci (0 || (input_desc->channel[0].type == UTIL_FORMAT_TYPE_UNSIGNED 762bf215546Sopenharmony_ci && output_desc->channel[0].type == UTIL_FORMAT_TYPE_UNSIGNED) 763bf215546Sopenharmony_ci || (input_desc->channel[0].type == UTIL_FORMAT_TYPE_UNSIGNED 764bf215546Sopenharmony_ci && output_desc->channel[0].type == UTIL_FORMAT_TYPE_SIGNED) 765bf215546Sopenharmony_ci || (input_desc->channel[0].type == UTIL_FORMAT_TYPE_SIGNED 766bf215546Sopenharmony_ci && output_desc->channel[0].type == UTIL_FORMAT_TYPE_SIGNED))) { 767bf215546Sopenharmony_ci struct x86_reg dataXMM = x86_make_reg(file_XMM, 0); 768bf215546Sopenharmony_ci struct x86_reg tmpXMM = x86_make_reg(file_XMM, 1); 769bf215546Sopenharmony_ci struct x86_reg tmp = p->tmp_EAX; 770bf215546Sopenharmony_ci unsigned imms[2] = { 0, 1 }; 771bf215546Sopenharmony_ci 772bf215546Sopenharmony_ci for (i = 0; i < output_desc->nr_channels; ++i) { 773bf215546Sopenharmony_ci if (swizzle[i] == PIPE_SWIZZLE_0 774bf215546Sopenharmony_ci && i >= input_desc->nr_channels) { 775bf215546Sopenharmony_ci swizzle[i] = i; 776bf215546Sopenharmony_ci } 777bf215546Sopenharmony_ci } 778bf215546Sopenharmony_ci 779bf215546Sopenharmony_ci for (i = 0; i < output_desc->nr_channels; ++i) { 780bf215546Sopenharmony_ci if (swizzle[i] < 4) 781bf215546Sopenharmony_ci needed_chans = MAX2(needed_chans, swizzle[i] + 1); 782bf215546Sopenharmony_ci if (swizzle[i] < PIPE_SWIZZLE_0 && swizzle[i] != i) 783bf215546Sopenharmony_ci id_swizzle = FALSE; 784bf215546Sopenharmony_ci } 785bf215546Sopenharmony_ci 786bf215546Sopenharmony_ci if (needed_chans > 0) { 787bf215546Sopenharmony_ci emit_load_sse2(p, dataXMM, src, 788bf215546Sopenharmony_ci input_desc->channel[0].size * 789bf215546Sopenharmony_ci input_desc->nr_channels >> 3); 790bf215546Sopenharmony_ci 791bf215546Sopenharmony_ci switch (input_desc->channel[0].type) { 792bf215546Sopenharmony_ci case UTIL_FORMAT_TYPE_UNSIGNED: 793bf215546Sopenharmony_ci if (input_desc->channel[0].normalized) { 794bf215546Sopenharmony_ci sse2_punpcklbw(p->func, dataXMM, dataXMM); 795bf215546Sopenharmony_ci if (output_desc->channel[0].type == UTIL_FORMAT_TYPE_SIGNED) 796bf215546Sopenharmony_ci sse2_psrlw_imm(p->func, dataXMM, 1); 797bf215546Sopenharmony_ci } 798bf215546Sopenharmony_ci else 799bf215546Sopenharmony_ci sse2_punpcklbw(p->func, dataXMM, get_const(p, CONST_IDENTITY)); 800bf215546Sopenharmony_ci break; 801bf215546Sopenharmony_ci case UTIL_FORMAT_TYPE_SIGNED: 802bf215546Sopenharmony_ci if (input_desc->channel[0].normalized) { 803bf215546Sopenharmony_ci sse2_movq(p->func, tmpXMM, get_const(p, CONST_IDENTITY)); 804bf215546Sopenharmony_ci sse2_punpcklbw(p->func, tmpXMM, dataXMM); 805bf215546Sopenharmony_ci sse2_psllw_imm(p->func, dataXMM, 9); 806bf215546Sopenharmony_ci sse2_psrlw_imm(p->func, dataXMM, 8); 807bf215546Sopenharmony_ci sse2_por(p->func, tmpXMM, dataXMM); 808bf215546Sopenharmony_ci sse2_psrlw_imm(p->func, dataXMM, 7); 809bf215546Sopenharmony_ci sse2_por(p->func, tmpXMM, dataXMM); 810bf215546Sopenharmony_ci { 811bf215546Sopenharmony_ci struct x86_reg t = dataXMM; 812bf215546Sopenharmony_ci dataXMM = tmpXMM; 813bf215546Sopenharmony_ci tmpXMM = t; 814bf215546Sopenharmony_ci } 815bf215546Sopenharmony_ci } 816bf215546Sopenharmony_ci else { 817bf215546Sopenharmony_ci sse2_punpcklbw(p->func, dataXMM, dataXMM); 818bf215546Sopenharmony_ci sse2_psraw_imm(p->func, dataXMM, 8); 819bf215546Sopenharmony_ci } 820bf215546Sopenharmony_ci break; 821bf215546Sopenharmony_ci default: 822bf215546Sopenharmony_ci assert(0); 823bf215546Sopenharmony_ci } 824bf215546Sopenharmony_ci 825bf215546Sopenharmony_ci if (output_desc->channel[0].normalized) 826bf215546Sopenharmony_ci imms[1] = 827bf215546Sopenharmony_ci (output_desc->channel[0].type == 828bf215546Sopenharmony_ci UTIL_FORMAT_TYPE_UNSIGNED) ? 0xffff : 0x7ffff; 829bf215546Sopenharmony_ci 830bf215546Sopenharmony_ci if (!id_swizzle) 831bf215546Sopenharmony_ci sse2_pshuflw(p->func, dataXMM, dataXMM, 832bf215546Sopenharmony_ci (swizzle[0] & 3) | ((swizzle[1] & 3) << 2) | 833bf215546Sopenharmony_ci ((swizzle[2] & 3) << 4) | ((swizzle[3] & 3) << 6)); 834bf215546Sopenharmony_ci } 835bf215546Sopenharmony_ci 836bf215546Sopenharmony_ci if (output_desc->nr_channels >= 4 837bf215546Sopenharmony_ci && swizzle[0] < PIPE_SWIZZLE_0 838bf215546Sopenharmony_ci && swizzle[1] < PIPE_SWIZZLE_0 839bf215546Sopenharmony_ci && swizzle[2] < PIPE_SWIZZLE_0 840bf215546Sopenharmony_ci && swizzle[3] < PIPE_SWIZZLE_0) { 841bf215546Sopenharmony_ci sse2_movq(p->func, dst, dataXMM); 842bf215546Sopenharmony_ci } 843bf215546Sopenharmony_ci else { 844bf215546Sopenharmony_ci if (swizzle[0] < PIPE_SWIZZLE_0) { 845bf215546Sopenharmony_ci if (output_desc->nr_channels >= 2 846bf215546Sopenharmony_ci && swizzle[1] < PIPE_SWIZZLE_0) { 847bf215546Sopenharmony_ci sse2_movd(p->func, dst, dataXMM); 848bf215546Sopenharmony_ci } 849bf215546Sopenharmony_ci else { 850bf215546Sopenharmony_ci sse2_movd(p->func, tmp, dataXMM); 851bf215546Sopenharmony_ci x86_mov16(p->func, dst, tmp); 852bf215546Sopenharmony_ci if (output_desc->nr_channels >= 2) 853bf215546Sopenharmony_ci x86_mov16_imm(p->func, x86_make_disp(dst, 2), 854bf215546Sopenharmony_ci imms[swizzle[1] - PIPE_SWIZZLE_0]); 855bf215546Sopenharmony_ci } 856bf215546Sopenharmony_ci } 857bf215546Sopenharmony_ci else { 858bf215546Sopenharmony_ci if (output_desc->nr_channels >= 2 859bf215546Sopenharmony_ci && swizzle[1] >= PIPE_SWIZZLE_0) { 860bf215546Sopenharmony_ci x86_mov_imm(p->func, dst, 861bf215546Sopenharmony_ci (imms[swizzle[1] - PIPE_SWIZZLE_0] << 16) | 862bf215546Sopenharmony_ci imms[swizzle[0] - PIPE_SWIZZLE_0]); 863bf215546Sopenharmony_ci } 864bf215546Sopenharmony_ci else { 865bf215546Sopenharmony_ci x86_mov16_imm(p->func, dst, 866bf215546Sopenharmony_ci imms[swizzle[0] - PIPE_SWIZZLE_0]); 867bf215546Sopenharmony_ci if (output_desc->nr_channels >= 2) { 868bf215546Sopenharmony_ci sse2_movd(p->func, tmp, dataXMM); 869bf215546Sopenharmony_ci x86_shr_imm(p->func, tmp, 16); 870bf215546Sopenharmony_ci x86_mov16(p->func, x86_make_disp(dst, 2), tmp); 871bf215546Sopenharmony_ci } 872bf215546Sopenharmony_ci } 873bf215546Sopenharmony_ci } 874bf215546Sopenharmony_ci 875bf215546Sopenharmony_ci if (output_desc->nr_channels >= 3) { 876bf215546Sopenharmony_ci if (swizzle[2] < PIPE_SWIZZLE_0) { 877bf215546Sopenharmony_ci if (output_desc->nr_channels >= 4 878bf215546Sopenharmony_ci && swizzle[3] < PIPE_SWIZZLE_0) { 879bf215546Sopenharmony_ci sse2_psrlq_imm(p->func, dataXMM, 32); 880bf215546Sopenharmony_ci sse2_movd(p->func, x86_make_disp(dst, 4), dataXMM); 881bf215546Sopenharmony_ci } 882bf215546Sopenharmony_ci else { 883bf215546Sopenharmony_ci sse2_psrlq_imm(p->func, dataXMM, 32); 884bf215546Sopenharmony_ci sse2_movd(p->func, tmp, dataXMM); 885bf215546Sopenharmony_ci x86_mov16(p->func, x86_make_disp(dst, 4), tmp); 886bf215546Sopenharmony_ci if (output_desc->nr_channels >= 4) { 887bf215546Sopenharmony_ci x86_mov16_imm(p->func, x86_make_disp(dst, 6), 888bf215546Sopenharmony_ci imms[swizzle[3] - PIPE_SWIZZLE_0]); 889bf215546Sopenharmony_ci } 890bf215546Sopenharmony_ci } 891bf215546Sopenharmony_ci } 892bf215546Sopenharmony_ci else { 893bf215546Sopenharmony_ci if (output_desc->nr_channels >= 4 894bf215546Sopenharmony_ci && swizzle[3] >= PIPE_SWIZZLE_0) { 895bf215546Sopenharmony_ci x86_mov_imm(p->func, x86_make_disp(dst, 4), 896bf215546Sopenharmony_ci (imms[swizzle[3] - PIPE_SWIZZLE_0] << 16) 897bf215546Sopenharmony_ci | imms[swizzle[2] - PIPE_SWIZZLE_0]); 898bf215546Sopenharmony_ci } 899bf215546Sopenharmony_ci else { 900bf215546Sopenharmony_ci x86_mov16_imm(p->func, x86_make_disp(dst, 4), 901bf215546Sopenharmony_ci imms[swizzle[2] - PIPE_SWIZZLE_0]); 902bf215546Sopenharmony_ci 903bf215546Sopenharmony_ci if (output_desc->nr_channels >= 4) { 904bf215546Sopenharmony_ci sse2_psrlq_imm(p->func, dataXMM, 48); 905bf215546Sopenharmony_ci sse2_movd(p->func, tmp, dataXMM); 906bf215546Sopenharmony_ci x86_mov16(p->func, x86_make_disp(dst, 6), tmp); 907bf215546Sopenharmony_ci } 908bf215546Sopenharmony_ci } 909bf215546Sopenharmony_ci } 910bf215546Sopenharmony_ci } 911bf215546Sopenharmony_ci } 912bf215546Sopenharmony_ci return TRUE; 913bf215546Sopenharmony_ci } 914bf215546Sopenharmony_ci else if (!memcmp(&output_desc->channel[0], &input_desc->channel[0], 915bf215546Sopenharmony_ci sizeof(output_desc->channel[0]))) { 916bf215546Sopenharmony_ci struct x86_reg tmp = p->tmp_EAX; 917bf215546Sopenharmony_ci unsigned i; 918bf215546Sopenharmony_ci 919bf215546Sopenharmony_ci if (input_desc->channel[0].size == 8 && input_desc->nr_channels == 4 920bf215546Sopenharmony_ci && output_desc->nr_channels == 4 921bf215546Sopenharmony_ci && swizzle[0] == PIPE_SWIZZLE_W 922bf215546Sopenharmony_ci && swizzle[1] == PIPE_SWIZZLE_Z 923bf215546Sopenharmony_ci && swizzle[2] == PIPE_SWIZZLE_Y 924bf215546Sopenharmony_ci && swizzle[3] == PIPE_SWIZZLE_X) { 925bf215546Sopenharmony_ci /* TODO: support movbe */ 926bf215546Sopenharmony_ci x86_mov(p->func, tmp, src); 927bf215546Sopenharmony_ci x86_bswap(p->func, tmp); 928bf215546Sopenharmony_ci x86_mov(p->func, dst, tmp); 929bf215546Sopenharmony_ci return TRUE; 930bf215546Sopenharmony_ci } 931bf215546Sopenharmony_ci 932bf215546Sopenharmony_ci for (i = 0; i < output_desc->nr_channels; ++i) { 933bf215546Sopenharmony_ci switch (output_desc->channel[0].size) { 934bf215546Sopenharmony_ci case 8: 935bf215546Sopenharmony_ci if (swizzle[i] >= PIPE_SWIZZLE_0) { 936bf215546Sopenharmony_ci unsigned v = 0; 937bf215546Sopenharmony_ci if (swizzle[i] == PIPE_SWIZZLE_1) { 938bf215546Sopenharmony_ci switch (output_desc->channel[0].type) { 939bf215546Sopenharmony_ci case UTIL_FORMAT_TYPE_UNSIGNED: 940bf215546Sopenharmony_ci v = output_desc->channel[0].normalized ? 0xff : 1; 941bf215546Sopenharmony_ci break; 942bf215546Sopenharmony_ci case UTIL_FORMAT_TYPE_SIGNED: 943bf215546Sopenharmony_ci v = output_desc->channel[0].normalized ? 0x7f : 1; 944bf215546Sopenharmony_ci break; 945bf215546Sopenharmony_ci default: 946bf215546Sopenharmony_ci return FALSE; 947bf215546Sopenharmony_ci } 948bf215546Sopenharmony_ci } 949bf215546Sopenharmony_ci x86_mov8_imm(p->func, x86_make_disp(dst, i * 1), v); 950bf215546Sopenharmony_ci } 951bf215546Sopenharmony_ci else { 952bf215546Sopenharmony_ci x86_mov8(p->func, tmp, x86_make_disp(src, swizzle[i] * 1)); 953bf215546Sopenharmony_ci x86_mov8(p->func, x86_make_disp(dst, i * 1), tmp); 954bf215546Sopenharmony_ci } 955bf215546Sopenharmony_ci break; 956bf215546Sopenharmony_ci case 16: 957bf215546Sopenharmony_ci if (swizzle[i] >= PIPE_SWIZZLE_0) { 958bf215546Sopenharmony_ci unsigned v = 0; 959bf215546Sopenharmony_ci if (swizzle[i] == PIPE_SWIZZLE_1) { 960bf215546Sopenharmony_ci switch (output_desc->channel[1].type) { 961bf215546Sopenharmony_ci case UTIL_FORMAT_TYPE_UNSIGNED: 962bf215546Sopenharmony_ci v = output_desc->channel[1].normalized ? 0xffff : 1; 963bf215546Sopenharmony_ci break; 964bf215546Sopenharmony_ci case UTIL_FORMAT_TYPE_SIGNED: 965bf215546Sopenharmony_ci v = output_desc->channel[1].normalized ? 0x7fff : 1; 966bf215546Sopenharmony_ci break; 967bf215546Sopenharmony_ci case UTIL_FORMAT_TYPE_FLOAT: 968bf215546Sopenharmony_ci v = 0x3c00; 969bf215546Sopenharmony_ci break; 970bf215546Sopenharmony_ci default: 971bf215546Sopenharmony_ci return FALSE; 972bf215546Sopenharmony_ci } 973bf215546Sopenharmony_ci } 974bf215546Sopenharmony_ci x86_mov16_imm(p->func, x86_make_disp(dst, i * 2), v); 975bf215546Sopenharmony_ci } 976bf215546Sopenharmony_ci else if (swizzle[i] == PIPE_SWIZZLE_0) { 977bf215546Sopenharmony_ci x86_mov16_imm(p->func, x86_make_disp(dst, i * 2), 0); 978bf215546Sopenharmony_ci } 979bf215546Sopenharmony_ci else { 980bf215546Sopenharmony_ci x86_mov16(p->func, tmp, x86_make_disp(src, swizzle[i] * 2)); 981bf215546Sopenharmony_ci x86_mov16(p->func, x86_make_disp(dst, i * 2), tmp); 982bf215546Sopenharmony_ci } 983bf215546Sopenharmony_ci break; 984bf215546Sopenharmony_ci case 32: 985bf215546Sopenharmony_ci if (swizzle[i] >= PIPE_SWIZZLE_0) { 986bf215546Sopenharmony_ci unsigned v = 0; 987bf215546Sopenharmony_ci if (swizzle[i] == PIPE_SWIZZLE_1) { 988bf215546Sopenharmony_ci switch (output_desc->channel[1].type) { 989bf215546Sopenharmony_ci case UTIL_FORMAT_TYPE_UNSIGNED: 990bf215546Sopenharmony_ci v = output_desc->channel[1].normalized ? 0xffffffff : 1; 991bf215546Sopenharmony_ci break; 992bf215546Sopenharmony_ci case UTIL_FORMAT_TYPE_SIGNED: 993bf215546Sopenharmony_ci v = output_desc->channel[1].normalized ? 0x7fffffff : 1; 994bf215546Sopenharmony_ci break; 995bf215546Sopenharmony_ci case UTIL_FORMAT_TYPE_FLOAT: 996bf215546Sopenharmony_ci v = 0x3f800000; 997bf215546Sopenharmony_ci break; 998bf215546Sopenharmony_ci default: 999bf215546Sopenharmony_ci return FALSE; 1000bf215546Sopenharmony_ci } 1001bf215546Sopenharmony_ci } 1002bf215546Sopenharmony_ci x86_mov_imm(p->func, x86_make_disp(dst, i * 4), v); 1003bf215546Sopenharmony_ci } 1004bf215546Sopenharmony_ci else { 1005bf215546Sopenharmony_ci x86_mov(p->func, tmp, x86_make_disp(src, swizzle[i] * 4)); 1006bf215546Sopenharmony_ci x86_mov(p->func, x86_make_disp(dst, i * 4), tmp); 1007bf215546Sopenharmony_ci } 1008bf215546Sopenharmony_ci break; 1009bf215546Sopenharmony_ci case 64: 1010bf215546Sopenharmony_ci if (swizzle[i] >= PIPE_SWIZZLE_0) { 1011bf215546Sopenharmony_ci unsigned l = 0; 1012bf215546Sopenharmony_ci unsigned h = 0; 1013bf215546Sopenharmony_ci if (swizzle[i] == PIPE_SWIZZLE_1) { 1014bf215546Sopenharmony_ci switch (output_desc->channel[1].type) { 1015bf215546Sopenharmony_ci case UTIL_FORMAT_TYPE_UNSIGNED: 1016bf215546Sopenharmony_ci h = output_desc->channel[1].normalized ? 0xffffffff : 0; 1017bf215546Sopenharmony_ci l = output_desc->channel[1].normalized ? 0xffffffff : 1; 1018bf215546Sopenharmony_ci break; 1019bf215546Sopenharmony_ci case UTIL_FORMAT_TYPE_SIGNED: 1020bf215546Sopenharmony_ci h = output_desc->channel[1].normalized ? 0x7fffffff : 0; 1021bf215546Sopenharmony_ci l = output_desc->channel[1].normalized ? 0xffffffff : 1; 1022bf215546Sopenharmony_ci break; 1023bf215546Sopenharmony_ci case UTIL_FORMAT_TYPE_FLOAT: 1024bf215546Sopenharmony_ci h = 0x3ff00000; 1025bf215546Sopenharmony_ci l = 0; 1026bf215546Sopenharmony_ci break; 1027bf215546Sopenharmony_ci default: 1028bf215546Sopenharmony_ci return FALSE; 1029bf215546Sopenharmony_ci } 1030bf215546Sopenharmony_ci } 1031bf215546Sopenharmony_ci x86_mov_imm(p->func, x86_make_disp(dst, i * 8), l); 1032bf215546Sopenharmony_ci x86_mov_imm(p->func, x86_make_disp(dst, i * 8 + 4), h); 1033bf215546Sopenharmony_ci } 1034bf215546Sopenharmony_ci else { 1035bf215546Sopenharmony_ci if (x86_target_caps(p->func) & X86_SSE) { 1036bf215546Sopenharmony_ci struct x86_reg tmpXMM = x86_make_reg(file_XMM, 0); 1037bf215546Sopenharmony_ci emit_load64(p, tmp, tmpXMM, 1038bf215546Sopenharmony_ci x86_make_disp(src, swizzle[i] * 8)); 1039bf215546Sopenharmony_ci emit_store64(p, x86_make_disp(dst, i * 8), tmp, tmpXMM); 1040bf215546Sopenharmony_ci } 1041bf215546Sopenharmony_ci else { 1042bf215546Sopenharmony_ci x86_mov(p->func, tmp, x86_make_disp(src, swizzle[i] * 8)); 1043bf215546Sopenharmony_ci x86_mov(p->func, x86_make_disp(dst, i * 8), tmp); 1044bf215546Sopenharmony_ci x86_mov(p->func, tmp, 1045bf215546Sopenharmony_ci x86_make_disp(src, swizzle[i] * 8 + 4)); 1046bf215546Sopenharmony_ci x86_mov(p->func, x86_make_disp(dst, i * 8 + 4), tmp); 1047bf215546Sopenharmony_ci } 1048bf215546Sopenharmony_ci } 1049bf215546Sopenharmony_ci break; 1050bf215546Sopenharmony_ci default: 1051bf215546Sopenharmony_ci return FALSE; 1052bf215546Sopenharmony_ci } 1053bf215546Sopenharmony_ci } 1054bf215546Sopenharmony_ci return TRUE; 1055bf215546Sopenharmony_ci } 1056bf215546Sopenharmony_ci /* special case for draw's EMIT_4UB (RGBA) and EMIT_4UB_BGRA */ 1057bf215546Sopenharmony_ci else if ((x86_target_caps(p->func) & X86_SSE2) && 1058bf215546Sopenharmony_ci a->input_format == PIPE_FORMAT_R32G32B32A32_FLOAT && 1059bf215546Sopenharmony_ci (0 || a->output_format == PIPE_FORMAT_B8G8R8A8_UNORM 1060bf215546Sopenharmony_ci || a-> output_format == PIPE_FORMAT_R8G8B8A8_UNORM)) { 1061bf215546Sopenharmony_ci struct x86_reg dataXMM = x86_make_reg(file_XMM, 0); 1062bf215546Sopenharmony_ci 1063bf215546Sopenharmony_ci /* load */ 1064bf215546Sopenharmony_ci sse_movups(p->func, dataXMM, src); 1065bf215546Sopenharmony_ci 1066bf215546Sopenharmony_ci if (a->output_format == PIPE_FORMAT_B8G8R8A8_UNORM) { 1067bf215546Sopenharmony_ci sse_shufps(p->func, dataXMM, dataXMM, SHUF(2, 1, 0, 3)); 1068bf215546Sopenharmony_ci } 1069bf215546Sopenharmony_ci 1070bf215546Sopenharmony_ci /* scale by 255.0 */ 1071bf215546Sopenharmony_ci sse_mulps(p->func, dataXMM, get_const(p, CONST_255)); 1072bf215546Sopenharmony_ci 1073bf215546Sopenharmony_ci /* pack and emit */ 1074bf215546Sopenharmony_ci sse2_cvtps2dq(p->func, dataXMM, dataXMM); 1075bf215546Sopenharmony_ci sse2_packssdw(p->func, dataXMM, dataXMM); 1076bf215546Sopenharmony_ci sse2_packuswb(p->func, dataXMM, dataXMM); 1077bf215546Sopenharmony_ci sse2_movd(p->func, dst, dataXMM); 1078bf215546Sopenharmony_ci 1079bf215546Sopenharmony_ci return TRUE; 1080bf215546Sopenharmony_ci } 1081bf215546Sopenharmony_ci 1082bf215546Sopenharmony_ci return FALSE; 1083bf215546Sopenharmony_ci} 1084bf215546Sopenharmony_ci 1085bf215546Sopenharmony_ci 1086bf215546Sopenharmony_cistatic boolean 1087bf215546Sopenharmony_citranslate_attr(struct translate_sse *p, 1088bf215546Sopenharmony_ci const struct translate_element *a, 1089bf215546Sopenharmony_ci struct x86_reg src, struct x86_reg dst) 1090bf215546Sopenharmony_ci{ 1091bf215546Sopenharmony_ci if (a->input_format == a->output_format) { 1092bf215546Sopenharmony_ci emit_memcpy(p, dst, src, util_format_get_stride(a->input_format, 1)); 1093bf215546Sopenharmony_ci return TRUE; 1094bf215546Sopenharmony_ci } 1095bf215546Sopenharmony_ci 1096bf215546Sopenharmony_ci return translate_attr_convert(p, a, src, dst); 1097bf215546Sopenharmony_ci} 1098bf215546Sopenharmony_ci 1099bf215546Sopenharmony_ci 1100bf215546Sopenharmony_cistatic boolean 1101bf215546Sopenharmony_ciinit_inputs(struct translate_sse *p, unsigned index_size) 1102bf215546Sopenharmony_ci{ 1103bf215546Sopenharmony_ci unsigned i; 1104bf215546Sopenharmony_ci struct x86_reg instance_id = 1105bf215546Sopenharmony_ci x86_make_disp(p->machine_EDI, get_offset(p, &p->instance_id)); 1106bf215546Sopenharmony_ci struct x86_reg start_instance = 1107bf215546Sopenharmony_ci x86_make_disp(p->machine_EDI, get_offset(p, &p->start_instance)); 1108bf215546Sopenharmony_ci 1109bf215546Sopenharmony_ci for (i = 0; i < p->nr_buffer_variants; i++) { 1110bf215546Sopenharmony_ci struct translate_buffer_variant *variant = &p->buffer_variant[i]; 1111bf215546Sopenharmony_ci struct translate_buffer *buffer = &p->buffer[variant->buffer_index]; 1112bf215546Sopenharmony_ci 1113bf215546Sopenharmony_ci if (!index_size || variant->instance_divisor) { 1114bf215546Sopenharmony_ci struct x86_reg buf_max_index = 1115bf215546Sopenharmony_ci x86_make_disp(p->machine_EDI, get_offset(p, &buffer->max_index)); 1116bf215546Sopenharmony_ci struct x86_reg buf_stride = 1117bf215546Sopenharmony_ci x86_make_disp(p->machine_EDI, get_offset(p, &buffer->stride)); 1118bf215546Sopenharmony_ci struct x86_reg buf_ptr = 1119bf215546Sopenharmony_ci x86_make_disp(p->machine_EDI, get_offset(p, &variant->ptr)); 1120bf215546Sopenharmony_ci struct x86_reg buf_base_ptr = 1121bf215546Sopenharmony_ci x86_make_disp(p->machine_EDI, get_offset(p, &buffer->base_ptr)); 1122bf215546Sopenharmony_ci struct x86_reg elt = p->idx_ESI; 1123bf215546Sopenharmony_ci struct x86_reg tmp_EAX = p->tmp_EAX; 1124bf215546Sopenharmony_ci 1125bf215546Sopenharmony_ci /* Calculate pointer to first attrib: 1126bf215546Sopenharmony_ci * base_ptr + stride * index, where index depends on instance divisor 1127bf215546Sopenharmony_ci */ 1128bf215546Sopenharmony_ci if (variant->instance_divisor) { 1129bf215546Sopenharmony_ci struct x86_reg tmp_EDX = p->tmp2_EDX; 1130bf215546Sopenharmony_ci 1131bf215546Sopenharmony_ci /* Start with instance = instance_id 1132bf215546Sopenharmony_ci * which is true if divisor is 1. 1133bf215546Sopenharmony_ci */ 1134bf215546Sopenharmony_ci x86_mov(p->func, tmp_EAX, instance_id); 1135bf215546Sopenharmony_ci 1136bf215546Sopenharmony_ci if (variant->instance_divisor != 1) { 1137bf215546Sopenharmony_ci struct x86_reg tmp_ECX = p->src_ECX; 1138bf215546Sopenharmony_ci 1139bf215546Sopenharmony_ci /* TODO: Add x86_shr() to rtasm and use it whenever 1140bf215546Sopenharmony_ci * instance divisor is power of two. 1141bf215546Sopenharmony_ci */ 1142bf215546Sopenharmony_ci x86_xor(p->func, tmp_EDX, tmp_EDX); 1143bf215546Sopenharmony_ci x86_mov_reg_imm(p->func, tmp_ECX, variant->instance_divisor); 1144bf215546Sopenharmony_ci x86_div(p->func, tmp_ECX); /* EAX = EDX:EAX / ECX */ 1145bf215546Sopenharmony_ci } 1146bf215546Sopenharmony_ci 1147bf215546Sopenharmony_ci /* instance = (instance_id / divisor) + start_instance 1148bf215546Sopenharmony_ci */ 1149bf215546Sopenharmony_ci x86_mov(p->func, tmp_EDX, start_instance); 1150bf215546Sopenharmony_ci x86_add(p->func, tmp_EAX, tmp_EDX); 1151bf215546Sopenharmony_ci 1152bf215546Sopenharmony_ci /* XXX we need to clamp the index here too, but to a 1153bf215546Sopenharmony_ci * per-array max value, not the draw->pt.max_index value 1154bf215546Sopenharmony_ci * that's being given to us via translate->set_buffer(). 1155bf215546Sopenharmony_ci */ 1156bf215546Sopenharmony_ci } 1157bf215546Sopenharmony_ci else { 1158bf215546Sopenharmony_ci x86_mov(p->func, tmp_EAX, elt); 1159bf215546Sopenharmony_ci 1160bf215546Sopenharmony_ci /* Clamp to max_index 1161bf215546Sopenharmony_ci */ 1162bf215546Sopenharmony_ci x86_cmp(p->func, tmp_EAX, buf_max_index); 1163bf215546Sopenharmony_ci x86_cmovcc(p->func, tmp_EAX, buf_max_index, cc_AE); 1164bf215546Sopenharmony_ci } 1165bf215546Sopenharmony_ci 1166bf215546Sopenharmony_ci x86_mov(p->func, p->tmp2_EDX, buf_stride); 1167bf215546Sopenharmony_ci x64_rexw(p->func); 1168bf215546Sopenharmony_ci x86_imul(p->func, tmp_EAX, p->tmp2_EDX); 1169bf215546Sopenharmony_ci x64_rexw(p->func); 1170bf215546Sopenharmony_ci x86_add(p->func, tmp_EAX, buf_base_ptr); 1171bf215546Sopenharmony_ci 1172bf215546Sopenharmony_ci x86_cmp(p->func, p->count_EBP, p->tmp_EAX); 1173bf215546Sopenharmony_ci 1174bf215546Sopenharmony_ci /* In the linear case, keep the buffer pointer instead of the 1175bf215546Sopenharmony_ci * index number. 1176bf215546Sopenharmony_ci */ 1177bf215546Sopenharmony_ci if (!index_size && p->nr_buffer_variants == 1) { 1178bf215546Sopenharmony_ci x64_rexw(p->func); 1179bf215546Sopenharmony_ci x86_mov(p->func, elt, tmp_EAX); 1180bf215546Sopenharmony_ci } 1181bf215546Sopenharmony_ci else { 1182bf215546Sopenharmony_ci x64_rexw(p->func); 1183bf215546Sopenharmony_ci x86_mov(p->func, buf_ptr, tmp_EAX); 1184bf215546Sopenharmony_ci } 1185bf215546Sopenharmony_ci } 1186bf215546Sopenharmony_ci } 1187bf215546Sopenharmony_ci 1188bf215546Sopenharmony_ci return TRUE; 1189bf215546Sopenharmony_ci} 1190bf215546Sopenharmony_ci 1191bf215546Sopenharmony_ci 1192bf215546Sopenharmony_cistatic struct x86_reg 1193bf215546Sopenharmony_ciget_buffer_ptr(struct translate_sse *p, 1194bf215546Sopenharmony_ci unsigned index_size, unsigned var_idx, struct x86_reg elt) 1195bf215546Sopenharmony_ci{ 1196bf215546Sopenharmony_ci if (var_idx == ELEMENT_BUFFER_INSTANCE_ID) { 1197bf215546Sopenharmony_ci return x86_make_disp(p->machine_EDI, get_offset(p, &p->instance_id)); 1198bf215546Sopenharmony_ci } 1199bf215546Sopenharmony_ci if (!index_size && p->nr_buffer_variants == 1) { 1200bf215546Sopenharmony_ci return p->idx_ESI; 1201bf215546Sopenharmony_ci } 1202bf215546Sopenharmony_ci else if (!index_size || p->buffer_variant[var_idx].instance_divisor) { 1203bf215546Sopenharmony_ci struct x86_reg ptr = p->src_ECX; 1204bf215546Sopenharmony_ci struct x86_reg buf_ptr = 1205bf215546Sopenharmony_ci x86_make_disp(p->machine_EDI, 1206bf215546Sopenharmony_ci get_offset(p, &p->buffer_variant[var_idx].ptr)); 1207bf215546Sopenharmony_ci 1208bf215546Sopenharmony_ci x64_rexw(p->func); 1209bf215546Sopenharmony_ci x86_mov(p->func, ptr, buf_ptr); 1210bf215546Sopenharmony_ci return ptr; 1211bf215546Sopenharmony_ci } 1212bf215546Sopenharmony_ci else { 1213bf215546Sopenharmony_ci struct x86_reg ptr = p->src_ECX; 1214bf215546Sopenharmony_ci const struct translate_buffer_variant *variant = 1215bf215546Sopenharmony_ci &p->buffer_variant[var_idx]; 1216bf215546Sopenharmony_ci struct x86_reg buf_stride = 1217bf215546Sopenharmony_ci x86_make_disp(p->machine_EDI, 1218bf215546Sopenharmony_ci get_offset(p, &p->buffer[variant->buffer_index].stride)); 1219bf215546Sopenharmony_ci struct x86_reg buf_base_ptr = 1220bf215546Sopenharmony_ci x86_make_disp(p->machine_EDI, 1221bf215546Sopenharmony_ci get_offset(p, &p->buffer[variant->buffer_index].base_ptr)); 1222bf215546Sopenharmony_ci struct x86_reg buf_max_index = 1223bf215546Sopenharmony_ci x86_make_disp(p->machine_EDI, 1224bf215546Sopenharmony_ci get_offset(p, &p->buffer[variant->buffer_index].max_index)); 1225bf215546Sopenharmony_ci 1226bf215546Sopenharmony_ci /* Calculate pointer to current attrib: 1227bf215546Sopenharmony_ci */ 1228bf215546Sopenharmony_ci switch (index_size) { 1229bf215546Sopenharmony_ci case 1: 1230bf215546Sopenharmony_ci x86_movzx8(p->func, ptr, elt); 1231bf215546Sopenharmony_ci break; 1232bf215546Sopenharmony_ci case 2: 1233bf215546Sopenharmony_ci x86_movzx16(p->func, ptr, elt); 1234bf215546Sopenharmony_ci break; 1235bf215546Sopenharmony_ci case 4: 1236bf215546Sopenharmony_ci x86_mov(p->func, ptr, elt); 1237bf215546Sopenharmony_ci break; 1238bf215546Sopenharmony_ci } 1239bf215546Sopenharmony_ci 1240bf215546Sopenharmony_ci /* Clamp to max_index 1241bf215546Sopenharmony_ci */ 1242bf215546Sopenharmony_ci x86_cmp(p->func, ptr, buf_max_index); 1243bf215546Sopenharmony_ci x86_cmovcc(p->func, ptr, buf_max_index, cc_AE); 1244bf215546Sopenharmony_ci 1245bf215546Sopenharmony_ci x86_mov(p->func, p->tmp2_EDX, buf_stride); 1246bf215546Sopenharmony_ci x64_rexw(p->func); 1247bf215546Sopenharmony_ci x86_imul(p->func, ptr, p->tmp2_EDX); 1248bf215546Sopenharmony_ci x64_rexw(p->func); 1249bf215546Sopenharmony_ci x86_add(p->func, ptr, buf_base_ptr); 1250bf215546Sopenharmony_ci return ptr; 1251bf215546Sopenharmony_ci } 1252bf215546Sopenharmony_ci} 1253bf215546Sopenharmony_ci 1254bf215546Sopenharmony_ci 1255bf215546Sopenharmony_cistatic boolean 1256bf215546Sopenharmony_ciincr_inputs(struct translate_sse *p, unsigned index_size) 1257bf215546Sopenharmony_ci{ 1258bf215546Sopenharmony_ci if (!index_size && p->nr_buffer_variants == 1) { 1259bf215546Sopenharmony_ci const unsigned buffer_index = p->buffer_variant[0].buffer_index; 1260bf215546Sopenharmony_ci struct x86_reg stride = 1261bf215546Sopenharmony_ci x86_make_disp(p->machine_EDI, 1262bf215546Sopenharmony_ci get_offset(p, &p->buffer[buffer_index].stride)); 1263bf215546Sopenharmony_ci 1264bf215546Sopenharmony_ci if (p->buffer_variant[0].instance_divisor == 0) { 1265bf215546Sopenharmony_ci x64_rexw(p->func); 1266bf215546Sopenharmony_ci x86_add(p->func, p->idx_ESI, stride); 1267bf215546Sopenharmony_ci sse_prefetchnta(p->func, x86_make_disp(p->idx_ESI, 192)); 1268bf215546Sopenharmony_ci } 1269bf215546Sopenharmony_ci } 1270bf215546Sopenharmony_ci else if (!index_size) { 1271bf215546Sopenharmony_ci unsigned i; 1272bf215546Sopenharmony_ci 1273bf215546Sopenharmony_ci /* Is this worthwhile?? 1274bf215546Sopenharmony_ci */ 1275bf215546Sopenharmony_ci for (i = 0; i < p->nr_buffer_variants; i++) { 1276bf215546Sopenharmony_ci struct translate_buffer_variant *variant = &p->buffer_variant[i]; 1277bf215546Sopenharmony_ci struct x86_reg buf_ptr = x86_make_disp(p->machine_EDI, 1278bf215546Sopenharmony_ci get_offset(p, &variant->ptr)); 1279bf215546Sopenharmony_ci struct x86_reg buf_stride = 1280bf215546Sopenharmony_ci x86_make_disp(p->machine_EDI, 1281bf215546Sopenharmony_ci get_offset(p, &p->buffer[variant->buffer_index].stride)); 1282bf215546Sopenharmony_ci 1283bf215546Sopenharmony_ci if (variant->instance_divisor == 0) { 1284bf215546Sopenharmony_ci x86_mov(p->func, p->tmp_EAX, buf_stride); 1285bf215546Sopenharmony_ci x64_rexw(p->func); 1286bf215546Sopenharmony_ci x86_add(p->func, p->tmp_EAX, buf_ptr); 1287bf215546Sopenharmony_ci if (i == 0) 1288bf215546Sopenharmony_ci sse_prefetchnta(p->func, x86_make_disp(p->tmp_EAX, 192)); 1289bf215546Sopenharmony_ci x64_rexw(p->func); 1290bf215546Sopenharmony_ci x86_mov(p->func, buf_ptr, p->tmp_EAX); 1291bf215546Sopenharmony_ci } 1292bf215546Sopenharmony_ci } 1293bf215546Sopenharmony_ci } 1294bf215546Sopenharmony_ci else { 1295bf215546Sopenharmony_ci x64_rexw(p->func); 1296bf215546Sopenharmony_ci x86_lea(p->func, p->idx_ESI, x86_make_disp(p->idx_ESI, index_size)); 1297bf215546Sopenharmony_ci } 1298bf215546Sopenharmony_ci 1299bf215546Sopenharmony_ci return TRUE; 1300bf215546Sopenharmony_ci} 1301bf215546Sopenharmony_ci 1302bf215546Sopenharmony_ci 1303bf215546Sopenharmony_ci/* Build run( struct translate *machine, 1304bf215546Sopenharmony_ci * unsigned start, 1305bf215546Sopenharmony_ci * unsigned count, 1306bf215546Sopenharmony_ci * void *output_buffer ) 1307bf215546Sopenharmony_ci * or 1308bf215546Sopenharmony_ci * run_elts( struct translate *machine, 1309bf215546Sopenharmony_ci * unsigned *elts, 1310bf215546Sopenharmony_ci * unsigned count, 1311bf215546Sopenharmony_ci * void *output_buffer ) 1312bf215546Sopenharmony_ci * 1313bf215546Sopenharmony_ci * Lots of hardcoding 1314bf215546Sopenharmony_ci * 1315bf215546Sopenharmony_ci * EAX -- pointer to current output vertex 1316bf215546Sopenharmony_ci * ECX -- pointer to current attribute 1317bf215546Sopenharmony_ci * 1318bf215546Sopenharmony_ci */ 1319bf215546Sopenharmony_cistatic boolean 1320bf215546Sopenharmony_cibuild_vertex_emit(struct translate_sse *p, 1321bf215546Sopenharmony_ci struct x86_function *func, unsigned index_size) 1322bf215546Sopenharmony_ci{ 1323bf215546Sopenharmony_ci int fixup, label; 1324bf215546Sopenharmony_ci unsigned j; 1325bf215546Sopenharmony_ci 1326bf215546Sopenharmony_ci memset(p->reg_to_const, 0xff, sizeof(p->reg_to_const)); 1327bf215546Sopenharmony_ci memset(p->const_to_reg, 0xff, sizeof(p->const_to_reg)); 1328bf215546Sopenharmony_ci 1329bf215546Sopenharmony_ci p->tmp_EAX = x86_make_reg(file_REG32, reg_AX); 1330bf215546Sopenharmony_ci p->idx_ESI = x86_make_reg(file_REG32, reg_SI); 1331bf215546Sopenharmony_ci p->outbuf_EBX = x86_make_reg(file_REG32, reg_BX); 1332bf215546Sopenharmony_ci p->machine_EDI = x86_make_reg(file_REG32, reg_DI); 1333bf215546Sopenharmony_ci p->count_EBP = x86_make_reg(file_REG32, reg_BP); 1334bf215546Sopenharmony_ci p->tmp2_EDX = x86_make_reg(file_REG32, reg_DX); 1335bf215546Sopenharmony_ci p->src_ECX = x86_make_reg(file_REG32, reg_CX); 1336bf215546Sopenharmony_ci 1337bf215546Sopenharmony_ci p->func = func; 1338bf215546Sopenharmony_ci 1339bf215546Sopenharmony_ci x86_init_func(p->func); 1340bf215546Sopenharmony_ci 1341bf215546Sopenharmony_ci if (x86_target(p->func) == X86_64_WIN64_ABI) { 1342bf215546Sopenharmony_ci /* the ABI guarantees a 16-byte aligned 32-byte "shadow space" 1343bf215546Sopenharmony_ci * above the return address 1344bf215546Sopenharmony_ci */ 1345bf215546Sopenharmony_ci sse2_movdqa(p->func, x86_make_disp(x86_make_reg(file_REG32, reg_SP), 8), 1346bf215546Sopenharmony_ci x86_make_reg(file_XMM, 6)); 1347bf215546Sopenharmony_ci sse2_movdqa(p->func, 1348bf215546Sopenharmony_ci x86_make_disp(x86_make_reg(file_REG32, reg_SP), 24), 1349bf215546Sopenharmony_ci x86_make_reg(file_XMM, 7)); 1350bf215546Sopenharmony_ci } 1351bf215546Sopenharmony_ci 1352bf215546Sopenharmony_ci x86_push(p->func, p->outbuf_EBX); 1353bf215546Sopenharmony_ci x86_push(p->func, p->count_EBP); 1354bf215546Sopenharmony_ci 1355bf215546Sopenharmony_ci /* on non-Win64 x86-64, these are already in the right registers */ 1356bf215546Sopenharmony_ci if (x86_target(p->func) != X86_64_STD_ABI) { 1357bf215546Sopenharmony_ci x86_push(p->func, p->machine_EDI); 1358bf215546Sopenharmony_ci x86_push(p->func, p->idx_ESI); 1359bf215546Sopenharmony_ci 1360bf215546Sopenharmony_ci if (x86_target(p->func) != X86_32) { 1361bf215546Sopenharmony_ci x64_mov64(p->func, p->machine_EDI, x86_fn_arg(p->func, 1)); 1362bf215546Sopenharmony_ci x64_mov64(p->func, p->idx_ESI, x86_fn_arg(p->func, 2)); 1363bf215546Sopenharmony_ci } 1364bf215546Sopenharmony_ci else { 1365bf215546Sopenharmony_ci x86_mov(p->func, p->machine_EDI, x86_fn_arg(p->func, 1)); 1366bf215546Sopenharmony_ci x86_mov(p->func, p->idx_ESI, x86_fn_arg(p->func, 2)); 1367bf215546Sopenharmony_ci } 1368bf215546Sopenharmony_ci } 1369bf215546Sopenharmony_ci 1370bf215546Sopenharmony_ci x86_mov(p->func, p->count_EBP, x86_fn_arg(p->func, 3)); 1371bf215546Sopenharmony_ci 1372bf215546Sopenharmony_ci if (x86_target(p->func) != X86_32) 1373bf215546Sopenharmony_ci x64_mov64(p->func, p->outbuf_EBX, x86_fn_arg(p->func, 6)); 1374bf215546Sopenharmony_ci else 1375bf215546Sopenharmony_ci x86_mov(p->func, p->outbuf_EBX, x86_fn_arg(p->func, 6)); 1376bf215546Sopenharmony_ci 1377bf215546Sopenharmony_ci /* Load instance ID. 1378bf215546Sopenharmony_ci */ 1379bf215546Sopenharmony_ci if (p->use_instancing) { 1380bf215546Sopenharmony_ci x86_mov(p->func, p->tmp2_EDX, x86_fn_arg(p->func, 4)); 1381bf215546Sopenharmony_ci x86_mov(p->func, 1382bf215546Sopenharmony_ci x86_make_disp(p->machine_EDI, 1383bf215546Sopenharmony_ci get_offset(p, &p->start_instance)), p->tmp2_EDX); 1384bf215546Sopenharmony_ci 1385bf215546Sopenharmony_ci x86_mov(p->func, p->tmp_EAX, x86_fn_arg(p->func, 5)); 1386bf215546Sopenharmony_ci x86_mov(p->func, 1387bf215546Sopenharmony_ci x86_make_disp(p->machine_EDI, get_offset(p, &p->instance_id)), 1388bf215546Sopenharmony_ci p->tmp_EAX); 1389bf215546Sopenharmony_ci } 1390bf215546Sopenharmony_ci 1391bf215546Sopenharmony_ci /* Get vertex count, compare to zero 1392bf215546Sopenharmony_ci */ 1393bf215546Sopenharmony_ci x86_xor(p->func, p->tmp_EAX, p->tmp_EAX); 1394bf215546Sopenharmony_ci x86_cmp(p->func, p->count_EBP, p->tmp_EAX); 1395bf215546Sopenharmony_ci fixup = x86_jcc_forward(p->func, cc_E); 1396bf215546Sopenharmony_ci 1397bf215546Sopenharmony_ci /* always load, needed or not: 1398bf215546Sopenharmony_ci */ 1399bf215546Sopenharmony_ci init_inputs(p, index_size); 1400bf215546Sopenharmony_ci 1401bf215546Sopenharmony_ci /* Note address for loop jump 1402bf215546Sopenharmony_ci */ 1403bf215546Sopenharmony_ci label = x86_get_label(p->func); 1404bf215546Sopenharmony_ci { 1405bf215546Sopenharmony_ci struct x86_reg elt = !index_size ? p->idx_ESI : x86_deref(p->idx_ESI); 1406bf215546Sopenharmony_ci int last_variant = -1; 1407bf215546Sopenharmony_ci struct x86_reg vb; 1408bf215546Sopenharmony_ci 1409bf215546Sopenharmony_ci for (j = 0; j < p->translate.key.nr_elements; j++) { 1410bf215546Sopenharmony_ci const struct translate_element *a = &p->translate.key.element[j]; 1411bf215546Sopenharmony_ci unsigned variant = p->element_to_buffer_variant[j]; 1412bf215546Sopenharmony_ci 1413bf215546Sopenharmony_ci /* Figure out source pointer address: 1414bf215546Sopenharmony_ci */ 1415bf215546Sopenharmony_ci if (variant != last_variant) { 1416bf215546Sopenharmony_ci last_variant = variant; 1417bf215546Sopenharmony_ci vb = get_buffer_ptr(p, index_size, variant, elt); 1418bf215546Sopenharmony_ci } 1419bf215546Sopenharmony_ci 1420bf215546Sopenharmony_ci if (!translate_attr(p, a, 1421bf215546Sopenharmony_ci x86_make_disp(vb, a->input_offset), 1422bf215546Sopenharmony_ci x86_make_disp(p->outbuf_EBX, a->output_offset))) 1423bf215546Sopenharmony_ci return FALSE; 1424bf215546Sopenharmony_ci } 1425bf215546Sopenharmony_ci 1426bf215546Sopenharmony_ci /* Next output vertex: 1427bf215546Sopenharmony_ci */ 1428bf215546Sopenharmony_ci x64_rexw(p->func); 1429bf215546Sopenharmony_ci x86_lea(p->func, p->outbuf_EBX, 1430bf215546Sopenharmony_ci x86_make_disp(p->outbuf_EBX, p->translate.key.output_stride)); 1431bf215546Sopenharmony_ci 1432bf215546Sopenharmony_ci /* Incr index 1433bf215546Sopenharmony_ci */ 1434bf215546Sopenharmony_ci incr_inputs(p, index_size); 1435bf215546Sopenharmony_ci } 1436bf215546Sopenharmony_ci 1437bf215546Sopenharmony_ci /* decr count, loop if not zero 1438bf215546Sopenharmony_ci */ 1439bf215546Sopenharmony_ci x86_dec(p->func, p->count_EBP); 1440bf215546Sopenharmony_ci x86_jcc(p->func, cc_NZ, label); 1441bf215546Sopenharmony_ci 1442bf215546Sopenharmony_ci /* Exit mmx state? 1443bf215546Sopenharmony_ci */ 1444bf215546Sopenharmony_ci if (p->func->need_emms) 1445bf215546Sopenharmony_ci mmx_emms(p->func); 1446bf215546Sopenharmony_ci 1447bf215546Sopenharmony_ci /* Land forward jump here: 1448bf215546Sopenharmony_ci */ 1449bf215546Sopenharmony_ci x86_fixup_fwd_jump(p->func, fixup); 1450bf215546Sopenharmony_ci 1451bf215546Sopenharmony_ci /* Pop regs and return 1452bf215546Sopenharmony_ci */ 1453bf215546Sopenharmony_ci if (x86_target(p->func) != X86_64_STD_ABI) { 1454bf215546Sopenharmony_ci x86_pop(p->func, p->idx_ESI); 1455bf215546Sopenharmony_ci x86_pop(p->func, p->machine_EDI); 1456bf215546Sopenharmony_ci } 1457bf215546Sopenharmony_ci 1458bf215546Sopenharmony_ci x86_pop(p->func, p->count_EBP); 1459bf215546Sopenharmony_ci x86_pop(p->func, p->outbuf_EBX); 1460bf215546Sopenharmony_ci 1461bf215546Sopenharmony_ci if (x86_target(p->func) == X86_64_WIN64_ABI) { 1462bf215546Sopenharmony_ci sse2_movdqa(p->func, x86_make_reg(file_XMM, 6), 1463bf215546Sopenharmony_ci x86_make_disp(x86_make_reg(file_REG32, reg_SP), 8)); 1464bf215546Sopenharmony_ci sse2_movdqa(p->func, x86_make_reg(file_XMM, 7), 1465bf215546Sopenharmony_ci x86_make_disp(x86_make_reg(file_REG32, reg_SP), 24)); 1466bf215546Sopenharmony_ci } 1467bf215546Sopenharmony_ci x86_ret(p->func); 1468bf215546Sopenharmony_ci 1469bf215546Sopenharmony_ci return TRUE; 1470bf215546Sopenharmony_ci} 1471bf215546Sopenharmony_ci 1472bf215546Sopenharmony_ci 1473bf215546Sopenharmony_cistatic void 1474bf215546Sopenharmony_citranslate_sse_set_buffer(struct translate *translate, 1475bf215546Sopenharmony_ci unsigned buf, 1476bf215546Sopenharmony_ci const void *ptr, unsigned stride, unsigned max_index) 1477bf215546Sopenharmony_ci{ 1478bf215546Sopenharmony_ci struct translate_sse *p = (struct translate_sse *) translate; 1479bf215546Sopenharmony_ci 1480bf215546Sopenharmony_ci if (buf < p->nr_buffers) { 1481bf215546Sopenharmony_ci p->buffer[buf].base_ptr = (char *) ptr; 1482bf215546Sopenharmony_ci p->buffer[buf].stride = stride; 1483bf215546Sopenharmony_ci p->buffer[buf].max_index = max_index; 1484bf215546Sopenharmony_ci } 1485bf215546Sopenharmony_ci 1486bf215546Sopenharmony_ci if (0) 1487bf215546Sopenharmony_ci debug_printf("%s %d/%d: %p %d\n", 1488bf215546Sopenharmony_ci __FUNCTION__, buf, p->nr_buffers, ptr, stride); 1489bf215546Sopenharmony_ci} 1490bf215546Sopenharmony_ci 1491bf215546Sopenharmony_ci 1492bf215546Sopenharmony_cistatic void 1493bf215546Sopenharmony_citranslate_sse_release(struct translate *translate) 1494bf215546Sopenharmony_ci{ 1495bf215546Sopenharmony_ci struct translate_sse *p = (struct translate_sse *) translate; 1496bf215546Sopenharmony_ci 1497bf215546Sopenharmony_ci x86_release_func(&p->elt8_func); 1498bf215546Sopenharmony_ci x86_release_func(&p->elt16_func); 1499bf215546Sopenharmony_ci x86_release_func(&p->elt_func); 1500bf215546Sopenharmony_ci x86_release_func(&p->linear_func); 1501bf215546Sopenharmony_ci 1502bf215546Sopenharmony_ci os_free_aligned(p); 1503bf215546Sopenharmony_ci} 1504bf215546Sopenharmony_ci 1505bf215546Sopenharmony_ci 1506bf215546Sopenharmony_cistruct translate * 1507bf215546Sopenharmony_citranslate_sse2_create(const struct translate_key *key) 1508bf215546Sopenharmony_ci{ 1509bf215546Sopenharmony_ci struct translate_sse *p = NULL; 1510bf215546Sopenharmony_ci unsigned i; 1511bf215546Sopenharmony_ci 1512bf215546Sopenharmony_ci /* this is misnamed, it actually refers to whether rtasm is enabled or not */ 1513bf215546Sopenharmony_ci if (!rtasm_cpu_has_sse()) 1514bf215546Sopenharmony_ci goto fail; 1515bf215546Sopenharmony_ci 1516bf215546Sopenharmony_ci p = os_malloc_aligned(sizeof(struct translate_sse), 16); 1517bf215546Sopenharmony_ci if (!p) 1518bf215546Sopenharmony_ci goto fail; 1519bf215546Sopenharmony_ci 1520bf215546Sopenharmony_ci memset(p, 0, sizeof(*p)); 1521bf215546Sopenharmony_ci memcpy(p->consts, consts, sizeof(consts)); 1522bf215546Sopenharmony_ci memcpy(p->uconsts, uconsts, sizeof(uconsts)); 1523bf215546Sopenharmony_ci 1524bf215546Sopenharmony_ci p->translate.key = *key; 1525bf215546Sopenharmony_ci p->translate.release = translate_sse_release; 1526bf215546Sopenharmony_ci p->translate.set_buffer = translate_sse_set_buffer; 1527bf215546Sopenharmony_ci 1528bf215546Sopenharmony_ci assert(key->nr_elements <= TRANSLATE_MAX_ATTRIBS); 1529bf215546Sopenharmony_ci 1530bf215546Sopenharmony_ci for (i = 0; i < key->nr_elements; i++) { 1531bf215546Sopenharmony_ci if (key->element[i].type == TRANSLATE_ELEMENT_NORMAL) { 1532bf215546Sopenharmony_ci unsigned j; 1533bf215546Sopenharmony_ci 1534bf215546Sopenharmony_ci p->nr_buffers = 1535bf215546Sopenharmony_ci MAX2(p->nr_buffers, key->element[i].input_buffer + 1); 1536bf215546Sopenharmony_ci 1537bf215546Sopenharmony_ci if (key->element[i].instance_divisor) { 1538bf215546Sopenharmony_ci p->use_instancing = TRUE; 1539bf215546Sopenharmony_ci } 1540bf215546Sopenharmony_ci 1541bf215546Sopenharmony_ci /* 1542bf215546Sopenharmony_ci * Map vertex element to vertex buffer variant. 1543bf215546Sopenharmony_ci */ 1544bf215546Sopenharmony_ci for (j = 0; j < p->nr_buffer_variants; j++) { 1545bf215546Sopenharmony_ci if (p->buffer_variant[j].buffer_index == 1546bf215546Sopenharmony_ci key->element[i].input_buffer 1547bf215546Sopenharmony_ci && p->buffer_variant[j].instance_divisor == 1548bf215546Sopenharmony_ci key->element[i].instance_divisor) { 1549bf215546Sopenharmony_ci break; 1550bf215546Sopenharmony_ci } 1551bf215546Sopenharmony_ci } 1552bf215546Sopenharmony_ci if (j == p->nr_buffer_variants) { 1553bf215546Sopenharmony_ci p->buffer_variant[j].buffer_index = key->element[i].input_buffer; 1554bf215546Sopenharmony_ci p->buffer_variant[j].instance_divisor = 1555bf215546Sopenharmony_ci key->element[i].instance_divisor; 1556bf215546Sopenharmony_ci p->nr_buffer_variants++; 1557bf215546Sopenharmony_ci } 1558bf215546Sopenharmony_ci p->element_to_buffer_variant[i] = j; 1559bf215546Sopenharmony_ci } 1560bf215546Sopenharmony_ci else { 1561bf215546Sopenharmony_ci assert(key->element[i].type == TRANSLATE_ELEMENT_INSTANCE_ID); 1562bf215546Sopenharmony_ci 1563bf215546Sopenharmony_ci p->element_to_buffer_variant[i] = ELEMENT_BUFFER_INSTANCE_ID; 1564bf215546Sopenharmony_ci } 1565bf215546Sopenharmony_ci } 1566bf215546Sopenharmony_ci 1567bf215546Sopenharmony_ci if (0) 1568bf215546Sopenharmony_ci debug_printf("nr_buffers: %d\n", p->nr_buffers); 1569bf215546Sopenharmony_ci 1570bf215546Sopenharmony_ci if (!build_vertex_emit(p, &p->linear_func, 0)) 1571bf215546Sopenharmony_ci goto fail; 1572bf215546Sopenharmony_ci 1573bf215546Sopenharmony_ci if (!build_vertex_emit(p, &p->elt_func, 4)) 1574bf215546Sopenharmony_ci goto fail; 1575bf215546Sopenharmony_ci 1576bf215546Sopenharmony_ci if (!build_vertex_emit(p, &p->elt16_func, 2)) 1577bf215546Sopenharmony_ci goto fail; 1578bf215546Sopenharmony_ci 1579bf215546Sopenharmony_ci if (!build_vertex_emit(p, &p->elt8_func, 1)) 1580bf215546Sopenharmony_ci goto fail; 1581bf215546Sopenharmony_ci 1582bf215546Sopenharmony_ci p->translate.run = (run_func) x86_get_func(&p->linear_func); 1583bf215546Sopenharmony_ci if (p->translate.run == NULL) 1584bf215546Sopenharmony_ci goto fail; 1585bf215546Sopenharmony_ci 1586bf215546Sopenharmony_ci p->translate.run_elts = (run_elts_func) x86_get_func(&p->elt_func); 1587bf215546Sopenharmony_ci if (p->translate.run_elts == NULL) 1588bf215546Sopenharmony_ci goto fail; 1589bf215546Sopenharmony_ci 1590bf215546Sopenharmony_ci p->translate.run_elts16 = (run_elts16_func) x86_get_func(&p->elt16_func); 1591bf215546Sopenharmony_ci if (p->translate.run_elts16 == NULL) 1592bf215546Sopenharmony_ci goto fail; 1593bf215546Sopenharmony_ci 1594bf215546Sopenharmony_ci p->translate.run_elts8 = (run_elts8_func) x86_get_func(&p->elt8_func); 1595bf215546Sopenharmony_ci if (p->translate.run_elts8 == NULL) 1596bf215546Sopenharmony_ci goto fail; 1597bf215546Sopenharmony_ci 1598bf215546Sopenharmony_ci return &p->translate; 1599bf215546Sopenharmony_ci 1600bf215546Sopenharmony_ci fail: 1601bf215546Sopenharmony_ci if (p) 1602bf215546Sopenharmony_ci translate_sse_release(&p->translate); 1603bf215546Sopenharmony_ci 1604bf215546Sopenharmony_ci return NULL; 1605bf215546Sopenharmony_ci} 1606bf215546Sopenharmony_ci 1607bf215546Sopenharmony_ci 1608bf215546Sopenharmony_ci#else 1609bf215546Sopenharmony_ci 1610bf215546Sopenharmony_cistruct translate * 1611bf215546Sopenharmony_citranslate_sse2_create(const struct translate_key *key) 1612bf215546Sopenharmony_ci{ 1613bf215546Sopenharmony_ci return NULL; 1614bf215546Sopenharmony_ci} 1615bf215546Sopenharmony_ci 1616bf215546Sopenharmony_ci#endif 1617