/*
 * Copyright 2010 Jerome Glisse <glisse@freedesktop.org>
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * on the rights to use, copy, modify, merge, publish, distribute, sub
 * license, and/or sell copies of the Software, and to permit persons to whom
 * the Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
 * USE OR OTHER DEALINGS IN THE SOFTWARE.
 */
#include "r600_sq.h"
#include "r600_formats.h"
#include "r600_opcodes.h"
#include "r600_shader.h"
#include "r600_dump.h"
#include "r600d.h"
#include "sfn/sfn_nir.h"

#include "sb/sb_public.h"

#include "pipe/p_shader_tokens.h"
#include "tgsi/tgsi_info.h"
#include "tgsi/tgsi_parse.h"
#include "tgsi/tgsi_scan.h"
#include "tgsi/tgsi_dump.h"
#include "tgsi/tgsi_from_mesa.h"
#include "nir/tgsi_to_nir.h"
#include "nir/nir_to_tgsi_info.h"
#include "compiler/nir/nir.h"
#include "util/u_bitcast.h"
#include "util/u_memory.h"
#include "util/u_math.h"
#include <stdio.h>
#include <errno.h>

/* CAYMAN notes
Why CAYMAN got loops for lots of instructions is explained here.

-These 8xx t-slot only ops are implemented in all vector slots.
MUL_LIT, FLT_TO_UINT, INT_TO_FLT, UINT_TO_FLT
These 8xx t-slot only opcodes become vector ops, with all four
slots expecting the arguments on sources a and b. Result is
broadcast to all channels.
MULLO_INT, MULHI_INT, MULLO_UINT, MULHI_UINT, MUL_64
These 8xx t-slot only opcodes become vector ops in the z, y, and
x slots.
EXP_IEEE, LOG_IEEE/CLAMPED, RECIP_IEEE/CLAMPED/FF/INT/UINT/_64/CLAMPED_64
RECIPSQRT_IEEE/CLAMPED/FF/_64/CLAMPED_64
SQRT_IEEE/_64
SIN/COS
The w slot may have an independent co-issued operation, or if the
result is required to be in the w slot, the opcode above may be
issued in the w slot as well.
The compiler must issue the source argument to slots z, y, and x.
*/
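
/* A concrete case: on Cayman a MULLO_INT is emitted as a full four-slot
 * group in which every slot receives the same two sources and only the
 * slot whose channel matches the destination sets dst.write; see the
 * CAYMAN path in single_alu_op2() below.
 */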
/* Contents of r0 on entry to various shaders

 VS - .x = VertexID
      .y = RelVertexID (??)
      .w = InstanceID

 GS - r0.xyw, r1.xyz = per-vertex offsets
      r0.z = PrimitiveID

 TCS - .x = PatchID
       .y = RelPatchID (??)
       .z = InvocationID
       .w = tess factor base.

 TES - .x = TessCoord.x
     - .y = TessCoord.y
     - .z = RelPatchID (??)
     - .w = PrimitiveID

 PS - face_gpr.z = SampleMask
      face_gpr.w = SampleID
*/
#define R600_SHADER_BUFFER_INFO_SEL (512 + R600_BUFFER_INFO_OFFSET / 16)
static int r600_shader_from_tgsi(struct r600_context *rctx,
				 struct r600_pipe_shader *pipeshader,
				 union r600_shader_key key);

static void r600_add_gpr_array(struct r600_shader *ps, int start_gpr,
			       int size, unsigned comp_mask) {

	if (!size)
		return;

	if (ps->num_arrays == ps->max_arrays) {
		ps->max_arrays += 64;
		ps->arrays = realloc(ps->arrays, ps->max_arrays *
				     sizeof(struct r600_shader_array));
	}

	int n = ps->num_arrays;
	++ps->num_arrays;

	ps->arrays[n].comp_mask = comp_mask;
	ps->arrays[n].gpr_start = start_gpr;
	ps->arrays[n].gpr_count = size;
}

static void r600_dump_streamout(struct pipe_stream_output_info *so)
{
	unsigned i;

	fprintf(stderr, "STREAMOUT\n");
	for (i = 0; i < so->num_outputs; i++) {
		unsigned mask = ((1 << so->output[i].num_components) - 1) <<
				so->output[i].start_component;
		fprintf(stderr, " %i: MEM_STREAM%d_BUF%i[%i..%i] <- OUT[%i].%s%s%s%s%s\n",
			i,
			so->output[i].stream,
			so->output[i].output_buffer,
			so->output[i].dst_offset, so->output[i].dst_offset + so->output[i].num_components - 1,
			so->output[i].register_index,
			mask & 1 ? "x" : "",
			mask & 2 ? "y" : "",
			mask & 4 ? "z" : "",
			mask & 8 ? "w" : "",
			so->output[i].dst_offset < so->output[i].start_component ? " (will lower)" : "");
	}
}

static int store_shader(struct pipe_context *ctx,
			struct r600_pipe_shader *shader)
{
	struct r600_context *rctx = (struct r600_context *)ctx;
	uint32_t *ptr, i;

	if (shader->bo == NULL) {
		shader->bo = (struct r600_resource*)
			pipe_buffer_create(ctx->screen, 0, PIPE_USAGE_IMMUTABLE, shader->shader.bc.ndw * 4);
		if (shader->bo == NULL) {
			return -ENOMEM;
		}
		ptr = r600_buffer_map_sync_with_rings(
			&rctx->b, shader->bo,
			PIPE_MAP_WRITE | RADEON_MAP_TEMPORARY);
		if (R600_BIG_ENDIAN) {
			for (i = 0; i < shader->shader.bc.ndw; ++i) {
				ptr[i] = util_cpu_to_le32(shader->shader.bc.bytecode[i]);
			}
		} else {
			memcpy(ptr, shader->shader.bc.bytecode, shader->shader.bc.ndw * sizeof(*ptr));
		}
		rctx->b.ws->buffer_unmap(rctx->b.ws, shader->bo->buf);
	}

	return 0;
}
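
/* r600_pipe_shader_create() is the main entry point below: it translates
 * the selector's TGSI or NIR (depending on DBG_NIR_PREFERRED), optionally
 * runs the SB optimizer over the bytecode, uploads the result via
 * store_shader() and fills in the per-stage hardware state.
 */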
extern const struct nir_shader_compiler_options r600_nir_options;
static int nshader = 0;
int r600_pipe_shader_create(struct pipe_context *ctx,
			    struct r600_pipe_shader *shader,
			    union r600_shader_key key)
{
	struct r600_context *rctx = (struct r600_context *)ctx;
	struct r600_pipe_shader_selector *sel = shader->selector;
	int r;
	struct r600_screen *rscreen = (struct r600_screen *)ctx->screen;

	int processor = sel->ir_type == PIPE_SHADER_IR_TGSI ?
		tgsi_get_processor_type(sel->tokens):
		pipe_shader_type_from_mesa(sel->nir->info.stage);

	bool dump = r600_can_dump_shader(&rctx->screen->b, processor);
	unsigned use_sb = !(rctx->screen->b.debug_flags & DBG_NO_SB) ||
			  (rctx->screen->b.debug_flags & DBG_NIR_SB);
	unsigned sb_disasm;
	unsigned export_shader;

	shader->shader.bc.isa = rctx->isa;

	if (!(rscreen->b.debug_flags & DBG_NIR_PREFERRED)) {
		assert(sel->ir_type == PIPE_SHADER_IR_TGSI);
		r = r600_shader_from_tgsi(rctx, shader, key);
		if (r) {
			R600_ERR("translation from TGSI failed !\n");
			goto error;
		}
	} else {
		if (sel->ir_type == PIPE_SHADER_IR_TGSI) {
			if (sel->nir)
				ralloc_free(sel->nir);
			sel->nir = tgsi_to_nir(sel->tokens, ctx->screen, true);
			const nir_shader_compiler_options *nir_options =
				(const nir_shader_compiler_options *)
				ctx->screen->get_compiler_options(ctx->screen,
								  PIPE_SHADER_IR_NIR,
								  shader->shader.processor_type);
			/* Lower int64 ops because we have some r600 built-in shaders that use it */
			if (nir_options->lower_int64_options) {
				NIR_PASS_V(sel->nir, nir_lower_regs_to_ssa);
				NIR_PASS_V(sel->nir, nir_lower_alu_to_scalar, NULL, NULL);
				NIR_PASS_V(sel->nir, nir_lower_int64);
				NIR_PASS_V(sel->nir, nir_opt_vectorize, NULL, NULL);
			}
			NIR_PASS_V(sel->nir, nir_lower_flrp, ~0, false);
		}
		nir_tgsi_scan_shader(sel->nir, &sel->info, true);

		r = r600_shader_from_nir(rctx, shader, &key);
		if (r) {
			fprintf(stderr, "--Failed shader--------------------------------------------------\n");

			if (sel->ir_type == PIPE_SHADER_IR_TGSI) {
				fprintf(stderr, "--TGSI--------------------------------------------------------\n");
				tgsi_dump(sel->tokens, 0);
			}

			if (rscreen->b.debug_flags & (DBG_NIR_PREFERRED)) {
				fprintf(stderr, "--NIR --------------------------------------------------------\n");
				nir_print_shader(sel->nir, stderr);
			}

			R600_ERR("translation from NIR failed !\n");
			goto error;
		}
	}

	if (dump) {
		if (sel->ir_type == PIPE_SHADER_IR_TGSI) {
			fprintf(stderr, "--TGSI--------------------------------------------------------\n");
			tgsi_dump(sel->tokens, 0);
		}

		if (sel->so.num_outputs) {
			r600_dump_streamout(&sel->so);
		}
	}

	if (shader->shader.processor_type == PIPE_SHADER_VERTEX) {
		/* only disable for vertex shaders in tess paths */
		if (key.vs.as_ls)
			use_sb = 0;
	}
	use_sb &= (shader->shader.processor_type != PIPE_SHADER_TESS_CTRL);
	use_sb &= (shader->shader.processor_type != PIPE_SHADER_TESS_EVAL);
	use_sb &= (shader->shader.processor_type != PIPE_SHADER_COMPUTE);

	/* disable SB for shaders using doubles */
	use_sb &= !shader->shader.uses_doubles;

	use_sb &= !shader->shader.uses_atomics;
	use_sb &= !shader->shader.uses_images;
	use_sb &= !shader->shader.uses_helper_invocation;

	/* SB can't handle READ_SCRATCH properly */
	use_sb &= !(shader->shader.needs_scratch_space && rscreen->b.gfx_level < R700);

	/* sb has bugs in array reg allocation
	 * (dEQP-GLES2.functional.shaders.struct.local.struct_array_dynamic_index_fragment
	 * with NTT)
	 */
	use_sb &= !(shader->shader.indirect_files & (1 << TGSI_FILE_TEMPORARY));
	use_sb &= !(shader->shader.indirect_files & (1 << TGSI_FILE_CONSTANT));

	/* sb has scheduling assertion fails with interpolate_at. */
	use_sb &= !shader->shader.uses_interpolate_at_sample;

	/* Check if the bytecode has already been built. */
	if (!shader->shader.bc.bytecode) {
		r = r600_bytecode_build(&shader->shader.bc);
		if (r) {
			R600_ERR("building bytecode failed !\n");
			goto error;
		}
	}

	sb_disasm = use_sb || (rctx->screen->b.debug_flags & DBG_SB_DISASM);
	if (dump && !sb_disasm) {
		fprintf(stderr, "--------------------------------------------------------------\n");
		r600_bytecode_disasm(&shader->shader.bc);
		fprintf(stderr, "______________________________________________________________\n");
	} else if ((dump && sb_disasm) || use_sb) {
		r = r600_sb_bytecode_process(rctx, &shader->shader.bc, &shader->shader,
					     dump, use_sb);
		if (r) {
			R600_ERR("r600_sb_bytecode_process failed !\n");
			goto error;
		}
	}

	if (dump) {
		print_shader_info(stderr, nshader++, &shader->shader);
		print_pipe_info(stderr, &sel->info);
	}

	if (shader->gs_copy_shader) {
		if (dump) {
			// dump copy shader
			r = r600_sb_bytecode_process(rctx, &shader->gs_copy_shader->shader.bc,
						     &shader->gs_copy_shader->shader, dump, 0);
			if (r)
				goto error;
		}

		if ((r = store_shader(ctx, shader->gs_copy_shader)))
			goto error;
	}

	/* Store the shader in a buffer. */
	if ((r = store_shader(ctx, shader)))
		goto error;

	/* Build state. */
	switch (shader->shader.processor_type) {
	case PIPE_SHADER_TESS_CTRL:
		evergreen_update_hs_state(ctx, shader);
		break;
	case PIPE_SHADER_TESS_EVAL:
		if (key.tes.as_es)
			evergreen_update_es_state(ctx, shader);
		else
			evergreen_update_vs_state(ctx, shader);
		break;
	case PIPE_SHADER_GEOMETRY:
		if (rctx->b.gfx_level >= EVERGREEN) {
			evergreen_update_gs_state(ctx, shader);
			evergreen_update_vs_state(ctx, shader->gs_copy_shader);
		} else {
			r600_update_gs_state(ctx, shader);
			r600_update_vs_state(ctx, shader->gs_copy_shader);
		}
		break;
	case PIPE_SHADER_VERTEX:
		export_shader = key.vs.as_es;
		if (rctx->b.gfx_level >= EVERGREEN) {
			if (key.vs.as_ls)
				evergreen_update_ls_state(ctx, shader);
			else if (key.vs.as_es)
				evergreen_update_es_state(ctx, shader);
			else
				evergreen_update_vs_state(ctx, shader);
		} else {
			if (export_shader)
				r600_update_es_state(ctx, shader);
			else
				r600_update_vs_state(ctx, shader);
		}
		break;
	case PIPE_SHADER_FRAGMENT:
		if (rctx->b.gfx_level >= EVERGREEN) {
			evergreen_update_ps_state(ctx, shader);
		} else {
			r600_update_ps_state(ctx, shader);
		}
		break;
	case PIPE_SHADER_COMPUTE:
		evergreen_update_ls_state(ctx, shader);
		break;
	default:
		r = -EINVAL;
		goto error;
	}

	util_debug_message(&rctx->b.debug, SHADER_INFO, "%s shader: %d dw, %d gprs, %d alu_groups, %d loops, %d cf, %d stack",
			   _mesa_shader_stage_to_abbrev(tgsi_processor_to_shader_stage(processor)),
			   shader->shader.bc.ndw,
			   shader->shader.bc.ngpr,
			   shader->shader.bc.nalu_groups,
			   shader->shader.num_loops,
			   shader->shader.bc.ncf,
			   shader->shader.bc.nstack);

	return 0;

error:
	r600_pipe_shader_destroy(ctx, shader);
	return r;
}
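
/* Also reached from the error path above; every teardown step below
 * tolerates a partially built shader (NULL buffer reference, unlinked
 * bytecode list).
 */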
void r600_pipe_shader_destroy(struct pipe_context *ctx UNUSED, struct r600_pipe_shader *shader)
{
	r600_resource_reference(&shader->bo, NULL);
	if (list_is_linked(&shader->shader.bc.cf))
		r600_bytecode_clear(&shader->shader.bc);
	r600_release_command_buffer(&shader->command_buffer);
}

/*
 * tgsi -> r600 shader
 */
struct r600_shader_tgsi_instruction;

struct r600_shader_src {
	unsigned sel;
	unsigned swizzle[4];
	unsigned neg;
	unsigned abs;
	unsigned rel;
	unsigned kc_bank;
	boolean kc_rel; /* true if cache bank is indexed */
	uint32_t value[4];
};

struct eg_interp {
	boolean enabled;
	unsigned ij_index;
};

struct r600_shader_ctx {
	struct tgsi_shader_info info;
	struct tgsi_array_info *array_infos;
	/* flag for each tgsi temp array if it's been spilled or not */
	bool *spilled_arrays;
	struct tgsi_parse_context parse;
	const struct tgsi_token *tokens;
	unsigned type;
	unsigned file_offset[TGSI_FILE_COUNT];
	unsigned temp_reg;
	const struct r600_shader_tgsi_instruction *inst_info;
	struct r600_bytecode *bc;
	struct r600_shader *shader;
	struct r600_shader_src src[4];
	uint32_t *literals;
	uint32_t nliterals;
	uint32_t max_driver_temp_used;
	/* needed for evergreen interpolation */
	struct eg_interp eg_interpolators[6]; // indexed by Persp/Linear * 3 + sample/center/centroid
	/* evergreen/cayman also store sample mask in face register */
	int face_gpr;
	/* sample id is .w component stored in fixed point position register */
	int fixed_pt_position_gpr;
	int colors_used;
	boolean clip_vertex_write;
	unsigned cv_output;
	unsigned edgeflag_output;
	int helper_invoc_reg;
	int cs_block_size_reg;
	int cs_grid_size_reg;
	bool cs_block_size_loaded, cs_grid_size_loaded;
	int fragcoord_input;
	int next_ring_offset;
	int gs_out_ring_offset;
	int gs_next_vertex;
	struct r600_shader *gs_for_vs;
	int gs_export_gpr_tregs[4];
	int gs_rotated_input[2];
	const struct pipe_stream_output_info *gs_stream_output_info;
	unsigned enabled_stream_buffers_mask;
	unsigned tess_input_info; /* temp with tess input offsets */
	unsigned tess_output_info; /* temp with tess output offsets */
	unsigned thread_id_gpr; /* temp with thread id calculated for images */
};

struct r600_shader_tgsi_instruction {
	unsigned op;
	int (*process)(struct r600_shader_ctx *ctx);
};

static int emit_gs_ring_writes(struct r600_shader_ctx *ctx, const struct pipe_stream_output_info *so, int stream, bool ind);
static const struct r600_shader_tgsi_instruction r600_shader_tgsi_instruction[], eg_shader_tgsi_instruction[], cm_shader_tgsi_instruction[];
static int tgsi_helper_tempx_replicate(struct r600_shader_ctx *ctx);
static inline int callstack_push(struct r600_shader_ctx *ctx, unsigned reason);
static void fc_pushlevel(struct r600_shader_ctx *ctx, int type);
static int tgsi_else(struct r600_shader_ctx *ctx);
static int tgsi_endif(struct r600_shader_ctx *ctx);
static int tgsi_bgnloop(struct r600_shader_ctx *ctx);
static int tgsi_endloop(struct r600_shader_ctx *ctx);
static int tgsi_loop_brk_cont(struct r600_shader_ctx *ctx);
static int tgsi_fetch_rel_const(struct r600_shader_ctx *ctx,
				unsigned int cb_idx, unsigned cb_rel, unsigned int offset, unsigned ar_chan,
				unsigned int dst_reg);
static void r600_bytecode_src(struct r600_bytecode_alu_src *bc_src,
			      const struct r600_shader_src *shader_src,
			      unsigned chan);
static int do_lds_fetch_values(struct r600_shader_ctx *ctx, unsigned temp_reg,
			       unsigned dst_reg, unsigned mask);
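
/* All Evergreen parts except the big Cypress/Juniper/Hemlock chips seem to
 * need an extra wavefront stack entry reserved; the callstack bookkeeping
 * queries this predicate when computing the required stack depth. */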
static bool ctx_needs_stack_workaround_8xx(struct r600_shader_ctx *ctx)
{
	if (ctx->bc->family == CHIP_HEMLOCK ||
	    ctx->bc->family == CHIP_CYPRESS ||
	    ctx->bc->family == CHIP_JUNIPER)
		return false;
	return true;
}

static int tgsi_last_instruction(unsigned writemask)
{
	int i, lasti = 0;

	for (i = 0; i < 4; i++) {
		if (writemask & (1 << i)) {
			lasti = i;
		}
	}
	return lasti;
}

static int tgsi_is_supported(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *i = &ctx->parse.FullToken.FullInstruction;
	unsigned j;

	if (i->Instruction.NumDstRegs > 1 && i->Instruction.Opcode != TGSI_OPCODE_DFRACEXP) {
		R600_ERR("too many dst (%d)\n", i->Instruction.NumDstRegs);
		return -EINVAL;
	}
#if 0
	if (i->Instruction.Label) {
		R600_ERR("label unsupported\n");
		return -EINVAL;
	}
#endif
	for (j = 0; j < i->Instruction.NumSrcRegs; j++) {
		if (i->Src[j].Register.Dimension) {
			switch (i->Src[j].Register.File) {
			case TGSI_FILE_CONSTANT:
			case TGSI_FILE_HW_ATOMIC:
				break;
			case TGSI_FILE_INPUT:
				if (ctx->type == PIPE_SHADER_GEOMETRY ||
				    ctx->type == PIPE_SHADER_TESS_CTRL ||
				    ctx->type == PIPE_SHADER_TESS_EVAL)
					break;
				FALLTHROUGH;
			case TGSI_FILE_OUTPUT:
				if (ctx->type == PIPE_SHADER_TESS_CTRL)
					break;
				FALLTHROUGH;
			default:
				R600_ERR("unsupported src %d (file %d, dimension %d)\n", j,
					 i->Src[j].Register.File,
					 i->Src[j].Register.Dimension);
				return -EINVAL;
			}
		}
	}
	for (j = 0; j < i->Instruction.NumDstRegs; j++) {
		if (i->Dst[j].Register.Dimension) {
			if (ctx->type == PIPE_SHADER_TESS_CTRL)
				continue;
			R600_ERR("unsupported dst (dimension)\n");
			return -EINVAL;
		}
	}
	return 0;
}

int eg_get_interpolator_index(unsigned interpolate, unsigned location)
{
	if (interpolate == TGSI_INTERPOLATE_COLOR ||
	    interpolate == TGSI_INTERPOLATE_LINEAR ||
	    interpolate == TGSI_INTERPOLATE_PERSPECTIVE)
	{
		int is_linear = interpolate == TGSI_INTERPOLATE_LINEAR;
		int loc;

		switch(location) {
		case TGSI_INTERPOLATE_LOC_CENTER:
			loc = 1;
			break;
		case TGSI_INTERPOLATE_LOC_CENTROID:
			loc = 2;
			break;
		case TGSI_INTERPOLATE_LOC_SAMPLE:
		default:
			loc = 0; break;
		}

		return is_linear * 3 + loc;
	}

	return -1;
}

static void evergreen_interp_assign_ij_index(struct r600_shader_ctx *ctx,
					     int input)
{
	int i = eg_get_interpolator_index(
		ctx->shader->input[input].interpolate,
		ctx->shader->input[input].interpolate_location);
	assert(i >= 0);
	ctx->shader->input[input].ij_index = ctx->eg_interpolators[i].ij_index;
}
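
/* The barycentric i/j values are packed two interpolators per GPR: an even
 * ij_index lives in .xy and an odd one in .zw. The eight ALU ops below
 * issue INTERP_ZW and then INTERP_XY over one such pair, and only four of
 * the eight ops actually write the destination channels. */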
static int evergreen_interp_alu(struct r600_shader_ctx *ctx, int input)
{
	int i, r;
	struct r600_bytecode_alu alu;
	int gpr = 0, base_chan = 0;
	int ij_index = ctx->shader->input[input].ij_index;

	/* work out gpr and base_chan from index */
	gpr = ij_index / 2;
	base_chan = (2 * (ij_index % 2)) + 1;

	for (i = 0; i < 8; i++) {
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));

		if (i < 4)
			alu.op = ALU_OP2_INTERP_ZW;
		else
			alu.op = ALU_OP2_INTERP_XY;

		if ((i > 1) && (i < 6)) {
			alu.dst.sel = ctx->shader->input[input].gpr;
			alu.dst.write = 1;
		}

		alu.dst.chan = i % 4;

		alu.src[0].sel = gpr;
		alu.src[0].chan = (base_chan - (i % 2));

		alu.src[1].sel = V_SQ_ALU_SRC_PARAM_BASE + ctx->shader->input[input].lds_pos;

		alu.bank_swizzle_force = SQ_ALU_VEC_210;
		if ((i % 4) == 3)
			alu.last = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}
	return 0;
}

static int evergreen_interp_flat(struct r600_shader_ctx *ctx, int input)
{
	int i, r;
	struct r600_bytecode_alu alu;

	for (i = 0; i < 4; i++) {
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));

		alu.op = ALU_OP1_INTERP_LOAD_P0;

		alu.dst.sel = ctx->shader->input[input].gpr;
		alu.dst.write = 1;

		alu.dst.chan = i;

		alu.src[0].sel = V_SQ_ALU_SRC_PARAM_BASE + ctx->shader->input[input].lds_pos;
		alu.src[0].chan = i;

		if (i == 3)
			alu.last = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}
	return 0;
}

/*
 * Special export handling in shaders
 *
 * shader export ARRAY_BASE for EXPORT_POS:
 * 60 is position
 * 61 is misc vector
 * 62, 63 are clip distance vectors
 *
 * The use of the values exported in 61-63 are controlled by PA_CL_VS_OUT_CNTL:
 * VS_OUT_MISC_VEC_ENA - enables the use of all fields in export 61
 * USE_VTX_POINT_SIZE - point size in the X channel of export 61
 * USE_VTX_EDGE_FLAG - edge flag in the Y channel of export 61
 * USE_VTX_RENDER_TARGET_INDX - render target index in the Z channel of export 61
 * USE_VTX_VIEWPORT_INDX - viewport index in the W channel of export 61
 * USE_VTX_KILL_FLAG - kill flag in the Z channel of export 61 (mutually
 * exclusive from render target index)
 * VS_OUT_CCDIST0_VEC_ENA/VS_OUT_CCDIST1_VEC_ENA - enable clip distance vectors
 *
 *
 * shader export ARRAY_BASE for EXPORT_PIXEL:
 * 0-7 CB targets
 * 61 computed Z vector
 *
 * The use of the values exported in the computed Z vector are controlled
 * by DB_SHADER_CONTROL:
 * Z_EXPORT_ENABLE - Z as a float in RED
 * STENCIL_REF_EXPORT_ENABLE - stencil ref as int in GREEN
 * COVERAGE_TO_MASK_ENABLE - alpha to mask in ALPHA
 * MASK_EXPORT_ENABLE - pixel sample mask in BLUE
 * DB_SOURCE_FORMAT - export control restrictions
 *
 */


/* Map name/sid pair from tgsi to the 8-bit semantic index for SPI setup */
static int r600_spi_sid(struct r600_shader_io * io)
{
	int index, name = io->name;

	/* These params are handled differently, they don't need
	 * semantic indices, so we'll use 0 for them.
	 */
	if (name == TGSI_SEMANTIC_POSITION ||
	    name == TGSI_SEMANTIC_PSIZE ||
	    name == TGSI_SEMANTIC_EDGEFLAG ||
	    name == TGSI_SEMANTIC_FACE ||
	    name == TGSI_SEMANTIC_SAMPLEMASK)
		index = 0;
	else {
		if (name == TGSI_SEMANTIC_GENERIC) {
			/* For generic params simply use sid from tgsi */
			index = 9 + io->sid;
		} else if (name == TGSI_SEMANTIC_TEXCOORD) {
			index = io->sid;
		} else {
			/* For non-generic params - pack name and sid into 8 bits */
			index = 0x80 | (name << 3) | (io->sid);
		}

		/* Make sure that all really used indices have nonzero value, so
		 * we can just compare it to 0 later instead of comparing the name
		 * with different values to detect special cases. */
		index++;
	}

	return index;
}
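
/* Worked example: a GENERIC input with sid 5 maps to 9 + 5 = 14 and is
 * returned as 15 after the final increment; TEXCOORD keeps its own sid;
 * any other semantic is packed as 0x80 | (name << 3) | sid, again plus
 * one. */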
/* we need this to get a common lds index for vs/tcs/tes input/outputs */
int r600_get_lds_unique_index(unsigned semantic_name, unsigned index)
{
	switch (semantic_name) {
	case TGSI_SEMANTIC_POSITION:
		return 0;
	case TGSI_SEMANTIC_PSIZE:
		return 1;
	case TGSI_SEMANTIC_CLIPDIST:
		assert(index <= 1);
		return 2 + index;
	case TGSI_SEMANTIC_TEXCOORD:
		return 4 + index;
	case TGSI_SEMANTIC_COLOR:
		return 12 + index;
	case TGSI_SEMANTIC_BCOLOR:
		return 14 + index;
	case TGSI_SEMANTIC_CLIPVERTEX:
		return 16;
	case TGSI_SEMANTIC_GENERIC:
		if (index <= 63-17)
			return 17 + index;
		else
			/* same explanation as in the default statement,
			 * the only user hitting this is st/nine.
			 */
			return 0;

	/* patch indices are completely separate and thus start from 0 */
	case TGSI_SEMANTIC_TESSOUTER:
		return 0;
	case TGSI_SEMANTIC_TESSINNER:
		return 1;
	case TGSI_SEMANTIC_PATCH:
		return 2 + index;

	default:
		/* Don't fail here. The result of this function is only used
		 * for LS, TCS, TES, and GS, where legacy GL semantics can't
		 * occur, but this function is called for all vertex shaders
		 * before it's known whether LS will be compiled or not.
		 */
		return 0;
	}
}

/* turn input into interpolate on EG */
static int evergreen_interp_input(struct r600_shader_ctx *ctx, int index)
{
	int r = 0;

	if (ctx->shader->input[index].spi_sid) {
		ctx->shader->input[index].lds_pos = ctx->shader->nlds++;
		if (ctx->shader->input[index].interpolate > 0) {
			evergreen_interp_assign_ij_index(ctx, index);
			r = evergreen_interp_alu(ctx, index);
		} else {
			r = evergreen_interp_flat(ctx, index);
		}
	}
	return r;
}

static int select_twoside_color(struct r600_shader_ctx *ctx, int front, int back)
{
	struct r600_bytecode_alu alu;
	int i, r;
	int gpr_front = ctx->shader->input[front].gpr;
	int gpr_back = ctx->shader->input[back].gpr;

	for (i = 0; i < 4; i++) {
		memset(&alu, 0, sizeof(alu));
		alu.op = ALU_OP3_CNDGT;
		alu.is_op3 = 1;
		alu.dst.write = 1;
		alu.dst.sel = gpr_front;
		alu.src[0].sel = ctx->face_gpr;
		alu.src[1].sel = gpr_front;
		alu.src[2].sel = gpr_back;

		alu.dst.chan = i;
		alu.src[1].chan = i;
		alu.src[2].chan = i;
		alu.last = (i==3);

		if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
			return r;
	}

	return 0;
}

/* execute a single slot ALU calculation */
static int single_alu_op2(struct r600_shader_ctx *ctx, int op,
			  int dst_sel, int dst_chan,
			  int src0_sel, unsigned src0_chan_val,
			  int src1_sel, unsigned src1_chan_val)
{
	struct r600_bytecode_alu alu;
	int r, i;

	if (ctx->bc->gfx_level == CAYMAN && op == ALU_OP2_MULLO_INT) {
		for (i = 0; i < 4; i++) {
			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
			alu.op = op;
			alu.src[0].sel = src0_sel;
			if (src0_sel == V_SQ_ALU_SRC_LITERAL)
				alu.src[0].value = src0_chan_val;
			else
				alu.src[0].chan = src0_chan_val;
			alu.src[1].sel = src1_sel;
			if (src1_sel == V_SQ_ALU_SRC_LITERAL)
				alu.src[1].value = src1_chan_val;
			else
				alu.src[1].chan = src1_chan_val;
			alu.dst.sel = dst_sel;
			alu.dst.chan = i;
			alu.dst.write = i == dst_chan;
			alu.last = (i == 3);
			r = r600_bytecode_add_alu(ctx->bc, &alu);
			if (r)
				return r;
		}
		return 0;
	}

	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
	alu.op = op;
	alu.src[0].sel = src0_sel;
	if (src0_sel == V_SQ_ALU_SRC_LITERAL)
		alu.src[0].value = src0_chan_val;
	else
		alu.src[0].chan = src0_chan_val;
	alu.src[1].sel = src1_sel;
	if (src1_sel == V_SQ_ALU_SRC_LITERAL)
		alu.src[1].value = src1_chan_val;
	else
		alu.src[1].chan = src1_chan_val;
	alu.dst.sel = dst_sel;
	alu.dst.chan = dst_chan;
	alu.dst.write = 1;
	alu.last = 1;
	r = r600_bytecode_add_alu(ctx->bc, &alu);
	if (r)
		return r;
	return 0;
}

/* execute a single slot ALU calculation */
static int single_alu_op3(struct r600_shader_ctx *ctx, int op,
			  int dst_sel, int dst_chan,
			  int src0_sel, unsigned src0_chan_val,
			  int src1_sel, unsigned src1_chan_val,
			  int src2_sel, unsigned src2_chan_val)
{
	struct r600_bytecode_alu alu;
	int r;

	/* validate this for other ops */
	assert(op == ALU_OP3_MULADD_UINT24 || op == ALU_OP3_CNDE_INT || op == ALU_OP3_BFE_UINT);
	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
	alu.op = op;
	alu.src[0].sel = src0_sel;
	if (src0_sel == V_SQ_ALU_SRC_LITERAL)
		alu.src[0].value = src0_chan_val;
	else
		alu.src[0].chan = src0_chan_val;
	alu.src[1].sel = src1_sel;
	if (src1_sel == V_SQ_ALU_SRC_LITERAL)
		alu.src[1].value = src1_chan_val;
	else
		alu.src[1].chan = src1_chan_val;
	alu.src[2].sel = src2_sel;
	if (src2_sel == V_SQ_ALU_SRC_LITERAL)
		alu.src[2].value = src2_chan_val;
	else
		alu.src[2].chan = src2_chan_val;
	alu.dst.sel = dst_sel;
	alu.dst.chan = dst_chan;
	alu.is_op3 = 1;
	alu.last = 1;
	r = r600_bytecode_add_alu(ctx->bc, &alu);
	if (r)
		return r;
	return 0;
}

/* put it in temp_reg.x */
static int get_lds_offset0(struct r600_shader_ctx *ctx,
			   int rel_patch_chan,
			   int temp_reg, bool is_patch_var)
{
	int r;

	/* MUL temp.x, patch_stride (input_vals.x), rel_patch_id (r0.y (tcs)) */
	/* ADD
	   Dimension - patch0_offset (input_vals.z),
	   Non-dim - patch0_data_offset (input_vals.w)
	*/
	r = single_alu_op3(ctx, ALU_OP3_MULADD_UINT24,
			   temp_reg, 0,
			   ctx->tess_output_info, 0,
			   0, rel_patch_chan,
			   ctx->tess_output_info, is_patch_var ? 3 : 2);
	if (r)
		return r;
	return 0;
}
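
/* MULADD_UINT24 is deliberate here: LDS offsets and patch strides stay far
 * below 2^24, and the 24-bit multiply-add avoids the t-slot-only 32-bit
 * integer multiply, so the whole offset computation is a single ALU op. */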
static inline int get_address_file_reg(struct r600_shader_ctx *ctx, int index)
{
	return index > 0 ? ctx->bc->index_reg[index - 1] : ctx->bc->ar_reg;
}

static int r600_get_temp(struct r600_shader_ctx *ctx)
{
	return ctx->temp_reg + ctx->max_driver_temp_used++;
}

static int vs_add_primid_output(struct r600_shader_ctx *ctx, int prim_id_sid)
{
	int i;
	i = ctx->shader->noutput++;
	ctx->shader->output[i].name = TGSI_SEMANTIC_PRIMID;
	ctx->shader->output[i].sid = 0;
	ctx->shader->output[i].gpr = 0;
	ctx->shader->output[i].interpolate = TGSI_INTERPOLATE_CONSTANT;
	ctx->shader->output[i].write_mask = 0x4;
	ctx->shader->output[i].spi_sid = prim_id_sid;

	return 0;
}

static int tgsi_barrier(struct r600_shader_ctx *ctx)
{
	struct r600_bytecode_alu alu;
	int r;

	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
	alu.op = ctx->inst_info->op;
	alu.last = 1;

	r = r600_bytecode_add_alu(ctx->bc, &alu);
	if (r)
		return r;

	/* XXX: Need to implement GWS ops to sync across wavefronts */

	return 0;
}

static int tgsi_membar(struct r600_shader_ctx *ctx)
{
	/* Wait for any SSBO/image stores to land. */
	return r600_bytecode_wait_acks(ctx->bc);
}

static void choose_spill_arrays(struct r600_shader_ctx *ctx, int *regno, unsigned *scratch_space_needed)
{
	// pick largest array and spill it, repeat until the number of temps is under limit or we run out of arrays
	unsigned n = ctx->info.array_max[TGSI_FILE_TEMPORARY];
	unsigned narrays_left = n;
	bool *spilled = ctx->spilled_arrays; // assumed calloc'ed

	*scratch_space_needed = 0;
	while (*regno > 124 && narrays_left) {
		unsigned i;
		unsigned largest = 0;
		unsigned largest_index = 0;

		for (i = 0; i < n; i++) {
			unsigned size = ctx->array_infos[i].range.Last - ctx->array_infos[i].range.First + 1;
			if (!spilled[i] && size > largest) {
				largest = size;
				largest_index = i;
			}
		}

		spilled[largest_index] = true;
		*regno -= largest;
		*scratch_space_needed += largest;

		narrays_left--;
	}

	if (narrays_left == 0) {
		ctx->info.indirect_files &= ~(1 << TGSI_FILE_TEMPORARY);
	}
}

/* Take spilled temp arrays into account when translating tgsi register
 * indexes into r600 gprs if spilled is false, or scratch array offset if
 * spilled is true */
static int map_tgsi_reg_index_to_r600_gpr(struct r600_shader_ctx *ctx, unsigned tgsi_reg_index, bool *spilled)
{
	unsigned i;
	unsigned spilled_size = 0;

	for (i = 0; i < ctx->info.array_max[TGSI_FILE_TEMPORARY]; i++) {
		if (tgsi_reg_index >= ctx->array_infos[i].range.First && tgsi_reg_index <= ctx->array_infos[i].range.Last) {
			if (ctx->spilled_arrays[i]) {
				/* vec4 index into spilled scratch memory */
				*spilled = true;
				return tgsi_reg_index - ctx->array_infos[i].range.First + spilled_size;
			}
			else {
				/* regular GPR array */
				*spilled = false;
				return tgsi_reg_index - spilled_size + ctx->file_offset[TGSI_FILE_TEMPORARY];
			}
		}

		if (tgsi_reg_index < ctx->array_infos[i].range.First)
			break;
		if (ctx->spilled_arrays[i]) {
			spilled_size += ctx->array_infos[i].range.Last - ctx->array_infos[i].range.First + 1;
		}
	}

	/* regular GPR index, minus the holes from spilled arrays */
	*spilled = false;

	return tgsi_reg_index - spilled_size + ctx->file_offset[TGSI_FILE_TEMPORARY];
}
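
/* Example: with temp arrays [2..5] (spilled) and [8..9] (kept), index 3
 * maps to scratch vec4 slot 1, while index 8 becomes GPR
 * 8 - 4 + file_offset[TGSI_FILE_TEMPORARY], since the four spilled
 * registers no longer occupy GPR space. */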
/* look up spill area base offset and array size for a spilled temp array */
static void get_spilled_array_base_and_size(struct r600_shader_ctx *ctx, unsigned tgsi_reg_index,
					    unsigned *array_base, unsigned *array_size)
{
	unsigned i;
	unsigned offset = 0;

	for (i = 0; i < ctx->info.array_max[TGSI_FILE_TEMPORARY]; i++) {
		if (ctx->spilled_arrays[i]) {
			unsigned size = ctx->array_infos[i].range.Last - ctx->array_infos[i].range.First + 1;

			if (tgsi_reg_index >= ctx->array_infos[i].range.First && tgsi_reg_index <= ctx->array_infos[i].range.Last) {
				*array_base = offset;
				*array_size = size - 1; /* hw counts from 1 */

				return;
			}

			offset += size;
		}
	}
}

static int tgsi_declaration(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_declaration *d = &ctx->parse.FullToken.FullDeclaration;
	int r, i, j, count = d->Range.Last - d->Range.First + 1;

	switch (d->Declaration.File) {
	case TGSI_FILE_INPUT:
		for (j = 0; j < count; j++) {
			i = ctx->shader->ninput + j;
			assert(i < ARRAY_SIZE(ctx->shader->input));
			ctx->shader->input[i].name = d->Semantic.Name;
			ctx->shader->input[i].sid = d->Semantic.Index + j;
			ctx->shader->input[i].interpolate = d->Interp.Interpolate;
			ctx->shader->input[i].interpolate_location = d->Interp.Location;
			ctx->shader->input[i].gpr = ctx->file_offset[TGSI_FILE_INPUT] + d->Range.First + j;
			if (ctx->type == PIPE_SHADER_FRAGMENT) {
				ctx->shader->input[i].spi_sid = r600_spi_sid(&ctx->shader->input[i]);
				switch (ctx->shader->input[i].name) {
				case TGSI_SEMANTIC_FACE:
					if (ctx->face_gpr != -1)
						ctx->shader->input[i].gpr = ctx->face_gpr; /* already allocated by allocate_system_value_inputs */
					else
						ctx->face_gpr = ctx->shader->input[i].gpr;
					break;
				case TGSI_SEMANTIC_COLOR:
					ctx->colors_used++;
					break;
				case TGSI_SEMANTIC_POSITION:
					ctx->fragcoord_input = i;
					break;
				case TGSI_SEMANTIC_PRIMID:
					/* set this for now */
					ctx->shader->gs_prim_id_input = true;
					ctx->shader->ps_prim_id_input = i;
					break;
				}
				if (ctx->bc->gfx_level >= EVERGREEN) {
					if ((r = evergreen_interp_input(ctx, i)))
						return r;
				}
			} else if (ctx->type == PIPE_SHADER_GEOMETRY) {
				/* FIXME probably skip inputs if they aren't passed in the ring */
				ctx->shader->input[i].ring_offset = ctx->next_ring_offset;
				ctx->next_ring_offset += 16;
				if (ctx->shader->input[i].name == TGSI_SEMANTIC_PRIMID)
					ctx->shader->gs_prim_id_input = true;
			}
		}
		ctx->shader->ninput += count;
		break;
	case TGSI_FILE_OUTPUT:
		for (j = 0; j < count; j++) {
			i = ctx->shader->noutput + j;
			assert(i < ARRAY_SIZE(ctx->shader->output));
			ctx->shader->output[i].name = d->Semantic.Name;
			ctx->shader->output[i].sid = d->Semantic.Index + j;
			ctx->shader->output[i].gpr = ctx->file_offset[TGSI_FILE_OUTPUT] + d->Range.First + j;
			ctx->shader->output[i].interpolate = d->Interp.Interpolate;
			ctx->shader->output[i].write_mask = d->Declaration.UsageMask;
			if (ctx->type == PIPE_SHADER_VERTEX ||
			    ctx->type == PIPE_SHADER_GEOMETRY ||
			    ctx->type == PIPE_SHADER_TESS_EVAL) {
				ctx->shader->output[i].spi_sid = r600_spi_sid(&ctx->shader->output[i]);
				switch (d->Semantic.Name) {
				case TGSI_SEMANTIC_CLIPDIST:
					break;
				case TGSI_SEMANTIC_PSIZE:
					ctx->shader->vs_out_misc_write = 1;
					ctx->shader->vs_out_point_size = 1;
					break;
				case TGSI_SEMANTIC_EDGEFLAG:
					ctx->shader->vs_out_misc_write = 1;
					ctx->shader->vs_out_edgeflag = 1;
					ctx->edgeflag_output = i;
					break;
				case TGSI_SEMANTIC_VIEWPORT_INDEX:
					ctx->shader->vs_out_misc_write = 1;
					ctx->shader->vs_out_viewport = 1;
					break;
				case TGSI_SEMANTIC_LAYER:
					ctx->shader->vs_out_misc_write = 1;
					ctx->shader->vs_out_layer = 1;
					break;
				case TGSI_SEMANTIC_CLIPVERTEX:
					ctx->clip_vertex_write = TRUE;
					ctx->cv_output = i;
					break;
				}
				if (ctx->type == PIPE_SHADER_GEOMETRY) {
					ctx->gs_out_ring_offset += 16;
				}
			}
		}
		ctx->shader->noutput += count;
		break;
	case TGSI_FILE_TEMPORARY:
		if (ctx->info.indirect_files & (1 << TGSI_FILE_TEMPORARY)) {
			if (d->Array.ArrayID) {
				bool spilled;
				unsigned idx = map_tgsi_reg_index_to_r600_gpr(ctx,
					d->Range.First,
					&spilled);

				if (!spilled) {
					r600_add_gpr_array(ctx->shader, idx,
						d->Range.Last - d->Range.First + 1, 0x0F);
				}
			}
		}
		break;

	case TGSI_FILE_CONSTANT:
	case TGSI_FILE_SAMPLER:
	case TGSI_FILE_SAMPLER_VIEW:
	case TGSI_FILE_ADDRESS:
	case TGSI_FILE_BUFFER:
	case TGSI_FILE_IMAGE:
	case TGSI_FILE_MEMORY:
		break;

	case TGSI_FILE_HW_ATOMIC:
		i = ctx->shader->nhwatomic_ranges;
		ctx->shader->atomics[i].start = d->Range.First;
		ctx->shader->atomics[i].end = d->Range.Last;
		ctx->shader->atomics[i].hw_idx = ctx->shader->atomic_base + ctx->shader->nhwatomic;
		ctx->shader->atomics[i].buffer_id = d->Dim.Index2D;
		ctx->shader->nhwatomic_ranges++;
		ctx->shader->nhwatomic += count;
		break;

	case TGSI_FILE_SYSTEM_VALUE:
		if (d->Semantic.Name == TGSI_SEMANTIC_SAMPLEMASK ||
		    d->Semantic.Name == TGSI_SEMANTIC_SAMPLEID ||
		    d->Semantic.Name == TGSI_SEMANTIC_SAMPLEPOS) {
			break; /* Already handled from allocate_system_value_inputs */
		} else if (d->Semantic.Name == TGSI_SEMANTIC_INSTANCEID) {
			break;
		} else if (d->Semantic.Name == TGSI_SEMANTIC_VERTEXID)
			break;
		else if (d->Semantic.Name == TGSI_SEMANTIC_INVOCATIONID)
			break;
		else if (d->Semantic.Name == TGSI_SEMANTIC_TESSINNER ||
			 d->Semantic.Name == TGSI_SEMANTIC_TESSOUTER) {
			int param = r600_get_lds_unique_index(d->Semantic.Name, 0);
			int dreg = d->Semantic.Name == TGSI_SEMANTIC_TESSINNER ? 3 : 2;
			unsigned temp_reg = r600_get_temp(ctx);

			r = get_lds_offset0(ctx, 2, temp_reg, true);
			if (r)
				return r;

			r = single_alu_op2(ctx, ALU_OP2_ADD_INT,
					   temp_reg, 0,
					   temp_reg, 0,
					   V_SQ_ALU_SRC_LITERAL, param * 16);
			if (r)
				return r;

			do_lds_fetch_values(ctx, temp_reg, dreg, 0xf);
		}
		else if (d->Semantic.Name == TGSI_SEMANTIC_TESSCOORD) {
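			/* TessCoord arrives in r0.xy; the third barycentric
			 * coordinate (needed for triangle domains) is derived
			 * below as r1.z = 1.0 - u - v. */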
			/* MOV r1.x, r0.x;
			   MOV r1.y, r0.y;
			*/
			for (i = 0; i < 2; i++) {
				struct r600_bytecode_alu alu;
				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
				alu.op = ALU_OP1_MOV;
				alu.src[0].sel = 0;
				alu.src[0].chan = 0 + i;
				alu.dst.sel = 1;
				alu.dst.chan = 0 + i;
				alu.dst.write = 1;
				alu.last = (i == 1) ? 1 : 0;
				if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
					return r;
			}
			/* ADD r1.z, 1.0f, -r0.x */
			struct r600_bytecode_alu alu;
			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
			alu.op = ALU_OP2_ADD;
			alu.src[0].sel = V_SQ_ALU_SRC_1;
			alu.src[1].sel = 1;
			alu.src[1].chan = 0;
			alu.src[1].neg = 1;
			alu.dst.sel = 1;
			alu.dst.chan = 2;
			alu.dst.write = 1;
			alu.last = 1;
			if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
				return r;

			/* ADD r1.z, r1.z, -r1.y */
			alu.op = ALU_OP2_ADD;
			alu.src[0].sel = 1;
			alu.src[0].chan = 2;
			alu.src[1].sel = 1;
			alu.src[1].chan = 1;
			alu.src[1].neg = 1;
			alu.dst.sel = 1;
			alu.dst.chan = 2;
			alu.dst.write = 1;
			alu.last = 1;
			if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
				return r;
			break;
		}
		break;
	default:
		R600_ERR("unsupported file %d declaration\n", d->Declaration.File);
		return -EINVAL;
	}
	return 0;
}

static int allocate_system_value_inputs(struct r600_shader_ctx *ctx, int gpr_offset)
{
	struct tgsi_parse_context parse;
	struct {
		boolean enabled;
		int *reg;
		unsigned name, alternate_name;
	} inputs[2] = {
		{ false, &ctx->face_gpr, TGSI_SEMANTIC_SAMPLEMASK, ~0u }, /* lives in Front Face GPR.z */

		{ false, &ctx->fixed_pt_position_gpr, TGSI_SEMANTIC_SAMPLEID, TGSI_SEMANTIC_SAMPLEPOS } /* SAMPLEID is in Fixed Point Position GPR.w */
	};
	int num_regs = 0;
	unsigned k, i;

	if (tgsi_parse_init(&parse, ctx->tokens) != TGSI_PARSE_OK) {
		return 0;
	}

	/* need to scan shader for system values and interpolateAtSample/Offset/Centroid */
	while (!tgsi_parse_end_of_tokens(&parse)) {
		tgsi_parse_token(&parse);

		if (parse.FullToken.Token.Type == TGSI_TOKEN_TYPE_INSTRUCTION) {
			const struct tgsi_full_instruction *inst = &parse.FullToken.FullInstruction;
			if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_SAMPLE ||
			    inst->Instruction.Opcode == TGSI_OPCODE_INTERP_OFFSET ||
			    inst->Instruction.Opcode == TGSI_OPCODE_INTERP_CENTROID)
			{
				int interpolate, location, k;

				if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_SAMPLE) {
					location = TGSI_INTERPOLATE_LOC_CENTER;
				} else if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_OFFSET) {
					location = TGSI_INTERPOLATE_LOC_CENTER;
					/* Needs sample positions, currently those are always available */
				} else {
					location = TGSI_INTERPOLATE_LOC_CENTROID;
				}

				interpolate = ctx->info.input_interpolate[inst->Src[0].Register.Index];
				k = eg_get_interpolator_index(interpolate, location);
				if (k >= 0)
					ctx->eg_interpolators[k].enabled = true;
			}
		} else if (parse.FullToken.Token.Type == TGSI_TOKEN_TYPE_DECLARATION) {
			struct tgsi_full_declaration *d = &parse.FullToken.FullDeclaration;
			if (d->Declaration.File == TGSI_FILE_SYSTEM_VALUE) {
				for (k = 0; k < ARRAY_SIZE(inputs); k++) {
					if (d->Semantic.Name == inputs[k].name ||
					    d->Semantic.Name == inputs[k].alternate_name) {
						inputs[k].enabled = true;
					}
				}
			}
		}
	}

	tgsi_parse_free(&parse);

	if (ctx->info.reads_samplemask &&
	    (ctx->info.uses_linear_sample || ctx->info.uses_persp_sample)) {
		inputs[1].enabled = true;
	}
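
	/* Barycentric ij pairs pack two per GPR, so the system-value
	 * registers below start after ceil(num_baryc / 2) interpolation
	 * GPRs. */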
	if (ctx->bc->gfx_level >= EVERGREEN) {
		int num_baryc = 0;
		/* assign gpr to each interpolator according to priority */
		for (i = 0; i < ARRAY_SIZE(ctx->eg_interpolators); i++) {
			if (ctx->eg_interpolators[i].enabled) {
				ctx->eg_interpolators[i].ij_index = num_baryc;
				num_baryc++;
			}
		}
		num_baryc = (num_baryc + 1) >> 1;
		gpr_offset += num_baryc;
	}

	for (i = 0; i < ARRAY_SIZE(inputs); i++) {
		boolean enabled = inputs[i].enabled;
		int *reg = inputs[i].reg;
		unsigned name = inputs[i].name;

		if (enabled) {
			int gpr = gpr_offset + num_regs++;
			ctx->shader->nsys_inputs++;

			// add to inputs, allocate a gpr
			k = ctx->shader->ninput++;
			ctx->shader->input[k].name = name;
			ctx->shader->input[k].sid = 0;
			ctx->shader->input[k].interpolate = TGSI_INTERPOLATE_CONSTANT;
			ctx->shader->input[k].interpolate_location = TGSI_INTERPOLATE_LOC_CENTER;
			*reg = ctx->shader->input[k].gpr = gpr;
		}
	}

	return gpr_offset + num_regs;
}

/*
 * for evergreen we need to scan the shader to find the number of GPRs we need to
 * reserve for interpolation and system values
 *
 * we need to know if we are going to emit any sample or centroid inputs
 * if perspective and linear are required
 */
static int evergreen_gpr_count(struct r600_shader_ctx *ctx)
{
	unsigned i;

	memset(&ctx->eg_interpolators, 0, sizeof(ctx->eg_interpolators));

	/*
	 * Could get this information from the shader info. But right now
	 * we interpolate all declared inputs, whereas the shader info will
	 * only contain the bits if the inputs are actually used, so it might
	 * not be safe...
	 */
	for (i = 0; i < ctx->info.num_inputs; i++) {
		int k;
		/* skip position/face/mask/sampleid */
		if (ctx->info.input_semantic_name[i] == TGSI_SEMANTIC_POSITION ||
		    ctx->info.input_semantic_name[i] == TGSI_SEMANTIC_FACE ||
		    ctx->info.input_semantic_name[i] == TGSI_SEMANTIC_SAMPLEMASK ||
		    ctx->info.input_semantic_name[i] == TGSI_SEMANTIC_SAMPLEID)
			continue;

		k = eg_get_interpolator_index(
			ctx->info.input_interpolate[i],
			ctx->info.input_interpolate_loc[i]);
		if (k >= 0)
			ctx->eg_interpolators[k].enabled = TRUE;
	}

	/* XXX PULL MODEL and LINE STIPPLE */

	return allocate_system_value_inputs(ctx, 0);
}

/* sample_id_sel == NULL means fetch for current sample */
static int load_sample_position(struct r600_shader_ctx *ctx, struct r600_shader_src *sample_id, int chan_sel)
{
	struct r600_bytecode_vtx vtx;
	int r, t1;

	t1 = r600_get_temp(ctx);

	memset(&vtx, 0, sizeof(struct r600_bytecode_vtx));
	vtx.op = FETCH_OP_VFETCH;
	vtx.buffer_id = R600_BUFFER_INFO_CONST_BUFFER;
	vtx.fetch_type = SQ_VTX_FETCH_NO_INDEX_OFFSET;
	if (sample_id == NULL) {
		assert(ctx->fixed_pt_position_gpr != -1);

		vtx.src_gpr = ctx->fixed_pt_position_gpr; // SAMPLEID is in .w;
		vtx.src_sel_x = 3;
	}
	else {
		struct r600_bytecode_alu alu;

		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP1_MOV;
		r600_bytecode_src(&alu.src[0], sample_id, chan_sel);
		alu.dst.sel = t1;
		alu.dst.write = 1;
		alu.last = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;

		vtx.src_gpr = t1;
		vtx.src_sel_x = 0;
	}
	vtx.mega_fetch_count = 16;
	vtx.dst_gpr = t1;
	vtx.dst_sel_x = 0;
	vtx.dst_sel_y = 1;
	vtx.dst_sel_z = 2;
	vtx.dst_sel_w = 3;
	vtx.data_format = FMT_32_32_32_32_FLOAT;
	vtx.num_format_all = 2;
	vtx.format_comp_all = 1;
	vtx.use_const_fields = 0;
	vtx.offset = 0;
	vtx.endian = r600_endian_swap(32);
	vtx.srf_mode_all = 1; /* SRF_MODE_NO_ZERO */

	r = r600_bytecode_add_vtx(ctx->bc, &vtx);
	if (r)
		return r;

	return t1;
}

static int eg_load_helper_invocation(struct r600_shader_ctx *ctx)
{
	int r;
	struct r600_bytecode_alu alu;

	/* do a vtx fetch with wqm set on the vtx fetch */
	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
	alu.op = ALU_OP1_MOV;
	alu.dst.sel = ctx->helper_invoc_reg;
	alu.dst.chan = 0;
	alu.src[0].sel = V_SQ_ALU_SRC_LITERAL;
	alu.src[0].value = 0xffffffff;
	alu.dst.write = 1;
	alu.last = 1;
	r = r600_bytecode_add_alu(ctx->bc, &alu);
	if (r)
		return r;

	/* do a vtx fetch in VPM mode */
	struct r600_bytecode_vtx vtx;
	memset(&vtx, 0, sizeof(vtx));
	vtx.op = FETCH_OP_GET_BUFFER_RESINFO;
	vtx.buffer_id = R600_BUFFER_INFO_CONST_BUFFER;
	vtx.fetch_type = SQ_VTX_FETCH_NO_INDEX_OFFSET;
	vtx.src_gpr = 0;
	vtx.mega_fetch_count = 16; /* no idea here really... */
	vtx.dst_gpr = ctx->helper_invoc_reg;
	vtx.dst_sel_x = 4;
	vtx.dst_sel_y = 7; /* SEL_Y */
	vtx.dst_sel_z = 7; /* SEL_Z */
	vtx.dst_sel_w = 7; /* SEL_W */
	vtx.data_format = FMT_32;
	if ((r = r600_bytecode_add_vtx_tc(ctx->bc, &vtx)))
		return r;
	ctx->bc->cf_last->vpm = 1;
	return 0;
}
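
/* Cayman variant: seed the flag with ~0, then clear it with a second MOV
 * executed in VALID_PIXEL_MODE; that clear is suppressed for helper
 * invocations, which therefore keep the nonzero value. */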
static int cm_load_helper_invocation(struct r600_shader_ctx *ctx)
{
	int r;
	struct r600_bytecode_alu alu;

	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
	alu.op = ALU_OP1_MOV;
	alu.dst.sel = ctx->helper_invoc_reg;
	alu.dst.chan = 0;
	alu.src[0].sel = V_SQ_ALU_SRC_LITERAL;
	alu.src[0].value = 0xffffffff;
	alu.dst.write = 1;
	alu.last = 1;
	r = r600_bytecode_add_alu(ctx->bc, &alu);
	if (r)
		return r;

	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
	alu.op = ALU_OP1_MOV;
	alu.dst.sel = ctx->helper_invoc_reg;
	alu.dst.chan = 0;
	alu.src[0].sel = V_SQ_ALU_SRC_0;
	alu.dst.write = 1;
	alu.last = 1;
	r = r600_bytecode_add_alu_type(ctx->bc, &alu, CF_OP_ALU_VALID_PIXEL_MODE);
	if (r)
		return r;

	return ctx->helper_invoc_reg;
}

static int load_block_grid_size(struct r600_shader_ctx *ctx, bool load_block)
{
	struct r600_bytecode_vtx vtx;
	int r, t1;

	if (load_block && ctx->cs_block_size_loaded)
		return ctx->cs_block_size_reg;
	if (!load_block && ctx->cs_grid_size_loaded)
		return ctx->cs_grid_size_reg;

	t1 = load_block ? ctx->cs_block_size_reg : ctx->cs_grid_size_reg;
	struct r600_bytecode_alu alu;
	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
	alu.op = ALU_OP1_MOV;
	alu.src[0].sel = V_SQ_ALU_SRC_0;
	alu.dst.sel = t1;
	alu.dst.write = 1;
	alu.last = 1;
	r = r600_bytecode_add_alu(ctx->bc, &alu);
	if (r)
		return r;

	memset(&vtx, 0, sizeof(struct r600_bytecode_vtx));
	vtx.op = FETCH_OP_VFETCH;
	vtx.buffer_id = R600_BUFFER_INFO_CONST_BUFFER;
	vtx.fetch_type = SQ_VTX_FETCH_NO_INDEX_OFFSET;
	vtx.src_gpr = t1;
	vtx.src_sel_x = 0;

	vtx.mega_fetch_count = 16;
	vtx.dst_gpr = t1;
	vtx.dst_sel_x = 0;
	vtx.dst_sel_y = 1;
	vtx.dst_sel_z = 2;
	vtx.dst_sel_w = 7;
	vtx.data_format = FMT_32_32_32_32;
	vtx.num_format_all = 1;
	vtx.format_comp_all = 0;
	vtx.use_const_fields = 0;
	vtx.offset = load_block ? 0 : 16; // first element is size of buffer
	vtx.endian = r600_endian_swap(32);
	vtx.srf_mode_all = 1; /* SRF_MODE_NO_ZERO */

	r = r600_bytecode_add_vtx(ctx->bc, &vtx);
	if (r)
		return r;

	if (load_block)
		ctx->cs_block_size_loaded = true;
	else
		ctx->cs_grid_size_loaded = true;
	return t1;
}

static void tgsi_src(struct r600_shader_ctx *ctx,
		     const struct tgsi_full_src_register *tgsi_src,
		     struct r600_shader_src *r600_src)
{
	memset(r600_src, 0, sizeof(*r600_src));
	r600_src->swizzle[0] = tgsi_src->Register.SwizzleX;
	r600_src->swizzle[1] = tgsi_src->Register.SwizzleY;
	r600_src->swizzle[2] = tgsi_src->Register.SwizzleZ;
	r600_src->swizzle[3] = tgsi_src->Register.SwizzleW;
	r600_src->neg = tgsi_src->Register.Negate;
	r600_src->abs = tgsi_src->Register.Absolute;

	if (tgsi_src->Register.File == TGSI_FILE_TEMPORARY) {
		bool spilled;
		unsigned idx;

		idx = map_tgsi_reg_index_to_r600_gpr(ctx, tgsi_src->Register.Index, &spilled);

		if (spilled) {
			int reg = r600_get_temp(ctx);
			int r;

			r600_src->sel = reg;

			if (ctx->bc->gfx_level < R700) {
				struct r600_bytecode_output cf;

				memset(&cf, 0, sizeof(struct r600_bytecode_output));
				cf.op = CF_OP_MEM_SCRATCH;
				cf.elem_size = 3;
				cf.gpr = reg;
				cf.comp_mask = 0xF;
				cf.swizzle_x = 0;
				cf.swizzle_y = 1;
				cf.swizzle_z = 2;
				cf.swizzle_w = 3;
				cf.burst_count = 1;

				get_spilled_array_base_and_size(ctx, tgsi_src->Register.Index,
					&cf.array_base, &cf.array_size);

				if (tgsi_src->Register.Indirect) {
					cf.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_READ_IND;
					cf.index_gpr = ctx->bc->ar_reg;
				}
				else {
					cf.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_READ;
					cf.array_base += idx;
					cf.array_size = 0;
				}

				r = r600_bytecode_add_output(ctx->bc, &cf);
			}
			else {
				struct r600_bytecode_vtx vtx;

				r600_bytecode_wait_acks(ctx->bc);

				memset(&vtx, 0, sizeof(struct r600_bytecode_vtx));
				vtx.op = FETCH_OP_READ_SCRATCH;
				vtx.dst_gpr = reg;
				vtx.uncached = 1; // Must bypass cache since prior spill written in same invocation
				vtx.elem_size = 3;
				vtx.data_format = FMT_32_32_32_32;
				vtx.num_format_all = V_038010_SQ_NUM_FORMAT_INT;
				vtx.dst_sel_x = tgsi_src->Register.SwizzleX;
				vtx.dst_sel_y = tgsi_src->Register.SwizzleY;
				vtx.dst_sel_z = tgsi_src->Register.SwizzleZ;
				vtx.dst_sel_w = tgsi_src->Register.SwizzleW;

				get_spilled_array_base_and_size(ctx, tgsi_src->Register.Index,
					&vtx.array_base, &vtx.array_size);

				if (tgsi_src->Register.Indirect) {
					vtx.indexed = 1;
					vtx.src_gpr = ctx->bc->ar_reg;
				}
				else {
					vtx.array_base += idx;
					vtx.array_size = 0;
				}

				r = r600_bytecode_add_vtx(ctx->bc, &vtx);
			}

			if (r)
				return;
		}
		else {
			if (tgsi_src->Register.Indirect)
				r600_src->rel = V_SQ_REL_RELATIVE;

			r600_src->sel = idx;
		}

		return;
	}

	if (tgsi_src->Register.File == TGSI_FILE_IMMEDIATE) {
		int index;
		if ((tgsi_src->Register.SwizzleX == tgsi_src->Register.SwizzleY) &&
		    (tgsi_src->Register.SwizzleX == tgsi_src->Register.SwizzleZ) &&
		    (tgsi_src->Register.SwizzleX == tgsi_src->Register.SwizzleW)) {

			index = tgsi_src->Register.Index * 4 + tgsi_src->Register.SwizzleX;
			r600_bytecode_special_constants(ctx->literals[index], &r600_src->sel);
			if (r600_src->sel != V_SQ_ALU_SRC_LITERAL)
				return;
		}
		index = tgsi_src->Register.Index;
		r600_src->sel = V_SQ_ALU_SRC_LITERAL;
		memcpy(r600_src->value, ctx->literals + index * 4, sizeof(r600_src->value));
	} else if (tgsi_src->Register.File == TGSI_FILE_SYSTEM_VALUE) {
		if (ctx->info.system_value_semantic_name[tgsi_src->Register.Index] == TGSI_SEMANTIC_SAMPLEMASK) {
			r600_src->swizzle[0] = 2; // Z value
			r600_src->swizzle[1] = 2;
			r600_src->swizzle[2] = 2;
			r600_src->swizzle[3] = 2;
			r600_src->sel = ctx->face_gpr;
		} else if (ctx->info.system_value_semantic_name[tgsi_src->Register.Index] == TGSI_SEMANTIC_SAMPLEID) {
			r600_src->swizzle[0] = 3; // W value
			r600_src->swizzle[1] = 3;
			r600_src->swizzle[2] = 3;
			r600_src->swizzle[3] = 3;
			r600_src->sel = ctx->fixed_pt_position_gpr;
		} else if (ctx->info.system_value_semantic_name[tgsi_src->Register.Index] == TGSI_SEMANTIC_SAMPLEPOS) {
			r600_src->swizzle[0] = 0;
			r600_src->swizzle[1] = 1;
			r600_src->swizzle[2] = 4;
			r600_src->swizzle[3] = 4;
			r600_src->sel = load_sample_position(ctx, NULL, -1);
		} else if (ctx->info.system_value_semantic_name[tgsi_src->Register.Index] == TGSI_SEMANTIC_INSTANCEID) {
			r600_src->swizzle[0] = 3;
			r600_src->swizzle[1] = 3;
			r600_src->swizzle[2] = 3;
			r600_src->swizzle[3] = 3;
			r600_src->sel = 0;
		} else if (ctx->info.system_value_semantic_name[tgsi_src->Register.Index] == TGSI_SEMANTIC_VERTEXID) {
			r600_src->swizzle[0] = 0;
			r600_src->swizzle[1] = 0;
			r600_src->swizzle[2] = 0;
			r600_src->swizzle[3] = 0;
			r600_src->sel = 0;
		} else if (ctx->info.system_value_semantic_name[tgsi_src->Register.Index] == TGSI_SEMANTIC_THREAD_ID) {
			r600_src->sel = 0;
		} else if (ctx->info.system_value_semantic_name[tgsi_src->Register.Index] == TGSI_SEMANTIC_BLOCK_ID) {
			r600_src->sel = 1;
		} else if (ctx->type != PIPE_SHADER_TESS_CTRL && ctx->info.system_value_semantic_name[tgsi_src->Register.Index] == TGSI_SEMANTIC_INVOCATIONID) {
			r600_src->swizzle[0] = 3;
			r600_src->swizzle[1] = 3;
			r600_src->swizzle[2] = 3;
			r600_src->swizzle[3] = 3;
			r600_src->sel = 1;
		} else if (ctx->info.system_value_semantic_name[tgsi_src->Register.Index] == TGSI_SEMANTIC_INVOCATIONID) {
			r600_src->swizzle[0] = 2;
			r600_src->swizzle[1] = 2;
			r600_src->swizzle[2] = 2;
			r600_src->swizzle[3] = 2;
			r600_src->sel = 0;
		} else if (ctx->info.system_value_semantic_name[tgsi_src->Register.Index] == TGSI_SEMANTIC_TESSCOORD) {
			r600_src->sel = 1;
		} else if (ctx->info.system_value_semantic_name[tgsi_src->Register.Index] == TGSI_SEMANTIC_TESSINNER) {
			r600_src->sel = 3;
		} else if (ctx->info.system_value_semantic_name[tgsi_src->Register.Index] == TGSI_SEMANTIC_TESSOUTER) {
			r600_src->sel = 2;
		} else if (ctx->info.system_value_semantic_name[tgsi_src->Register.Index] == TGSI_SEMANTIC_VERTICESIN) {
			r600_src->sel = ctx->tess_input_info;
			r600_src->swizzle[0] = 2;
			r600_src->swizzle[1] = 2;
			r600_src->swizzle[2] = 2;
			r600_src->swizzle[3] = 2;
		} else if (ctx->type == PIPE_SHADER_TESS_CTRL && ctx->info.system_value_semantic_name[tgsi_src->Register.Index] == TGSI_SEMANTIC_PRIMID) {
			r600_src->sel = 0;
			r600_src->swizzle[0] = 0;
			r600_src->swizzle[1] = 0;
			r600_src->swizzle[2] = 0;
			r600_src->swizzle[3] = 0;
		} else if (ctx->type == PIPE_SHADER_TESS_EVAL && ctx->info.system_value_semantic_name[tgsi_src->Register.Index] == TGSI_SEMANTIC_PRIMID) {
			r600_src->sel = 0;
			r600_src->swizzle[0] = 3;
			r600_src->swizzle[1] = 3;
			r600_src->swizzle[2] = 3;
			r600_src->swizzle[3] = 3;
		} else if (ctx->info.system_value_semantic_name[tgsi_src->Register.Index] == TGSI_SEMANTIC_GRID_SIZE) {
			r600_src->sel = load_block_grid_size(ctx, false);
		} else if (ctx->info.system_value_semantic_name[tgsi_src->Register.Index] == TGSI_SEMANTIC_BLOCK_SIZE) {
			r600_src->sel = load_block_grid_size(ctx, true);
		} else if (ctx->info.system_value_semantic_name[tgsi_src->Register.Index] == TGSI_SEMANTIC_HELPER_INVOCATION) {
			r600_src->sel = ctx->helper_invoc_reg;
			r600_src->swizzle[0] = 0;
			r600_src->swizzle[1] = 0;
			r600_src->swizzle[2] = 0;
			r600_src->swizzle[3] = 0;
		}
	} else {
		if (tgsi_src->Register.Indirect)
			r600_src->rel = V_SQ_REL_RELATIVE;
		r600_src->sel = tgsi_src->Register.Index;
		r600_src->sel += ctx->file_offset[tgsi_src->Register.File];
	}
	if (tgsi_src->Register.File == TGSI_FILE_CONSTANT) {
		if (tgsi_src->Register.Dimension) {
			r600_src->kc_bank = tgsi_src->Dimension.Index;
			if (tgsi_src->Dimension.Indirect) {
				r600_src->kc_rel = 1;
			}
		}
	}
}

static int tgsi_fetch_rel_const(struct r600_shader_ctx *ctx,
				unsigned int cb_idx, unsigned cb_rel, unsigned int offset, unsigned ar_chan,
				unsigned int dst_reg)
{
	struct r600_bytecode_vtx vtx;
	unsigned int ar_reg;
	int r;

	if (offset) {
		struct r600_bytecode_alu alu;

		memset(&alu, 0, sizeof(alu));

		alu.op = ALU_OP2_ADD_INT;
		alu.src[0].sel = ctx->bc->ar_reg;
		alu.src[0].chan = ar_chan;

		alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
		alu.src[1].value = offset;

		alu.dst.sel = dst_reg;
		alu.dst.chan = ar_chan;
		alu.dst.write = 1;
		alu.last = 1;

		if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
			return r;

		ar_reg = dst_reg;
	} else {
		ar_reg = ctx->bc->ar_reg;
	}

	memset(&vtx, 0, sizeof(vtx));
	vtx.buffer_id = cb_idx;
	vtx.fetch_type = SQ_VTX_FETCH_NO_INDEX_OFFSET;
	vtx.src_gpr = ar_reg;
	vtx.src_sel_x = ar_chan;
	vtx.mega_fetch_count = 16;
	vtx.dst_gpr = dst_reg;
	vtx.dst_sel_x = 0;	/* SEL_X */
	vtx.dst_sel_y = 1;	/* SEL_Y */
	vtx.dst_sel_z = 2;	/* SEL_Z */
	vtx.dst_sel_w = 3;	/* SEL_W */
	vtx.data_format = FMT_32_32_32_32_FLOAT;
	vtx.num_format_all = 2;		/* NUM_FORMAT_SCALED */
	vtx.format_comp_all = 1;	/* FORMAT_COMP_SIGNED */
FORMAT_COMP_SIGNED */ 1868 vtx.endian = r600_endian_swap(32); 1869 vtx.buffer_index_mode = cb_rel; // cb_rel ? V_SQ_CF_INDEX_0 : V_SQ_CF_INDEX_NONE; 1870 1871 if ((r = r600_bytecode_add_vtx(ctx->bc, &vtx))) 1872 return r; 1873 1874 return 0; 1875} 1876 1877static int fetch_gs_input(struct r600_shader_ctx *ctx, struct tgsi_full_src_register *src, unsigned int dst_reg) 1878{ 1879 struct r600_bytecode_vtx vtx; 1880 int r; 1881 unsigned index = src->Register.Index; 1882 unsigned vtx_id = src->Dimension.Index; 1883 int offset_reg = ctx->gs_rotated_input[vtx_id / 3]; 1884 int offset_chan = vtx_id % 3; 1885 int t2 = 0; 1886 1887 /* offsets of per-vertex data in ESGS ring are passed to GS in R0.x, R0.y, 1888 * R0.w, R1.x, R1.y, R1.z (it seems R0.z is used for PrimitiveID) */ 1889 1890 if (offset_reg == ctx->gs_rotated_input[0] && offset_chan == 2) 1891 offset_chan = 3; 1892 1893 if (src->Dimension.Indirect || src->Register.Indirect) 1894 t2 = r600_get_temp(ctx); 1895 1896 if (src->Dimension.Indirect) { 1897 int treg[3]; 1898 struct r600_bytecode_alu alu; 1899 int r, i; 1900 unsigned addr_reg; 1901 addr_reg = get_address_file_reg(ctx, src->DimIndirect.Index); 1902 if (src->DimIndirect.Index > 0) { 1903 r = single_alu_op2(ctx, ALU_OP1_MOV, 1904 ctx->bc->ar_reg, 0, 1905 addr_reg, 0, 1906 0, 0); 1907 if (r) 1908 return r; 1909 } 1910 /* 1911 we have to put the R0.x/y/w into Rt.x Rt+1.x Rt+2.x then index reg from Rt. 1912 at least this is what fglrx seems to do. */ 1913 for (i = 0; i < 3; i++) { 1914 treg[i] = r600_get_temp(ctx); 1915 } 1916 r600_add_gpr_array(ctx->shader, treg[0], 3, 0x0F); 1917 1918 for (i = 0; i < 3; i++) { 1919 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 1920 alu.op = ALU_OP1_MOV; 1921 alu.src[0].sel = ctx->gs_rotated_input[0]; 1922 alu.src[0].chan = i == 2 ? 
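         /* per-vertex offsets live in R0.x, R0.y and R0.w (R0.z holds
          * PrimitiveID, see the comment at the top), so the third source
          * channel is w, not z: */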
         alu.src[0].chan = i == 2 ? 3 : i;
         alu.dst.sel = treg[i];
         alu.dst.chan = 0;
         alu.dst.write = 1;
         alu.last = 1;
         r = r600_bytecode_add_alu(ctx->bc, &alu);
         if (r)
            return r;
      }
      memset(&alu, 0, sizeof(struct r600_bytecode_alu));
      alu.op = ALU_OP1_MOV;
      alu.src[0].sel = treg[0];
      alu.src[0].rel = 1;
      alu.dst.sel = t2;
      alu.dst.write = 1;
      alu.last = 1;
      r = r600_bytecode_add_alu(ctx->bc, &alu);
      if (r)
         return r;
      offset_reg = t2;
      offset_chan = 0;
   }

   if (src->Register.Indirect) {
      int addr_reg;
      unsigned first = ctx->info.input_array_first[src->Indirect.ArrayID];

      addr_reg = get_address_file_reg(ctx, src->Indirect.Index);

      /* pull the value from index_reg */
      r = single_alu_op2(ctx, ALU_OP2_ADD_INT,
                         t2, 1,
                         addr_reg, 0,
                         V_SQ_ALU_SRC_LITERAL, first);
      if (r)
         return r;
      r = single_alu_op3(ctx, ALU_OP3_MULADD_UINT24,
                         t2, 0,
                         t2, 1,
                         V_SQ_ALU_SRC_LITERAL, 4,
                         offset_reg, offset_chan);
      if (r)
         return r;
      offset_reg = t2;
      offset_chan = 0;
      index = src->Register.Index - first;
   }

   memset(&vtx, 0, sizeof(vtx));
   vtx.buffer_id = R600_GS_RING_CONST_BUFFER;
   vtx.fetch_type = SQ_VTX_FETCH_NO_INDEX_OFFSET;
   vtx.src_gpr = offset_reg;
   vtx.src_sel_x = offset_chan;
   vtx.offset = index * 16; /* bytes */
   vtx.mega_fetch_count = 16;
   vtx.dst_gpr = dst_reg;
   vtx.dst_sel_x = 0;        /* SEL_X */
   vtx.dst_sel_y = 1;        /* SEL_Y */
   vtx.dst_sel_z = 2;        /* SEL_Z */
   vtx.dst_sel_w = 3;        /* SEL_W */
   if (ctx->bc->gfx_level >= EVERGREEN) {
      vtx.use_const_fields = 1;
   } else {
      vtx.data_format = FMT_32_32_32_32_FLOAT;
   }

   if ((r = r600_bytecode_add_vtx(ctx->bc, &vtx)))
      return r;

   return 0;
}

static int tgsi_split_gs_inputs(struct r600_shader_ctx *ctx)
{
   struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
   unsigned i;

   for (i = 0; i < inst->Instruction.NumSrcRegs; i++) {
      struct tgsi_full_src_register *src = &inst->Src[i];

      if (src->Register.File == TGSI_FILE_INPUT) {
         if (ctx->shader->input[src->Register.Index].name == TGSI_SEMANTIC_PRIMID) {
            /* primitive id is in R0.z */
            ctx->src[i].sel = 0;
            ctx->src[i].swizzle[0] = 2;
         }
      }
      if (src->Register.File == TGSI_FILE_INPUT && src->Register.Dimension) {
         int treg = r600_get_temp(ctx);

         fetch_gs_input(ctx, src, treg);
         ctx->src[i].sel = treg;
         ctx->src[i].rel = 0;
      }
   }
   return 0;
}


/* Tessellation shaders pass outputs to the next shader using LDS.
 *
 * LS outputs = TCS(HS) inputs
 * TCS(HS) outputs = TES(DS) inputs
 *
 * The LDS layout is:
 * - TCS inputs for patch 0
 * - TCS inputs for patch 1
 * - TCS inputs for patch 2            = get_tcs_in_current_patch_offset (if RelPatchID==2)
 * - ...
 * - TCS outputs for patch 0           = get_tcs_out_patch0_offset
 * - Per-patch TCS outputs for patch 0 = get_tcs_out_patch0_patch_data_offset
 * - TCS outputs for patch 1
 * - Per-patch TCS outputs for patch 1
 * - TCS outputs for patch 2           = get_tcs_out_current_patch_offset (if RelPatchID==2)
 * - Per-patch TCS outputs for patch 2 = get_tcs_out_current_patch_data_offset (if RelPatchID==2)
 * - ...
 *
 * All three shaders VS(LS), TCS, TES share the same LDS space.
 */
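/* Illustrative summary of the address math in the helpers below:
 * for a given vertex and parameter, the LDS address works out to roughly
 *    addr = base + vertex_index * stride + param_index * 16
 * where param_index comes from r600_get_lds_unique_index() and the stride
 * is read from the tess I/O info registers. */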
/* this will return with the dw address in temp_reg.x */
static int r600_get_byte_address(struct r600_shader_ctx *ctx, int temp_reg,
                                 const struct tgsi_full_dst_register *dst,
                                 const struct tgsi_full_src_register *src,
                                 int stride_bytes_reg, int stride_bytes_chan)
{
   struct tgsi_full_dst_register reg;
   ubyte *name, *index, *array_first;
   int r;
   int param;
   struct tgsi_shader_info *info = &ctx->info;
   /* Set the register description. The address computation is the same
    * for sources and destinations. */
   if (src) {
      reg.Register.File = src->Register.File;
      reg.Register.Index = src->Register.Index;
      reg.Register.Indirect = src->Register.Indirect;
      reg.Register.Dimension = src->Register.Dimension;
      reg.Indirect = src->Indirect;
      reg.Dimension = src->Dimension;
      reg.DimIndirect = src->DimIndirect;
   } else
      reg = *dst;

   /* If the register is 2-dimensional (e.g. an array of vertices
    * in a primitive), calculate the base address of the vertex. */
   if (reg.Register.Dimension) {
      int sel, chan;
      if (reg.Dimension.Indirect) {
         unsigned addr_reg;
         assert(reg.DimIndirect.File == TGSI_FILE_ADDRESS);

         addr_reg = get_address_file_reg(ctx, reg.DimIndirect.Index);
         /* pull the value from index_reg */
         sel = addr_reg;
         chan = 0;
      } else {
         sel = V_SQ_ALU_SRC_LITERAL;
         chan = reg.Dimension.Index;
      }

      r = single_alu_op3(ctx, ALU_OP3_MULADD_UINT24,
                         temp_reg, 0,
                         stride_bytes_reg, stride_bytes_chan,
                         sel, chan,
                         temp_reg, 0);
      if (r)
         return r;
   }

   if (reg.Register.File == TGSI_FILE_INPUT) {
      name = info->input_semantic_name;
      index = info->input_semantic_index;
      array_first = info->input_array_first;
   } else if (reg.Register.File == TGSI_FILE_OUTPUT) {
      name = info->output_semantic_name;
      index = info->output_semantic_index;
      array_first = info->output_array_first;
   } else {
      assert(0);
      return -1;
   }
   if (reg.Register.Indirect) {
      int addr_reg;
      int first;
      /* Add the relative address of the element. */
      if (reg.Indirect.ArrayID)
         first = array_first[reg.Indirect.ArrayID];
      else
         first = reg.Register.Index;

      addr_reg = get_address_file_reg(ctx, reg.Indirect.Index);

      /* pull the value from index_reg */
      r = single_alu_op3(ctx, ALU_OP3_MULADD_UINT24,
                         temp_reg, 0,
                         V_SQ_ALU_SRC_LITERAL, 16,
                         addr_reg, 0,
                         temp_reg, 0);
      if (r)
         return r;

      param = r600_get_lds_unique_index(name[first],
                                        index[first]);

   } else {
      param = r600_get_lds_unique_index(name[reg.Register.Index],
                                        index[reg.Register.Index]);
   }

   /* add to base_addr - passed in temp_reg.x */
   if (param) {
      r = single_alu_op2(ctx, ALU_OP2_ADD_INT,
                         temp_reg, 0,
                         temp_reg, 0,
                         V_SQ_ALU_SRC_LITERAL, param * 16);
      if (r)
         return r;

   }
   return 0;
}

static int do_lds_fetch_values(struct r600_shader_ctx *ctx, unsigned temp_reg,
                               unsigned dst_reg, unsigned mask)
{
   struct r600_bytecode_alu alu;
   int r, i, lasti;

   if ((ctx->bc->cf_last->ndw >> 1) >= 0x60)
      ctx->bc->force_add_cf = 1;

   lasti = tgsi_last_instruction(mask);
   for (i = 1; i <= lasti; i++) {
      if (!(mask & (1 << i)))
         continue;

      r = single_alu_op2(ctx, ALU_OP2_ADD_INT,
                         temp_reg, i,
                         temp_reg, 0,
                         V_SQ_ALU_SRC_LITERAL, 4 * i);
      if (r)
         return r;
   }
   for (i = 0; i <= lasti; i++) {
      if (!(mask & (1 << i)))
         continue;

      /* emit an LDS_READ_RET */
      memset(&alu, 0, sizeof(alu));
      alu.op = LDS_OP1_LDS_READ_RET;
      alu.src[0].sel = temp_reg;
      alu.src[0].chan = i;
      alu.src[1].sel = V_SQ_ALU_SRC_0;
      alu.src[2].sel = V_SQ_ALU_SRC_0;
      alu.dst.chan = 0;
      alu.is_lds_idx_op = true;
      alu.last = 1;
      r = r600_bytecode_add_alu(ctx->bc, &alu);
      if (r)
         return r;
   }
   for (i = 0; i <= lasti; i++) {
      if (!(mask & (1 << i)))
         continue;

      /* then read from LDS_OQ_A_POP */
      memset(&alu, 0, sizeof(alu));

      alu.op = ALU_OP1_MOV;
      alu.src[0].sel = EG_V_SQ_ALU_SRC_LDS_OQ_A_POP;
      alu.src[0].chan = 0;
      alu.dst.sel = dst_reg;
      alu.dst.chan = i;
      alu.dst.write = 1;
      alu.last = 1;
      r = r600_bytecode_add_alu(ctx->bc, &alu);
      if (r)
         return r;
   }
   return 0;
}

static int fetch_mask(struct tgsi_src_register *reg)
{
   int mask = 0;
   mask |= 1 << reg->SwizzleX;
   mask |= 1 << reg->SwizzleY;
   mask |= 1 << reg->SwizzleZ;
   mask |= 1 << reg->SwizzleW;
   return mask;
}
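/* The fetch_* helpers below read tessellation I/O directly from LDS:
 * compute the base offset of the current patch, add the register's byte
 * address, then pop the requested channels with do_lds_fetch_values(). */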
static int fetch_tes_input(struct r600_shader_ctx *ctx, struct tgsi_full_src_register *src, unsigned int dst_reg)
{
   int r;
   unsigned temp_reg = r600_get_temp(ctx);

   r = get_lds_offset0(ctx, 2, temp_reg,
                       src->Register.Dimension ? false : true);
   if (r)
      return r;

   /* the base address is now in temp.x */
   r = r600_get_byte_address(ctx, temp_reg,
                             NULL, src, ctx->tess_output_info, 1);
   if (r)
      return r;

   r = do_lds_fetch_values(ctx, temp_reg, dst_reg, fetch_mask(&src->Register));
   if (r)
      return r;
   return 0;
}

static int fetch_tcs_input(struct r600_shader_ctx *ctx, struct tgsi_full_src_register *src, unsigned int dst_reg)
{
   int r;
   unsigned temp_reg = r600_get_temp(ctx);

   /* t.x = ips * r0.y */
   r = single_alu_op2(ctx, ALU_OP2_MUL_UINT24,
                      temp_reg, 0,
                      ctx->tess_input_info, 0,
                      0, 1);

   if (r)
      return r;

   /* the base address is now in temp.x */
   r = r600_get_byte_address(ctx, temp_reg,
                             NULL, src, ctx->tess_input_info, 1);
   if (r)
      return r;

   r = do_lds_fetch_values(ctx, temp_reg, dst_reg, fetch_mask(&src->Register));
   if (r)
      return r;
   return 0;
}

static int fetch_tcs_output(struct r600_shader_ctx *ctx, struct tgsi_full_src_register *src, unsigned int dst_reg)
{
   int r;
   unsigned temp_reg = r600_get_temp(ctx);

   r = get_lds_offset0(ctx, 1, temp_reg,
                       src->Register.Dimension ? false : true);
   if (r)
      return r;
   /* the base address is now in temp.x */
   r = r600_get_byte_address(ctx, temp_reg,
                             NULL, src,
                             ctx->tess_output_info, 1);
   if (r)
      return r;

   r = do_lds_fetch_values(ctx, temp_reg, dst_reg, fetch_mask(&src->Register));
   if (r)
      return r;
   return 0;
}

static int tgsi_split_lds_inputs(struct r600_shader_ctx *ctx)
{
   struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
   unsigned i;

   for (i = 0; i < inst->Instruction.NumSrcRegs; i++) {
      struct tgsi_full_src_register *src = &inst->Src[i];

      if (ctx->type == PIPE_SHADER_TESS_EVAL && src->Register.File == TGSI_FILE_INPUT) {
         int treg = r600_get_temp(ctx);
         fetch_tes_input(ctx, src, treg);
         ctx->src[i].sel = treg;
         ctx->src[i].rel = 0;
      }
      if (ctx->type == PIPE_SHADER_TESS_CTRL && src->Register.File == TGSI_FILE_INPUT) {
         int treg = r600_get_temp(ctx);
         fetch_tcs_input(ctx, src, treg);
         ctx->src[i].sel = treg;
         ctx->src[i].rel = 0;
      }
      if (ctx->type == PIPE_SHADER_TESS_CTRL && src->Register.File == TGSI_FILE_OUTPUT) {
         int treg = r600_get_temp(ctx);
         fetch_tcs_output(ctx, src, treg);
         ctx->src[i].sel = treg;
         ctx->src[i].rel = 0;
      }
   }
   return 0;
}

static int tgsi_split_constant(struct r600_shader_ctx *ctx)
{
   struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
   struct r600_bytecode_alu alu;
   int i, j, k, nconst, r;

   for (i = 0, nconst = 0; i < inst->Instruction.NumSrcRegs; i++) {
      if (inst->Src[i].Register.File == TGSI_FILE_CONSTANT) {
         nconst++;
      }
      tgsi_src(ctx, &inst->Src[i], &ctx->src[i]);
   }
   for (i = 0, j = nconst - 1; i < inst->Instruction.NumSrcRegs; i++) {
      if (inst->Src[i].Register.File != TGSI_FILE_CONSTANT) {
         continue;
      }

      if (ctx->src[i].rel) {
         int chan = inst->Src[i].Indirect.Swizzle;
         int treg = r600_get_temp(ctx);
         if ((r = tgsi_fetch_rel_const(ctx, ctx->src[i].kc_bank, ctx->src[i].kc_rel, ctx->src[i].sel - 512, chan, treg)))
            return r;

         ctx->src[i].kc_bank = 0;
         ctx->src[i].kc_rel = 0;
         ctx->src[i].sel = treg;
         ctx->src[i].rel = 0;
         j--;
      } else if (j > 0) {
         int treg = r600_get_temp(ctx);
         for (k = 0; k < 4; k++) {
            memset(&alu, 0, sizeof(struct r600_bytecode_alu));
            alu.op = ALU_OP1_MOV;
            alu.src[0].sel = ctx->src[i].sel;
            alu.src[0].chan = k;
            alu.src[0].rel = ctx->src[i].rel;
            alu.src[0].kc_bank = ctx->src[i].kc_bank;
            alu.src[0].kc_rel = ctx->src[i].kc_rel;
            alu.dst.sel = treg;
            alu.dst.chan = k;
            alu.dst.write = 1;
            if (k == 3)
               alu.last = 1;
            r = r600_bytecode_add_alu(ctx->bc, &alu);
            if (r)
               return r;
         }
         ctx->src[i].sel = treg;
         ctx->src[i].rel = 0;
         j--;
      }
   }
   return 0;
}

/* need to move any immediate into a temp - for trig functions which use literal for PI stuff */
static int tgsi_split_literal_constant(struct r600_shader_ctx *ctx)
{
   struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
   struct r600_bytecode_alu alu;
   int i, j, k, nliteral, r;

   for (i = 0, nliteral = 0; i < inst->Instruction.NumSrcRegs; i++) {
      if (ctx->src[i].sel == V_SQ_ALU_SRC_LITERAL) {
         nliteral++;
      }
   }
   for (i = 0, j = nliteral - 1; i < inst->Instruction.NumSrcRegs; i++) {
      if (j > 0 && ctx->src[i].sel == V_SQ_ALU_SRC_LITERAL) {
         int treg = r600_get_temp(ctx);
         for (k = 0; k < 4; k++) {
            memset(&alu, 0, sizeof(struct r600_bytecode_alu));
            alu.op = ALU_OP1_MOV;
            alu.src[0].sel = ctx->src[i].sel;
            alu.src[0].chan = k;
            alu.src[0].value = ctx->src[i].value[k];
            alu.dst.sel = treg;
            alu.dst.chan = k;
            alu.dst.write = 1;
            if (k == 3)
               alu.last = 1;
            r = r600_bytecode_add_alu(ctx->bc, &alu);
            if (r)
               return r;
         }
         ctx->src[i].sel = treg;
         j--;
      }
   }
   return 0;
}

static int process_twoside_color_inputs(struct r600_shader_ctx *ctx)
{
   int i, r, count = ctx->shader->ninput;

   for (i = 0; i < count; i++) {
      if (ctx->shader->input[i].name == TGSI_SEMANTIC_COLOR) {
         r = select_twoside_color(ctx, i, ctx->shader->input[i].back_color_input);
         if (r)
            return r;
      }
   }
   return 0;
}
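/* Emit MEM_STREAM exports for transform feedback. On Evergreen+ the CF
 * opcode encodes both stream and buffer (MEM_STREAMn_BUFm); on R600/R700
 * there is only one op per buffer (MEM_STREAMn), with no stream selection. */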
static int emit_streamout(struct r600_shader_ctx *ctx, struct pipe_stream_output_info *so,
                          int stream, unsigned *stream_item_size UNUSED)
{
   unsigned so_gpr[PIPE_MAX_SHADER_OUTPUTS];
   unsigned start_comp[PIPE_MAX_SHADER_OUTPUTS];
   int j, r;
   unsigned i;

   /* Sanity checking. */
   if (so->num_outputs > PIPE_MAX_SO_OUTPUTS) {
      R600_ERR("Too many stream outputs: %d\n", so->num_outputs);
      r = -EINVAL;
      goto out_err;
   }
   for (i = 0; i < so->num_outputs; i++) {
      if (so->output[i].output_buffer >= 4) {
         R600_ERR("Exceeded the max number of stream output buffers, got: %d\n",
                  so->output[i].output_buffer);
         r = -EINVAL;
         goto out_err;
      }
   }

   /* Initialize locations where the outputs are stored. */
   for (i = 0; i < so->num_outputs; i++) {

      so_gpr[i] = ctx->shader->output[so->output[i].register_index].gpr;
      start_comp[i] = so->output[i].start_component;
      /* Lower outputs with dst_offset < start_component.
       *
       * We can only output 4D vectors with a write mask, e.g. we can
       * only output the W component at offset 3, etc. If we want
       * to store Y, Z, or W at buffer offset 0, we need to use MOV
       * to move it to X and output X. */
      if (so->output[i].dst_offset < so->output[i].start_component) {
         unsigned tmp = r600_get_temp(ctx);

         for (j = 0; j < so->output[i].num_components; j++) {
            struct r600_bytecode_alu alu;
            memset(&alu, 0, sizeof(struct r600_bytecode_alu));
            alu.op = ALU_OP1_MOV;
            alu.src[0].sel = so_gpr[i];
            alu.src[0].chan = so->output[i].start_component + j;

            alu.dst.sel = tmp;
            alu.dst.chan = j;
            alu.dst.write = 1;
            if (j == so->output[i].num_components - 1)
               alu.last = 1;
            r = r600_bytecode_add_alu(ctx->bc, &alu);
            if (r)
               return r;
         }
         start_comp[i] = 0;
         so_gpr[i] = tmp;
      }
   }

   /* Write outputs to buffers. */
   for (i = 0; i < so->num_outputs; i++) {
      struct r600_bytecode_output output;

      if (stream != -1 && stream != so->output[i].stream)
         continue;

      memset(&output, 0, sizeof(struct r600_bytecode_output));
      output.gpr = so_gpr[i];
      output.elem_size = so->output[i].num_components - 1;
      if (output.elem_size == 2)
         output.elem_size = 3; // 3 not supported, write 4 with junk at end
      output.array_base = so->output[i].dst_offset - start_comp[i];
      output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_WRITE;
      output.burst_count = 1;
      /* array_size is an upper limit for the burst_count
       * with MEM_STREAM instructions */
      output.array_size = 0xFFF;
      output.comp_mask = ((1 << so->output[i].num_components) - 1) << start_comp[i];

      if (ctx->bc->gfx_level >= EVERGREEN) {
         switch (so->output[i].output_buffer) {
         case 0:
            output.op = CF_OP_MEM_STREAM0_BUF0;
            break;
         case 1:
            output.op = CF_OP_MEM_STREAM0_BUF1;
            break;
         case 2:
            output.op = CF_OP_MEM_STREAM0_BUF2;
            break;
         case 3:
            output.op = CF_OP_MEM_STREAM0_BUF3;
            break;
         }
         output.op += so->output[i].stream * 4;
         assert(output.op >= CF_OP_MEM_STREAM0_BUF0 && output.op <= CF_OP_MEM_STREAM3_BUF3);
         ctx->enabled_stream_buffers_mask |= (1 << so->output[i].output_buffer) << so->output[i].stream * 4;
      } else {
         switch (so->output[i].output_buffer) {
         case 0:
            output.op = CF_OP_MEM_STREAM0;
            break;
         case 1:
            output.op = CF_OP_MEM_STREAM1;
            break;
         case 2:
            output.op = CF_OP_MEM_STREAM2;
            break;
         case 3:
            output.op = CF_OP_MEM_STREAM3;
            break;
         }
         ctx->enabled_stream_buffers_mask |= 1 << so->output[i].output_buffer;
      }
      r = r600_bytecode_add_output(ctx->bc, &output);
      if (r)
         goto out_err;
   }
   return 0;
out_err:
   return r;
}

static void convert_edgeflag_to_int(struct r600_shader_ctx *ctx)
{
   struct r600_bytecode_alu alu;
   unsigned reg;

   if (!ctx->shader->vs_out_edgeflag)
      return;

   reg = ctx->shader->output[ctx->edgeflag_output].gpr;

   /* clamp(x, 0, 1) */
   memset(&alu, 0, sizeof(alu));
   alu.op = ALU_OP1_MOV;
   alu.src[0].sel = reg;
   alu.dst.sel = reg;
   alu.dst.write = 1;
   alu.dst.clamp = 1;
   alu.last = 1;
   r600_bytecode_add_alu(ctx->bc, &alu);

   memset(&alu, 0, sizeof(alu));
   alu.op = ALU_OP1_FLT_TO_INT;
   alu.src[0].sel = reg;
   alu.dst.sel = reg;
   alu.dst.write = 1;
   alu.last = 1;
   r600_bytecode_add_alu(ctx->bc, &alu);
}
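/* The GS copy shader is a small vertex shader run after the GS: it fetches
 * the vertices the GS wrote to the GSVS ring, handles streamout for each
 * enabled stream (selected by the predicate on R0.y), and performs the
 * position/parameter exports that the GS stage itself cannot issue. */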
int generate_gs_copy_shader(struct r600_context *rctx,
                            struct r600_pipe_shader *gs,
                            struct pipe_stream_output_info *so)
{
   struct r600_shader_ctx ctx = {};
   struct r600_shader *gs_shader = &gs->shader;
   struct r600_pipe_shader *cshader;
   unsigned ocnt = gs_shader->noutput;
   struct r600_bytecode_alu alu;
   struct r600_bytecode_vtx vtx;
   struct r600_bytecode_output output;
   struct r600_bytecode_cf *cf_jump, *cf_pop,
      *last_exp_pos = NULL, *last_exp_param = NULL;
   int next_clip_pos = 61, next_param = 0;
   unsigned i, j;
   int ring;
   bool only_ring_0 = true;
   cshader = calloc(1, sizeof(struct r600_pipe_shader));
   if (!cshader)
      return 0;

   memcpy(cshader->shader.output, gs_shader->output, ocnt *
          sizeof(struct r600_shader_io));

   cshader->shader.noutput = ocnt;

   ctx.shader = &cshader->shader;
   ctx.bc = &ctx.shader->bc;
   ctx.type = ctx.bc->type = PIPE_SHADER_VERTEX;

   r600_bytecode_init(ctx.bc, rctx->b.gfx_level, rctx->b.family,
                      rctx->screen->has_compressed_msaa_texturing);

   ctx.bc->isa = rctx->isa;

   cf_jump = NULL;
   memset(cshader->shader.ring_item_sizes, 0, sizeof(cshader->shader.ring_item_sizes));

   /* R0.x = R0.x & 0x3fffffff */
   memset(&alu, 0, sizeof(alu));
   alu.op = ALU_OP2_AND_INT;
   alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
   alu.src[1].value = 0x3fffffff;
   alu.dst.write = 1;
   r600_bytecode_add_alu(ctx.bc, &alu);

   /* R0.y = R0.x >> 30 */
   memset(&alu, 0, sizeof(alu));
   alu.op = ALU_OP2_LSHR_INT;
   alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
   alu.src[1].value = 0x1e;
   alu.dst.chan = 1;
   alu.dst.write = 1;
   alu.last = 1;
   r600_bytecode_add_alu(ctx.bc, &alu);

   /* fetch vertex data from GSVS ring */
   for (i = 0; i < ocnt; ++i) {
      struct r600_shader_io *out = &ctx.shader->output[i];

      out->gpr = i + 1;
      out->ring_offset = i * 16;

      memset(&vtx, 0, sizeof(vtx));
      vtx.op = FETCH_OP_VFETCH;
      vtx.buffer_id = R600_GS_RING_CONST_BUFFER;
      vtx.fetch_type = SQ_VTX_FETCH_NO_INDEX_OFFSET;
      vtx.mega_fetch_count = 16;
      vtx.offset = out->ring_offset;
      vtx.dst_gpr = out->gpr;
      vtx.src_gpr = 0;
      vtx.dst_sel_x = 0;
      vtx.dst_sel_y = 1;
      vtx.dst_sel_z = 2;
      vtx.dst_sel_w = 3;
      if (rctx->b.gfx_level >= EVERGREEN) {
         vtx.use_const_fields = 1;
      } else {
         vtx.data_format = FMT_32_32_32_32_FLOAT;
      }

      r600_bytecode_add_vtx(ctx.bc, &vtx);
   }
   ctx.temp_reg = i + 1;
   for (ring = 3; ring >= 0; --ring) {
      bool enabled = false;
      for (i = 0; i < so->num_outputs; i++) {
         if (so->output[i].stream == ring) {
            enabled = true;
            if (ring > 0)
               only_ring_0 = false;
            break;
         }
      }
      if (ring != 0 && !enabled) {
         cshader->shader.ring_item_sizes[ring] = 0;
         continue;
      }

      if (cf_jump) {
         // Patch up jump label
         r600_bytecode_add_cfinst(ctx.bc, CF_OP_POP);
         cf_pop = ctx.bc->cf_last;

         cf_jump->cf_addr = cf_pop->id + 2;
         cf_jump->pop_count = 1;
         cf_pop->cf_addr = cf_pop->id + 2;
         cf_pop->pop_count = 1;
      }

      /* PRED_SETE_INT __, R0.y, ring */
      memset(&alu, 0, sizeof(alu));
      alu.op = ALU_OP2_PRED_SETE_INT;
      alu.src[0].chan = 1;
      alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
      alu.src[1].value = ring;
      alu.execute_mask = 1;
      alu.update_pred = 1;
      alu.last = 1;
      r600_bytecode_add_alu_type(ctx.bc, &alu, CF_OP_ALU_PUSH_BEFORE);

      r600_bytecode_add_cfinst(ctx.bc, CF_OP_JUMP);
      cf_jump = ctx.bc->cf_last;

      if (enabled)
         emit_streamout(&ctx, so, only_ring_0 ? -1 : ring,
                        &cshader->shader.ring_item_sizes[ring]);
      cshader->shader.ring_item_sizes[ring] = ocnt * 16;
   }

   /* bc adds nops - copy it */
   if (ctx.bc->gfx_level == R600) {
      memset(&alu, 0, sizeof(struct r600_bytecode_alu));
      alu.op = ALU_OP0_NOP;
      alu.last = 1;
      r600_bytecode_add_alu(ctx.bc, &alu);

      r600_bytecode_add_cfinst(ctx.bc, CF_OP_NOP);
   }

   /* export vertex data */
   /* XXX factor out common code with r600_shader_from_tgsi ? */
   for (i = 0; i < ocnt; ++i) {
      struct r600_shader_io *out = &ctx.shader->output[i];
      bool instream0 = true;
      if (out->name == TGSI_SEMANTIC_CLIPVERTEX)
         continue;

      for (j = 0; j < so->num_outputs; j++) {
         if (so->output[j].register_index == i) {
            if (so->output[j].stream == 0)
               break;
            if (so->output[j].stream > 0)
               instream0 = false;
         }
      }
      if (!instream0)
         continue;
      memset(&output, 0, sizeof(output));
      output.gpr = out->gpr;
      output.elem_size = 3;
      output.swizzle_x = 0;
      output.swizzle_y = 1;
      output.swizzle_z = 2;
      output.swizzle_w = 3;
      output.burst_count = 1;
      output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PARAM;
      output.op = CF_OP_EXPORT;
      switch (out->name) {
      case TGSI_SEMANTIC_POSITION:
         output.array_base = 60;
         output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS;
         break;

      case TGSI_SEMANTIC_PSIZE:
         output.array_base = 61;
         if (next_clip_pos == 61)
            next_clip_pos = 62;
         output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS;
         output.swizzle_y = 7;
         output.swizzle_z = 7;
         output.swizzle_w = 7;
         ctx.shader->vs_out_misc_write = 1;
         ctx.shader->vs_out_point_size = 1;
         break;
      case TGSI_SEMANTIC_LAYER:
         if (out->spi_sid) {
            /* duplicate it as PARAM to pass to the pixel shader */
            output.array_base = next_param++;
            r600_bytecode_add_output(ctx.bc, &output);
            last_exp_param = ctx.bc->cf_last;
         }
         output.array_base = 61;
         if (next_clip_pos == 61)
            next_clip_pos = 62;
         output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS;
         output.swizzle_x = 7;
         output.swizzle_y = 7;
         output.swizzle_z = 0;
         output.swizzle_w = 7;
         ctx.shader->vs_out_misc_write = 1;
         ctx.shader->vs_out_layer = 1;
         break;
      case TGSI_SEMANTIC_VIEWPORT_INDEX:
         if (out->spi_sid) {
            /* duplicate it as PARAM to pass to the pixel shader */
            output.array_base = next_param++;
            r600_bytecode_add_output(ctx.bc, &output);
            last_exp_param = ctx.bc->cf_last;
         }
         output.array_base = 61;
         if (next_clip_pos == 61)
            next_clip_pos = 62;
         output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS;
         ctx.shader->vs_out_misc_write = 1;
         ctx.shader->vs_out_viewport = 1;
         output.swizzle_x = 7;
         output.swizzle_y = 7;
         output.swizzle_z = 7;
         output.swizzle_w = 0;
         break;
      case TGSI_SEMANTIC_CLIPDIST:
         /* spi_sid is 0 for clipdistance outputs that were generated
          * for clipvertex - we don't need to pass them to PS */
         ctx.shader->clip_dist_write = gs->shader.clip_dist_write;
         ctx.shader->cull_dist_write = gs->shader.cull_dist_write;
         ctx.shader->cc_dist_mask = gs->shader.cc_dist_mask;
         if (out->spi_sid) {
            /* duplicate it as PARAM to pass to the pixel shader */
            output.array_base = next_param++;
            r600_bytecode_add_output(ctx.bc, &output);
            last_exp_param = ctx.bc->cf_last;
         }
         output.array_base = next_clip_pos++;
         output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS;
         break;
      case TGSI_SEMANTIC_FOG:
         output.swizzle_y = 4; /* 0 */
         output.swizzle_z = 4; /* 0 */
         output.swizzle_w = 5; /* 1 */
         break;
      default:
         output.array_base = next_param++;
         break;
      }
      r600_bytecode_add_output(ctx.bc, &output);
      if (output.type == V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PARAM)
         last_exp_param = ctx.bc->cf_last;
      else
         last_exp_pos = ctx.bc->cf_last;
   }

   if (!last_exp_pos) {
      memset(&output, 0, sizeof(output));
      output.gpr = 0;
      output.elem_size = 3;
      output.swizzle_x = 7;
      output.swizzle_y = 7;
      output.swizzle_z = 7;
      output.swizzle_w = 7;
      output.burst_count = 1;
      output.type = 2;
      output.op = CF_OP_EXPORT;
      output.array_base = 60;
      output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS;
      r600_bytecode_add_output(ctx.bc, &output);
      last_exp_pos = ctx.bc->cf_last;
   }

   if (!last_exp_param) {
      memset(&output, 0, sizeof(output));
      output.gpr = 0;
      output.elem_size = 3;
      output.swizzle_x = 7;
      output.swizzle_y = 7;
      output.swizzle_z = 7;
      output.swizzle_w = 7;
      output.burst_count = 1;
      output.type = 2;
      output.op = CF_OP_EXPORT;
      output.array_base = next_param++;
      output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PARAM;
      r600_bytecode_add_output(ctx.bc, &output);
      last_exp_param = ctx.bc->cf_last;
   }

   last_exp_pos->op = CF_OP_EXPORT_DONE;
   last_exp_param->op = CF_OP_EXPORT_DONE;

   r600_bytecode_add_cfinst(ctx.bc, CF_OP_POP);
   cf_pop = ctx.bc->cf_last;

   cf_jump->cf_addr = cf_pop->id + 2;
   cf_jump->pop_count = 1;
   cf_pop->cf_addr = cf_pop->id + 2;
   cf_pop->pop_count = 1;

   if (ctx.bc->gfx_level == CAYMAN)
      cm_bytecode_add_cf_end(ctx.bc);
   else {
      r600_bytecode_add_cfinst(ctx.bc, CF_OP_NOP);
      ctx.bc->cf_last->end_of_program = 1;
   }

   gs->gs_copy_shader = cshader;
   cshader->enabled_stream_buffers_mask = ctx.enabled_stream_buffers_mask;

   ctx.bc->nstack = 1;

   return r600_bytecode_build(ctx.bc);
}

static int emit_inc_ring_offset(struct r600_shader_ctx *ctx, int idx, bool ind)
{
   if (ind) {
      struct r600_bytecode_alu alu;
      int r;

      memset(&alu, 0, sizeof(struct r600_bytecode_alu));
      alu.op = ALU_OP2_ADD_INT;
      alu.src[0].sel = ctx->gs_export_gpr_tregs[idx];
      alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
      alu.src[1].value = ctx->gs_out_ring_offset >> 4;
      alu.dst.sel = ctx->gs_export_gpr_tregs[idx];
      alu.dst.write = 1;
      alu.last = 1;
      r = r600_bytecode_add_alu(ctx->bc, &alu);
      if (r)
         return r;
   }
   return 0;
}
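/* Write one vertex worth of outputs to the GSVS ring via MEM_RING ops.
 * For a VS/TES running as ES, each output is matched by semantic name and
 * sid to the ring offset the current GS expects. */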
static int emit_gs_ring_writes(struct r600_shader_ctx *ctx, const struct pipe_stream_output_info *so UNUSED, int stream, bool ind)
{
   struct r600_bytecode_output output;
   int ring_offset;
   unsigned i, k;
   int effective_stream = stream == -1 ? 0 : stream;
   int idx = 0;

   for (i = 0; i < ctx->shader->noutput; i++) {
      if (ctx->gs_for_vs) {
         /* for ES we need to lookup corresponding ring offset expected by GS
          * (map this output to GS input by name and sid) */
         /* FIXME precompute offsets */
         ring_offset = -1;
         for (k = 0; k < ctx->gs_for_vs->ninput; ++k) {
            struct r600_shader_io *in = &ctx->gs_for_vs->input[k];
            struct r600_shader_io *out = &ctx->shader->output[i];
            if (in->name == out->name && in->sid == out->sid)
               ring_offset = in->ring_offset;
         }

         if (ring_offset == -1)
            continue;
      } else {
         ring_offset = idx * 16;
         idx++;
      }

      if (stream > 0 && ctx->shader->output[i].name == TGSI_SEMANTIC_POSITION)
         continue;
      /* next_ring_offset after parsing input decls contains total size of
       * single vertex data, gs_next_vertex - current vertex index */
      if (!ind)
         ring_offset += ctx->gs_out_ring_offset * ctx->gs_next_vertex;

      memset(&output, 0, sizeof(struct r600_bytecode_output));
      output.gpr = ctx->shader->output[i].gpr;
      output.elem_size = 3;
      output.comp_mask = 0xF;
      output.burst_count = 1;

      if (ind)
         output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_WRITE_IND;
      else
         output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_WRITE;

      switch (stream) {
      default:
      case 0:
         output.op = CF_OP_MEM_RING; break;
      case 1:
         output.op = CF_OP_MEM_RING1; break;
      case 2:
         output.op = CF_OP_MEM_RING2; break;
      case 3:
         output.op = CF_OP_MEM_RING3; break;
      }

      if (ind) {
         output.array_base = ring_offset >> 2; /* in dwords */
         output.array_size = 0xfff;
         output.index_gpr = ctx->gs_export_gpr_tregs[effective_stream];
      } else
         output.array_base = ring_offset >> 2; /* in dwords */
      r600_bytecode_add_output(ctx->bc, &output);
   }

   ++ctx->gs_next_vertex;
   return 0;
}

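/* Fetch the LDS layout constants the driver stores in
 * R600_LDS_INFO_CONST_BUFFER: the vec4 at byte offset 0 describes the TCS
 * input layout, the one at offset 16 the TCS output layout. */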
static int r600_fetch_tess_io_info(struct r600_shader_ctx *ctx)
{
   int r;
   struct r600_bytecode_vtx vtx;
   int temp_val = ctx->temp_reg;
   /* need to store the TCS output somewhere */
   r = single_alu_op2(ctx, ALU_OP1_MOV,
                      temp_val, 0,
                      V_SQ_ALU_SRC_LITERAL, 0,
                      0, 0);
   if (r)
      return r;

   /* used by VS/TCS */
   if (ctx->tess_input_info) {
      /* fetch tcs input values into resv space */
      memset(&vtx, 0, sizeof(struct r600_bytecode_vtx));
      vtx.op = FETCH_OP_VFETCH;
      vtx.buffer_id = R600_LDS_INFO_CONST_BUFFER;
      vtx.fetch_type = SQ_VTX_FETCH_NO_INDEX_OFFSET;
      vtx.mega_fetch_count = 16;
      vtx.data_format = FMT_32_32_32_32;
      vtx.num_format_all = 2;
      vtx.format_comp_all = 1;
      vtx.use_const_fields = 0;
      vtx.endian = r600_endian_swap(32);
      vtx.srf_mode_all = 1;
      vtx.offset = 0;
      vtx.dst_gpr = ctx->tess_input_info;
      vtx.dst_sel_x = 0;
      vtx.dst_sel_y = 1;
      vtx.dst_sel_z = 2;
      vtx.dst_sel_w = 3;
      vtx.src_gpr = temp_val;
      vtx.src_sel_x = 0;

      r = r600_bytecode_add_vtx(ctx->bc, &vtx);
      if (r)
         return r;
   }

   /* used by TCS/TES */
   if (ctx->tess_output_info) {
      /* fetch tcs output values into resv space */
      memset(&vtx, 0, sizeof(struct r600_bytecode_vtx));
      vtx.op = FETCH_OP_VFETCH;
      vtx.buffer_id = R600_LDS_INFO_CONST_BUFFER;
      vtx.fetch_type = SQ_VTX_FETCH_NO_INDEX_OFFSET;
      vtx.mega_fetch_count = 16;
      vtx.data_format = FMT_32_32_32_32;
      vtx.num_format_all = 2;
      vtx.format_comp_all = 1;
      vtx.use_const_fields = 0;
      vtx.endian = r600_endian_swap(32);
      vtx.srf_mode_all = 1;
      vtx.offset = 16;
      vtx.dst_gpr = ctx->tess_output_info;
      vtx.dst_sel_x = 0;
      vtx.dst_sel_y = 1;
      vtx.dst_sel_z = 2;
      vtx.dst_sel_w = 3;
      vtx.src_gpr = temp_val;
      vtx.src_sel_x = 0;

      r = r600_bytecode_add_vtx(ctx->bc, &vtx);
      if (r)
         return r;
   }
   return 0;
}

static int emit_lds_vs_writes(struct r600_shader_ctx *ctx)
{
   int j, r;
   int temp_reg;
   unsigned i;

   /* fetch tcs input values into input_vals */
   ctx->tess_input_info = r600_get_temp(ctx);
   ctx->tess_output_info = 0;
   r = r600_fetch_tess_io_info(ctx);
   if (r)
      return r;

   temp_reg = r600_get_temp(ctx);
   /* dst reg contains LDS address stride * idx */
   /* MUL vertexID, vertex_dw_stride */
   r = single_alu_op2(ctx, ALU_OP2_MUL_UINT24,
                      temp_reg, 0,
                      ctx->tess_input_info, 1,
                      0, 1); /* rel id in r0.y? */
   if (r)
      return r;

   for (i = 0; i < ctx->shader->noutput; i++) {
      struct r600_bytecode_alu alu;
      int param = r600_get_lds_unique_index(ctx->shader->output[i].name,
                                            ctx->shader->output[i].sid);

      if (param) {
         r = single_alu_op2(ctx, ALU_OP2_ADD_INT,
                            temp_reg, 1,
                            temp_reg, 0,
                            V_SQ_ALU_SRC_LITERAL, param * 16);
         if (r)
            return r;
      }

      r = single_alu_op2(ctx, ALU_OP2_ADD_INT,
                         temp_reg, 2,
                         temp_reg, param ? 1 : 0,
                         V_SQ_ALU_SRC_LITERAL, 8);
      if (r)
         return r;


      for (j = 0; j < 2; j++) {
         int chan = (j == 1) ? 2 : (param ? 1 : 0);
         memset(&alu, 0, sizeof(struct r600_bytecode_alu));
         alu.op = LDS_OP3_LDS_WRITE_REL;
         alu.src[0].sel = temp_reg;
         alu.src[0].chan = chan;
         alu.src[1].sel = ctx->shader->output[i].gpr;
         alu.src[1].chan = j * 2;
         alu.src[2].sel = ctx->shader->output[i].gpr;
         alu.src[2].chan = (j * 2) + 1;
         alu.last = 1;
         alu.dst.chan = 0;
         alu.lds_idx = 1;
         alu.is_lds_idx_op = true;
         r = r600_bytecode_add_alu(ctx->bc, &alu);
         if (r)
            return r;
      }
   }
   return 0;
}
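/* Store a TCS output to LDS: compute the per-channel byte addresses, then
 * write channel pairs with LDS_WRITE_REL where the write mask allows it,
 * falling back to single-channel LDS_WRITE otherwise. */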
static int r600_store_tcs_output(struct r600_shader_ctx *ctx)
{
   struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
   const struct tgsi_full_dst_register *dst = &inst->Dst[0];
   int i, r, lasti;
   int temp_reg = r600_get_temp(ctx);
   struct r600_bytecode_alu alu;
   unsigned write_mask = dst->Register.WriteMask;

   if (inst->Dst[0].Register.File != TGSI_FILE_OUTPUT)
      return 0;

   r = get_lds_offset0(ctx, 1, temp_reg, dst->Register.Dimension ? false : true);
   if (r)
      return r;

   /* the base address is now in temp.x */
   r = r600_get_byte_address(ctx, temp_reg,
                             &inst->Dst[0], NULL, ctx->tess_output_info, 1);
   if (r)
      return r;

   /* LDS write */
   lasti = tgsi_last_instruction(write_mask);
   for (i = 1; i <= lasti; i++) {

      if (!(write_mask & (1 << i)))
         continue;
      r = single_alu_op2(ctx, ALU_OP2_ADD_INT,
                         temp_reg, i,
                         temp_reg, 0,
                         V_SQ_ALU_SRC_LITERAL, 4 * i);
      if (r)
         return r;
   }

   for (i = 0; i <= lasti; i++) {
      if (!(write_mask & (1 << i)))
         continue;

      if ((i == 0 && ((write_mask & 3) == 3)) ||
          (i == 2 && ((write_mask & 0xc) == 0xc))) {
         memset(&alu, 0, sizeof(struct r600_bytecode_alu));
         alu.op = LDS_OP3_LDS_WRITE_REL;
         alu.src[0].sel = temp_reg;
         alu.src[0].chan = i;

         alu.src[1].sel = dst->Register.Index;
         alu.src[1].sel += ctx->file_offset[dst->Register.File];
         alu.src[1].chan = i;

         alu.src[2].sel = dst->Register.Index;
         alu.src[2].sel += ctx->file_offset[dst->Register.File];
         alu.src[2].chan = i + 1;
         alu.lds_idx = 1;
         alu.dst.chan = 0;
         alu.last = 1;
         alu.is_lds_idx_op = true;
         r = r600_bytecode_add_alu(ctx->bc, &alu);
         if (r)
            return r;
         i += 1;
         continue;
      }
      memset(&alu, 0, sizeof(struct r600_bytecode_alu));
      alu.op = LDS_OP2_LDS_WRITE;
      alu.src[0].sel = temp_reg;
      alu.src[0].chan = i;

      alu.src[1].sel = dst->Register.Index;
      alu.src[1].sel += ctx->file_offset[dst->Register.File];
      alu.src[1].chan = i;

      alu.src[2].sel = V_SQ_ALU_SRC_0;
      alu.dst.chan = 0;
      alu.last = 1;
      alu.is_lds_idx_op = true;
      r = r600_bytecode_add_alu(ctx->bc, &alu);
      if (r)
         return r;
   }
   return 0;
}

static int r600_tess_factor_read(struct r600_shader_ctx *ctx,
                                 int output_idx, int nc)
{
   int param;
   unsigned temp_reg = r600_get_temp(ctx);
   unsigned name = ctx->shader->output[output_idx].name;
   int dreg = ctx->shader->output[output_idx].gpr;
   int r;

   param = r600_get_lds_unique_index(name, 0);
   r = get_lds_offset0(ctx, 1, temp_reg, true);
   if (r)
      return r;

   if (param) {
      r = single_alu_op2(ctx, ALU_OP2_ADD_INT,
                         temp_reg, 0,
                         temp_reg, 0,
                         V_SQ_ALU_SRC_LITERAL, param * 16);
      if (r)
         return r;
   }

   do_lds_fetch_values(ctx, temp_reg, dreg, ((1u << nc) - 1));
   return 0;
}
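/* Emit the tessellation factors: read TESSOUTER/TESSINNER back from LDS,
 * build (index, value) pairs and write them out with GDS TF_WRITE ops,
 * executed only for invocation 0 of the patch. */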
static int r600_emit_tess_factor(struct r600_shader_ctx *ctx)
{
   int stride, outer_comps, inner_comps;
   int tessinner_idx = -1, tessouter_idx = -1;
   int i, r;
   unsigned j;
   int temp_reg = r600_get_temp(ctx);
   int treg[3] = {-1, -1, -1};
   struct r600_bytecode_alu alu;
   struct r600_bytecode_cf *cf_jump, *cf_pop;

   /* only execute factor emission for invocation 0 */
   /* PRED_SETE_INT __, R0.z, 0 (InvocationID lives in R0.z) */
   memset(&alu, 0, sizeof(alu));
   alu.op = ALU_OP2_PRED_SETE_INT;
   alu.src[0].chan = 2;
   alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
   alu.execute_mask = 1;
   alu.update_pred = 1;
   alu.last = 1;
   r600_bytecode_add_alu_type(ctx->bc, &alu, CF_OP_ALU_PUSH_BEFORE);

   r600_bytecode_add_cfinst(ctx->bc, CF_OP_JUMP);
   cf_jump = ctx->bc->cf_last;

   treg[0] = r600_get_temp(ctx);
   switch (ctx->shader->tcs_prim_mode) {
   case PIPE_PRIM_LINES:
      stride = 8; /* 2 dwords, 1 vec2 store */
      outer_comps = 2;
      inner_comps = 0;
      break;
   case PIPE_PRIM_TRIANGLES:
      stride = 16; /* 4 dwords, 1 vec4 store */
      outer_comps = 3;
      inner_comps = 1;
      treg[1] = r600_get_temp(ctx);
      break;
   case PIPE_PRIM_QUADS:
      stride = 24; /* 6 dwords, 2 stores (vec4 + vec2) */
      outer_comps = 4;
      inner_comps = 2;
      treg[1] = r600_get_temp(ctx);
      treg[2] = r600_get_temp(ctx);
      break;
   default:
      assert(0);
      return -1;
   }

   /* R0 is PatchID, RelPatchID, InvocationID, tf_base */
   /* TF_WRITE takes index in R.x, value in R.y */
   for (j = 0; j < ctx->shader->noutput; j++) {
      if (ctx->shader->output[j].name == TGSI_SEMANTIC_TESSINNER)
         tessinner_idx = j;
      if (ctx->shader->output[j].name == TGSI_SEMANTIC_TESSOUTER)
         tessouter_idx = j;
   }

   if (tessouter_idx == -1)
      return -1;

   if (tessinner_idx == -1 && inner_comps)
      return -1;

   if (tessouter_idx != -1) {
      r = r600_tess_factor_read(ctx, tessouter_idx, outer_comps);
      if (r)
         return r;
   }

   if (tessinner_idx != -1) {
      r = r600_tess_factor_read(ctx, tessinner_idx, inner_comps);
      if (r)
         return r;
   }

   /* r.x = tf_base(r0.w) + relpatchid(r0.y) * tf_stride */
   /* r.x = relpatchid(r0.y) * tf_stride */

   /* multiply incoming r0.y * stride - t.x = r0.y * stride */
   /* add incoming r0.w to it: t.x = t.x + r0.w */
   r = single_alu_op3(ctx, ALU_OP3_MULADD_UINT24,
                      temp_reg, 0,
                      0, 1,
                      V_SQ_ALU_SRC_LITERAL, stride,
                      0, 3);
   if (r)
      return r;

   for (i = 0; i < outer_comps + inner_comps; i++) {
      int out_idx = i >= outer_comps ? tessinner_idx : tessouter_idx;
      int out_comp = i >= outer_comps ? i - outer_comps : i;

      if (ctx->shader->tcs_prim_mode == PIPE_PRIM_LINES) {
         if (out_comp == 1)
            out_comp = 0;
         else if (out_comp == 0)
            out_comp = 1;
      }

      r = single_alu_op2(ctx, ALU_OP2_ADD_INT,
                         treg[i / 2], (2 * (i % 2)),
                         temp_reg, 0,
                         V_SQ_ALU_SRC_LITERAL, 4 * i);
      if (r)
         return r;
      r = single_alu_op2(ctx, ALU_OP1_MOV,
                         treg[i / 2], 1 + (2 * (i % 2)),
                         ctx->shader->output[out_idx].gpr, out_comp,
                         0, 0);
      if (r)
         return r;
   }
   for (i = 0; i < outer_comps + inner_comps; i++) {
      struct r600_bytecode_gds gds;

      memset(&gds, 0, sizeof(struct r600_bytecode_gds));
      gds.src_gpr = treg[i / 2];
      gds.src_sel_x = 2 * (i % 2);
      gds.src_sel_y = 1 + (2 * (i % 2));
      gds.src_sel_z = 4;
      gds.dst_sel_x = 7;
      gds.dst_sel_y = 7;
      gds.dst_sel_z = 7;
      gds.dst_sel_w = 7;
      gds.op = FETCH_OP_TF_WRITE;
      r = r600_bytecode_add_gds(ctx->bc, &gds);
      if (r)
         return r;
   }

   // Patch up jump label
   r600_bytecode_add_cfinst(ctx->bc, CF_OP_POP);
   cf_pop = ctx->bc->cf_last;

   cf_jump->cf_addr = cf_pop->id + 2;
   cf_jump->pop_count = 1;
   cf_pop->cf_addr = cf_pop->id + 2;
   cf_pop->pop_count = 1;

   return 0;
}

/*
 * We have to work out the thread ID for load and atomic
 * operations, which store the returned value to an index
 * in an intermediate buffer.
 * The index is calculated from the thread id, which in turn
 * is derived from the MBCNT instructions.
 * The shader engine ID is multiplied by 256, and the wave id
 * is added. Then the result is multiplied by 64 and the
 * thread id is added.
 */
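/* In other words, roughly:
 *    tid = (SE_ID * 256 + HW_WAVE_ID) * 64 + lane_index
 * with lane_index coming from the MBCNT instructions below. */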
static int load_thread_id_gpr(struct r600_shader_ctx *ctx)
{
   struct r600_bytecode_alu alu;
   int r;

   memset(&alu, 0, sizeof(struct r600_bytecode_alu));
   alu.op = ALU_OP1_MBCNT_32LO_ACCUM_PREV_INT;
   alu.dst.sel = ctx->temp_reg;
   alu.dst.chan = 0;
   alu.src[0].sel = V_SQ_ALU_SRC_LITERAL;
   alu.src[0].value = 0xffffffff;
   alu.dst.write = 1;
   r = r600_bytecode_add_alu(ctx->bc, &alu);
   if (r)
      return r;

   memset(&alu, 0, sizeof(struct r600_bytecode_alu));
   alu.op = ALU_OP1_MBCNT_32HI_INT;
   alu.dst.sel = ctx->temp_reg;
   alu.dst.chan = 1;
   alu.src[0].sel = V_SQ_ALU_SRC_LITERAL;
   alu.src[0].value = 0xffffffff;
   alu.dst.write = 1;
   r = r600_bytecode_add_alu(ctx->bc, &alu);
   if (r)
      return r;

   memset(&alu, 0, sizeof(struct r600_bytecode_alu));
   alu.op = ALU_OP3_MULADD_UINT24;
   alu.dst.sel = ctx->temp_reg;
   alu.dst.chan = 2;
   alu.src[0].sel = EG_V_SQ_ALU_SRC_SE_ID;
   alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
   alu.src[1].value = 256;
   alu.src[2].sel = EG_V_SQ_ALU_SRC_HW_WAVE_ID;
   alu.dst.write = 1;
   alu.is_op3 = 1;
   alu.last = 1;
   r = r600_bytecode_add_alu(ctx->bc, &alu);
   if (r)
      return r;

   r = single_alu_op3(ctx, ALU_OP3_MULADD_UINT24,
                      ctx->thread_id_gpr, 1,
                      ctx->temp_reg, 2,
                      V_SQ_ALU_SRC_LITERAL, 0x40,
                      ctx->temp_reg, 0);
   if (r)
      return r;
   return 0;
}

static int r600_shader_from_tgsi(struct r600_context *rctx,
                                 struct r600_pipe_shader *pipeshader,
                                 union r600_shader_key key)
{
   struct r600_screen *rscreen = rctx->screen;
   struct r600_shader *shader = &pipeshader->shader;
   struct tgsi_token *tokens = pipeshader->selector->tokens;
   struct pipe_stream_output_info so = pipeshader->selector->so;
   struct tgsi_full_immediate *immediate;
   struct r600_shader_ctx ctx;
   struct r600_bytecode_output output[ARRAY_SIZE(shader->output)];
   unsigned output_done, noutput;
   unsigned opcode;
   int j, k, r = 0;
   unsigned i;
   int next_param_base = 0, next_clip_base;
   int max_color_exports = MAX2(key.ps.nr_cbufs, 1);
   bool indirect_gprs;
   bool ring_outputs = false;
   bool lds_outputs = false;
   bool lds_inputs = false;
   bool pos_emitted = false;

   ctx.bc = &shader->bc;
   ctx.shader = shader;

   r600_bytecode_init(ctx.bc, rscreen->b.gfx_level, rscreen->b.family,
                      rscreen->has_compressed_msaa_texturing);
   ctx.tokens = tokens;
   tgsi_scan_shader(tokens, &ctx.info);
   shader->indirect_files = ctx.info.indirect_files;

   int narrays = ctx.info.array_max[TGSI_FILE_TEMPORARY];
   ctx.array_infos = calloc(narrays, sizeof(*ctx.array_infos));
   ctx.spilled_arrays = calloc(narrays, sizeof(bool));
   tgsi_scan_arrays(tokens, TGSI_FILE_TEMPORARY, narrays, ctx.array_infos);

   shader->uses_helper_invocation = false;
   shader->uses_doubles = ctx.info.uses_doubles;
   shader->uses_atomics = ctx.info.file_mask[TGSI_FILE_HW_ATOMIC];
   shader->num_loops = ctx.info.opcode_count[TGSI_OPCODE_BGNLOOP];
   shader->uses_interpolate_at_sample = ctx.info.opcode_count[TGSI_OPCODE_INTERP_SAMPLE] != 0;

   shader->nsys_inputs = 0;

   shader->uses_images = ctx.info.file_count[TGSI_FILE_IMAGE] > 0 ||
                         ctx.info.file_count[TGSI_FILE_BUFFER] > 0;
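   /* Relative addressing of constants and samplers is handled via kcache
    * banks and the index registers, so only the remaining files may need
    * real GPR arrays: */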
   indirect_gprs = ctx.info.indirect_files & ~((1 << TGSI_FILE_CONSTANT) | (1 << TGSI_FILE_SAMPLER));

   tgsi_parse_init(&ctx.parse, tokens);
   ctx.type = ctx.info.processor;
   shader->processor_type = ctx.type;
   ctx.bc->type = shader->processor_type;

   switch (ctx.type) {
   case PIPE_SHADER_VERTEX:
      shader->vs_as_gs_a = key.vs.as_gs_a;
      shader->vs_as_es = key.vs.as_es;
      shader->vs_as_ls = key.vs.as_ls;
      shader->atomic_base = key.vs.first_atomic_counter;
      if (shader->vs_as_es)
         ring_outputs = true;
      if (shader->vs_as_ls)
         lds_outputs = true;
      break;
   case PIPE_SHADER_GEOMETRY:
      ring_outputs = true;
      shader->atomic_base = key.gs.first_atomic_counter;
      shader->gs_tri_strip_adj_fix = key.gs.tri_strip_adj_fix;
      break;
   case PIPE_SHADER_TESS_CTRL:
      shader->tcs_prim_mode = key.tcs.prim_mode;
      shader->atomic_base = key.tcs.first_atomic_counter;
      lds_outputs = true;
      lds_inputs = true;
      break;
   case PIPE_SHADER_TESS_EVAL:
      shader->tes_as_es = key.tes.as_es;
      shader->atomic_base = key.tes.first_atomic_counter;
      lds_inputs = true;
      if (shader->tes_as_es)
         ring_outputs = true;
      break;
   case PIPE_SHADER_FRAGMENT:
      shader->two_side = key.ps.color_two_side;
      shader->atomic_base = key.ps.first_atomic_counter;
      shader->rat_base = key.ps.nr_cbufs;
      shader->image_size_const_offset = key.ps.image_size_const_offset;
      break;
   case PIPE_SHADER_COMPUTE:
      shader->rat_base = 0;
      shader->image_size_const_offset = ctx.info.file_count[TGSI_FILE_SAMPLER];
      break;
   default:
      break;
   }

   if (shader->vs_as_es || shader->tes_as_es) {
      ctx.gs_for_vs = &rctx->gs_shader->current->shader;
   } else {
      ctx.gs_for_vs = NULL;
   }

   ctx.next_ring_offset = 0;
   ctx.gs_out_ring_offset = 0;
   ctx.gs_next_vertex = 0;
   ctx.gs_stream_output_info = &so;

   ctx.thread_id_gpr = -1;
   ctx.face_gpr = -1;
   ctx.fixed_pt_position_gpr = -1;
   ctx.fragcoord_input = -1;
   ctx.colors_used = 0;
   ctx.clip_vertex_write = 0;

   ctx.helper_invoc_reg = -1;
   ctx.cs_block_size_reg = -1;
   ctx.cs_grid_size_reg = -1;
   ctx.cs_block_size_loaded = false;
   ctx.cs_grid_size_loaded = false;

   shader->nr_ps_color_exports = 0;


   /* register allocations */
   /* Values [0,127] correspond to GPR[0..127].
    * Values [128,159] correspond to constant buffer bank 0
    * Values [160,191] correspond to constant buffer bank 1
    * Values [256,511] correspond to cfile constants c[0..255]. (Gone on EG)
    * Values [256,287] correspond to constant buffer bank 2 (EG)
    * Values [288,319] correspond to constant buffer bank 3 (EG)
    * Other special values are shown in the list below.
    * 244 ALU_SRC_1_DBL_L: special constant 1.0 double-float, LSW. (RV670+)
    * 245 ALU_SRC_1_DBL_M: special constant 1.0 double-float, MSW. (RV670+)
    * 246 ALU_SRC_0_5_DBL_L: special constant 0.5 double-float, LSW. (RV670+)
    * 247 ALU_SRC_0_5_DBL_M: special constant 0.5 double-float, MSW. (RV670+)
    * 248 SQ_ALU_SRC_0: special constant 0.0.
    * 249 SQ_ALU_SRC_1: special constant 1.0 float.
    * 250 SQ_ALU_SRC_1_INT: special constant 1 integer.
    * 251 SQ_ALU_SRC_M_1_INT: special constant -1 integer.
    * 252 SQ_ALU_SRC_0_5: special constant 0.5 float.
    * 253 SQ_ALU_SRC_LITERAL: literal constant.
    * 254 SQ_ALU_SRC_PV: previous vector result.
    * 255 SQ_ALU_SRC_PS: previous scalar result.
    */
   for (i = 0; i < TGSI_FILE_COUNT; i++) {
      ctx.file_offset[i] = 0;
   }

   if (ctx.type == PIPE_SHADER_VERTEX) {

      ctx.file_offset[TGSI_FILE_INPUT] = 1;
      if (ctx.info.num_inputs)
         r600_bytecode_add_cfinst(ctx.bc, CF_OP_CALL_FS);
   }
   if (ctx.type == PIPE_SHADER_FRAGMENT) {
      if (ctx.bc->gfx_level >= EVERGREEN)
         ctx.file_offset[TGSI_FILE_INPUT] = evergreen_gpr_count(&ctx);
      else
         ctx.file_offset[TGSI_FILE_INPUT] = allocate_system_value_inputs(&ctx, ctx.file_offset[TGSI_FILE_INPUT]);

      for (i = 0; i < PIPE_MAX_SHADER_INPUTS; i++) {
         if (ctx.info.system_value_semantic_name[i] == TGSI_SEMANTIC_HELPER_INVOCATION) {
            ctx.helper_invoc_reg = ctx.file_offset[TGSI_FILE_INPUT]++;
            shader->uses_helper_invocation = true;
         }
      }
   }
   if (ctx.type == PIPE_SHADER_GEOMETRY) {
      /* FIXME 1 would be enough in some cases (3 or less input vertices) */
      ctx.file_offset[TGSI_FILE_INPUT] = 2;
   }
   if (ctx.type == PIPE_SHADER_TESS_CTRL)
      ctx.file_offset[TGSI_FILE_INPUT] = 1;
   if (ctx.type == PIPE_SHADER_TESS_EVAL) {
      bool add_tesscoord = false, add_tess_inout = false;
      ctx.file_offset[TGSI_FILE_INPUT] = 1;
      for (i = 0; i < PIPE_MAX_SHADER_INPUTS; i++) {
         /* if we have tesscoord save one reg */
         if (ctx.info.system_value_semantic_name[i] == TGSI_SEMANTIC_TESSCOORD)
            add_tesscoord = true;
         if (ctx.info.system_value_semantic_name[i] == TGSI_SEMANTIC_TESSINNER ||
             ctx.info.system_value_semantic_name[i] == TGSI_SEMANTIC_TESSOUTER)
            add_tess_inout = true;
      }
      if (add_tesscoord || add_tess_inout)
         ctx.file_offset[TGSI_FILE_INPUT]++;
      if (add_tess_inout)
         ctx.file_offset[TGSI_FILE_INPUT] += 2;
   }
   if (ctx.type == PIPE_SHADER_COMPUTE) {
      ctx.file_offset[TGSI_FILE_INPUT] = 2;
      for (i = 0; i < PIPE_MAX_SHADER_INPUTS; i++) {
         if (ctx.info.system_value_semantic_name[i] == TGSI_SEMANTIC_GRID_SIZE)
            ctx.cs_grid_size_reg = ctx.file_offset[TGSI_FILE_INPUT]++;
         if (ctx.info.system_value_semantic_name[i] == TGSI_SEMANTIC_BLOCK_SIZE)
            ctx.cs_block_size_reg = ctx.file_offset[TGSI_FILE_INPUT]++;
      }
   }

   ctx.file_offset[TGSI_FILE_OUTPUT] =
         ctx.file_offset[TGSI_FILE_INPUT] +
         ctx.info.file_max[TGSI_FILE_INPUT] + 1;
   ctx.file_offset[TGSI_FILE_TEMPORARY] = ctx.file_offset[TGSI_FILE_OUTPUT] +
         ctx.info.file_max[TGSI_FILE_OUTPUT] + 1;

   /* Outside the GPR range. This will be translated to one of the
    * kcache banks later. */
   ctx.file_offset[TGSI_FILE_CONSTANT] = 512;
   ctx.file_offset[TGSI_FILE_IMMEDIATE] = V_SQ_ALU_SRC_LITERAL;

   pipeshader->scratch_space_needed = 0;
   int regno = ctx.file_offset[TGSI_FILE_TEMPORARY] +
         ctx.info.file_max[TGSI_FILE_TEMPORARY];
   if (regno > 124) {
      choose_spill_arrays(&ctx, &regno, &pipeshader->scratch_space_needed);
      shader->indirect_files = ctx.info.indirect_files;
   }
   shader->needs_scratch_space = pipeshader->scratch_space_needed != 0;

   ctx.bc->ar_reg = ++regno;
   ctx.bc->index_reg[0] = ++regno;
   ctx.bc->index_reg[1] = ++regno;

   if (ctx.type == PIPE_SHADER_TESS_CTRL) {
      ctx.tess_input_info = ++regno;
      ctx.tess_output_info = ++regno;
   } else if (ctx.type == PIPE_SHADER_TESS_EVAL) {
      ctx.tess_input_info = ++regno;
      ctx.tess_output_info = ++regno;
   } else if (ctx.type == PIPE_SHADER_GEOMETRY) {
      ctx.gs_export_gpr_tregs[0] = ++regno;
      ctx.gs_export_gpr_tregs[1] = ++regno;
      ctx.gs_export_gpr_tregs[2] = ++regno;
      ctx.gs_export_gpr_tregs[3] = ++regno;
      if (ctx.shader->gs_tri_strip_adj_fix) {
         ctx.gs_rotated_input[0] = ++regno;
         ctx.gs_rotated_input[1] = ++regno;
      } else {
         ctx.gs_rotated_input[0] = 0;
         ctx.gs_rotated_input[1] = 1;
      }
   }

   if (shader->uses_images) {
      ctx.thread_id_gpr = ++regno;
   }
   ctx.temp_reg = ++regno;

   shader->max_arrays = 0;
   shader->num_arrays = 0;
   if (indirect_gprs) {

      if (ctx.info.indirect_files & (1 << TGSI_FILE_INPUT)) {
         r600_add_gpr_array(shader, ctx.file_offset[TGSI_FILE_INPUT],
                            ctx.file_offset[TGSI_FILE_OUTPUT] -
                            ctx.file_offset[TGSI_FILE_INPUT],
                            0x0F);
      }
      if (ctx.info.indirect_files & (1 << TGSI_FILE_OUTPUT)) {
         r600_add_gpr_array(shader, ctx.file_offset[TGSI_FILE_OUTPUT],
                            ctx.file_offset[TGSI_FILE_TEMPORARY] -
                            ctx.file_offset[TGSI_FILE_OUTPUT],
                            0x0F);
      }
   }

   ctx.nliterals = 0;
   ctx.literals = NULL;
   ctx.max_driver_temp_used = 0;

   shader->fs_write_all = ctx.info.properties[TGSI_PROPERTY_FS_COLOR0_WRITES_ALL_CBUFS] &&
                          ctx.info.colors_written == 1;
   shader->vs_position_window_space = ctx.info.properties[TGSI_PROPERTY_VS_WINDOW_SPACE_POSITION];
   shader->ps_conservative_z = (uint8_t)ctx.info.properties[TGSI_PROPERTY_FS_DEPTH_LAYOUT];

   if (ctx.type == PIPE_SHADER_VERTEX ||
       ctx.type == PIPE_SHADER_GEOMETRY ||
       ctx.type == PIPE_SHADER_TESS_EVAL) {
      shader->cc_dist_mask = (1 << (ctx.info.properties[TGSI_PROPERTY_NUM_CULLDIST_ENABLED] +
                                    ctx.info.properties[TGSI_PROPERTY_NUM_CLIPDIST_ENABLED])) - 1;
      shader->clip_dist_write = (1 << ctx.info.properties[TGSI_PROPERTY_NUM_CLIPDIST_ENABLED]) - 1;
      shader->cull_dist_write = ((1 << ctx.info.properties[TGSI_PROPERTY_NUM_CULLDIST_ENABLED]) - 1) << ctx.info.properties[TGSI_PROPERTY_NUM_CLIPDIST_ENABLED];
   }

   if (shader->vs_as_gs_a)
      vs_add_primid_output(&ctx, key.vs.prim_id_out);

   if (ctx.thread_id_gpr != -1) {
      r = load_thread_id_gpr(&ctx);
      if (r)
         return r;
   }

   if (ctx.type == PIPE_SHADER_TESS_EVAL)
      r600_fetch_tess_io_info(&ctx);

   while (!tgsi_parse_end_of_tokens(&ctx.parse)) {
      tgsi_parse_token(&ctx.parse);
      switch (ctx.parse.FullToken.Token.Type) {
      case TGSI_TOKEN_TYPE_IMMEDIATE:
         immediate = &ctx.parse.FullToken.FullImmediate;
         ctx.literals = realloc(ctx.literals, (ctx.nliterals + 1) * 16);
         if (ctx.literals == NULL) {
NULL) { 3734 r = -ENOMEM; 3735 goto out_err; 3736 } 3737 ctx.literals[ctx.nliterals * 4 + 0] = immediate->u[0].Uint; 3738 ctx.literals[ctx.nliterals * 4 + 1] = immediate->u[1].Uint; 3739 ctx.literals[ctx.nliterals * 4 + 2] = immediate->u[2].Uint; 3740 ctx.literals[ctx.nliterals * 4 + 3] = immediate->u[3].Uint; 3741 ctx.nliterals++; 3742 break; 3743 case TGSI_TOKEN_TYPE_DECLARATION: 3744 r = tgsi_declaration(&ctx); 3745 if (r) 3746 goto out_err; 3747 break; 3748 case TGSI_TOKEN_TYPE_INSTRUCTION: 3749 case TGSI_TOKEN_TYPE_PROPERTY: 3750 break; 3751 default: 3752 R600_ERR("unsupported token type %d\n", ctx.parse.FullToken.Token.Type); 3753 r = -EINVAL; 3754 goto out_err; 3755 } 3756 } 3757 3758 shader->ring_item_sizes[0] = ctx.next_ring_offset; 3759 shader->ring_item_sizes[1] = 0; 3760 shader->ring_item_sizes[2] = 0; 3761 shader->ring_item_sizes[3] = 0; 3762 3763 /* Process two-sided color if needed */ 3764 if (shader->two_side && ctx.colors_used) { 3765 int i, count = ctx.shader->ninput; 3766 unsigned next_lds_loc = ctx.shader->nlds; 3767 3768 /* additional inputs will be allocated right after the existing inputs, 3769 * we won't need them after the color selection, so we don't need to 3770 * reserve these gprs for the rest of the shader code and to adjust 3771 * output offsets etc. */ 3772 int gpr = ctx.file_offset[TGSI_FILE_INPUT] + 3773 ctx.info.file_max[TGSI_FILE_INPUT] + 1; 3774 3775 /* if two-sided and neither the face nor the sample mask is used by the shader, ensure face_gpr is emitted */ 3776 if (ctx.face_gpr == -1) { 3777 i = ctx.shader->ninput++; 3778 ctx.shader->input[i].name = TGSI_SEMANTIC_FACE; 3779 ctx.shader->input[i].spi_sid = 0; 3780 ctx.shader->input[i].gpr = gpr++; 3781 ctx.face_gpr = ctx.shader->input[i].gpr; 3782 } 3783 3784 for (i = 0; i < count; i++) { 3785 if (ctx.shader->input[i].name == TGSI_SEMANTIC_COLOR) { 3786 int ni = ctx.shader->ninput++; 3787 memcpy(&ctx.shader->input[ni],&ctx.shader->input[i], sizeof(struct r600_shader_io)); 3788 ctx.shader->input[ni].name = TGSI_SEMANTIC_BCOLOR; 3789 ctx.shader->input[ni].spi_sid = r600_spi_sid(&ctx.shader->input[ni]); 3790 ctx.shader->input[ni].gpr = gpr++; 3791 // TGSI to LLVM needs to know the lds position of inputs. 3792 // The non-LLVM path computes it later (in process_twoside_color) 3793 ctx.shader->input[ni].lds_pos = next_lds_loc++; 3794 ctx.shader->input[i].back_color_input = ni; 3795 if (ctx.bc->gfx_level >= EVERGREEN) { 3796 if ((r = evergreen_interp_input(&ctx, ni))) 3797 return r; 3798 } 3799 } 3800 } 3801 } 3802 3803 if (ctx.shader->uses_helper_invocation) { 3804 if (ctx.bc->gfx_level == CAYMAN) 3805 r = cm_load_helper_invocation(&ctx); 3806 else 3807 r = eg_load_helper_invocation(&ctx); 3808 if (r) 3809 return r; 3810 } 3811 3812 /* 3813 * XXX this relies on fixed_pt_position_gpr only being present when 3814 * this shader should be executed per sample. Should be the case for now... 3815 */ 3816 if (ctx.fixed_pt_position_gpr != -1 && ctx.info.reads_samplemask) { 3817 /* 3818 * Fix up sample mask. The hw always gives us the coverage mask for 3819 * the pixel. However, for per-sample shading, we need the 3820 * coverage for the shader invocation only. 3821 * Also, with msaa disabled, only the first bit should be set 3822 * (luckily the same fixup works for both problems). 3823 * For now, we can only do it if we know this shader is always 3824 * executed per sample (due to usage of bits in the shader 3825 * forcing per-sample execution). 3826 * If the fb is not multisampled, we'd do unnecessary work but 3827 * it should still be correct.
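 * (The fixup below shifts 1 left by the sample ID taken from
 * fixed_pt_position_gpr.w and ANDs the result into the coverage mask
 * held in face_gpr.z.)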
3828 * It will, however, do nothing for sample shading enabled 3829 * via MinSampleShading. 3830 */ 3831 struct r600_bytecode_alu alu; 3832 int tmp = r600_get_temp(&ctx); 3833 assert(ctx.face_gpr != -1); 3834 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 3835 3836 alu.op = ALU_OP2_LSHL_INT; 3837 alu.src[0].sel = V_SQ_ALU_SRC_LITERAL; 3838 alu.src[0].value = 0x1; 3839 alu.src[1].sel = ctx.fixed_pt_position_gpr; 3840 alu.src[1].chan = 3; 3841 alu.dst.sel = tmp; 3842 alu.dst.chan = 0; 3843 alu.dst.write = 1; 3844 alu.last = 1; 3845 if ((r = r600_bytecode_add_alu(ctx.bc, &alu))) 3846 return r; 3847 3848 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 3849 alu.op = ALU_OP2_AND_INT; 3850 alu.src[0].sel = tmp; 3851 alu.src[1].sel = ctx.face_gpr; 3852 alu.src[1].chan = 2; 3853 alu.dst.sel = ctx.face_gpr; 3854 alu.dst.chan = 2; 3855 alu.dst.write = 1; 3856 alu.last = 1; 3857 if ((r = r600_bytecode_add_alu(ctx.bc, &alu))) 3858 return r; 3859 } 3860 3861 if (ctx.fragcoord_input >= 0) { 3862 if (ctx.bc->gfx_level == CAYMAN) { 3863 for (j = 0 ; j < 4; j++) { 3864 struct r600_bytecode_alu alu; 3865 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 3866 alu.op = ALU_OP1_RECIP_IEEE; 3867 alu.src[0].sel = shader->input[ctx.fragcoord_input].gpr; 3868 alu.src[0].chan = 3; 3869 3870 alu.dst.sel = shader->input[ctx.fragcoord_input].gpr; 3871 alu.dst.chan = j; 3872 alu.dst.write = (j == 3); 3873 alu.last = (j == 3); 3874 if ((r = r600_bytecode_add_alu(ctx.bc, &alu))) 3875 return r; 3876 } 3877 } else { 3878 struct r600_bytecode_alu alu; 3879 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 3880 alu.op = ALU_OP1_RECIP_IEEE; 3881 alu.src[0].sel = shader->input[ctx.fragcoord_input].gpr; 3882 alu.src[0].chan = 3; 3883 3884 alu.dst.sel = shader->input[ctx.fragcoord_input].gpr; 3885 alu.dst.chan = 3; 3886 alu.dst.write = 1; 3887 alu.last = 1; 3888 if ((r = r600_bytecode_add_alu(ctx.bc, &alu))) 3889 return r; 3890 } 3891 } 3892 3893 if (ctx.type == PIPE_SHADER_GEOMETRY) { 3894 struct r600_bytecode_alu alu; 3895 int r; 3896 3897 /* GS thread with no output workaround - emit a cut at start of GS */ 3898 if (ctx.bc->gfx_level == R600) 3899 r600_bytecode_add_cfinst(ctx.bc, CF_OP_CUT_VERTEX); 3900 3901 for (j = 0; j < 4; j++) { 3902 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 3903 alu.op = ALU_OP1_MOV; 3904 alu.src[0].sel = V_SQ_ALU_SRC_LITERAL; 3905 alu.src[0].value = 0; 3906 alu.dst.sel = ctx.gs_export_gpr_tregs[j]; 3907 alu.dst.write = 1; 3908 alu.last = 1; 3909 r = r600_bytecode_add_alu(ctx.bc, &alu); 3910 if (r) 3911 return r; 3912 } 3913 3914 if (ctx.shader->gs_tri_strip_adj_fix) { 3915 r = single_alu_op2(&ctx, ALU_OP2_AND_INT, 3916 ctx.gs_rotated_input[0], 2, 3917 0, 2, 3918 V_SQ_ALU_SRC_LITERAL, 1); 3919 if (r) 3920 return r; 3921 3922 for (i = 0; i < 6; i++) { 3923 int rotated = (i + 4) % 6; 3924 int offset_reg = i / 3; 3925 int offset_chan = i % 3; 3926 int rotated_offset_reg = rotated / 3; 3927 int rotated_offset_chan = rotated % 3; 3928 3929 if (offset_reg == 0 && offset_chan == 2) 3930 offset_chan = 3; 3931 if (rotated_offset_reg == 0 && rotated_offset_chan == 2) 3932 rotated_offset_chan = 3; 3933 3934 r = single_alu_op3(&ctx, ALU_OP3_CNDE_INT, 3935 ctx.gs_rotated_input[offset_reg], offset_chan, 3936 ctx.gs_rotated_input[0], 2, 3937 offset_reg, offset_chan, 3938 rotated_offset_reg, rotated_offset_chan); 3939 if (r) 3940 return r; 3941 } 3942 } 3943 } 3944 3945 if (ctx.type == PIPE_SHADER_TESS_CTRL) 3946 r600_fetch_tess_io_info(&ctx); 3947 3948 if (shader->two_side && ctx.colors_used) { 3949 if
((r = process_twoside_color_inputs(&ctx))) 3950 return r; 3951 } 3952 3953 tgsi_parse_init(&ctx.parse, tokens); 3954 while (!tgsi_parse_end_of_tokens(&ctx.parse)) { 3955 tgsi_parse_token(&ctx.parse); 3956 switch (ctx.parse.FullToken.Token.Type) { 3957 case TGSI_TOKEN_TYPE_INSTRUCTION: 3958 r = tgsi_is_supported(&ctx); 3959 if (r) 3960 goto out_err; 3961 ctx.max_driver_temp_used = 0; 3962 /* reserve first tmp for everyone */ 3963 r600_get_temp(&ctx); 3964 3965 opcode = ctx.parse.FullToken.FullInstruction.Instruction.Opcode; 3966 if ((r = tgsi_split_constant(&ctx))) 3967 goto out_err; 3968 if ((r = tgsi_split_literal_constant(&ctx))) 3969 goto out_err; 3970 if (ctx.type == PIPE_SHADER_GEOMETRY) { 3971 if ((r = tgsi_split_gs_inputs(&ctx))) 3972 goto out_err; 3973 } else if (lds_inputs) { 3974 if ((r = tgsi_split_lds_inputs(&ctx))) 3975 goto out_err; 3976 } 3977 if (ctx.bc->gfx_level == CAYMAN) 3978 ctx.inst_info = &cm_shader_tgsi_instruction[opcode]; 3979 else if (ctx.bc->gfx_level >= EVERGREEN) 3980 ctx.inst_info = &eg_shader_tgsi_instruction[opcode]; 3981 else 3982 ctx.inst_info = &r600_shader_tgsi_instruction[opcode]; 3983 3984 ctx.bc->precise |= ctx.parse.FullToken.FullInstruction.Instruction.Precise; 3985 3986 r = ctx.inst_info->process(&ctx); 3987 if (r) 3988 goto out_err; 3989 3990 if (ctx.type == PIPE_SHADER_TESS_CTRL) { 3991 r = r600_store_tcs_output(&ctx); 3992 if (r) 3993 goto out_err; 3994 } 3995 break; 3996 default: 3997 break; 3998 } 3999 } 4000 4001 /* Reset the temporary register counter. */ 4002 ctx.max_driver_temp_used = 0; 4003 4004 noutput = shader->noutput; 4005 4006 if (!ring_outputs && ctx.clip_vertex_write) { 4007 unsigned clipdist_temp[2]; 4008 4009 clipdist_temp[0] = r600_get_temp(&ctx); 4010 clipdist_temp[1] = r600_get_temp(&ctx); 4011 4012 /* need to convert a clipvertex write into clipdistance writes and not export 4013 the clip vertex anymore */ 4014 4015 memset(&shader->output[noutput], 0, 2*sizeof(struct r600_shader_io)); 4016 shader->output[noutput].name = TGSI_SEMANTIC_CLIPDIST; 4017 shader->output[noutput].gpr = clipdist_temp[0]; 4018 noutput++; 4019 shader->output[noutput].name = TGSI_SEMANTIC_CLIPDIST; 4020 shader->output[noutput].gpr = clipdist_temp[1]; 4021 noutput++; 4022 4023 /* reset spi_sid for clipvertex output to avoid confusing spi */ 4024 shader->output[ctx.cv_output].spi_sid = 0; 4025 4026 shader->clip_dist_write = 0xFF; 4027 shader->cc_dist_mask = 0xFF; 4028 4029 for (i = 0; i < 8; i++) { 4030 int oreg = i >> 2; 4031 int ochan = i & 3; 4032 4033 for (j = 0; j < 4; j++) { 4034 struct r600_bytecode_alu alu; 4035 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 4036 alu.op = ALU_OP2_DOT4; 4037 alu.src[0].sel = shader->output[ctx.cv_output].gpr; 4038 alu.src[0].chan = j; 4039 4040 alu.src[1].sel = 512 + i; 4041 alu.src[1].kc_bank = R600_BUFFER_INFO_CONST_BUFFER; 4042 alu.src[1].chan = j; 4043 4044 alu.dst.sel = clipdist_temp[oreg]; 4045 alu.dst.chan = j; 4046 alu.dst.write = (j == ochan); 4047 if (j == 3) 4048 alu.last = 1; 4049 r = r600_bytecode_add_alu(ctx.bc, &alu); 4050 if (r) 4051 return r; 4052 } 4053 } 4054 } 4055 4056 /* Add stream outputs. 
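 * Streamout is emitted inline only while the outputs still live in
 * GPRs (VS without LDS or ring outputs, TES without ring outputs);
 * the GS path emits it from the copy shader instead (see
 * generate_gs_copy_shader below).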
*/ 4057 if (so.num_outputs) { 4058 bool emit = false; 4059 if (!lds_outputs && !ring_outputs && ctx.type == PIPE_SHADER_VERTEX) 4060 emit = true; 4061 if (!ring_outputs && ctx.type == PIPE_SHADER_TESS_EVAL) 4062 emit = true; 4063 if (emit) 4064 emit_streamout(&ctx, &so, -1, NULL); 4065 } 4066 pipeshader->enabled_stream_buffers_mask = ctx.enabled_stream_buffers_mask; 4067 convert_edgeflag_to_int(&ctx); 4068 4069 if (ctx.type == PIPE_SHADER_TESS_CTRL) 4070 r600_emit_tess_factor(&ctx); 4071 4072 if (lds_outputs) { 4073 if (ctx.type == PIPE_SHADER_VERTEX) { 4074 if (ctx.shader->noutput) 4075 emit_lds_vs_writes(&ctx); 4076 } 4077 } else if (ring_outputs) { 4078 if (shader->vs_as_es || shader->tes_as_es) { 4079 ctx.gs_export_gpr_tregs[0] = r600_get_temp(&ctx); 4080 ctx.gs_export_gpr_tregs[1] = -1; 4081 ctx.gs_export_gpr_tregs[2] = -1; 4082 ctx.gs_export_gpr_tregs[3] = -1; 4083 4084 emit_gs_ring_writes(&ctx, &so, -1, FALSE); 4085 } 4086 } else { 4087 /* Export output */ 4088 next_clip_base = shader->vs_out_misc_write ? 62 : 61; 4089 4090 for (i = 0, j = 0; i < noutput; i++, j++) { 4091 memset(&output[j], 0, sizeof(struct r600_bytecode_output)); 4092 output[j].gpr = shader->output[i].gpr; 4093 output[j].elem_size = 3; 4094 output[j].swizzle_x = 0; 4095 output[j].swizzle_y = 1; 4096 output[j].swizzle_z = 2; 4097 output[j].swizzle_w = 3; 4098 output[j].burst_count = 1; 4099 output[j].type = 0xffffffff; 4100 output[j].op = CF_OP_EXPORT; 4101 switch (ctx.type) { 4102 case PIPE_SHADER_VERTEX: 4103 case PIPE_SHADER_TESS_EVAL: 4104 switch (shader->output[i].name) { 4105 case TGSI_SEMANTIC_POSITION: 4106 output[j].array_base = 60; 4107 output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS; 4108 pos_emitted = true; 4109 break; 4110 4111 case TGSI_SEMANTIC_PSIZE: 4112 output[j].array_base = 61; 4113 output[j].swizzle_y = 7; 4114 output[j].swizzle_z = 7; 4115 output[j].swizzle_w = 7; 4116 output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS; 4117 pos_emitted = true; 4118 break; 4119 case TGSI_SEMANTIC_EDGEFLAG: 4120 output[j].array_base = 61; 4121 output[j].swizzle_x = 7; 4122 output[j].swizzle_y = 0; 4123 output[j].swizzle_z = 7; 4124 output[j].swizzle_w = 7; 4125 output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS; 4126 pos_emitted = true; 4127 break; 4128 case TGSI_SEMANTIC_LAYER: 4129 /* spi_sid is 0 for outputs that are 4130 * not consumed by PS */ 4131 if (shader->output[i].spi_sid) { 4132 output[j].array_base = next_param_base++; 4133 output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PARAM; 4134 j++; 4135 memcpy(&output[j], &output[j-1], sizeof(struct r600_bytecode_output)); 4136 } 4137 output[j].array_base = 61; 4138 output[j].swizzle_x = 7; 4139 output[j].swizzle_y = 7; 4140 output[j].swizzle_z = 0; 4141 output[j].swizzle_w = 7; 4142 output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS; 4143 pos_emitted = true; 4144 break; 4145 case TGSI_SEMANTIC_VIEWPORT_INDEX: 4146 /* spi_sid is 0 for outputs that are 4147 * not consumed by PS */ 4148 if (shader->output[i].spi_sid) { 4149 output[j].array_base = next_param_base++; 4150 output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PARAM; 4151 j++; 4152 memcpy(&output[j], &output[j-1], sizeof(struct r600_bytecode_output)); 4153 } 4154 output[j].array_base = 61; 4155 output[j].swizzle_x = 7; 4156 output[j].swizzle_y = 7; 4157 output[j].swizzle_z = 7; 4158 output[j].swizzle_w = 0; 4159 output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS; 4160 pos_emitted = true; 4161 break; 4162 case TGSI_SEMANTIC_CLIPVERTEX: 4163 j--; 4164 break; 
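 /* POS exports use array_base 60 for the position itself and 61 for the misc vector (psize/edgeflag/layer/viewport index); clip distances start at next_clip_base behind those, and PARAM exports are numbered from next_param_base. */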
4165 case TGSI_SEMANTIC_CLIPDIST: 4166 output[j].array_base = next_clip_base++; 4167 output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS; 4168 pos_emitted = true; 4169 /* spi_sid is 0 for clipdistance outputs that were generated 4170 * for clipvertex - we don't need to pass them to PS */ 4171 if (shader->output[i].spi_sid) { 4172 j++; 4173 /* duplicate it as PARAM to pass to the pixel shader */ 4174 memcpy(&output[j], &output[j-1], sizeof(struct r600_bytecode_output)); 4175 output[j].array_base = next_param_base++; 4176 output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PARAM; 4177 } 4178 break; 4179 case TGSI_SEMANTIC_FOG: 4180 output[j].swizzle_y = 4; /* 0 */ 4181 output[j].swizzle_z = 4; /* 0 */ 4182 output[j].swizzle_w = 5; /* 1 */ 4183 break; 4184 case TGSI_SEMANTIC_PRIMID: 4185 output[j].swizzle_x = 2; 4186 output[j].swizzle_y = 4; /* 0 */ 4187 output[j].swizzle_z = 4; /* 0 */ 4188 output[j].swizzle_w = 4; /* 0 */ 4189 break; 4190 } 4191 4192 break; 4193 case PIPE_SHADER_FRAGMENT: 4194 if (shader->output[i].name == TGSI_SEMANTIC_COLOR) { 4195 /* never export more colors than the number of CBs */ 4196 if (shader->output[i].sid >= max_color_exports) { 4197 /* skip export */ 4198 j--; 4199 continue; 4200 } 4201 output[j].swizzle_w = key.ps.alpha_to_one ? 5 : 3; 4202 output[j].array_base = shader->output[i].sid; 4203 output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PIXEL; 4204 shader->nr_ps_color_exports++; 4205 shader->ps_color_export_mask |= (0xf << (shader->output[i].sid * 4)); 4206 4207 /* If the i-th target format is set, all previous target formats must 4208 * be non-zero to avoid hangs. - from radeonsi, seems to apply to eg as well. 4209 */ 4210 if (shader->output[i].sid > 0) 4211 for (unsigned x = 0; x < shader->output[i].sid; x++) 4212 shader->ps_color_export_mask |= (1 << (x*4)); 4213 4214 if (shader->output[i].sid > shader->ps_export_highest) 4215 shader->ps_export_highest = shader->output[i].sid; 4216 if (shader->fs_write_all && (rscreen->b.gfx_level >= EVERGREEN)) { 4217 for (k = 1; k < max_color_exports; k++) { 4218 j++; 4219 memset(&output[j], 0, sizeof(struct r600_bytecode_output)); 4220 output[j].gpr = shader->output[i].gpr; 4221 output[j].elem_size = 3; 4222 output[j].swizzle_x = 0; 4223 output[j].swizzle_y = 1; 4224 output[j].swizzle_z = 2; 4225 output[j].swizzle_w = key.ps.alpha_to_one ? 
5 : 3; 4226 output[j].burst_count = 1; 4227 output[j].array_base = k; 4228 output[j].op = CF_OP_EXPORT; 4229 output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PIXEL; 4230 shader->nr_ps_color_exports++; 4231 if (k > shader->ps_export_highest) 4232 shader->ps_export_highest = k; 4233 shader->ps_color_export_mask |= (0xf << (j * 4)); 4234 } 4235 } 4236 } else if (shader->output[i].name == TGSI_SEMANTIC_POSITION) { 4237 output[j].array_base = 61; 4238 output[j].swizzle_x = 2; 4239 output[j].swizzle_y = 7; 4240 output[j].swizzle_z = output[j].swizzle_w = 7; 4241 output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PIXEL; 4242 } else if (shader->output[i].name == TGSI_SEMANTIC_STENCIL) { 4243 output[j].array_base = 61; 4244 output[j].swizzle_x = 7; 4245 output[j].swizzle_y = 1; 4246 output[j].swizzle_z = output[j].swizzle_w = 7; 4247 output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PIXEL; 4248 } else if (shader->output[i].name == TGSI_SEMANTIC_SAMPLEMASK) { 4249 output[j].array_base = 61; 4250 output[j].swizzle_x = 7; 4251 output[j].swizzle_y = 7; 4252 output[j].swizzle_z = 0; 4253 output[j].swizzle_w = 7; 4254 output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PIXEL; 4255 } else { 4256 R600_ERR("unsupported fragment output name %d\n", shader->output[i].name); 4257 r = -EINVAL; 4258 goto out_err; 4259 } 4260 break; 4261 case PIPE_SHADER_TESS_CTRL: 4262 break; 4263 default: 4264 R600_ERR("unsupported processor type %d\n", ctx.type); 4265 r = -EINVAL; 4266 goto out_err; 4267 } 4268 4269 if (output[j].type == 0xffffffff) { 4270 output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PARAM; 4271 output[j].array_base = next_param_base++; 4272 } 4273 } 4274 4275 /* add fake position export */ 4276 if ((ctx.type == PIPE_SHADER_VERTEX || ctx.type == PIPE_SHADER_TESS_EVAL) && pos_emitted == false) { 4277 memset(&output[j], 0, sizeof(struct r600_bytecode_output)); 4278 output[j].gpr = 0; 4279 output[j].elem_size = 3; 4280 output[j].swizzle_x = 7; 4281 output[j].swizzle_y = 7; 4282 output[j].swizzle_z = 7; 4283 output[j].swizzle_w = 7; 4284 output[j].burst_count = 1; 4285 output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS; 4286 output[j].array_base = 60; 4287 output[j].op = CF_OP_EXPORT; 4288 j++; 4289 } 4290 4291 /* add fake param output for vertex shader if no param is exported */ 4292 if ((ctx.type == PIPE_SHADER_VERTEX || ctx.type == PIPE_SHADER_TESS_EVAL) && next_param_base == 0) { 4293 memset(&output[j], 0, sizeof(struct r600_bytecode_output)); 4294 output[j].gpr = 0; 4295 output[j].elem_size = 3; 4296 output[j].swizzle_x = 7; 4297 output[j].swizzle_y = 7; 4298 output[j].swizzle_z = 7; 4299 output[j].swizzle_w = 7; 4300 output[j].burst_count = 1; 4301 output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PARAM; 4302 output[j].array_base = 0; 4303 output[j].op = CF_OP_EXPORT; 4304 j++; 4305 } 4306 4307 /* add fake pixel export */ 4308 if (ctx.type == PIPE_SHADER_FRAGMENT && shader->nr_ps_color_exports == 0) { 4309 memset(&output[j], 0, sizeof(struct r600_bytecode_output)); 4310 output[j].gpr = 0; 4311 output[j].elem_size = 3; 4312 output[j].swizzle_x = 7; 4313 output[j].swizzle_y = 7; 4314 output[j].swizzle_z = 7; 4315 output[j].swizzle_w = 7; 4316 output[j].burst_count = 1; 4317 output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PIXEL; 4318 output[j].array_base = 0; 4319 output[j].op = CF_OP_EXPORT; 4320 j++; 4321 shader->nr_ps_color_exports++; 4322 shader->ps_color_export_mask = 0xf; 4323 } 4324 4325 noutput = j; 4326 4327 /* set export done on last export of each type */ 4328 
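 /* Scan backwards so the first export seen per type is the last one emitted; the hardware expects the final export of each type to carry the DONE bit. */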
for (k = noutput - 1, output_done = 0; k >= 0; k--) { 4329 if (!(output_done & (1 << output[k].type))) { 4330 output_done |= (1 << output[k].type); 4331 output[k].op = CF_OP_EXPORT_DONE; 4332 } 4333 } 4334 /* add output to bytecode */ 4335 for (i = 0; i < noutput; i++) { 4336 r = r600_bytecode_add_output(ctx.bc, &output[i]); 4337 if (r) 4338 goto out_err; 4339 } 4340 } 4341 4342 /* add program end */ 4343 if (ctx.bc->gfx_level == CAYMAN) 4344 cm_bytecode_add_cf_end(ctx.bc); 4345 else { 4346 const struct cf_op_info *last = NULL; 4347 4348 if (ctx.bc->cf_last) 4349 last = r600_isa_cf(ctx.bc->cf_last->op); 4350 4351 /* alu clause instructions don't have EOP bit, so add NOP */ 4352 if (!last || last->flags & CF_ALU || ctx.bc->cf_last->op == CF_OP_LOOP_END || ctx.bc->cf_last->op == CF_OP_POP) 4353 r600_bytecode_add_cfinst(ctx.bc, CF_OP_NOP); 4354 4355 ctx.bc->cf_last->end_of_program = 1; 4356 } 4357 4358 /* check GPR limit - we have 124 = 128 - 4 4359 * (4 are reserved as alu clause temporary registers) */ 4360 if (ctx.bc->ngpr > 124) { 4361 R600_ERR("GPR limit exceeded - shader requires %d registers\n", ctx.bc->ngpr); 4362 r = -ENOMEM; 4363 goto out_err; 4364 } 4365 4366 if (ctx.type == PIPE_SHADER_GEOMETRY) { 4367 if ((r = generate_gs_copy_shader(rctx, pipeshader, &so))) 4368 return r; 4369 } 4370 4371 free(ctx.spilled_arrays); 4372 free(ctx.array_infos); 4373 free(ctx.literals); 4374 tgsi_parse_free(&ctx.parse); 4375 return 0; 4376out_err: 4377 free(ctx.spilled_arrays); 4378 free(ctx.array_infos); 4379 free(ctx.literals); 4380 tgsi_parse_free(&ctx.parse); 4381 return r; 4382} 4383 4384static int tgsi_unsupported(struct r600_shader_ctx *ctx) 4385{ 4386 const unsigned tgsi_opcode = 4387 ctx->parse.FullToken.FullInstruction.Instruction.Opcode; 4388 R600_ERR("%s tgsi opcode unsupported\n", 4389 tgsi_get_opcode_name(tgsi_opcode)); 4390 return -EINVAL; 4391} 4392 4393static int tgsi_end(struct r600_shader_ctx *ctx UNUSED) 4394{ 4395 return 0; 4396} 4397 4398static void r600_bytecode_src(struct r600_bytecode_alu_src *bc_src, 4399 const struct r600_shader_src *shader_src, 4400 unsigned chan) 4401{ 4402 bc_src->sel = shader_src->sel; 4403 bc_src->chan = shader_src->swizzle[chan]; 4404 bc_src->neg = shader_src->neg; 4405 bc_src->abs = shader_src->abs; 4406 bc_src->rel = shader_src->rel; 4407 bc_src->value = shader_src->value[bc_src->chan]; 4408 bc_src->kc_bank = shader_src->kc_bank; 4409 bc_src->kc_rel = shader_src->kc_rel; 4410} 4411 4412static void r600_bytecode_src_set_abs(struct r600_bytecode_alu_src *bc_src) 4413{ 4414 bc_src->abs = 1; 4415 bc_src->neg = 0; 4416} 4417 4418static void r600_bytecode_src_toggle_neg(struct r600_bytecode_alu_src *bc_src) 4419{ 4420 bc_src->neg = !bc_src->neg; 4421} 4422 4423static void tgsi_dst(struct r600_shader_ctx *ctx, 4424 const struct tgsi_full_dst_register *tgsi_dst, 4425 unsigned swizzle, 4426 struct r600_bytecode_alu_dst *r600_dst) 4427{ 4428 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction; 4429 4430 if (tgsi_dst->Register.File == TGSI_FILE_TEMPORARY) { 4431 bool spilled; 4432 unsigned idx; 4433 4434 idx = map_tgsi_reg_index_to_r600_gpr(ctx, tgsi_dst->Register.Index, &spilled); 4435 4436 if (spilled) { 4437 struct r600_bytecode_output cf; 4438 int reg = 0; 4439 int r; 4440 bool add_pending_output = true; 4441 4442 memset(&cf, 0, sizeof(struct r600_bytecode_output)); 4443 get_spilled_array_base_and_size(ctx, tgsi_dst->Register.Index, 4444 &cf.array_base, &cf.array_size); 4445 4446 /* If no component has spilled, reserve a register 
and add the spill code 4447 * ctx->bc->n_pending_outputs is cleared after each instruction group */ 4448 if (ctx->bc->n_pending_outputs == 0) { 4449 reg = r600_get_temp(ctx); 4450 } else { 4451 /* If we are already spilling and the output address is the same as 4452 * before, then just reuse the same slot */ 4453 struct r600_bytecode_output *tmpl = &ctx->bc->pending_outputs[ctx->bc->n_pending_outputs-1]; 4454 if ((cf.array_base + idx == tmpl->array_base) || 4455 (cf.array_base == tmpl->array_base && 4456 tmpl->index_gpr == ctx->bc->ar_reg && 4457 tgsi_dst->Register.Indirect)) { 4458 reg = ctx->bc->pending_outputs[0].gpr; 4459 add_pending_output = false; 4460 } else { 4461 reg = r600_get_temp(ctx); 4462 } 4463 } 4464 4465 r600_dst->sel = reg; 4466 r600_dst->chan = swizzle; 4467 r600_dst->write = 1; 4468 if (inst->Instruction.Saturate) { 4469 r600_dst->clamp = 1; 4470 } 4471 4472 /* Add new outputs as pending */ 4473 if (add_pending_output) { 4474 cf.op = CF_OP_MEM_SCRATCH; 4475 cf.elem_size = 3; 4476 cf.gpr = reg; 4477 cf.type = r600_bytecode_write_export_ack_type(ctx->bc, tgsi_dst->Register.Indirect); 4478 cf.mark = 1; 4479 cf.comp_mask = inst->Dst[0].Register.WriteMask; 4480 cf.swizzle_x = 0; 4481 cf.swizzle_y = 1; 4482 cf.swizzle_z = 2; 4483 cf.swizzle_w = 3; 4484 cf.burst_count = 1; 4485 4486 if (tgsi_dst->Register.Indirect) { 4487 cf.index_gpr = ctx->bc->ar_reg; 4488 } else { 4489 cf.array_base += idx; 4490 cf.array_size = 0; 4491 } 4492 4493 r = r600_bytecode_add_pending_output(ctx->bc, &cf); 4494 if (r) 4495 return r; 4496 4497 r600_bytecode_add_ack(ctx->bc); 4498 } 4499 return; 4500 } 4501 else { 4502 r600_dst->sel = idx; 4503 } 4504 } 4505 else { 4506 r600_dst->sel = tgsi_dst->Register.Index; 4507 r600_dst->sel += ctx->file_offset[tgsi_dst->Register.File]; 4508 } 4509 r600_dst->chan = swizzle; 4510 r600_dst->write = 1; 4511 if (inst->Instruction.Saturate) { 4512 r600_dst->clamp = 1; 4513 } 4514 if (ctx->type == PIPE_SHADER_TESS_CTRL) { 4515 if (tgsi_dst->Register.File == TGSI_FILE_OUTPUT) { 4516 return; 4517 } 4518 } 4519 if (tgsi_dst->Register.Indirect) 4520 r600_dst->rel = V_SQ_REL_RELATIVE; 4521 4522} 4523 4524static int tgsi_op2_64_params(struct r600_shader_ctx *ctx, bool singledest, bool swap, int dest_temp, int op_override) 4525{ 4526 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction; 4527 unsigned write_mask = inst->Dst[0].Register.WriteMask; 4528 struct r600_bytecode_alu alu; 4529 int i, j, r, lasti = tgsi_last_instruction(write_mask); 4530 int use_tmp = 0; 4531 int swizzle_x = inst->Src[0].Register.SwizzleX; 4532 4533 if (singledest) { 4534 switch (write_mask) { 4535 case 0x1: 4536 if (swizzle_x == 2) { 4537 write_mask = 0xc; 4538 use_tmp = 3; 4539 } else 4540 write_mask = 0x3; 4541 break; 4542 case 0x2: 4543 if (swizzle_x == 2) { 4544 write_mask = 0xc; 4545 use_tmp = 3; 4546 } else { 4547 write_mask = 0x3; 4548 use_tmp = 1; 4549 } 4550 break; 4551 case 0x4: 4552 if (swizzle_x == 0) { 4553 write_mask = 0x3; 4554 use_tmp = 1; 4555 } else 4556 write_mask = 0xc; 4557 break; 4558 case 0x8: 4559 if (swizzle_x == 0) { 4560 write_mask = 0x3; 4561 use_tmp = 1; 4562 } else { 4563 write_mask = 0xc; 4564 use_tmp = 3; 4565 } 4566 break; 4567 } 4568 } 4569 4570 lasti = tgsi_last_instruction(write_mask); 4571 for (i = 0; i <= lasti; i++) { 4572 4573 if (!(write_mask & (1 << i))) 4574 continue; 4575 4576 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 4577 4578 if (singledest) { 4579 if (use_tmp || dest_temp) { 4580 alu.dst.sel = use_tmp ?
ctx->temp_reg : dest_temp; 4581 alu.dst.chan = i; 4582 alu.dst.write = 1; 4583 } else { 4584 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst); 4585 } 4586 if (i == 1 || i == 3) 4587 alu.dst.write = 0; 4588 } else 4589 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst); 4590 4591 alu.op = op_override ? op_override : ctx->inst_info->op; 4592 if (ctx->parse.FullToken.FullInstruction.Instruction.Opcode == TGSI_OPCODE_DABS) { 4593 r600_bytecode_src(&alu.src[0], &ctx->src[0], i); 4594 } else if (!swap) { 4595 for (j = 0; j < inst->Instruction.NumSrcRegs; j++) { 4596 r600_bytecode_src(&alu.src[j], &ctx->src[j], fp64_switch(i)); 4597 } 4598 } else { 4599 r600_bytecode_src(&alu.src[0], &ctx->src[1], fp64_switch(i)); 4600 r600_bytecode_src(&alu.src[1], &ctx->src[0], fp64_switch(i)); 4601 } 4602 4603 /* handle some special cases */ 4604 if (i == 1 || i == 3) { 4605 switch (ctx->parse.FullToken.FullInstruction.Instruction.Opcode) { 4606 case TGSI_OPCODE_DABS: 4607 r600_bytecode_src_set_abs(&alu.src[0]); 4608 break; 4609 default: 4610 break; 4611 } 4612 } 4613 if (i == lasti) { 4614 alu.last = 1; 4615 } 4616 r = r600_bytecode_add_alu(ctx->bc, &alu); 4617 if (r) 4618 return r; 4619 } 4620 4621 if (use_tmp) { 4622 write_mask = inst->Dst[0].Register.WriteMask; 4623 4624 lasti = tgsi_last_instruction(write_mask); 4625 /* move result from temp to dst */ 4626 for (i = 0; i <= lasti; i++) { 4627 if (!(write_mask & (1 << i))) 4628 continue; 4629 4630 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 4631 alu.op = ALU_OP1_MOV; 4632 4633 if (dest_temp) { 4634 alu.dst.sel = dest_temp; 4635 alu.dst.chan = i; 4636 alu.dst.write = 1; 4637 } else 4638 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst); 4639 alu.src[0].sel = ctx->temp_reg; 4640 alu.src[0].chan = use_tmp - 1; 4641 alu.last = (i == lasti); 4642 4643 r = r600_bytecode_add_alu(ctx->bc, &alu); 4644 if (r) 4645 return r; 4646 } 4647 } 4648 return 0; 4649} 4650 4651static int tgsi_op2_64(struct r600_shader_ctx *ctx) 4652{ 4653 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction; 4654 unsigned write_mask = inst->Dst[0].Register.WriteMask; 4655 /* validate the writemask: 64-bit ops must write whole .xy and/or .zw pairs */ 4656 if ((write_mask & 0x3) != 0x3 && 4657 (write_mask & 0xc) != 0xc) { 4658 fprintf(stderr, "illegal writemask for 64-bit: 0x%x\n", write_mask); 4659 return -1; 4660 } 4661 return tgsi_op2_64_params(ctx, false, false, 0, 0); 4662} 4663 4664static int tgsi_op2_64_single_dest(struct r600_shader_ctx *ctx) 4665{ 4666 return tgsi_op2_64_params(ctx, true, false, 0, 0); 4667} 4668 4669static int tgsi_op2_64_single_dest_s(struct r600_shader_ctx *ctx) 4670{ 4671 return tgsi_op2_64_params(ctx, true, true, 0, 0); 4672} 4673 4674static int tgsi_op3_64(struct r600_shader_ctx *ctx) 4675{ 4676 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction; 4677 struct r600_bytecode_alu alu; 4678 int i, j, r; 4679 int lasti = 3; 4680 int tmp = r600_get_temp(ctx); 4681 4682 for (i = 0; i < lasti + 1; i++) { 4683 4684 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 4685 alu.op = ctx->inst_info->op; 4686 for (j = 0; j < inst->Instruction.NumSrcRegs; j++) { 4687 r600_bytecode_src(&alu.src[j], &ctx->src[j], i == 3 ?
0 : 1); 4688 } 4689 4690 if (inst->Dst[0].Register.WriteMask & (1 << i)) 4691 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst); 4692 else 4693 alu.dst.sel = tmp; 4694 4695 alu.dst.chan = i; 4696 alu.is_op3 = 1; 4697 if (i == lasti) { 4698 alu.last = 1; 4699 } 4700 r = r600_bytecode_add_alu(ctx->bc, &alu); 4701 if (r) 4702 return r; 4703 } 4704 return 0; 4705} 4706 4707static int tgsi_op2_s(struct r600_shader_ctx *ctx, int swap, int trans_only) 4708{ 4709 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction; 4710 struct r600_bytecode_alu alu; 4711 unsigned write_mask = inst->Dst[0].Register.WriteMask; 4712 int i, j, r, lasti = tgsi_last_instruction(write_mask); 4713 /* use temp register if trans_only and more than one dst component */ 4714 int use_tmp = trans_only && (write_mask ^ (1 << lasti)); 4715 unsigned op = ctx->inst_info->op; 4716 4717 if (op == ALU_OP2_MUL_IEEE && 4718 ctx->info.properties[TGSI_PROPERTY_LEGACY_MATH_RULES]) 4719 op = ALU_OP2_MUL; 4720 4721 /* nir_to_tgsi lowers nir_op_isub to UADD + negate; since r600 doesn't support 4722 * source modifiers with integer ops, we switch back to SUB_INT */ 4723 bool src1_neg = ctx->src[1].neg; 4724 if (op == ALU_OP2_ADD_INT && src1_neg) { 4725 src1_neg = false; 4726 op = ALU_OP2_SUB_INT; 4727 } 4728 4729 for (i = 0; i <= lasti; i++) { 4730 if (!(write_mask & (1 << i))) 4731 continue; 4732 4733 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 4734 if (use_tmp) { 4735 alu.dst.sel = ctx->temp_reg; 4736 alu.dst.chan = i; 4737 alu.dst.write = 1; 4738 } else 4739 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst); 4740 4741 alu.op = op; 4742 if (!swap) { 4743 for (j = 0; j < inst->Instruction.NumSrcRegs; j++) { 4744 r600_bytecode_src(&alu.src[j], &ctx->src[j], i); 4745 } 4746 alu.src[1].neg = src1_neg; 4747 } else { 4748 r600_bytecode_src(&alu.src[0], &ctx->src[1], i); 4749 r600_bytecode_src(&alu.src[1], &ctx->src[0], i); 4750 } 4751 if (i == lasti || trans_only) { 4752 alu.last = 1; 4753 } 4754 r = r600_bytecode_add_alu(ctx->bc, &alu); 4755 if (r) 4756 return r; 4757 } 4758 4759 if (use_tmp) { 4760 /* move result from temp to dst */ 4761 for (i = 0; i <= lasti; i++) { 4762 if (!(write_mask & (1 << i))) 4763 continue; 4764 4765 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 4766 alu.op = ALU_OP1_MOV; 4767 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst); 4768 alu.src[0].sel = ctx->temp_reg; 4769 alu.src[0].chan = i; 4770 alu.last = (i == lasti); 4771 4772 r = r600_bytecode_add_alu(ctx->bc, &alu); 4773 if (r) 4774 return r; 4775 } 4776 } 4777 return 0; 4778} 4779 4780static int tgsi_op2(struct r600_shader_ctx *ctx) 4781{ 4782 return tgsi_op2_s(ctx, 0, 0); 4783} 4784 4785static int tgsi_op2_swap(struct r600_shader_ctx *ctx) 4786{ 4787 return tgsi_op2_s(ctx, 1, 0); 4788} 4789 4790static int tgsi_op2_trans(struct r600_shader_ctx *ctx) 4791{ 4792 return tgsi_op2_s(ctx, 0, 1); 4793} 4794 4795static int tgsi_ineg(struct r600_shader_ctx *ctx) 4796{ 4797 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction; 4798 struct r600_bytecode_alu alu; 4799 int i, r; 4800 int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask); 4801 4802 for (i = 0; i < lasti + 1; i++) { 4803 4804 if (!(inst->Dst[0].Register.WriteMask & (1 << i))) 4805 continue; 4806 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 4807 alu.op = ctx->inst_info->op; 4808 4809 alu.src[0].sel = V_SQ_ALU_SRC_0; 4810 4811 r600_bytecode_src(&alu.src[1], &ctx->src[0], i); 4812 4813 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst); 4814 4815 if (i == lasti) { 4816
alu.last = 1; 4817 } 4818 r = r600_bytecode_add_alu(ctx->bc, &alu); 4819 if (r) 4820 return r; 4821 } 4822 return 0; 4823 4824} 4825 4826static int tgsi_dneg(struct r600_shader_ctx *ctx) 4827{ 4828 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction; 4829 struct r600_bytecode_alu alu; 4830 int i, r; 4831 int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask); 4832 4833 for (i = 0; i < lasti + 1; i++) { 4834 4835 if (!(inst->Dst[0].Register.WriteMask & (1 << i))) 4836 continue; 4837 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 4838 alu.op = ALU_OP1_MOV; 4839 4840 r600_bytecode_src(&alu.src[0], &ctx->src[0], i); 4841 4842 if (i == 1 || i == 3) 4843 r600_bytecode_src_toggle_neg(&alu.src[0]); 4844 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst); 4845 4846 if (i == lasti) { 4847 alu.last = 1; 4848 } 4849 r = r600_bytecode_add_alu(ctx->bc, &alu); 4850 if (r) 4851 return r; 4852 } 4853 return 0; 4854 4855} 4856 4857static int tgsi_dfracexp(struct r600_shader_ctx *ctx) 4858{ 4859 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction; 4860 struct r600_bytecode_alu alu; 4861 unsigned write_mask = inst->Dst[0].Register.WriteMask; 4862 int i, j, r; 4863 4864 for (i = 0; i <= 3; i++) { 4865 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 4866 alu.op = ctx->inst_info->op; 4867 4868 alu.dst.sel = ctx->temp_reg; 4869 alu.dst.chan = i; 4870 alu.dst.write = 1; 4871 for (j = 0; j < inst->Instruction.NumSrcRegs; j++) { 4872 r600_bytecode_src(&alu.src[j], &ctx->src[j], fp64_switch(i)); 4873 } 4874 4875 if (i == 3) 4876 alu.last = 1; 4877 4878 r = r600_bytecode_add_alu(ctx->bc, &alu); 4879 if (r) 4880 return r; 4881 } 4882 4883 /* Replicate significand result across channels. */ 4884 for (i = 0; i <= 3; i++) { 4885 if (!(write_mask & (1 << i))) 4886 continue; 4887 4888 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 4889 alu.op = ALU_OP1_MOV; 4890 alu.src[0].chan = (i & 1) + 2; 4891 alu.src[0].sel = ctx->temp_reg; 4892 4893 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst); 4894 alu.dst.write = 1; 4895 alu.last = 1; 4896 r = r600_bytecode_add_alu(ctx->bc, &alu); 4897 if (r) 4898 return r; 4899 } 4900 4901 for (i = 0; i <= 3; i++) { 4902 if (inst->Dst[1].Register.WriteMask & (1 << i)) { 4903 /* MOV the exponent (temp channel 1) into the first enabled channel of dst1 */ 4904 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 4905 alu.op = ALU_OP1_MOV; 4906 alu.src[0].chan = 1; 4907 alu.src[0].sel = ctx->temp_reg; 4908 4909 tgsi_dst(ctx, &inst->Dst[1], i, &alu.dst); 4910 alu.last = 1; 4911 r = r600_bytecode_add_alu(ctx->bc, &alu); 4912 if (r) 4913 return r; 4914 break; 4915 } 4916 } 4917 return 0; 4918} 4919 4920 4921static int egcm_int_to_double(struct r600_shader_ctx *ctx) 4922{ 4923 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction; 4924 struct r600_bytecode_alu alu; 4925 int i, c, r; 4926 int write_mask = inst->Dst[0].Register.WriteMask; 4927 int temp_reg = r600_get_temp(ctx); 4928 4929 assert(inst->Instruction.Opcode == TGSI_OPCODE_I2D || 4930 inst->Instruction.Opcode == TGSI_OPCODE_U2D); 4931 4932 for (c = 0; c < 2; c++) { 4933 int dchan = c * 2; 4934 if (write_mask & (0x3 << dchan)) { 4935 /* split into 24-bit int and 8-bit int */ 4936 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 4937 alu.op = ALU_OP2_AND_INT; 4938 alu.dst.sel = temp_reg; 4939 alu.dst.chan = dchan; 4940 r600_bytecode_src(&alu.src[0], &ctx->src[0], c); 4941 alu.src[1].sel = V_SQ_ALU_SRC_LITERAL; 4942 alu.src[1].value = 0xffffff00; 4943 alu.dst.write = 1; 4944 r = r600_bytecode_add_alu(ctx->bc,
&alu); 4945 if (r) 4946 return r; 4947 4948 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 4949 alu.op = ALU_OP2_AND_INT; 4950 alu.dst.sel = temp_reg; 4951 alu.dst.chan = dchan + 1; 4952 r600_bytecode_src(&alu.src[0], &ctx->src[0], c); 4953 alu.src[1].sel = V_SQ_ALU_SRC_LITERAL; 4954 alu.src[1].value = 0xff; 4955 alu.dst.write = 1; 4956 alu.last = 1; 4957 r = r600_bytecode_add_alu(ctx->bc, &alu); 4958 if (r) 4959 return r; 4960 } 4961 } 4962 4963 for (c = 0; c < 2; c++) { 4964 int dchan = c * 2; 4965 if (write_mask & (0x3 << dchan)) { 4966 for (i = dchan; i <= dchan + 1; i++) { 4967 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 4968 alu.op = i == dchan ? ctx->inst_info->op : ALU_OP1_UINT_TO_FLT; 4969 4970 alu.src[0].sel = temp_reg; 4971 alu.src[0].chan = i; 4972 alu.dst.sel = temp_reg; 4973 alu.dst.chan = i; 4974 alu.dst.write = 1; 4975 if (ctx->bc->gfx_level == CAYMAN) 4976 alu.last = i == dchan + 1; 4977 else 4978 alu.last = 1; /* trans only ops on evergreen */ 4979 4980 r = r600_bytecode_add_alu(ctx->bc, &alu); 4981 if (r) 4982 return r; 4983 } 4984 } 4985 } 4986 4987 for (c = 0; c < 2; c++) { 4988 int dchan = c * 2; 4989 if (write_mask & (0x3 << dchan)) { 4990 for (i = 0; i < 4; i++) { 4991 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 4992 alu.op = ALU_OP1_FLT32_TO_FLT64; 4993 4994 alu.src[0].chan = dchan + (i / 2); 4995 if (i == 0 || i == 2) 4996 alu.src[0].sel = temp_reg; 4997 else { 4998 alu.src[0].sel = V_SQ_ALU_SRC_LITERAL; 4999 alu.src[0].value = 0x0; 5000 } 5001 alu.dst.sel = ctx->temp_reg; 5002 alu.dst.chan = i; 5003 alu.last = i == 3; 5004 alu.dst.write = 1; 5005 5006 r = r600_bytecode_add_alu(ctx->bc, &alu); 5007 if (r) 5008 return r; 5009 } 5010 5011 for (i = 0; i <= 1; i++) { 5012 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 5013 alu.op = ALU_OP2_ADD_64; 5014 5015 alu.src[0].chan = fp64_switch(i); 5016 alu.src[0].sel = ctx->temp_reg; 5017 5018 alu.src[1].chan = fp64_switch(i + 2); 5019 alu.src[1].sel = ctx->temp_reg; 5020 tgsi_dst(ctx, &inst->Dst[0], dchan + i, &alu.dst); 5021 alu.last = i == 1; 5022 5023 r = r600_bytecode_add_alu(ctx->bc, &alu); 5024 if (r) 5025 return r; 5026 } 5027 } 5028 } 5029 5030 return 0; 5031} 5032 5033static int egcm_double_to_int(struct r600_shader_ctx *ctx) 5034{ 5035 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction; 5036 struct r600_bytecode_alu alu; 5037 int i, r; 5038 int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask); 5039 int treg = r600_get_temp(ctx); 5040 assert(inst->Instruction.Opcode == TGSI_OPCODE_D2I || 5041 inst->Instruction.Opcode == TGSI_OPCODE_D2U); 5042 5043 /* do a 64->32 into a temp register */ 5044 r = tgsi_op2_64_params(ctx, true, false, treg, ALU_OP1_FLT64_TO_FLT32); 5045 if (r) 5046 return r; 5047 5048 for (i = 0; i <= lasti; i++) { 5049 if (!(inst->Dst[0].Register.WriteMask & (1 << i))) 5050 continue; 5051 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 5052 alu.op = ctx->inst_info->op; 5053 5054 alu.src[0].chan = i; 5055 alu.src[0].sel = treg; 5056 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst); 5057 alu.last = (i == lasti); 5058 5059 r = r600_bytecode_add_alu(ctx->bc, &alu); 5060 if (r) 5061 return r; 5062 } 5063 5064 return 0; 5065} 5066 5067static int cayman_emit_unary_double_raw(struct r600_bytecode *bc, 5068 unsigned op, 5069 int dst_reg, 5070 struct r600_shader_src *src, 5071 bool abs) 5072{ 5073 struct r600_bytecode_alu alu; 5074 const int last_slot = 3; 5075 int r; 5076 5077 /* these have to write the result to X/Y by the looks of it */ 5078 for (int i 
= 0 ; i < last_slot; i++) { 5079 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 5080 alu.op = op; 5081 5082 r600_bytecode_src(&alu.src[0], src, 1); 5083 r600_bytecode_src(&alu.src[1], src, 0); 5084 5085 if (abs) 5086 r600_bytecode_src_set_abs(&alu.src[1]); 5087 5088 alu.dst.sel = dst_reg; 5089 alu.dst.chan = i; 5090 alu.dst.write = (i == 0 || i == 1); 5091 5092 if (bc->gfx_level != CAYMAN || i == last_slot - 1) 5093 alu.last = 1; 5094 r = r600_bytecode_add_alu(bc, &alu); 5095 if (r) 5096 return r; 5097 } 5098 5099 return 0; 5100} 5101 5102static int cayman_emit_double_instr(struct r600_shader_ctx *ctx) 5103{ 5104 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction; 5105 int i, r; 5106 struct r600_bytecode_alu alu; 5107 int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask); 5108 int t1 = ctx->temp_reg; 5109 5110 /* there should only be one src reg */ 5111 assert(inst->Instruction.NumSrcRegs == 1); 5112 5113 /* only support one double at a time */ 5114 assert(inst->Dst[0].Register.WriteMask == TGSI_WRITEMASK_XY || 5115 inst->Dst[0].Register.WriteMask == TGSI_WRITEMASK_ZW); 5116 5117 r = cayman_emit_unary_double_raw( 5118 ctx->bc, ctx->inst_info->op, t1, 5119 &ctx->src[0], 5120 ctx->parse.FullToken.FullInstruction.Instruction.Opcode == TGSI_OPCODE_DRSQ || 5121 ctx->parse.FullToken.FullInstruction.Instruction.Opcode == TGSI_OPCODE_DSQRT); 5122 if (r) 5123 return r; 5124 5125 for (i = 0 ; i <= lasti; i++) { 5126 if (!(inst->Dst[0].Register.WriteMask & (1 << i))) 5127 continue; 5128 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 5129 alu.op = ALU_OP1_MOV; 5130 alu.src[0].sel = t1; 5131 alu.src[0].chan = (i == 0 || i == 2) ? 0 : 1; 5132 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst); 5133 alu.dst.write = 1; 5134 if (i == lasti) 5135 alu.last = 1; 5136 r = r600_bytecode_add_alu(ctx->bc, &alu); 5137 if (r) 5138 return r; 5139 } 5140 return 0; 5141} 5142 5143static int cayman_emit_float_instr(struct r600_shader_ctx *ctx) 5144{ 5145 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction; 5146 int i, j, r; 5147 struct r600_bytecode_alu alu; 5148 int last_slot = (inst->Dst[0].Register.WriteMask & 0x8) ?
4 : 3; 5149 5150 for (i = 0 ; i < last_slot; i++) { 5151 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 5152 alu.op = ctx->inst_info->op; 5153 for (j = 0; j < inst->Instruction.NumSrcRegs; j++) { 5154 r600_bytecode_src(&alu.src[j], &ctx->src[j], 0); 5155 5156 /* RSQ should take the absolute value of src */ 5157 if (inst->Instruction.Opcode == TGSI_OPCODE_RSQ) { 5158 r600_bytecode_src_set_abs(&alu.src[j]); 5159 } 5160 } 5161 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst); 5162 alu.dst.write = (inst->Dst[0].Register.WriteMask >> i) & 1; 5163 5164 if (i == last_slot - 1) 5165 alu.last = 1; 5166 r = r600_bytecode_add_alu(ctx->bc, &alu); 5167 if (r) 5168 return r; 5169 } 5170 return 0; 5171} 5172 5173static int cayman_mul_int_instr(struct r600_shader_ctx *ctx) 5174{ 5175 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction; 5176 int i, j, k, r; 5177 struct r600_bytecode_alu alu; 5178 int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask); 5179 int t1 = ctx->temp_reg; 5180 5181 for (k = 0; k <= lasti; k++) { 5182 if (!(inst->Dst[0].Register.WriteMask & (1 << k))) 5183 continue; 5184 5185 for (i = 0 ; i < 4; i++) { 5186 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 5187 alu.op = ctx->inst_info->op; 5188 for (j = 0; j < inst->Instruction.NumSrcRegs; j++) { 5189 r600_bytecode_src(&alu.src[j], &ctx->src[j], k); 5190 } 5191 alu.dst.sel = t1; 5192 alu.dst.chan = i; 5193 alu.dst.write = (i == k); 5194 if (i == 3) 5195 alu.last = 1; 5196 r = r600_bytecode_add_alu(ctx->bc, &alu); 5197 if (r) 5198 return r; 5199 } 5200 } 5201 5202 for (i = 0 ; i <= lasti; i++) { 5203 if (!(inst->Dst[0].Register.WriteMask & (1 << i))) 5204 continue; 5205 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 5206 alu.op = ALU_OP1_MOV; 5207 alu.src[0].sel = t1; 5208 alu.src[0].chan = i; 5209 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst); 5210 alu.dst.write = 1; 5211 if (i == lasti) 5212 alu.last = 1; 5213 r = r600_bytecode_add_alu(ctx->bc, &alu); 5214 if (r) 5215 return r; 5216 } 5217 5218 return 0; 5219} 5220 5221 5222static int cayman_mul_double_instr(struct r600_shader_ctx *ctx) 5223{ 5224 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction; 5225 int i, j, k, r; 5226 struct r600_bytecode_alu alu; 5227 int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask); 5228 int t1 = ctx->temp_reg; 5229 5230 /* t1 would get overwritten below if we actually tried to 5231 * multiply two pairs of doubles at a time. */ 5232 assert(inst->Dst[0].Register.WriteMask == TGSI_WRITEMASK_XY || 5233 inst->Dst[0].Register.WriteMask == TGSI_WRITEMASK_ZW); 5234 5235 k = inst->Dst[0].Register.WriteMask == TGSI_WRITEMASK_XY ? 0 : 1; 5236 5237 for (i = 0; i < 4; i++) { 5238 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 5239 alu.op = ctx->inst_info->op; 5240 for (j = 0; j < inst->Instruction.NumSrcRegs; j++) { 5241 r600_bytecode_src(&alu.src[j], &ctx->src[j], k * 2 + ((i == 3) ? 
0 : 1)); 5242 } 5243 alu.dst.sel = t1; 5244 alu.dst.chan = i; 5245 alu.dst.write = 1; 5246 if (i == 3) 5247 alu.last = 1; 5248 r = r600_bytecode_add_alu(ctx->bc, &alu); 5249 if (r) 5250 return r; 5251 } 5252 5253 for (i = 0; i <= lasti; i++) { 5254 if (!(inst->Dst[0].Register.WriteMask & (1 << i))) 5255 continue; 5256 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 5257 alu.op = ALU_OP1_MOV; 5258 alu.src[0].sel = t1; 5259 alu.src[0].chan = i; 5260 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst); 5261 alu.dst.write = 1; 5262 if (i == lasti) 5263 alu.last = 1; 5264 r = r600_bytecode_add_alu(ctx->bc, &alu); 5265 if (r) 5266 return r; 5267 } 5268 5269 return 0; 5270} 5271 5272/* 5273 * Emit RECIP_64 + MUL_64 to implement division. 5274 */ 5275static int cayman_ddiv_instr(struct r600_shader_ctx *ctx) 5276{ 5277 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction; 5278 int r; 5279 struct r600_bytecode_alu alu; 5280 int t1 = ctx->temp_reg; 5281 int k; 5282 5283 /* Only support one double at a time. This is the same constraint as 5284 * in DMUL lowering. */ 5285 assert(inst->Dst[0].Register.WriteMask == TGSI_WRITEMASK_XY || 5286 inst->Dst[0].Register.WriteMask == TGSI_WRITEMASK_ZW); 5287 5288 k = inst->Dst[0].Register.WriteMask == TGSI_WRITEMASK_XY ? 0 : 1; 5289 5290 r = cayman_emit_unary_double_raw(ctx->bc, ALU_OP2_RECIP_64, t1, &ctx->src[1], false); 5291 if (r) 5292 return r; 5293 5294 for (int i = 0; i < 4; i++) { 5295 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 5296 alu.op = ALU_OP2_MUL_64; 5297 5298 r600_bytecode_src(&alu.src[0], &ctx->src[0], k * 2 + ((i == 3) ? 0 : 1)); 5299 5300 alu.src[1].sel = t1; 5301 alu.src[1].chan = (i == 3) ? 0 : 1; 5302 5303 alu.dst.sel = t1; 5304 alu.dst.chan = i; 5305 alu.dst.write = 1; 5306 if (i == 3) 5307 alu.last = 1; 5308 r = r600_bytecode_add_alu(ctx->bc, &alu); 5309 if (r) 5310 return r; 5311 } 5312 5313 for (int i = 0; i < 2; i++) { 5314 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 5315 alu.op = ALU_OP1_MOV; 5316 alu.src[0].sel = t1; 5317 alu.src[0].chan = i; 5318 tgsi_dst(ctx, &inst->Dst[0], k * 2 + i, &alu.dst); 5319 alu.dst.write = 1; 5320 if (i == 1) 5321 alu.last = 1; 5322 r = r600_bytecode_add_alu(ctx->bc, &alu); 5323 if (r) 5324 return r; 5325 } 5326 return 0; 5327} 5328 5329/* 5330 * r600 - trunc to -PI..PI range 5331 * r700 - normalize by dividing by 2PI 5332 * see fdo bug 27901 5333 */ 5334static int tgsi_setup_trig(struct r600_shader_ctx *ctx) 5335{ 5336 int r; 5337 struct r600_bytecode_alu alu; 5338 5339 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 5340 alu.op = ALU_OP3_MULADD; 5341 alu.is_op3 = 1; 5342 5343 alu.dst.chan = 0; 5344 alu.dst.sel = ctx->temp_reg; 5345 alu.dst.write = 1; 5346 5347 r600_bytecode_src(&alu.src[0], &ctx->src[0], 0); 5348 5349 alu.src[1].sel = V_SQ_ALU_SRC_LITERAL; 5350 alu.src[1].chan = 0; 5351 alu.src[1].value = u_bitcast_f2u(0.5f * M_1_PI); 5352 alu.src[2].sel = V_SQ_ALU_SRC_0_5; 5353 alu.src[2].chan = 0; 5354 alu.last = 1; 5355 r = r600_bytecode_add_alu(ctx->bc, &alu); 5356 if (r) 5357 return r; 5358 5359 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 5360 alu.op = ALU_OP1_FRACT; 5361 5362 alu.dst.chan = 0; 5363 alu.dst.sel = ctx->temp_reg; 5364 alu.dst.write = 1; 5365 5366 alu.src[0].sel = ctx->temp_reg; 5367 alu.src[0].chan = 0; 5368 alu.last = 1; 5369 r = r600_bytecode_add_alu(ctx->bc, &alu); 5370 if (r) 5371 return r; 5372 5373 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 5374 alu.op = ALU_OP3_MULADD; 5375 alu.is_op3 = 1; 5376 5377 alu.dst.chan = 0; 5378 
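 /* This MULADD maps the 0..1 fraction back to what the hw trig ops expect: r600 wants radians in -PI..PI (scale by 2*PI, bias -PI), while later parts take the normalized value directly (scale 1, bias -0.5). */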
alu.dst.sel = ctx->temp_reg; 5379 alu.dst.write = 1; 5380 5381 alu.src[0].sel = ctx->temp_reg; 5382 alu.src[0].chan = 0; 5383 5384 alu.src[1].sel = V_SQ_ALU_SRC_LITERAL; 5385 alu.src[1].chan = 0; 5386 alu.src[2].sel = V_SQ_ALU_SRC_LITERAL; 5387 alu.src[2].chan = 0; 5388 5389 if (ctx->bc->gfx_level == R600) { 5390 alu.src[1].value = u_bitcast_f2u(2.0f * M_PI); 5391 alu.src[2].value = u_bitcast_f2u(-M_PI); 5392 } else { 5393 alu.src[1].sel = V_SQ_ALU_SRC_1; 5394 alu.src[2].sel = V_SQ_ALU_SRC_0_5; 5395 alu.src[2].neg = 1; 5396 } 5397 5398 alu.last = 1; 5399 r = r600_bytecode_add_alu(ctx->bc, &alu); 5400 if (r) 5401 return r; 5402 return 0; 5403} 5404 5405static int cayman_trig(struct r600_shader_ctx *ctx) 5406{ 5407 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction; 5408 struct r600_bytecode_alu alu; 5409 int last_slot = (inst->Dst[0].Register.WriteMask & 0x8) ? 4 : 3; 5410 int i, r; 5411 5412 r = tgsi_setup_trig(ctx); 5413 if (r) 5414 return r; 5415 5416 5417 for (i = 0; i < last_slot; i++) { 5418 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 5419 alu.op = ctx->inst_info->op; 5420 alu.dst.chan = i; 5421 5422 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst); 5423 alu.dst.write = (inst->Dst[0].Register.WriteMask >> i) & 1; 5424 5425 alu.src[0].sel = ctx->temp_reg; 5426 alu.src[0].chan = 0; 5427 if (i == last_slot - 1) 5428 alu.last = 1; 5429 r = r600_bytecode_add_alu(ctx->bc, &alu); 5430 if (r) 5431 return r; 5432 } 5433 return 0; 5434} 5435 5436static int tgsi_trig(struct r600_shader_ctx *ctx) 5437{ 5438 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction; 5439 struct r600_bytecode_alu alu; 5440 int i, r; 5441 int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask); 5442 5443 r = tgsi_setup_trig(ctx); 5444 if (r) 5445 return r; 5446 5447 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 5448 alu.op = ctx->inst_info->op; 5449 alu.dst.chan = 0; 5450 alu.dst.sel = ctx->temp_reg; 5451 alu.dst.write = 1; 5452 5453 alu.src[0].sel = ctx->temp_reg; 5454 alu.src[0].chan = 0; 5455 alu.last = 1; 5456 r = r600_bytecode_add_alu(ctx->bc, &alu); 5457 if (r) 5458 return r; 5459 5460 /* replicate result */ 5461 for (i = 0; i < lasti + 1; i++) { 5462 if (!(inst->Dst[0].Register.WriteMask & (1 << i))) 5463 continue; 5464 5465 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 5466 alu.op = ALU_OP1_MOV; 5467 5468 alu.src[0].sel = ctx->temp_reg; 5469 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst); 5470 if (i == lasti) 5471 alu.last = 1; 5472 r = r600_bytecode_add_alu(ctx->bc, &alu); 5473 if (r) 5474 return r; 5475 } 5476 return 0; 5477} 5478 5479static int tgsi_kill(struct r600_shader_ctx *ctx) 5480{ 5481 const struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction; 5482 struct r600_bytecode_alu alu; 5483 int i, r; 5484 5485 for (i = 0; i < 4; i++) { 5486 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 5487 alu.op = ctx->inst_info->op; 5488 5489 alu.dst.chan = i; 5490 5491 alu.src[0].sel = V_SQ_ALU_SRC_0; 5492 5493 if (inst->Instruction.Opcode == TGSI_OPCODE_KILL) { 5494 alu.src[1].sel = V_SQ_ALU_SRC_1; 5495 alu.src[1].neg = 1; 5496 } else { 5497 r600_bytecode_src(&alu.src[1], &ctx->src[0], i); 5498 } 5499 if (i == 3) { 5500 alu.last = 1; 5501 } 5502 r = r600_bytecode_add_alu(ctx->bc, &alu); 5503 if (r) 5504 return r; 5505 } 5506 5507 /* kill must be last in ALU */ 5508 ctx->bc->force_add_cf = 1; 5509 ctx->shader->uses_kill = TRUE; 5510 return 0; 5511} 5512 5513static int tgsi_lit(struct r600_shader_ctx *ctx) 5514{ 5515 struct 
tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction; 5516 struct r600_bytecode_alu alu; 5517 int r; 5518 5519 /* tmp.x = max(src.y, 0.0) */ 5520 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 5521 alu.op = ALU_OP2_MAX; 5522 r600_bytecode_src(&alu.src[0], &ctx->src[0], 1); 5523 alu.src[1].sel = V_SQ_ALU_SRC_0; /*0.0*/ 5524 alu.src[1].chan = 1; 5525 5526 alu.dst.sel = ctx->temp_reg; 5527 alu.dst.chan = 0; 5528 alu.dst.write = 1; 5529 5530 alu.last = 1; 5531 r = r600_bytecode_add_alu(ctx->bc, &alu); 5532 if (r) 5533 return r; 5534 5535 if (inst->Dst[0].Register.WriteMask & (1 << 2)) 5536 { 5537 int chan; 5538 int sel; 5539 unsigned i; 5540 5541 if (ctx->bc->gfx_level == CAYMAN) { 5542 for (i = 0; i < 3; i++) { 5543 /* tmp.z = log(tmp.x) */ 5544 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 5545 alu.op = ALU_OP1_LOG_CLAMPED; 5546 alu.src[0].sel = ctx->temp_reg; 5547 alu.src[0].chan = 0; 5548 alu.dst.sel = ctx->temp_reg; 5549 alu.dst.chan = i; 5550 if (i == 2) { 5551 alu.dst.write = 1; 5552 alu.last = 1; 5553 } else 5554 alu.dst.write = 0; 5555 5556 r = r600_bytecode_add_alu(ctx->bc, &alu); 5557 if (r) 5558 return r; 5559 } 5560 } else { 5561 /* tmp.z = log(tmp.x) */ 5562 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 5563 alu.op = ALU_OP1_LOG_CLAMPED; 5564 alu.src[0].sel = ctx->temp_reg; 5565 alu.src[0].chan = 0; 5566 alu.dst.sel = ctx->temp_reg; 5567 alu.dst.chan = 2; 5568 alu.dst.write = 1; 5569 alu.last = 1; 5570 r = r600_bytecode_add_alu(ctx->bc, &alu); 5571 if (r) 5572 return r; 5573 } 5574 5575 chan = alu.dst.chan; 5576 sel = alu.dst.sel; 5577 5578 /* tmp.x = MUL_LIT(tmp.z, src.w, src.x) */ 5579 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 5580 alu.op = ALU_OP3_MUL_LIT; 5581 alu.src[0].sel = sel; 5582 alu.src[0].chan = chan; 5583 r600_bytecode_src(&alu.src[1], &ctx->src[0], 3); 5584 r600_bytecode_src(&alu.src[2], &ctx->src[0], 0); 5585 alu.dst.sel = ctx->temp_reg; 5586 alu.dst.chan = 0; 5587 alu.dst.write = 1; 5588 alu.is_op3 = 1; 5589 alu.last = 1; 5590 r = r600_bytecode_add_alu(ctx->bc, &alu); 5591 if (r) 5592 return r; 5593 5594 if (ctx->bc->gfx_level == CAYMAN) { 5595 for (i = 0; i < 3; i++) { 5596 /* dst.z = exp(tmp.x) */ 5597 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 5598 alu.op = ALU_OP1_EXP_IEEE; 5599 alu.src[0].sel = ctx->temp_reg; 5600 alu.src[0].chan = 0; 5601 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst); 5602 if (i == 2) { 5603 alu.dst.write = 1; 5604 alu.last = 1; 5605 } else 5606 alu.dst.write = 0; 5607 r = r600_bytecode_add_alu(ctx->bc, &alu); 5608 if (r) 5609 return r; 5610 } 5611 } else { 5612 /* dst.z = exp(tmp.x) */ 5613 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 5614 alu.op = ALU_OP1_EXP_IEEE; 5615 alu.src[0].sel = ctx->temp_reg; 5616 alu.src[0].chan = 0; 5617 tgsi_dst(ctx, &inst->Dst[0], 2, &alu.dst); 5618 alu.last = 1; 5619 r = r600_bytecode_add_alu(ctx->bc, &alu); 5620 if (r) 5621 return r; 5622 } 5623 } 5624 5625 /* dst.x <- 1.0 */ 5626 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 5627 alu.op = ALU_OP1_MOV; 5628 alu.src[0].sel = V_SQ_ALU_SRC_1; /*1.0*/ 5629 alu.src[0].chan = 0; 5630 tgsi_dst(ctx, &inst->Dst[0], 0, &alu.dst); 5631 alu.dst.write = (inst->Dst[0].Register.WriteMask >> 0) & 1; 5632 r = r600_bytecode_add_alu(ctx->bc, &alu); 5633 if (r) 5634 return r; 5635 5636 /* dst.y = max(src.x, 0.0) */ 5637 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 5638 alu.op = ALU_OP2_MAX; 5639 r600_bytecode_src(&alu.src[0], &ctx->src[0], 0); 5640 alu.src[1].sel = V_SQ_ALU_SRC_0; /*0.0*/ 5641 alu.src[1].chan =
0; 5642 tgsi_dst(ctx, &inst->Dst[0], 1, &alu.dst); 5643 alu.dst.write = (inst->Dst[0].Register.WriteMask >> 1) & 1; 5644 r = r600_bytecode_add_alu(ctx->bc, &alu); 5645 if (r) 5646 return r; 5647 5648 /* dst.w <- 1.0 */ 5649 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 5650 alu.op = ALU_OP1_MOV; 5651 alu.src[0].sel = V_SQ_ALU_SRC_1; 5652 alu.src[0].chan = 0; 5653 tgsi_dst(ctx, &inst->Dst[0], 3, &alu.dst); 5654 alu.dst.write = (inst->Dst[0].Register.WriteMask >> 3) & 1; 5655 alu.last = 1; 5656 r = r600_bytecode_add_alu(ctx->bc, &alu); 5657 if (r) 5658 return r; 5659 5660 return 0; 5661} 5662 5663static int tgsi_rsq(struct r600_shader_ctx *ctx) 5664{ 5665 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction; 5666 struct r600_bytecode_alu alu; 5667 int i, r; 5668 5669 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 5670 5671 alu.op = ALU_OP1_RECIPSQRT_IEEE; 5672 5673 for (i = 0; i < inst->Instruction.NumSrcRegs; i++) { 5674 r600_bytecode_src(&alu.src[i], &ctx->src[i], 0); 5675 r600_bytecode_src_set_abs(&alu.src[i]); 5676 } 5677 alu.dst.sel = ctx->temp_reg; 5678 alu.dst.write = 1; 5679 alu.last = 1; 5680 r = r600_bytecode_add_alu(ctx->bc, &alu); 5681 if (r) 5682 return r; 5683 /* replicate result */ 5684 return tgsi_helper_tempx_replicate(ctx); 5685} 5686 5687static int tgsi_helper_tempx_replicate(struct r600_shader_ctx *ctx) 5688{ 5689 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction; 5690 struct r600_bytecode_alu alu; 5691 int i, r; 5692 5693 for (i = 0; i < 4; i++) { 5694 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 5695 alu.src[0].sel = ctx->temp_reg; 5696 alu.op = ALU_OP1_MOV; 5697 alu.dst.chan = i; 5698 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst); 5699 alu.dst.write = (inst->Dst[0].Register.WriteMask >> i) & 1; 5700 if (i == 3) 5701 alu.last = 1; 5702 r = r600_bytecode_add_alu(ctx->bc, &alu); 5703 if (r) 5704 return r; 5705 } 5706 return 0; 5707} 5708 5709static int tgsi_trans_srcx_replicate(struct r600_shader_ctx *ctx) 5710{ 5711 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction; 5712 struct r600_bytecode_alu alu; 5713 int i, r; 5714 5715 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 5716 alu.op = ctx->inst_info->op; 5717 for (i = 0; i < inst->Instruction.NumSrcRegs; i++) { 5718 r600_bytecode_src(&alu.src[i], &ctx->src[i], 0); 5719 } 5720 alu.dst.sel = ctx->temp_reg; 5721 alu.dst.write = 1; 5722 alu.last = 1; 5723 r = r600_bytecode_add_alu(ctx->bc, &alu); 5724 if (r) 5725 return r; 5726 /* replicate result */ 5727 return tgsi_helper_tempx_replicate(ctx); 5728} 5729 5730static int cayman_pow(struct r600_shader_ctx *ctx) 5731{ 5732 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction; 5733 int i, r; 5734 struct r600_bytecode_alu alu; 5735 int last_slot = (inst->Dst[0].Register.WriteMask & 0x8) ?

static int cayman_pow(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	int i, r;
	struct r600_bytecode_alu alu;
	int last_slot = (inst->Dst[0].Register.WriteMask & 0x8) ? 4 : 3;

	for (i = 0; i < 3; i++) {
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP1_LOG_IEEE;
		r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
		alu.dst.sel = ctx->temp_reg;
		alu.dst.chan = i;
		alu.dst.write = 1;
		if (i == 2)
			alu.last = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}

	/* b * LOG2(a) */
	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
	alu.op = ALU_OP2_MUL;
	r600_bytecode_src(&alu.src[0], &ctx->src[1], 0);
	alu.src[1].sel = ctx->temp_reg;
	alu.dst.sel = ctx->temp_reg;
	alu.dst.write = 1;
	alu.last = 1;
	r = r600_bytecode_add_alu(ctx->bc, &alu);
	if (r)
		return r;

	for (i = 0; i < last_slot; i++) {
		/* POW(a,b) = EXP2(b * LOG2(a)) */
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP1_EXP_IEEE;
		alu.src[0].sel = ctx->temp_reg;

		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
		alu.dst.write = (inst->Dst[0].Register.WriteMask >> i) & 1;
		if (i == last_slot - 1)
			alu.last = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}
	return 0;
}

static int tgsi_pow(struct r600_shader_ctx *ctx)
{
	struct r600_bytecode_alu alu;
	int r;

	/* LOG2(a) */
	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
	alu.op = ALU_OP1_LOG_IEEE;
	r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
	alu.dst.sel = ctx->temp_reg;
	alu.dst.write = 1;
	alu.last = 1;
	r = r600_bytecode_add_alu(ctx->bc, &alu);
	if (r)
		return r;
	/* b * LOG2(a) */
	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
	alu.op = ALU_OP2_MUL;
	r600_bytecode_src(&alu.src[0], &ctx->src[1], 0);
	alu.src[1].sel = ctx->temp_reg;
	alu.dst.sel = ctx->temp_reg;
	alu.dst.write = 1;
	alu.last = 1;
	r = r600_bytecode_add_alu(ctx->bc, &alu);
	if (r)
		return r;
	/* POW(a,b) = EXP2(b * LOG2(a)) */
	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
	alu.op = ALU_OP1_EXP_IEEE;
	alu.src[0].sel = ctx->temp_reg;
	alu.dst.sel = ctx->temp_reg;
	alu.dst.write = 1;
	alu.last = 1;
	r = r600_bytecode_add_alu(ctx->bc, &alu);
	if (r)
		return r;
	return tgsi_helper_tempx_replicate(ctx);
}

static int emit_mul_int_op(struct r600_bytecode *bc,
			   struct r600_bytecode_alu *alu_src)
{
	struct r600_bytecode_alu alu;
	int i, r;
	alu = *alu_src;
	if (bc->gfx_level == CAYMAN) {
		for (i = 0; i < 4; i++) {
			alu.dst.chan = i;
			alu.dst.write = (i == alu_src->dst.chan);
			alu.last = (i == 3);

			r = r600_bytecode_add_alu(bc, &alu);
			if (r)
				return r;
		}
	} else {
		alu.last = 1;
		r = r600_bytecode_add_alu(bc, &alu);
		if (r)
			return r;
	}
	return 0;
}
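
/* Illustration only (added comment, not part of the original driver code):
 * the unsigned path emitted by tgsi_divmod() mirrors this plain-C sketch,
 * where mulhi()/mullo() stand for the high/low 32 bits of a 32x32-bit
 * product (MULHI_UINT/MULLO_UINT) and recip_uint() for RECIP_UINT:
 *
 *   uint32_t rcp = recip_uint(d);            // ~2^32/d, slightly off
 *   uint32_t delta = mulhi(rcp, d) == 0 ?    // |rcp*d - 2^32|
 *                    -mullo(rcp, d) : mullo(rcp, d);
 *   uint32_t e = mulhi(delta, rcp);          // error of the estimate
 *   rcp = mulhi(rcp, d) == 0 ? rcp + e : rcp - e;
 *   uint32_t q = mulhi(rcp, n);              // candidate quotient
 *   uint32_t r = n - mullo(q, d);            // candidate remainder
 *   // q (or r, for MOD) is then corrected by +/-1 (or +/-d) using the
 *   // comparisons described in steps 13-19 of the comment below.
 */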

static int tgsi_divmod(struct r600_shader_ctx *ctx, int mod, int signed_op)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	struct r600_bytecode_alu alu;
	int i, r, j;
	unsigned write_mask = inst->Dst[0].Register.WriteMask;
	int lasti = tgsi_last_instruction(write_mask);
	int tmp0 = ctx->temp_reg;
	int tmp1 = r600_get_temp(ctx);
	int tmp2 = r600_get_temp(ctx);
	int tmp3 = r600_get_temp(ctx);
	int tmp4 = 0;

	/* Use an additional temp if the dst register and a src register are the same */
	if (inst->Src[0].Register.Index == inst->Dst[0].Register.Index ||
	    inst->Src[1].Register.Index == inst->Dst[0].Register.Index) {
		tmp4 = r600_get_temp(ctx);
	}

	/* Unsigned path:
	 *
	 * we need to represent src1 as src2*q + r, where q is the quotient and r the remainder
	 *
	 * 1. tmp0.x = rcp (src2) = 2^32/src2 + e, where e is rounding error
	 * 2. tmp0.z = lo (tmp0.x * src2)
	 * 3. tmp0.w = -tmp0.z
	 * 4. tmp0.y = hi (tmp0.x * src2)
	 * 5. tmp0.z = (tmp0.y == 0 ? tmp0.w : tmp0.z) = abs(lo(rcp*src2))
	 * 6. tmp0.w = hi (tmp0.z * tmp0.x) = e, rounding error
	 * 7. tmp1.x = tmp0.x - tmp0.w
	 * 8. tmp1.y = tmp0.x + tmp0.w
	 * 9. tmp0.x = (tmp0.y == 0 ? tmp1.y : tmp1.x)
	 * 10. tmp0.z = hi(tmp0.x * src1) = q
	 * 11. tmp0.y = lo (tmp0.z * src2) = src2*q = src1 - r
	 *
	 * 12. tmp0.w = src1 - tmp0.y = r
	 * 13. tmp1.x = tmp0.w >= src2 = r >= src2 (uint comparison)
	 * 14. tmp1.y = src1 >= tmp0.y = r >= 0 (uint comparison)
	 *
	 * if DIV
	 *
	 * 15. tmp1.z = tmp0.z + 1 = q + 1
	 * 16. tmp1.w = tmp0.z - 1 = q - 1
	 *
	 * else MOD
	 *
	 * 15. tmp1.z = tmp0.w - src2 = r - src2
	 * 16. tmp1.w = tmp0.w + src2 = r + src2
	 *
	 * endif
	 *
	 * 17. tmp1.x = tmp1.x & tmp1.y
	 *
	 * DIV: 18. tmp0.z = tmp1.x==0 ? tmp0.z : tmp1.z
	 * MOD: 18. tmp0.z = tmp1.x==0 ? tmp0.w : tmp1.z
	 *
	 * 19. tmp0.z = tmp1.y==0 ? tmp1.w : tmp0.z
	 * 20. dst = src2==0 ? MAX_UINT : tmp0.z
	 *
	 * Signed path:
	 *
	 * Same as unsigned, using abs values of the operands,
	 * and fixing the sign of the result in the end.
	 */

	for (i = 0; i < 4; i++) {
		if (!(write_mask & (1<<i)))
			continue;

		if (signed_op) {

			/* tmp2.x = -src0 */
			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
			alu.op = ALU_OP2_SUB_INT;

			alu.dst.sel = tmp2;
			alu.dst.chan = 0;
			alu.dst.write = 1;

			alu.src[0].sel = V_SQ_ALU_SRC_0;

			r600_bytecode_src(&alu.src[1], &ctx->src[0], i);

			alu.last = 1;
			if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
				return r;

			/* tmp2.y = -src1 */
			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
			alu.op = ALU_OP2_SUB_INT;

			alu.dst.sel = tmp2;
			alu.dst.chan = 1;
			alu.dst.write = 1;

			alu.src[0].sel = V_SQ_ALU_SRC_0;

			r600_bytecode_src(&alu.src[1], &ctx->src[1], i);

			alu.last = 1;
			if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
				return r;

			/* tmp2.z sign bit is set if src0 and src1 signs are different */
			/* it will be the sign of the quotient */
			if (!mod) {

				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
				alu.op = ALU_OP2_XOR_INT;

				alu.dst.sel = tmp2;
				alu.dst.chan = 2;
				alu.dst.write = 1;

				r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
				r600_bytecode_src(&alu.src[1], &ctx->src[1], i);

				alu.last = 1;
				if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
					return r;
			}

			/* tmp2.x = |src0| */
			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
			alu.op = ALU_OP3_CNDGE_INT;
			alu.is_op3 = 1;

			alu.dst.sel = tmp2;
			alu.dst.chan = 0;
			alu.dst.write = 1;

			r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
			r600_bytecode_src(&alu.src[1], &ctx->src[0], i);
			alu.src[2].sel = tmp2;
			alu.src[2].chan = 0;

			alu.last = 1;
			if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
				return r;
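
			/* Added note: CNDGE_INT computes dst = (src0 >= 0 ? src1 : src2),
			 * so together with the negated copies prepared above, this op and
			 * the next one yield the absolute values |src0| and |src1|. */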

			/* tmp2.y = |src1| */
			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
			alu.op = ALU_OP3_CNDGE_INT;
			alu.is_op3 = 1;

			alu.dst.sel = tmp2;
			alu.dst.chan = 1;
			alu.dst.write = 1;

			r600_bytecode_src(&alu.src[0], &ctx->src[1], i);
			r600_bytecode_src(&alu.src[1], &ctx->src[1], i);
			alu.src[2].sel = tmp2;
			alu.src[2].chan = 1;

			alu.last = 1;
			if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
				return r;

		}

		/* 1. tmp0.x = rcp_u (src2) = 2^32/src2 + e, where e is rounding error */
		if (ctx->bc->gfx_level == CAYMAN) {
			/* tmp3.x = u2f(src2) */
			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
			alu.op = ALU_OP1_UINT_TO_FLT;

			alu.dst.sel = tmp3;
			alu.dst.chan = 0;
			alu.dst.write = 1;

			if (signed_op) {
				alu.src[0].sel = tmp2;
				alu.src[0].chan = 1;
			} else {
				r600_bytecode_src(&alu.src[0], &ctx->src[1], i);
			}

			alu.last = 1;
			if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
				return r;

			/* tmp0.x = recip(tmp3.x) */
			for (j = 0 ; j < 3; j++) {
				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
				alu.op = ALU_OP1_RECIP_IEEE;

				alu.dst.sel = tmp0;
				alu.dst.chan = j;
				alu.dst.write = (j == 0);

				alu.src[0].sel = tmp3;
				alu.src[0].chan = 0;

				if (j == 2)
					alu.last = 1;
				if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
					return r;
			}

			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
			alu.op = ALU_OP2_MUL;

			alu.src[0].sel = tmp0;
			alu.src[0].chan = 0;

			alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
			alu.src[1].value = 0x4f800000;

			alu.dst.sel = tmp3;
			alu.dst.write = 1;
			alu.last = 1;
			r = r600_bytecode_add_alu(ctx->bc, &alu);
			if (r)
				return r;

			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
			alu.op = ALU_OP1_FLT_TO_UINT;

			alu.dst.sel = tmp0;
			alu.dst.chan = 0;
			alu.dst.write = 1;

			alu.src[0].sel = tmp3;
			alu.src[0].chan = 0;

			alu.last = 1;
			if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
				return r;

		} else {
			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
			alu.op = ALU_OP1_RECIP_UINT;

			alu.dst.sel = tmp0;
			alu.dst.chan = 0;
			alu.dst.write = 1;

			if (signed_op) {
				alu.src[0].sel = tmp2;
				alu.src[0].chan = 1;
			} else {
				r600_bytecode_src(&alu.src[0], &ctx->src[1], i);
			}

			alu.last = 1;
			if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
				return r;
		}
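
		/* Added note: the Cayman branch above computes the reciprocal in
		 * float, presumably because RECIP_UINT is unavailable there:
		 * 0x4f800000 is 4294967296.0f = 2^32, so FLT_TO_UINT(2^32 *
		 * RECIP_IEEE(u2f(src2))) yields the same 0.32 fixed-point
		 * reciprocal estimate that RECIP_UINT provides directly. */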

		/* 2. tmp0.z = lo (tmp0.x * src2) */
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP2_MULLO_UINT;

		alu.dst.sel = tmp0;
		alu.dst.chan = 2;
		alu.dst.write = 1;

		alu.src[0].sel = tmp0;
		alu.src[0].chan = 0;
		if (signed_op) {
			alu.src[1].sel = tmp2;
			alu.src[1].chan = 1;
		} else {
			r600_bytecode_src(&alu.src[1], &ctx->src[1], i);
		}

		if ((r = emit_mul_int_op(ctx->bc, &alu)))
			return r;

		/* 3. tmp0.w = -tmp0.z */
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP2_SUB_INT;

		alu.dst.sel = tmp0;
		alu.dst.chan = 3;
		alu.dst.write = 1;

		alu.src[0].sel = V_SQ_ALU_SRC_0;
		alu.src[1].sel = tmp0;
		alu.src[1].chan = 2;

		alu.last = 1;
		if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
			return r;

		/* 4. tmp0.y = hi (tmp0.x * src2) */
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP2_MULHI_UINT;

		alu.dst.sel = tmp0;
		alu.dst.chan = 1;
		alu.dst.write = 1;

		alu.src[0].sel = tmp0;
		alu.src[0].chan = 0;

		if (signed_op) {
			alu.src[1].sel = tmp2;
			alu.src[1].chan = 1;
		} else {
			r600_bytecode_src(&alu.src[1], &ctx->src[1], i);
		}

		if ((r = emit_mul_int_op(ctx->bc, &alu)))
			return r;

		/* 5. tmp0.z = (tmp0.y == 0 ? tmp0.w : tmp0.z) = abs(lo(rcp*src2)) */
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP3_CNDE_INT;
		alu.is_op3 = 1;

		alu.dst.sel = tmp0;
		alu.dst.chan = 2;
		alu.dst.write = 1;

		alu.src[0].sel = tmp0;
		alu.src[0].chan = 1;
		alu.src[1].sel = tmp0;
		alu.src[1].chan = 3;
		alu.src[2].sel = tmp0;
		alu.src[2].chan = 2;

		alu.last = 1;
		if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
			return r;

		/* 6. tmp0.w = hi (tmp0.z * tmp0.x) = e, rounding error */
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP2_MULHI_UINT;

		alu.dst.sel = tmp0;
		alu.dst.chan = 3;
		alu.dst.write = 1;

		alu.src[0].sel = tmp0;
		alu.src[0].chan = 2;

		alu.src[1].sel = tmp0;
		alu.src[1].chan = 0;

		if ((r = emit_mul_int_op(ctx->bc, &alu)))
			return r;

		/* 7. tmp1.x = tmp0.x - tmp0.w */
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP2_SUB_INT;

		alu.dst.sel = tmp1;
		alu.dst.chan = 0;
		alu.dst.write = 1;

		alu.src[0].sel = tmp0;
		alu.src[0].chan = 0;
		alu.src[1].sel = tmp0;
		alu.src[1].chan = 3;

		alu.last = 1;
		if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
			return r;

		/* 8. tmp1.y = tmp0.x + tmp0.w */
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP2_ADD_INT;

		alu.dst.sel = tmp1;
		alu.dst.chan = 1;
		alu.dst.write = 1;

		alu.src[0].sel = tmp0;
		alu.src[0].chan = 0;
		alu.src[1].sel = tmp0;
		alu.src[1].chan = 3;

		alu.last = 1;
		if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
			return r;

		/* 9. tmp0.x = (tmp0.y == 0 ? tmp1.y : tmp1.x) */
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP3_CNDE_INT;
		alu.is_op3 = 1;

		alu.dst.sel = tmp0;
		alu.dst.chan = 0;
		alu.dst.write = 1;

		alu.src[0].sel = tmp0;
		alu.src[0].chan = 1;
		alu.src[1].sel = tmp1;
		alu.src[1].chan = 1;
		alu.src[2].sel = tmp1;
		alu.src[2].chan = 0;

		alu.last = 1;
		if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
			return r;

		/* 10. tmp0.z = hi(tmp0.x * src1) = q */
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP2_MULHI_UINT;

		alu.dst.sel = tmp0;
		alu.dst.chan = 2;
		alu.dst.write = 1;

		alu.src[0].sel = tmp0;
		alu.src[0].chan = 0;

		if (signed_op) {
			alu.src[1].sel = tmp2;
			alu.src[1].chan = 0;
		} else {
			r600_bytecode_src(&alu.src[1], &ctx->src[0], i);
		}

		if ((r = emit_mul_int_op(ctx->bc, &alu)))
			return r;
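
		/* Added note: tmp0.x now holds the reciprocal refined by the
		 * measured rounding error (steps 5-9), and tmp0.z the candidate
		 * quotient q; the remainder is reconstructed next so q can get
		 * its final +/-1 correction. */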

		/* 11. tmp0.y = lo (src2 * tmp0.z) = src2*q = src1 - r */
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP2_MULLO_UINT;

		alu.dst.sel = tmp0;
		alu.dst.chan = 1;
		alu.dst.write = 1;

		if (signed_op) {
			alu.src[0].sel = tmp2;
			alu.src[0].chan = 1;
		} else {
			r600_bytecode_src(&alu.src[0], &ctx->src[1], i);
		}

		alu.src[1].sel = tmp0;
		alu.src[1].chan = 2;

		if ((r = emit_mul_int_op(ctx->bc, &alu)))
			return r;

		/* 12. tmp0.w = src1 - tmp0.y = r */
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP2_SUB_INT;

		alu.dst.sel = tmp0;
		alu.dst.chan = 3;
		alu.dst.write = 1;

		if (signed_op) {
			alu.src[0].sel = tmp2;
			alu.src[0].chan = 0;
		} else {
			r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
		}

		alu.src[1].sel = tmp0;
		alu.src[1].chan = 1;

		alu.last = 1;
		if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
			return r;

		/* 13. tmp1.x = tmp0.w >= src2 = r >= src2 */
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP2_SETGE_UINT;

		alu.dst.sel = tmp1;
		alu.dst.chan = 0;
		alu.dst.write = 1;

		alu.src[0].sel = tmp0;
		alu.src[0].chan = 3;
		if (signed_op) {
			alu.src[1].sel = tmp2;
			alu.src[1].chan = 1;
		} else {
			r600_bytecode_src(&alu.src[1], &ctx->src[1], i);
		}

		alu.last = 1;
		if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
			return r;

		/* 14. tmp1.y = src1 >= tmp0.y = r >= 0 */
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP2_SETGE_UINT;

		alu.dst.sel = tmp1;
		alu.dst.chan = 1;
		alu.dst.write = 1;

		if (signed_op) {
			alu.src[0].sel = tmp2;
			alu.src[0].chan = 0;
		} else {
			r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
		}

		alu.src[1].sel = tmp0;
		alu.src[1].chan = 1;

		alu.last = 1;
		if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
			return r;

		if (mod) { /* UMOD */

			/* 15. tmp1.z = tmp0.w - src2 = r - src2 */
			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
			alu.op = ALU_OP2_SUB_INT;

			alu.dst.sel = tmp1;
			alu.dst.chan = 2;
			alu.dst.write = 1;

			alu.src[0].sel = tmp0;
			alu.src[0].chan = 3;

			if (signed_op) {
				alu.src[1].sel = tmp2;
				alu.src[1].chan = 1;
			} else {
				r600_bytecode_src(&alu.src[1], &ctx->src[1], i);
			}

			alu.last = 1;
			if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
				return r;

			/* 16. tmp1.w = tmp0.w + src2 = r + src2 */
			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
			alu.op = ALU_OP2_ADD_INT;

			alu.dst.sel = tmp1;
			alu.dst.chan = 3;
			alu.dst.write = 1;

			alu.src[0].sel = tmp0;
			alu.src[0].chan = 3;
			if (signed_op) {
				alu.src[1].sel = tmp2;
				alu.src[1].chan = 1;
			} else {
				r600_bytecode_src(&alu.src[1], &ctx->src[1], i);
			}

			alu.last = 1;
			if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
				return r;

		} else { /* UDIV */
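
			/* Added note: the candidate q may be off by one in either
			 * direction, so q+1 and q-1 are prepared here and steps
			 * 17-19 pick the right value based on the r >= src2 and
			 * r >= 0 tests computed above. */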

			/* 15. tmp1.z = tmp0.z + 1 = q + 1 DIV */
			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
			alu.op = ALU_OP2_ADD_INT;

			alu.dst.sel = tmp1;
			alu.dst.chan = 2;
			alu.dst.write = 1;

			alu.src[0].sel = tmp0;
			alu.src[0].chan = 2;
			alu.src[1].sel = V_SQ_ALU_SRC_1_INT;

			alu.last = 1;
			if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
				return r;

			/* 16. tmp1.w = tmp0.z - 1 = q - 1 */
			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
			alu.op = ALU_OP2_ADD_INT;

			alu.dst.sel = tmp1;
			alu.dst.chan = 3;
			alu.dst.write = 1;

			alu.src[0].sel = tmp0;
			alu.src[0].chan = 2;
			alu.src[1].sel = V_SQ_ALU_SRC_M_1_INT;

			alu.last = 1;
			if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
				return r;

		}

		/* 17. tmp1.x = tmp1.x & tmp1.y */
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP2_AND_INT;

		alu.dst.sel = tmp1;
		alu.dst.chan = 0;
		alu.dst.write = 1;

		alu.src[0].sel = tmp1;
		alu.src[0].chan = 0;
		alu.src[1].sel = tmp1;
		alu.src[1].chan = 1;

		alu.last = 1;
		if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
			return r;

		/* 18. tmp0.z = tmp1.x==0 ? tmp0.z : tmp1.z DIV */
		/* 18. tmp0.z = tmp1.x==0 ? tmp0.w : tmp1.z MOD */
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP3_CNDE_INT;
		alu.is_op3 = 1;

		alu.dst.sel = tmp0;
		alu.dst.chan = 2;
		alu.dst.write = 1;

		alu.src[0].sel = tmp1;
		alu.src[0].chan = 0;
		alu.src[1].sel = tmp0;
		alu.src[1].chan = mod ? 3 : 2;
		alu.src[2].sel = tmp1;
		alu.src[2].chan = 2;

		alu.last = 1;
		if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
			return r;

		/* 19. tmp0.z = tmp1.y==0 ? tmp1.w : tmp0.z */
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP3_CNDE_INT;
		alu.is_op3 = 1;

		if (signed_op) {
			alu.dst.sel = tmp0;
			alu.dst.chan = 2;
			alu.dst.write = 1;
		} else {
			if (tmp4 > 0) {
				alu.dst.sel = tmp4;
				alu.dst.chan = i;
				alu.dst.write = 1;
			} else {
				tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
			}
		}

		alu.src[0].sel = tmp1;
		alu.src[0].chan = 1;
		alu.src[1].sel = tmp1;
		alu.src[1].chan = 3;
		alu.src[2].sel = tmp0;
		alu.src[2].chan = 2;

		alu.last = 1;
		if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
			return r;

		if (signed_op) {

			/* fix the sign of the result */

			if (mod) {

				/* tmp0.x = -tmp0.z */
				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
				alu.op = ALU_OP2_SUB_INT;

				alu.dst.sel = tmp0;
				alu.dst.chan = 0;
				alu.dst.write = 1;

				alu.src[0].sel = V_SQ_ALU_SRC_0;
				alu.src[1].sel = tmp0;
				alu.src[1].chan = 2;

				alu.last = 1;
				if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
					return r;

				/* sign of the remainder is the same as the sign of src0 */
				/* tmp0.x = src0>=0 ? tmp0.z : tmp0.x */
				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
				alu.op = ALU_OP3_CNDGE_INT;
				alu.is_op3 = 1;

				if (tmp4 > 0) {
					alu.dst.sel = tmp4;
					alu.dst.chan = i;
					alu.dst.write = 1;
				} else {
					tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
				}

				r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
				alu.src[1].sel = tmp0;
				alu.src[1].chan = 2;
				alu.src[2].sel = tmp0;
				alu.src[2].chan = 0;

				alu.last = 1;
				if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
					return r;

			} else {

				/* tmp0.x = -tmp0.z */
				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
				alu.op = ALU_OP2_SUB_INT;

				alu.dst.sel = tmp0;
				alu.dst.chan = 0;
				alu.dst.write = 1;

				alu.src[0].sel = V_SQ_ALU_SRC_0;
				alu.src[1].sel = tmp0;
				alu.src[1].chan = 2;

				alu.last = 1;
				if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
					return r;

				/* fix the quotient sign (same as the sign of src0*src1) */
				/* tmp0.x = tmp2.z>=0 ? tmp0.z : tmp0.x */
				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
				alu.op = ALU_OP3_CNDGE_INT;
				alu.is_op3 = 1;

				if (tmp4 > 0) {
					alu.dst.sel = tmp4;
					alu.dst.chan = i;
					alu.dst.write = 1;
				} else {
					tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
				}

				alu.src[0].sel = tmp2;
				alu.src[0].chan = 2;
				alu.src[1].sel = tmp0;
				alu.src[1].chan = 2;
				alu.src[2].sel = tmp0;
				alu.src[2].chan = 0;

				alu.last = 1;
				if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
					return r;
			}
		}
	}

	if (tmp4 > 0) {
		for (i = 0; i <= lasti; ++i) {
			if (!(write_mask & (1<<i)))
				continue;

			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
			alu.op = ALU_OP1_MOV;
			tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
			alu.src[0].sel = tmp4;
			alu.src[0].chan = i;

			if (i == lasti)
				alu.last = 1;
			if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
				return r;
		}
	}

	return 0;
}

static int tgsi_udiv(struct r600_shader_ctx *ctx)
{
	return tgsi_divmod(ctx, 0, 0);
}

static int tgsi_umod(struct r600_shader_ctx *ctx)
{
	return tgsi_divmod(ctx, 1, 0);
}

static int tgsi_idiv(struct r600_shader_ctx *ctx)
{
	return tgsi_divmod(ctx, 0, 1);
}

static int tgsi_imod(struct r600_shader_ctx *ctx)
{
	return tgsi_divmod(ctx, 1, 1);
}

static int tgsi_f2i(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	struct r600_bytecode_alu alu;
	int i, r;
	unsigned write_mask = inst->Dst[0].Register.WriteMask;
	int last_inst = tgsi_last_instruction(write_mask);

	for (i = 0; i < 4; i++) {
		if (!(write_mask & (1<<i)))
			continue;

		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP1_TRUNC;

		alu.dst.sel = ctx->temp_reg;
		alu.dst.chan = i;
		alu.dst.write = 1;

		r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
		if (i == last_inst)
			alu.last = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}

	for (i = 0; i < 4; i++) {
		if (!(write_mask & (1<<i)))
			continue;

		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ctx->inst_info->op;

		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);

		alu.src[0].sel = ctx->temp_reg;
		alu.src[0].chan = i;

		if (i == last_inst || alu.op == ALU_OP1_FLT_TO_UINT)
			alu.last = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}

	return 0;
}

static int tgsi_iabs(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	struct r600_bytecode_alu alu;
	int i, r;
	unsigned write_mask = inst->Dst[0].Register.WriteMask;
	int last_inst = tgsi_last_instruction(write_mask);

	/* tmp = -src */
	for (i = 0; i < 4; i++) {
		if (!(write_mask & (1<<i)))
			continue;

		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP2_SUB_INT;

		alu.dst.sel = ctx->temp_reg;
		alu.dst.chan = i;
		alu.dst.write = 1;

		r600_bytecode_src(&alu.src[1], &ctx->src[0], i);
		alu.src[0].sel = V_SQ_ALU_SRC_0;

		if (i == last_inst)
			alu.last = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}

	/* dst = (src >= 0 ? src : tmp) */
	for (i = 0; i < 4; i++) {
		if (!(write_mask & (1<<i)))
			continue;

		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP3_CNDGE_INT;
		alu.is_op3 = 1;
		alu.dst.write = 1;

		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);

		r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
		r600_bytecode_src(&alu.src[1], &ctx->src[0], i);
		alu.src[2].sel = ctx->temp_reg;
		alu.src[2].chan = i;

		if (i == last_inst)
			alu.last = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}
	return 0;
}

static int tgsi_issg(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	struct r600_bytecode_alu alu;
	int i, r;
	unsigned write_mask = inst->Dst[0].Register.WriteMask;
	int last_inst = tgsi_last_instruction(write_mask);

	/* tmp = (src >= 0 ? src : -1) */
	for (i = 0; i < 4; i++) {
		if (!(write_mask & (1<<i)))
			continue;

		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP3_CNDGE_INT;
		alu.is_op3 = 1;

		alu.dst.sel = ctx->temp_reg;
		alu.dst.chan = i;
		alu.dst.write = 1;

		r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
		r600_bytecode_src(&alu.src[1], &ctx->src[0], i);
		alu.src[2].sel = V_SQ_ALU_SRC_M_1_INT;

		if (i == last_inst)
			alu.last = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}
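
	/* Added note: the first pass above mapped negative inputs to -1 and
	 * kept non-negative ones; the pass below maps positive values to 1
	 * and leaves 0 and -1 untouched, i.e. dst = sign(src) for integers. */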

	/* dst = (tmp > 0 ? 1 : tmp) */
	for (i = 0; i < 4; i++) {
		if (!(write_mask & (1<<i)))
			continue;

		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP3_CNDGT_INT;
		alu.is_op3 = 1;
		alu.dst.write = 1;

		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);

		alu.src[0].sel = ctx->temp_reg;
		alu.src[0].chan = i;

		alu.src[1].sel = V_SQ_ALU_SRC_1_INT;

		alu.src[2].sel = ctx->temp_reg;
		alu.src[2].chan = i;

		if (i == last_inst)
			alu.last = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}
	return 0;
}

static int tgsi_ssg(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	unsigned write_mask = inst->Dst[0].Register.WriteMask;
	int last_inst = tgsi_last_instruction(write_mask);
	struct r600_bytecode_alu alu;
	int i, r;

	/* tmp = (src > 0 ? 1 : src) */
	for (i = 0; i <= last_inst; i++) {
		if (!(write_mask & (1 << i)))
			continue;
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP3_CNDGT;
		alu.is_op3 = 1;

		alu.dst.sel = ctx->temp_reg;
		alu.dst.chan = i;

		r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
		alu.src[1].sel = V_SQ_ALU_SRC_1;
		r600_bytecode_src(&alu.src[2], &ctx->src[0], i);

		if (i == last_inst)
			alu.last = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}

	/* dst = (-tmp > 0 ? -1 : tmp) */
	for (i = 0; i <= last_inst; i++) {
		if (!(write_mask & (1 << i)))
			continue;
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP3_CNDGT;
		alu.is_op3 = 1;
		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);

		alu.src[0].sel = ctx->temp_reg;
		alu.src[0].chan = i;
		alu.src[0].neg = 1;

		alu.src[1].sel = V_SQ_ALU_SRC_1;
		alu.src[1].neg = 1;

		alu.src[2].sel = ctx->temp_reg;
		alu.src[2].chan = i;

		if (i == last_inst)
			alu.last = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}
	return 0;
}

static int tgsi_bfi(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	struct r600_bytecode_alu alu;
	int i, r, t1, t2;

	unsigned write_mask = inst->Dst[0].Register.WriteMask;
	int last_inst = tgsi_last_instruction(write_mask);

	t1 = r600_get_temp(ctx);

	for (i = 0; i < 4; i++) {
		if (!(write_mask & (1<<i)))
			continue;

		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP2_SETGE_INT;
		r600_bytecode_src(&alu.src[0], &ctx->src[3], i);
		alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
		alu.src[1].value = 32;
		alu.dst.sel = ctx->temp_reg;
		alu.dst.chan = i;
		alu.dst.write = 1;
		alu.last = i == last_inst;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}

	for (i = 0; i < 4; i++) {
		if (!(write_mask & (1<<i)))
			continue;

		/* create mask tmp */
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP2_BFM_INT;
		alu.dst.sel = t1;
		alu.dst.chan = i;
		alu.dst.write = 1;
		alu.last = i == last_inst;

		r600_bytecode_src(&alu.src[0], &ctx->src[3], i);
		r600_bytecode_src(&alu.src[1], &ctx->src[2], i);

		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}
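
	/* Added note: BFM_INT above builds the field mask
	 * ((1 << width) - 1) << offset; the width >= 32 flag stored in
	 * temp_reg is consumed by the final CNDE_INT below, which returns
	 * the insert value unmodified in that case. */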

	t2 = r600_get_temp(ctx);

	for (i = 0; i < 4; i++) {
		if (!(write_mask & (1<<i)))
			continue;

		/* shift insert left */
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP2_LSHL_INT;
		alu.dst.sel = t2;
		alu.dst.chan = i;
		alu.dst.write = 1;
		alu.last = i == last_inst;

		r600_bytecode_src(&alu.src[0], &ctx->src[1], i);
		r600_bytecode_src(&alu.src[1], &ctx->src[2], i);

		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}

	for (i = 0; i < 4; i++) {
		if (!(write_mask & (1<<i)))
			continue;

		/* actual bitfield insert */
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP3_BFI_INT;
		alu.is_op3 = 1;
		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
		alu.dst.chan = i;
		alu.dst.write = 1;
		alu.last = i == last_inst;

		alu.src[0].sel = t1;
		alu.src[0].chan = i;
		alu.src[1].sel = t2;
		alu.src[1].chan = i;
		r600_bytecode_src(&alu.src[2], &ctx->src[0], i);

		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}

	for (i = 0; i < 4; i++) {
		if (!(write_mask & (1<<i)))
			continue;
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP3_CNDE_INT;
		alu.is_op3 = 1;
		alu.src[0].sel = ctx->temp_reg;
		alu.src[0].chan = i;
		r600_bytecode_src(&alu.src[2], &ctx->src[1], i);

		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);

		alu.src[1].sel = alu.dst.sel;
		alu.src[1].chan = i;

		alu.last = i == last_inst;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}
	return 0;
}

static int tgsi_msb(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	struct r600_bytecode_alu alu;
	int i, r, t1, t2;

	unsigned write_mask = inst->Dst[0].Register.WriteMask;
	int last_inst = tgsi_last_instruction(write_mask);

	assert(ctx->inst_info->op == ALU_OP1_FFBH_INT ||
	       ctx->inst_info->op == ALU_OP1_FFBH_UINT);

	t1 = ctx->temp_reg;

	/* bit position is indexed from lsb by TGSI, and from msb by the hardware */
	for (i = 0; i < 4; i++) {
		if (!(write_mask & (1<<i)))
			continue;

		/* t1 = FFBH_INT / FFBH_UINT */
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ctx->inst_info->op;
		alu.dst.sel = t1;
		alu.dst.chan = i;
		alu.dst.write = 1;
		alu.last = i == last_inst;

		r600_bytecode_src(&alu.src[0], &ctx->src[0], i);

		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}

	t2 = r600_get_temp(ctx);

	for (i = 0; i < 4; i++) {
		if (!(write_mask & (1<<i)))
			continue;

		/* t2 = 31 - t1 */
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP2_SUB_INT;
		alu.dst.sel = t2;
		alu.dst.chan = i;
		alu.dst.write = 1;
		alu.last = i == last_inst;

		alu.src[0].sel = V_SQ_ALU_SRC_LITERAL;
		alu.src[0].value = 31;
		alu.src[1].sel = t1;
		alu.src[1].chan = i;

		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}
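
	/* Added note: FFBH returns -1 when no bit is set; the CNDGE_INT
	 * below keeps that -1 (matching TGSI) and otherwise selects the
	 * lsb-relative position t2 = 31 - t1. */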

	for (i = 0; i < 4; i++) {
		if (!(write_mask & (1<<i)))
			continue;

		/* result = t1 >= 0 ? t2 : t1 */
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP3_CNDGE_INT;
		alu.is_op3 = 1;
		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
		alu.dst.chan = i;
		alu.dst.write = 1;
		alu.last = i == last_inst;

		alu.src[0].sel = t1;
		alu.src[0].chan = i;
		alu.src[1].sel = t2;
		alu.src[1].chan = i;
		alu.src[2].sel = t1;
		alu.src[2].chan = i;

		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}

	return 0;
}

static int tgsi_interp_egcm(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	struct r600_bytecode_alu alu;
	int r, i = 0, k, interp_gpr, interp_base_chan, tmp, lasti;
	unsigned location;
	const int input = inst->Src[0].Register.Index + ctx->shader->nsys_inputs;

	assert(inst->Src[0].Register.File == TGSI_FILE_INPUT);

	/* Interpolators have been marked for use already by allocate_system_value_inputs */
	if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_OFFSET ||
	    inst->Instruction.Opcode == TGSI_OPCODE_INTERP_SAMPLE) {
		location = TGSI_INTERPOLATE_LOC_CENTER; /* sample offset will be added explicitly */
	}
	else {
		location = TGSI_INTERPOLATE_LOC_CENTROID;
		ctx->shader->input[input].uses_interpolate_at_centroid = 1;
	}

	k = eg_get_interpolator_index(ctx->shader->input[input].interpolate, location);
	if (k < 0)
		k = 0;
	interp_gpr = ctx->eg_interpolators[k].ij_index / 2;
	interp_base_chan = 2 * (ctx->eg_interpolators[k].ij_index % 2);

	/* NOTE: currently offset is not perspective correct */
	if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_OFFSET ||
	    inst->Instruction.Opcode == TGSI_OPCODE_INTERP_SAMPLE) {
		int sample_gpr = -1;
		int gradientsH, gradientsV;
		struct r600_bytecode_tex tex;

		if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_SAMPLE) {
			sample_gpr = load_sample_position(ctx, &ctx->src[1], ctx->src[1].swizzle[0]);
		}

		gradientsH = r600_get_temp(ctx);
		gradientsV = r600_get_temp(ctx);
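
		/* Added note: interpolation at an arbitrary point is done by
		 * hand here: fetch the barycentric ij gradients with
		 * GET_GRADIENTS_H/V, then compute ij' = ij + offset.x * dH +
		 * offset.y * dV with the two MULADD loops below. For
		 * INTERP_SAMPLE the offset is the sample position loaded above. */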
		for (i = 0; i < 2; i++) {
			memset(&tex, 0, sizeof(struct r600_bytecode_tex));
			tex.op = i == 0 ? FETCH_OP_GET_GRADIENTS_H : FETCH_OP_GET_GRADIENTS_V;
			tex.src_gpr = interp_gpr;
			tex.src_sel_x = interp_base_chan + 0;
			tex.src_sel_y = interp_base_chan + 1;
			tex.src_sel_z = 0;
			tex.src_sel_w = 0;
			tex.dst_gpr = i == 0 ? gradientsH : gradientsV;
			tex.dst_sel_x = 0;
			tex.dst_sel_y = 1;
			tex.dst_sel_z = 7;
			tex.dst_sel_w = 7;
			tex.inst_mod = 1; // Use per pixel gradient calculation
			tex.sampler_id = 0;
			tex.resource_id = tex.sampler_id;
			r = r600_bytecode_add_tex(ctx->bc, &tex);
			if (r)
				return r;
		}

		for (i = 0; i < 2; i++) {
			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
			alu.op = ALU_OP3_MULADD;
			alu.is_op3 = 1;
			alu.src[0].sel = gradientsH;
			alu.src[0].chan = i;
			if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_SAMPLE) {
				alu.src[1].sel = sample_gpr;
				alu.src[1].chan = 2;
			}
			else {
				r600_bytecode_src(&alu.src[1], &ctx->src[1], 0);
			}
			alu.src[2].sel = interp_gpr;
			alu.src[2].chan = interp_base_chan + i;
			alu.dst.sel = ctx->temp_reg;
			alu.dst.chan = i;
			alu.last = i == 1;

			r = r600_bytecode_add_alu(ctx->bc, &alu);
			if (r)
				return r;
		}

		for (i = 0; i < 2; i++) {
			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
			alu.op = ALU_OP3_MULADD;
			alu.is_op3 = 1;
			alu.src[0].sel = gradientsV;
			alu.src[0].chan = i;
			if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_SAMPLE) {
				alu.src[1].sel = sample_gpr;
				alu.src[1].chan = 3;
			}
			else {
				r600_bytecode_src(&alu.src[1], &ctx->src[1], 1);
			}
			alu.src[2].sel = ctx->temp_reg;
			alu.src[2].chan = i;
			alu.dst.sel = ctx->temp_reg;
			alu.dst.chan = i;
			alu.last = i == 1;

			r = r600_bytecode_add_alu(ctx->bc, &alu);
			if (r)
				return r;
		}
	}

	tmp = r600_get_temp(ctx);
	for (i = 0; i < 8; i++) {
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = i < 4 ? ALU_OP2_INTERP_ZW : ALU_OP2_INTERP_XY;

		alu.dst.sel = tmp;
		if ((i > 1 && i < 6)) {
			alu.dst.write = 1;
		}
		else {
			alu.dst.write = 0;
		}
		alu.dst.chan = i % 4;

		if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_OFFSET ||
		    inst->Instruction.Opcode == TGSI_OPCODE_INTERP_SAMPLE) {
			alu.src[0].sel = ctx->temp_reg;
			alu.src[0].chan = 1 - (i % 2);
		} else {
			alu.src[0].sel = interp_gpr;
			alu.src[0].chan = interp_base_chan + 1 - (i % 2);
		}
		alu.src[1].sel = V_SQ_ALU_SRC_PARAM_BASE + ctx->shader->input[input].lds_pos;
		alu.src[1].chan = 0;

		alu.last = i % 4 == 3;
		alu.bank_swizzle_force = SQ_ALU_VEC_210;

		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}

	// INTERP can't swizzle dst
	lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
	for (i = 0; i <= lasti; i++) {
		if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
			continue;

		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP1_MOV;
		alu.src[0].sel = tmp;
		alu.src[0].chan = ctx->src[0].swizzle[i];
		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
		alu.dst.write = 1;
		alu.last = i == lasti;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}

	return 0;
}

static int tgsi_helper_copy(struct r600_shader_ctx *ctx, struct tgsi_full_instruction *inst)
{
	struct r600_bytecode_alu alu;
	int i, r;

	for (i = 0; i < 4; i++) {
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		if (!(inst->Dst[0].Register.WriteMask & (1 << i))) {
			alu.op = ALU_OP0_NOP;
			alu.dst.chan = i;
		} else {
			alu.op = ALU_OP1_MOV;
			tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
			alu.src[0].sel = ctx->temp_reg;
			alu.src[0].chan = i;
		}
		if (i == 3) {
			alu.last = 1;
		}
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}
	return 0;
}

static int tgsi_make_src_for_op3(struct r600_shader_ctx *ctx,
				 unsigned writemask,
				 struct r600_bytecode_alu_src *bc_src,
				 const struct r600_shader_src *shader_src)
{
	struct r600_bytecode_alu alu;
	int i, r;
	int lasti = tgsi_last_instruction(writemask);
	int temp_reg = 0;

	r600_bytecode_src(&bc_src[0], shader_src, 0);
	r600_bytecode_src(&bc_src[1], shader_src, 1);
	r600_bytecode_src(&bc_src[2], shader_src, 2);
	r600_bytecode_src(&bc_src[3], shader_src, 3);

	if (bc_src->abs) {
		temp_reg = r600_get_temp(ctx);

		for (i = 0; i < lasti + 1; i++) {
			if (!(writemask & (1 << i)))
				continue;
			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
			alu.op = ALU_OP1_MOV;
			alu.dst.sel = temp_reg;
			alu.dst.chan = i;
			alu.dst.write = 1;
			alu.src[0] = bc_src[i];
			if (i == lasti) {
				alu.last = 1;
			}
			r = r600_bytecode_add_alu(ctx->bc, &alu);
			if (r)
				return r;
			memset(&bc_src[i], 0, sizeof(*bc_src));
			bc_src[i].sel = temp_reg;
			bc_src[i].chan = i;
		}
	}
	return 0;
}

static int tgsi_op3_dst(struct r600_shader_ctx *ctx, int dst)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	struct r600_bytecode_alu alu;
	struct r600_bytecode_alu_src srcs[4][4];
	int i, j, r;
	int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
	unsigned op = ctx->inst_info->op;

	if (op == ALU_OP3_MULADD_IEEE &&
	    ctx->info.properties[TGSI_PROPERTY_LEGACY_MATH_RULES])
		op = ALU_OP3_MULADD;

	for (j = 0; j < inst->Instruction.NumSrcRegs; j++) {
		r = tgsi_make_src_for_op3(ctx, inst->Dst[0].Register.WriteMask,
					  srcs[j], &ctx->src[j]);
		if (r)
			return r;
	}

	for (i = 0; i < lasti + 1; i++) {
		if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
			continue;

		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = op;
		for (j = 0; j < inst->Instruction.NumSrcRegs; j++) {
			alu.src[j] = srcs[j][i];
		}

		if (dst == -1) {
			tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
		} else {
			alu.dst.sel = dst;
		}
		alu.dst.chan = i;
		alu.dst.write = 1;
		alu.is_op3 = 1;
		if (i == lasti) {
			alu.last = 1;
		}
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}
	return 0;
}

static int tgsi_op3(struct r600_shader_ctx *ctx)
{
	return tgsi_op3_dst(ctx, -1);
}
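
/* Added note: DOT4 broadcasts the scalar dot product to all four slots,
 * so each enabled channel can be written directly; DP2/DP3 reuse the same
 * op by zeroing the sources in the unused slots (see the switch below). */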

static int tgsi_dp(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	struct r600_bytecode_alu alu;
	int i, j, r;
	unsigned op = ctx->inst_info->op;
	if (op == ALU_OP2_DOT4_IEEE &&
	    ctx->info.properties[TGSI_PROPERTY_LEGACY_MATH_RULES])
		op = ALU_OP2_DOT4;

	for (i = 0; i < 4; i++) {
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = op;
		for (j = 0; j < inst->Instruction.NumSrcRegs; j++) {
			r600_bytecode_src(&alu.src[j], &ctx->src[j], i);
		}

		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
		alu.dst.chan = i;
		alu.dst.write = (inst->Dst[0].Register.WriteMask >> i) & 1;
		/* handle some special cases */
		switch (inst->Instruction.Opcode) {
		case TGSI_OPCODE_DP2:
			if (i > 1) {
				alu.src[0].sel = alu.src[1].sel = V_SQ_ALU_SRC_0;
				alu.src[0].chan = alu.src[1].chan = 0;
			}
			break;
		case TGSI_OPCODE_DP3:
			if (i > 2) {
				alu.src[0].sel = alu.src[1].sel = V_SQ_ALU_SRC_0;
				alu.src[0].chan = alu.src[1].chan = 0;
			}
			break;
		default:
			break;
		}
		if (i == 3) {
			alu.last = 1;
		}
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}
	return 0;
}

static inline boolean tgsi_tex_src_requires_loading(struct r600_shader_ctx *ctx,
						    unsigned index)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	return (inst->Src[index].Register.File != TGSI_FILE_TEMPORARY &&
		inst->Src[index].Register.File != TGSI_FILE_INPUT &&
		inst->Src[index].Register.File != TGSI_FILE_OUTPUT) ||
		ctx->src[index].neg || ctx->src[index].abs ||
		(inst->Src[index].Register.File == TGSI_FILE_INPUT && ctx->type == PIPE_SHADER_GEOMETRY);
}

static inline unsigned tgsi_tex_get_src_gpr(struct r600_shader_ctx *ctx,
					    unsigned index)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	return ctx->file_offset[inst->Src[index].Register.File] + inst->Src[index].Register.Index;
}

static int do_vtx_fetch_inst(struct r600_shader_ctx *ctx, boolean src_requires_loading)
{
	struct r600_bytecode_vtx vtx;
	struct r600_bytecode_alu alu;
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	int src_gpr, r, i;
	int id = tgsi_tex_get_src_gpr(ctx, 1);
	int sampler_index_mode = inst->Src[1].Indirect.Index == 2 ? 2 : 0; // CF_INDEX_1 : CF_INDEX_NONE

	src_gpr = tgsi_tex_get_src_gpr(ctx, 0);
	if (src_requires_loading) {
		for (i = 0; i < 4; i++) {
			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
			alu.op = ALU_OP1_MOV;
			r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
			alu.dst.sel = ctx->temp_reg;
			alu.dst.chan = i;
			if (i == 3)
				alu.last = 1;
			alu.dst.write = 1;
			r = r600_bytecode_add_alu(ctx->bc, &alu);
			if (r)
				return r;
		}
		src_gpr = ctx->temp_reg;
	}
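
	/* Added note: the fetch itself is a VFETCH whose data format and
	 * number type come from the fetch constant (use_const_fields below).
	 * On pre-Evergreen parts the AND/OR fixup emitted after the fetch
	 * uses the driver-supplied BUFFER_INFO constants to mask components
	 * the format does not provide and to patch in the alpha default. */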

	memset(&vtx, 0, sizeof(vtx));
	vtx.op = FETCH_OP_VFETCH;
	vtx.buffer_id = id + R600_MAX_CONST_BUFFERS;
	vtx.fetch_type = SQ_VTX_FETCH_NO_INDEX_OFFSET;
	vtx.src_gpr = src_gpr;
	vtx.mega_fetch_count = 16;
	vtx.dst_gpr = ctx->file_offset[inst->Dst[0].Register.File] + inst->Dst[0].Register.Index;
	vtx.dst_sel_x = (inst->Dst[0].Register.WriteMask & 1) ? 0 : 7; /* SEL_X */
	vtx.dst_sel_y = (inst->Dst[0].Register.WriteMask & 2) ? 1 : 7; /* SEL_Y */
	vtx.dst_sel_z = (inst->Dst[0].Register.WriteMask & 4) ? 2 : 7; /* SEL_Z */
	vtx.dst_sel_w = (inst->Dst[0].Register.WriteMask & 8) ? 3 : 7; /* SEL_W */
	vtx.use_const_fields = 1;
	vtx.buffer_index_mode = sampler_index_mode;

	if ((r = r600_bytecode_add_vtx(ctx->bc, &vtx)))
		return r;

	if (ctx->bc->gfx_level >= EVERGREEN)
		return 0;

	for (i = 0; i < 4; i++) {
		int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
		if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
			continue;

		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP2_AND_INT;

		alu.dst.chan = i;
		alu.dst.sel = vtx.dst_gpr;
		alu.dst.write = 1;

		alu.src[0].sel = vtx.dst_gpr;
		alu.src[0].chan = i;

		alu.src[1].sel = R600_SHADER_BUFFER_INFO_SEL;
		alu.src[1].sel += (id * 2);
		alu.src[1].chan = i % 4;
		alu.src[1].kc_bank = R600_BUFFER_INFO_CONST_BUFFER;

		if (i == lasti)
			alu.last = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}

	if (inst->Dst[0].Register.WriteMask & 3) {
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP2_OR_INT;

		alu.dst.chan = 3;
		alu.dst.sel = vtx.dst_gpr;
		alu.dst.write = 1;

		alu.src[0].sel = vtx.dst_gpr;
		alu.src[0].chan = 3;

		alu.src[1].sel = R600_SHADER_BUFFER_INFO_SEL + (id * 2) + 1;
		alu.src[1].chan = 0;
		alu.src[1].kc_bank = R600_BUFFER_INFO_CONST_BUFFER;

		alu.last = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}
	return 0;
}
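
/* Added note: TXQ on a buffer resource returns the element count.
 * Pre-Evergreen parts read it from the BUFFER_INFO constant buffer
 * maintained by the driver; Evergreen and later query it directly with
 * GET_BUFFER_RESINFO. */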

static int r600_do_buffer_txq(struct r600_shader_ctx *ctx, int reg_idx, int offset, int eg_buffer_base)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	int r;
	int id = tgsi_tex_get_src_gpr(ctx, reg_idx) + offset;
	int sampler_index_mode = inst->Src[reg_idx].Indirect.Index == 2 ? 2 : 0; // CF_INDEX_1 : CF_INDEX_NONE

	if (ctx->bc->gfx_level < EVERGREEN) {
		struct r600_bytecode_alu alu;
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP1_MOV;
		alu.src[0].sel = R600_SHADER_BUFFER_INFO_SEL;
		/* on r600 the value lives in the second channel (y) of the second dword */
		alu.src[0].sel += (id * 2) + 1;
		alu.src[0].chan = 1;
		alu.src[0].kc_bank = R600_BUFFER_INFO_CONST_BUFFER;
		tgsi_dst(ctx, &inst->Dst[0], 0, &alu.dst);
		alu.last = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
		return 0;
	} else {
		struct r600_bytecode_vtx vtx;
		memset(&vtx, 0, sizeof(vtx));
		vtx.op = FETCH_OP_GET_BUFFER_RESINFO;
		vtx.buffer_id = id + eg_buffer_base;
		vtx.fetch_type = SQ_VTX_FETCH_NO_INDEX_OFFSET;
		vtx.src_gpr = 0;
		vtx.mega_fetch_count = 16; /* no idea here really... */
		vtx.dst_gpr = ctx->file_offset[inst->Dst[0].Register.File] + inst->Dst[0].Register.Index;
		vtx.dst_sel_x = (inst->Dst[0].Register.WriteMask & 1) ? 0 : 7; /* SEL_X */
		vtx.dst_sel_y = (inst->Dst[0].Register.WriteMask & 2) ? 4 : 7; /* SEL_Y */
		vtx.dst_sel_z = (inst->Dst[0].Register.WriteMask & 4) ? 4 : 7; /* SEL_Z */
		vtx.dst_sel_w = (inst->Dst[0].Register.WriteMask & 8) ? 4 : 7; /* SEL_W */
		vtx.data_format = FMT_32_32_32_32;
		vtx.buffer_index_mode = sampler_index_mode;

		if ((r = r600_bytecode_add_vtx_tc(ctx->bc, &vtx)))
			return r;
		return 0;
	}
}

static int tgsi_tex(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	struct r600_bytecode_tex tex;
	struct r600_bytecode_tex grad_offs[3];
	struct r600_bytecode_alu alu;
	unsigned src_gpr;
	int r, i, j, n_grad_offs = 0;
	int opcode;
	bool read_compressed_msaa = ctx->bc->has_compressed_msaa_texturing &&
		inst->Instruction.Opcode == TGSI_OPCODE_TXF &&
		(inst->Texture.Texture == TGSI_TEXTURE_2D_MSAA ||
		 inst->Texture.Texture == TGSI_TEXTURE_2D_ARRAY_MSAA);

	bool txf_add_offsets = inst->Texture.NumOffsets &&
		inst->Instruction.Opcode == TGSI_OPCODE_TXF &&
		inst->Texture.Texture != TGSI_TEXTURE_BUFFER;

	/* Texture fetch instructions can only use GPRs as sources.
	 * Also they cannot negate the source or take the absolute value. */
	const boolean src_requires_loading = (inst->Instruction.Opcode != TGSI_OPCODE_TXQS &&
					      tgsi_tex_src_requires_loading(ctx, 0)) ||
		read_compressed_msaa || txf_add_offsets;

	boolean src_loaded = FALSE;
	unsigned sampler_src_reg = 1;
	int8_t offset_x = 0, offset_y = 0, offset_z = 0;
	boolean has_txq_cube_array_z = false;
	unsigned sampler_index_mode;
	int array_index_offset_channel = -1;

	if (inst->Instruction.Opcode == TGSI_OPCODE_TXQ &&
	    ((inst->Texture.Texture == TGSI_TEXTURE_CUBE_ARRAY ||
	      inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE_ARRAY)))
		if (inst->Dst[0].Register.WriteMask & 4) {
			ctx->shader->has_txq_cube_array_z_comp = true;
			has_txq_cube_array_z = true;
		}

	if (inst->Instruction.Opcode == TGSI_OPCODE_TEX2 ||
	    inst->Instruction.Opcode == TGSI_OPCODE_TXB2 ||
	    inst->Instruction.Opcode == TGSI_OPCODE_TXL2 ||
	    inst->Instruction.Opcode == TGSI_OPCODE_TG4)
		sampler_src_reg = 2;

	/* TGSI moves the sampler to src reg 3 for TXD */
	if (inst->Instruction.Opcode == TGSI_OPCODE_TXD)
		sampler_src_reg = 3;

	sampler_index_mode = inst->Src[sampler_src_reg].Indirect.Index == 2 ? 2 : 0; // CF_INDEX_1 : CF_INDEX_NONE

	src_gpr = tgsi_tex_get_src_gpr(ctx, 0);

	if (inst->Texture.Texture == TGSI_TEXTURE_BUFFER) {
		if (inst->Instruction.Opcode == TGSI_OPCODE_TXQ) {
			if (ctx->bc->gfx_level < EVERGREEN)
				ctx->shader->uses_tex_buffers = true;
			return r600_do_buffer_txq(ctx, 1, 0, R600_MAX_CONST_BUFFERS);
		}
		else if (inst->Instruction.Opcode == TGSI_OPCODE_TXF) {
			if (ctx->bc->gfx_level < EVERGREEN)
				ctx->shader->uses_tex_buffers = true;
			return do_vtx_fetch_inst(ctx, src_requires_loading);
		}
	}

	if (inst->Instruction.Opcode == TGSI_OPCODE_TXP) {
		int out_chan;
		/* Add perspective divide */
		if (ctx->bc->gfx_level == CAYMAN) {
			out_chan = 2;
			for (i = 0; i < 3; i++) {
				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
				alu.op = ALU_OP1_RECIP_IEEE;
				r600_bytecode_src(&alu.src[0], &ctx->src[0], 3);

				alu.dst.sel = ctx->temp_reg;
				alu.dst.chan = i;
				if (i == 2)
					alu.last = 1;
				if (out_chan == i)
					alu.dst.write = 1;
				r = r600_bytecode_add_alu(ctx->bc, &alu);
				if (r)
					return r;
			}

		} else {
			out_chan = 3;
			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
			alu.op = ALU_OP1_RECIP_IEEE;
			r600_bytecode_src(&alu.src[0], &ctx->src[0], 3);

			alu.dst.sel = ctx->temp_reg;
			alu.dst.chan = out_chan;
			alu.last = 1;
			alu.dst.write = 1;
			r = r600_bytecode_add_alu(ctx->bc, &alu);
			if (r)
				return r;
		}

		for (i = 0; i < 3; i++) {
			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
			alu.op = ALU_OP2_MUL;
			alu.src[0].sel = ctx->temp_reg;
			alu.src[0].chan = out_chan;
			r600_bytecode_src(&alu.src[1], &ctx->src[0], i);
			alu.dst.sel = ctx->temp_reg;
			alu.dst.chan = i;
			alu.dst.write = 1;
			r = r600_bytecode_add_alu(ctx->bc, &alu);
			if (r)
				return r;
		}
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP1_MOV;
		alu.src[0].sel = V_SQ_ALU_SRC_1;
		alu.src[0].chan = 0;
		alu.dst.sel = ctx->temp_reg;
		alu.dst.chan = 3;
		alu.last = 1;
		alu.dst.write = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
		src_loaded = TRUE;
		src_gpr = ctx->temp_reg;
	}


	if ((inst->Texture.Texture == TGSI_TEXTURE_CUBE ||
	     inst->Texture.Texture == TGSI_TEXTURE_CUBE_ARRAY ||
	     inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE ||
	     inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE_ARRAY) &&
	    inst->Instruction.Opcode != TGSI_OPCODE_TXQ) {

		static const unsigned src0_swizzle[] = {2, 2, 0, 1};
		static const unsigned src1_swizzle[] = {1, 0, 2, 2};

		/* tmp1.xyzw = CUBE(R0.zzxy, R0.yxzz) */
		for (i = 0; i < 4; i++) {
			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
			alu.op = ALU_OP2_CUBE;
			r600_bytecode_src(&alu.src[0], &ctx->src[0], src0_swizzle[i]);
			r600_bytecode_src(&alu.src[1], &ctx->src[0], src1_swizzle[i]);
			alu.dst.sel = ctx->temp_reg;
			alu.dst.chan = i;
			if (i == 3)
				alu.last = 1;
			alu.dst.write = 1;
			r = r600_bytecode_add_alu(ctx->bc, &alu);
			if (r)
				return r;
		}
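
		/* Added note: after CUBE, tmp1 holds roughly
		 * (t, s, 2*major_axis, face_id); the RCP/MULADD sequence below
		 * divides s and t by the major axis and adds the 1.5 offset the
		 * texture unit expects for cube face coordinates. */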

		/* tmp1.z = RCP_e(|tmp1.z|) */
		if (ctx->bc->gfx_level == CAYMAN) {
			for (i = 0; i < 3; i++) {
				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
				alu.op = ALU_OP1_RECIP_IEEE;
				alu.src[0].sel = ctx->temp_reg;
				alu.src[0].chan = 2;
				alu.src[0].abs = 1;
				alu.dst.sel = ctx->temp_reg;
				alu.dst.chan = i;
				if (i == 2)
					alu.dst.write = 1;
				if (i == 2)
					alu.last = 1;
				r = r600_bytecode_add_alu(ctx->bc, &alu);
				if (r)
					return r;
			}
		} else {
			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
			alu.op = ALU_OP1_RECIP_IEEE;
			alu.src[0].sel = ctx->temp_reg;
			alu.src[0].chan = 2;
			alu.src[0].abs = 1;
			alu.dst.sel = ctx->temp_reg;
			alu.dst.chan = 2;
			alu.dst.write = 1;
			alu.last = 1;
			r = r600_bytecode_add_alu(ctx->bc, &alu);
			if (r)
				return r;
		}

		/* MULADD R0.x, R0.x, PS1, (0x3FC00000, 1.5f).x
		 * MULADD R0.y, R0.y, PS1, (0x3FC00000, 1.5f).x
		 * muladd has no writemask, have to use another temp
		 */
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP3_MULADD;
		alu.is_op3 = 1;

		alu.src[0].sel = ctx->temp_reg;
		alu.src[0].chan = 0;
		alu.src[1].sel = ctx->temp_reg;
		alu.src[1].chan = 2;

		alu.src[2].sel = V_SQ_ALU_SRC_LITERAL;
		alu.src[2].chan = 0;
		alu.src[2].value = u_bitcast_f2u(1.5f);

		alu.dst.sel = ctx->temp_reg;
		alu.dst.chan = 0;
		alu.dst.write = 1;

		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;

		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP3_MULADD;
		alu.is_op3 = 1;

		alu.src[0].sel = ctx->temp_reg;
		alu.src[0].chan = 1;
		alu.src[1].sel = ctx->temp_reg;
		alu.src[1].chan = 2;

		alu.src[2].sel = V_SQ_ALU_SRC_LITERAL;
		alu.src[2].chan = 0;
		alu.src[2].value = u_bitcast_f2u(1.5f);

		alu.dst.sel = ctx->temp_reg;
		alu.dst.chan = 1;
		alu.dst.write = 1;

		alu.last = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
		/* write initial compare value into Z component
		   - W src 0 for shadow cube
		   - X src 1 for shadow cube array */
		if (inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE ||
		    inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE_ARRAY) {
			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
			alu.op = ALU_OP1_MOV;
			if (inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE_ARRAY)
				r600_bytecode_src(&alu.src[0], &ctx->src[1], 0);
			else
				r600_bytecode_src(&alu.src[0], &ctx->src[0], 3);
			alu.dst.sel = ctx->temp_reg;
			alu.dst.chan = 2;
			alu.dst.write = 1;
			alu.last = 1;
			r = r600_bytecode_add_alu(ctx->bc, &alu);
			if (r)
				return r;
		}

		if (inst->Texture.Texture == TGSI_TEXTURE_CUBE_ARRAY ||
		    inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE_ARRAY) {
			if (ctx->bc->gfx_level >= EVERGREEN) {
				int mytmp = r600_get_temp(ctx);
				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
				alu.op = ALU_OP1_MOV;
				alu.src[0].sel = ctx->temp_reg;
				alu.src[0].chan = 3;
				alu.dst.sel = mytmp;
				alu.dst.chan = 0;
				alu.dst.write = 1;
				alu.last = 1;
				r = r600_bytecode_add_alu(ctx->bc, &alu);
				if (r)
					return r;
This 7824 * needs to be done before merging the face select value, because 7825 * otherwise the fractional part of the array index will interfere 7826 * with the face select value */ 7827 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 7828 r600_bytecode_src(&alu.src[0], &ctx->src[0], 3); 7829 alu.op = ALU_OP1_RNDNE; 7830 alu.dst.sel = ctx->temp_reg; 7831 alu.dst.chan = 3; 7832 alu.dst.write = 1; 7833 alu.last = 1; 7834 r = r600_bytecode_add_alu(ctx->bc, &alu); 7835 if (r) 7836 return r; 7837 7838 /* Because the array slice index and the cube face index are merged 7839 * into one value we have to make sure the array slice index is >= 0, 7840 * otherwise the face selection will fail */ 7841 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 7842 alu.op = ALU_OP2_MAX; 7843 alu.src[0].sel = ctx->temp_reg; 7844 alu.src[0].chan = 3; 7845 alu.src[1].sel = V_SQ_ALU_SRC_0; 7846 alu.dst.sel = ctx->temp_reg; 7847 alu.dst.chan = 3; 7848 alu.dst.write = 1; 7849 alu.last = 1; 7850 r = r600_bytecode_add_alu(ctx->bc, &alu); 7851 if (r) 7852 return r; 7853 7854 /* have to multiply original layer by 8 and add to face id (temp.w) in Z */ 7855 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 7856 alu.op = ALU_OP3_MULADD; 7857 alu.is_op3 = 1; 7858 alu.src[0].sel = ctx->temp_reg; 7859 alu.src[0].chan = 3; 7860 alu.src[1].sel = V_SQ_ALU_SRC_LITERAL; 7861 alu.src[1].chan = 0; 7862 alu.src[1].value = u_bitcast_f2u(8.0f); 7863 alu.src[2].sel = mytmp; 7864 alu.src[2].chan = 0; 7865 alu.dst.sel = ctx->temp_reg; 7866 alu.dst.chan = 3; 7867 alu.dst.write = 1; 7868 alu.last = 1; 7869 r = r600_bytecode_add_alu(ctx->bc, &alu); 7870 if (r) 7871 return r; 7872 } else if (ctx->bc->gfx_level < EVERGREEN) { 7873 memset(&tex, 0, sizeof(struct r600_bytecode_tex)); 7874 tex.op = FETCH_OP_SET_CUBEMAP_INDEX; 7875 tex.sampler_id = tgsi_tex_get_src_gpr(ctx, sampler_src_reg); 7876 tex.resource_id = tex.sampler_id + R600_MAX_CONST_BUFFERS; 7877 tex.src_gpr = r600_get_temp(ctx); 7878 tex.src_sel_x = 0; 7879 tex.src_sel_y = 0; 7880 tex.src_sel_z = 0; 7881 tex.src_sel_w = 0; 7882 tex.dst_sel_x = tex.dst_sel_y = tex.dst_sel_z = tex.dst_sel_w = 7; 7883 tex.coord_type_x = 1; 7884 tex.coord_type_y = 1; 7885 tex.coord_type_z = 1; 7886 tex.coord_type_w = 1; 7887 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 7888 alu.op = ALU_OP1_MOV; 7889 r600_bytecode_src(&alu.src[0], &ctx->src[0], 3); 7890 alu.dst.sel = tex.src_gpr; 7891 alu.dst.chan = 0; 7892 alu.last = 1; 7893 alu.dst.write = 1; 7894 r = r600_bytecode_add_alu(ctx->bc, &alu); 7895 if (r) 7896 return r; 7897 7898 r = r600_bytecode_add_tex(ctx->bc, &tex); 7899 if (r) 7900 return r; 7901 } 7902 7903 } 7904 7905 /* for cube forms of lod and bias we need to route things */ 7906 if (inst->Instruction.Opcode == TGSI_OPCODE_TXB || 7907 inst->Instruction.Opcode == TGSI_OPCODE_TXL || 7908 inst->Instruction.Opcode == TGSI_OPCODE_TXB2 || 7909 inst->Instruction.Opcode == TGSI_OPCODE_TXL2) { 7910 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 7911 alu.op = ALU_OP1_MOV; 7912 if (inst->Instruction.Opcode == TGSI_OPCODE_TXB2 || 7913 inst->Instruction.Opcode == TGSI_OPCODE_TXL2) 7914 r600_bytecode_src(&alu.src[0], &ctx->src[1], 0); 7915 else 7916 r600_bytecode_src(&alu.src[0], &ctx->src[0], 3); 7917 alu.dst.sel = ctx->temp_reg; 7918 alu.dst.chan = 2; 7919 alu.last = 1; 7920 alu.dst.write = 1; 7921 r = r600_bytecode_add_alu(ctx->bc, &alu); 7922 if (r) 7923 return r; 7924 } 7925 7926 src_loaded = TRUE; 7927 src_gpr = ctx->temp_reg; 7928 } 7929 7930 if (inst->Instruction.Opcode == TGSI_OPCODE_TXD) 
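				/* For illustration (not driver code): with array
				 * layer 2.3 and CUBE face id 5, temp.w becomes
				 * max(rndne(2.3), 0) * 8 + 5 = 21, which the
				 * fetch unit decodes back into layer 2, face 5. */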
			} else if (ctx->bc->gfx_level < EVERGREEN) {
				memset(&tex, 0, sizeof(struct r600_bytecode_tex));
				tex.op = FETCH_OP_SET_CUBEMAP_INDEX;
				tex.sampler_id = tgsi_tex_get_src_gpr(ctx, sampler_src_reg);
				tex.resource_id = tex.sampler_id + R600_MAX_CONST_BUFFERS;
				tex.src_gpr = r600_get_temp(ctx);
				tex.src_sel_x = 0;
				tex.src_sel_y = 0;
				tex.src_sel_z = 0;
				tex.src_sel_w = 0;
				tex.dst_sel_x = tex.dst_sel_y = tex.dst_sel_z = tex.dst_sel_w = 7;
				tex.coord_type_x = 1;
				tex.coord_type_y = 1;
				tex.coord_type_z = 1;
				tex.coord_type_w = 1;
				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
				alu.op = ALU_OP1_MOV;
				r600_bytecode_src(&alu.src[0], &ctx->src[0], 3);
				alu.dst.sel = tex.src_gpr;
				alu.dst.chan = 0;
				alu.last = 1;
				alu.dst.write = 1;
				r = r600_bytecode_add_alu(ctx->bc, &alu);
				if (r)
					return r;

				r = r600_bytecode_add_tex(ctx->bc, &tex);
				if (r)
					return r;
			}

		}

		/* for cube forms of lod and bias we need to route things */
		if (inst->Instruction.Opcode == TGSI_OPCODE_TXB ||
		    inst->Instruction.Opcode == TGSI_OPCODE_TXL ||
		    inst->Instruction.Opcode == TGSI_OPCODE_TXB2 ||
		    inst->Instruction.Opcode == TGSI_OPCODE_TXL2) {
			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
			alu.op = ALU_OP1_MOV;
			if (inst->Instruction.Opcode == TGSI_OPCODE_TXB2 ||
			    inst->Instruction.Opcode == TGSI_OPCODE_TXL2)
				r600_bytecode_src(&alu.src[0], &ctx->src[1], 0);
			else
				r600_bytecode_src(&alu.src[0], &ctx->src[0], 3);
			alu.dst.sel = ctx->temp_reg;
			alu.dst.chan = 2;
			alu.last = 1;
			alu.dst.write = 1;
			r = r600_bytecode_add_alu(ctx->bc, &alu);
			if (r)
				return r;
		}

		src_loaded = TRUE;
		src_gpr = ctx->temp_reg;
	}

	if (inst->Instruction.Opcode == TGSI_OPCODE_TXD) {
		int temp_h = 0, temp_v = 0;
		int start_val = 0;

		/* if we've already loaded the src (i.e. CUBE), don't reload it. */
		if (src_loaded == TRUE)
			start_val = 1;
		else
			src_loaded = TRUE;
		for (i = start_val; i < 3; i++) {
			int treg = r600_get_temp(ctx);

			if (i == 0)
				src_gpr = treg;
			else if (i == 1)
				temp_h = treg;
			else
				temp_v = treg;

			for (j = 0; j < 4; j++) {
				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
				alu.op = ALU_OP1_MOV;
				r600_bytecode_src(&alu.src[0], &ctx->src[i], j);
				alu.dst.sel = treg;
				alu.dst.chan = j;
				if (j == 3)
					alu.last = 1;
				alu.dst.write = 1;
				r = r600_bytecode_add_alu(ctx->bc, &alu);
				if (r)
					return r;
			}
		}
		for (i = 1; i < 3; i++) {
			/* set gradients h/v */
			struct r600_bytecode_tex *t = &grad_offs[n_grad_offs++];
			memset(t, 0, sizeof(struct r600_bytecode_tex));
			t->op = (i == 1) ? FETCH_OP_SET_GRADIENTS_H :
				FETCH_OP_SET_GRADIENTS_V;
			t->sampler_id = tgsi_tex_get_src_gpr(ctx, sampler_src_reg);
			t->sampler_index_mode = sampler_index_mode;
			t->resource_id = t->sampler_id + R600_MAX_CONST_BUFFERS;
			t->resource_index_mode = sampler_index_mode;

			t->src_gpr = (i == 1) ? temp_h : temp_v;
			t->src_sel_x = 0;
			t->src_sel_y = 1;
			t->src_sel_z = 2;
			t->src_sel_w = 3;

			t->dst_gpr = r600_get_temp(ctx); /* just to avoid confusing the asm scheduler */
			t->dst_sel_x = t->dst_sel_y = t->dst_sel_z = t->dst_sel_w = 7;
			if (inst->Texture.Texture != TGSI_TEXTURE_RECT) {
				t->coord_type_x = 1;
				t->coord_type_y = 1;
				t->coord_type_z = 1;
				t->coord_type_w = 1;
			}
		}
	}

	if (inst->Instruction.Opcode == TGSI_OPCODE_TG4) {
		/* Gather4 should follow the same rules as bilinear filtering, but the hardware
		 * incorrectly forces nearest filtering if the texture format is integer.
		 * The only effect it has on Gather4, which always returns 4 texels for
		 * bilinear filtering, is that the final coordinates are off by 0.5 of
		 * the texel size.
		 *
		 * The workaround is to subtract 0.5 from the unnormalized coordinates,
		 * or (0.5 / size) from the normalized coordinates.
		 */
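		/* Worked example (illustrative): for a 128x128 normalized
		 * integer texture the fixup below subtracts 0.5 / 128 from each
		 * coordinate; for RECT (unnormalized) targets it subtracts 0.5
		 * texels directly. */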
		if (inst->Texture.ReturnType == TGSI_RETURN_TYPE_SINT ||
		    inst->Texture.ReturnType == TGSI_RETURN_TYPE_UINT) {
			int treg = r600_get_temp(ctx);

			/* mov array and comparison coordinate to temp_reg if needed */
			if ((inst->Texture.Texture == TGSI_TEXTURE_SHADOW2D ||
			     inst->Texture.Texture == TGSI_TEXTURE_2D_ARRAY ||
			     inst->Texture.Texture == TGSI_TEXTURE_SHADOW2D_ARRAY) && !src_loaded) {
				int end = inst->Texture.Texture == TGSI_TEXTURE_SHADOW2D_ARRAY ? 3 : 2;
				for (i = 2; i <= end; i++) {
					memset(&alu, 0, sizeof(struct r600_bytecode_alu));
					alu.op = ALU_OP1_MOV;
					alu.dst.sel = ctx->temp_reg;
					alu.dst.chan = i;
					alu.dst.write = 1;
					alu.last = (i == end);
					r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
					r = r600_bytecode_add_alu(ctx->bc, &alu);
					if (r)
						return r;
				}
			}

			if (inst->Texture.Texture == TGSI_TEXTURE_RECT ||
			    inst->Texture.Texture == TGSI_TEXTURE_SHADOWRECT) {
				for (i = 0; i < 2; i++) {
					memset(&alu, 0, sizeof(struct r600_bytecode_alu));
					alu.op = ALU_OP2_ADD;
					alu.dst.sel = ctx->temp_reg;
					alu.dst.chan = i;
					alu.dst.write = 1;
					alu.last = i == 1;
					if (src_loaded) {
						alu.src[0].sel = ctx->temp_reg;
						alu.src[0].chan = i;
					} else
						r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
					alu.src[1].sel = V_SQ_ALU_SRC_0_5;
					alu.src[1].neg = 1;
					r = r600_bytecode_add_alu(ctx->bc, &alu);
					if (r)
						return r;
				}
			} else {
				/* execute a TXQ */
				memset(&tex, 0, sizeof(struct r600_bytecode_tex));
				tex.op = FETCH_OP_GET_TEXTURE_RESINFO;
				tex.sampler_id = tgsi_tex_get_src_gpr(ctx, sampler_src_reg);
				tex.sampler_index_mode = sampler_index_mode;
				tex.resource_id = tex.sampler_id + R600_MAX_CONST_BUFFERS;
				tex.resource_index_mode = sampler_index_mode;
				tex.dst_gpr = treg;
				tex.src_sel_x = 4;
				tex.src_sel_y = 4;
				tex.src_sel_z = 4;
				tex.src_sel_w = 4;
				tex.dst_sel_x = 0;
				tex.dst_sel_y = 1;
				tex.dst_sel_z = 7;
				tex.dst_sel_w = 7;
				r = r600_bytecode_add_tex(ctx->bc, &tex);
				if (r)
					return r;

				/* coord.xy = -0.5 * (1.0/int_to_flt(size)) + coord.xy */
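				/* Note: on Cayman, transcendental ops such as
				 * RECIP_IEEE have no separate t-slot, so the op
				 * is replicated across the vector slots and only
				 * the lane matching the wanted channel asserts
				 * dst.write (see the CAYMAN notes at the top of
				 * this file). */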
				if (ctx->bc->gfx_level == CAYMAN) {
					for (i = 0; i < 2; i++) {
						memset(&alu, 0, sizeof(struct r600_bytecode_alu));
						alu.op = ALU_OP1_INT_TO_FLT;
						alu.dst.sel = treg;
						alu.dst.chan = i;
						alu.dst.write = 1;
						alu.src[0].sel = treg;
						alu.src[0].chan = i;
						alu.last = (i == 1) ? 1 : 0;
						r = r600_bytecode_add_alu(ctx->bc, &alu);
						if (r)
							return r;
					}
					for (j = 0; j < 2; j++) {
						for (i = 0; i < 3; i++) {
							memset(&alu, 0, sizeof(struct r600_bytecode_alu));
							alu.op = ALU_OP1_RECIP_IEEE;
							alu.src[0].sel = treg;
							alu.src[0].chan = j;
							alu.dst.sel = treg;
							alu.dst.chan = i;
							if (i == 2)
								alu.last = 1;
							if (i == j)
								alu.dst.write = 1;
							r = r600_bytecode_add_alu(ctx->bc, &alu);
							if (r)
								return r;
						}
					}
				} else {
					for (i = 0; i < 2; i++) {
						memset(&alu, 0, sizeof(struct r600_bytecode_alu));
						alu.op = ALU_OP1_INT_TO_FLT;
						alu.dst.sel = treg;
						alu.dst.chan = i;
						alu.dst.write = 1;
						alu.src[0].sel = treg;
						alu.src[0].chan = i;
						alu.last = 1;
						r = r600_bytecode_add_alu(ctx->bc, &alu);
						if (r)
							return r;
					}
					for (i = 0; i < 2; i++) {
						memset(&alu, 0, sizeof(struct r600_bytecode_alu));
						alu.op = ALU_OP1_RECIP_IEEE;
						alu.src[0].sel = treg;
						alu.src[0].chan = i;
						alu.dst.sel = treg;
						alu.dst.chan = i;
						alu.last = 1;
						alu.dst.write = 1;
						r = r600_bytecode_add_alu(ctx->bc, &alu);
						if (r)
							return r;
					}
				}
				for (i = 0; i < 2; i++) {
					memset(&alu, 0, sizeof(struct r600_bytecode_alu));
					alu.op = ALU_OP3_MULADD;
					alu.is_op3 = 1;
					alu.dst.sel = ctx->temp_reg;
					alu.dst.chan = i;
					alu.dst.write = 1;
					alu.last = i == 1;
					alu.src[0].sel = treg;
					alu.src[0].chan = i;
					alu.src[1].sel = V_SQ_ALU_SRC_0_5;
					alu.src[1].neg = 1;
					if (src_loaded) {
						alu.src[2].sel = ctx->temp_reg;
						alu.src[2].chan = i;
					} else
						r600_bytecode_src(&alu.src[2], &ctx->src[0], i);
					r = r600_bytecode_add_alu(ctx->bc, &alu);
					if (r)
						return r;
				}
			}
			src_loaded = TRUE;
			src_gpr = ctx->temp_reg;
		}
	}

	if (src_requires_loading && !src_loaded) {
		for (i = 0; i < 4; i++) {
			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
			alu.op = ALU_OP1_MOV;
			r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
			alu.dst.sel = ctx->temp_reg;
			alu.dst.chan = i;
			if (i == 3)
				alu.last = 1;
			alu.dst.write = 1;
			r = r600_bytecode_add_alu(ctx->bc, &alu);
			if (r)
				return r;
		}
		src_loaded = TRUE;
		src_gpr = ctx->temp_reg;
	}

	/* get offset values */
	if (inst->Texture.NumOffsets) {
		assert(inst->Texture.NumOffsets == 1);

		/* The texture offset feature doesn't work with the TXF instruction
		 * and must be emulated by adding the offset to the texture coordinates. */
		if (txf_add_offsets) {
			const struct tgsi_texture_offset *off = inst->TexOffsets;

			switch (inst->Texture.Texture) {
			case TGSI_TEXTURE_3D:
				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
				alu.op = ALU_OP2_ADD_INT;
				alu.src[0].sel = src_gpr;
				alu.src[0].chan = 2;
				alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
				alu.src[1].value = ctx->literals[4 * off[0].Index + off[0].SwizzleZ];
				alu.dst.sel = src_gpr;
				alu.dst.chan = 2;
				alu.dst.write = 1;
				alu.last = 1;
				r = r600_bytecode_add_alu(ctx->bc, &alu);
				if (r)
					return r;
				FALLTHROUGH;

			case TGSI_TEXTURE_2D:
			case TGSI_TEXTURE_SHADOW2D:
			case TGSI_TEXTURE_RECT:
			case TGSI_TEXTURE_SHADOWRECT:
			case TGSI_TEXTURE_2D_ARRAY:
			case TGSI_TEXTURE_SHADOW2D_ARRAY:
				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
				alu.op = ALU_OP2_ADD_INT;
				alu.src[0].sel = src_gpr;
				alu.src[0].chan = 1;
				alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
				alu.src[1].value = ctx->literals[4 * off[0].Index + off[0].SwizzleY];
				alu.dst.sel = src_gpr;
				alu.dst.chan = 1;
				alu.dst.write = 1;
				alu.last = 1;
				r = r600_bytecode_add_alu(ctx->bc, &alu);
				if (r)
					return r;
				FALLTHROUGH;

			case TGSI_TEXTURE_1D:
			case TGSI_TEXTURE_SHADOW1D:
			case TGSI_TEXTURE_1D_ARRAY:
			case TGSI_TEXTURE_SHADOW1D_ARRAY:
				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
				alu.op = ALU_OP2_ADD_INT;
				alu.src[0].sel = src_gpr;
				alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
				alu.src[1].value = ctx->literals[4 * off[0].Index + off[0].SwizzleX];
				alu.dst.sel = src_gpr;
				alu.dst.write = 1;
				alu.last = 1;
				r = r600_bytecode_add_alu(ctx->bc, &alu);
				if (r)
					return r;
				break;
				/* texture offsets do not apply to other texture targets */
			}
		} else {
			switch (inst->Texture.Texture) {
			case TGSI_TEXTURE_3D:
				offset_z = ctx->literals[4 * inst->TexOffsets[0].Index + inst->TexOffsets[0].SwizzleZ] << 1;
				FALLTHROUGH;
			case TGSI_TEXTURE_2D:
			case TGSI_TEXTURE_SHADOW2D:
			case TGSI_TEXTURE_RECT:
			case TGSI_TEXTURE_SHADOWRECT:
			case TGSI_TEXTURE_2D_ARRAY:
			case TGSI_TEXTURE_SHADOW2D_ARRAY:
				offset_y = ctx->literals[4 * inst->TexOffsets[0].Index + inst->TexOffsets[0].SwizzleY] << 1;
				FALLTHROUGH;
			case TGSI_TEXTURE_1D:
			case TGSI_TEXTURE_SHADOW1D:
			case TGSI_TEXTURE_1D_ARRAY:
			case TGSI_TEXTURE_SHADOW1D_ARRAY:
				offset_x = ctx->literals[4 * inst->TexOffsets[0].Index + inst->TexOffsets[0].SwizzleX] << 1;
			}
		}
	}
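	/* The immediate path above stores each offset shifted left by one,
	 * presumably because the fetch instruction encodes offsets in signed
	 * half-texel steps (so a 2-texel offset is stored as 4).  This is an
	 * inference from the encoding, not a statement from the ISA docs. */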
	/* Obtain the sample index for reading a compressed MSAA color texture.
	 * To read the FMASK, we use the ldfptr instruction, which tells us
	 * where the samples are stored.
	 * For uncompressed 8x MSAA surfaces, ldfptr should return 0x76543210,
	 * which is the identity mapping. Each nibble says which physical sample
	 * should be fetched to get that sample.
	 *
	 * Assume src.z contains the sample index. It should be modified like this:
	 *   src.z = (ldfptr() >> (src.z * 4)) & 0xF;
	 * Then fetch the texel with src.
	 */
	if (read_compressed_msaa) {
		unsigned sample_chan = 3;
		unsigned temp = r600_get_temp(ctx);
		assert(src_loaded);

		/* temp.w = ldfptr() */
		memset(&tex, 0, sizeof(struct r600_bytecode_tex));
		tex.op = FETCH_OP_LD;
		tex.inst_mod = 1; /* to indicate this is ldfptr */
		tex.sampler_id = tgsi_tex_get_src_gpr(ctx, sampler_src_reg);
		tex.sampler_index_mode = sampler_index_mode;
		tex.resource_id = tex.sampler_id + R600_MAX_CONST_BUFFERS;
		tex.resource_index_mode = sampler_index_mode;
		tex.src_gpr = src_gpr;
		tex.dst_gpr = temp;
		tex.dst_sel_x = 7; /* mask out these components */
		tex.dst_sel_y = 7;
		tex.dst_sel_z = 7;
		tex.dst_sel_w = 0; /* store X */
		tex.src_sel_x = 0;
		tex.src_sel_y = 1;
		tex.src_sel_z = 2;
		tex.src_sel_w = 3;
		tex.offset_x = offset_x;
		tex.offset_y = offset_y;
		tex.offset_z = offset_z;
		r = r600_bytecode_add_tex(ctx->bc, &tex);
		if (r)
			return r;

		/* temp.x = sample_index*4 */
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP2_MULLO_INT;
		alu.src[0].sel = src_gpr;
		alu.src[0].chan = sample_chan;
		alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
		alu.src[1].value = 4;
		alu.dst.sel = temp;
		alu.dst.chan = 0;
		alu.dst.write = 1;
		r = emit_mul_int_op(ctx->bc, &alu);
		if (r)
			return r;

		/* sample_index = temp.w >> temp.x */
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP2_LSHR_INT;
		alu.src[0].sel = temp;
		alu.src[0].chan = 3;
		alu.src[1].sel = temp;
		alu.src[1].chan = 0;
		alu.dst.sel = src_gpr;
		alu.dst.chan = sample_chan;
		alu.dst.write = 1;
		alu.last = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;

		/* sample_index & 0xF */
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP2_AND_INT;
		alu.src[0].sel = src_gpr;
		alu.src[0].chan = sample_chan;
		alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
		alu.src[1].value = 0xF;
		alu.dst.sel = src_gpr;
		alu.dst.chan = sample_chan;
		alu.dst.write = 1;
		alu.last = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
#if 0
		/* visualize the FMASK */
		for (i = 0; i < 4; i++) {
			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
			alu.op = ALU_OP1_INT_TO_FLT;
			alu.src[0].sel = src_gpr;
			alu.src[0].chan = sample_chan;
			alu.dst.sel = ctx->file_offset[inst->Dst[0].Register.File] + inst->Dst[0].Register.Index;
			alu.dst.chan = i;
			alu.dst.write = 1;
			alu.last = 1;
			r = r600_bytecode_add_alu(ctx->bc, &alu);
			if (r)
				return r;
		}
		return 0;
#endif
	}
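	/* Worked example (illustrative): with the identity FMASK 0x76543210,
	 * requesting sample 5 computes (0x76543210 >> (5 * 4)) & 0xF = 5; on a
	 * compressed surface the selected nibble instead names the physical
	 * sample that actually holds the data. */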
	/* does this shader want the number of layers from TXQ for a cube array? */
	if (has_txq_cube_array_z) {
		int id = tgsi_tex_get_src_gpr(ctx, sampler_src_reg);

		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP1_MOV;

		alu.src[0].sel = R600_SHADER_BUFFER_INFO_SEL;
		if (ctx->bc->gfx_level >= EVERGREEN) {
			/* with eg each dword is the number of cubes */
			alu.src[0].sel += id / 4;
			alu.src[0].chan = id % 4;
		} else {
			/* on r600 we have them at channel 2 of the second dword */
			alu.src[0].sel += (id * 2) + 1;
			alu.src[0].chan = 2;
		}
		alu.src[0].kc_bank = R600_BUFFER_INFO_CONST_BUFFER;
		tgsi_dst(ctx, &inst->Dst[0], 2, &alu.dst);
		alu.last = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
		/* disable writemask from texture instruction */
		inst->Dst[0].Register.WriteMask &= ~4;
	}
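	/* Addressing example (derived from the code above): on evergreen,
	 * id 5 selects constant R600_SHADER_BUFFER_INFO_SEL + 5 / 4 and
	 * channel 5 % 4 = 1, i.e. the .y component of the second buffer-info
	 * constant. */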
	opcode = ctx->inst_info->op;
	if (opcode == FETCH_OP_GATHER4 &&
	    inst->TexOffsets[0].File != TGSI_FILE_NULL &&
	    inst->TexOffsets[0].File != TGSI_FILE_IMMEDIATE) {
		struct r600_bytecode_tex *t;
		opcode = FETCH_OP_GATHER4_O;

		/* GATHER4_O/GATHER4_C_O use offset values loaded by
		   SET_TEXTURE_OFFSETS instruction. The immediate offset values
		   encoded in the instruction are ignored. */
		t = &grad_offs[n_grad_offs++];
		memset(t, 0, sizeof(struct r600_bytecode_tex));
		t->op = FETCH_OP_SET_TEXTURE_OFFSETS;
		t->sampler_id = tgsi_tex_get_src_gpr(ctx, sampler_src_reg);
		t->sampler_index_mode = sampler_index_mode;
		t->resource_id = t->sampler_id + R600_MAX_CONST_BUFFERS;
		t->resource_index_mode = sampler_index_mode;

		t->src_gpr = ctx->file_offset[inst->TexOffsets[0].File] + inst->TexOffsets[0].Index;
		t->src_sel_x = inst->TexOffsets[0].SwizzleX;
		t->src_sel_y = inst->TexOffsets[0].SwizzleY;
		if (inst->Texture.Texture == TGSI_TEXTURE_2D_ARRAY ||
		    inst->Texture.Texture == TGSI_TEXTURE_SHADOW2D_ARRAY)
			/* make sure the array index selector is 0; this is just a safety
			 * precaution because TGSI seems to emit something strange here */
			t->src_sel_z = 4;
		else
			t->src_sel_z = inst->TexOffsets[0].SwizzleZ;

		t->src_sel_w = 4;

		t->dst_sel_x = 7;
		t->dst_sel_y = 7;
		t->dst_sel_z = 7;
		t->dst_sel_w = 7;
	}

	if (inst->Texture.Texture == TGSI_TEXTURE_SHADOW1D ||
	    inst->Texture.Texture == TGSI_TEXTURE_SHADOW2D ||
	    inst->Texture.Texture == TGSI_TEXTURE_SHADOWRECT ||
	    inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE ||
	    inst->Texture.Texture == TGSI_TEXTURE_SHADOW1D_ARRAY ||
	    inst->Texture.Texture == TGSI_TEXTURE_SHADOW2D_ARRAY ||
	    inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE_ARRAY) {
		switch (opcode) {
		case FETCH_OP_SAMPLE:
			opcode = FETCH_OP_SAMPLE_C;
			break;
		case FETCH_OP_SAMPLE_L:
			opcode = FETCH_OP_SAMPLE_C_L;
			break;
		case FETCH_OP_SAMPLE_LB:
			opcode = FETCH_OP_SAMPLE_C_LB;
			break;
		case FETCH_OP_SAMPLE_G:
			opcode = FETCH_OP_SAMPLE_C_G;
			break;
		/* Texture gather variants */
		case FETCH_OP_GATHER4:
			opcode = FETCH_OP_GATHER4_C;
			break;
		case FETCH_OP_GATHER4_O:
			opcode = FETCH_OP_GATHER4_C_O;
			break;
		}
	}

	memset(&tex, 0, sizeof(struct r600_bytecode_tex));
	tex.op = opcode;

	tex.sampler_id = tgsi_tex_get_src_gpr(ctx, sampler_src_reg);
	tex.sampler_index_mode = sampler_index_mode;
	tex.resource_id = tex.sampler_id + R600_MAX_CONST_BUFFERS;
	tex.resource_index_mode = sampler_index_mode;
	tex.src_gpr = src_gpr;
	tex.dst_gpr = ctx->file_offset[inst->Dst[0].Register.File] + inst->Dst[0].Register.Index;

	if (inst->Instruction.Opcode == TGSI_OPCODE_DDX_FINE ||
	    inst->Instruction.Opcode == TGSI_OPCODE_DDY_FINE) {
		tex.inst_mod = 1; /* per pixel gradient calculation instead of per 2x2 quad */
	}

	if (inst->Instruction.Opcode == TGSI_OPCODE_TG4) {
		if (inst->Src[1].Register.File != TGSI_FILE_IMMEDIATE) {
			/* TGSI doesn't have a spot to put the component for
			 * shadowcubes, so it drops it on the floor. Just
			 * assume the user wanted component 0 (it's a shadow,
			 * anything else would be absurd).
			 */
			assert(inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE_ARRAY);
			tex.inst_mod = 0;
		} else {
			int8_t texture_component_select = ctx->literals[4 * inst->Src[1].Register.Index + inst->Src[1].Register.SwizzleX];
			tex.inst_mod = texture_component_select;
		}

		if (ctx->bc->gfx_level == CAYMAN) {
			tex.dst_sel_x = (inst->Dst[0].Register.WriteMask & 1) ? 0 : 7;
			tex.dst_sel_y = (inst->Dst[0].Register.WriteMask & 2) ? 1 : 7;
			tex.dst_sel_z = (inst->Dst[0].Register.WriteMask & 4) ? 2 : 7;
			tex.dst_sel_w = (inst->Dst[0].Register.WriteMask & 8) ? 3 : 7;
		} else {
			/* GATHER4 result order is different from TGSI TG4 */
			tex.dst_sel_x = (inst->Dst[0].Register.WriteMask & 1) ? 1 : 7;
			tex.dst_sel_y = (inst->Dst[0].Register.WriteMask & 2) ? 2 : 7;
			tex.dst_sel_z = (inst->Dst[0].Register.WriteMask & 4) ? 0 : 7;
			tex.dst_sel_w = (inst->Dst[0].Register.WriteMask & 8) ? 3 : 7;
		}
	}
	else if (inst->Instruction.Opcode == TGSI_OPCODE_LODQ) {
		tex.dst_sel_x = (inst->Dst[0].Register.WriteMask & 2) ? 1 : 7;
		tex.dst_sel_y = (inst->Dst[0].Register.WriteMask & 1) ? 0 : 7;
		tex.dst_sel_z = 7;
		tex.dst_sel_w = 7;
	}
	else if (inst->Instruction.Opcode == TGSI_OPCODE_TXQS) {
		tex.dst_sel_x = 3;
		tex.dst_sel_y = 7;
		tex.dst_sel_z = 7;
		tex.dst_sel_w = 7;
	}
	else {
		tex.dst_sel_x = (inst->Dst[0].Register.WriteMask & 1) ? 0 : 7;
		tex.dst_sel_y = (inst->Dst[0].Register.WriteMask & 2) ? 1 : 7;
		tex.dst_sel_z = (inst->Dst[0].Register.WriteMask & 4) ? 2 : 7;
		tex.dst_sel_w = (inst->Dst[0].Register.WriteMask & 8) ? 3 : 7;
	}

	if (inst->Instruction.Opcode == TGSI_OPCODE_TXQS) {
		tex.src_sel_x = 4;
		tex.src_sel_y = 4;
		tex.src_sel_z = 4;
		tex.src_sel_w = 4;
	} else if (src_loaded) {
		tex.src_sel_x = 0;
		tex.src_sel_y = 1;
		tex.src_sel_z = 2;
		tex.src_sel_w = 3;
	} else {
		tex.src_sel_x = ctx->src[0].swizzle[0];
		tex.src_sel_y = ctx->src[0].swizzle[1];
		tex.src_sel_z = ctx->src[0].swizzle[2];
		tex.src_sel_w = ctx->src[0].swizzle[3];
		tex.src_rel = ctx->src[0].rel;
	}

	if (inst->Texture.Texture == TGSI_TEXTURE_CUBE ||
	    inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE ||
	    inst->Texture.Texture == TGSI_TEXTURE_CUBE_ARRAY ||
	    inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE_ARRAY) {
		tex.src_sel_x = 1;
		tex.src_sel_y = 0;
		tex.src_sel_z = 3;
		tex.src_sel_w = 2; /* route Z compare or Lod value into W */
	}

	if (inst->Texture.Texture != TGSI_TEXTURE_RECT &&
	    inst->Texture.Texture != TGSI_TEXTURE_SHADOWRECT) {
		tex.coord_type_x = 1;
		tex.coord_type_y = 1;
	}
	tex.coord_type_z = 1;
	tex.coord_type_w = 1;

	tex.offset_x = offset_x;
	tex.offset_y = offset_y;
	if (inst->Instruction.Opcode == TGSI_OPCODE_TG4 &&
	    (inst->Texture.Texture == TGSI_TEXTURE_2D_ARRAY ||
	     inst->Texture.Texture == TGSI_TEXTURE_SHADOW2D_ARRAY)) {
		tex.offset_z = 0;
	}
	else {
		tex.offset_z = offset_z;
	}

	/* Put the depth for comparison in W.
	 * TGSI_TEXTURE_SHADOW2D_ARRAY already has the depth in W.
	 * Some instructions expect the depth in Z. */
	if ((inst->Texture.Texture == TGSI_TEXTURE_SHADOW1D ||
	     inst->Texture.Texture == TGSI_TEXTURE_SHADOW2D ||
	     inst->Texture.Texture == TGSI_TEXTURE_SHADOWRECT ||
	     inst->Texture.Texture == TGSI_TEXTURE_SHADOW1D_ARRAY) &&
	    opcode != FETCH_OP_SAMPLE_C_L &&
	    opcode != FETCH_OP_SAMPLE_C_LB) {
		tex.src_sel_w = tex.src_sel_z;
	}

	if (inst->Texture.Texture == TGSI_TEXTURE_1D_ARRAY ||
	    inst->Texture.Texture == TGSI_TEXTURE_SHADOW1D_ARRAY) {
		if (opcode == FETCH_OP_SAMPLE_C_L ||
		    opcode == FETCH_OP_SAMPLE_C_LB) {
			/* the array index is read from Y */
			tex.coord_type_y = 0;
			array_index_offset_channel = tex.src_sel_y;
		} else {
			/* the array index is read from Z */
			tex.coord_type_z = 0;
			tex.src_sel_z = tex.src_sel_y;
			array_index_offset_channel = tex.src_sel_z;
		}
	} else if (inst->Texture.Texture == TGSI_TEXTURE_2D_ARRAY ||
		   inst->Texture.Texture == TGSI_TEXTURE_SHADOW2D_ARRAY) {
		tex.coord_type_z = 0;
		array_index_offset_channel = tex.src_sel_z;
	} else if ((inst->Texture.Texture == TGSI_TEXTURE_CUBE_ARRAY ||
		    inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE_ARRAY) &&
		   (ctx->bc->gfx_level >= EVERGREEN))
		/* the array index is read from Z, coordinate will be corrected elsewhere */
		tex.coord_type_z = 0;

	/* We have array access to a 1D or 2D array; the coordinates are not
	 * integers, so evaluate (round) the array index here */
	if (array_index_offset_channel >= 0 &&
	    opcode != FETCH_OP_LD &&
	    opcode != FETCH_OP_GET_TEXTURE_RESINFO) {
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.src[0].sel = tex.src_gpr;
		alu.src[0].chan = array_index_offset_channel;
		alu.src[0].rel = tex.src_rel;
		alu.op = ALU_OP1_RNDNE;
		alu.dst.sel = tex.src_gpr;
		alu.dst.chan = array_index_offset_channel;
		alu.dst.rel = tex.src_rel;
		alu.dst.write = 1;
		alu.last = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}

	/* mask unused source components */
	if (opcode == FETCH_OP_SAMPLE || opcode == FETCH_OP_GATHER4) {
		switch (inst->Texture.Texture) {
		case TGSI_TEXTURE_2D:
		case TGSI_TEXTURE_RECT:
			tex.src_sel_z = 7;
			tex.src_sel_w = 7;
			break;
		case TGSI_TEXTURE_1D_ARRAY:
			tex.src_sel_y = 7;
			tex.src_sel_w = 7;
			break;
		case TGSI_TEXTURE_1D:
			tex.src_sel_y = 7;
			tex.src_sel_z = 7;
			tex.src_sel_w = 7;
			break;
		}
	}

	/* Emit set gradient and offset instructions. */
	for (i = 0; i < n_grad_offs; ++i) {
		r = r600_bytecode_add_tex(ctx->bc, &grad_offs[i]);
		if (r)
			return r;
	}

	r = r600_bytecode_add_tex(ctx->bc, &tex);
	if (r)
		return r;

	/* add shadow ambient support - gallium doesn't do it yet */
	return 0;
}

static int find_hw_atomic_counter(struct r600_shader_ctx *ctx,
				  struct tgsi_full_src_register *src)
{
	unsigned i;

	uint32_t index = src->Register.Index;
	for (i = 0; i < ctx->shader->nhwatomic_ranges; i++) {
		if (ctx->shader->atomics[i].buffer_id != (unsigned)src->Dimension.Index)
			continue;
		if (index > ctx->shader->atomics[i].end)
			continue;
		if (index < ctx->shader->atomics[i].start)
			continue;
		uint32_t offset = (index - ctx->shader->atomics[i].start);
		return ctx->shader->atomics[i].hw_idx + offset;
	}
	assert(0);
	return -1;
}
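/* Usage sketch for find_hw_atomic_counter (hypothetical values): given a
 * range { buffer_id 0, start 4, end 7, hw_idx 2 }, a HW_ATOMIC source with
 * Dimension.Index 0 and Register.Index 6 resolves to hardware counter
 * 2 + (6 - 4) = 4. */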
static int tgsi_set_gds_temp(struct r600_shader_ctx *ctx,
			     int *uav_id_p, int *uav_index_mode_p)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	int uav_id, uav_index_mode = 0;
	int r;
	bool is_cm = (ctx->bc->gfx_level == CAYMAN);

	uav_id = find_hw_atomic_counter(ctx, &inst->Src[0]);

	if (inst->Src[0].Register.Indirect) {
		if (is_cm) {
			struct r600_bytecode_alu alu;
			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
			alu.op = ALU_OP2_LSHL_INT;
			alu.src[0].sel = get_address_file_reg(ctx, inst->Src[0].Indirect.Index);
			alu.src[0].chan = 0;
			alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
			alu.src[1].value = 2;
			alu.dst.sel = ctx->temp_reg;
			alu.dst.chan = 0;
			alu.dst.write = 1;
			alu.last = 1;
			r = r600_bytecode_add_alu(ctx->bc, &alu);
			if (r)
				return r;

			r = single_alu_op2(ctx, ALU_OP2_ADD_INT,
					   ctx->temp_reg, 0,
					   ctx->temp_reg, 0,
					   V_SQ_ALU_SRC_LITERAL, uav_id * 4);
			if (r)
				return r;
		} else
			uav_index_mode = 2;
	} else if (is_cm) {
		r = single_alu_op2(ctx, ALU_OP1_MOV,
				   ctx->temp_reg, 0,
				   V_SQ_ALU_SRC_LITERAL, uav_id * 4,
				   0, 0);
		if (r)
			return r;
	}
	*uav_id_p = uav_id;
	*uav_index_mode_p = uav_index_mode;
	return 0;
}

static int tgsi_load_gds(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	int r;
	struct r600_bytecode_gds gds;
	int uav_id = 0;
	int uav_index_mode = 0;
	bool is_cm = (ctx->bc->gfx_level == CAYMAN);

	r = tgsi_set_gds_temp(ctx, &uav_id, &uav_index_mode);
	if (r)
		return r;

	memset(&gds, 0, sizeof(struct r600_bytecode_gds));
	gds.op = FETCH_OP_GDS_READ_RET;
	gds.dst_gpr = ctx->file_offset[inst->Dst[0].Register.File] + inst->Dst[0].Register.Index;
	gds.uav_id = is_cm ? 0 : uav_id;
	gds.uav_index_mode = is_cm ? 0 : uav_index_mode;
	gds.src_gpr = ctx->temp_reg;
	gds.src_sel_x = (is_cm) ? 0 : 4;
	gds.src_sel_y = 4;
	gds.src_sel_z = 4;
	gds.dst_sel_x = 0;
	gds.dst_sel_y = 7;
	gds.dst_sel_z = 7;
	gds.dst_sel_w = 7;
	gds.src_gpr2 = 0;
	gds.alloc_consume = !is_cm;
	r = r600_bytecode_add_gds(ctx->bc, &gds);
	if (r)
		return r;

	ctx->bc->cf_last->vpm = 1;
	return 0;
}

/* this fixes up 1D arrays properly */
static int load_index_src(struct r600_shader_ctx *ctx, int src_index, int *idx_gpr)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	int r, i;
	struct r600_bytecode_alu alu;
	int temp_reg = r600_get_temp(ctx);

	for (i = 0; i < 4; i++) {
		bool def_val = true, write_zero = false;
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP1_MOV;
		alu.dst.sel = temp_reg;
		alu.dst.chan = i;

		switch (inst->Memory.Texture) {
		case TGSI_TEXTURE_BUFFER:
		case TGSI_TEXTURE_1D:
			if (i == 1 || i == 2 || i == 3) {
				write_zero = true;
			}
			break;
		case TGSI_TEXTURE_1D_ARRAY:
			if (i == 1 || i == 3)
				write_zero = true;
			else if (i == 2) {
				r600_bytecode_src(&alu.src[0], &ctx->src[src_index], 1);
				def_val = false;
			}
			break;
		case TGSI_TEXTURE_2D:
			if (i == 2 || i == 3)
				write_zero = true;
			break;
		default:
			if (i == 3)
				write_zero = true;
			break;
		}

		if (write_zero) {
			alu.src[0].sel = V_SQ_ALU_SRC_LITERAL;
			alu.src[0].value = 0;
		} else if (def_val) {
			r600_bytecode_src(&alu.src[0], &ctx->src[src_index], i);
		}

		if (i == 3)
			alu.last = 1;
		alu.dst.write = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}
	*idx_gpr = temp_reg;
	return 0;
}
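/* Effect of load_index_src (illustrative): for TGSI_TEXTURE_1D_ARRAY the
 * incoming (u, layer) pair is rewritten as (u, 0, layer, 0), moving the
 * layer into the Z channel the RAT/image path expects and zeroing the
 * unused channels. */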
static int load_buffer_coord(struct r600_shader_ctx *ctx, int src_idx,
			     int temp_reg)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	int r;
	if (inst->Src[src_idx].Register.File == TGSI_FILE_IMMEDIATE) {
		int value = (ctx->literals[4 * inst->Src[src_idx].Register.Index + inst->Src[src_idx].Register.SwizzleX]);
		r = single_alu_op2(ctx, ALU_OP1_MOV,
				   temp_reg, 0,
				   V_SQ_ALU_SRC_LITERAL, value >> 2,
				   0, 0);
		if (r)
			return r;
	} else {
		struct r600_bytecode_alu alu;
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP2_LSHR_INT;
		r600_bytecode_src(&alu.src[0], &ctx->src[src_idx], 0);
		alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
		alu.src[1].value = 2;
		alu.dst.sel = temp_reg;
		alu.dst.write = 1;
		alu.last = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}
	return 0;
}

/* ADDR[1,2] are stored in index_reg[0,1] on EG, and can be used for indexing
 * images and ssbos. We assume that indirects are indexed by ADDR[2], as that's
 * what GLSL-to-TGSI emitted.
 */
static unsigned tgsi_indirect_to_rat_index_mode(struct tgsi_ind_register ind)
{
	if (ind.File == TGSI_FILE_NULL)
		return 0; /* CF_INDEX_NONE */
	else {
		assert(ind.Index == 2);
		return 2; /* CF_INDEX_1 */
	}
}

static int tgsi_load_buffer(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	/* have to work out the offset into the RAT immediate return buffer */
	struct r600_bytecode_vtx vtx;
	struct r600_bytecode_cf *cf;
	int r;
	int temp_reg = r600_get_temp(ctx);
	unsigned rat_index_mode = tgsi_indirect_to_rat_index_mode(inst->Src[0].Indirect);
	unsigned base;

	base = R600_IMAGE_REAL_RESOURCE_OFFSET + ctx->info.file_count[TGSI_FILE_IMAGE];

	r = load_buffer_coord(ctx, 1, temp_reg);
	if (r)
		return r;
	ctx->bc->cf_last->barrier = 1;
	memset(&vtx, 0, sizeof(struct r600_bytecode_vtx));
	vtx.op = FETCH_OP_VFETCH;
	vtx.buffer_id = inst->Src[0].Register.Index + base;
	vtx.buffer_index_mode = rat_index_mode;
	vtx.fetch_type = SQ_VTX_FETCH_NO_INDEX_OFFSET;
	vtx.src_gpr = temp_reg;
	vtx.src_sel_x = 0;
	vtx.dst_gpr = ctx->file_offset[inst->Dst[0].Register.File] + inst->Dst[0].Register.Index;
	vtx.dst_sel_x = (inst->Dst[0].Register.WriteMask & 1) ? 0 : 7; /* SEL_X */
	vtx.dst_sel_y = (inst->Dst[0].Register.WriteMask & 2) ? 1 : 7; /* SEL_Y */
	vtx.dst_sel_z = (inst->Dst[0].Register.WriteMask & 4) ? 2 : 7; /* SEL_Z */
	vtx.dst_sel_w = (inst->Dst[0].Register.WriteMask & 8) ? 3 : 7; /* SEL_W */
	vtx.num_format_all = 1;
	vtx.format_comp_all = 1;
	vtx.srf_mode_all = 0;

	if (inst->Dst[0].Register.WriteMask & 8) {
		vtx.data_format = FMT_32_32_32_32;
		vtx.use_const_fields = 0;
	} else if (inst->Dst[0].Register.WriteMask & 4) {
		vtx.data_format = FMT_32_32_32;
		vtx.use_const_fields = 0;
	} else if (inst->Dst[0].Register.WriteMask & 2) {
		vtx.data_format = FMT_32_32;
		vtx.use_const_fields = 0;
	} else {
		vtx.data_format = FMT_32;
		vtx.use_const_fields = 0;
	}

	r = r600_bytecode_add_vtx_tc(ctx->bc, &vtx);
	if (r)
		return r;
	cf = ctx->bc->cf_last;
	cf->barrier = 1;
	return 0;
}
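/* How the RAT read-back below works (summarizing the code, not the ISA
 * docs): tgsi_load_rat first issues a MEM_RAT NOP_RTN export, which makes
 * the RAT unit deposit the fetched element into a per-thread return slot;
 * once the ack has been awaited, a VFETCH from the immediate resource,
 * indexed via the thread id register, copies that slot into the
 * destination GPR. */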
static int tgsi_load_rat(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	/* have to work out the offset into the RAT immediate return buffer */
	struct r600_bytecode_vtx vtx;
	struct r600_bytecode_cf *cf;
	int r;
	int idx_gpr;
	unsigned format, num_format, format_comp, endian;
	const struct util_format_description *desc;
	unsigned rat_index_mode = tgsi_indirect_to_rat_index_mode(inst->Src[0].Indirect);
	unsigned immed_base;

	immed_base = R600_IMAGE_IMMED_RESOURCE_OFFSET;
	r = load_index_src(ctx, 1, &idx_gpr);
	if (r)
		return r;

	if (rat_index_mode)
		egcm_load_index_reg(ctx->bc, 1, false);

	r600_bytecode_add_cfinst(ctx->bc, CF_OP_MEM_RAT);
	cf = ctx->bc->cf_last;

	cf->rat.id = ctx->shader->rat_base + inst->Src[0].Register.Index;
	cf->rat.inst = V_RAT_INST_NOP_RTN;
	cf->rat.index_mode = rat_index_mode;
	cf->output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_READ_IND;
	cf->output.gpr = ctx->thread_id_gpr;
	cf->output.index_gpr = idx_gpr;
	cf->output.comp_mask = 0xf;
	cf->output.burst_count = 1;
	cf->vpm = 1;
	cf->barrier = 1;
	cf->mark = 1;
	cf->output.elem_size = 0;

	r600_bytecode_add_ack(ctx->bc);
	r600_bytecode_wait_acks(ctx->bc);

	desc = util_format_description(inst->Memory.Format);
	r600_vertex_data_type(inst->Memory.Format,
			      &format, &num_format, &format_comp, &endian);
	memset(&vtx, 0, sizeof(struct r600_bytecode_vtx));
	vtx.op = FETCH_OP_VFETCH;
	vtx.buffer_id = immed_base + inst->Src[0].Register.Index;
	vtx.buffer_index_mode = rat_index_mode;
	vtx.fetch_type = SQ_VTX_FETCH_NO_INDEX_OFFSET;
	vtx.src_gpr = ctx->thread_id_gpr;
	vtx.src_sel_x = 1;
	vtx.dst_gpr = ctx->file_offset[inst->Dst[0].Register.File] + inst->Dst[0].Register.Index;
	vtx.dst_sel_x = desc->swizzle[0];
	vtx.dst_sel_y = desc->swizzle[1];
	vtx.dst_sel_z = desc->swizzle[2];
	vtx.dst_sel_w = desc->swizzle[3];
	vtx.srf_mode_all = 1;
	vtx.data_format = format;
	vtx.num_format_all = num_format;
	vtx.format_comp_all = format_comp;
	vtx.endian = endian;
	vtx.offset = 0;
	vtx.mega_fetch_count = 3;
	r = r600_bytecode_add_vtx_tc(ctx->bc, &vtx);
	if (r)
		return r;
	cf = ctx->bc->cf_last;
	cf->barrier = 1;
	return 0;
}

static int tgsi_load_lds(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	struct r600_bytecode_alu alu;
	int r;
	int temp_reg = r600_get_temp(ctx);

	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
	alu.op = ALU_OP1_MOV;
	r600_bytecode_src(&alu.src[0], &ctx->src[1], 0);
	alu.dst.sel = temp_reg;
	alu.dst.write = 1;
	alu.last = 1;
	r = r600_bytecode_add_alu(ctx->bc, &alu);
	if (r)
		return r;

	r = do_lds_fetch_values(ctx, temp_reg,
				ctx->file_offset[inst->Dst[0].Register.File] + inst->Dst[0].Register.Index, inst->Dst[0].Register.WriteMask);
	if (r)
		return r;
	return 0;
}

static int tgsi_load(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	if (inst->Src[0].Register.File == TGSI_FILE_IMAGE)
		return tgsi_load_rat(ctx);
	if (inst->Src[0].Register.File == TGSI_FILE_HW_ATOMIC)
		return tgsi_load_gds(ctx);
	if (inst->Src[0].Register.File == TGSI_FILE_BUFFER)
		return tgsi_load_buffer(ctx);
	if (inst->Src[0].Register.File == TGSI_FILE_MEMORY)
		return tgsi_load_lds(ctx);
	return 0;
}

static int tgsi_store_buffer_rat(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	struct r600_bytecode_cf *cf;
	int r, i;
	unsigned rat_index_mode = tgsi_indirect_to_rat_index_mode(inst->Dst[0].Indirect);
	int lasti;
	int temp_reg = r600_get_temp(ctx), treg2 = r600_get_temp(ctx);

	r = load_buffer_coord(ctx, 0, treg2);
	if (r)
		return r;

	if (rat_index_mode)
		egcm_load_index_reg(ctx->bc, 1, false);

	for (i = 0; i <= 3; i++) {
		struct r600_bytecode_alu alu;
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP1_MOV;
		alu.dst.sel = temp_reg;
		alu.dst.chan = i;
		alu.src[0].sel = V_SQ_ALU_SRC_0;
		alu.last = (i == 3);
		alu.dst.write = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}

	cf = NULL;
	lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
	for (i = 0; i <= lasti; i++) {
		struct r600_bytecode_alu alu;
		if (!((1 << i) & inst->Dst[0].Register.WriteMask))
			continue;

		r = single_alu_op2(ctx, ALU_OP2_ADD_INT,
				   temp_reg, 0,
				   treg2, 0,
				   V_SQ_ALU_SRC_LITERAL, i);
		if (r)
			return r;

		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP1_MOV;
		alu.dst.sel = ctx->temp_reg;
		alu.dst.chan = 0;

		r600_bytecode_src(&alu.src[0], &ctx->src[1], i);
		alu.last = 1;
		alu.dst.write = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;

		r600_bytecode_add_cfinst(ctx->bc, CF_OP_MEM_RAT);
		cf = ctx->bc->cf_last;

		cf->rat.id = ctx->shader->rat_base + inst->Dst[0].Register.Index + ctx->info.file_count[TGSI_FILE_IMAGE];
		cf->rat.inst = V_RAT_INST_STORE_TYPED;
		cf->rat.index_mode = rat_index_mode;
		cf->output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_WRITE_IND;
		cf->output.gpr = ctx->temp_reg;
		cf->output.index_gpr = temp_reg;
		cf->output.comp_mask = 1;
		cf->output.burst_count = 1;
		cf->vpm = 1;
		cf->barrier = 1;
		cf->output.elem_size = 0;
	}

	/* Request an ack from the last write emitted. */
	if (cf) {
		cf->mark = true;
		cf->output.type = r600_bytecode_write_export_ack_type(ctx->bc, true);
		r600_bytecode_add_ack(ctx->bc);
	}

	return 0;
}

static int tgsi_store_rat(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	struct r600_bytecode_cf *cf;
	bool src_requires_loading = false;
	int val_gpr, idx_gpr;
	int r, i;
	unsigned rat_index_mode = tgsi_indirect_to_rat_index_mode(inst->Dst[0].Indirect);

	r = load_index_src(ctx, 0, &idx_gpr);
	if (r)
		return r;

	if (inst->Src[1].Register.File != TGSI_FILE_TEMPORARY)
		src_requires_loading = true;

	if (src_requires_loading) {
		struct r600_bytecode_alu alu;
		for (i = 0; i < 4; i++) {
			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
			alu.op = ALU_OP1_MOV;
			alu.dst.sel = ctx->temp_reg;
			alu.dst.chan = i;

			r600_bytecode_src(&alu.src[0], &ctx->src[1], i);
			if (i == 3)
				alu.last = 1;
			alu.dst.write = 1;
			r = r600_bytecode_add_alu(ctx->bc, &alu);
			if (r)
				return r;
		}
		val_gpr = ctx->temp_reg;
	} else
		val_gpr = tgsi_tex_get_src_gpr(ctx, 1);
	if (rat_index_mode)
		egcm_load_index_reg(ctx->bc, 1, false);

	r600_bytecode_add_cfinst(ctx->bc, CF_OP_MEM_RAT);
	cf = ctx->bc->cf_last;

	cf->rat.id = ctx->shader->rat_base + inst->Dst[0].Register.Index;
	cf->rat.inst = V_RAT_INST_STORE_TYPED;
	cf->rat.index_mode = rat_index_mode;
	cf->output.type = r600_bytecode_write_export_ack_type(ctx->bc, true);
	cf->output.gpr = val_gpr;
	cf->output.index_gpr = idx_gpr;
	cf->output.comp_mask = 0xf;
	cf->output.burst_count = 1;
	cf->vpm = 1;
	cf->barrier = 1;
	cf->output.elem_size = 0;
	cf->mark = 1;

	r600_bytecode_add_ack(ctx->bc);

	return 0;
}

static int tgsi_store_lds(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	struct r600_bytecode_alu alu;
	int r, i, lasti;
	int write_mask = inst->Dst[0].Register.WriteMask;
	int temp_reg = r600_get_temp(ctx);

	/* LDS write */
	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
	alu.op = ALU_OP1_MOV;
	r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
	alu.dst.sel = temp_reg;
	alu.dst.write = 1;
	alu.last = 1;
	r = r600_bytecode_add_alu(ctx->bc, &alu);
	if (r)
		return r;

	lasti = tgsi_last_instruction(write_mask);
	for (i = 1; i <= lasti; i++) {
		if (!(write_mask & (1 << i)))
			continue;
		r = single_alu_op2(ctx, ALU_OP2_ADD_INT,
				   temp_reg, i,
				   temp_reg, 0,
				   V_SQ_ALU_SRC_LITERAL, 4 * i);
		if (r)
			return r;
	}
	for (i = 0; i <= lasti; i++) {
		if (!(write_mask & (1 << i)))
			continue;

		if ((i == 0 && ((write_mask & 3) == 3)) ||
		    (i == 2 && ((write_mask & 0xc) == 0xc))) {
			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
			alu.op = LDS_OP3_LDS_WRITE_REL;

			alu.src[0].sel = temp_reg;
			alu.src[0].chan = i;
			r600_bytecode_src(&alu.src[1], &ctx->src[1], i);
			r600_bytecode_src(&alu.src[2], &ctx->src[1], i + 1);
			alu.last = 1;
			alu.is_lds_idx_op = true;
			alu.lds_idx = 1;
			r = r600_bytecode_add_alu(ctx->bc, &alu);
			if (r)
				return r;
			i += 1;
			continue;
		}
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = LDS_OP2_LDS_WRITE;

		alu.src[0].sel = temp_reg;
		alu.src[0].chan = i;
		r600_bytecode_src(&alu.src[1], &ctx->src[1], i);

		alu.last = 1;
		alu.is_lds_idx_op = true;

		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}
	return 0;
}
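/* Addressing sketch for tgsi_store_lds (illustrative): with write_mask 0xf,
 * temp_reg ends up holding the addresses { base, base + 4, base + 8,
 * base + 12 } in channels 0..3; channels 0/1 and 2/3 then pair up into one
 * LDS_WRITE_REL each, storing two consecutive dwords per op. */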
static int tgsi_store(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	if (inst->Dst[0].Register.File == TGSI_FILE_BUFFER)
		return tgsi_store_buffer_rat(ctx);
	else if (inst->Dst[0].Register.File == TGSI_FILE_MEMORY)
		return tgsi_store_lds(ctx);
	else
		return tgsi_store_rat(ctx);
}

static int tgsi_atomic_op_rat(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	/* have to work out the offset into the RAT immediate return buffer */
	struct r600_bytecode_alu alu;
	struct r600_bytecode_vtx vtx;
	struct r600_bytecode_cf *cf;
	int r;
	int idx_gpr;
	unsigned format, num_format, format_comp, endian;
	const struct util_format_description *desc;
	unsigned rat_index_mode = tgsi_indirect_to_rat_index_mode(inst->Src[0].Indirect);
	unsigned immed_base;
	unsigned rat_base;

	immed_base = R600_IMAGE_IMMED_RESOURCE_OFFSET;
	rat_base = ctx->shader->rat_base;

	if (inst->Src[0].Register.File == TGSI_FILE_BUFFER) {
		immed_base += ctx->info.file_count[TGSI_FILE_IMAGE];
		rat_base += ctx->info.file_count[TGSI_FILE_IMAGE];

		r = load_buffer_coord(ctx, 1, ctx->temp_reg);
		if (r)
			return r;
		idx_gpr = ctx->temp_reg;
	} else {
		r = load_index_src(ctx, 1, &idx_gpr);
		if (r)
			return r;
	}

	if (ctx->inst_info->op == V_RAT_INST_CMPXCHG_INT_RTN) {
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP1_MOV;
		alu.dst.sel = ctx->thread_id_gpr;
		alu.dst.chan = 0;
		alu.dst.write = 1;
		r600_bytecode_src(&alu.src[0], &ctx->src[3], 0);
		alu.last = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;

		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP1_MOV;
		alu.dst.sel = ctx->thread_id_gpr;
		if (ctx->bc->gfx_level == CAYMAN)
			alu.dst.chan = 2;
		else
			alu.dst.chan = 3;
		alu.dst.write = 1;
		r600_bytecode_src(&alu.src[0], &ctx->src[2], 0);
		alu.last = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	} else {
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP1_MOV;
		alu.dst.sel = ctx->thread_id_gpr;
		alu.dst.chan = 0;
		alu.dst.write = 1;
		r600_bytecode_src(&alu.src[0], &ctx->src[2], 0);
		alu.last = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}

	if (rat_index_mode)
		egcm_load_index_reg(ctx->bc, 1, false);
	r600_bytecode_add_cfinst(ctx->bc, CF_OP_MEM_RAT);
	cf = ctx->bc->cf_last;

	cf->rat.id = rat_base + inst->Src[0].Register.Index;
	cf->rat.inst = ctx->inst_info->op;
	cf->rat.index_mode = rat_index_mode;
	cf->output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_READ_IND;
	cf->output.gpr = ctx->thread_id_gpr;
	cf->output.index_gpr = idx_gpr;
	cf->output.comp_mask = 0xf;
	cf->output.burst_count = 1;
	cf->vpm = 1;
	cf->barrier = 1;
	cf->mark = 1;
	cf->output.elem_size = 0;

	r600_bytecode_add_ack(ctx->bc);
	r600_bytecode_wait_acks(ctx->bc);

	memset(&vtx, 0, sizeof(struct r600_bytecode_vtx));
	if (inst->Src[0].Register.File == TGSI_FILE_IMAGE) {
		desc = util_format_description(inst->Memory.Format);
		r600_vertex_data_type(inst->Memory.Format,
				      &format, &num_format, &format_comp, &endian);
		vtx.dst_sel_x = desc->swizzle[0];
	} else {
		format = FMT_32;
		num_format = 1;
		format_comp = 0;
		endian = 0;
		vtx.dst_sel_x = 0;
	}
	vtx.op = FETCH_OP_VFETCH;
	vtx.buffer_id = immed_base + inst->Src[0].Register.Index;
	vtx.buffer_index_mode = rat_index_mode;
	vtx.fetch_type = SQ_VTX_FETCH_NO_INDEX_OFFSET;
	vtx.src_gpr = ctx->thread_id_gpr;
	vtx.src_sel_x = 1;
	vtx.dst_gpr = ctx->file_offset[inst->Dst[0].Register.File] + inst->Dst[0].Register.Index;
	vtx.dst_sel_y = 7;
	vtx.dst_sel_z = 7;
	vtx.dst_sel_w = 7;
	vtx.use_const_fields = 0;
	vtx.srf_mode_all = 1;
	vtx.data_format = format;
	vtx.num_format_all = num_format;
	vtx.format_comp_all = format_comp;
	vtx.endian = endian;
	vtx.offset = 0;
	vtx.mega_fetch_count = 0xf;
	r = r600_bytecode_add_vtx_tc(ctx->bc, &vtx);
	if (r)
		return r;
	cf = ctx->bc->cf_last;
	cf->vpm = 1;
	cf->barrier = 1;
	return 0;
}

static int get_gds_op(int opcode)
{
	switch (opcode) {
	case TGSI_OPCODE_ATOMUADD:
		return FETCH_OP_GDS_ADD_RET;
	case TGSI_OPCODE_ATOMAND:
		return FETCH_OP_GDS_AND_RET;
	case TGSI_OPCODE_ATOMOR:
		return FETCH_OP_GDS_OR_RET;
	case TGSI_OPCODE_ATOMXOR:
		return FETCH_OP_GDS_XOR_RET;
	case TGSI_OPCODE_ATOMUMIN:
		return FETCH_OP_GDS_MIN_UINT_RET;
	case TGSI_OPCODE_ATOMUMAX:
		return FETCH_OP_GDS_MAX_UINT_RET;
	case TGSI_OPCODE_ATOMIMIN:
		return FETCH_OP_GDS_MIN_INT_RET;
	case TGSI_OPCODE_ATOMIMAX:
		return FETCH_OP_GDS_MAX_INT_RET;
	case TGSI_OPCODE_ATOMXCHG:
		return FETCH_OP_GDS_XCHG_RET;
	case TGSI_OPCODE_ATOMCAS:
		return FETCH_OP_GDS_CMP_XCHG_RET;
	default:
		return -1;
	}
}

static int tgsi_atomic_op_gds(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	struct r600_bytecode_gds gds;
	struct r600_bytecode_alu alu;
	int gds_op = get_gds_op(inst->Instruction.Opcode);
	int r;
	int uav_id = 0;
	int uav_index_mode = 0;
	bool is_cm = (ctx->bc->gfx_level == CAYMAN);

	if (gds_op == -1) {
		fprintf(stderr, "unknown GDS op for opcode %d\n", inst->Instruction.Opcode);
		return -1;
	}

	r = tgsi_set_gds_temp(ctx, &uav_id, &uav_index_mode);
	if (r)
		return r;

	if (gds_op == FETCH_OP_GDS_CMP_XCHG_RET) {
		if (inst->Src[3].Register.File == TGSI_FILE_IMMEDIATE) {
			int value = (ctx->literals[4 * inst->Src[3].Register.Index + inst->Src[3].Register.SwizzleX]);
			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
			alu.op = ALU_OP1_MOV;
			alu.dst.sel = ctx->temp_reg;
			alu.dst.chan = is_cm ? 2 : 1;
			alu.src[0].sel = V_SQ_ALU_SRC_LITERAL;
			alu.src[0].value = value;
			alu.last = 1;
			alu.dst.write = 1;
			r = r600_bytecode_add_alu(ctx->bc, &alu);
			if (r)
				return r;
		} else {
			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
			alu.op = ALU_OP1_MOV;
			alu.dst.sel = ctx->temp_reg;
			alu.dst.chan = is_cm ? 2 : 1;
			r600_bytecode_src(&alu.src[0], &ctx->src[3], 0);
			alu.last = 1;
			alu.dst.write = 1;
			r = r600_bytecode_add_alu(ctx->bc, &alu);
			if (r)
				return r;
		}
	}
	if (inst->Src[2].Register.File == TGSI_FILE_IMMEDIATE) {
		int value = (ctx->literals[4 * inst->Src[2].Register.Index + inst->Src[2].Register.SwizzleX]);
		int abs_value = abs(value);
		if (abs_value != value && gds_op == FETCH_OP_GDS_ADD_RET)
			gds_op = FETCH_OP_GDS_SUB_RET;
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP1_MOV;
		alu.dst.sel = ctx->temp_reg;
		alu.dst.chan = is_cm ? 1 : 0;
		alu.src[0].sel = V_SQ_ALU_SRC_LITERAL;
		alu.src[0].value = abs_value;
		alu.last = 1;
		alu.dst.write = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	} else {
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP1_MOV;
		alu.dst.sel = ctx->temp_reg;
		alu.dst.chan = is_cm ? 1 : 0;
		r600_bytecode_src(&alu.src[0], &ctx->src[2], 0);
		alu.last = 1;
		alu.dst.write = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}

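	/* Note on the immediate path above: a negative literal add is turned
	 * into FETCH_OP_GDS_SUB_RET of |value|, presumably because the GDS
	 * add consumes an unsigned operand.  E.g. ATOMUADD with the literal
	 * -1 is emitted as a GDS subtract of 1. */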
2 : 1; 9491 else 9492 gds.src_sel_z = 7; 9493 gds.dst_sel_x = 0; 9494 gds.dst_sel_y = 7; 9495 gds.dst_sel_z = 7; 9496 gds.dst_sel_w = 7; 9497 gds.alloc_consume = !is_cm; 9498 9499 r = r600_bytecode_add_gds(ctx->bc, &gds); 9500 if (r) 9501 return r; 9502 ctx->bc->cf_last->vpm = 1; 9503 return 0; 9504} 9505 9506static int get_lds_op(int opcode) 9507{ 9508 switch (opcode) { 9509 case TGSI_OPCODE_ATOMUADD: 9510 return LDS_OP2_LDS_ADD_RET; 9511 case TGSI_OPCODE_ATOMAND: 9512 return LDS_OP2_LDS_AND_RET; 9513 case TGSI_OPCODE_ATOMOR: 9514 return LDS_OP2_LDS_OR_RET; 9515 case TGSI_OPCODE_ATOMXOR: 9516 return LDS_OP2_LDS_XOR_RET; 9517 case TGSI_OPCODE_ATOMUMIN: 9518 return LDS_OP2_LDS_MIN_UINT_RET; 9519 case TGSI_OPCODE_ATOMUMAX: 9520 return LDS_OP2_LDS_MAX_UINT_RET; 9521 case TGSI_OPCODE_ATOMIMIN: 9522 return LDS_OP2_LDS_MIN_INT_RET; 9523 case TGSI_OPCODE_ATOMIMAX: 9524 return LDS_OP2_LDS_MAX_INT_RET; 9525 case TGSI_OPCODE_ATOMXCHG: 9526 return LDS_OP2_LDS_XCHG_RET; 9527 case TGSI_OPCODE_ATOMCAS: 9528 return LDS_OP3_LDS_CMP_XCHG_RET; 9529 default: 9530 return -1; 9531 } 9532} 9533 9534static int tgsi_atomic_op_lds(struct r600_shader_ctx *ctx) 9535{ 9536 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction; 9537 int lds_op = get_lds_op(inst->Instruction.Opcode); 9538 int r; 9539 9540 struct r600_bytecode_alu alu; 9541 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 9542 alu.op = lds_op; 9543 alu.is_lds_idx_op = true; 9544 alu.last = 1; 9545 r600_bytecode_src(&alu.src[0], &ctx->src[1], 0); 9546 r600_bytecode_src(&alu.src[1], &ctx->src[2], 0); 9547 if (lds_op == LDS_OP3_LDS_CMP_XCHG_RET) 9548 r600_bytecode_src(&alu.src[2], &ctx->src[3], 0); 9549 else 9550 alu.src[2].sel = V_SQ_ALU_SRC_0; 9551 r = r600_bytecode_add_alu(ctx->bc, &alu); 9552 if (r) 9553 return r; 9554 9555 /* then read from LDS_OQ_A_POP */ 9556 memset(&alu, 0, sizeof(alu)); 9557 9558 alu.op = ALU_OP1_MOV; 9559 alu.src[0].sel = EG_V_SQ_ALU_SRC_LDS_OQ_A_POP; 9560 alu.src[0].chan = 0; 9561 tgsi_dst(ctx, &inst->Dst[0], 0, &alu.dst); 9562 alu.dst.write = 1; 9563 alu.last = 1; 9564 r = r600_bytecode_add_alu(ctx->bc, &alu); 9565 if (r) 9566 return r; 9567 9568 return 0; 9569} 9570 9571static int tgsi_atomic_op(struct r600_shader_ctx *ctx) 9572{ 9573 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction; 9574 if (inst->Src[0].Register.File == TGSI_FILE_IMAGE) 9575 return tgsi_atomic_op_rat(ctx); 9576 if (inst->Src[0].Register.File == TGSI_FILE_HW_ATOMIC) 9577 return tgsi_atomic_op_gds(ctx); 9578 if (inst->Src[0].Register.File == TGSI_FILE_BUFFER) 9579 return tgsi_atomic_op_rat(ctx); 9580 if (inst->Src[0].Register.File == TGSI_FILE_MEMORY) 9581 return tgsi_atomic_op_lds(ctx); 9582 return 0; 9583} 9584 9585static int tgsi_resq(struct r600_shader_ctx *ctx) 9586{ 9587 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction; 9588 unsigned sampler_index_mode; 9589 struct r600_bytecode_tex tex; 9590 int r; 9591 boolean has_txq_cube_array_z = false; 9592 9593 if (inst->Src[0].Register.File == TGSI_FILE_BUFFER || 9594 (inst->Src[0].Register.File == TGSI_FILE_IMAGE && inst->Memory.Texture == TGSI_TEXTURE_BUFFER)) { 9595 if (ctx->bc->gfx_level < EVERGREEN) 9596 ctx->shader->uses_tex_buffers = true; 9597 unsigned eg_buffer_base = 0; 9598 eg_buffer_base = R600_IMAGE_REAL_RESOURCE_OFFSET; 9599 if (inst->Src[0].Register.File == TGSI_FILE_BUFFER) 9600 eg_buffer_base += ctx->info.file_count[TGSI_FILE_IMAGE]; 9601 return r600_do_buffer_txq(ctx, 0, ctx->shader->image_size_const_offset, 
				eg_buffer_base);
	}

	if (inst->Memory.Texture == TGSI_TEXTURE_CUBE_ARRAY &&
	    inst->Dst[0].Register.WriteMask & 4) {
		ctx->shader->has_txq_cube_array_z_comp = true;
		has_txq_cube_array_z = true;
	}

	sampler_index_mode = inst->Src[0].Indirect.Index == 2 ? 2 : 0; // CF_INDEX_1 : CF_INDEX_NONE
	if (sampler_index_mode)
		egcm_load_index_reg(ctx->bc, 1, false);


	/* does this shader want the number of layers from TXQ for a cube array? */
	if (has_txq_cube_array_z) {
		int id = tgsi_tex_get_src_gpr(ctx, 0) + ctx->shader->image_size_const_offset;
		struct r600_bytecode_alu alu;

		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP1_MOV;

		alu.src[0].sel = R600_SHADER_BUFFER_INFO_SEL;
		/* with eg, each dword of the buffer-info constant holds the number of cubes */
		alu.src[0].sel += id / 4;
		alu.src[0].chan = id % 4;
		alu.src[0].kc_bank = R600_BUFFER_INFO_CONST_BUFFER;
		tgsi_dst(ctx, &inst->Dst[0], 2, &alu.dst);
		alu.last = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
		/* disable writemask from texture instruction */
		inst->Dst[0].Register.WriteMask &= ~4;
	}
	memset(&tex, 0, sizeof(struct r600_bytecode_tex));
	tex.op = ctx->inst_info->op;
	tex.sampler_id = R600_IMAGE_REAL_RESOURCE_OFFSET + inst->Src[0].Register.Index;
	tex.sampler_index_mode = sampler_index_mode;
	tex.resource_id = tex.sampler_id;
	tex.resource_index_mode = sampler_index_mode;
	tex.src_sel_x = 4;
	tex.src_sel_y = 4;
	tex.src_sel_z = 4;
	tex.src_sel_w = 4;
	tex.dst_sel_x = (inst->Dst[0].Register.WriteMask & 1) ? 0 : 7;
	tex.dst_sel_y = (inst->Dst[0].Register.WriteMask & 2) ? 1 : 7;
	tex.dst_sel_z = (inst->Dst[0].Register.WriteMask & 4) ? 2 : 7;
	tex.dst_sel_w = (inst->Dst[0].Register.WriteMask & 8) ?
3 : 7; 9650 tex.dst_gpr = ctx->file_offset[inst->Dst[0].Register.File] + inst->Dst[0].Register.Index; 9651 r = r600_bytecode_add_tex(ctx->bc, &tex); 9652 if (r) 9653 return r; 9654 9655 return 0; 9656} 9657 9658static int tgsi_lrp(struct r600_shader_ctx *ctx) 9659{ 9660 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction; 9661 struct r600_bytecode_alu alu; 9662 unsigned lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask); 9663 struct r600_bytecode_alu_src srcs[2][4]; 9664 unsigned i; 9665 int r; 9666 9667 /* optimize if it's just an equal balance */ 9668 if (ctx->src[0].sel == V_SQ_ALU_SRC_0_5) { 9669 for (i = 0; i < lasti + 1; i++) { 9670 if (!(inst->Dst[0].Register.WriteMask & (1 << i))) 9671 continue; 9672 9673 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 9674 alu.op = ALU_OP2_ADD; 9675 r600_bytecode_src(&alu.src[0], &ctx->src[1], i); 9676 r600_bytecode_src(&alu.src[1], &ctx->src[2], i); 9677 alu.omod = 3; 9678 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst); 9679 alu.dst.chan = i; 9680 if (i == lasti) { 9681 alu.last = 1; 9682 } 9683 r = r600_bytecode_add_alu(ctx->bc, &alu); 9684 if (r) 9685 return r; 9686 } 9687 return 0; 9688 } 9689 9690 /* 1 - src0 */ 9691 for (i = 0; i < lasti + 1; i++) { 9692 if (!(inst->Dst[0].Register.WriteMask & (1 << i))) 9693 continue; 9694 9695 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 9696 alu.op = ALU_OP2_ADD; 9697 alu.src[0].sel = V_SQ_ALU_SRC_1; 9698 alu.src[0].chan = 0; 9699 r600_bytecode_src(&alu.src[1], &ctx->src[0], i); 9700 r600_bytecode_src_toggle_neg(&alu.src[1]); 9701 alu.dst.sel = ctx->temp_reg; 9702 alu.dst.chan = i; 9703 if (i == lasti) { 9704 alu.last = 1; 9705 } 9706 alu.dst.write = 1; 9707 r = r600_bytecode_add_alu(ctx->bc, &alu); 9708 if (r) 9709 return r; 9710 } 9711 9712 /* (1 - src0) * src2 */ 9713 for (i = 0; i < lasti + 1; i++) { 9714 if (!(inst->Dst[0].Register.WriteMask & (1 << i))) 9715 continue; 9716 9717 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 9718 alu.op = ALU_OP2_MUL; 9719 alu.src[0].sel = ctx->temp_reg; 9720 alu.src[0].chan = i; 9721 r600_bytecode_src(&alu.src[1], &ctx->src[2], i); 9722 alu.dst.sel = ctx->temp_reg; 9723 alu.dst.chan = i; 9724 if (i == lasti) { 9725 alu.last = 1; 9726 } 9727 alu.dst.write = 1; 9728 r = r600_bytecode_add_alu(ctx->bc, &alu); 9729 if (r) 9730 return r; 9731 } 9732 9733 /* src0 * src1 + (1 - src0) * src2 */ 9734 9735 for (i = 0; i < 2; i++) { 9736 r = tgsi_make_src_for_op3(ctx, inst->Dst[0].Register.WriteMask, 9737 srcs[i], &ctx->src[i]); 9738 if (r) 9739 return r; 9740 } 9741 9742 for (i = 0; i < lasti + 1; i++) { 9743 if (!(inst->Dst[0].Register.WriteMask & (1 << i))) 9744 continue; 9745 9746 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 9747 alu.op = ALU_OP3_MULADD; 9748 alu.is_op3 = 1; 9749 alu.src[0] = srcs[0][i]; 9750 alu.src[1] = srcs[1][i]; 9751 alu.src[2].sel = ctx->temp_reg; 9752 alu.src[2].chan = i; 9753 9754 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst); 9755 alu.dst.chan = i; 9756 if (i == lasti) { 9757 alu.last = 1; 9758 } 9759 r = r600_bytecode_add_alu(ctx->bc, &alu); 9760 if (r) 9761 return r; 9762 } 9763 return 0; 9764} 9765 9766static int tgsi_cmp(struct r600_shader_ctx *ctx) 9767{ 9768 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction; 9769 struct r600_bytecode_alu alu; 9770 int i, r, j; 9771 int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask); 9772 struct r600_bytecode_alu_src srcs[3][4]; 9773 9774 unsigned op; 9775 9776 if (ctx->src[0].abs && ctx->src[0].neg) { 9777 op = ALU_OP3_CNDE; 
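		/* With abs+neg on src0 the value is -|x|, and -|x| >= 0 holds
		 * only when x == 0, so CNDGE(-|x|, a, b) == CNDE(x, a, b);
		 * use CNDE and drop the modifiers. */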
		ctx->src[0].abs = 0;
		ctx->src[0].neg = 0;
	} else {
		op = ALU_OP3_CNDGE;
	}

	for (j = 0; j < inst->Instruction.NumSrcRegs; j++) {
		r = tgsi_make_src_for_op3(ctx, inst->Dst[0].Register.WriteMask,
					  srcs[j], &ctx->src[j]);
		if (r)
			return r;
	}

	for (i = 0; i < lasti + 1; i++) {
		if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
			continue;

		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = op;
		alu.src[0] = srcs[0][i];
		alu.src[1] = srcs[2][i];
		alu.src[2] = srcs[1][i];

		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
		alu.dst.chan = i;
		alu.dst.write = 1;
		alu.is_op3 = 1;
		if (i == lasti)
			alu.last = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}
	return 0;
}

static int tgsi_ucmp(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	struct r600_bytecode_alu alu;
	int i, r;
	int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);

	for (i = 0; i < lasti + 1; i++) {
		if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
			continue;

		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP3_CNDE_INT;
		r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
		r600_bytecode_src(&alu.src[1], &ctx->src[2], i);
		r600_bytecode_src(&alu.src[2], &ctx->src[1], i);
		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
		alu.dst.chan = i;
		alu.dst.write = 1;
		alu.is_op3 = 1;
		if (i == lasti)
			alu.last = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}
	return 0;
}

static int tgsi_exp(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	struct r600_bytecode_alu alu;
	int r;
	unsigned i;

	/* result.x = 2^floor(src); */
	if (inst->Dst[0].Register.WriteMask & 1) {
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));

		alu.op = ALU_OP1_FLOOR;
		r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);

		alu.dst.sel = ctx->temp_reg;
		alu.dst.chan = 0;
		alu.dst.write = 1;
		alu.last = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;

		if (ctx->bc->gfx_level == CAYMAN) {
			for (i = 0; i < 3; i++) {
				alu.op = ALU_OP1_EXP_IEEE;
				alu.src[0].sel = ctx->temp_reg;
				alu.src[0].chan = 0;

				alu.dst.sel = ctx->temp_reg;
				alu.dst.chan = i;
				alu.dst.write = i == 0;
				alu.last = i == 2;
				r = r600_bytecode_add_alu(ctx->bc, &alu);
				if (r)
					return r;
			}
		} else {
			alu.op = ALU_OP1_EXP_IEEE;
			alu.src[0].sel = ctx->temp_reg;
			alu.src[0].chan = 0;

			alu.dst.sel = ctx->temp_reg;
			alu.dst.chan = 0;
			alu.dst.write = 1;
			alu.last = 1;
			r = r600_bytecode_add_alu(ctx->bc, &alu);
			if (r)
				return r;
		}
	}

	/* result.y = fract(src) = src - floor(src); */
	if ((inst->Dst[0].Register.WriteMask >> 1) & 1) {
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));

		alu.op = ALU_OP1_FRACT;
		r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);

		alu.dst.sel = ctx->temp_reg;
#if 0
		tgsi_dst(ctx, &inst->Dst[0], 1, &alu.dst);
#endif
		alu.dst.write = 1;
		alu.dst.chan = 1;

		alu.last = 1;

		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
return r; 9915 } 9916 9917 /* result.z = RoughApprox2ToX(tmp);*/ 9918 if ((inst->Dst[0].Register.WriteMask >> 2) & 0x1) { 9919 if (ctx->bc->gfx_level == CAYMAN) { 9920 for (i = 0; i < 3; i++) { 9921 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 9922 alu.op = ALU_OP1_EXP_IEEE; 9923 r600_bytecode_src(&alu.src[0], &ctx->src[0], 0); 9924 9925 alu.dst.sel = ctx->temp_reg; 9926 alu.dst.chan = i; 9927 if (i == 2) { 9928 alu.dst.write = 1; 9929 alu.last = 1; 9930 } 9931 9932 r = r600_bytecode_add_alu(ctx->bc, &alu); 9933 if (r) 9934 return r; 9935 } 9936 } else { 9937 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 9938 alu.op = ALU_OP1_EXP_IEEE; 9939 r600_bytecode_src(&alu.src[0], &ctx->src[0], 0); 9940 9941 alu.dst.sel = ctx->temp_reg; 9942 alu.dst.write = 1; 9943 alu.dst.chan = 2; 9944 9945 alu.last = 1; 9946 9947 r = r600_bytecode_add_alu(ctx->bc, &alu); 9948 if (r) 9949 return r; 9950 } 9951 } 9952 9953 /* result.w = 1.0;*/ 9954 if ((inst->Dst[0].Register.WriteMask >> 3) & 0x1) { 9955 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 9956 9957 alu.op = ALU_OP1_MOV; 9958 alu.src[0].sel = V_SQ_ALU_SRC_1; 9959 alu.src[0].chan = 0; 9960 9961 alu.dst.sel = ctx->temp_reg; 9962 alu.dst.chan = 3; 9963 alu.dst.write = 1; 9964 alu.last = 1; 9965 r = r600_bytecode_add_alu(ctx->bc, &alu); 9966 if (r) 9967 return r; 9968 } 9969 return tgsi_helper_copy(ctx, inst); 9970} 9971 9972static int tgsi_log(struct r600_shader_ctx *ctx) 9973{ 9974 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction; 9975 struct r600_bytecode_alu alu; 9976 int r; 9977 unsigned i; 9978 9979 /* result.x = floor(log2(|src|)); */ 9980 if (inst->Dst[0].Register.WriteMask & 1) { 9981 if (ctx->bc->gfx_level == CAYMAN) { 9982 for (i = 0; i < 3; i++) { 9983 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 9984 9985 alu.op = ALU_OP1_LOG_IEEE; 9986 r600_bytecode_src(&alu.src[0], &ctx->src[0], 0); 9987 r600_bytecode_src_set_abs(&alu.src[0]); 9988 9989 alu.dst.sel = ctx->temp_reg; 9990 alu.dst.chan = i; 9991 if (i == 0) 9992 alu.dst.write = 1; 9993 if (i == 2) 9994 alu.last = 1; 9995 r = r600_bytecode_add_alu(ctx->bc, &alu); 9996 if (r) 9997 return r; 9998 } 9999 10000 } else { 10001 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 10002 10003 alu.op = ALU_OP1_LOG_IEEE; 10004 r600_bytecode_src(&alu.src[0], &ctx->src[0], 0); 10005 r600_bytecode_src_set_abs(&alu.src[0]); 10006 10007 alu.dst.sel = ctx->temp_reg; 10008 alu.dst.chan = 0; 10009 alu.dst.write = 1; 10010 alu.last = 1; 10011 r = r600_bytecode_add_alu(ctx->bc, &alu); 10012 if (r) 10013 return r; 10014 } 10015 10016 alu.op = ALU_OP1_FLOOR; 10017 alu.src[0].sel = ctx->temp_reg; 10018 alu.src[0].chan = 0; 10019 10020 alu.dst.sel = ctx->temp_reg; 10021 alu.dst.chan = 0; 10022 alu.dst.write = 1; 10023 alu.last = 1; 10024 10025 r = r600_bytecode_add_alu(ctx->bc, &alu); 10026 if (r) 10027 return r; 10028 } 10029 10030 /* result.y = |src.x| / (2 ^ floor(log2(|src.x|))); */ 10031 if ((inst->Dst[0].Register.WriteMask >> 1) & 1) { 10032 10033 if (ctx->bc->gfx_level == CAYMAN) { 10034 for (i = 0; i < 3; i++) { 10035 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 10036 10037 alu.op = ALU_OP1_LOG_IEEE; 10038 r600_bytecode_src(&alu.src[0], &ctx->src[0], 0); 10039 r600_bytecode_src_set_abs(&alu.src[0]); 10040 10041 alu.dst.sel = ctx->temp_reg; 10042 alu.dst.chan = i; 10043 if (i == 1) 10044 alu.dst.write = 1; 10045 if (i == 2) 10046 alu.last = 1; 10047 10048 r = r600_bytecode_add_alu(ctx->bc, &alu); 10049 if (r) 10050 return r; 10051 } 10052 } else { 10053 
memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 10054 10055 alu.op = ALU_OP1_LOG_IEEE; 10056 r600_bytecode_src(&alu.src[0], &ctx->src[0], 0); 10057 r600_bytecode_src_set_abs(&alu.src[0]); 10058 10059 alu.dst.sel = ctx->temp_reg; 10060 alu.dst.chan = 1; 10061 alu.dst.write = 1; 10062 alu.last = 1; 10063 10064 r = r600_bytecode_add_alu(ctx->bc, &alu); 10065 if (r) 10066 return r; 10067 } 10068 10069 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 10070 10071 alu.op = ALU_OP1_FLOOR; 10072 alu.src[0].sel = ctx->temp_reg; 10073 alu.src[0].chan = 1; 10074 10075 alu.dst.sel = ctx->temp_reg; 10076 alu.dst.chan = 1; 10077 alu.dst.write = 1; 10078 alu.last = 1; 10079 10080 r = r600_bytecode_add_alu(ctx->bc, &alu); 10081 if (r) 10082 return r; 10083 10084 if (ctx->bc->gfx_level == CAYMAN) { 10085 for (i = 0; i < 3; i++) { 10086 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 10087 alu.op = ALU_OP1_EXP_IEEE; 10088 alu.src[0].sel = ctx->temp_reg; 10089 alu.src[0].chan = 1; 10090 10091 alu.dst.sel = ctx->temp_reg; 10092 alu.dst.chan = i; 10093 if (i == 1) 10094 alu.dst.write = 1; 10095 if (i == 2) 10096 alu.last = 1; 10097 10098 r = r600_bytecode_add_alu(ctx->bc, &alu); 10099 if (r) 10100 return r; 10101 } 10102 } else { 10103 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 10104 alu.op = ALU_OP1_EXP_IEEE; 10105 alu.src[0].sel = ctx->temp_reg; 10106 alu.src[0].chan = 1; 10107 10108 alu.dst.sel = ctx->temp_reg; 10109 alu.dst.chan = 1; 10110 alu.dst.write = 1; 10111 alu.last = 1; 10112 10113 r = r600_bytecode_add_alu(ctx->bc, &alu); 10114 if (r) 10115 return r; 10116 } 10117 10118 if (ctx->bc->gfx_level == CAYMAN) { 10119 for (i = 0; i < 3; i++) { 10120 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 10121 alu.op = ALU_OP1_RECIP_IEEE; 10122 alu.src[0].sel = ctx->temp_reg; 10123 alu.src[0].chan = 1; 10124 10125 alu.dst.sel = ctx->temp_reg; 10126 alu.dst.chan = i; 10127 if (i == 1) 10128 alu.dst.write = 1; 10129 if (i == 2) 10130 alu.last = 1; 10131 10132 r = r600_bytecode_add_alu(ctx->bc, &alu); 10133 if (r) 10134 return r; 10135 } 10136 } else { 10137 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 10138 alu.op = ALU_OP1_RECIP_IEEE; 10139 alu.src[0].sel = ctx->temp_reg; 10140 alu.src[0].chan = 1; 10141 10142 alu.dst.sel = ctx->temp_reg; 10143 alu.dst.chan = 1; 10144 alu.dst.write = 1; 10145 alu.last = 1; 10146 10147 r = r600_bytecode_add_alu(ctx->bc, &alu); 10148 if (r) 10149 return r; 10150 } 10151 10152 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 10153 10154 alu.op = ALU_OP2_MUL; 10155 10156 r600_bytecode_src(&alu.src[0], &ctx->src[0], 0); 10157 r600_bytecode_src_set_abs(&alu.src[0]); 10158 10159 alu.src[1].sel = ctx->temp_reg; 10160 alu.src[1].chan = 1; 10161 10162 alu.dst.sel = ctx->temp_reg; 10163 alu.dst.chan = 1; 10164 alu.dst.write = 1; 10165 alu.last = 1; 10166 10167 r = r600_bytecode_add_alu(ctx->bc, &alu); 10168 if (r) 10169 return r; 10170 } 10171 10172 /* result.z = log2(|src|);*/ 10173 if ((inst->Dst[0].Register.WriteMask >> 2) & 1) { 10174 if (ctx->bc->gfx_level == CAYMAN) { 10175 for (i = 0; i < 3; i++) { 10176 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 10177 10178 alu.op = ALU_OP1_LOG_IEEE; 10179 r600_bytecode_src(&alu.src[0], &ctx->src[0], 0); 10180 r600_bytecode_src_set_abs(&alu.src[0]); 10181 10182 alu.dst.sel = ctx->temp_reg; 10183 if (i == 2) 10184 alu.dst.write = 1; 10185 alu.dst.chan = i; 10186 if (i == 2) 10187 alu.last = 1; 10188 10189 r = r600_bytecode_add_alu(ctx->bc, &alu); 10190 if (r) 10191 return r; 10192 } 10193 } else { 10194 memset(&alu, 
0, sizeof(struct r600_bytecode_alu)); 10195 10196 alu.op = ALU_OP1_LOG_IEEE; 10197 r600_bytecode_src(&alu.src[0], &ctx->src[0], 0); 10198 r600_bytecode_src_set_abs(&alu.src[0]); 10199 10200 alu.dst.sel = ctx->temp_reg; 10201 alu.dst.write = 1; 10202 alu.dst.chan = 2; 10203 alu.last = 1; 10204 10205 r = r600_bytecode_add_alu(ctx->bc, &alu); 10206 if (r) 10207 return r; 10208 } 10209 } 10210 10211 /* result.w = 1.0; */ 10212 if ((inst->Dst[0].Register.WriteMask >> 3) & 1) { 10213 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 10214 10215 alu.op = ALU_OP1_MOV; 10216 alu.src[0].sel = V_SQ_ALU_SRC_1; 10217 alu.src[0].chan = 0; 10218 10219 alu.dst.sel = ctx->temp_reg; 10220 alu.dst.chan = 3; 10221 alu.dst.write = 1; 10222 alu.last = 1; 10223 10224 r = r600_bytecode_add_alu(ctx->bc, &alu); 10225 if (r) 10226 return r; 10227 } 10228 10229 return tgsi_helper_copy(ctx, inst); 10230} 10231 10232static int tgsi_eg_arl(struct r600_shader_ctx *ctx) 10233{ 10234 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction; 10235 struct r600_bytecode_alu alu; 10236 int r; 10237 int i, lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask); 10238 unsigned reg = get_address_file_reg(ctx, inst->Dst[0].Register.Index); 10239 10240 assert(inst->Dst[0].Register.Index < 3); 10241 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 10242 10243 switch (inst->Instruction.Opcode) { 10244 case TGSI_OPCODE_ARL: 10245 alu.op = ALU_OP1_FLT_TO_INT_FLOOR; 10246 break; 10247 case TGSI_OPCODE_ARR: 10248 alu.op = ALU_OP1_FLT_TO_INT; 10249 break; 10250 case TGSI_OPCODE_UARL: 10251 alu.op = ALU_OP1_MOV; 10252 break; 10253 default: 10254 assert(0); 10255 return -1; 10256 } 10257 10258 for (i = 0; i <= lasti; ++i) { 10259 if (!(inst->Dst[0].Register.WriteMask & (1 << i))) 10260 continue; 10261 r600_bytecode_src(&alu.src[0], &ctx->src[0], i); 10262 alu.last = i == lasti; 10263 alu.dst.sel = reg; 10264 alu.dst.chan = i; 10265 alu.dst.write = 1; 10266 r = r600_bytecode_add_alu(ctx->bc, &alu); 10267 if (r) 10268 return r; 10269 } 10270 10271 if (inst->Dst[0].Register.Index > 0) 10272 ctx->bc->index_loaded[inst->Dst[0].Register.Index - 1] = 0; 10273 else 10274 ctx->bc->ar_loaded = 0; 10275 10276 return 0; 10277} 10278static int tgsi_r600_arl(struct r600_shader_ctx *ctx) 10279{ 10280 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction; 10281 struct r600_bytecode_alu alu; 10282 int r; 10283 int i, lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask); 10284 10285 switch (inst->Instruction.Opcode) { 10286 case TGSI_OPCODE_ARL: 10287 memset(&alu, 0, sizeof(alu)); 10288 alu.op = ALU_OP1_FLOOR; 10289 alu.dst.sel = ctx->bc->ar_reg; 10290 alu.dst.write = 1; 10291 for (i = 0; i <= lasti; ++i) { 10292 if (inst->Dst[0].Register.WriteMask & (1 << i)) { 10293 alu.dst.chan = i; 10294 r600_bytecode_src(&alu.src[0], &ctx->src[0], i); 10295 alu.last = i == lasti; 10296 if ((r = r600_bytecode_add_alu(ctx->bc, &alu))) 10297 return r; 10298 } 10299 } 10300 10301 memset(&alu, 0, sizeof(alu)); 10302 alu.op = ALU_OP1_FLT_TO_INT; 10303 alu.src[0].sel = ctx->bc->ar_reg; 10304 alu.dst.sel = ctx->bc->ar_reg; 10305 alu.dst.write = 1; 10306 /* FLT_TO_INT is trans-only on r600/r700 */ 10307 alu.last = TRUE; 10308 for (i = 0; i <= lasti; ++i) { 10309 alu.dst.chan = i; 10310 alu.src[0].chan = i; 10311 if ((r = r600_bytecode_add_alu(ctx->bc, &alu))) 10312 return r; 10313 } 10314 break; 10315 case TGSI_OPCODE_ARR: 10316 memset(&alu, 0, sizeof(alu)); 10317 alu.op = ALU_OP1_FLT_TO_INT; 10318 alu.dst.sel = 
ctx->bc->ar_reg; 10319 alu.dst.write = 1; 10320 /* FLT_TO_INT is trans-only on r600/r700 */ 10321 alu.last = TRUE; 10322 for (i = 0; i <= lasti; ++i) { 10323 if (inst->Dst[0].Register.WriteMask & (1 << i)) { 10324 alu.dst.chan = i; 10325 r600_bytecode_src(&alu.src[0], &ctx->src[0], i); 10326 if ((r = r600_bytecode_add_alu(ctx->bc, &alu))) 10327 return r; 10328 } 10329 } 10330 break; 10331 case TGSI_OPCODE_UARL: 10332 memset(&alu, 0, sizeof(alu)); 10333 alu.op = ALU_OP1_MOV; 10334 alu.dst.sel = ctx->bc->ar_reg; 10335 alu.dst.write = 1; 10336 for (i = 0; i <= lasti; ++i) { 10337 if (inst->Dst[0].Register.WriteMask & (1 << i)) { 10338 alu.dst.chan = i; 10339 r600_bytecode_src(&alu.src[0], &ctx->src[0], i); 10340 alu.last = i == lasti; 10341 if ((r = r600_bytecode_add_alu(ctx->bc, &alu))) 10342 return r; 10343 } 10344 } 10345 break; 10346 default: 10347 assert(0); 10348 return -1; 10349 } 10350 10351 ctx->bc->ar_loaded = 0; 10352 return 0; 10353} 10354 10355static int tgsi_opdst(struct r600_shader_ctx *ctx) 10356{ 10357 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction; 10358 struct r600_bytecode_alu alu; 10359 int i, r = 0; 10360 10361 for (i = 0; i < 4; i++) { 10362 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 10363 10364 alu.op = ALU_OP2_MUL; 10365 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst); 10366 10367 if (i == 0 || i == 3) { 10368 alu.src[0].sel = V_SQ_ALU_SRC_1; 10369 } else { 10370 r600_bytecode_src(&alu.src[0], &ctx->src[0], i); 10371 } 10372 10373 if (i == 0 || i == 2) { 10374 alu.src[1].sel = V_SQ_ALU_SRC_1; 10375 } else { 10376 r600_bytecode_src(&alu.src[1], &ctx->src[1], i); 10377 } 10378 if (i == 3) 10379 alu.last = 1; 10380 r = r600_bytecode_add_alu(ctx->bc, &alu); 10381 if (r) 10382 return r; 10383 } 10384 return 0; 10385} 10386 10387static int emit_logic_pred(struct r600_shader_ctx *ctx, int opcode, int alu_type, 10388 struct r600_bytecode_alu_src *src) 10389{ 10390 struct r600_bytecode_alu alu; 10391 int r; 10392 10393 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 10394 alu.op = opcode; 10395 alu.execute_mask = 1; 10396 alu.update_pred = 1; 10397 10398 alu.dst.sel = ctx->temp_reg; 10399 alu.dst.write = 1; 10400 alu.dst.chan = 0; 10401 10402 alu.src[0] = *src; 10403 alu.src[1].sel = V_SQ_ALU_SRC_0; 10404 alu.src[1].chan = 0; 10405 10406 alu.last = 1; 10407 10408 r = r600_bytecode_add_alu_type(ctx->bc, &alu, alu_type); 10409 if (r) 10410 return r; 10411 return 0; 10412} 10413 10414static int pops(struct r600_shader_ctx *ctx, int pops) 10415{ 10416 unsigned force_pop = ctx->bc->force_add_cf; 10417 10418 if (!force_pop) { 10419 int alu_pop = 3; 10420 if (ctx->bc->cf_last) { 10421 if (ctx->bc->cf_last->op == CF_OP_ALU) 10422 alu_pop = 0; 10423 else if (ctx->bc->cf_last->op == CF_OP_ALU_POP_AFTER) 10424 alu_pop = 1; 10425 } 10426 alu_pop += pops; 10427 if (alu_pop == 1) { 10428 ctx->bc->cf_last->op = CF_OP_ALU_POP_AFTER; 10429 ctx->bc->force_add_cf = 1; 10430 } else if (alu_pop == 2) { 10431 ctx->bc->cf_last->op = CF_OP_ALU_POP2_AFTER; 10432 ctx->bc->force_add_cf = 1; 10433 } else { 10434 force_pop = 1; 10435 } 10436 } 10437 10438 if (force_pop) { 10439 r600_bytecode_add_cfinst(ctx->bc, CF_OP_POP); 10440 ctx->bc->cf_last->pop_count = pops; 10441 ctx->bc->cf_last->cf_addr = ctx->bc->cf_last->id + 2; 10442 } 10443 10444 return 0; 10445} 10446 10447static inline int callstack_update_max_depth(struct r600_shader_ctx *ctx, 10448 unsigned reason) 10449{ 10450 struct r600_stack_info *stack = &ctx->bc->stack; 10451 unsigned elements; 10452 int entries; 
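	/* Conservative element count: each nested loop or WQM push reserves a
	 * full entry (entry_size elements), each plain IF push one element;
	 * chip-specific extras are added below and the total is then rounded
	 * up to whole hardware stack entries. */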
10453 10454 unsigned entry_size = stack->entry_size; 10455 10456 elements = (stack->loop + stack->push_wqm ) * entry_size; 10457 elements += stack->push; 10458 10459 switch (ctx->bc->gfx_level) { 10460 case R600: 10461 case R700: 10462 /* pre-r8xx: if any non-WQM PUSH instruction is invoked, 2 elements on 10463 * the stack must be reserved to hold the current active/continue 10464 * masks */ 10465 if (reason == FC_PUSH_VPM || stack->push > 0) { 10466 elements += 2; 10467 } 10468 break; 10469 10470 case CAYMAN: 10471 /* r9xx: any stack operation on empty stack consumes 2 additional 10472 * elements */ 10473 elements += 2; 10474 10475 FALLTHROUGH; 10476 /* FIXME: do the two elements added above cover the cases for the 10477 * r8xx+ below? */ 10478 10479 case EVERGREEN: 10480 /* r8xx+: 2 extra elements are not always required, but one extra 10481 * element must be added for each of the following cases: 10482 * 1. There is an ALU_ELSE_AFTER instruction at the point of greatest 10483 * stack usage. 10484 * (Currently we don't use ALU_ELSE_AFTER.) 10485 * 2. There are LOOP/WQM frames on the stack when any flavor of non-WQM 10486 * PUSH instruction executed. 10487 * 10488 * NOTE: it seems we also need to reserve additional element in some 10489 * other cases, e.g. when we have 4 levels of PUSH_VPM in the shader, 10490 * then STACK_SIZE should be 2 instead of 1 */ 10491 if (reason == FC_PUSH_VPM || stack->push > 0) { 10492 elements += 1; 10493 } 10494 break; 10495 10496 default: 10497 assert(0); 10498 break; 10499 } 10500 10501 /* NOTE: it seems STACK_SIZE is interpreted by hw as if entry_size is 4 10502 * for all chips, so we use 4 in the final formula, not the real entry_size 10503 * for the chip */ 10504 entry_size = 4; 10505 10506 entries = (elements + (entry_size - 1)) / entry_size; 10507 10508 if (entries > stack->max_entries) 10509 stack->max_entries = entries; 10510 return elements; 10511} 10512 10513static inline void callstack_pop(struct r600_shader_ctx *ctx, unsigned reason) 10514{ 10515 switch(reason) { 10516 case FC_PUSH_VPM: 10517 --ctx->bc->stack.push; 10518 assert(ctx->bc->stack.push >= 0); 10519 break; 10520 case FC_PUSH_WQM: 10521 --ctx->bc->stack.push_wqm; 10522 assert(ctx->bc->stack.push_wqm >= 0); 10523 break; 10524 case FC_LOOP: 10525 --ctx->bc->stack.loop; 10526 assert(ctx->bc->stack.loop >= 0); 10527 break; 10528 default: 10529 assert(0); 10530 break; 10531 } 10532} 10533 10534static inline int callstack_push(struct r600_shader_ctx *ctx, unsigned reason) 10535{ 10536 switch (reason) { 10537 case FC_PUSH_VPM: 10538 ++ctx->bc->stack.push; 10539 break; 10540 case FC_PUSH_WQM: 10541 ++ctx->bc->stack.push_wqm; 10542 break; 10543 case FC_LOOP: 10544 ++ctx->bc->stack.loop; 10545 break; 10546 default: 10547 assert(0); 10548 } 10549 10550 return callstack_update_max_depth(ctx, reason); 10551} 10552 10553static void fc_set_mid(struct r600_shader_ctx *ctx, int fc_sp) 10554{ 10555 struct r600_cf_stack_entry *sp = &ctx->bc->fc_stack[fc_sp]; 10556 10557 sp->mid = realloc((void *)sp->mid, 10558 sizeof(struct r600_bytecode_cf *) * (sp->num_mid + 1)); 10559 sp->mid[sp->num_mid] = ctx->bc->cf_last; 10560 sp->num_mid++; 10561} 10562 10563static void fc_pushlevel(struct r600_shader_ctx *ctx, int type) 10564{ 10565 assert(ctx->bc->fc_sp < ARRAY_SIZE(ctx->bc->fc_stack)); 10566 ctx->bc->fc_stack[ctx->bc->fc_sp].type = type; 10567 ctx->bc->fc_stack[ctx->bc->fc_sp].start = ctx->bc->cf_last; 10568 ctx->bc->fc_sp++; 10569} 10570 10571static void fc_poplevel(struct r600_shader_ctx *ctx) 10572{ 10573 
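	/* Undo fc_pushlevel(): free the ELSE/BREAK/CONT list collected via
	 * fc_set_mid() and clear the frame before dropping fc_sp. */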
	struct r600_cf_stack_entry *sp = &ctx->bc->fc_stack[ctx->bc->fc_sp - 1];
	free(sp->mid);
	sp->mid = NULL;
	sp->num_mid = 0;
	sp->start = NULL;
	sp->type = 0;
	ctx->bc->fc_sp--;
}

#if 0
static int emit_return(struct r600_shader_ctx *ctx)
{
	r600_bytecode_add_cfinst(ctx->bc, CF_OP_RETURN);
	return 0;
}

static int emit_jump_to_offset(struct r600_shader_ctx *ctx, int pops, int offset)
{
	r600_bytecode_add_cfinst(ctx->bc, CF_OP_JUMP);
	ctx->bc->cf_last->pop_count = pops;
	/* XXX work out offset */
	return 0;
}

static int emit_setret_in_loop_flag(struct r600_shader_ctx *ctx, unsigned flag_value)
{
	return 0;
}

static void emit_testflag(struct r600_shader_ctx *ctx)
{

}

static void emit_return_on_flag(struct r600_shader_ctx *ctx, unsigned ifidx)
{
	emit_testflag(ctx);
	emit_jump_to_offset(ctx, 1, 4);
	emit_setret_in_loop_flag(ctx, V_SQ_ALU_SRC_0);
	pops(ctx, ifidx + 1);
	emit_return(ctx);
}

static void break_loop_on_flag(struct r600_shader_ctx *ctx, unsigned fc_sp)
{
	emit_testflag(ctx);

	r600_bytecode_add_cfinst(ctx->bc, ctx->inst_info->op);
	ctx->bc->cf_last->pop_count = 1;

	fc_set_mid(ctx, fc_sp);

	pops(ctx, 1);
}
#endif

static int emit_if(struct r600_shader_ctx *ctx, int opcode,
		   struct r600_bytecode_alu_src *src)
{
	int alu_type = CF_OP_ALU_PUSH_BEFORE;
	bool needs_workaround = false;
	int elems = callstack_push(ctx, FC_PUSH_VPM);

	if (ctx->bc->gfx_level == CAYMAN && ctx->bc->stack.loop > 1)
		needs_workaround = true;

	if (ctx->bc->gfx_level == EVERGREEN && ctx_needs_stack_workaround_8xx(ctx)) {
		unsigned dmod1 = (elems - 1) % ctx->bc->stack.entry_size;
		unsigned dmod2 = (elems) % ctx->bc->stack.entry_size;

		if (elems && (!dmod1 || !dmod2))
			needs_workaround = true;
	}

	/* There is a hardware bug on Cayman where a BREAK/CONTINUE followed by
	 * LOOP_STARTxxx for nested loops may put the branch stack into a state
	 * such that ALU_PUSH_BEFORE doesn't work as expected.
Workaround this 10651 * by replacing the ALU_PUSH_BEFORE with a PUSH + ALU */ 10652 if (needs_workaround) { 10653 r600_bytecode_add_cfinst(ctx->bc, CF_OP_PUSH); 10654 ctx->bc->cf_last->cf_addr = ctx->bc->cf_last->id + 2; 10655 alu_type = CF_OP_ALU; 10656 } 10657 10658 emit_logic_pred(ctx, opcode, alu_type, src); 10659 10660 r600_bytecode_add_cfinst(ctx->bc, CF_OP_JUMP); 10661 10662 fc_pushlevel(ctx, FC_IF); 10663 10664 return 0; 10665} 10666 10667static int tgsi_if(struct r600_shader_ctx *ctx) 10668{ 10669 struct r600_bytecode_alu_src alu_src; 10670 r600_bytecode_src(&alu_src, &ctx->src[0], 0); 10671 10672 return emit_if(ctx, ALU_OP2_PRED_SETNE, &alu_src); 10673} 10674 10675static int tgsi_uif(struct r600_shader_ctx *ctx) 10676{ 10677 struct r600_bytecode_alu_src alu_src; 10678 r600_bytecode_src(&alu_src, &ctx->src[0], 0); 10679 return emit_if(ctx, ALU_OP2_PRED_SETNE_INT, &alu_src); 10680} 10681 10682static int tgsi_else(struct r600_shader_ctx *ctx) 10683{ 10684 r600_bytecode_add_cfinst(ctx->bc, CF_OP_ELSE); 10685 ctx->bc->cf_last->pop_count = 1; 10686 10687 fc_set_mid(ctx, ctx->bc->fc_sp - 1); 10688 ctx->bc->fc_stack[ctx->bc->fc_sp - 1].start->cf_addr = ctx->bc->cf_last->id; 10689 return 0; 10690} 10691 10692static int tgsi_endif(struct r600_shader_ctx *ctx) 10693{ 10694 int offset = 2; 10695 pops(ctx, 1); 10696 if (ctx->bc->fc_stack[ctx->bc->fc_sp - 1].type != FC_IF) { 10697 R600_ERR("if/endif unbalanced in shader\n"); 10698 return -1; 10699 } 10700 10701 /* ALU_EXTENDED needs 4 DWords instead of two, adjust jump target offset accordingly */ 10702 if (ctx->bc->cf_last->eg_alu_extended) 10703 offset += 2; 10704 10705 if (ctx->bc->fc_stack[ctx->bc->fc_sp - 1].mid == NULL) { 10706 ctx->bc->fc_stack[ctx->bc->fc_sp - 1].start->cf_addr = ctx->bc->cf_last->id + offset; 10707 ctx->bc->fc_stack[ctx->bc->fc_sp - 1].start->pop_count = 1; 10708 } else { 10709 ctx->bc->fc_stack[ctx->bc->fc_sp - 1].mid[0]->cf_addr = ctx->bc->cf_last->id + offset; 10710 } 10711 fc_poplevel(ctx); 10712 10713 callstack_pop(ctx, FC_PUSH_VPM); 10714 return 0; 10715} 10716 10717static int tgsi_bgnloop(struct r600_shader_ctx *ctx) 10718{ 10719 /* LOOP_START_DX10 ignores the LOOP_CONFIG* registers, so it is not 10720 * limited to 4096 iterations, like the other LOOP_* instructions. 
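	 * The matching CF_OP_LOOP_END is emitted by tgsi_endloop() below,
	 * which also patches the start/end/break jump addresses once both
	 * CF instructions exist.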
	 */
	r600_bytecode_add_cfinst(ctx->bc, CF_OP_LOOP_START_DX10);

	fc_pushlevel(ctx, FC_LOOP);

	/* check stack depth */
	callstack_push(ctx, FC_LOOP);
	return 0;
}

static int tgsi_endloop(struct r600_shader_ctx *ctx)
{
	int i;

	r600_bytecode_add_cfinst(ctx->bc, CF_OP_LOOP_END);

	if (ctx->bc->fc_stack[ctx->bc->fc_sp - 1].type != FC_LOOP) {
		R600_ERR("loop/endloop in shader code are not paired.\n");
		return -EINVAL;
	}

	/* fixup loop pointers - from r600isa
	   LOOP END points to CF after LOOP START,
	   LOOP START points to CF after LOOP END,
	   BRK/CONT point to the LOOP END CF
	 */
	ctx->bc->cf_last->cf_addr = ctx->bc->fc_stack[ctx->bc->fc_sp - 1].start->id + 2;

	ctx->bc->fc_stack[ctx->bc->fc_sp - 1].start->cf_addr = ctx->bc->cf_last->id + 2;

	for (i = 0; i < ctx->bc->fc_stack[ctx->bc->fc_sp - 1].num_mid; i++) {
		ctx->bc->fc_stack[ctx->bc->fc_sp - 1].mid[i]->cf_addr = ctx->bc->cf_last->id;
	}
	/* XXX add LOOPRET support */
	fc_poplevel(ctx);
	callstack_pop(ctx, FC_LOOP);
	return 0;
}

static int tgsi_loop_brk_cont(struct r600_shader_ctx *ctx)
{
	unsigned int fscp;

	for (fscp = ctx->bc->fc_sp; fscp > 0; fscp--)
	{
		if (FC_LOOP == ctx->bc->fc_stack[fscp - 1].type)
			break;
	}

	if (fscp == 0) {
		R600_ERR("Break not inside loop/endloop pair\n");
		return -EINVAL;
	}

	r600_bytecode_add_cfinst(ctx->bc, ctx->inst_info->op);

	fc_set_mid(ctx, fscp - 1);

	return 0;
}

static int tgsi_gs_emit(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	int stream = ctx->literals[inst->Src[0].Register.Index * 4 + inst->Src[0].Register.SwizzleX];
	int r;

	if (ctx->inst_info->op == CF_OP_EMIT_VERTEX)
		emit_gs_ring_writes(ctx, ctx->gs_stream_output_info, stream, TRUE);

	r = r600_bytecode_add_cfinst(ctx->bc, ctx->inst_info->op);
	if (!r) {
		ctx->bc->cf_last->count = stream; // Count field for CUT/EMIT_VERTEX indicates which stream
		if (ctx->inst_info->op == CF_OP_EMIT_VERTEX)
			return emit_inc_ring_offset(ctx, stream, TRUE);
	}
	return r;
}

static int tgsi_umad(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	struct r600_bytecode_alu alu;
	int i, j, r;
	int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);

	/* src0 * src1 */
	for (i = 0; i < lasti + 1; i++) {
		if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
			continue;

		memset(&alu, 0, sizeof(struct r600_bytecode_alu));

		alu.dst.chan = i;
		alu.dst.sel = ctx->temp_reg;
		alu.dst.write = 1;

		alu.op = ALU_OP2_MULLO_UINT;
		for (j = 0; j < 2; j++) {
			r600_bytecode_src(&alu.src[j], &ctx->src[j], i);
		}

		alu.last = 1;
		r = emit_mul_int_op(ctx->bc, &alu);
		if (r)
			return r;
	}


	for (i = 0; i < lasti + 1; i++) {
		if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
			continue;

		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);

		alu.op = ALU_OP2_ADD_INT;

		alu.src[0].sel = ctx->temp_reg;
		alu.src[0].chan = i;
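		/* add src2: dst = lo32(src0 * src1) + src2 */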
10841 r600_bytecode_src(&alu.src[1], &ctx->src[2], i); 10842 if (i == lasti) { 10843 alu.last = 1; 10844 } 10845 r = r600_bytecode_add_alu(ctx->bc, &alu); 10846 if (r) 10847 return r; 10848 } 10849 return 0; 10850} 10851 10852static int tgsi_pk2h(struct r600_shader_ctx *ctx) 10853{ 10854 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction; 10855 struct r600_bytecode_alu alu; 10856 int r, i; 10857 int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask); 10858 10859 /* temp.xy = f32_to_f16(src) */ 10860 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 10861 alu.op = ALU_OP1_FLT32_TO_FLT16; 10862 alu.dst.chan = 0; 10863 alu.dst.sel = ctx->temp_reg; 10864 alu.dst.write = 1; 10865 r600_bytecode_src(&alu.src[0], &ctx->src[0], 0); 10866 r = r600_bytecode_add_alu(ctx->bc, &alu); 10867 if (r) 10868 return r; 10869 alu.dst.chan = 1; 10870 r600_bytecode_src(&alu.src[0], &ctx->src[0], 1); 10871 alu.last = 1; 10872 r = r600_bytecode_add_alu(ctx->bc, &alu); 10873 if (r) 10874 return r; 10875 10876 /* dst.x = temp.y * 0x10000 + temp.x */ 10877 for (i = 0; i < lasti + 1; i++) { 10878 if (!(inst->Dst[0].Register.WriteMask & (1 << i))) 10879 continue; 10880 10881 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 10882 alu.op = ALU_OP3_MULADD_UINT24; 10883 alu.is_op3 = 1; 10884 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst); 10885 alu.last = i == lasti; 10886 alu.src[0].sel = ctx->temp_reg; 10887 alu.src[0].chan = 1; 10888 alu.src[1].sel = V_SQ_ALU_SRC_LITERAL; 10889 alu.src[1].value = 0x10000; 10890 alu.src[2].sel = ctx->temp_reg; 10891 alu.src[2].chan = 0; 10892 r = r600_bytecode_add_alu(ctx->bc, &alu); 10893 if (r) 10894 return r; 10895 } 10896 10897 return 0; 10898} 10899 10900static int tgsi_up2h(struct r600_shader_ctx *ctx) 10901{ 10902 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction; 10903 struct r600_bytecode_alu alu; 10904 int r, i; 10905 int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask); 10906 10907 /* temp.x = src.x */ 10908 /* note: no need to mask out the high bits */ 10909 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 10910 alu.op = ALU_OP1_MOV; 10911 alu.dst.chan = 0; 10912 alu.dst.sel = ctx->temp_reg; 10913 alu.dst.write = 1; 10914 r600_bytecode_src(&alu.src[0], &ctx->src[0], 0); 10915 r = r600_bytecode_add_alu(ctx->bc, &alu); 10916 if (r) 10917 return r; 10918 10919 /* temp.y = src.x >> 16 */ 10920 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 10921 alu.op = ALU_OP2_LSHR_INT; 10922 alu.dst.chan = 1; 10923 alu.dst.sel = ctx->temp_reg; 10924 alu.dst.write = 1; 10925 r600_bytecode_src(&alu.src[0], &ctx->src[0], 0); 10926 alu.src[1].sel = V_SQ_ALU_SRC_LITERAL; 10927 alu.src[1].value = 16; 10928 alu.last = 1; 10929 r = r600_bytecode_add_alu(ctx->bc, &alu); 10930 if (r) 10931 return r; 10932 10933 /* dst.wz = dst.xy = f16_to_f32(temp.xy) */ 10934 for (i = 0; i < lasti + 1; i++) { 10935 if (!(inst->Dst[0].Register.WriteMask & (1 << i))) 10936 continue; 10937 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 10938 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst); 10939 alu.op = ALU_OP1_FLT16_TO_FLT32; 10940 alu.src[0].sel = ctx->temp_reg; 10941 alu.src[0].chan = i % 2; 10942 alu.last = i == lasti; 10943 r = r600_bytecode_add_alu(ctx->bc, &alu); 10944 if (r) 10945 return r; 10946 } 10947 10948 return 0; 10949} 10950 10951static int tgsi_bfe(struct r600_shader_ctx *ctx) 10952{ 10953 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction; 10954 struct r600_bytecode_alu alu; 10955 int lasti = 
		tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
	int r, i;
	int dst = -1;

	if ((inst->Src[0].Register.File == inst->Dst[0].Register.File &&
	     inst->Src[0].Register.Index == inst->Dst[0].Register.Index) ||
	    (inst->Src[2].Register.File == inst->Dst[0].Register.File &&
	     inst->Src[2].Register.Index == inst->Dst[0].Register.Index))
		dst = r600_get_temp(ctx);

	r = tgsi_op3_dst(ctx, dst);
	if (r)
		return r;

	/* temp = (width >= 32), i.e. the bitfield covers the whole register */
	for (i = 0; i < lasti + 1; i++) {
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP2_SETGE_INT;
		r600_bytecode_src(&alu.src[0], &ctx->src[2], i);
		alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
		alu.src[1].value = 32;
		alu.dst.sel = ctx->temp_reg;
		alu.dst.chan = i;
		alu.dst.write = 1;
		if (i == lasti)
			alu.last = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}

	/* select the raw source when width >= 32, otherwise the BFE result */
	for (i = 0; i < lasti + 1; i++) {
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP3_CNDE_INT;
		alu.is_op3 = 1;
		alu.src[0].sel = ctx->temp_reg;
		alu.src[0].chan = i;

		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
		if (dst != -1)
			alu.src[1].sel = dst;
		else
			alu.src[1].sel = alu.dst.sel;
		alu.src[1].chan = i;
		r600_bytecode_src(&alu.src[2], &ctx->src[0], i);
		alu.dst.write = 1;
		if (i == lasti)
			alu.last = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}

	return 0;
}

static int tgsi_clock(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	struct r600_bytecode_alu alu;
	int r;

	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
	alu.op = ALU_OP1_MOV;
	tgsi_dst(ctx, &inst->Dst[0], 0, &alu.dst);
	alu.src[0].sel = EG_V_SQ_ALU_SRC_TIME_LO;
	/* no 'last' here: both TIME_LO/TIME_HI reads go in one ALU group */
	r = r600_bytecode_add_alu(ctx->bc, &alu);
	if (r)
		return r;
	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
	alu.op = ALU_OP1_MOV;
	tgsi_dst(ctx, &inst->Dst[0], 1, &alu.dst);
	alu.src[0].sel = EG_V_SQ_ALU_SRC_TIME_HI;
	alu.last = 1;
	r = r600_bytecode_add_alu(ctx->bc, &alu);
	if (r)
		return r;
	return 0;
}

static int emit_u64add(struct r600_shader_ctx *ctx, int op,
		       int treg,
		       int src0_sel, int src0_chan,
		       int src1_sel, int src1_chan)
{
	struct r600_bytecode_alu alu;
	int r;
	int opc;

	if (op == ALU_OP2_ADD_INT)
		opc = ALU_OP2_ADDC_UINT;
	else
		opc = ALU_OP2_SUBB_UINT;

	/* dst.lo = src0.lo <op> src1.lo */
	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
	alu.op = op;
	alu.dst.sel = treg;
	alu.dst.chan = 0;
	alu.dst.write = 1;
	alu.src[0].sel = src0_sel;
	alu.src[0].chan = src0_chan + 0;
	alu.src[1].sel = src1_sel;
	alu.src[1].chan = src1_chan + 0;
	alu.src[1].neg = 0;
	r = r600_bytecode_add_alu(ctx->bc, &alu);
	if (r)
		return r;

	/* dst.hi = src0.hi <op> src1.hi, carry/borrow not yet applied */
	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
	alu.op = op;
	alu.dst.sel = treg;
	alu.dst.chan = 1;
	alu.dst.write = 1;
	alu.src[0].sel = src0_sel;
	alu.src[0].chan = src0_chan + 1;
	alu.src[1].sel = src1_sel;
	alu.src[1].chan = src1_chan + 1;
	alu.src[1].neg = 0;
	r = r600_bytecode_add_alu(ctx->bc, &alu);
	if (r)
		return r;

	/* carry/borrow of the low half */
	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
	alu.op = opc;
	alu.dst.sel = treg;
	alu.dst.chan = 2;
	alu.dst.write = 1;
	alu.last = 1;
	alu.src[0].sel = src0_sel;
	alu.src[0].chan = src0_chan + 0;
	alu.src[1].sel = src1_sel;
	alu.src[1].chan = src1_chan + 0;
	alu.src[1].neg = 0;
	r = r600_bytecode_add_alu(ctx->bc, &alu);
	if (r)
		return r;

	/* fold the carry/borrow into dst.hi */
	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
	alu.op = op;
	alu.dst.sel = treg;
	alu.dst.chan = 1;
	alu.dst.write = 1;
	alu.src[0].sel = treg;
	alu.src[0].chan = 1;
	alu.src[1].sel = treg;
	alu.src[1].chan = 2;
	alu.last = 1;
	r = r600_bytecode_add_alu(ctx->bc, &alu);
	if (r)
		return r;
	return 0;
}

static int egcm_u64add(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	struct r600_bytecode_alu alu;
	int r;
	int treg = ctx->temp_reg;
	int op = ALU_OP2_ADD_INT, opc = ALU_OP2_ADDC_UINT;

	if (ctx->src[1].neg) {
		op = ALU_OP2_SUB_INT;
		opc = ALU_OP2_SUBB_UINT;
	}
	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
	alu.op = op;
	alu.dst.sel = treg;
	alu.dst.chan = 0;
	alu.dst.write = 1;
	r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
	r600_bytecode_src(&alu.src[1], &ctx->src[1], 0);
	alu.src[1].neg = 0;
	r = r600_bytecode_add_alu(ctx->bc, &alu);
	if (r)
		return r;

	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
	alu.op = op;
	alu.dst.sel = treg;
	alu.dst.chan = 1;
	alu.dst.write = 1;
	r600_bytecode_src(&alu.src[0], &ctx->src[0], 1);
	r600_bytecode_src(&alu.src[1], &ctx->src[1], 1);
	alu.src[1].neg = 0;
	r = r600_bytecode_add_alu(ctx->bc, &alu);
	if (r)
		return r;

	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
	alu.op = opc;
	alu.dst.sel = treg;
	alu.dst.chan = 2;
	alu.dst.write = 1;
	alu.last = 1;
	r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
	r600_bytecode_src(&alu.src[1], &ctx->src[1], 0);
	alu.src[1].neg = 0;
	r = r600_bytecode_add_alu(ctx->bc, &alu);
	if (r)
		return r;

	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
	alu.op = op;
	tgsi_dst(ctx, &inst->Dst[0], 1, &alu.dst);
	alu.src[0].sel = treg;
	alu.src[0].chan = 1;
	alu.src[1].sel = treg;
	alu.src[1].chan = 2;
	alu.last = 1;
	r = r600_bytecode_add_alu(ctx->bc, &alu);
	if (r)
		return r;
	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
	alu.op = ALU_OP1_MOV;
	tgsi_dst(ctx, &inst->Dst[0], 0, &alu.dst);
	alu.src[0].sel = treg;
	alu.src[0].chan = 0;
	alu.last = 1;
	r = r600_bytecode_add_alu(ctx->bc, &alu);
	if (r)
		return r;
	return 0;
}


static int egcm_i64neg(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	struct r600_bytecode_alu alu;
	int r;
	int treg = ctx->temp_reg;
	const int op = ALU_OP2_SUB_INT;
	const int opc = ALU_OP2_SUBB_UINT;

	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
	alu.op = op;
	alu.dst.sel = treg;
	alu.dst.chan = 0;
	alu.dst.write = 1;
	alu.src[0].sel = V_SQ_ALU_SRC_0;
	r600_bytecode_src(&alu.src[1], &ctx->src[0], 0);
	alu.src[1].neg = 0;
	r =
r600_bytecode_add_alu(ctx->bc, &alu); 11198 if (r) 11199 return r; 11200 11201 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 11202 alu.op = op; 11203 alu.dst.sel = treg; 11204 alu.dst.chan = 1; 11205 alu.dst.write = 1; 11206 alu.src[0].sel = V_SQ_ALU_SRC_0; 11207 r600_bytecode_src(&alu.src[1], &ctx->src[0], 1); 11208 alu.src[1].neg = 0; 11209 r = r600_bytecode_add_alu(ctx->bc, &alu); 11210 if (r) 11211 return r; 11212 11213 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 11214 alu.op = opc ; 11215 alu.dst.sel = treg; 11216 alu.dst.chan = 2; 11217 alu.dst.write = 1; 11218 alu.last = 1; 11219 alu.src[0].sel = V_SQ_ALU_SRC_0; 11220 r600_bytecode_src(&alu.src[1], &ctx->src[0], 0); 11221 alu.src[1].neg = 0; 11222 r = r600_bytecode_add_alu(ctx->bc, &alu); 11223 if (r) 11224 return r; 11225 11226 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 11227 alu.op = op; 11228 tgsi_dst(ctx, &inst->Dst[0], 1, &alu.dst); 11229 alu.src[0].sel = treg; 11230 alu.src[0].chan = 1; 11231 alu.src[1].sel = treg; 11232 alu.src[1].chan = 2; 11233 alu.last = 1; 11234 r = r600_bytecode_add_alu(ctx->bc, &alu); 11235 if (r) 11236 return r; 11237 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 11238 alu.op = ALU_OP1_MOV; 11239 tgsi_dst(ctx, &inst->Dst[0], 0, &alu.dst); 11240 alu.src[0].sel = treg; 11241 alu.src[0].chan = 0; 11242 alu.last = 1; 11243 r = r600_bytecode_add_alu(ctx->bc, &alu); 11244 if (r) 11245 return r; 11246 return 0; 11247} 11248 11249/* result.y = mul_high a, b 11250 result.x = mul a,b 11251 result.y += a.x * b.y + a.y * b.x; 11252*/ 11253static int egcm_u64mul(struct r600_shader_ctx *ctx) 11254{ 11255 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction; 11256 struct r600_bytecode_alu alu; 11257 int r; 11258 int treg = ctx->temp_reg; 11259 11260 /* temp.x = mul_lo a.x, b.x */ 11261 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 11262 alu.op = ALU_OP2_MULLO_UINT; 11263 alu.dst.sel = treg; 11264 alu.dst.chan = 0; 11265 alu.dst.write = 1; 11266 r600_bytecode_src(&alu.src[0], &ctx->src[0], 0); 11267 r600_bytecode_src(&alu.src[1], &ctx->src[1], 0); 11268 r = emit_mul_int_op(ctx->bc, &alu); 11269 if (r) 11270 return r; 11271 11272 /* temp.y = mul_hi a.x, b.x */ 11273 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 11274 alu.op = ALU_OP2_MULHI_UINT; 11275 alu.dst.sel = treg; 11276 alu.dst.chan = 1; 11277 alu.dst.write = 1; 11278 r600_bytecode_src(&alu.src[0], &ctx->src[0], 0); 11279 r600_bytecode_src(&alu.src[1], &ctx->src[1], 0); 11280 r = emit_mul_int_op(ctx->bc, &alu); 11281 if (r) 11282 return r; 11283 11284 /* temp.z = mul a.x, b.y */ 11285 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 11286 alu.op = ALU_OP2_MULLO_UINT; 11287 alu.dst.sel = treg; 11288 alu.dst.chan = 2; 11289 alu.dst.write = 1; 11290 r600_bytecode_src(&alu.src[0], &ctx->src[0], 0); 11291 r600_bytecode_src(&alu.src[1], &ctx->src[1], 1); 11292 r = emit_mul_int_op(ctx->bc, &alu); 11293 if (r) 11294 return r; 11295 11296 /* temp.w = mul a.y, b.x */ 11297 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 11298 alu.op = ALU_OP2_MULLO_UINT; 11299 alu.dst.sel = treg; 11300 alu.dst.chan = 3; 11301 alu.dst.write = 1; 11302 r600_bytecode_src(&alu.src[0], &ctx->src[0], 1); 11303 r600_bytecode_src(&alu.src[1], &ctx->src[1], 0); 11304 r = emit_mul_int_op(ctx->bc, &alu); 11305 if (r) 11306 return r; 11307 11308 /* temp.z = temp.z + temp.w */ 11309 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 11310 alu.op = ALU_OP2_ADD_INT; 11311 alu.dst.sel = treg; 11312 alu.dst.chan = 2; 11313 alu.dst.write = 1; 
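	/* Note: the cross products only need their low 32 bits (MULLO):
	 * their high halves, like a.y*b.y itself, would land above bit 63
	 * of the 64-bit result and are dropped. */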
	alu.src[0].sel = treg;
	alu.src[0].chan = 2;
	alu.src[1].sel = treg;
	alu.src[1].chan = 3;
	alu.last = 1;
	r = r600_bytecode_add_alu(ctx->bc, &alu);
	if (r)
		return r;

	/* temp.y = temp.y + temp.z */
	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
	alu.op = ALU_OP2_ADD_INT;
	alu.dst.sel = treg;
	alu.dst.chan = 1;
	alu.dst.write = 1;
	alu.src[0].sel = treg;
	alu.src[0].chan = 1;
	alu.src[1].sel = treg;
	alu.src[1].chan = 2;
	alu.last = 1;
	r = r600_bytecode_add_alu(ctx->bc, &alu);
	if (r)
		return r;

	/* dst.x = temp.x */
	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
	alu.op = ALU_OP1_MOV;
	tgsi_dst(ctx, &inst->Dst[0], 0, &alu.dst);
	alu.src[0].sel = treg;
	alu.src[0].chan = 0;
	r = r600_bytecode_add_alu(ctx->bc, &alu);
	if (r)
		return r;

	/* dst.y = temp.y */
	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
	alu.op = ALU_OP1_MOV;
	tgsi_dst(ctx, &inst->Dst[0], 1, &alu.dst);
	alu.src[0].sel = treg;
	alu.src[0].chan = 1;
	alu.last = 1;
	r = r600_bytecode_add_alu(ctx->bc, &alu);
	if (r)
		return r;

	return 0;
}

static int emit_u64sge(struct r600_shader_ctx *ctx,
		       int treg,
		       int src0_sel, int src0_base_chan,
		       int src1_sel, int src1_base_chan)
{
	int r;
	/* for 64-bit sge */
	/* result = (src0.y > src1.y) || ((src0.y == src1.y) && src0.x >= src1.x) */
	r = single_alu_op2(ctx, ALU_OP2_SETGT_UINT,
			   treg, 1,
			   src0_sel, src0_base_chan + 1,
			   src1_sel, src1_base_chan + 1);
	if (r)
		return r;

	r = single_alu_op2(ctx, ALU_OP2_SETGE_UINT,
			   treg, 0,
			   src0_sel, src0_base_chan,
			   src1_sel, src1_base_chan);
	if (r)
		return r;

	r = single_alu_op2(ctx, ALU_OP2_SETE_INT,
			   treg, 2,
			   src0_sel, src0_base_chan + 1,
			   src1_sel, src1_base_chan + 1);
	if (r)
		return r;

	r = single_alu_op2(ctx, ALU_OP2_AND_INT,
			   treg, 0,
			   treg, 0,
			   treg, 2);
	if (r)
		return r;

	r = single_alu_op2(ctx, ALU_OP2_OR_INT,
			   treg, 0,
			   treg, 0,
			   treg, 1);
	if (r)
		return r;
	return 0;
}

/* this isn't a complete div, it's just enough for the qbo shader to work */
static int egcm_u64div(struct r600_shader_ctx *ctx)
{
	struct r600_bytecode_alu alu;
	struct r600_bytecode_alu_src alu_num_hi, alu_num_lo, alu_denom_hi, alu_denom_lo, alu_src;
	int r, i;
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;

	/* make sure we are dividing by a constant with 0 in the high bits */
	if (ctx->src[1].sel != V_SQ_ALU_SRC_LITERAL)
		return -1;
	if (ctx->src[1].value[ctx->src[1].swizzle[1]] != 0)
		return -1;
	/* make sure we are doing one division */
	if (inst->Dst[0].Register.WriteMask != 0x3)
		return -1;

	/* emit_if uses ctx->temp_reg, so we can't use it here */
	int treg = r600_get_temp(ctx);
	int tmp_num = r600_get_temp(ctx);
	int sub_tmp = r600_get_temp(ctx);

	/* the temporary quotient is tmp_num.zw */
	r600_bytecode_src(&alu_num_lo, &ctx->src[0], 0);
	r600_bytecode_src(&alu_num_hi, &ctx->src[0], 1);
	r600_bytecode_src(&alu_denom_lo, &ctx->src[1], 0);
	r600_bytecode_src(&alu_denom_hi, &ctx->src[1], 1);

	/* MOV tmp_num.xy, numerator */
	r = single_alu_op2(ctx, ALU_OP1_MOV,
			   tmp_num, 0,
			   alu_num_lo.sel, alu_num_lo.chan,
			   0, 0);
	if (r)
		return r;
	r = single_alu_op2(ctx, ALU_OP1_MOV,
			   tmp_num, 1,
			   alu_num_hi.sel, alu_num_hi.chan,
			   0, 0);
	if (r)
		return r;

	r = single_alu_op2(ctx, ALU_OP1_MOV,
			   tmp_num, 2,
			   V_SQ_ALU_SRC_LITERAL, 0,
			   0, 0);
	if (r)
		return r;

	r = single_alu_op2(ctx, ALU_OP1_MOV,
			   tmp_num, 3,
			   V_SQ_ALU_SRC_LITERAL, 0,
			   0, 0);
	if (r)
		return r;

	/* treg 0 is log2_denom */
	/* normally this gets the MSB for the denom high value
	   - however we know this will always be 0 here. */
	r = single_alu_op2(ctx,
			   ALU_OP1_MOV,
			   treg, 0,
			   V_SQ_ALU_SRC_LITERAL, 32,
			   0, 0);
	if (r)
		return r;

	/* normally we'd check denom hi for 0, but we know it already is */
	/* t0.y = num_hi >= denom_lo */
	r = single_alu_op2(ctx,
			   ALU_OP2_SETGE_UINT,
			   treg, 1,
			   alu_num_hi.sel, alu_num_hi.chan,
			   V_SQ_ALU_SRC_LITERAL, alu_denom_lo.value);
	if (r)
		return r;

	memset(&alu_src, 0, sizeof(alu_src));
	alu_src.sel = treg;
	alu_src.chan = 1;
	r = emit_if(ctx, ALU_OP2_PRED_SETNE_INT, &alu_src);
	if (r)
		return r;

	/* the for loops are unrolled in here */
	/* get the MSB first: t0.x = msb(src[1].x) */
	int msb_lo = util_last_bit(alu_denom_lo.value);
	r = single_alu_op2(ctx, ALU_OP1_MOV,
			   treg, 0,
			   V_SQ_ALU_SRC_LITERAL, msb_lo,
			   0, 0);
	if (r)
		return r;

	/* unroll the asm here */
	for (i = 0; i < 31; i++) {
		r = single_alu_op2(ctx, ALU_OP2_SETGE_UINT,
				   treg, 2,
				   V_SQ_ALU_SRC_LITERAL, i,
				   treg, 0);
		if (r)
			return r;

		/* we can do this shift on the CPU */
		uint32_t denom_lo_shl = alu_denom_lo.value << (31 - i);
		/* t0.y = tmp_num.y >= denom_lo_shl */
		r = single_alu_op2(ctx, ALU_OP2_SETGE_UINT,
				   treg, 1,
				   tmp_num, 1,
				   V_SQ_ALU_SRC_LITERAL, denom_lo_shl);
		if (r)
			return r;

		r = single_alu_op2(ctx, ALU_OP2_AND_INT,
				   treg, 1,
				   treg, 1,
				   treg, 2);
		if (r)
			return r;

		memset(&alu_src, 0, sizeof(alu_src));
		alu_src.sel = treg;
		alu_src.chan = 1;
		r = emit_if(ctx, ALU_OP2_PRED_SETNE_INT, &alu_src);
		if (r)
			return r;

		r = single_alu_op2(ctx, ALU_OP2_SUB_INT,
				   tmp_num, 1,
				   tmp_num, 1,
				   V_SQ_ALU_SRC_LITERAL, denom_lo_shl);
		if (r)
			return r;

		r = single_alu_op2(ctx, ALU_OP2_OR_INT,
				   tmp_num, 3,
				   tmp_num, 3,
				   V_SQ_ALU_SRC_LITERAL, 1U << (31 - i));
		if (r)
			return r;

		r = tgsi_endif(ctx);
		if (r)
			return r;
	}

	/* log2_denom is always <= 31, so manually peel the last loop
	 * iteration.
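	 * For i == 31 the shift amount is zero, so the compare and subtract
	 * below use denom_lo directly and the quotient bit is 1U << 0.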
11555 */ 11556 r = single_alu_op2(ctx, ALU_OP2_SETGE_UINT, 11557 treg, 1, 11558 tmp_num, 1, 11559 V_SQ_ALU_SRC_LITERAL, alu_denom_lo.value); 11560 if (r) 11561 return r; 11562 11563 memset(&alu_src, 0, sizeof(alu_src)); 11564 alu_src.sel = treg; 11565 alu_src.chan = 1; 11566 r = emit_if(ctx, ALU_OP2_PRED_SETNE_INT, &alu_src); 11567 if (r) 11568 return r; 11569 11570 r = single_alu_op2(ctx, ALU_OP2_SUB_INT, 11571 tmp_num, 1, 11572 tmp_num, 1, 11573 V_SQ_ALU_SRC_LITERAL, alu_denom_lo.value); 11574 if (r) 11575 return r; 11576 11577 r = single_alu_op2(ctx, ALU_OP2_OR_INT, 11578 tmp_num, 3, 11579 tmp_num, 3, 11580 V_SQ_ALU_SRC_LITERAL, 1U); 11581 if (r) 11582 return r; 11583 r = tgsi_endif(ctx); 11584 if (r) 11585 return r; 11586 11587 r = tgsi_endif(ctx); 11588 if (r) 11589 return r; 11590 11591 /* onto the second loop to unroll */ 11592 for (i = 0; i < 31; i++) { 11593 r = single_alu_op2(ctx, ALU_OP2_SETGE_UINT, 11594 treg, 1, 11595 V_SQ_ALU_SRC_LITERAL, (63 - (31 - i)), 11596 treg, 0); 11597 if (r) 11598 return r; 11599 11600 uint64_t denom_shl = (uint64_t)alu_denom_lo.value << (31 - i); 11601 r = single_alu_op2(ctx, ALU_OP1_MOV, 11602 treg, 2, 11603 V_SQ_ALU_SRC_LITERAL, (denom_shl & 0xffffffff), 11604 0, 0); 11605 if (r) 11606 return r; 11607 11608 r = single_alu_op2(ctx, ALU_OP1_MOV, 11609 treg, 3, 11610 V_SQ_ALU_SRC_LITERAL, (denom_shl >> 32), 11611 0, 0); 11612 if (r) 11613 return r; 11614 11615 r = emit_u64sge(ctx, sub_tmp, 11616 tmp_num, 0, 11617 treg, 2); 11618 if (r) 11619 return r; 11620 11621 r = single_alu_op2(ctx, ALU_OP2_AND_INT, 11622 treg, 1, 11623 treg, 1, 11624 sub_tmp, 0); 11625 if (r) 11626 return r; 11627 11628 memset(&alu_src, 0, sizeof(alu_src)); 11629 alu_src.sel = treg; 11630 alu_src.chan = 1; 11631 r = emit_if(ctx, ALU_OP2_PRED_SETNE_INT, &alu_src); 11632 if (r) 11633 return r; 11634 11635 11636 r = emit_u64add(ctx, ALU_OP2_SUB_INT, 11637 sub_tmp, 11638 tmp_num, 0, 11639 treg, 2); 11640 if (r) 11641 return r; 11642 11643 r = single_alu_op2(ctx, ALU_OP1_MOV, 11644 tmp_num, 0, 11645 sub_tmp, 0, 11646 0, 0); 11647 if (r) 11648 return r; 11649 11650 r = single_alu_op2(ctx, ALU_OP1_MOV, 11651 tmp_num, 1, 11652 sub_tmp, 1, 11653 0, 0); 11654 if (r) 11655 return r; 11656 11657 r = single_alu_op2(ctx, ALU_OP2_OR_INT, 11658 tmp_num, 2, 11659 tmp_num, 2, 11660 V_SQ_ALU_SRC_LITERAL, 1U << (31 - i)); 11661 if (r) 11662 return r; 11663 11664 r = tgsi_endif(ctx); 11665 if (r) 11666 return r; 11667 } 11668 11669 /* log2_denom is always <= 63, so manually peel the last loop 11670 * iteration. 
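	 * With a zero shift denom_shl is just denom_lo widened to 64 bits;
	 * it is split into low/high dwords below for the 64-bit compare.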
	/* log2_denom is always <= 63, so manually peel the last loop
	 * iteration.
	 */
	uint64_t denom_shl = (uint64_t)alu_denom_lo.value;
	r = single_alu_op2(ctx, ALU_OP1_MOV,
			   treg, 2,
			   V_SQ_ALU_SRC_LITERAL, (denom_shl & 0xffffffff),
			   0, 0);
	if (r)
		return r;

	r = single_alu_op2(ctx, ALU_OP1_MOV,
			   treg, 3,
			   V_SQ_ALU_SRC_LITERAL, (denom_shl >> 32),
			   0, 0);
	if (r)
		return r;

	r = emit_u64sge(ctx, sub_tmp,
			tmp_num, 0,
			treg, 2);
	if (r)
		return r;

	memset(&alu_src, 0, sizeof(alu_src));
	alu_src.sel = sub_tmp;
	alu_src.chan = 0;
	r = emit_if(ctx, ALU_OP2_PRED_SETNE_INT, &alu_src);
	if (r)
		return r;

	r = emit_u64add(ctx, ALU_OP2_SUB_INT,
			sub_tmp,
			tmp_num, 0,
			treg, 2);
	if (r)
		return r;

	r = single_alu_op2(ctx, ALU_OP2_OR_INT,
			   tmp_num, 2,
			   tmp_num, 2,
			   V_SQ_ALU_SRC_LITERAL, 1U);
	if (r)
		return r;
	r = tgsi_endif(ctx);
	if (r)
		return r;

	/* move the quotient (tmp_num.zw) into the destination */
	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
	alu.op = ALU_OP1_MOV;
	tgsi_dst(ctx, &inst->Dst[0], 0, &alu.dst);
	alu.src[0].sel = tmp_num;
	alu.src[0].chan = 2;
	r = r600_bytecode_add_alu(ctx->bc, &alu);
	if (r)
		return r;

	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
	alu.op = ALU_OP1_MOV;
	tgsi_dst(ctx, &inst->Dst[0], 1, &alu.dst);
	alu.src[0].sel = tmp_num;
	alu.src[0].chan = 3;
	alu.last = 1;
	r = r600_bytecode_add_alu(ctx->bc, &alu);
	if (r)
		return r;
	return 0;
}

/* 64-bit "set on not equal": compare the two 32-bit halves separately and
 * OR the per-half results into a single boolean. */
static int egcm_u64sne(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	struct r600_bytecode_alu alu;
	int r;
	int treg = ctx->temp_reg;

	/* low halves */
	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
	alu.op = ALU_OP2_SETNE_INT;
	alu.dst.sel = treg;
	alu.dst.chan = 0;
	alu.dst.write = 1;
	r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
	r600_bytecode_src(&alu.src[1], &ctx->src[1], 0);
	r = r600_bytecode_add_alu(ctx->bc, &alu);
	if (r)
		return r;

	/* high halves */
	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
	alu.op = ALU_OP2_SETNE_INT;
	alu.dst.sel = treg;
	alu.dst.chan = 1;
	alu.dst.write = 1;
	r600_bytecode_src(&alu.src[0], &ctx->src[0], 1);
	r600_bytecode_src(&alu.src[1], &ctx->src[1], 1);
	alu.last = 1;
	r = r600_bytecode_add_alu(ctx->bc, &alu);
	if (r)
		return r;

	/* combine */
	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
	alu.op = ALU_OP2_OR_INT;
	tgsi_dst(ctx, &inst->Dst[0], 0, &alu.dst);
	alu.src[0].sel = treg;
	alu.src[0].chan = 0;
	alu.src[1].sel = treg;
	alu.src[1].chan = 1;
	alu.last = 1;
	r = r600_bytecode_add_alu(ctx->bc, &alu);
	if (r)
		return r;
	return 0;
}

/* TGSI opcode dispatch table for the original R600 family. */
static const struct r600_shader_tgsi_instruction r600_shader_tgsi_instruction[] = {
	[TGSI_OPCODE_ARL] = { ALU_OP0_NOP, tgsi_r600_arl},
	[TGSI_OPCODE_MOV] = { ALU_OP1_MOV, tgsi_op2},
	[TGSI_OPCODE_LIT] = { ALU_OP0_NOP, tgsi_lit},

	[TGSI_OPCODE_RCP] = { ALU_OP1_RECIP_IEEE, tgsi_trans_srcx_replicate},

	[TGSI_OPCODE_RSQ] = { ALU_OP0_NOP, tgsi_rsq},
	[TGSI_OPCODE_EXP] = { ALU_OP0_NOP, tgsi_exp},
	[TGSI_OPCODE_LOG] = { ALU_OP0_NOP, tgsi_log},
	[TGSI_OPCODE_MUL] = { ALU_OP2_MUL_IEEE, tgsi_op2},
	[TGSI_OPCODE_ADD] = { ALU_OP2_ADD, tgsi_op2},
	[TGSI_OPCODE_DP3] = { ALU_OP2_DOT4_IEEE, tgsi_dp},
	[TGSI_OPCODE_DP4] = { ALU_OP2_DOT4_IEEE, tgsi_dp},
	[TGSI_OPCODE_DST] = { ALU_OP0_NOP, tgsi_opdst},
	/* MIN_DX10 returns a non-NaN result if one src is NaN, MIN returns NaN */
	[TGSI_OPCODE_MIN] = { ALU_OP2_MIN_DX10, tgsi_op2},
	[TGSI_OPCODE_MAX] = { ALU_OP2_MAX_DX10, tgsi_op2},
	[TGSI_OPCODE_SLT] = { ALU_OP2_SETGT, tgsi_op2_swap},
	[TGSI_OPCODE_SGE] = { ALU_OP2_SETGE, tgsi_op2},
	[TGSI_OPCODE_MAD] = { ALU_OP3_MULADD_IEEE, tgsi_op3},
	[TGSI_OPCODE_LRP] = { ALU_OP0_NOP, tgsi_lrp},
	[TGSI_OPCODE_FMA] = { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_SQRT] = { ALU_OP1_SQRT_IEEE, tgsi_trans_srcx_replicate},
	[21] = { ALU_OP0_NOP, tgsi_unsupported},
	[22] = { ALU_OP0_NOP, tgsi_unsupported},
	[23] = { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_FRC] = { ALU_OP1_FRACT, tgsi_op2},
	[25] = { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_FLR] = { ALU_OP1_FLOOR, tgsi_op2},
	[TGSI_OPCODE_ROUND] = { ALU_OP1_RNDNE, tgsi_op2},
	[TGSI_OPCODE_EX2] = { ALU_OP1_EXP_IEEE, tgsi_trans_srcx_replicate},
	[TGSI_OPCODE_LG2] = { ALU_OP1_LOG_IEEE, tgsi_trans_srcx_replicate},
	[TGSI_OPCODE_POW] = { ALU_OP0_NOP, tgsi_pow},
	[31] = { ALU_OP0_NOP, tgsi_unsupported},
	[32] = { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_CLOCK] = { ALU_OP0_NOP, tgsi_unsupported},
	[34] = { ALU_OP0_NOP, tgsi_unsupported},
	[35] = { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_COS] = { ALU_OP1_COS, tgsi_trig},
	[TGSI_OPCODE_DDX] = { FETCH_OP_GET_GRADIENTS_H, tgsi_tex},
	[TGSI_OPCODE_DDY] = { FETCH_OP_GET_GRADIENTS_V, tgsi_tex},
	[TGSI_OPCODE_KILL] = { ALU_OP2_KILLGT, tgsi_kill}, /* unconditional kill */
	[TGSI_OPCODE_PK2H] = { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_PK2US] = { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_PK4B] = { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_PK4UB] = { ALU_OP0_NOP, tgsi_unsupported},
	[44] = { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_SEQ] = { ALU_OP2_SETE, tgsi_op2},
	[46] = { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_SGT] = { ALU_OP2_SETGT, tgsi_op2},
	[TGSI_OPCODE_SIN] = { ALU_OP1_SIN, tgsi_trig},
	[TGSI_OPCODE_SLE] = { ALU_OP2_SETGE, tgsi_op2_swap},
	[TGSI_OPCODE_SNE] = { ALU_OP2_SETNE, tgsi_op2},
	[51] = { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_TEX] = { FETCH_OP_SAMPLE, tgsi_tex},
	[TGSI_OPCODE_TXD] = { FETCH_OP_SAMPLE_G, tgsi_tex},
	[TGSI_OPCODE_TXP] = { FETCH_OP_SAMPLE, tgsi_tex},
	[TGSI_OPCODE_UP2H] = { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_UP2US] = { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_UP4B] = { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_UP4UB] = { ALU_OP0_NOP, tgsi_unsupported},
	[59] = { ALU_OP0_NOP, tgsi_unsupported},
	[60] = { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_ARR] = { ALU_OP0_NOP, tgsi_r600_arl},
	[62] = { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_CAL] = { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_RET] = { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_SSG] = { ALU_OP0_NOP, tgsi_ssg},
	[TGSI_OPCODE_CMP] = { ALU_OP0_NOP, tgsi_cmp},
	[67] = { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_TXB] = { FETCH_OP_SAMPLE_LB, tgsi_tex},
	[69] = { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_DIV] = { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_DP2] = { ALU_OP2_DOT4_IEEE, tgsi_dp},
	[TGSI_OPCODE_TXL] = { FETCH_OP_SAMPLE_L, tgsi_tex},
	[TGSI_OPCODE_BRK] = { CF_OP_LOOP_BREAK, tgsi_loop_brk_cont},
	[TGSI_OPCODE_IF] = { ALU_OP0_NOP, tgsi_if},
	[TGSI_OPCODE_UIF] = { ALU_OP0_NOP, tgsi_uif},
	[76] = { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_ELSE] = { ALU_OP0_NOP, tgsi_else},
	[TGSI_OPCODE_ENDIF] = { ALU_OP0_NOP, tgsi_endif},
	[TGSI_OPCODE_DDX_FINE] = { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_DDY_FINE] = { ALU_OP0_NOP, tgsi_unsupported},
	[81] = { ALU_OP0_NOP, tgsi_unsupported},
	[82] = { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_CEIL] = { ALU_OP1_CEIL, tgsi_op2},
	[TGSI_OPCODE_I2F] = { ALU_OP1_INT_TO_FLT, tgsi_op2_trans},
	[TGSI_OPCODE_NOT] = { ALU_OP1_NOT_INT, tgsi_op2},
	[TGSI_OPCODE_TRUNC] = { ALU_OP1_TRUNC, tgsi_op2},
	[TGSI_OPCODE_SHL] = { ALU_OP2_LSHL_INT, tgsi_op2_trans},
	[88] = { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_AND] = { ALU_OP2_AND_INT, tgsi_op2},
	[TGSI_OPCODE_OR] = { ALU_OP2_OR_INT, tgsi_op2},
	[TGSI_OPCODE_MOD] = { ALU_OP0_NOP, tgsi_imod},
	[TGSI_OPCODE_XOR] = { ALU_OP2_XOR_INT, tgsi_op2},
	[93] = { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_TXF] = { FETCH_OP_LD, tgsi_tex},
	[TGSI_OPCODE_TXQ] = { FETCH_OP_GET_TEXTURE_RESINFO, tgsi_tex},
	[TGSI_OPCODE_CONT] = { CF_OP_LOOP_CONTINUE, tgsi_loop_brk_cont},
	[TGSI_OPCODE_EMIT] = { CF_OP_EMIT_VERTEX, tgsi_gs_emit},
	[TGSI_OPCODE_ENDPRIM] = { CF_OP_CUT_VERTEX, tgsi_gs_emit},
	[TGSI_OPCODE_BGNLOOP] = { ALU_OP0_NOP, tgsi_bgnloop},
	[TGSI_OPCODE_BGNSUB] = { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_ENDLOOP] = { ALU_OP0_NOP, tgsi_endloop},
	[TGSI_OPCODE_ENDSUB] = { ALU_OP0_NOP, tgsi_unsupported},
	[103] = { FETCH_OP_GET_TEXTURE_RESINFO, tgsi_tex},
	[TGSI_OPCODE_TXQS] = { FETCH_OP_GET_NUMBER_OF_SAMPLES, tgsi_tex},
	[TGSI_OPCODE_RESQ] = { ALU_OP0_NOP, tgsi_unsupported},
	[106] = { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_NOP] = { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_FSEQ] = { ALU_OP2_SETE_DX10, tgsi_op2},
	[TGSI_OPCODE_FSGE] = { ALU_OP2_SETGE_DX10, tgsi_op2},
	[TGSI_OPCODE_FSLT] = { ALU_OP2_SETGT_DX10, tgsi_op2_swap},
	[TGSI_OPCODE_FSNE] = { ALU_OP2_SETNE_DX10, tgsi_op2_swap},
	[TGSI_OPCODE_MEMBAR] = { ALU_OP0_NOP, tgsi_unsupported},
	[113] = { ALU_OP0_NOP, tgsi_unsupported},
	[114] = { ALU_OP0_NOP, tgsi_unsupported},
	[115] = { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_KILL_IF] = { ALU_OP2_KILLGT, tgsi_kill}, /* conditional kill */
	[TGSI_OPCODE_END] = { ALU_OP0_NOP, tgsi_end}, /* aka HALT */
	[TGSI_OPCODE_DFMA] = { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_F2I] = { ALU_OP1_FLT_TO_INT, tgsi_op2_trans},
	[TGSI_OPCODE_IDIV] = { ALU_OP0_NOP, tgsi_idiv},
	[TGSI_OPCODE_IMAX] = { ALU_OP2_MAX_INT, tgsi_op2},
	[TGSI_OPCODE_IMIN] = { ALU_OP2_MIN_INT, tgsi_op2},
	[TGSI_OPCODE_INEG] = { ALU_OP2_SUB_INT, tgsi_ineg},
	[TGSI_OPCODE_ISGE] = { ALU_OP2_SETGE_INT, tgsi_op2},
	[TGSI_OPCODE_ISHR] = { ALU_OP2_ASHR_INT, tgsi_op2_trans},
	[TGSI_OPCODE_ISLT] = { ALU_OP2_SETGT_INT, tgsi_op2_swap},
	[TGSI_OPCODE_F2U] = { ALU_OP1_FLT_TO_UINT, tgsi_op2_trans},
	[TGSI_OPCODE_U2F] = { ALU_OP1_UINT_TO_FLT, tgsi_op2_trans},
	[TGSI_OPCODE_UADD] = { ALU_OP2_ADD_INT, tgsi_op2},
	[TGSI_OPCODE_UDIV] = { ALU_OP0_NOP, tgsi_udiv},
	[TGSI_OPCODE_UMAD] = { ALU_OP0_NOP, tgsi_umad},
	[TGSI_OPCODE_UMAX] = { ALU_OP2_MAX_UINT, tgsi_op2},
	[TGSI_OPCODE_UMIN] = { ALU_OP2_MIN_UINT, tgsi_op2},
	[TGSI_OPCODE_UMOD] = { ALU_OP0_NOP, tgsi_umod},
	[TGSI_OPCODE_UMUL] = { ALU_OP2_MULLO_UINT, tgsi_op2_trans},
	[TGSI_OPCODE_USEQ] = { ALU_OP2_SETE_INT, tgsi_op2},
	[TGSI_OPCODE_USGE] = { ALU_OP2_SETGE_UINT, tgsi_op2},
	[TGSI_OPCODE_USHR] = { ALU_OP2_LSHR_INT, tgsi_op2_trans},
	[TGSI_OPCODE_USLT] = { ALU_OP2_SETGT_UINT, tgsi_op2_swap},
	[TGSI_OPCODE_USNE] = { ALU_OP2_SETNE_INT, tgsi_op2_swap},
	[TGSI_OPCODE_SWITCH] = { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_CASE] = { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_DEFAULT] = { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_ENDSWITCH] = { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_SAMPLE] = { 0, tgsi_unsupported},
	[TGSI_OPCODE_SAMPLE_I] = { 0, tgsi_unsupported},
	[TGSI_OPCODE_SAMPLE_I_MS] = { 0, tgsi_unsupported},
	[TGSI_OPCODE_SAMPLE_B] = { 0, tgsi_unsupported},
	[TGSI_OPCODE_SAMPLE_C] = { 0, tgsi_unsupported},
	[TGSI_OPCODE_SAMPLE_C_LZ] = { 0, tgsi_unsupported},
	[TGSI_OPCODE_SAMPLE_D] = { 0, tgsi_unsupported},
	[TGSI_OPCODE_SAMPLE_L] = { 0, tgsi_unsupported},
	[TGSI_OPCODE_GATHER4] = { 0, tgsi_unsupported},
	[TGSI_OPCODE_SVIEWINFO] = { 0, tgsi_unsupported},
	[TGSI_OPCODE_SAMPLE_POS] = { 0, tgsi_unsupported},
	[TGSI_OPCODE_SAMPLE_INFO] = { 0, tgsi_unsupported},
	[TGSI_OPCODE_UARL] = { ALU_OP1_MOVA_INT, tgsi_r600_arl},
	[TGSI_OPCODE_UCMP] = { ALU_OP0_NOP, tgsi_ucmp},
	[TGSI_OPCODE_IABS] = { 0, tgsi_iabs},
	[TGSI_OPCODE_ISSG] = { 0, tgsi_issg},
	[TGSI_OPCODE_LOAD] = { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_STORE] = { ALU_OP0_NOP, tgsi_unsupported},
	[163] = { ALU_OP0_NOP, tgsi_unsupported},
	[164] = { ALU_OP0_NOP, tgsi_unsupported},
	[165] = { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_BARRIER] = { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_ATOMUADD] = { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_ATOMXCHG] = { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_ATOMCAS] = { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_ATOMAND] = { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_ATOMOR] = { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_ATOMXOR] = { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_ATOMUMIN] = { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_ATOMUMAX] = { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_ATOMIMIN] = { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_ATOMIMAX] = { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_TEX2] = { FETCH_OP_SAMPLE, tgsi_tex},
	[TGSI_OPCODE_TXB2] = { FETCH_OP_SAMPLE_LB, tgsi_tex},
	[TGSI_OPCODE_TXL2] = { FETCH_OP_SAMPLE_L, tgsi_tex},
	[TGSI_OPCODE_IMUL_HI] = { ALU_OP2_MULHI_INT, tgsi_op2_trans},
	[TGSI_OPCODE_UMUL_HI] = { ALU_OP2_MULHI_UINT, tgsi_op2_trans},
	[TGSI_OPCODE_TG4] = { FETCH_OP_GATHER4, tgsi_unsupported},
	[TGSI_OPCODE_LODQ] = { FETCH_OP_GET_LOD, tgsi_unsupported},
	[TGSI_OPCODE_IBFE] = { ALU_OP3_BFE_INT, tgsi_unsupported},
	[TGSI_OPCODE_UBFE] = { ALU_OP3_BFE_UINT, tgsi_unsupported},
	[TGSI_OPCODE_BFI] = { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_BREV] = { ALU_OP1_BFREV_INT, tgsi_unsupported},
	[TGSI_OPCODE_POPC] = { ALU_OP1_BCNT_INT, tgsi_unsupported},
	[TGSI_OPCODE_LSB] = { ALU_OP1_FFBL_INT, tgsi_unsupported},
	[TGSI_OPCODE_IMSB] = { ALU_OP1_FFBH_INT, tgsi_unsupported},
	[TGSI_OPCODE_UMSB] = { ALU_OP1_FFBH_UINT, tgsi_unsupported},
	[TGSI_OPCODE_INTERP_CENTROID] = { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_INTERP_SAMPLE] = { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_INTERP_OFFSET] = { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_LAST] = { ALU_OP0_NOP, tgsi_unsupported},
};

/* Evergreen variant of the opcode dispatch table. */
static const struct r600_shader_tgsi_instruction eg_shader_tgsi_instruction[] = {
	[TGSI_OPCODE_ARL] = { ALU_OP0_NOP, tgsi_eg_arl},
	[TGSI_OPCODE_MOV] = { ALU_OP1_MOV, tgsi_op2},
	[TGSI_OPCODE_LIT] = { ALU_OP0_NOP, tgsi_lit},
	[TGSI_OPCODE_RCP] = { ALU_OP1_RECIP_IEEE, tgsi_trans_srcx_replicate},
	[TGSI_OPCODE_RSQ] = { ALU_OP0_NOP, tgsi_rsq},
	[TGSI_OPCODE_EXP] = { ALU_OP0_NOP, tgsi_exp},
	[TGSI_OPCODE_LOG] = { ALU_OP0_NOP, tgsi_log},
	[TGSI_OPCODE_MUL] = { ALU_OP2_MUL_IEEE, tgsi_op2},
	[TGSI_OPCODE_ADD] = { ALU_OP2_ADD, tgsi_op2},
	[TGSI_OPCODE_DP3] = { ALU_OP2_DOT4_IEEE, tgsi_dp},
	[TGSI_OPCODE_DP4] = { ALU_OP2_DOT4_IEEE, tgsi_dp},
	[TGSI_OPCODE_DST] = { ALU_OP0_NOP, tgsi_opdst},
	[TGSI_OPCODE_MIN] = { ALU_OP2_MIN_DX10, tgsi_op2},
	[TGSI_OPCODE_MAX] = { ALU_OP2_MAX_DX10, tgsi_op2},
	[TGSI_OPCODE_SLT] = { ALU_OP2_SETGT, tgsi_op2_swap},
	[TGSI_OPCODE_SGE] = { ALU_OP2_SETGE, tgsi_op2},
	[TGSI_OPCODE_MAD] = { ALU_OP3_MULADD_IEEE, tgsi_op3},
	[TGSI_OPCODE_LRP] = { ALU_OP0_NOP, tgsi_lrp},
	[TGSI_OPCODE_FMA] = { ALU_OP3_FMA, tgsi_op3},
	[TGSI_OPCODE_SQRT] = { ALU_OP1_SQRT_IEEE, tgsi_trans_srcx_replicate},
	[21] = { ALU_OP0_NOP, tgsi_unsupported},
	[22] = { ALU_OP0_NOP, tgsi_unsupported},
	[23] = { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_FRC] = { ALU_OP1_FRACT, tgsi_op2},
	[25] = { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_FLR] = { ALU_OP1_FLOOR, tgsi_op2},
	[TGSI_OPCODE_ROUND] = { ALU_OP1_RNDNE, tgsi_op2},
	[TGSI_OPCODE_EX2] = { ALU_OP1_EXP_IEEE, tgsi_trans_srcx_replicate},
	[TGSI_OPCODE_LG2] = { ALU_OP1_LOG_IEEE, tgsi_trans_srcx_replicate},
	[TGSI_OPCODE_POW] = { ALU_OP0_NOP, tgsi_pow},
	[31] = { ALU_OP0_NOP, tgsi_unsupported},
	[32] = { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_CLOCK] = { ALU_OP0_NOP, tgsi_clock},
	[34] = { ALU_OP0_NOP, tgsi_unsupported},
	[35] = { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_COS] = { ALU_OP1_COS, tgsi_trig},
	[TGSI_OPCODE_DDX] = { FETCH_OP_GET_GRADIENTS_H, tgsi_tex},
	[TGSI_OPCODE_DDY] = { FETCH_OP_GET_GRADIENTS_V, tgsi_tex},
	[TGSI_OPCODE_KILL] = { ALU_OP2_KILLGT, tgsi_kill}, /* unconditional kill */
	[TGSI_OPCODE_PK2H] = { ALU_OP0_NOP, tgsi_pk2h},
	[TGSI_OPCODE_PK2US] = { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_PK4B] = { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_PK4UB] = { ALU_OP0_NOP, tgsi_unsupported},
	[44] = { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_SEQ] = { ALU_OP2_SETE, tgsi_op2},
	[46] = { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_SGT] = { ALU_OP2_SETGT, tgsi_op2},
	[TGSI_OPCODE_SIN] = { ALU_OP1_SIN, tgsi_trig},
	[TGSI_OPCODE_SLE] = { ALU_OP2_SETGE, tgsi_op2_swap},
	[TGSI_OPCODE_SNE] = { ALU_OP2_SETNE, tgsi_op2},
	[51] = { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_TEX] = { FETCH_OP_SAMPLE, tgsi_tex},
	[TGSI_OPCODE_TXD] = { FETCH_OP_SAMPLE_G, tgsi_tex},
	[TGSI_OPCODE_TXP] = { FETCH_OP_SAMPLE, tgsi_tex},
	[TGSI_OPCODE_UP2H] = { ALU_OP0_NOP, tgsi_up2h},
	[TGSI_OPCODE_UP2US] = { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_UP4B] = { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_UP4UB] = { ALU_OP0_NOP, tgsi_unsupported},
	[59] = { ALU_OP0_NOP, tgsi_unsupported},
	[60] = { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_ARR] = { ALU_OP0_NOP, tgsi_eg_arl},
	[62] = { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_CAL] = { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_RET] = { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_SSG] = { ALU_OP0_NOP, tgsi_ssg},
	[TGSI_OPCODE_CMP] = { ALU_OP0_NOP, tgsi_cmp},
	[67] = { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_TXB] = { FETCH_OP_SAMPLE_LB, tgsi_tex},
	[69] = { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_DIV] = { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_DP2] = { ALU_OP2_DOT4_IEEE, tgsi_dp},
	[TGSI_OPCODE_TXL] = { FETCH_OP_SAMPLE_L, tgsi_tex},
	[TGSI_OPCODE_BRK] = { CF_OP_LOOP_BREAK, tgsi_loop_brk_cont},
	[TGSI_OPCODE_IF] = { ALU_OP0_NOP, tgsi_if},
	[TGSI_OPCODE_UIF] = { ALU_OP0_NOP, tgsi_uif},
	[76] = { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_ELSE] = { ALU_OP0_NOP, tgsi_else},
	[TGSI_OPCODE_ENDIF] = { ALU_OP0_NOP, tgsi_endif},
	[TGSI_OPCODE_DDX_FINE] = { FETCH_OP_GET_GRADIENTS_H, tgsi_tex},
	[TGSI_OPCODE_DDY_FINE] = { FETCH_OP_GET_GRADIENTS_V, tgsi_tex},
	[82] = { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_CEIL] = { ALU_OP1_CEIL, tgsi_op2},
	[TGSI_OPCODE_I2F] = { ALU_OP1_INT_TO_FLT, tgsi_op2_trans},
	[TGSI_OPCODE_NOT] = { ALU_OP1_NOT_INT, tgsi_op2},
	[TGSI_OPCODE_TRUNC] = { ALU_OP1_TRUNC, tgsi_op2},
	[TGSI_OPCODE_SHL] = { ALU_OP2_LSHL_INT, tgsi_op2},
	[88] = { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_AND] = { ALU_OP2_AND_INT, tgsi_op2},
	[TGSI_OPCODE_OR] = { ALU_OP2_OR_INT, tgsi_op2},
	[TGSI_OPCODE_MOD] = { ALU_OP0_NOP, tgsi_imod},
	[TGSI_OPCODE_XOR] = { ALU_OP2_XOR_INT, tgsi_op2},
	[93] = { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_TXF] = { FETCH_OP_LD, tgsi_tex},
	[TGSI_OPCODE_TXQ] = { FETCH_OP_GET_TEXTURE_RESINFO, tgsi_tex},
	[TGSI_OPCODE_CONT] = { CF_OP_LOOP_CONTINUE, tgsi_loop_brk_cont},
	[TGSI_OPCODE_EMIT] = { CF_OP_EMIT_VERTEX, tgsi_gs_emit},
	[TGSI_OPCODE_ENDPRIM] = { CF_OP_CUT_VERTEX, tgsi_gs_emit},
	[TGSI_OPCODE_BGNLOOP] = { ALU_OP0_NOP, tgsi_bgnloop},
	[TGSI_OPCODE_BGNSUB] = { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_ENDLOOP] = { ALU_OP0_NOP, tgsi_endloop},
	[TGSI_OPCODE_ENDSUB] = { ALU_OP0_NOP, tgsi_unsupported},
	[103] = { FETCH_OP_GET_TEXTURE_RESINFO, tgsi_tex},
	[TGSI_OPCODE_TXQS] = { FETCH_OP_GET_NUMBER_OF_SAMPLES, tgsi_tex},
	[TGSI_OPCODE_RESQ] = { FETCH_OP_GET_TEXTURE_RESINFO, tgsi_resq},
	[106] = { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_NOP] = { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_FSEQ] = { ALU_OP2_SETE_DX10, tgsi_op2},
	[TGSI_OPCODE_FSGE] = { ALU_OP2_SETGE_DX10, tgsi_op2},
	[TGSI_OPCODE_FSLT] = { ALU_OP2_SETGT_DX10, tgsi_op2_swap},
	[TGSI_OPCODE_FSNE] = { ALU_OP2_SETNE_DX10, tgsi_op2_swap},
	[TGSI_OPCODE_MEMBAR] = { ALU_OP0_NOP, tgsi_membar},
	[113] = { ALU_OP0_NOP, tgsi_unsupported},
	[114] = { ALU_OP0_NOP, tgsi_unsupported},
	[115] = { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_KILL_IF] = { ALU_OP2_KILLGT, tgsi_kill}, /* conditional kill */
	[TGSI_OPCODE_END] = { ALU_OP0_NOP, tgsi_end}, /* aka HALT */
	/* Refer below for TGSI_OPCODE_DFMA */
	[TGSI_OPCODE_F2I] = { ALU_OP1_FLT_TO_INT, tgsi_f2i},
	[TGSI_OPCODE_IDIV] = { ALU_OP0_NOP, tgsi_idiv},
	[TGSI_OPCODE_IMAX] = { ALU_OP2_MAX_INT, tgsi_op2},
	[TGSI_OPCODE_IMIN] = { ALU_OP2_MIN_INT, tgsi_op2},
	[TGSI_OPCODE_INEG] = { ALU_OP2_SUB_INT, tgsi_ineg},
	[TGSI_OPCODE_ISGE] = { ALU_OP2_SETGE_INT, tgsi_op2},
	[TGSI_OPCODE_ISHR] = { ALU_OP2_ASHR_INT, tgsi_op2},
	[TGSI_OPCODE_ISLT] = { ALU_OP2_SETGT_INT, tgsi_op2_swap},
	[TGSI_OPCODE_F2U] = { ALU_OP1_FLT_TO_UINT, tgsi_f2i},
	[TGSI_OPCODE_U2F] = { ALU_OP1_UINT_TO_FLT, tgsi_op2_trans},
	[TGSI_OPCODE_UADD] = { ALU_OP2_ADD_INT, tgsi_op2},
	[TGSI_OPCODE_UDIV] = { ALU_OP0_NOP, tgsi_udiv},
	[TGSI_OPCODE_UMAD] = { ALU_OP0_NOP, tgsi_umad},
	[TGSI_OPCODE_UMAX] = { ALU_OP2_MAX_UINT, tgsi_op2},
	[TGSI_OPCODE_UMIN] = { ALU_OP2_MIN_UINT, tgsi_op2},
	[TGSI_OPCODE_UMOD] = { ALU_OP0_NOP, tgsi_umod},
	[TGSI_OPCODE_UMUL] = { ALU_OP2_MULLO_UINT, tgsi_op2_trans},
	[TGSI_OPCODE_USEQ] = { ALU_OP2_SETE_INT, tgsi_op2},
	[TGSI_OPCODE_USGE] = { ALU_OP2_SETGE_UINT, tgsi_op2},
	[TGSI_OPCODE_USHR] = { ALU_OP2_LSHR_INT, tgsi_op2},
	[TGSI_OPCODE_USLT] = { ALU_OP2_SETGT_UINT, tgsi_op2_swap},
	[TGSI_OPCODE_USNE] = { ALU_OP2_SETNE_INT, tgsi_op2},
	[TGSI_OPCODE_SWITCH] = { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_CASE] = { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_DEFAULT] = { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_ENDSWITCH] = { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_SAMPLE] = { 0, tgsi_unsupported},
	[TGSI_OPCODE_SAMPLE_I] = { 0, tgsi_unsupported},
	[TGSI_OPCODE_SAMPLE_I_MS] = { 0, tgsi_unsupported},
	[TGSI_OPCODE_SAMPLE_B] = { 0, tgsi_unsupported},
	[TGSI_OPCODE_SAMPLE_C] = { 0, tgsi_unsupported},
	[TGSI_OPCODE_SAMPLE_C_LZ] = { 0, tgsi_unsupported},
	[TGSI_OPCODE_SAMPLE_D] = { 0, tgsi_unsupported},
	[TGSI_OPCODE_SAMPLE_L] = { 0, tgsi_unsupported},
	[TGSI_OPCODE_GATHER4] = { 0, tgsi_unsupported},
	[TGSI_OPCODE_SVIEWINFO] = { 0, tgsi_unsupported},
	[TGSI_OPCODE_SAMPLE_POS] = { 0, tgsi_unsupported},
	[TGSI_OPCODE_SAMPLE_INFO] = { 0, tgsi_unsupported},
	[TGSI_OPCODE_UARL] = { ALU_OP1_MOVA_INT, tgsi_eg_arl},
	[TGSI_OPCODE_UCMP] = { ALU_OP0_NOP, tgsi_ucmp},
	[TGSI_OPCODE_IABS] = { 0, tgsi_iabs},
	[TGSI_OPCODE_ISSG] = { 0, tgsi_issg},
	[TGSI_OPCODE_LOAD] = { ALU_OP0_NOP, tgsi_load},
	[TGSI_OPCODE_STORE] = { ALU_OP0_NOP, tgsi_store},
	[163] = { ALU_OP0_NOP, tgsi_unsupported},
	[164] = { ALU_OP0_NOP, tgsi_unsupported},
	[165] = { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_BARRIER] = { ALU_OP0_GROUP_BARRIER, tgsi_barrier},
	[TGSI_OPCODE_ATOMUADD] = { V_RAT_INST_ADD_RTN, tgsi_atomic_op},
	[TGSI_OPCODE_ATOMXCHG] = { V_RAT_INST_XCHG_RTN, tgsi_atomic_op},
	[TGSI_OPCODE_ATOMCAS] = { V_RAT_INST_CMPXCHG_INT_RTN, tgsi_atomic_op},
	[TGSI_OPCODE_ATOMAND] = { V_RAT_INST_AND_RTN, tgsi_atomic_op},
	[TGSI_OPCODE_ATOMOR] = { V_RAT_INST_OR_RTN, tgsi_atomic_op},
	[TGSI_OPCODE_ATOMXOR] = { V_RAT_INST_XOR_RTN, tgsi_atomic_op},
	[TGSI_OPCODE_ATOMUMIN] = { V_RAT_INST_MIN_UINT_RTN, tgsi_atomic_op},
	[TGSI_OPCODE_ATOMUMAX] = { V_RAT_INST_MAX_UINT_RTN, tgsi_atomic_op},
	[TGSI_OPCODE_ATOMIMIN] = { V_RAT_INST_MIN_INT_RTN, tgsi_atomic_op},
	[TGSI_OPCODE_ATOMIMAX] = { V_RAT_INST_MAX_INT_RTN, tgsi_atomic_op},
	[TGSI_OPCODE_TEX2] = { FETCH_OP_SAMPLE, tgsi_tex},
	[TGSI_OPCODE_TXB2] = { FETCH_OP_SAMPLE_LB, tgsi_tex},
	[TGSI_OPCODE_TXL2] = { FETCH_OP_SAMPLE_L, tgsi_tex},
	[TGSI_OPCODE_IMUL_HI] = { ALU_OP2_MULHI_INT, tgsi_op2_trans},
	[TGSI_OPCODE_UMUL_HI] = { ALU_OP2_MULHI_UINT, tgsi_op2_trans},
	[TGSI_OPCODE_TG4] = { FETCH_OP_GATHER4, tgsi_tex},
	[TGSI_OPCODE_LODQ] = { FETCH_OP_GET_LOD, tgsi_tex},
	[TGSI_OPCODE_IBFE] = { ALU_OP3_BFE_INT, tgsi_bfe},
	[TGSI_OPCODE_UBFE] = { ALU_OP3_BFE_UINT, tgsi_bfe},
	[TGSI_OPCODE_BFI] = { ALU_OP0_NOP, tgsi_bfi},
	[TGSI_OPCODE_BREV] = { ALU_OP1_BFREV_INT, tgsi_op2},
	[TGSI_OPCODE_POPC] = { ALU_OP1_BCNT_INT, tgsi_op2},
	[TGSI_OPCODE_LSB] = { ALU_OP1_FFBL_INT, tgsi_op2},
	[TGSI_OPCODE_IMSB] = { ALU_OP1_FFBH_INT, tgsi_msb},
	[TGSI_OPCODE_UMSB] = { ALU_OP1_FFBH_UINT, tgsi_msb},
	[TGSI_OPCODE_INTERP_CENTROID] = { ALU_OP0_NOP, tgsi_interp_egcm},
	[TGSI_OPCODE_INTERP_SAMPLE] = { ALU_OP0_NOP, tgsi_interp_egcm},
	[TGSI_OPCODE_INTERP_OFFSET] = { ALU_OP0_NOP, tgsi_interp_egcm},
	[TGSI_OPCODE_F2D] = { ALU_OP1_FLT32_TO_FLT64, tgsi_op2_64},
	[TGSI_OPCODE_D2F] = { ALU_OP1_FLT64_TO_FLT32, tgsi_op2_64_single_dest},
	[TGSI_OPCODE_DABS] = { ALU_OP1_MOV, tgsi_op2_64},
	[TGSI_OPCODE_DNEG] = { ALU_OP2_ADD_64, tgsi_dneg},
	[TGSI_OPCODE_DADD] = { ALU_OP2_ADD_64, tgsi_op2_64},
	[TGSI_OPCODE_DMUL] = { ALU_OP2_MUL_64, cayman_mul_double_instr},
	[TGSI_OPCODE_DDIV] = { 0, cayman_ddiv_instr },
	[TGSI_OPCODE_DMAX] = { ALU_OP2_MAX_64, tgsi_op2_64},
	[TGSI_OPCODE_DMIN] = { ALU_OP2_MIN_64, tgsi_op2_64},
	[TGSI_OPCODE_DSLT] = { ALU_OP2_SETGT_64, tgsi_op2_64_single_dest_s},
	[TGSI_OPCODE_DSGE] = { ALU_OP2_SETGE_64, tgsi_op2_64_single_dest},
	[TGSI_OPCODE_DSEQ] = { ALU_OP2_SETE_64, tgsi_op2_64_single_dest},
	[TGSI_OPCODE_DSNE] = { ALU_OP2_SETNE_64, tgsi_op2_64_single_dest},
	[TGSI_OPCODE_DRCP] = { ALU_OP2_RECIP_64, cayman_emit_double_instr},
	[TGSI_OPCODE_DSQRT] = { ALU_OP2_SQRT_64, cayman_emit_double_instr},
	[TGSI_OPCODE_DMAD] = { ALU_OP3_FMA_64, tgsi_op3_64},
	[TGSI_OPCODE_DFMA] = { ALU_OP3_FMA_64, tgsi_op3_64},
	[TGSI_OPCODE_DFRAC] = { ALU_OP1_FRACT_64, tgsi_op2_64},
	[TGSI_OPCODE_DLDEXP] = { ALU_OP2_LDEXP_64, tgsi_op2_64},
	[TGSI_OPCODE_DFRACEXP] = { ALU_OP1_FREXP_64, tgsi_dfracexp},
	[TGSI_OPCODE_D2I] = { ALU_OP1_FLT_TO_INT, egcm_double_to_int},
	[TGSI_OPCODE_I2D] = { ALU_OP1_INT_TO_FLT, egcm_int_to_double},
	[TGSI_OPCODE_D2U] = { ALU_OP1_FLT_TO_UINT, egcm_double_to_int},
	[TGSI_OPCODE_U2D] = { ALU_OP1_UINT_TO_FLT, egcm_int_to_double},
	[TGSI_OPCODE_DRSQ] = { ALU_OP2_RECIPSQRT_64, cayman_emit_double_instr},
	[TGSI_OPCODE_U64SNE] = { ALU_OP0_NOP, egcm_u64sne },
	[TGSI_OPCODE_U64ADD] = { ALU_OP0_NOP, egcm_u64add },
	[TGSI_OPCODE_U64MUL] = { ALU_OP0_NOP, egcm_u64mul },
	[TGSI_OPCODE_U64DIV] = { ALU_OP0_NOP, egcm_u64div },
	[TGSI_OPCODE_I64NEG] = { ALU_OP0_NOP, egcm_i64neg },
	[TGSI_OPCODE_LAST] = { ALU_OP0_NOP, tgsi_unsupported},
};

/* Cayman variant of the opcode dispatch table. */
static const struct r600_shader_tgsi_instruction cm_shader_tgsi_instruction[] = {
	[TGSI_OPCODE_ARL] = { ALU_OP0_NOP, tgsi_eg_arl},
	[TGSI_OPCODE_MOV] = { ALU_OP1_MOV, tgsi_op2},
	[TGSI_OPCODE_LIT] = { ALU_OP0_NOP, tgsi_lit},
	[TGSI_OPCODE_RCP] = { ALU_OP1_RECIP_IEEE, cayman_emit_float_instr},
	[TGSI_OPCODE_RSQ] = { ALU_OP1_RECIPSQRT_IEEE, cayman_emit_float_instr},
	[TGSI_OPCODE_EXP] = { ALU_OP0_NOP, tgsi_exp},
	[TGSI_OPCODE_LOG] = { ALU_OP0_NOP, tgsi_log},
	[TGSI_OPCODE_MUL] = { ALU_OP2_MUL_IEEE, tgsi_op2},
	[TGSI_OPCODE_ADD] = { ALU_OP2_ADD, tgsi_op2},
	[TGSI_OPCODE_DP3] = { ALU_OP2_DOT4_IEEE, tgsi_dp},
	[TGSI_OPCODE_DP4] = { ALU_OP2_DOT4_IEEE, tgsi_dp},
	[TGSI_OPCODE_DST] = { ALU_OP0_NOP, tgsi_opdst},
	[TGSI_OPCODE_MIN] = { ALU_OP2_MIN_DX10, tgsi_op2},
	[TGSI_OPCODE_MAX] = { ALU_OP2_MAX_DX10, tgsi_op2},
	[TGSI_OPCODE_SLT] = { ALU_OP2_SETGT, tgsi_op2_swap},
	[TGSI_OPCODE_SGE] = { ALU_OP2_SETGE, tgsi_op2},
	[TGSI_OPCODE_MAD] = { ALU_OP3_MULADD_IEEE, tgsi_op3},
	[TGSI_OPCODE_LRP] = { ALU_OP0_NOP, tgsi_lrp},
	[TGSI_OPCODE_FMA] = { ALU_OP3_FMA, tgsi_op3},
	[TGSI_OPCODE_SQRT] = { ALU_OP1_SQRT_IEEE, cayman_emit_float_instr},
	[21] = { ALU_OP0_NOP, tgsi_unsupported},
	[22] = { ALU_OP0_NOP, tgsi_unsupported},
	[23] = { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_FRC] = { ALU_OP1_FRACT, tgsi_op2},
	[25] = { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_FLR] = { ALU_OP1_FLOOR, tgsi_op2},
	[TGSI_OPCODE_ROUND] = { ALU_OP1_RNDNE, tgsi_op2},
	[TGSI_OPCODE_EX2] = { ALU_OP1_EXP_IEEE, cayman_emit_float_instr},
	[TGSI_OPCODE_LG2] = { ALU_OP1_LOG_IEEE, cayman_emit_float_instr},
	[TGSI_OPCODE_POW] = { ALU_OP0_NOP, cayman_pow},
	[31] = { ALU_OP0_NOP, tgsi_unsupported},
	[32] = { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_CLOCK] = { ALU_OP0_NOP, tgsi_clock},
	[34] = { ALU_OP0_NOP, tgsi_unsupported},
	[35] = { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_COS] = { ALU_OP1_COS, cayman_trig},
	[TGSI_OPCODE_DDX] = { FETCH_OP_GET_GRADIENTS_H, tgsi_tex},
	[TGSI_OPCODE_DDY] = { FETCH_OP_GET_GRADIENTS_V, tgsi_tex},
	[TGSI_OPCODE_KILL] = { ALU_OP2_KILLGT, tgsi_kill}, /* unconditional kill */
	[TGSI_OPCODE_PK2H] = { ALU_OP0_NOP, tgsi_pk2h},
	[TGSI_OPCODE_PK2US] = { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_PK4B] = { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_PK4UB] = { ALU_OP0_NOP, tgsi_unsupported},
	[44] = { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_SEQ] = { ALU_OP2_SETE, tgsi_op2},
	[46] = { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_SGT] = { ALU_OP2_SETGT, tgsi_op2},
	[TGSI_OPCODE_SIN] = { ALU_OP1_SIN, cayman_trig},
	[TGSI_OPCODE_SLE] = { ALU_OP2_SETGE, tgsi_op2_swap},
	[TGSI_OPCODE_SNE] = { ALU_OP2_SETNE, tgsi_op2},
	[51] = { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_TEX] = { FETCH_OP_SAMPLE, tgsi_tex},
	[TGSI_OPCODE_TXD] = { FETCH_OP_SAMPLE_G, tgsi_tex},
	[TGSI_OPCODE_TXP] = { FETCH_OP_SAMPLE, tgsi_tex},
	[TGSI_OPCODE_UP2H] = { ALU_OP0_NOP, tgsi_up2h},
	[TGSI_OPCODE_UP2US] = { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_UP4B] = { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_UP4UB] = { ALU_OP0_NOP, tgsi_unsupported},
	[59] = { ALU_OP0_NOP, tgsi_unsupported},
	[60] = { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_ARR] = { ALU_OP0_NOP, tgsi_eg_arl},
	[62] = { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_CAL] = { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_RET] = { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_SSG] = { ALU_OP0_NOP, tgsi_ssg},
	[TGSI_OPCODE_CMP] = { ALU_OP0_NOP, tgsi_cmp},
	[67] = { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_TXB] = { FETCH_OP_SAMPLE_LB, tgsi_tex},
	[69] = { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_DIV] = { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_DP2] = { ALU_OP2_DOT4_IEEE, tgsi_dp},
	[TGSI_OPCODE_TXL] = { FETCH_OP_SAMPLE_L, tgsi_tex},
	[TGSI_OPCODE_BRK] = { CF_OP_LOOP_BREAK, tgsi_loop_brk_cont},
	[TGSI_OPCODE_IF] = { ALU_OP0_NOP, tgsi_if},
	[TGSI_OPCODE_UIF] = { ALU_OP0_NOP, tgsi_uif},
	[76] = { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_ELSE] = { ALU_OP0_NOP, tgsi_else},
	[TGSI_OPCODE_ENDIF] = { ALU_OP0_NOP, tgsi_endif},
	[TGSI_OPCODE_DDX_FINE] = { FETCH_OP_GET_GRADIENTS_H, tgsi_tex},
	[TGSI_OPCODE_DDY_FINE] = { FETCH_OP_GET_GRADIENTS_V, tgsi_tex},
	[82] = { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_CEIL] = { ALU_OP1_CEIL, tgsi_op2},
	[TGSI_OPCODE_I2F] = { ALU_OP1_INT_TO_FLT, tgsi_op2},
	[TGSI_OPCODE_NOT] = { ALU_OP1_NOT_INT, tgsi_op2},
	[TGSI_OPCODE_TRUNC] = { ALU_OP1_TRUNC, tgsi_op2},
	[TGSI_OPCODE_SHL] = { ALU_OP2_LSHL_INT, tgsi_op2},
	[88] = { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_AND] = { ALU_OP2_AND_INT, tgsi_op2},
	[TGSI_OPCODE_OR] = { ALU_OP2_OR_INT, tgsi_op2},
	[TGSI_OPCODE_MOD] = { ALU_OP0_NOP, tgsi_imod},
	[TGSI_OPCODE_XOR] = { ALU_OP2_XOR_INT, tgsi_op2},
	[93] = { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_TXF] = { FETCH_OP_LD, tgsi_tex},
	[TGSI_OPCODE_TXQ] = { FETCH_OP_GET_TEXTURE_RESINFO, tgsi_tex},
	[TGSI_OPCODE_CONT] = { CF_OP_LOOP_CONTINUE, tgsi_loop_brk_cont},
	[TGSI_OPCODE_EMIT] = { CF_OP_EMIT_VERTEX, tgsi_gs_emit},
	[TGSI_OPCODE_ENDPRIM] = { CF_OP_CUT_VERTEX, tgsi_gs_emit},
	[TGSI_OPCODE_BGNLOOP] = { ALU_OP0_NOP, tgsi_bgnloop},
	[TGSI_OPCODE_BGNSUB] = { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_ENDLOOP] = { ALU_OP0_NOP, tgsi_endloop},
	[TGSI_OPCODE_ENDSUB] = { ALU_OP0_NOP, tgsi_unsupported},
	[103] = { FETCH_OP_GET_TEXTURE_RESINFO, tgsi_tex},
	[TGSI_OPCODE_TXQS] = { FETCH_OP_GET_NUMBER_OF_SAMPLES, tgsi_tex},
	[TGSI_OPCODE_RESQ] = { FETCH_OP_GET_TEXTURE_RESINFO, tgsi_resq},
	[106] = { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_NOP] = { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_FSEQ] = { ALU_OP2_SETE_DX10, tgsi_op2},
	[TGSI_OPCODE_FSGE] = { ALU_OP2_SETGE_DX10, tgsi_op2},
	[TGSI_OPCODE_FSLT] = { ALU_OP2_SETGT_DX10, tgsi_op2_swap},
	[TGSI_OPCODE_FSNE] = { ALU_OP2_SETNE_DX10, tgsi_op2_swap},
	[TGSI_OPCODE_MEMBAR] = { ALU_OP0_NOP, tgsi_membar},
	[113] = { ALU_OP0_NOP, tgsi_unsupported},
	[114] = { ALU_OP0_NOP, tgsi_unsupported},
	[115] = { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_KILL_IF] = { ALU_OP2_KILLGT, tgsi_kill}, /* conditional kill */
	[TGSI_OPCODE_END] = { ALU_OP0_NOP, tgsi_end}, /* aka HALT */
	/* Refer below for TGSI_OPCODE_DFMA */
	[TGSI_OPCODE_F2I] = { ALU_OP1_FLT_TO_INT, tgsi_op2},
	[TGSI_OPCODE_IDIV] = { ALU_OP0_NOP, tgsi_idiv},
	[TGSI_OPCODE_IMAX] = { ALU_OP2_MAX_INT, tgsi_op2},
	[TGSI_OPCODE_IMIN] = { ALU_OP2_MIN_INT, tgsi_op2},
	[TGSI_OPCODE_INEG] = { ALU_OP2_SUB_INT, tgsi_ineg},
	[TGSI_OPCODE_ISGE] = { ALU_OP2_SETGE_INT, tgsi_op2},
	[TGSI_OPCODE_ISHR] = { ALU_OP2_ASHR_INT, tgsi_op2},
	[TGSI_OPCODE_ISLT] = { ALU_OP2_SETGT_INT, tgsi_op2_swap},
	[TGSI_OPCODE_F2U] = { ALU_OP1_FLT_TO_UINT, tgsi_op2},
	[TGSI_OPCODE_U2F] = { ALU_OP1_UINT_TO_FLT, tgsi_op2},
	[TGSI_OPCODE_UADD] = { ALU_OP2_ADD_INT, tgsi_op2},
	[TGSI_OPCODE_UDIV] = { ALU_OP0_NOP, tgsi_udiv},
	[TGSI_OPCODE_UMAD] = { ALU_OP0_NOP, tgsi_umad},
	[TGSI_OPCODE_UMAX] = { ALU_OP2_MAX_UINT, tgsi_op2},
	[TGSI_OPCODE_UMIN] = { ALU_OP2_MIN_UINT, tgsi_op2},
	[TGSI_OPCODE_UMOD] = { ALU_OP0_NOP, tgsi_umod},
	[TGSI_OPCODE_UMUL] = { ALU_OP2_MULLO_INT, cayman_mul_int_instr},
	[TGSI_OPCODE_USEQ] = { ALU_OP2_SETE_INT, tgsi_op2},
	[TGSI_OPCODE_USGE] = { ALU_OP2_SETGE_UINT, tgsi_op2},
	[TGSI_OPCODE_USHR] = { ALU_OP2_LSHR_INT, tgsi_op2},
	[TGSI_OPCODE_USLT] = { ALU_OP2_SETGT_UINT, tgsi_op2_swap},
	[TGSI_OPCODE_USNE] = { ALU_OP2_SETNE_INT, tgsi_op2},
	[TGSI_OPCODE_SWITCH] = { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_CASE] = { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_DEFAULT] = { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_ENDSWITCH] = { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_SAMPLE] = { 0, tgsi_unsupported},
	[TGSI_OPCODE_SAMPLE_I] = { 0, tgsi_unsupported},
	[TGSI_OPCODE_SAMPLE_I_MS] = { 0, tgsi_unsupported},
	[TGSI_OPCODE_SAMPLE_B] = { 0, tgsi_unsupported},
	[TGSI_OPCODE_SAMPLE_C] = { 0, tgsi_unsupported},
	[TGSI_OPCODE_SAMPLE_C_LZ] = { 0, tgsi_unsupported},
	[TGSI_OPCODE_SAMPLE_D] = { 0, tgsi_unsupported},
	[TGSI_OPCODE_SAMPLE_L] = { 0, tgsi_unsupported},
	[TGSI_OPCODE_GATHER4] = { 0, tgsi_unsupported},
	[TGSI_OPCODE_SVIEWINFO] = { 0, tgsi_unsupported},
	[TGSI_OPCODE_SAMPLE_POS] = { 0, tgsi_unsupported},
	[TGSI_OPCODE_SAMPLE_INFO] = { 0, tgsi_unsupported},
	[TGSI_OPCODE_UARL] = { ALU_OP1_MOVA_INT, tgsi_eg_arl},
	[TGSI_OPCODE_UCMP] = { ALU_OP0_NOP, tgsi_ucmp},
	[TGSI_OPCODE_IABS] = { 0, tgsi_iabs},
	[TGSI_OPCODE_ISSG] = { 0, tgsi_issg},
	[TGSI_OPCODE_LOAD] = { ALU_OP0_NOP, tgsi_load},
	[TGSI_OPCODE_STORE] = { ALU_OP0_NOP, tgsi_store},
	[163] = { ALU_OP0_NOP, tgsi_unsupported},
	[164] = { ALU_OP0_NOP, tgsi_unsupported},
	[165] = { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_BARRIER] = { ALU_OP0_GROUP_BARRIER, tgsi_barrier},
	[TGSI_OPCODE_ATOMUADD] = { V_RAT_INST_ADD_RTN, tgsi_atomic_op},
	[TGSI_OPCODE_ATOMXCHG] = { V_RAT_INST_XCHG_RTN, tgsi_atomic_op},
	[TGSI_OPCODE_ATOMCAS] = { V_RAT_INST_CMPXCHG_INT_RTN, tgsi_atomic_op},
	[TGSI_OPCODE_ATOMAND] = { V_RAT_INST_AND_RTN, tgsi_atomic_op},
	[TGSI_OPCODE_ATOMOR] = { V_RAT_INST_OR_RTN, tgsi_atomic_op},
	[TGSI_OPCODE_ATOMXOR] = { V_RAT_INST_XOR_RTN, tgsi_atomic_op},
	[TGSI_OPCODE_ATOMUMIN] = { V_RAT_INST_MIN_UINT_RTN, tgsi_atomic_op},
	[TGSI_OPCODE_ATOMUMAX] = { V_RAT_INST_MAX_UINT_RTN, tgsi_atomic_op},
	[TGSI_OPCODE_ATOMIMIN] = { V_RAT_INST_MIN_INT_RTN, tgsi_atomic_op},
	[TGSI_OPCODE_ATOMIMAX] = { V_RAT_INST_MAX_INT_RTN, tgsi_atomic_op},
	[TGSI_OPCODE_TEX2] = { FETCH_OP_SAMPLE, tgsi_tex},
	[TGSI_OPCODE_TXB2] = { FETCH_OP_SAMPLE_LB, tgsi_tex},
	[TGSI_OPCODE_TXL2] = { FETCH_OP_SAMPLE_L, tgsi_tex},
	[TGSI_OPCODE_IMUL_HI] = { ALU_OP2_MULHI_INT, cayman_mul_int_instr},
	[TGSI_OPCODE_UMUL_HI] = { ALU_OP2_MULHI_UINT, cayman_mul_int_instr},
	[TGSI_OPCODE_TG4] = { FETCH_OP_GATHER4, tgsi_tex},
	[TGSI_OPCODE_LODQ] = { FETCH_OP_GET_LOD, tgsi_tex},
	[TGSI_OPCODE_IBFE] = { ALU_OP3_BFE_INT, tgsi_bfe},
	[TGSI_OPCODE_UBFE] = { ALU_OP3_BFE_UINT, tgsi_bfe},
	[TGSI_OPCODE_BFI] = { ALU_OP0_NOP, tgsi_bfi},
	[TGSI_OPCODE_BREV] = { ALU_OP1_BFREV_INT, tgsi_op2},
	[TGSI_OPCODE_POPC] = { ALU_OP1_BCNT_INT, tgsi_op2},
	[TGSI_OPCODE_LSB] = { ALU_OP1_FFBL_INT, tgsi_op2},
	[TGSI_OPCODE_IMSB] = { ALU_OP1_FFBH_INT, tgsi_msb},
	[TGSI_OPCODE_UMSB] = { ALU_OP1_FFBH_UINT, tgsi_msb},
	[TGSI_OPCODE_INTERP_CENTROID] = { ALU_OP0_NOP, tgsi_interp_egcm},
	[TGSI_OPCODE_INTERP_SAMPLE] = { ALU_OP0_NOP, tgsi_interp_egcm},
	[TGSI_OPCODE_INTERP_OFFSET] = { ALU_OP0_NOP, tgsi_interp_egcm},
	[TGSI_OPCODE_F2D] = { ALU_OP1_FLT32_TO_FLT64, tgsi_op2_64},
	[TGSI_OPCODE_D2F] = { ALU_OP1_FLT64_TO_FLT32, tgsi_op2_64_single_dest},
	[TGSI_OPCODE_DABS] = { ALU_OP1_MOV, tgsi_op2_64},
	[TGSI_OPCODE_DNEG] = { ALU_OP2_ADD_64, tgsi_dneg},
	[TGSI_OPCODE_DADD] = { ALU_OP2_ADD_64, tgsi_op2_64},
	[TGSI_OPCODE_DMUL] = { ALU_OP2_MUL_64, cayman_mul_double_instr},
	[TGSI_OPCODE_DDIV] = { 0, cayman_ddiv_instr },
	[TGSI_OPCODE_DMAX] = { ALU_OP2_MAX_64, tgsi_op2_64},
	[TGSI_OPCODE_DMIN] = { ALU_OP2_MIN_64, tgsi_op2_64},
	[TGSI_OPCODE_DSLT] = { ALU_OP2_SETGT_64, tgsi_op2_64_single_dest_s},
	[TGSI_OPCODE_DSGE] = { ALU_OP2_SETGE_64, tgsi_op2_64_single_dest},
	[TGSI_OPCODE_DSEQ] = { ALU_OP2_SETE_64, tgsi_op2_64_single_dest},
	[TGSI_OPCODE_DSNE] = { ALU_OP2_SETNE_64, tgsi_op2_64_single_dest},
	[TGSI_OPCODE_DRCP] = { ALU_OP2_RECIP_64, cayman_emit_double_instr},
	[TGSI_OPCODE_DSQRT] = { ALU_OP2_SQRT_64, cayman_emit_double_instr},
	[TGSI_OPCODE_DMAD] = { ALU_OP3_FMA_64, tgsi_op3_64},
	[TGSI_OPCODE_DFMA] = { ALU_OP3_FMA_64, tgsi_op3_64},
	[TGSI_OPCODE_DFRAC] = { ALU_OP1_FRACT_64, tgsi_op2_64},
	[TGSI_OPCODE_DLDEXP] = { ALU_OP2_LDEXP_64, tgsi_op2_64},
	[TGSI_OPCODE_DFRACEXP] = { ALU_OP1_FREXP_64, tgsi_dfracexp},
	[TGSI_OPCODE_D2I] = { ALU_OP1_FLT_TO_INT, egcm_double_to_int},
	[TGSI_OPCODE_I2D] = { ALU_OP1_INT_TO_FLT, egcm_int_to_double},
	[TGSI_OPCODE_D2U] = { ALU_OP1_FLT_TO_UINT, egcm_double_to_int},
	[TGSI_OPCODE_U2D] = { ALU_OP1_UINT_TO_FLT, egcm_int_to_double},
	[TGSI_OPCODE_DRSQ] = { ALU_OP2_RECIPSQRT_64, cayman_emit_double_instr},
	[TGSI_OPCODE_U64SNE] = { ALU_OP0_NOP, egcm_u64sne },
	[TGSI_OPCODE_U64ADD] = { ALU_OP0_NOP, egcm_u64add },
	[TGSI_OPCODE_U64MUL] = { ALU_OP0_NOP, egcm_u64mul },
	[TGSI_OPCODE_U64DIV] = { ALU_OP0_NOP, egcm_u64div },
	[TGSI_OPCODE_I64NEG] = { ALU_OP0_NOP, egcm_i64neg },
	[TGSI_OPCODE_LAST] = { ALU_OP0_NOP, tgsi_unsupported},
};
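/* The three per-family tables above (r600, evergreen, cayman) drive TGSI
 * opcode dispatch: each entry pairs a hardware opcode with an emit callback,
 * and tgsi_unsupported fills the slots a family cannot handle.  The
 * translation loop does roughly the following (a sketch only; the exact
 * field and variable names in the dispatcher may differ):
 *
 *    ctx.inst_info = &eg_shader_tgsi_instruction[opcode];
 *    r = ctx.inst_info->process(&ctx);
 */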