/*
 * Copyright (c) 2012 Rob Clark <robdclark@gmail.com>
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#include "ir3.h"

#include <assert.h>
#include <errno.h>
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#include "util/bitscan.h"
#include "util/half_float.h"
#include "util/ralloc.h"
#include "util/u_math.h"

#include "instr-a3xx.h"
#include "ir3_shader.h"

/* simple allocator to carve allocations out of an up-front allocated heap,
 * so that we can free everything easily in one shot.
 */
void *
ir3_alloc(struct ir3 *shader, int sz)
{
   return rzalloc_size(shader, sz); /* TODO: don't use rzalloc */
}

struct ir3 *
ir3_create(struct ir3_compiler *compiler, struct ir3_shader_variant *v)
{
   struct ir3 *shader = rzalloc(v, struct ir3);

   shader->compiler = compiler;
   shader->type = v->type;

   list_inithead(&shader->block_list);
   list_inithead(&shader->array_list);

   return shader;
}

void
ir3_destroy(struct ir3 *shader)
{
   ralloc_free(shader);
}

static bool
is_shared_consts(struct ir3_compiler *compiler,
                 struct ir3_const_state *const_state,
                 struct ir3_register *reg)
{
   if (const_state->shared_consts_enable && reg->flags & IR3_REG_CONST) {
      uint32_t min_const_reg = regid(compiler->shared_consts_base_offset, 0);
      uint32_t max_const_reg =
         regid(compiler->shared_consts_base_offset +
               compiler->shared_consts_size, 0);
      return reg->num >= min_const_reg && min_const_reg < max_const_reg;
   }

   return false;
}

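/* Track the largest GPR, half-GPR and const register touched by one src or
 * dst of an instruction, feeding the variant's register footprint stats.
 */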
static void
collect_reg_info(struct ir3_instruction *instr, struct ir3_register *reg,
                 struct ir3_info *info)
{
   struct ir3_shader_variant *v = info->data;
   unsigned repeat = instr->repeat;

   if (reg->flags & IR3_REG_IMMED) {
      /* nothing to do */
      return;
   }

   /* Shared consts don't need to be included into constlen. */
   if (is_shared_consts(v->compiler, ir3_const_state(v), reg))
      return;

   if (!(reg->flags & IR3_REG_R)) {
      repeat = 0;
   }

   unsigned components;
   int16_t max;

   if (reg->flags & IR3_REG_RELATIV) {
      components = reg->size;
      max = (reg->array.base + components - 1);
   } else {
      components = util_last_bit(reg->wrmask);
      max = (reg->num + repeat + components - 1);
   }

   if (reg->flags & IR3_REG_CONST) {
      info->max_const = MAX2(info->max_const, max >> 2);
   } else if (max < regid(48, 0)) {
      if (reg->flags & IR3_REG_HALF) {
         if (v->mergedregs) {
            /* starting w/ a6xx, half regs conflict with full regs: */
            info->max_reg = MAX2(info->max_reg, max >> 3);
         } else {
            info->max_half_reg = MAX2(info->max_half_reg, max >> 2);
         }
      } else {
         info->max_reg = MAX2(info->max_reg, max >> 2);
      }
   }
}

bool
ir3_should_double_threadsize(struct ir3_shader_variant *v, unsigned regs_count)
{
   const struct ir3_compiler *compiler = v->compiler;

   /* If the user forced a particular wavesize, respect that. */
   if (v->real_wavesize == IR3_SINGLE_ONLY)
      return false;
   if (v->real_wavesize == IR3_DOUBLE_ONLY)
      return true;

   /* We can't support more than compiler->branchstack_size diverging threads
    * in a wave. Thus, doubling the threadsize is only possible if we don't
    * exceed the branchstack size limit.
    */
   if (MIN2(v->branchstack, compiler->threadsize_base * 2) >
       compiler->branchstack_size) {
      return false;
   }

   switch (v->type) {
   case MESA_SHADER_KERNEL:
   case MESA_SHADER_COMPUTE: {
      unsigned threads_per_wg =
         v->local_size[0] * v->local_size[1] * v->local_size[2];

      /* For a5xx, if the workgroup size is greater than the maximum number
       * of threads per core with 32 threads per wave (512) then we have to
       * use the doubled threadsize because otherwise the workgroup wouldn't
       * fit. For smaller workgroup sizes, we follow the blob and use the
       * smaller threadsize.
       */
      if (compiler->gen < 6) {
         return v->local_size_variable ||
                threads_per_wg >
                   compiler->threadsize_base * compiler->max_waves;
      }

      /* On a6xx, we prefer the larger threadsize unless the workgroup is
       * small enough that it would be useless. Note that because
       * threadsize_base is bumped to 64, we don't have to worry about the
       * workgroup fitting, unlike the a5xx case.
       */
      if (!v->local_size_variable) {
         if (threads_per_wg <= compiler->threadsize_base)
            return false;
      }
   }
      FALLTHROUGH;
   case MESA_SHADER_FRAGMENT: {
      /* Check that doubling the threadsize wouldn't exceed the regfile size */
      return regs_count * 2 <= compiler->reg_size_vec4;
   }

   default:
      /* On a6xx+, it's impossible to use a doubled wavesize in the geometry
       * stages - the bit doesn't exist. The blob never used it for the VS
       * on earlier gens anyway.
       */
      return false;
   }
}

/* Get the maximum number of waves that could be used even if this shader
 * didn't use any registers.
 */
unsigned
ir3_get_reg_independent_max_waves(struct ir3_shader_variant *v,
                                  bool double_threadsize)
{
   const struct ir3_compiler *compiler = v->compiler;
   unsigned max_waves = compiler->max_waves;

   /* Compute the limit based on branchstack */
   if (v->branchstack > 0) {
      unsigned branchstack_max_waves = compiler->branchstack_size /
                                       v->branchstack *
                                       compiler->wave_granularity;
      max_waves = MIN2(max_waves, branchstack_max_waves);
   }

   /* If this is a compute shader, compute the limit based on shared size */
   if ((v->type == MESA_SHADER_COMPUTE) ||
       (v->type == MESA_SHADER_KERNEL)) {
      unsigned threads_per_wg =
         v->local_size[0] * v->local_size[1] * v->local_size[2];
      unsigned waves_per_wg =
         DIV_ROUND_UP(threads_per_wg, compiler->threadsize_base *
                                         (double_threadsize ? 2 : 1) *
                                         compiler->wave_granularity);

      /* Shared is allocated in chunks of 1k */
      unsigned shared_per_wg = ALIGN_POT(v->shared_size, 1024);
      if (shared_per_wg > 0 && !v->local_size_variable) {
         unsigned wgs_per_core = compiler->local_mem_size / shared_per_wg;

         max_waves = MIN2(max_waves, waves_per_wg * wgs_per_core *
                                        compiler->wave_granularity);
      }

      /* If we have a compute shader that has a big workgroup, a barrier, and
       * a branchstack which limits max_waves, this may result in a situation
       * where we cannot run all waves of the workgroup concurrently, which
       * would lead to a hang.
       *
       * TODO: Could we spill branchstack or is there another way around?
       * The blob just explodes in such a case.
       */
      if (v->has_barrier && (max_waves < waves_per_wg)) {
         mesa_loge(
            "Compute shader (%s) which has workgroup barrier cannot be used "
            "because it's impossible to have enough concurrent waves.",
            v->name);
         exit(1);
      }
   }

   return max_waves;
}

/* Get the maximum number of waves that could be launched limited by reg size.
 */
unsigned
ir3_get_reg_dependent_max_waves(const struct ir3_compiler *compiler,
                                unsigned reg_count, bool double_threadsize)
{
   return reg_count ? (compiler->reg_size_vec4 /
                       (reg_count * (double_threadsize ? 2 : 1)) *
                       compiler->wave_granularity)
                    : compiler->max_waves;
}

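/* Worked example for ir3_get_reg_dependent_max_waves (hypothetical numbers,
 * for illustration only): with reg_size_vec4 = 96 and a shader using 16 full
 * vec4 registers, the register file allows 96 / 16 = 6 waves (times
 * wave_granularity) at the base threadsize; with the doubled threadsize each
 * wave needs twice the registers, so the limit halves to 3.
 */

/* Walk the final IR and fill in ir3_info for the variant: instruction/nop
 * counts per category, register and const footprint, estimated (ss)/(sy)
 * stalls, and the resulting max_waves limit.
 */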
void
ir3_collect_info(struct ir3_shader_variant *v)
{
   struct ir3_info *info = &v->info;
   struct ir3 *shader = v->ir;
   const struct ir3_compiler *compiler = v->compiler;

   memset(info, 0, sizeof(*info));
   info->data = v;
   info->max_reg = -1;
   info->max_half_reg = -1;
   info->max_const = -1;
   info->multi_dword_ldp_stp = false;

   uint32_t instr_count = 0;
   foreach_block (block, &shader->block_list) {
      foreach_instr (instr, &block->instr_list) {
         instr_count++;
      }
   }

   v->instrlen = DIV_ROUND_UP(instr_count, compiler->instr_align);

   /* Pad out with NOPs to instrlen, including at least 4 so that cffdump
    * doesn't try to decode the following data as instructions (such as the
    * next stage's shader in turnip)
    */
   info->size = MAX2(v->instrlen * compiler->instr_align, instr_count + 4) * 8;
   info->sizedwords = info->size / 4;

   bool in_preamble = false;

   foreach_block (block, &shader->block_list) {
      int sfu_delay = 0, mem_delay = 0;

      foreach_instr (instr, &block->instr_list) {

         foreach_src (reg, instr) {
            collect_reg_info(instr, reg, info);
         }

         foreach_dst (reg, instr) {
            if (is_dest_gpr(reg)) {
               collect_reg_info(instr, reg, info);
            }
         }

         if ((instr->opc == OPC_STP || instr->opc == OPC_LDP)) {
            unsigned components = instr->srcs[2]->uim_val;
            if (components * type_size(instr->cat6.type) > 32) {
               info->multi_dword_ldp_stp = true;
            }

            if (instr->opc == OPC_STP)
               info->stp_count += components;
            else
               info->ldp_count += components;
         }

         if ((instr->opc == OPC_BARY_F || instr->opc == OPC_FLAT_B) &&
             (instr->dsts[0]->flags & IR3_REG_EI))
            info->last_baryf = info->instrs_count;

         if (instr->opc == OPC_SHPS)
            in_preamble = true;

         /* Don't count instructions in the preamble for instruction-count type
          * stats, because their effect should be much smaller.
          * TODO: we should probably have separate stats for preamble
          * instructions, but that would blow up the amount of stats...
          */
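         /* For non-preamble instructions, accumulate per-category counts and
          * estimate (ss)/(sy) stall cycles: an (ss)/(sy) producer charges its
          * soft delay, which then decays with every instruction issued before
          * the corresponding sync flag is encountered.
          */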
         if (!in_preamble) {
            unsigned instrs_count = 1 + instr->repeat + instr->nop;
            unsigned nops_count = instr->nop;

            if (instr->opc == OPC_NOP) {
               nops_count = 1 + instr->repeat;
               info->instrs_per_cat[0] += nops_count;
            } else {
               info->instrs_per_cat[opc_cat(instr->opc)] += 1 + instr->repeat;
               info->instrs_per_cat[0] += nops_count;
            }

            if (instr->opc == OPC_MOV) {
               if (instr->cat1.src_type == instr->cat1.dst_type) {
                  info->mov_count += 1 + instr->repeat;
               } else {
                  info->cov_count += 1 + instr->repeat;
               }
            }

            info->instrs_count += instrs_count;
            info->nops_count += nops_count;

            if (instr->flags & IR3_INSTR_SS) {
               info->ss++;
               info->sstall += sfu_delay;
               sfu_delay = 0;
            }

            if (instr->flags & IR3_INSTR_SY) {
               info->sy++;
               info->systall += mem_delay;
               mem_delay = 0;
            }

            if (is_ss_producer(instr)) {
               sfu_delay = soft_ss_delay(instr);
            } else {
               int n = MIN2(sfu_delay, 1 + instr->repeat + instr->nop);
               sfu_delay -= n;
            }

            if (is_sy_producer(instr)) {
               mem_delay = soft_sy_delay(instr, shader);
            } else {
               int n = MIN2(mem_delay, 1 + instr->repeat + instr->nop);
               mem_delay -= n;
            }
         }

         if (instr->opc == OPC_SHPE)
            in_preamble = false;
      }
   }

   /* TODO: for a5xx and below, is there a separate regfile for
    * half-registers?
    */
   unsigned regs_count =
      info->max_reg + 1 +
      (compiler->gen >= 6 ? ((info->max_half_reg + 2) / 2) : 0);

   info->double_threadsize = ir3_should_double_threadsize(v, regs_count);
   unsigned reg_independent_max_waves =
      ir3_get_reg_independent_max_waves(v, info->double_threadsize);
   unsigned reg_dependent_max_waves = ir3_get_reg_dependent_max_waves(
      compiler, regs_count, info->double_threadsize);
   info->max_waves = MIN2(reg_independent_max_waves, reg_dependent_max_waves);
   assert(info->max_waves <= v->compiler->max_waves);
}

static struct ir3_register *
reg_create(struct ir3 *shader, int num, int flags)
{
   struct ir3_register *reg = ir3_alloc(shader, sizeof(struct ir3_register));
   reg->wrmask = 1;
   reg->flags = flags;
   reg->num = num;
   return reg;
}

static void
insert_instr(struct ir3_block *block, struct ir3_instruction *instr)
{
   struct ir3 *shader = block->shader;

   instr->serialno = ++shader->instr_count;

   list_addtail(&instr->node, &block->instr_list);

   if (is_input(instr))
      array_insert(shader, shader->baryfs, instr);
}

struct ir3_block *
ir3_block_create(struct ir3 *shader)
{
   struct ir3_block *block = ir3_alloc(shader, sizeof(*block));
#ifdef DEBUG
   block->serialno = ++shader->block_count;
#endif
   block->shader = shader;
   list_inithead(&block->node);
   list_inithead(&block->instr_list);
   return block;
}

void
ir3_block_add_predecessor(struct ir3_block *block, struct ir3_block *pred)
{
   array_insert(block, block->predecessors, pred);
}

void
ir3_block_add_physical_predecessor(struct ir3_block *block,
                                   struct ir3_block *pred)
{
   array_insert(block, block->physical_predecessors, pred);
}

void
ir3_block_remove_predecessor(struct ir3_block *block, struct ir3_block *pred)
{
   for (unsigned i = 0; i < block->predecessors_count; i++) {
      if (block->predecessors[i] == pred) {
         if (i < block->predecessors_count - 1) {
            block->predecessors[i] =
               block->predecessors[block->predecessors_count - 1];
         }

         block->predecessors_count--;
         return;
      }
   }
}

void
ir3_block_remove_physical_predecessor(struct ir3_block *block, struct ir3_block *pred)
{
   for (unsigned i = 0; i < block->physical_predecessors_count; i++) {
      if (block->physical_predecessors[i] == pred) {
         if (i < block->physical_predecessors_count - 1) {
            block->physical_predecessors[i] =
               block->physical_predecessors[block->physical_predecessors_count - 1];
         }

         block->physical_predecessors_count--;
         return;
      }
   }
}

unsigned
ir3_block_get_pred_index(struct ir3_block *block, struct ir3_block *pred)
{
   for (unsigned i = 0; i < block->predecessors_count; i++) {
      if (block->predecessors[i] == pred) {
         return i;
      }
   }

   unreachable("ir3_block_get_pred_index() invalid predecessor");
}

static struct ir3_instruction *
instr_create(struct ir3_block *block, opc_t opc, int ndst, int nsrc)
{
   /* Add extra sources for array destinations and the address reg */
   if (1 <= opc_cat(opc))
      nsrc += 2;
   struct ir3_instruction *instr;
   unsigned sz = sizeof(*instr) + (ndst * sizeof(instr->dsts[0])) +
                 (nsrc * sizeof(instr->srcs[0]));
   char *ptr = ir3_alloc(block->shader, sz);

   instr = (struct ir3_instruction *)ptr;
   ptr += sizeof(*instr);
   instr->dsts = (struct ir3_register **)ptr;
   instr->srcs = instr->dsts + ndst;

#ifdef DEBUG
   instr->dsts_max = ndst;
   instr->srcs_max = nsrc;
#endif

   return instr;
}

struct ir3_instruction *
ir3_instr_create(struct ir3_block *block, opc_t opc, int ndst, int nsrc)
{
   struct ir3_instruction *instr = instr_create(block, opc, ndst, nsrc);
   instr->block = block;
   instr->opc = opc;
   insert_instr(block, instr);
   return instr;
}

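/* Create a copy of an instruction in the same block, with deep copies of its
 * dst/src registers (the address source, if any, is re-pointed at the cloned
 * source array).
 */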
struct ir3_instruction *
ir3_instr_clone(struct ir3_instruction *instr)
{
   struct ir3_instruction *new_instr = instr_create(
      instr->block, instr->opc, instr->dsts_count, instr->srcs_count);
   struct ir3_register **dsts, **srcs;

   dsts = new_instr->dsts;
   srcs = new_instr->srcs;
   *new_instr = *instr;
   new_instr->dsts = dsts;
   new_instr->srcs = srcs;

   insert_instr(instr->block, new_instr);

   /* clone registers: */
   new_instr->dsts_count = 0;
   new_instr->srcs_count = 0;
   foreach_dst (reg, instr) {
      struct ir3_register *new_reg =
         ir3_dst_create(new_instr, reg->num, reg->flags);
      *new_reg = *reg;
      if (new_reg->instr)
         new_reg->instr = new_instr;
   }
   foreach_src (reg, instr) {
      struct ir3_register *new_reg =
         ir3_src_create(new_instr, reg->num, reg->flags);
      *new_reg = *reg;
   }

   if (instr->address) {
      assert(instr->srcs_count > 0);
      new_instr->address = new_instr->srcs[instr->srcs_count - 1];
   }

   return new_instr;
}

/* Add a false dependency to instruction, to ensure it is scheduled first: */
void
ir3_instr_add_dep(struct ir3_instruction *instr, struct ir3_instruction *dep)
{
   for (unsigned i = 0; i < instr->deps_count; i++) {
      if (instr->deps[i] == dep)
         return;
   }

   array_insert(instr, instr->deps, dep);
}

struct ir3_register *
ir3_src_create(struct ir3_instruction *instr, int num, int flags)
{
   struct ir3 *shader = instr->block->shader;
#ifdef DEBUG
   assert(instr->srcs_count < instr->srcs_max);
#endif
   struct ir3_register *reg = reg_create(shader, num, flags);
   instr->srcs[instr->srcs_count++] = reg;
   return reg;
}

struct ir3_register *
ir3_dst_create(struct ir3_instruction *instr, int num, int flags)
{
   struct ir3 *shader = instr->block->shader;
#ifdef DEBUG
   assert(instr->dsts_count < instr->dsts_max);
#endif
   struct ir3_register *reg = reg_create(shader, num, flags);
   instr->dsts[instr->dsts_count++] = reg;
   return reg;
}

struct ir3_register *
ir3_reg_clone(struct ir3 *shader, struct ir3_register *reg)
{
   struct ir3_register *new_reg = reg_create(shader, 0, 0);
   *new_reg = *reg;
   return new_reg;
}

void
ir3_reg_set_last_array(struct ir3_instruction *instr, struct ir3_register *reg,
                       struct ir3_register *last_write)
{
   assert(reg->flags & IR3_REG_ARRAY);
   struct ir3_register *new_reg = ir3_src_create(instr, 0, 0);
   *new_reg = *reg;
   new_reg->def = last_write;
   ir3_reg_tie(reg, new_reg);
}

void
ir3_instr_set_address(struct ir3_instruction *instr,
                      struct ir3_instruction *addr)
{
   if (!instr->address) {
      struct ir3 *ir = instr->block->shader;

      assert(instr->block == addr->block);

      instr->address =
         ir3_src_create(instr, addr->dsts[0]->num, addr->dsts[0]->flags);
      instr->address->def = addr->dsts[0];
      assert(reg_num(addr->dsts[0]) == REG_A0);
      unsigned comp = reg_comp(addr->dsts[0]);
      if (comp == 0) {
         array_insert(ir, ir->a0_users, instr);
      } else {
         assert(comp == 1);
         array_insert(ir, ir->a1_users, instr);
      }
   } else {
      assert(instr->address->def->instr == addr);
   }
}

void
ir3_block_clear_mark(struct ir3_block *block)
{
   foreach_instr (instr, &block->instr_list)
      instr->flags &= ~IR3_INSTR_MARK;
}

void
ir3_clear_mark(struct ir3 *ir)
{
   foreach_block (block, &ir->block_list) {
      ir3_block_clear_mark(block);
   }
}

unsigned
ir3_count_instructions(struct ir3 *ir)
{
   unsigned cnt = 1;
   foreach_block (block, &ir->block_list) {
      block->start_ip = cnt;
      foreach_instr (instr, &block->instr_list) {
         instr->ip = cnt++;
      }
      block->end_ip = cnt;
   }
   return cnt;
}

/* When counting instructions for RA, we insert extra fake instructions at the
 * beginning of each block, where values become live, and at the end where
 * values die. This prevents problems where values live-in at the beginning or
 * live-out at the end of a block from being treated as if they were
 * live-in/live-out at the first/last instruction, which would be incorrect.
 * In ir3_legalize these ip's are assumed to be actual ip's of the final
 * program, so it would be incorrect to use this everywhere.
 */

unsigned
ir3_count_instructions_ra(struct ir3 *ir)
{
   unsigned cnt = 1;
   foreach_block (block, &ir->block_list) {
      block->start_ip = cnt++;
      foreach_instr (instr, &block->instr_list) {
         instr->ip = cnt++;
      }
      block->end_ip = cnt++;
   }
   return cnt;
}

struct ir3_array *
ir3_lookup_array(struct ir3 *ir, unsigned id)
{
   foreach_array (arr, &ir->array_list)
      if (arr->id == id)
         return arr;
   return NULL;
}

void
ir3_find_ssa_uses(struct ir3 *ir, void *mem_ctx, bool falsedeps)
{
   /* We could do this in a single pass if we can assume instructions
    * are always sorted. Which currently might not always be true.
    * (In particular after ir3_group pass, but maybe other places.)
    */
   foreach_block (block, &ir->block_list)
      foreach_instr (instr, &block->instr_list)
         instr->uses = NULL;

   foreach_block (block, &ir->block_list) {
      foreach_instr (instr, &block->instr_list) {
         foreach_ssa_src_n (src, n, instr) {
            if (__is_false_dep(instr, n) && !falsedeps)
               continue;
            if (!src->uses)
               src->uses = _mesa_pointer_set_create(mem_ctx);
            _mesa_set_add(src->uses, instr);
         }
      }
   }
}

/**
 * Set the destination type of an instruction, for example if a
 * conversion is folded in, handling the special cases where the
 * instruction's dest type or opcode needs to be fixed up.
 */
void
ir3_set_dst_type(struct ir3_instruction *instr, bool half)
{
   if (half) {
      instr->dsts[0]->flags |= IR3_REG_HALF;
   } else {
      instr->dsts[0]->flags &= ~IR3_REG_HALF;
   }

   switch (opc_cat(instr->opc)) {
   case 1: /* move instructions */
      if (half) {
         instr->cat1.dst_type = half_type(instr->cat1.dst_type);
      } else {
         instr->cat1.dst_type = full_type(instr->cat1.dst_type);
      }
      break;
   case 4:
      if (half) {
         instr->opc = cat4_half_opc(instr->opc);
      } else {
         instr->opc = cat4_full_opc(instr->opc);
      }
      break;
   case 5:
      if (half) {
         instr->cat5.type = half_type(instr->cat5.type);
      } else {
         instr->cat5.type = full_type(instr->cat5.type);
      }
      break;
   }
}

/**
 * One-time fixup for instruction src-types. Other than cov's that
 * are folded, an instruction's src type does not change.
 */
void
ir3_fixup_src_type(struct ir3_instruction *instr)
{
   if (instr->srcs_count == 0)
      return;

   switch (opc_cat(instr->opc)) {
   case 1: /* move instructions */
      if (instr->srcs[0]->flags & IR3_REG_HALF) {
         instr->cat1.src_type = half_type(instr->cat1.src_type);
      } else {
         instr->cat1.src_type = full_type(instr->cat1.src_type);
      }
      break;
   case 3:
      if (instr->srcs[0]->flags & IR3_REG_HALF) {
         instr->opc = cat3_half_opc(instr->opc);
      } else {
         instr->opc = cat3_full_opc(instr->opc);
      }
      break;
   }
}

/**
 * Map a floating point immed to FLUT (float lookup table) value,
 * returns negative for immediates that cannot be mapped.
 */
int
ir3_flut(struct ir3_register *src_reg)
{
   static const struct {
      uint32_t f32;
      uint16_t f16;
   } flut[] = {
      { .f32 = 0x00000000, .f16 = 0x0000 }, /* 0.0 */
      { .f32 = 0x3f000000, .f16 = 0x3800 }, /* 0.5 */
      { .f32 = 0x3f800000, .f16 = 0x3c00 }, /* 1.0 */
      { .f32 = 0x40000000, .f16 = 0x4000 }, /* 2.0 */
      { .f32 = 0x402df854, .f16 = 0x4170 }, /* e */
      { .f32 = 0x40490fdb, .f16 = 0x4248 }, /* pi */
      { .f32 = 0x3ea2f983, .f16 = 0x3518 }, /* 1/pi */
      { .f32 = 0x3f317218, .f16 = 0x398c }, /* 1/log2(e) */
      { .f32 = 0x3fb8aa3b, .f16 = 0x3dc5 }, /* log2(e) */
      { .f32 = 0x3e9a209b, .f16 = 0x34d1 }, /* 1/log2(10) */
      { .f32 = 0x40549a78, .f16 = 0x42a5 }, /* log2(10) */
      { .f32 = 0x40800000, .f16 = 0x4400 }, /* 4.0 */
   };

   if (src_reg->flags & IR3_REG_HALF) {
      /* Note that half-float immeds are already lowered to 16b in nir: */
      uint32_t imm = src_reg->uim_val;
      for (unsigned i = 0; i < ARRAY_SIZE(flut); i++) {
         if (flut[i].f16 == imm) {
            return i;
         }
      }
   } else {
      uint32_t imm = src_reg->uim_val;
      for (unsigned i = 0; i < ARRAY_SIZE(flut); i++) {
         if (flut[i].f32 == imm) {
            return i;
         }
      }
   }

   return -1;
}

static unsigned
cp_flags(unsigned flags)
{
   /* only considering these flags (at least for now): */
   flags &= (IR3_REG_CONST | IR3_REG_IMMED | IR3_REG_FNEG | IR3_REG_FABS |
             IR3_REG_SNEG | IR3_REG_SABS | IR3_REG_BNOT | IR3_REG_RELATIV |
             IR3_REG_SHARED);
   return flags;
}

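/* Check whether a source operand with the given (cp_flags-relevant) flags is
 * encodable in src position n of instr, according to the per-category rules
 * below.
 */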
bool
ir3_valid_flags(struct ir3_instruction *instr, unsigned n, unsigned flags)
{
   struct ir3_compiler *compiler = instr->block->shader->compiler;
   unsigned valid_flags;

   if ((flags & IR3_REG_SHARED) && opc_cat(instr->opc) > 3)
      return false;

   flags = cp_flags(flags);

   /* If destination is indirect, then source cannot be.. at least
    * I don't think so..
    */
   if (instr->dsts_count > 0 && (instr->dsts[0]->flags & IR3_REG_RELATIV) &&
       (flags & IR3_REG_RELATIV))
      return false;

   if (flags & IR3_REG_RELATIV) {
      /* TODO need to test on earlier gens.. pretty sure the earlier
       * problem was just that we didn't check that the src was from
       * same block (since we can't propagate address register values
       * across blocks currently)
       */
      if (compiler->gen < 6)
         return false;

      /* NOTE in the special try_swap_mad_two_srcs() case we can be
       * called on a src that has already had an indirect load folded
       * in, in which case ssa() returns NULL
       */
      if (instr->srcs[n]->flags & IR3_REG_SSA) {
         struct ir3_instruction *src = ssa(instr->srcs[n]);
         if (src->address->def->instr->block != instr->block)
            return false;
      }
   }

   if (is_meta(instr)) {
      /* collect and phi nodes support const/immed sources, which will be
       * turned into move instructions, but not anything else.
       */
      if (flags & ~(IR3_REG_IMMED | IR3_REG_CONST | IR3_REG_SHARED))
         return false;

      if ((flags & IR3_REG_SHARED) && !(instr->dsts[0]->flags & IR3_REG_SHARED))
         return false;

      return true;
   }

   switch (opc_cat(instr->opc)) {
   case 0: /* end, chmask */
      return flags == 0;
   case 1:
      switch (instr->opc) {
      case OPC_MOVMSK:
      case OPC_SWZ:
      case OPC_SCT:
      case OPC_GAT:
         valid_flags = IR3_REG_SHARED;
         break;
      case OPC_SCAN_MACRO:
         return flags == 0;
         break;
      default:
         valid_flags =
            IR3_REG_IMMED | IR3_REG_CONST | IR3_REG_RELATIV | IR3_REG_SHARED;
      }
      if (flags & ~valid_flags)
         return false;
      break;
   case 2:
      valid_flags = ir3_cat2_absneg(instr->opc) | IR3_REG_CONST |
                    IR3_REG_RELATIV | IR3_REG_IMMED | IR3_REG_SHARED;

      if (flags & ~valid_flags)
         return false;

      /* Allow an immediate src1 for flat.b, since it's ignored */
      if (instr->opc == OPC_FLAT_B &&
          n == 1 && flags == IR3_REG_IMMED)
         return true;

      if (flags & (IR3_REG_CONST | IR3_REG_IMMED | IR3_REG_SHARED)) {
         unsigned m = n ^ 1;
         /* cannot deal w/ const or shared in both srcs:
          * (note that some cat2 actually only have a single src)
          */
         if (m < instr->srcs_count) {
            struct ir3_register *reg = instr->srcs[m];
            if ((flags & (IR3_REG_CONST | IR3_REG_SHARED)) &&
                (reg->flags & (IR3_REG_CONST | IR3_REG_SHARED)))
               return false;
            if ((flags & IR3_REG_IMMED) && reg->flags & (IR3_REG_IMMED))
               return false;
         }
      }
      break;
   case 3:
      valid_flags =
         ir3_cat3_absneg(instr->opc) | IR3_REG_RELATIV | IR3_REG_SHARED;

      switch (instr->opc) {
      case OPC_SHRM:
      case OPC_SHLM:
      case OPC_SHRG:
      case OPC_SHLG:
      case OPC_ANDG: {
         valid_flags |= IR3_REG_IMMED;
         /* Can be RELATIV+CONST but not CONST: */
         if (flags & IR3_REG_RELATIV)
            valid_flags |= IR3_REG_CONST;
         break;
      }
      case OPC_WMM:
      case OPC_WMM_ACCU: {
         valid_flags = IR3_REG_SHARED;
         if (n == 2)
            valid_flags = IR3_REG_CONST;
         break;
      }
      case OPC_DP2ACC:
      case OPC_DP4ACC:
         break;
      default:
         valid_flags |= IR3_REG_CONST;
      }

      if (flags & ~valid_flags)
         return false;

      if (flags & (IR3_REG_CONST | IR3_REG_SHARED | IR3_REG_RELATIV)) {
         /* cannot deal w/ const/shared/relativ in 2nd src: */
         if (n == 1)
            return false;
      }

      break;
   case 4:
      /* seems like blob compiler avoids const as src.. */
      /* TODO double check if this is still the case on a4xx */
      if (flags & (IR3_REG_CONST | IR3_REG_IMMED))
         return false;
      if (flags & (IR3_REG_SABS | IR3_REG_SNEG))
         return false;
      break;
   case 5:
      /* no flags allowed */
      if (flags)
         return false;
      break;
   case 6:
      valid_flags = IR3_REG_IMMED;
      if (flags & ~valid_flags)
         return false;

      if (flags & IR3_REG_IMMED) {
         /* doesn't seem like we can have immediate src for store
          * instructions:
          *
          * TODO this restriction could also apply to load instructions,
          * but for load instructions this arg is the address (and not
          * really sure any good way to test a hard-coded immed addr src)
          */
         if (is_store(instr) && (instr->opc != OPC_STG) && (n == 1))
            return false;

         if ((instr->opc == OPC_LDL) && (n == 0))
            return false;

         if ((instr->opc == OPC_STL) && (n != 2))
            return false;

         if ((instr->opc == OPC_LDP) && (n == 0))
            return false;

         if ((instr->opc == OPC_STP) && (n != 2))
            return false;

         if (instr->opc == OPC_STLW && n == 0)
            return false;

         if (instr->opc == OPC_LDLW && n == 0)
            return false;

         /* disallow immediates in anything but the SSBO slot argument for
          * cat6 instructions:
          */
         if (is_global_a3xx_atomic(instr->opc) && (n != 0))
            return false;

         if (is_local_atomic(instr->opc) || is_global_a6xx_atomic(instr->opc) ||
             is_bindless_atomic(instr->opc))
            return false;

         if (instr->opc == OPC_STG && (n == 2))
            return false;

         if (instr->opc == OPC_STG_A && (n == 4))
            return false;

         if (instr->opc == OPC_LDG && (n == 0))
            return false;

         if (instr->opc == OPC_LDG_A && (n < 2))
            return false;

         /* as with atomics, these cat6 instrs can only have an immediate
          * for SSBO/IBO slot argument
          */
         switch (instr->opc) {
         case OPC_LDIB:
         case OPC_STIB:
         case OPC_RESINFO:
            if (n != 0)
               return false;
            break;
         default:
            break;
         }
      }

      break;
   }

   return true;
}

bool
ir3_valid_immediate(struct ir3_instruction *instr, int32_t immed)
{
   if (instr->opc == OPC_MOV || is_meta(instr))
      return true;

   if (is_mem(instr)) {
      switch (instr->opc) {
      /* Some load/store instructions have a 13-bit offset and size which must
       * always be an immediate and the rest of the sources cannot be
       * immediates, so the frontend is responsible for checking the size:
       */
      case OPC_LDL:
      case OPC_STL:
      case OPC_LDP:
      case OPC_STP:
      case OPC_LDG:
      case OPC_STG:
      case OPC_SPILL_MACRO:
      case OPC_RELOAD_MACRO:
      case OPC_LDG_A:
      case OPC_STG_A:
      case OPC_LDLW:
      case OPC_STLW:
      case OPC_LDLV:
         return true;
      default:
         /* most cat6 src immediates can only encode 8 bits: */
         return !(immed & ~0xff);
      }
   }

   /* Other than cat1 (mov) we can only encode up to 10 bits, sign-extended: */
   return !(immed & ~0x1ff) || !(-immed & ~0x1ff);
}