1/* 2 * Copyright (C) 2020 Collabora Ltd. 3 * 4 * Permission is hereby granted, free of charge, to any person obtaining a 5 * copy of this software and associated documentation files (the "Software"), 6 * to deal in the Software without restriction, including without limitation 7 * the rights to use, copy, modify, merge, publish, distribute, sublicense, 8 * and/or sell copies of the Software, and to permit persons to whom the 9 * Software is furnished to do so, subject to the following conditions: 10 * 11 * The above copyright notice and this permission notice (including the next 12 * paragraph) shall be included in all copies or substantial portions of the 13 * Software. 14 * 15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 * SOFTWARE. 22 * 23 * Authors (Collabora): 24 * Alyssa Rosenzweig <alyssa.rosenzweig@collabora.com> 25 */ 26 27#include "compiler.h" 28#include "bi_builder.h" 29 30/* Arguments common to worklist, passed by value for convenience */ 31 32struct bi_worklist { 33 /* # of instructions in the block */ 34 unsigned count; 35 36 /* Instructions in the block */ 37 bi_instr **instructions; 38 39 /* Bitset of instructions in the block ready for scheduling */ 40 BITSET_WORD *worklist; 41 42 /* The backwards dependency graph. nr_dependencies is the number of 43 * unscheduled instructions that must still be scheduled after (before) 44 * this instruction. dependents are which instructions need to be 45 * scheduled before (after) this instruction. */ 46 unsigned *dep_counts; 47 BITSET_WORD **dependents; 48}; 49 50/* State of a single tuple and clause under construction */ 51 52struct bi_reg_state { 53 /* Number of register writes */ 54 unsigned nr_writes; 55 56 /* Register reads, expressed as (equivalence classes of) 57 * sources. Only 3 reads are allowed, but up to 2 may spill as 58 * "forced" for the next scheduled tuple, provided such a tuple 59 * can be constructed */ 60 bi_index reads[5]; 61 unsigned nr_reads; 62 63 /* The previous tuple scheduled (= the next tuple executed in the 64 * program) may require certain writes, in order to bypass the register 65 * file and use a temporary passthrough for the value. Up to 2 such 66 * constraints are architecturally satisfiable */ 67 unsigned forced_count; 68 bi_index forceds[2]; 69}; 70 71struct bi_tuple_state { 72 /* Is this the last tuple in the clause */ 73 bool last; 74 75 /* Scheduled ADD instruction, or null if none */ 76 bi_instr *add; 77 78 /* Reads for previous (succeeding) tuple */ 79 bi_index prev_reads[5]; 80 unsigned nr_prev_reads; 81 bi_tuple *prev; 82 83 /* Register slot state for current tuple */ 84 struct bi_reg_state reg; 85 86 /* Constants are shared in the tuple. If constant_count is nonzero, it 87 * is a size for constant count. Otherwise, fau is the slot read from 88 * FAU, or zero if none is assigned. 
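    * (Illustrative example: a tuple whose instructions read the inline
    * constants 0x10 and 0x20 would be summarized as constant_count = 2 and
    * constants = { 0x10, 0x20 }, while a tuple reading only a FAU slot
    * would keep constant_count = 0 and record that slot in fau.)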
Ordinarily FAU slot 0 reads zero,
    * but within a tuple, that should be encoded as constant_count != 0
    * and constants[0] = constants[1] = 0 */
   unsigned constant_count;

   union {
      uint32_t constants[2];
      enum bir_fau fau;
   };

   unsigned pcrel_idx;
};

struct bi_const_state {
   unsigned constant_count;
   bool pcrel; /* applies to first const */
   uint32_t constants[2];

   /* Index of the constant into the clause */
   unsigned word_idx;
};

enum bi_ftz_state {
   /* No flush-to-zero state assigned yet */
   BI_FTZ_STATE_NONE,

   /* Never flush-to-zero */
   BI_FTZ_STATE_DISABLE,

   /* Always flush-to-zero */
   BI_FTZ_STATE_ENABLE,
};

struct bi_clause_state {
   /* Has a message-passing instruction already been assigned? */
   bool message;

   /* Indices already accessed, this needs to be tracked to avoid hazards
    * around message-passing instructions */
   unsigned access_count;
   bi_index accesses[(BI_MAX_SRCS + BI_MAX_DESTS) * 16];

   unsigned tuple_count;
   struct bi_const_state consts[8];

   /* Numerical state of the clause */
   enum bi_ftz_state ftz;
};

/* Determines the message type by checking the table and a few special cases.
 * The only case missing is tilebuffer instructions that access depth/stencil,
 * which require a Z_STENCIL message (to implement
 * ARM_shader_framebuffer_fetch_depth_stencil) */

static enum bifrost_message_type
bi_message_type_for_instr(bi_instr *ins)
{
   enum bifrost_message_type msg = bi_opcode_props[ins->op].message;
   bool ld_var_special = (ins->op == BI_OPCODE_LD_VAR_SPECIAL);

   if (ld_var_special && ins->varying_name == BI_VARYING_NAME_FRAG_Z)
      return BIFROST_MESSAGE_Z_STENCIL;

   if (msg == BIFROST_MESSAGE_LOAD && ins->seg == BI_SEG_UBO)
      return BIFROST_MESSAGE_ATTRIBUTE;

   return msg;
}

/* Attribute, texture, and UBO load (attribute message) instructions support
 * bindless, so just check the message type */

ASSERTED static bool
bi_supports_dtsel(bi_instr *ins)
{
   switch (bi_message_type_for_instr(ins)) {
   case BIFROST_MESSAGE_ATTRIBUTE:
      return ins->op != BI_OPCODE_LD_GCLK_U64;
   case BIFROST_MESSAGE_TEX:
      return true;
   default:
      return false;
   }
}

/* Adds an edge to the dependency graph */

static void
bi_push_dependency(unsigned parent, unsigned child,
                   BITSET_WORD **dependents, unsigned *dep_counts)
{
   if (!BITSET_TEST(dependents[parent], child)) {
      BITSET_SET(dependents[parent], child);
      dep_counts[child]++;
   }
}

static void
add_dependency(struct util_dynarray *table, unsigned index, unsigned child,
               BITSET_WORD **dependents, unsigned *dep_counts)
{
   assert(index < 64);
   util_dynarray_foreach(table + index, unsigned, parent)
      bi_push_dependency(*parent, child, dependents, dep_counts);
}

static void
mark_access(struct util_dynarray *table, unsigned index, unsigned parent)
{
   assert(index < 64);
   util_dynarray_append(&table[index], unsigned, parent);
}

static bool
bi_is_sched_barrier(bi_instr *I)
{
   switch (I->op) {
   case BI_OPCODE_BARRIER:
   case BI_OPCODE_DISCARD_F32:
      return true;
   default:
      return false;
   }
}

static void
bi_create_dependency_graph(struct bi_worklist st, bool inorder, bool is_blend)
{
   struct util_dynarray last_read[64], last_write[64];

   for (unsigned i = 0; i < 64; ++i) {
util_dynarray_init(&last_read[i], NULL); 220 util_dynarray_init(&last_write[i], NULL); 221 } 222 223 /* Initialize dependency graph */ 224 for (unsigned i = 0; i < st.count; ++i) { 225 st.dependents[i] = 226 calloc(BITSET_WORDS(st.count), sizeof(BITSET_WORD)); 227 228 st.dep_counts[i] = 0; 229 } 230 231 unsigned prev_msg = ~0; 232 233 /* Populate dependency graph */ 234 for (signed i = st.count - 1; i >= 0; --i) { 235 bi_instr *ins = st.instructions[i]; 236 237 bi_foreach_src(ins, s) { 238 if (ins->src[s].type != BI_INDEX_REGISTER) continue; 239 unsigned count = bi_count_read_registers(ins, s); 240 241 for (unsigned c = 0; c < count; ++c) 242 add_dependency(last_write, ins->src[s].value + c, i, st.dependents, st.dep_counts); 243 } 244 245 /* Keep message-passing ops in order. (This pass only cares 246 * about bundling; reordering of message-passing instructions 247 * happens during earlier scheduling.) */ 248 249 if (bi_message_type_for_instr(ins)) { 250 if (prev_msg != ~0) 251 bi_push_dependency(prev_msg, i, st.dependents, st.dep_counts); 252 253 prev_msg = i; 254 } 255 256 /* Handle schedule barriers, adding All the deps */ 257 if (inorder || bi_is_sched_barrier(ins)) { 258 for (unsigned j = 0; j < st.count; ++j) { 259 if (i == j) continue; 260 261 bi_push_dependency(MAX2(i, j), MIN2(i, j), 262 st.dependents, st.dep_counts); 263 } 264 } 265 266 bi_foreach_dest(ins, d) { 267 if (ins->dest[d].type != BI_INDEX_REGISTER) continue; 268 unsigned dest = ins->dest[d].value; 269 270 unsigned count = bi_count_write_registers(ins, d); 271 272 for (unsigned c = 0; c < count; ++c) { 273 add_dependency(last_read, dest + c, i, st.dependents, st.dep_counts); 274 add_dependency(last_write, dest + c, i, st.dependents, st.dep_counts); 275 mark_access(last_write, dest + c, i); 276 } 277 } 278 279 /* Blend shaders are allowed to clobber R0-R15. Treat these 280 * registers like extra destinations for scheduling purposes. 281 */ 282 if (ins->op == BI_OPCODE_BLEND && !is_blend) { 283 for (unsigned c = 0; c < 16; ++c) { 284 add_dependency(last_read, c, i, st.dependents, st.dep_counts); 285 add_dependency(last_write, c, i, st.dependents, st.dep_counts); 286 mark_access(last_write, c, i); 287 } 288 } 289 290 bi_foreach_src(ins, s) { 291 if (ins->src[s].type != BI_INDEX_REGISTER) continue; 292 293 unsigned count = bi_count_read_registers(ins, s); 294 295 for (unsigned c = 0; c < count; ++c) 296 mark_access(last_read, ins->src[s].value + c, i); 297 } 298 } 299 300 /* If there is a branch, all instructions depend on it, as interblock 301 * execution must be purely in-order */ 302 303 bi_instr *last = st.instructions[st.count - 1]; 304 if (last->branch_target || last->op == BI_OPCODE_JUMP) { 305 for (signed i = st.count - 2; i >= 0; --i) 306 bi_push_dependency(st.count - 1, i, st.dependents, st.dep_counts); 307 } 308 309 /* Free the intermediate structures */ 310 for (unsigned i = 0; i < 64; ++i) { 311 util_dynarray_fini(&last_read[i]); 312 util_dynarray_fini(&last_write[i]); 313 } 314} 315 316/* Scheduler pseudoinstruction lowerings to enable instruction pairings. 
 * Currently handled: CUBEFACE -> *CUBEFACE1/+CUBEFACE2, along with the
 * ATOM_C/ATOM_C1, SEG_ADD.i64 and DTSEL lowerings below.
 */

static bi_instr *
bi_lower_cubeface(bi_context *ctx,
      struct bi_clause_state *clause, struct bi_tuple_state *tuple)
{
   bi_instr *pinstr = tuple->add;
   bi_builder b = bi_init_builder(ctx, bi_before_instr(pinstr));
   bi_instr *cubeface1 = bi_cubeface1_to(&b, pinstr->dest[0],
         pinstr->src[0], pinstr->src[1], pinstr->src[2]);

   pinstr->op = BI_OPCODE_CUBEFACE2;
   pinstr->dest[0] = pinstr->dest[1];
   pinstr->dest[1] = bi_null();
   pinstr->src[0] = cubeface1->dest[0];
   pinstr->src[1] = bi_null();
   pinstr->src[2] = bi_null();

   return cubeface1;
}

/* Pseudo arguments are (rbase, address lo, address hi). We need *ATOM_C.i32 to
 * have the arguments (address lo, address hi, rbase), and +ATOM_CX to have the
 * arguments (rbase, address lo, address hi, rbase) */

static bi_instr *
bi_lower_atom_c(bi_context *ctx, struct bi_clause_state *clause,
      struct bi_tuple_state *tuple)
{
   bi_instr *pinstr = tuple->add;
   bi_builder b = bi_init_builder(ctx, bi_before_instr(pinstr));
   bi_instr *atom_c = bi_atom_c_return_i32(&b,
         pinstr->src[1], pinstr->src[2], pinstr->src[0],
         pinstr->atom_opc);

   if (bi_is_null(pinstr->dest[0]))
      atom_c->op = BI_OPCODE_ATOM_C_I32;

   pinstr->op = BI_OPCODE_ATOM_CX;
   pinstr->src[3] = atom_c->src[2];

   return atom_c;
}

static bi_instr *
bi_lower_atom_c1(bi_context *ctx, struct bi_clause_state *clause,
      struct bi_tuple_state *tuple)
{
   bi_instr *pinstr = tuple->add;
   bi_builder b = bi_init_builder(ctx, bi_before_instr(pinstr));
   bi_instr *atom_c = bi_atom_c1_return_i32(&b,
         pinstr->src[0], pinstr->src[1], pinstr->atom_opc);

   if (bi_is_null(pinstr->dest[0]))
      atom_c->op = BI_OPCODE_ATOM_C1_I32;

   pinstr->op = BI_OPCODE_ATOM_CX;
   pinstr->src[2] = pinstr->src[1];
   pinstr->src[1] = pinstr->src[0];
   pinstr->src[3] = bi_dontcare(&b);
   pinstr->src[0] = bi_null();

   return atom_c;
}

static bi_instr *
bi_lower_seg_add(bi_context *ctx,
      struct bi_clause_state *clause, struct bi_tuple_state *tuple)
{
   bi_instr *pinstr = tuple->add;
   bi_builder b = bi_init_builder(ctx, bi_before_instr(pinstr));

   bi_instr *fma = bi_seg_add_to(&b, pinstr->dest[0], pinstr->src[0],
         pinstr->preserve_null, pinstr->seg);

   pinstr->op = BI_OPCODE_SEG_ADD;
   pinstr->src[0] = pinstr->src[1];
   pinstr->src[1] = bi_null();

   assert(pinstr->dest[0].type == BI_INDEX_REGISTER);
   pinstr->dest[0].value += 1;

   return fma;
}

static bi_instr *
bi_lower_dtsel(bi_context *ctx,
      struct bi_clause_state *clause, struct bi_tuple_state *tuple)
{
   bi_instr *add = tuple->add;
   bi_builder b = bi_init_builder(ctx, bi_before_instr(add));

   bi_instr *dtsel = bi_dtsel_imm_to(&b, bi_temp(b.shader),
         add->src[0], add->table);
   add->src[0] = dtsel->dest[0];

   assert(bi_supports_dtsel(add));
   return dtsel;
}

/* Flatten linked list to array for O(1) indexing */

static bi_instr **
bi_flatten_block(bi_block *block, unsigned *len)
{
   if (list_is_empty(&block->instructions))
      return NULL;

   *len = list_length(&block->instructions);
   bi_instr **instructions = malloc(sizeof(bi_instr *) * (*len));

   unsigned i = 0;

   bi_foreach_instr_in_block(block, ins)
      instructions[i++] = ins;

   return instructions;
}

/* The worklist would track
instructions without outstanding dependencies. For 438 * debug, force in-order scheduling (no dependency graph is constructed). 439 */ 440 441static struct bi_worklist 442bi_initialize_worklist(bi_block *block, bool inorder, bool is_blend) 443{ 444 struct bi_worklist st = { }; 445 st.instructions = bi_flatten_block(block, &st.count); 446 447 if (!st.count) 448 return st; 449 450 st.dependents = calloc(st.count, sizeof(st.dependents[0])); 451 st.dep_counts = calloc(st.count, sizeof(st.dep_counts[0])); 452 453 bi_create_dependency_graph(st, inorder, is_blend); 454 st.worklist = calloc(BITSET_WORDS(st.count), sizeof(BITSET_WORD)); 455 456 for (unsigned i = 0; i < st.count; ++i) { 457 if (st.dep_counts[i] == 0) 458 BITSET_SET(st.worklist, i); 459 } 460 461 return st; 462} 463 464static void 465bi_free_worklist(struct bi_worklist st) 466{ 467 free(st.dep_counts); 468 free(st.dependents); 469 free(st.instructions); 470 free(st.worklist); 471} 472 473static void 474bi_update_worklist(struct bi_worklist st, unsigned idx) 475{ 476 assert(st.dep_counts[idx] == 0); 477 478 if (!st.dependents[idx]) 479 return; 480 481 /* Iterate each dependent to remove one dependency (`done`), 482 * adding dependents to the worklist where possible. */ 483 484 unsigned i; 485 BITSET_FOREACH_SET(i, st.dependents[idx], st.count) { 486 assert(st.dep_counts[i] != 0); 487 unsigned new_deps = --st.dep_counts[i]; 488 489 if (new_deps == 0) 490 BITSET_SET(st.worklist, i); 491 } 492 493 free(st.dependents[idx]); 494} 495 496/* Scheduler predicates */ 497 498/* IADDC.i32 can implement IADD.u32 if no saturation or swizzling is in use */ 499static bool 500bi_can_iaddc(bi_instr *ins) 501{ 502 return (ins->op == BI_OPCODE_IADD_U32 && !ins->saturate && 503 ins->src[0].swizzle == BI_SWIZZLE_H01 && 504 ins->src[1].swizzle == BI_SWIZZLE_H01); 505} 506 507/* 508 * The encoding of *FADD.v2f16 only specifies a single abs flag. All abs 509 * encodings are permitted by swapping operands; however, this scheme fails if 510 * both operands are equal. Test for this case. 
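 * (For instance, a FADD.v2f16 taking abs() of the same register word for
 * both sources cannot be expressed by swapping operands, so bi_can_fma
 * below refuses to place it on the FMA unit.)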
511 */ 512static bool 513bi_impacted_abs(bi_instr *I) 514{ 515 return I->src[0].abs && I->src[1].abs && 516 bi_is_word_equiv(I->src[0], I->src[1]); 517} 518 519bool 520bi_can_fma(bi_instr *ins) 521{ 522 /* +IADD.i32 -> *IADDC.i32 */ 523 if (bi_can_iaddc(ins)) 524 return true; 525 526 /* +MUX -> *CSEL */ 527 if (bi_can_replace_with_csel(ins)) 528 return true; 529 530 /* *FADD.v2f16 has restricted abs modifiers, use +FADD.v2f16 instead */ 531 if (ins->op == BI_OPCODE_FADD_V2F16 && bi_impacted_abs(ins)) 532 return false; 533 534 /* TODO: some additional fp16 constraints */ 535 return bi_opcode_props[ins->op].fma; 536} 537 538static bool 539bi_impacted_fadd_widens(bi_instr *I) 540{ 541 enum bi_swizzle swz0 = I->src[0].swizzle; 542 enum bi_swizzle swz1 = I->src[1].swizzle; 543 544 return (swz0 == BI_SWIZZLE_H00 && swz1 == BI_SWIZZLE_H11) || 545 (swz0 == BI_SWIZZLE_H11 && swz1 == BI_SWIZZLE_H11) || 546 (swz0 == BI_SWIZZLE_H11 && swz1 == BI_SWIZZLE_H00); 547} 548 549bool 550bi_can_add(bi_instr *ins) 551{ 552 /* +FADD.v2f16 lacks clamp modifier, use *FADD.v2f16 instead */ 553 if (ins->op == BI_OPCODE_FADD_V2F16 && ins->clamp) 554 return false; 555 556 /* +FCMP.v2f16 lacks abs modifier, use *FCMP.v2f16 instead */ 557 if (ins->op == BI_OPCODE_FCMP_V2F16 && (ins->src[0].abs || ins->src[1].abs)) 558 return false; 559 560 /* +FADD.f32 has restricted widens, use +FADD.f32 for the full set */ 561 if (ins->op == BI_OPCODE_FADD_F32 && bi_impacted_fadd_widens(ins)) 562 return false; 563 564 /* TODO: some additional fp16 constraints */ 565 return bi_opcode_props[ins->op].add; 566} 567 568/* Architecturally, no single instruction has a "not last" constraint. However, 569 * pseudoinstructions writing multiple destinations (expanding to multiple 570 * paired instructions) can run afoul of the "no two writes on the last clause" 571 * constraint, so we check for that here. 572 * 573 * Exception to the exception: TEXC, which writes to multiple sets of staging 574 * registers. Staging registers bypass the usual register write mechanism so 575 * this restriction does not apply. 576 */ 577 578static bool 579bi_must_not_last(bi_instr *ins) 580{ 581 return !bi_is_null(ins->dest[0]) && !bi_is_null(ins->dest[1]) && 582 (ins->op != BI_OPCODE_TEXC); 583} 584 585/* Check for a message-passing instruction. +DISCARD.f32 is special-cased; we 586 * treat it as a message-passing instruction for the purpose of scheduling 587 * despite no passing no logical message. Otherwise invalid encoding faults may 588 * be raised for unknown reasons (possibly an errata). 
589 */ 590 591bool 592bi_must_message(bi_instr *ins) 593{ 594 return (bi_opcode_props[ins->op].message != BIFROST_MESSAGE_NONE) || 595 (ins->op == BI_OPCODE_DISCARD_F32); 596} 597 598static bool 599bi_fma_atomic(enum bi_opcode op) 600{ 601 switch (op) { 602 case BI_OPCODE_ATOM_C_I32: 603 case BI_OPCODE_ATOM_C_I64: 604 case BI_OPCODE_ATOM_C1_I32: 605 case BI_OPCODE_ATOM_C1_I64: 606 case BI_OPCODE_ATOM_C1_RETURN_I32: 607 case BI_OPCODE_ATOM_C1_RETURN_I64: 608 case BI_OPCODE_ATOM_C_RETURN_I32: 609 case BI_OPCODE_ATOM_C_RETURN_I64: 610 case BI_OPCODE_ATOM_POST_I32: 611 case BI_OPCODE_ATOM_POST_I64: 612 case BI_OPCODE_ATOM_PRE_I64: 613 return true; 614 default: 615 return false; 616 } 617} 618 619bool 620bi_reads_zero(bi_instr *ins) 621{ 622 return !(bi_fma_atomic(ins->op) || ins->op == BI_OPCODE_IMULD); 623} 624 625bool 626bi_reads_temps(bi_instr *ins, unsigned src) 627{ 628 switch (ins->op) { 629 /* Cannot permute a temporary */ 630 case BI_OPCODE_CLPER_I32: 631 case BI_OPCODE_CLPER_OLD_I32: 632 return src != 0; 633 634 /* ATEST isn't supposed to be restricted, but in practice it always 635 * wants to source its coverage mask input (source 0) from register 60, 636 * which won't work properly if we put the input in a temp. This 637 * requires workarounds in both RA and clause scheduling. 638 */ 639 case BI_OPCODE_ATEST: 640 return src != 0; 641 642 case BI_OPCODE_IMULD: 643 return false; 644 default: 645 return true; 646 } 647} 648 649static bool 650bi_impacted_t_modifiers(bi_instr *I, unsigned src) 651{ 652 enum bi_swizzle swizzle = I->src[src].swizzle; 653 654 switch (I->op) { 655 case BI_OPCODE_F16_TO_F32: 656 case BI_OPCODE_F16_TO_S32: 657 case BI_OPCODE_F16_TO_U32: 658 case BI_OPCODE_MKVEC_V2I16: 659 case BI_OPCODE_S16_TO_F32: 660 case BI_OPCODE_S16_TO_S32: 661 case BI_OPCODE_U16_TO_F32: 662 case BI_OPCODE_U16_TO_U32: 663 return (swizzle != BI_SWIZZLE_H00); 664 665 case BI_OPCODE_BRANCH_F32: 666 case BI_OPCODE_LOGB_F32: 667 case BI_OPCODE_ILOGB_F32: 668 case BI_OPCODE_FADD_F32: 669 case BI_OPCODE_FCMP_F32: 670 case BI_OPCODE_FREXPE_F32: 671 case BI_OPCODE_FREXPM_F32: 672 case BI_OPCODE_FROUND_F32: 673 return (swizzle != BI_SWIZZLE_H01); 674 675 case BI_OPCODE_IADD_S32: 676 case BI_OPCODE_IADD_U32: 677 case BI_OPCODE_ISUB_S32: 678 case BI_OPCODE_ISUB_U32: 679 case BI_OPCODE_IADD_V4S8: 680 case BI_OPCODE_IADD_V4U8: 681 case BI_OPCODE_ISUB_V4S8: 682 case BI_OPCODE_ISUB_V4U8: 683 return (src == 1) && (swizzle != BI_SWIZZLE_H01); 684 685 case BI_OPCODE_S8_TO_F32: 686 case BI_OPCODE_S8_TO_S32: 687 case BI_OPCODE_U8_TO_F32: 688 case BI_OPCODE_U8_TO_U32: 689 return (swizzle != BI_SWIZZLE_B0000); 690 691 case BI_OPCODE_V2S8_TO_V2F16: 692 case BI_OPCODE_V2S8_TO_V2S16: 693 case BI_OPCODE_V2U8_TO_V2F16: 694 case BI_OPCODE_V2U8_TO_V2U16: 695 return (swizzle != BI_SWIZZLE_B0022); 696 697 case BI_OPCODE_IADD_V2S16: 698 case BI_OPCODE_IADD_V2U16: 699 case BI_OPCODE_ISUB_V2S16: 700 case BI_OPCODE_ISUB_V2U16: 701 return (src == 1) && (swizzle >= BI_SWIZZLE_H11); 702 703#if 0 704 /* Restriction on IADD in 64-bit clauses on G72 */ 705 case BI_OPCODE_IADD_S64: 706 case BI_OPCODE_IADD_U64: 707 return (src == 1) && (swizzle != BI_SWIZZLE_D0); 708#endif 709 710 default: 711 return false; 712 } 713} 714 715bool 716bi_reads_t(bi_instr *ins, unsigned src) 717{ 718 /* Branch offset cannot come from passthrough */ 719 if (bi_opcode_props[ins->op].branch) 720 return src != 2; 721 722 /* Table can never read passthrough */ 723 if (bi_opcode_props[ins->op].table) 724 return false; 725 726 /* Staging register reads may 
happen before the succeeding register
 * block encodes a write, so effectively there is no passthrough */
   if (bi_is_staging_src(ins, src))
      return false;

   /* Bifrost cores newer than Mali G71 have restrictions on swizzles on
    * same-cycle temporaries. Check the list for these hazards. */
   if (bi_impacted_t_modifiers(ins, src))
      return false;

   /* Descriptor must not come from a passthrough */
   switch (ins->op) {
   case BI_OPCODE_LD_CVT:
   case BI_OPCODE_LD_TILE:
   case BI_OPCODE_ST_CVT:
   case BI_OPCODE_ST_TILE:
   case BI_OPCODE_TEXC:
      return src != 2;
   case BI_OPCODE_BLEND:
      return src != 2 && src != 3;

   /* +JUMP can't read the offset from T */
   case BI_OPCODE_JUMP:
      return false;

   /* Else, just check if we can read any temps */
   default:
      return bi_reads_temps(ins, src);
   }
}

/* Counts the number of 64-bit constants required by a clause. TODO: We
 * might want to account for merging; right now we overestimate, but
 * that's probably fine most of the time */

static unsigned
bi_nconstants(struct bi_clause_state *clause)
{
   unsigned count_32 = 0;

   for (unsigned i = 0; i < ARRAY_SIZE(clause->consts); ++i)
      count_32 += clause->consts[i].constant_count;

   return DIV_ROUND_UP(count_32, 2);
}

/* Would there be space for constants if we added one tuple? */

static bool
bi_space_for_more_constants(struct bi_clause_state *clause)
{
   return (bi_nconstants(clause) < 13 - (clause->tuple_count + 1));
}

/* Updates the FAU assignment for a tuple. A valid FAU assignment must be
 * possible (as a precondition), though not necessarily on the selected unit;
 * this is guaranteed per-instruction by bi_lower_fau and per-tuple by
 * bi_instr_schedulable */

static bool
bi_update_fau(struct bi_clause_state *clause,
              struct bi_tuple_state *tuple,
              bi_instr *instr, bool fma, bool destructive)
{
   /* Maintain our own constants, for nondestructive mode */
   uint32_t copied_constants[2], copied_count;
   unsigned *constant_count = &tuple->constant_count;
   uint32_t *constants = tuple->constants;
   enum bir_fau fau = tuple->fau;

   if (!destructive) {
      memcpy(copied_constants, tuple->constants,
             (*constant_count) * sizeof(constants[0]));
      copied_count = tuple->constant_count;

      constant_count = &copied_count;
      constants = copied_constants;
   }

   bi_foreach_src(instr, s) {
      bi_index src = instr->src[s];

      if (src.type == BI_INDEX_FAU) {
         bool no_constants = *constant_count == 0;
         bool no_other_fau = (fau == src.value) || !fau;
         bool mergable = no_constants && no_other_fau;

         if (destructive) {
            assert(mergable);
            tuple->fau = src.value;
         } else if (!mergable) {
            return false;
         }

         fau = src.value;
      } else if (src.type == BI_INDEX_CONSTANT) {
         /* No need to reserve space if we have a fast 0 */
         if (src.value == 0 && fma && bi_reads_zero(instr))
            continue;

         /* If there is a branch target, #0 by convention is the
          * PC-relative offset to the target */
         bool pcrel = instr->branch_target && src.value == 0;
         bool found = false;

         for (unsigned i = 0; i < *constant_count; ++i) {
            found |= (constants[i] == src.value) &&
                     (i != tuple->pcrel_idx);
         }

         /* pcrel constants are unique, so don't match */
         if (found && !pcrel)
            continue;

         bool no_fau = (*constant_count > 0) || !fau;
         bool mergable
= no_fau && ((*constant_count) < 2); 842 843 if (destructive) { 844 assert(mergable); 845 846 if (pcrel) 847 tuple->pcrel_idx = *constant_count; 848 } else if (!mergable) 849 return false; 850 851 constants[(*constant_count)++] = src.value; 852 } 853 } 854 855 /* Constants per clause may be limited by tuple count */ 856 bool room_for_constants = (*constant_count == 0) || 857 bi_space_for_more_constants(clause); 858 859 if (destructive) 860 assert(room_for_constants); 861 else if (!room_for_constants) 862 return false; 863 864 return true; 865} 866 867/* Given an in-progress tuple, a candidate new instruction to add to the tuple, 868 * and a source (index) from that candidate, determine whether this source is 869 * "new", in the sense of requiring an additional read slot. That is, checks 870 * whether the specified source reads from the register file via a read slot 871 * (determined by its type and placement) and whether the source was already 872 * specified by a prior read slot (to avoid double counting) */ 873 874static bool 875bi_tuple_is_new_src(bi_instr *instr, struct bi_reg_state *reg, unsigned src_idx) 876{ 877 bi_index src = instr->src[src_idx]; 878 879 /* Only consider sources which come from the register file */ 880 if (!(src.type == BI_INDEX_NORMAL || src.type == BI_INDEX_REGISTER)) 881 return false; 882 883 /* Staging register reads bypass the usual register file mechanism */ 884 if (bi_is_staging_src(instr, src_idx)) 885 return false; 886 887 /* If a source is already read in the tuple, it is already counted */ 888 for (unsigned t = 0; t < reg->nr_reads; ++t) 889 if (bi_is_word_equiv(src, reg->reads[t])) 890 return false; 891 892 /* If a source is read in _this instruction_, it is already counted */ 893 for (unsigned t = 0; t < src_idx; ++t) 894 if (bi_is_word_equiv(src, instr->src[t])) 895 return false; 896 897 return true; 898} 899 900/* Given two tuples in source order, count the number of register reads of the 901 * successor, determined as the number of unique words accessed that aren't 902 * written by the predecessor (since those are tempable). 903 */ 904 905static unsigned 906bi_count_succ_reads(bi_index t0, bi_index t1, 907 bi_index *succ_reads, unsigned nr_succ_reads) 908{ 909 unsigned reads = 0; 910 911 for (unsigned i = 0; i < nr_succ_reads; ++i) { 912 bool unique = true; 913 914 for (unsigned j = 0; j < i; ++j) 915 if (bi_is_word_equiv(succ_reads[i], succ_reads[j])) 916 unique = false; 917 918 if (!unique) 919 continue; 920 921 if (bi_is_word_equiv(succ_reads[i], t0)) 922 continue; 923 924 if (bi_is_word_equiv(succ_reads[i], t1)) 925 continue; 926 927 reads++; 928 } 929 930 return reads; 931} 932 933/* Not all instructions can read from the staging passthrough (as determined by 934 * reads_t), check if a given pair of instructions has such a restriction. 
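 * (For example, if the FMA slot of a tuple produces a register that the
 * paired ADD consumes as one of its staging registers, the value cannot be
 * taken from the passthrough, so the pairing must be rejected.)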
Note
 * we also use this mechanism to prevent data races around staging register
 * reads, so we allow the input source to potentially be vector-valued */

static bool
bi_has_staging_passthrough_hazard(bi_index fma, bi_instr *add)
{
   bi_foreach_src(add, s) {
      bi_index src = add->src[s];

      if (src.type != BI_INDEX_REGISTER)
         continue;

      unsigned count = bi_count_read_registers(add, s);
      bool read = false;

      for (unsigned d = 0; d < count; ++d)
         read |= bi_is_equiv(fma, bi_register(src.value + d));

      if (read && !bi_reads_t(add, s))
         return true;
   }

   return false;
}

/* Likewise for cross-tuple passthrough (reads_temps) */

static bool
bi_has_cross_passthrough_hazard(bi_tuple *succ, bi_instr *ins)
{
   bi_foreach_instr_in_tuple(succ, pins) {
      bi_foreach_src(pins, s) {
         if (bi_is_word_equiv(ins->dest[0], pins->src[s]) &&
             !bi_reads_temps(pins, s))
            return true;
      }
   }

   return false;
}

/* Is a register written other than the staging mechanism? ATEST is special,
 * writing to both a staging register and a regular register (fixed packing).
 * BLEND is special since it has to write r48 the normal way even if it never
 * gets read. This depends on liveness analysis, as a register is not needed
 * for a write that will be discarded after one tuple. */

static unsigned
bi_write_count(bi_instr *instr, uint64_t live_after_temp)
{
   if (instr->op == BI_OPCODE_ATEST || instr->op == BI_OPCODE_BLEND)
      return 1;

   unsigned count = 0;

   bi_foreach_dest(instr, d) {
      if (d == 0 && bi_opcode_props[instr->op].sr_write)
         continue;

      if (bi_is_null(instr->dest[d]))
         continue;

      assert(instr->dest[d].type == BI_INDEX_REGISTER);
      if (live_after_temp & BITFIELD64_BIT(instr->dest[d].value))
         count++;
   }

   return count;
}

/*
 * Test if an instruction requires flush-to-zero mode. Currently only supported
 * for f16<-->f32 conversions to implement fquantize16
 */
static bool
bi_needs_ftz(bi_instr *I)
{
   return (I->op == BI_OPCODE_F16_TO_F32 ||
           I->op == BI_OPCODE_V2F32_TO_V2F16) && I->ftz;
}

/*
 * Test if an instruction would be numerically incompatible with the clause. At
 * present we only consider flush-to-zero modes.
 */
static bool
bi_numerically_incompatible(struct bi_clause_state *clause, bi_instr *instr)
{
   return (clause->ftz != BI_FTZ_STATE_NONE) &&
          ((clause->ftz == BI_FTZ_STATE_ENABLE) != bi_needs_ftz(instr));
}

/* Instruction placement entails two questions: what subset of instructions in
 * the block can legally be scheduled, and of those, which is the best? That
 * is, we seek to optimize a cost function on a subset of the worklist
 * satisfying a particular predicate. The necessary predicate is determined
 * entirely by Bifrost's architectural limitations and is described in the
 * accompanying whitepaper. The cost function is a heuristic.
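 *
 * (Schematically, and glossing over the bookkeeping, the selection below
 * amounts to:
 *
 *    best = NULL;
 *    foreach I in worklist:
 *       if (bi_instr_schedulable(I, ...))
 *          best = lowest_cost(best, I);
 *
 * see bi_instr_schedulable and bi_choose_index.)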
 */

static bool
bi_instr_schedulable(bi_instr *instr,
                     struct bi_clause_state *clause,
                     struct bi_tuple_state *tuple,
                     uint64_t live_after_temp,
                     bool fma)
{
   /* The units must match */
   if ((fma && !bi_can_fma(instr)) || (!fma && !bi_can_add(instr)))
      return false;

   /* There can only be one message-passing instruction per clause */
   if (bi_must_message(instr) && clause->message)
      return false;

   /* Some instructions have placement requirements */
   if (bi_opcode_props[instr->op].last && !tuple->last)
      return false;

   if (bi_must_not_last(instr) && tuple->last)
      return false;

   /* Numerical properties must be compatible with the clause */
   if (bi_numerically_incompatible(clause, instr))
      return false;

   /* Message-passing instructions are not guaranteed to write within the
    * same clause (most likely they will not), so if a later instruction
    * in the clause accesses the destination, the message-passing
    * instruction can't be scheduled */
   if (bi_opcode_props[instr->op].sr_write) {
      bi_foreach_dest(instr, d) {
         if (bi_is_null(instr->dest[d]))
            continue;

         unsigned nr = bi_count_write_registers(instr, d);
         assert(instr->dest[d].type == BI_INDEX_REGISTER);
         unsigned reg = instr->dest[d].value;

         for (unsigned i = 0; i < clause->access_count; ++i) {
            bi_index idx = clause->accesses[i];
            for (unsigned d = 0; d < nr; ++d) {
               if (bi_is_equiv(bi_register(reg + d), idx))
                  return false;
            }
         }
      }
   }

   if (bi_opcode_props[instr->op].sr_read && !bi_is_null(instr->src[0])) {
      unsigned nr = bi_count_read_registers(instr, 0);
      assert(instr->src[0].type == BI_INDEX_REGISTER);
      unsigned reg = instr->src[0].value;

      for (unsigned i = 0; i < clause->access_count; ++i) {
         bi_index idx = clause->accesses[i];
         for (unsigned d = 0; d < nr; ++d) {
            if (bi_is_equiv(bi_register(reg + d), idx))
               return false;
         }
      }
   }

   /* If FAU is already assigned, we may not disrupt that.
Do a 1098 * non-disruptive test update */ 1099 if (!bi_update_fau(clause, tuple, instr, fma, false)) 1100 return false; 1101 1102 /* If this choice of FMA would force a staging passthrough, the ADD 1103 * instruction must support such a passthrough */ 1104 if (tuple->add && bi_has_staging_passthrough_hazard(instr->dest[0], tuple->add)) 1105 return false; 1106 1107 /* If this choice of destination would force a cross-tuple passthrough, the next tuple must support that */ 1108 if (tuple->prev && bi_has_cross_passthrough_hazard(tuple->prev, instr)) 1109 return false; 1110 1111 /* Register file writes are limited */ 1112 unsigned total_writes = tuple->reg.nr_writes; 1113 total_writes += bi_write_count(instr, live_after_temp); 1114 1115 /* Last tuple in a clause can only write a single value */ 1116 if (tuple->last && total_writes > 1) 1117 return false; 1118 1119 /* Register file reads are limited, so count unique */ 1120 1121 unsigned unique_new_srcs = 0; 1122 1123 bi_foreach_src(instr, s) { 1124 if (bi_tuple_is_new_src(instr, &tuple->reg, s)) 1125 unique_new_srcs++; 1126 } 1127 1128 unsigned total_srcs = tuple->reg.nr_reads + unique_new_srcs; 1129 1130 bool can_spill_to_moves = (!tuple->add); 1131 can_spill_to_moves &= (bi_nconstants(clause) < 13 - (clause->tuple_count + 2)); 1132 can_spill_to_moves &= (clause->tuple_count < 7); 1133 1134 /* However, we can get an extra 1 or 2 sources by inserting moves */ 1135 if (total_srcs > (can_spill_to_moves ? 4 : 3)) 1136 return false; 1137 1138 /* Count effective reads for the successor */ 1139 unsigned succ_reads = bi_count_succ_reads(instr->dest[0], 1140 tuple->add ? tuple->add->dest[0] : bi_null(), 1141 tuple->prev_reads, tuple->nr_prev_reads); 1142 1143 /* Successor must satisfy R+W <= 4, so we require W <= 4-R */ 1144 if ((signed) total_writes > (4 - (signed) succ_reads)) 1145 return false; 1146 1147 return true; 1148} 1149 1150static signed 1151bi_instr_cost(bi_instr *instr, struct bi_tuple_state *tuple) 1152{ 1153 signed cost = 0; 1154 1155 /* Instructions that can schedule to either FMA or to ADD should be 1156 * deprioritized since they're easier to reschedule elsewhere */ 1157 if (bi_can_fma(instr) && bi_can_add(instr)) 1158 cost++; 1159 1160 /* Message-passing instructions impose constraints on the registers 1161 * later in the clause, so schedule them as late within a clause as 1162 * possible (<==> prioritize them since we're backwards <==> decrease 1163 * cost) */ 1164 if (bi_must_message(instr)) 1165 cost--; 1166 1167 /* Last instructions are big constraints (XXX: no effect on shader-db) */ 1168 if (bi_opcode_props[instr->op].last) 1169 cost -= 2; 1170 1171 return cost; 1172} 1173 1174static unsigned 1175bi_choose_index(struct bi_worklist st, 1176 struct bi_clause_state *clause, 1177 struct bi_tuple_state *tuple, 1178 uint64_t live_after_temp, 1179 bool fma) 1180{ 1181 unsigned i, best_idx = ~0; 1182 signed best_cost = INT_MAX; 1183 1184 BITSET_FOREACH_SET(i, st.worklist, st.count) { 1185 bi_instr *instr = st.instructions[i]; 1186 1187 if (!bi_instr_schedulable(instr, clause, tuple, live_after_temp, fma)) 1188 continue; 1189 1190 signed cost = bi_instr_cost(instr, tuple); 1191 1192 /* Tie break in favour of later instructions, under the 1193 * assumption this promotes temporary usage (reducing pressure 1194 * on the register file). This is a side effect of a prepass 1195 * scheduling for pressure. 
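 * (Hence the `<=' rather than `<' in the comparison below: on a cost tie,
 * the instruction with the higher worklist index, i.e. the later one in
 * source order, wins.)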
*/ 1196 1197 if (cost <= best_cost) { 1198 best_idx = i; 1199 best_cost = cost; 1200 } 1201 } 1202 1203 return best_idx; 1204} 1205 1206static void 1207bi_pop_instr(struct bi_clause_state *clause, struct bi_tuple_state *tuple, 1208 bi_instr *instr, uint64_t live_after_temp, bool fma) 1209{ 1210 bi_update_fau(clause, tuple, instr, fma, true); 1211 1212 /* TODO: maybe opt a bit? or maybe doesn't matter */ 1213 assert(clause->access_count + BI_MAX_SRCS + BI_MAX_DESTS <= ARRAY_SIZE(clause->accesses)); 1214 memcpy(clause->accesses + clause->access_count, instr->src, sizeof(instr->src)); 1215 clause->access_count += BI_MAX_SRCS; 1216 memcpy(clause->accesses + clause->access_count, instr->dest, sizeof(instr->dest)); 1217 clause->access_count += BI_MAX_DESTS; 1218 tuple->reg.nr_writes += bi_write_count(instr, live_after_temp); 1219 1220 bi_foreach_src(instr, s) { 1221 if (bi_tuple_is_new_src(instr, &tuple->reg, s)) 1222 tuple->reg.reads[tuple->reg.nr_reads++] = instr->src[s]; 1223 } 1224 1225 /* This could be optimized to allow pairing integer instructions with 1226 * special flush-to-zero instructions, but punting on this until we have 1227 * a workload that cares. 1228 */ 1229 clause->ftz = bi_needs_ftz(instr) ? BI_FTZ_STATE_ENABLE : 1230 BI_FTZ_STATE_DISABLE; 1231} 1232 1233/* Choose the best instruction and pop it off the worklist. Returns NULL if no 1234 * instruction is available. This function is destructive. */ 1235 1236static bi_instr * 1237bi_take_instr(bi_context *ctx, struct bi_worklist st, 1238 struct bi_clause_state *clause, 1239 struct bi_tuple_state *tuple, 1240 uint64_t live_after_temp, 1241 bool fma) 1242{ 1243 if (tuple->add && tuple->add->op == BI_OPCODE_CUBEFACE) 1244 return bi_lower_cubeface(ctx, clause, tuple); 1245 else if (tuple->add && tuple->add->op == BI_OPCODE_ATOM_RETURN_I32) 1246 return bi_lower_atom_c(ctx, clause, tuple); 1247 else if (tuple->add && tuple->add->op == BI_OPCODE_ATOM1_RETURN_I32) 1248 return bi_lower_atom_c1(ctx, clause, tuple); 1249 else if (tuple->add && tuple->add->op == BI_OPCODE_SEG_ADD_I64) 1250 return bi_lower_seg_add(ctx, clause, tuple); 1251 else if (tuple->add && tuple->add->table) 1252 return bi_lower_dtsel(ctx, clause, tuple); 1253 1254 /* TODO: Optimize these moves */ 1255 if (!fma && tuple->nr_prev_reads > 3) { 1256 /* Only spill by one source for now */ 1257 assert(tuple->nr_prev_reads == 4); 1258 1259 /* Pick a source to spill */ 1260 bi_index src = tuple->prev_reads[0]; 1261 1262 /* Schedule the spill */ 1263 bi_builder b = bi_init_builder(ctx, bi_before_tuple(tuple->prev)); 1264 bi_instr *mov = bi_mov_i32_to(&b, src, src); 1265 bi_pop_instr(clause, tuple, mov, live_after_temp, fma); 1266 return mov; 1267 } 1268 1269#ifndef NDEBUG 1270 /* Don't pair instructions if debugging */ 1271 if ((bifrost_debug & BIFROST_DBG_NOSCHED) && tuple->add) 1272 return NULL; 1273#endif 1274 1275 unsigned idx = bi_choose_index(st, clause, tuple, live_after_temp, fma); 1276 1277 if (idx >= st.count) 1278 return NULL; 1279 1280 /* Update state to reflect taking the instruction */ 1281 bi_instr *instr = st.instructions[idx]; 1282 1283 BITSET_CLEAR(st.worklist, idx); 1284 bi_update_worklist(st, idx); 1285 bi_pop_instr(clause, tuple, instr, live_after_temp, fma); 1286 1287 /* Fixups */ 1288 if (instr->op == BI_OPCODE_IADD_U32 && fma) { 1289 assert(bi_can_iaddc(instr)); 1290 instr->op = BI_OPCODE_IADDC_I32; 1291 instr->src[2] = bi_zero(); 1292 } else if (fma && bi_can_replace_with_csel(instr)) { 1293 bi_replace_mux_with_csel(instr, false); 1294 } 1295 1296 
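   /* (The IADD.u32 fixup above relies on the equivalence noted at
    * bi_can_iaddc: IADDC.i32 with a #0 third source computes the same
    * result when no saturation or swizzle is involved.) */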
return instr; 1297} 1298 1299/* Variant of bi_rewrite_index_src_single that uses word-equivalence, rewriting 1300 * to a passthrough register. If except_sr is true, the staging sources are 1301 * skipped, so staging register reads are not accidentally encoded as 1302 * passthrough (which is impossible) */ 1303 1304static void 1305bi_use_passthrough(bi_instr *ins, bi_index old, 1306 enum bifrost_packed_src new, 1307 bool except_sr) 1308{ 1309 /* Optional for convenience */ 1310 if (!ins || bi_is_null(old)) 1311 return; 1312 1313 bi_foreach_src(ins, i) { 1314 if ((i == 0 || i == 4) && except_sr) 1315 continue; 1316 1317 if (bi_is_word_equiv(ins->src[i], old)) { 1318 ins->src[i].type = BI_INDEX_PASS; 1319 ins->src[i].value = new; 1320 ins->src[i].reg = false; 1321 ins->src[i].offset = 0; 1322 } 1323 } 1324} 1325 1326/* Rewrites an adjacent pair of tuples _prec_eding and _succ_eding to use 1327 * intertuple passthroughs where necessary. Passthroughs are allowed as a 1328 * post-condition of scheduling. Note we rewrite ADD first, FMA second -- 1329 * opposite the order of execution. This is deliberate -- if both FMA and ADD 1330 * write to the same logical register, the next executed tuple will get the 1331 * latter result. There's no interference issue under the assumption of correct 1332 * register allocation. */ 1333 1334static void 1335bi_rewrite_passthrough(bi_tuple prec, bi_tuple succ) 1336{ 1337 bool sr_read = succ.add ? bi_opcode_props[succ.add->op].sr_read : false; 1338 1339 if (prec.add) { 1340 bi_use_passthrough(succ.fma, prec.add->dest[0], BIFROST_SRC_PASS_ADD, false); 1341 bi_use_passthrough(succ.add, prec.add->dest[0], BIFROST_SRC_PASS_ADD, sr_read); 1342 } 1343 1344 if (prec.fma) { 1345 bi_use_passthrough(succ.fma, prec.fma->dest[0], BIFROST_SRC_PASS_FMA, false); 1346 bi_use_passthrough(succ.add, prec.fma->dest[0], BIFROST_SRC_PASS_FMA, sr_read); 1347 } 1348} 1349 1350static void 1351bi_rewrite_fau_to_pass(bi_tuple *tuple) 1352{ 1353 bi_foreach_instr_and_src_in_tuple(tuple, ins, s) { 1354 if (ins->src[s].type != BI_INDEX_FAU) continue; 1355 1356 bi_index pass = bi_passthrough(ins->src[s].offset ? 1357 BIFROST_SRC_FAU_HI : BIFROST_SRC_FAU_LO); 1358 1359 ins->src[s] = bi_replace_index(ins->src[s], pass); 1360 } 1361} 1362 1363static void 1364bi_rewrite_zero(bi_instr *ins, bool fma) 1365{ 1366 bi_index zero = bi_passthrough(fma ? BIFROST_SRC_STAGE : BIFROST_SRC_FAU_LO); 1367 1368 bi_foreach_src(ins, s) { 1369 bi_index src = ins->src[s]; 1370 1371 if (src.type == BI_INDEX_CONSTANT && src.value == 0) 1372 ins->src[s] = bi_replace_index(src, zero); 1373 } 1374} 1375 1376/* Assumes #0 to {T, FAU} rewrite has already occurred */ 1377 1378static void 1379bi_rewrite_constants_to_pass(bi_tuple *tuple, uint64_t constant, bool pcrel) 1380{ 1381 bi_foreach_instr_and_src_in_tuple(tuple, ins, s) { 1382 if (ins->src[s].type != BI_INDEX_CONSTANT) continue; 1383 1384 uint32_t cons = ins->src[s].value; 1385 1386 ASSERTED bool lo = (cons == (constant & 0xffffffff)); 1387 bool hi = (cons == (constant >> 32ull)); 1388 1389 /* PC offsets always live in the upper half, set to zero by 1390 * convention before pack time. (This is safe, since if you 1391 * wanted to compare against zero, you would use a BRANCHZ 1392 * instruction instead.) */ 1393 if (cons == 0 && ins->branch_target != NULL) { 1394 assert(pcrel); 1395 hi = true; 1396 lo = false; 1397 } else if (pcrel) { 1398 hi = false; 1399 } 1400 1401 assert(lo || hi); 1402 1403 ins->src[s] = bi_replace_index(ins->src[s], 1404 bi_passthrough(hi ? 
BIFROST_SRC_FAU_HI :
            BIFROST_SRC_FAU_LO));
   }
}

/* Constructs a constant state given a tuple state. This has the
 * postcondition that pcrel applies to the first constant by convention,
 * and PC-relative constants will be #0 by convention here, so swap to
 * match if needed */

static struct bi_const_state
bi_get_const_state(struct bi_tuple_state *tuple)
{
   struct bi_const_state consts = {
      .constant_count = tuple->constant_count,
      .constants[0] = tuple->constants[0],
      .constants[1] = tuple->constants[1],
      .pcrel = tuple->add && tuple->add->branch_target,
   };

   /* pcrel applies to the first constant by convention, and
    * PC-relative constants will be #0 by convention here, so swap
    * to match if needed */
   if (consts.pcrel && consts.constants[0]) {
      assert(consts.constant_count == 2);
      assert(consts.constants[1] == 0);

      consts.constants[1] = consts.constants[0];
      consts.constants[0] = 0;
   }

   return consts;
}

/* Merges constants in a clause, satisfying the following rules, assuming no
 * more than one tuple has pcrel:
 *
 * 1. If a tuple has two constants, they must be packed together. If one is
 * pcrel, it must be the high constant to use the M1=4 modification [sx64(E0) +
 * (PC << 32)]. Otherwise choose an arbitrary order.
 *
 * 2. If a tuple has one constant, it may be shared with an existing
 * pair that already contains that constant, or it may be combined with another
 * (distinct) tuple of a single constant.
 *
 * This guarantees a packing is possible. The next routine handles
 * modification-related swapping, to satisfy format 12 and the lack of
 * modification for tuple count 5/8 in EC0.
 */

static uint64_t
bi_merge_u32(uint32_t c0, uint32_t c1, bool pcrel)
{
   /* At this point in the constant merge algorithm, pcrel constants are
    * treated as zero, so pcrel implies at least one constant is zero */
   assert(!pcrel || (c0 == 0 || c1 == 0));

   /* Order: pcrel, maximum non-pcrel, minimum non-pcrel */
   uint32_t hi = pcrel ? 0 : MAX2(c0, c1);
   uint32_t lo = (c0 == hi) ?
c1 : c0; 1464 1465 /* Merge in the selected order */ 1466 return lo | (((uint64_t) hi) << 32ull); 1467} 1468 1469static unsigned 1470bi_merge_pairs(struct bi_const_state *consts, unsigned tuple_count, 1471 uint64_t *merged, unsigned *pcrel_pair) 1472{ 1473 unsigned merge_count = 0; 1474 1475 for (unsigned t = 0; t < tuple_count; ++t) { 1476 if (consts[t].constant_count != 2) continue; 1477 1478 unsigned idx = ~0; 1479 uint64_t val = bi_merge_u32(consts[t].constants[0], 1480 consts[t].constants[1], consts[t].pcrel); 1481 1482 /* Skip the pcrel pair if assigned, because if one is assigned, 1483 * this one is not pcrel by uniqueness so it's a mismatch */ 1484 for (unsigned s = 0; s < merge_count; ++s) { 1485 if (merged[s] == val && (*pcrel_pair) != s) { 1486 idx = s; 1487 break; 1488 } 1489 } 1490 1491 if (idx == ~0) { 1492 idx = merge_count++; 1493 merged[idx] = val; 1494 1495 if (consts[t].pcrel) 1496 (*pcrel_pair) = idx; 1497 } 1498 1499 consts[t].word_idx = idx; 1500 } 1501 1502 return merge_count; 1503} 1504 1505static unsigned 1506bi_merge_singles(struct bi_const_state *consts, unsigned tuple_count, 1507 uint64_t *pairs, unsigned pair_count, unsigned *pcrel_pair) 1508{ 1509 bool pending = false, pending_pcrel = false; 1510 uint32_t pending_single = 0; 1511 1512 for (unsigned t = 0; t < tuple_count; ++t) { 1513 if (consts[t].constant_count != 1) continue; 1514 1515 uint32_t val = consts[t].constants[0]; 1516 unsigned idx = ~0; 1517 1518 /* Try to match, but don't match pcrel with non-pcrel, even 1519 * though we can merge a pcrel with a non-pcrel single */ 1520 for (unsigned i = 0; i < pair_count; ++i) { 1521 bool lo = ((pairs[i] & 0xffffffff) == val); 1522 bool hi = ((pairs[i] >> 32) == val); 1523 bool match = (lo || hi); 1524 match &= ((*pcrel_pair) != i); 1525 if (match && !consts[t].pcrel) { 1526 idx = i; 1527 break; 1528 } 1529 } 1530 1531 if (idx == ~0) { 1532 idx = pair_count; 1533 1534 if (pending && pending_single != val) { 1535 assert(!(pending_pcrel && consts[t].pcrel)); 1536 bool pcrel = pending_pcrel || consts[t].pcrel; 1537 1538 if (pcrel) 1539 *pcrel_pair = idx; 1540 1541 pairs[pair_count++] = bi_merge_u32(pending_single, val, pcrel); 1542 1543 pending = pending_pcrel = false; 1544 } else { 1545 pending = true; 1546 pending_pcrel = consts[t].pcrel; 1547 pending_single = val; 1548 } 1549 } 1550 1551 consts[t].word_idx = idx; 1552 } 1553 1554 /* Shift so it works whether pending_pcrel is set or not */ 1555 if (pending) { 1556 if (pending_pcrel) 1557 *pcrel_pair = pair_count; 1558 1559 pairs[pair_count++] = ((uint64_t) pending_single) << 32ull; 1560 } 1561 1562 return pair_count; 1563} 1564 1565static unsigned 1566bi_merge_constants(struct bi_const_state *consts, uint64_t *pairs, unsigned *pcrel_idx) 1567{ 1568 unsigned pair_count = bi_merge_pairs(consts, 8, pairs, pcrel_idx); 1569 return bi_merge_singles(consts, 8, pairs, pair_count, pcrel_idx); 1570} 1571 1572/* Swap two constants at word i and i+1 by swapping their actual positions and 1573 * swapping all references so the meaning of the clause is preserved */ 1574 1575static void 1576bi_swap_constants(struct bi_const_state *consts, uint64_t *pairs, unsigned i) 1577{ 1578 uint64_t tmp_pair = pairs[i + 0]; 1579 pairs[i + 0] = pairs[i + 1]; 1580 pairs[i + 1] = tmp_pair; 1581 1582 for (unsigned t = 0; t < 8; ++t) { 1583 if (consts[t].word_idx == i) 1584 consts[t].word_idx = (i + 1); 1585 else if (consts[t].word_idx == (i + 1)) 1586 consts[t].word_idx = i; 1587 } 1588} 1589 1590/* Given merged constants, one of which might be 
PC-relative, fix up the M
 * values so the PC-relative constant (if it exists) has the M1=4 modification
 * and other constants are used as-is (which might require swapping) */

static unsigned
bi_apply_constant_modifiers(struct bi_const_state *consts,
                            uint64_t *pairs, unsigned *pcrel_idx,
                            unsigned tuple_count, unsigned constant_count)
{
   unsigned start = bi_ec0_packed(tuple_count) ? 1 : 0;

   /* Clauses with these tuple counts lack an M field for the packed EC0,
    * so EC0 cannot be PC-relative, which might require swapping (and
    * possibly adding an unused constant) to fit */

   if (*pcrel_idx == 0 && (tuple_count == 5 || tuple_count == 8)) {
      constant_count = MAX2(constant_count, 2);
      *pcrel_idx = 1;
      bi_swap_constants(consts, pairs, 0);
   }

   /* EC0 might be packed free, after that constants are packed in pairs
    * (with clause format 12), with M1 values computed from the pair */

   for (unsigned i = start; i < constant_count; i += 2) {
      bool swap = false;
      bool last = (i + 1) == constant_count;

      unsigned A1 = (pairs[i] >> 60);
      unsigned B1 = (pairs[i + 1] >> 60);

      if (*pcrel_idx == i || *pcrel_idx == (i + 1)) {
         /* PC-relative constant must be E0, not E1 */
         swap = (*pcrel_idx == (i + 1));

         /* Set M1 = 4 by noting (A - B) mod 16 = 4 is
          * equivalent to A = (B + 4) mod 16 and that we can
          * control A */
         unsigned B = swap ? A1 : B1;
         unsigned A = (B + 4) & 0xF;
         pairs[*pcrel_idx] |= ((uint64_t) A) << 60;

         /* Swapped if swap set, identity if swap not set */
         *pcrel_idx = i;
      } else {
         /* Compute M1 value if we don't swap */
         unsigned M1 = (16 + A1 - B1) & 0xF;

         /* For M1 = 0 or M1 >= 8, the constants are used unchanged.
          * Otherwise we have 0 < (A1 - B1) % 16 < 8, which implies
          * (B1 - A1) % 16 >= 8, so swapping will let them be used
          * unchanged */
         swap = (M1 != 0) && (M1 < 8);

         /* However, we can't swap the last constant, so we
          * force M1 = 0 instead for this case */
         if (last && swap) {
            pairs[i + 1] |= pairs[i] & (0xfull << 60);
            swap = false;
         }
      }

      if (swap) {
         assert(!last);
         bi_swap_constants(consts, pairs, i);
      }
   }

   return constant_count;
}

/* Schedule a single clause. If no instructions remain, return NULL. */

static bi_clause *
bi_schedule_clause(bi_context *ctx, bi_block *block, struct bi_worklist st, uint64_t *live)
{
   struct bi_clause_state clause_state = { 0 };
   bi_clause *clause = rzalloc(ctx, bi_clause);
   bi_tuple *tuple = NULL;

   const unsigned max_tuples = ARRAY_SIZE(clause->tuples);

   /* TODO: Decide flow control better */
   clause->flow_control = BIFROST_FLOW_NBTB;

   /* The last tuple of the clause may only write a single value, so
    * initialize the register state to reflect that */
   struct bi_reg_state reg_state = {};
   bi_index prev_reads[5] = { bi_null() };
   unsigned nr_prev_reads = 0;

   /* We need to track future liveness. The main *live set tracks what is
    * live at the current point in the program we are scheduling, but to
    * determine temp eligibility, we instead want what will be live after
    * the next tuple in the program. If you scheduled forwards, you'd need
    * a crystal ball for this. Luckily we schedule backwards, so we just
    * delay updates to the live_after_temp by an extra tuple.
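 * (Concretely: while scheduling a tuple, live_after_temp holds the liveness
 * just after the previously scheduled tuple, i.e. the next tuple in program
 * order; each iteration then rotates live_next_tuple into live_after_temp
 * and the freshly updated *live into live_next_tuple, as seen below.)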
*/ 1686 uint64_t live_after_temp = *live; 1687 uint64_t live_next_tuple = live_after_temp; 1688 1689 do { 1690 struct bi_tuple_state tuple_state = { 1691 .last = (clause->tuple_count == 0), 1692 .reg = reg_state, 1693 .nr_prev_reads = nr_prev_reads, 1694 .prev = tuple, 1695 .pcrel_idx = ~0, 1696 }; 1697 1698 assert(nr_prev_reads < ARRAY_SIZE(prev_reads)); 1699 memcpy(tuple_state.prev_reads, prev_reads, sizeof(prev_reads)); 1700 1701 unsigned idx = max_tuples - clause->tuple_count - 1; 1702 1703 tuple = &clause->tuples[idx]; 1704 1705 if (clause->message && bi_opcode_props[clause->message->op].sr_read && !bi_is_null(clause->message->src[0])) { 1706 unsigned nr = bi_count_read_registers(clause->message, 0); 1707 live_after_temp |= (BITFIELD64_MASK(nr) << clause->message->src[0].value); 1708 } 1709 1710 /* Since we schedule backwards, we schedule ADD first */ 1711 tuple_state.add = bi_take_instr(ctx, st, &clause_state, &tuple_state, live_after_temp, false); 1712 tuple->fma = bi_take_instr(ctx, st, &clause_state, &tuple_state, live_after_temp, true); 1713 tuple->add = tuple_state.add; 1714 1715 /* Update liveness from the new instructions */ 1716 if (tuple->add) 1717 *live = bi_postra_liveness_ins(*live, tuple->add); 1718 1719 if (tuple->fma) 1720 *live = bi_postra_liveness_ins(*live, tuple->fma); 1721 1722 /* Rotate in the new per-tuple liveness */ 1723 live_after_temp = live_next_tuple; 1724 live_next_tuple = *live; 1725 1726 /* We may have a message, but only one per clause */ 1727 if (tuple->add && bi_must_message(tuple->add)) { 1728 assert(!clause_state.message); 1729 clause_state.message = true; 1730 1731 clause->message_type = 1732 bi_message_type_for_instr(tuple->add); 1733 clause->message = tuple->add; 1734 1735 /* We don't need to set dependencies for blend shaders 1736 * because the BLEND instruction in the fragment 1737 * shader should have already done the wait */ 1738 if (!ctx->inputs->is_blend) { 1739 switch (tuple->add->op) { 1740 case BI_OPCODE_ATEST: 1741 clause->dependencies |= (1 << BIFROST_SLOT_ELDEST_DEPTH); 1742 break; 1743 case BI_OPCODE_LD_TILE: 1744 case BI_OPCODE_ST_TILE: 1745 clause->dependencies |= (1 << BIFROST_SLOT_ELDEST_COLOUR); 1746 break; 1747 case BI_OPCODE_BLEND: 1748 clause->dependencies |= (1 << BIFROST_SLOT_ELDEST_DEPTH); 1749 clause->dependencies |= (1 << BIFROST_SLOT_ELDEST_COLOUR); 1750 break; 1751 default: 1752 break; 1753 } 1754 } 1755 } 1756 1757 clause_state.consts[idx] = bi_get_const_state(&tuple_state); 1758 1759 /* Before merging constants, eliminate zeroes, otherwise the 1760 * merging will fight over the #0 that never gets read (and is 1761 * never marked as read by update_fau) */ 1762 if (tuple->fma && bi_reads_zero(tuple->fma)) 1763 bi_rewrite_zero(tuple->fma, true); 1764 1765 /* Rewrite away FAU, constant write is deferred */ 1766 if (!tuple_state.constant_count) { 1767 tuple->fau_idx = tuple_state.fau; 1768 bi_rewrite_fau_to_pass(tuple); 1769 } 1770 1771 /* Use passthrough register for cross-stage accesses. 
Since 1772 * there are just FMA and ADD stages, that means we rewrite to 1773 * passthrough the sources of the ADD that read from the 1774 * destination of the FMA */ 1775 1776 if (tuple->fma) { 1777 bi_use_passthrough(tuple->add, tuple->fma->dest[0], 1778 BIFROST_SRC_STAGE, false); 1779 } 1780 1781 /* Don't add an empty tuple, unless the worklist has nothing 1782 * but a (pseudo)instruction failing to schedule due to a "not 1783 * last instruction" constraint */ 1784 1785 int some_instruction = __bitset_ffs(st.worklist, BITSET_WORDS(st.count)); 1786 bool not_last = (some_instruction > 0) && 1787 bi_must_not_last(st.instructions[some_instruction - 1]); 1788 1789 bool insert_empty = tuple_state.last && not_last; 1790 1791 if (!(tuple->fma || tuple->add || insert_empty)) 1792 break; 1793 1794 clause->tuple_count++; 1795 1796 /* Adding enough tuple might overflow constants */ 1797 if (!bi_space_for_more_constants(&clause_state)) 1798 break; 1799 1800#ifndef NDEBUG 1801 /* Don't schedule more than 1 tuple if debugging */ 1802 if ((bifrost_debug & BIFROST_DBG_NOSCHED) && !insert_empty) 1803 break; 1804#endif 1805 1806 /* Link through the register state */ 1807 STATIC_ASSERT(sizeof(prev_reads) == sizeof(tuple_state.reg.reads)); 1808 memcpy(prev_reads, tuple_state.reg.reads, sizeof(prev_reads)); 1809 nr_prev_reads = tuple_state.reg.nr_reads; 1810 clause_state.tuple_count++; 1811 } while(clause->tuple_count < 8); 1812 1813 /* Don't schedule an empty clause */ 1814 if (!clause->tuple_count) 1815 return NULL; 1816 1817 /* Before merging, rewrite away any tuples that read only zero */ 1818 for (unsigned i = max_tuples - clause->tuple_count; i < max_tuples; ++i) { 1819 bi_tuple *tuple = &clause->tuples[i]; 1820 struct bi_const_state *st = &clause_state.consts[i]; 1821 1822 if (st->constant_count == 0 || st->constants[0] || st->constants[1] || st->pcrel) 1823 continue; 1824 1825 bi_foreach_instr_in_tuple(tuple, ins) 1826 bi_rewrite_zero(ins, false); 1827 1828 /* Constant has been demoted to FAU, so don't pack it separately */ 1829 st->constant_count = 0; 1830 1831 /* Default */ 1832 assert(tuple->fau_idx == BIR_FAU_ZERO); 1833 } 1834 1835 uint64_t constant_pairs[8] = { 0 }; 1836 unsigned pcrel_idx = ~0; 1837 unsigned constant_words = 1838 bi_merge_constants(clause_state.consts, constant_pairs, &pcrel_idx); 1839 1840 constant_words = bi_apply_constant_modifiers(clause_state.consts, 1841 constant_pairs, &pcrel_idx, clause->tuple_count, 1842 constant_words); 1843 1844 clause->pcrel_idx = pcrel_idx; 1845 1846 for (unsigned i = max_tuples - clause->tuple_count; i < max_tuples; ++i) { 1847 bi_tuple *tuple = &clause->tuples[i]; 1848 1849 /* If no constants, leave FAU as it is, possibly defaulting to 0 */ 1850 if (clause_state.consts[i].constant_count == 0) 1851 continue; 1852 1853 /* FAU is already handled */ 1854 assert(!tuple->fau_idx); 1855 1856 unsigned word_idx = clause_state.consts[i].word_idx; 1857 assert(word_idx <= 8); 1858 1859 /* We could try to merge regardless of bottom bits as well, but 1860 * that's probably diminishing returns */ 1861 uint64_t pair = constant_pairs[word_idx]; 1862 unsigned lo = pair & 0xF; 1863 1864 tuple->fau_idx = bi_constant_field(word_idx) | lo; 1865 bi_rewrite_constants_to_pass(tuple, pair, word_idx == pcrel_idx); 1866 } 1867 1868 clause->constant_count = constant_words; 1869 memcpy(clause->constants, constant_pairs, sizeof(constant_pairs)); 1870 1871 /* Branches must be last, so this can be factored out */ 1872 bi_instr *last = clause->tuples[max_tuples - 1].add; 1873 
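   /* (The tuples are still packed at the tail of the array at this point,
    * so tuples[max_tuples - 1] is the final tuple executed, and any branch
    * necessarily sits in its ADD slot.) */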
static void
bi_schedule_block(bi_context *ctx, bi_block *block)
{
        list_inithead(&block->clauses);

        /* Copy list to dynamic array */
        struct bi_worklist st = bi_initialize_worklist(block,
                        bifrost_debug & BIFROST_DBG_INORDER,
                        ctx->inputs->is_blend);

        if (!st.count) {
                bi_free_worklist(st);
                return;
        }

        /* We need to track liveness during scheduling in order to determine
         * whether we can use temporary (passthrough) registers */
        uint64_t live = block->reg_live_out;

        /* Schedule as many clauses as needed to fill the block */
        bi_clause *u = NULL;
        while ((u = bi_schedule_clause(ctx, block, st, &live)))
                list_add(&u->link, &block->clauses);

        /* The back-to-back bit only matters for the last clause of a block;
         * for the rest it is implicitly set */
        if (!list_is_empty(&block->clauses)) {
                bi_clause *last_clause = list_last_entry(&block->clauses, bi_clause, link);
                if (bi_reconverge_branches(block))
                        last_clause->flow_control = BIFROST_FLOW_NBTB_UNCONDITIONAL;
        }

        /* Reorder instructions to match the new schedule. First remove
         * existing instructions and then recreate the list */

        bi_foreach_instr_in_block_safe(block, ins) {
                list_del(&ins->link);
        }

        bi_foreach_clause_in_block(block, clause) {
                for (unsigned i = 0; i < clause->tuple_count; ++i) {
                        bi_foreach_instr_in_tuple(&clause->tuples[i], ins) {
                                list_addtail(&ins->link, &block->instructions);
                        }
                }
        }

        block->scheduled = true;

#ifndef NDEBUG
        unsigned i;
        bool incomplete = false;

        BITSET_FOREACH_SET(i, st.worklist, st.count) {
                bi_print_instr(st.instructions[i], stderr);
                incomplete = true;
        }

        if (incomplete)
                unreachable("The above instructions failed to schedule.");
#endif

        bi_free_worklist(st);
}
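
/* Checks whether source s of an instruction can be encoded given the
 * constant/FAU state accumulated so far, updating that state on success. A
 * single instruction may read at most two distinct 32-bit constant words or
 * a single FAU slot (one pair of words), but not both; constant zero gets
 * special treatment on FMA instructions that can read it for free
 * (bi_reads_zero), and staging sources may not read constants or FAU at
 * all. Returns false if the caller must lower the source instead. */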
static bool
bi_check_fau_src(bi_instr *ins, unsigned s, uint32_t *constants,
                unsigned *cwords, bi_index *fau)
{
        bi_index src = ins->src[s];

        /* Staging registers can't have FAU accesses */
        if (bi_is_staging_src(ins, s))
                return (src.type != BI_INDEX_CONSTANT) && (src.type != BI_INDEX_FAU);

        if (src.type == BI_INDEX_CONSTANT) {
                /* Allow fast zero */
                if (src.value == 0 && bi_opcode_props[ins->op].fma && bi_reads_zero(ins))
                        return true;

                if (!bi_is_null(*fau))
                        return false;

                /* Else, try to inline a constant */
                for (unsigned i = 0; i < *cwords; ++i) {
                        if (src.value == constants[i])
                                return true;
                }

                if (*cwords >= 2)
                        return false;

                constants[(*cwords)++] = src.value;
        } else if (src.type == BI_INDEX_FAU) {
                if (*cwords != 0)
                        return false;

                /* Can only read from one pair of FAU words */
                if (!bi_is_null(*fau) && (src.value != fau->value))
                        return false;

                /* If there is a target, we'll need a PC-relative constant */
                if (ins->branch_target)
                        return false;

                *fau = src;
        }

        return true;
}
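
/* Lowers sources that cannot be encoded as-is: for each source that fails
 * bi_check_fau_src, a MOV.i32 of the value is inserted before the
 * instruction and the source is rewritten to read the copy. After this
 * pass, each instruction reads at most the two constant words or single
 * FAU slot that bi_check_fau_src allows. */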
void
bi_lower_fau(bi_context *ctx)
{
        bi_foreach_instr_global_safe(ctx, ins) {
                bi_builder b = bi_init_builder(ctx, bi_before_instr(ins));

                uint32_t constants[2];
                unsigned cwords = 0;
                bi_index fau = bi_null();

                /* ATEST must have the ATEST datum encoded, not any other
                 * uniform. See to it this is the case. */
                if (ins->op == BI_OPCODE_ATEST)
                        fau = ins->src[2];

                /* Dual texturing requires the texture operation descriptor
                 * encoded as an immediate so we can fix it up later. */
                if (ins->op == BI_OPCODE_TEXC) {
                        assert(ins->src[3].type == BI_INDEX_CONSTANT);
                        constants[cwords++] = ins->src[3].value;
                }

                bi_foreach_src(ins, s) {
                        if (bi_check_fau_src(ins, s, constants, &cwords, &fau))
                                continue;

                        bi_index copy = bi_mov_i32(&b, ins->src[s]);
                        ins->src[s] = bi_replace_index(ins->src[s], copy);
                }
        }
}

/* Only v7 allows specifying a dependency on the tilebuffer for the first
 * clause of a shader. v6 requires adding a NOP clause with the dependency. */

static void
bi_add_nop_for_atest(bi_context *ctx)
{
        /* Only needed on v6 */
        if (ctx->arch >= 7)
                return;

        if (list_is_empty(&ctx->blocks))
                return;

        /* Fetch the first clause of the shader */
        bi_block *block = list_first_entry(&ctx->blocks, bi_block, link);
        bi_clause *clause = bi_next_clause(ctx, block, NULL);

        if (!clause || !(clause->dependencies & ((1 << BIFROST_SLOT_ELDEST_DEPTH) |
                                                 (1 << BIFROST_SLOT_ELDEST_COLOUR))))
                return;

        /* Add a NOP so we can wait for the dependencies required by the first
         * clause */

        bi_instr *I = rzalloc(ctx, bi_instr);
        I->op = BI_OPCODE_NOP;
        I->dest[0] = bi_null();

        bi_clause *new_clause = ralloc(ctx, bi_clause);
        *new_clause = (bi_clause) {
                .flow_control = BIFROST_FLOW_NBTB,
                .next_clause_prefetch = true,
                .block = clause->block,

                .tuple_count = 1,
                .tuples[0] = { .fma = I, },
        };

        list_add(&new_clause->link, &clause->block->clauses);
}

void
bi_schedule(bi_context *ctx)
{
        /* Fed into both scheduling and DCE */
        bi_postra_liveness(ctx);

        bi_foreach_block(ctx, block) {
                bi_schedule_block(ctx, block);
        }

        bi_opt_dce_post_ra(ctx);
        bi_add_nop_for_atest(ctx);
}