/*
 * Copyright (C) 2020 Collabora, Ltd.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#include "compiler.h"
#include "bi_quirks.h"

/* This file contains the final passes of the compiler. Running after
 * scheduling and RA, the IR is now finalized, so we need to emit it to actual
 * bits on the wire (as well as fix up branches). */

static uint64_t
bi_pack_header(bi_clause *clause, bi_clause *next_1, bi_clause *next_2)
{
   /* The dependency wait mask is the union of the successors' dependencies */

   unsigned dependency_wait = next_1 ? next_1->dependencies : 0;
   dependency_wait |= next_2 ? next_2->dependencies : 0;

   /* Signal barriers (slot #7) immediately. This is not optimal but good
    * enough. Doing better requires extending the IR and scheduler.
    */
   if (clause->message_type == BIFROST_MESSAGE_BARRIER)
      dependency_wait |= BITFIELD_BIT(7);

   bool staging_barrier = next_1 ? next_1->staging_barrier : false;
   staging_barrier |= next_2 ? next_2->staging_barrier : false;

   struct bifrost_header header = {
      .flow_control = (next_1 == NULL && next_2 == NULL) ?
                      BIFROST_FLOW_END : clause->flow_control,
      .terminate_discarded_threads = clause->td,
      .next_clause_prefetch = clause->next_clause_prefetch && next_1,
      .staging_barrier = staging_barrier,
      .staging_register = clause->staging_register,
      .dependency_wait = dependency_wait,
      .dependency_slot = clause->scoreboard_id,
      .message_type = clause->message_type,
      .next_message_type = next_1 ? next_1->message_type : 0,
      .flush_to_zero = clause->ftz ? BIFROST_FTZ_ALWAYS : BIFROST_FTZ_DISABLE
   };

   uint64_t u = 0;
   memcpy(&u, &header, sizeof(header));
   return u;
}

/* Assigns a slot for reading, before anything is written */

static void
bi_assign_slot_read(bi_registers *regs, bi_index src)
{
   /* We only assign for registers */
   if (src.type != BI_INDEX_REGISTER)
      return;

   /* Check if we already assigned the slot */
   for (unsigned i = 0; i <= 1; ++i) {
      if (regs->slot[i] == src.value && regs->enabled[i])
         return;
   }

   if (regs->slot[2] == src.value && regs->slot23.slot2 == BIFROST_OP_READ)
      return;

   /* Assign it now */

   for (unsigned i = 0; i <= 1; ++i) {
      if (!regs->enabled[i]) {
         regs->slot[i] = src.value;
         regs->enabled[i] = true;
         return;
      }
   }

   if (!regs->slot23.slot3) {
      regs->slot[2] = src.value;
      regs->slot23.slot2 = BIFROST_OP_READ;
      return;
   }

   bi_print_slots(regs, stderr);
   unreachable("Failed to find a free slot for src");
}

static bi_registers
bi_assign_slots(bi_tuple *now, bi_tuple *prev)
{
   /* We assign slots for the main register mechanism. Special ops use the
    * data registers, which have their own mechanism entirely and thus get
    * skipped over here. */

   bool read_dreg = now->add && bi_opcode_props[now->add->op].sr_read;
   bool write_dreg = prev->add && bi_opcode_props[prev->add->op].sr_write;

   /* First, assign reads */

   if (now->fma)
      bi_foreach_src(now->fma, src)
         bi_assign_slot_read(&now->regs, (now->fma)->src[src]);

   if (now->add) {
      bi_foreach_src(now->add, src) {
         /* This is not a real source, we shouldn't assign a slot for it. */
         if (now->add->op == BI_OPCODE_BLEND && src == 4)
            continue;

         if (!(src == 0 && read_dreg))
            bi_assign_slot_read(&now->regs, (now->add)->src[src]);
      }
   }

   /* Next, assign writes. Staging writes are assigned separately, but
    * +ATEST wants its destination written to both a staging register
    * _and_ a regular write, because it may not generate a message */

   if (prev->add && (!write_dreg || prev->add->op == BI_OPCODE_ATEST)) {
      bi_index idx = prev->add->dest[0];

      if (idx.type == BI_INDEX_REGISTER) {
         now->regs.slot[3] = idx.value;
         now->regs.slot23.slot3 = BIFROST_OP_WRITE;
      }
   }

   if (prev->fma) {
      bi_index idx = (prev->fma)->dest[0];

      if (idx.type == BI_INDEX_REGISTER) {
         if (now->regs.slot23.slot3) {
            /* Scheduler constraint: cannot read 3 and write 2 */
            assert(!now->regs.slot23.slot2);
            now->regs.slot[2] = idx.value;
            now->regs.slot23.slot2 = BIFROST_OP_WRITE;
         } else {
            now->regs.slot[3] = idx.value;
            now->regs.slot23.slot3 = BIFROST_OP_WRITE;
            now->regs.slot23.slot3_fma = true;
         }
      }
   }

   return now->regs;
}

static enum bifrost_reg_mode
bi_pack_register_mode(bi_registers r)
{
   /* Handle idle as a special case */
   if (!(r.slot23.slot2 | r.slot23.slot3))
      return r.first_instruction ? BIFROST_IDLE_1 : BIFROST_IDLE;

   /* Otherwise, use the LUT */
   for (unsigned i = 0; i < ARRAY_SIZE(bifrost_reg_ctrl_lut); ++i) {
      if (memcmp(bifrost_reg_ctrl_lut + i, &r.slot23, sizeof(r.slot23)) == 0)
         return i;
   }

   bi_print_slots(&r, stderr);
   unreachable("Invalid slot assignment");
}

static uint64_t
bi_pack_registers(bi_registers regs)
{
   enum bifrost_reg_mode mode = bi_pack_register_mode(regs);
   struct bifrost_regs s = { 0 };
   uint64_t packed = 0;

   /* We need to pack the 5-bit mode into a 4-bit field. The decoder moves
    * bit 3 to bit 4 for the first instruction and adds 16 when
    * reg 2 == reg 3. */

   unsigned ctrl;
   bool r2_equals_r3 = false;

   if (regs.first_instruction) {
      /* Bit 3 implicitly must be clear for first instructions.
       * The affected patterns all write both ADD/FMA, but that
       * is forbidden for the last instruction (whose writes are
       * encoded by the first), so this does not add additional
       * encoding constraints */
      assert(!(mode & 0x8));

      /* Move bit 4 to bit 3, since bit 3 is clear */
      ctrl = (mode & 0x7) | ((mode & 0x10) >> 1);

      /* If we can let r2 equal r3, we have to, or the hardware raises
       * INSTR_INVALID_ENC (it's unclear why). */
      if (!(regs.slot23.slot2 && regs.slot23.slot3))
         r2_equals_r3 = true;
   } else {
      /* We force r2=r3 or not for the upper bit */
      ctrl = (mode & 0xF);
      r2_equals_r3 = (mode & 0x10);
   }

   if (regs.enabled[1]) {
      /* Gotta save that bit!~ Required by the 63-x trick */
      assert(regs.slot[1] > regs.slot[0]);
      assert(regs.enabled[0]);

      /* Do the 63-x trick, see docs/disasm */
      if (regs.slot[0] > 31) {
         regs.slot[0] = 63 - regs.slot[0];
         regs.slot[1] = 63 - regs.slot[1];
      }

      assert(regs.slot[0] <= 31);
      assert(regs.slot[1] <= 63);

      s.ctrl = ctrl;
      s.reg1 = regs.slot[1];
      s.reg0 = regs.slot[0];
   } else {
      /* Slot 1 is disabled, so set it to zero and reuse its field for ctrl */
      s.ctrl = 0;
      s.reg1 = ctrl << 2;

      if (regs.enabled[0]) {
         /* Bit 0 is the upper bit of slot 0 */
         s.reg1 |= (regs.slot[0] >> 5);

         /* Rest of slot 0 in the usual spot */
         s.reg0 = (regs.slot[0] & 0b11111);
      } else {
         /* Bit 1 is set if slot 0 is also disabled */
         s.reg1 |= (1 << 1);
      }
   }

   /* Force r2 =/!= r3 as needed */
   if (r2_equals_r3) {
      assert(regs.slot[3] == regs.slot[2] ||
             !(regs.slot23.slot2 && regs.slot23.slot3));

      if (regs.slot23.slot2)
         regs.slot[3] = regs.slot[2];
      else
         regs.slot[2] = regs.slot[3];
   } else if (!regs.first_instruction) {
      /* Enforced by the encoding anyway */
      assert(regs.slot[2] != regs.slot[3]);
   }

   s.reg2 = regs.slot[2];
   s.reg3 = regs.slot[3];
   s.fau_idx = regs.fau_idx;

   memcpy(&packed, &s, sizeof(s));
   return packed;
}

/* We must ensure slot 1 > slot 0 for the 63-x trick to function, so we fix
 * this up at pack time. (Scheduling doesn't care.)
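 *
 * For example (illustrative): if the reads were assigned slot[0] = 5 and
 * slot[1] = 3, bi_flip_slots swaps them so slot[0] = 3 and slot[1] = 5,
 * satisfying slot[1] > slot[0].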
 */

static void
bi_flip_slots(bi_registers *regs)
{
   if (regs->enabled[0] && regs->enabled[1] && regs->slot[1] < regs->slot[0]) {
      unsigned temp = regs->slot[0];
      regs->slot[0] = regs->slot[1];
      regs->slot[1] = temp;
   }
}

static inline enum bifrost_packed_src
bi_get_src_slot(bi_registers *regs, unsigned reg)
{
   if (regs->slot[0] == reg && regs->enabled[0])
      return BIFROST_SRC_PORT0;
   else if (regs->slot[1] == reg && regs->enabled[1])
      return BIFROST_SRC_PORT1;
   else if (regs->slot[2] == reg && regs->slot23.slot2 == BIFROST_OP_READ)
      return BIFROST_SRC_PORT2;
   else
      unreachable("Tried to access register with no port");
}

static inline enum bifrost_packed_src
bi_get_src_new(bi_instr *ins, bi_registers *regs, unsigned s)
{
   if (!ins)
      return 0;

   bi_index src = ins->src[s];

   if (src.type == BI_INDEX_REGISTER)
      return bi_get_src_slot(regs, src.value);
   else if (src.type == BI_INDEX_PASS)
      return src.value;
   else {
      /* TODO make safer */
      return BIFROST_SRC_STAGE;
   }
}

static struct bi_packed_tuple
bi_pack_tuple(bi_clause *clause, bi_tuple *tuple, bi_tuple *prev,
              bool first_tuple, gl_shader_stage stage)
{
   bi_assign_slots(tuple, prev);
   tuple->regs.fau_idx = tuple->fau_idx;
   tuple->regs.first_instruction = first_tuple;

   bi_flip_slots(&tuple->regs);

   bool sr_read = tuple->add &&
                  bi_opcode_props[(tuple->add)->op].sr_read;

   uint64_t reg = bi_pack_registers(tuple->regs);
   uint64_t fma = bi_pack_fma(tuple->fma,
                              bi_get_src_new(tuple->fma, &tuple->regs, 0),
                              bi_get_src_new(tuple->fma, &tuple->regs, 1),
                              bi_get_src_new(tuple->fma, &tuple->regs, 2),
                              bi_get_src_new(tuple->fma, &tuple->regs, 3));

   uint64_t add = bi_pack_add(tuple->add,
                              bi_get_src_new(tuple->add, &tuple->regs, sr_read + 0),
                              bi_get_src_new(tuple->add, &tuple->regs, sr_read + 1),
                              bi_get_src_new(tuple->add, &tuple->regs, sr_read + 2),
                              0);

   if (tuple->add) {
      bi_instr *add = tuple->add;

      bool sr_write = bi_opcode_props[add->op].sr_write &&
                      !bi_is_null(add->dest[0]);

      if (sr_read && !bi_is_null(add->src[0])) {
         assert(add->src[0].type == BI_INDEX_REGISTER);
         clause->staging_register = add->src[0].value;

         if (sr_write)
            assert(bi_is_equiv(add->src[0], add->dest[0]));
      } else if (sr_write) {
         assert(add->dest[0].type == BI_INDEX_REGISTER);
         clause->staging_register = add->dest[0].value;
      }
   }

   struct bi_packed_tuple packed = {
      .lo = reg | (fma << 35) | ((add & 0b111111) << 58),
      .hi = add >> 6
   };

   return packed;
}

/* A block contains at most one PC-relative constant, from a terminal branch.
 * Find the last instruction and, if it is a relative branch, fix up the
 * PC-relative constant to contain the absolute offset. This occurs at pack
 * time instead of schedule time because the number of quadwords between each
 * block is not known until after all other passes have finished.
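 * (A quadword here is 16 bytes, hence the qwords * 16 conversion below.)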
 */

static void
bi_assign_branch_offset(bi_context *ctx, bi_block *block)
{
   if (list_is_empty(&block->clauses))
      return;

   bi_clause *clause = list_last_entry(&block->clauses, bi_clause, link);
   bi_instr *br = bi_last_instr_in_clause(clause);

   if (!br->branch_target)
      return;

   /* Put it in the high place */
   int32_t qwords = bi_block_offset(ctx, clause, br->branch_target);
   int32_t bytes = qwords * 16;

   /* Copy so we can toy with the sign without undefined behaviour */
   uint32_t raw = 0;
   memcpy(&raw, &bytes, sizeof(raw));

   /* Clear off top bits for A1/B1 bits */
   raw &= ~0xF0000000;

   /* Put in top 32-bits */
   assert(clause->pcrel_idx < 8);
   clause->constants[clause->pcrel_idx] |= ((uint64_t) raw) << 32ull;
}

static void
bi_pack_constants(unsigned tuple_count, uint64_t *constants,
                  unsigned word_idx, unsigned constant_words, bool ec0_packed,
                  struct util_dynarray *emission)
{
   unsigned index = (word_idx << 1) + ec0_packed;

   /* Do more constants follow? */
   bool more = (word_idx + 1) < constant_words;

   /* Indexed first by tuple count and second by constant word number,
    * indicates the position in the clause */
   unsigned pos_lookup[8][3] = {
      { 0 },
      { 1 },
      { 3 },
      { 2, 5 },
      { 4, 8 },
      { 7, 11, 14 },
      { 6, 10, 13 },
      { 9, 12 }
   };

   /* Compute the pos, and check everything is reasonable */
   assert((tuple_count - 1) < 8);
   assert(word_idx < 3);
   unsigned pos = pos_lookup[tuple_count - 1][word_idx];
   assert(pos != 0 || (tuple_count == 1 && word_idx == 0));

   struct bifrost_fmt_constant quad = {
      .pos = pos,
      .tag = more ? BIFROST_FMTC_CONSTANTS : BIFROST_FMTC_FINAL,
      .imm_1 = constants[index + 0] >> 4,
      .imm_2 = constants[index + 1] >> 4,
   };

   util_dynarray_append(emission, struct bifrost_fmt_constant, quad);
}

uint8_t
bi_pack_literal(enum bi_clause_subword literal)
{
   assert(literal >= BI_CLAUSE_SUBWORD_LITERAL_0);
   assert(literal <= BI_CLAUSE_SUBWORD_LITERAL_7);

   return (literal - BI_CLAUSE_SUBWORD_LITERAL_0);
}

static inline uint8_t
bi_clause_upper(unsigned val,
                struct bi_packed_tuple *tuples,
                ASSERTED unsigned tuple_count)
{
   assert(val < tuple_count);

   /* The top 3 bits of the 78-bit tuple are tuple >> 75 == (tuple >> 64) >> 11 */
   struct bi_packed_tuple tuple = tuples[val];
   return (tuple.hi >> 11);
}

uint8_t
bi_pack_upper(enum bi_clause_subword upper,
              struct bi_packed_tuple *tuples,
              ASSERTED unsigned tuple_count)
{
   assert(upper >= BI_CLAUSE_SUBWORD_UPPER_0);
   assert(upper <= BI_CLAUSE_SUBWORD_UPPER_7);

   return bi_clause_upper(upper - BI_CLAUSE_SUBWORD_UPPER_0, tuples,
                          tuple_count);
}

uint64_t
bi_pack_tuple_bits(enum bi_clause_subword idx,
                   struct bi_packed_tuple *tuples,
                   ASSERTED unsigned tuple_count,
                   unsigned offset, unsigned nbits)
{
   assert(idx >= BI_CLAUSE_SUBWORD_TUPLE_0);
   assert(idx <= BI_CLAUSE_SUBWORD_TUPLE_7);

   unsigned val = (idx - BI_CLAUSE_SUBWORD_TUPLE_0);
   assert(val < tuple_count);

   struct bi_packed_tuple tuple = tuples[val];

   assert(offset + nbits < 78);
   assert(nbits <= 64);

   /* (X >> start) & m
    *    = (((hi << 64) | lo) >> start) & m
    *    = (((hi << 64) >> start) | (lo >> start)) & m
    *    = { ((hi << (64 - start)) | (lo >> start)) & m       if start <= 64
    *      { ((hi >> (start - 64)) | (lo >> start)) & m       if start >= 64
    *    = { ((hi << (64 - start)) & m) | ((lo >> start) & m) if start <= 64
    *      { ((hi >> (start - 64)) & m) | ((lo >> start) & m) if start >= 64
    *
    * By setting m = 2^64 - 1, we justify doing the respective shifts as
    * 64-bit integers. Zero special cased to avoid undefined behaviour.
    */

   uint64_t lo = (tuple.lo >> offset);
   uint64_t hi = (offset == 0) ? 0
      : (offset > 64) ? (tuple.hi >> (offset - 64))
      : (tuple.hi << (64 - offset));

   return (lo | hi) & ((1ULL << nbits) - 1);
}

static inline uint16_t
bi_pack_lu(enum bi_clause_subword word,
           struct bi_packed_tuple *tuples,
           ASSERTED unsigned tuple_count)
{
   return (word >= BI_CLAUSE_SUBWORD_UPPER_0) ?
          bi_pack_upper(word, tuples, tuple_count) :
          bi_pack_literal(word);
}

uint8_t
bi_pack_sync(enum bi_clause_subword t1,
             enum bi_clause_subword t2,
             enum bi_clause_subword t3,
             struct bi_packed_tuple *tuples,
             ASSERTED unsigned tuple_count,
             bool z)
{
   uint8_t sync =
      (bi_pack_lu(t3, tuples, tuple_count) << 0) |
      (bi_pack_lu(t2, tuples, tuple_count) << 3);

   if (t1 == BI_CLAUSE_SUBWORD_Z)
      sync |= z << 6;
   else
      sync |= bi_pack_literal(t1) << 6;

   return sync;
}

static inline uint64_t
bi_pack_t_ec(enum bi_clause_subword word,
             struct bi_packed_tuple *tuples,
             ASSERTED unsigned tuple_count,
             uint64_t ec0)
{
   if (word == BI_CLAUSE_SUBWORD_CONSTANT)
      return ec0;
   else
      return bi_pack_tuple_bits(word, tuples, tuple_count, 0, 60);
}

static uint32_t
bi_pack_subwords_56(enum bi_clause_subword t,
                    struct bi_packed_tuple *tuples,
                    ASSERTED unsigned tuple_count,
                    uint64_t header, uint64_t ec0,
                    unsigned tuple_subword)
{
   switch (t) {
   case BI_CLAUSE_SUBWORD_HEADER:
      return (header & ((1 << 30) - 1));
   case BI_CLAUSE_SUBWORD_RESERVED:
      return 0;
   case BI_CLAUSE_SUBWORD_CONSTANT:
      return (ec0 >> 15) & ((1 << 30) - 1);
   default:
      return bi_pack_tuple_bits(t, tuples, tuple_count, tuple_subword * 15, 30);
   }
}

static uint16_t
bi_pack_subword(enum bi_clause_subword t, unsigned format,
                struct bi_packed_tuple *tuples,
                ASSERTED unsigned tuple_count,
                uint64_t header, uint64_t ec0, unsigned m0,
                unsigned tuple_subword)
{
   switch (t) {
   case BI_CLAUSE_SUBWORD_HEADER:
      return header >> 30;
   case BI_CLAUSE_SUBWORD_M:
      return m0;
   case BI_CLAUSE_SUBWORD_CONSTANT:
      return (format == 5 || format == 10) ?
             (ec0 & ((1 << 15) - 1)) :
             (ec0 >> (15 + 30));
   case BI_CLAUSE_SUBWORD_UPPER_23:
      return (bi_clause_upper(2, tuples, tuple_count) << 12) |
             (bi_clause_upper(3, tuples, tuple_count) << 9);
   case BI_CLAUSE_SUBWORD_UPPER_56:
      return (bi_clause_upper(5, tuples, tuple_count) << 12) |
             (bi_clause_upper(6, tuples, tuple_count) << 9);
   case BI_CLAUSE_SUBWORD_UPPER_0 ... BI_CLAUSE_SUBWORD_UPPER_7:
      return bi_pack_upper(t, tuples, tuple_count) << 12;
   default:
      return bi_pack_tuple_bits(t, tuples, tuple_count, tuple_subword * 15, 15);
   }
}

/* EC0 is 60-bits (bottom 4 already shifted off) */
void
bi_pack_format(struct util_dynarray *emission,
               unsigned index,
               struct bi_packed_tuple *tuples,
               ASSERTED unsigned tuple_count,
               uint64_t header, uint64_t ec0,
               unsigned m0, bool z)
{
   struct bi_clause_format format = bi_clause_formats[index];

   uint8_t sync = bi_pack_sync(format.tag_1, format.tag_2, format.tag_3,
                               tuples, tuple_count, z);

   uint64_t s0_s3 = bi_pack_t_ec(format.s0_s3, tuples, tuple_count, ec0);

   uint16_t s4 = bi_pack_subword(format.s4, format.format, tuples, tuple_count,
                                 header, ec0, m0, 4);

   uint32_t s5_s6 = bi_pack_subwords_56(format.s5_s6,
                                        tuples, tuple_count, header, ec0,
                                        (format.format == 2 || format.format == 7) ? 0 : 3);

   uint64_t s7 = bi_pack_subword(format.s7, format.format, tuples, tuple_count,
                                 header, ec0, m0, 2);

   /* Now that subwords are packed, split into 64-bit halves and emit */
   uint64_t lo = sync | ((s0_s3 & ((1ull << 56) - 1)) << 8);
   uint64_t hi = (s0_s3 >> 56) | ((uint64_t) s4 << 4) |
                 ((uint64_t) s5_s6 << 19) | ((uint64_t) s7 << 49);

   util_dynarray_append(emission, uint64_t, lo);
   util_dynarray_append(emission, uint64_t, hi);
}

static void
bi_pack_clause(bi_context *ctx, bi_clause *clause,
               bi_clause *next_1, bi_clause *next_2,
               struct util_dynarray *emission, gl_shader_stage stage)
{
   struct bi_packed_tuple ins[8] = { 0 };

   for (unsigned i = 0; i < clause->tuple_count; ++i) {
      unsigned prev = ((i == 0) ? clause->tuple_count : i) - 1;
      ins[i] = bi_pack_tuple(clause, &clause->tuples[i],
                             &clause->tuples[prev], i == 0, stage);

      bi_instr *add = clause->tuples[i].add;

      /* Different GPUs support different forms of the CLPER.i32
       * instruction. Check we use the right one for the target.
       */
      if (add && add->op == BI_OPCODE_CLPER_OLD_I32)
         assert(ctx->quirks & BIFROST_LIMITED_CLPER);
      else if (add && add->op == BI_OPCODE_CLPER_I32)
         assert(!(ctx->quirks & BIFROST_LIMITED_CLPER));
   }

   bool ec0_packed = bi_ec0_packed(clause->tuple_count);

   if (ec0_packed)
      clause->constant_count = MAX2(clause->constant_count, 1);

   unsigned constant_quads =
      DIV_ROUND_UP(clause->constant_count - (ec0_packed ? 1 : 0), 2);

   uint64_t header = bi_pack_header(clause, next_1, next_2);
   uint64_t ec0 = (clause->constants[0] >> 4);
   unsigned m0 = (clause->pcrel_idx == 0) ? 4 : 0;

   unsigned counts[8] = {
      1, 2, 3, 3, 4, 5, 5, 6
   };

   unsigned indices[8][6] = {
      { 1 },
      { 0, 2 },
      { 0, 3, 4 },
      { 0, 3, 6 },
      { 0, 3, 7, 8 },
      { 0, 3, 5, 9, 10 },
      { 0, 3, 5, 9, 11 },
      { 0, 3, 5, 9, 12, 13 },
   };

   unsigned count = counts[clause->tuple_count - 1];

   for (unsigned pos = 0; pos < count; ++pos) {
      ASSERTED unsigned idx = indices[clause->tuple_count - 1][pos];
      assert(bi_clause_formats[idx].pos == pos);
      assert((bi_clause_formats[idx].tag_1 == BI_CLAUSE_SUBWORD_Z) ==
             (pos == count - 1));

      /* Whether to end the clause immediately after the last tuple */
      bool z = (constant_quads == 0);

      bi_pack_format(emission, indices[clause->tuple_count - 1][pos],
                     ins, clause->tuple_count, header, ec0, m0,
                     z);
   }

   /* Pack the remaining constants */

   for (unsigned pos = 0; pos < constant_quads; ++pos) {
      bi_pack_constants(clause->tuple_count, clause->constants,
                        pos, constant_quads, ec0_packed, emission);
   }
}

static void
bi_collect_blend_ret_addr(bi_context *ctx, struct util_dynarray *emission,
                          const bi_clause *clause)
{
   /* No need to collect return addresses when we're in a blend shader. */
   if (ctx->inputs->is_blend)
      return;

   const bi_tuple *tuple = &clause->tuples[clause->tuple_count - 1];
   const bi_instr *ins = tuple->add;

   if (!ins || ins->op != BI_OPCODE_BLEND)
      return;

   unsigned loc = tuple->regs.fau_idx - BIR_FAU_BLEND_0;
   assert(loc < ARRAY_SIZE(ctx->info.bifrost->blend));
   assert(!ctx->info.bifrost->blend[loc].return_offset);
   ctx->info.bifrost->blend[loc].return_offset =
      util_dynarray_num_elements(emission, uint8_t);
   assert(!(ctx->info.bifrost->blend[loc].return_offset & 0x7));
}

unsigned
bi_pack(bi_context *ctx, struct util_dynarray *emission)
{
   unsigned previous_size = emission->size;

   bi_foreach_block(ctx, block) {
      bi_assign_branch_offset(ctx, block);

      bi_foreach_clause_in_block(block, clause) {
         bool is_last = (clause->link.next == &block->clauses);

         /* Get the succeeding clauses, either two successors of the
          * block for the last clause in the block or just the next
          * clause within the block */

         bi_clause *next = NULL, *next_2 = NULL;

         if (is_last) {
            next = bi_next_clause(ctx, block->successors[0], NULL);
            next_2 = bi_next_clause(ctx, block->successors[1], NULL);
         } else {
            next = bi_next_clause(ctx, block, clause);
         }

         previous_size = emission->size;

         bi_pack_clause(ctx, clause, next, next_2, emission, ctx->stage);

         if (!is_last)
            bi_collect_blend_ret_addr(ctx, emission, clause);
      }
   }

   return emission->size - previous_size;
}