1/* 2 * Copyright 2010 Jerome Glisse <glisse@freedesktop.org> 3 * 4 * Permission is hereby granted, free of charge, to any person obtaining a 5 * copy of this software and associated documentation files (the "Software"), 6 * to deal in the Software without restriction, including without limitation 7 * on the rights to use, copy, modify, merge, publish, distribute, sub 8 * license, and/or sell copies of the Software, and to permit persons to whom 9 * the Software is furnished to do so, subject to the following conditions: 10 * 11 * The above copyright notice and this permission notice (including the next 12 * paragraph) shall be included in all copies or substantial portions of the 13 * Software. 14 * 15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL 18 * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM, 19 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR 20 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE 21 * USE OR OTHER DEALINGS IN THE SOFTWARE. 22 */ 23#include "r600_sq.h" 24#include "r600_opcodes.h" 25#include "r600_formats.h" 26#include "r600_shader.h" 27#include "r600d.h" 28 29#include <errno.h> 30#include "util/u_bitcast.h" 31#include "util/u_dump.h" 32#include "util/u_memory.h" 33#include "util/u_math.h" 34#include "pipe/p_shader_tokens.h" 35 36#include "sb/sb_public.h" 37 38#define NUM_OF_CYCLES 3 39#define NUM_OF_COMPONENTS 4 40 41static inline bool alu_writes(struct r600_bytecode_alu *alu) 42{ 43 return alu->dst.write || alu->is_op3; 44} 45 46static inline unsigned int r600_bytecode_get_num_operands(const struct r600_bytecode_alu *alu) 47{ 48 return r600_isa_alu(alu->op)->src_count; 49} 50 51static struct r600_bytecode_cf *r600_bytecode_cf(void) 52{ 53 struct r600_bytecode_cf *cf = CALLOC_STRUCT(r600_bytecode_cf); 54 55 if (!cf) 56 return NULL; 57 list_inithead(&cf->list); 58 list_inithead(&cf->alu); 59 list_inithead(&cf->vtx); 60 list_inithead(&cf->tex); 61 list_inithead(&cf->gds); 62 return cf; 63} 64 65static struct r600_bytecode_alu *r600_bytecode_alu(void) 66{ 67 struct r600_bytecode_alu *alu = CALLOC_STRUCT(r600_bytecode_alu); 68 69 if (!alu) 70 return NULL; 71 list_inithead(&alu->list); 72 return alu; 73} 74 75static struct r600_bytecode_vtx *r600_bytecode_vtx(void) 76{ 77 struct r600_bytecode_vtx *vtx = CALLOC_STRUCT(r600_bytecode_vtx); 78 79 if (!vtx) 80 return NULL; 81 list_inithead(&vtx->list); 82 return vtx; 83} 84 85static struct r600_bytecode_tex *r600_bytecode_tex(void) 86{ 87 struct r600_bytecode_tex *tex = CALLOC_STRUCT(r600_bytecode_tex); 88 89 if (!tex) 90 return NULL; 91 list_inithead(&tex->list); 92 return tex; 93} 94 95static struct r600_bytecode_gds *r600_bytecode_gds(void) 96{ 97 struct r600_bytecode_gds *gds = CALLOC_STRUCT(r600_bytecode_gds); 98 99 if (gds == NULL) 100 return NULL; 101 list_inithead(&gds->list); 102 return gds; 103} 104 105static unsigned stack_entry_size(enum radeon_family chip) { 106 /* Wavefront size: 107 * 64: R600/RV670/RV770/Cypress/R740/Barts/Turks/Caicos/ 108 * Aruba/Sumo/Sumo2/redwood/juniper 109 * 32: R630/R730/R710/Palm/Cedar 110 * 16: R610/Rs780 111 * 112 * Stack row size: 113 * Wavefront Size 16 32 48 64 114 * Columns per Row (R6xx/R7xx/R8xx only) 8 8 4 4 115 * Columns per Row (R9xx+) 8 4 4 4 */ 116 117 switch (chip) { 118 /* FIXME: are some chips missing here? */ 119 /* wavefront size 16 */ 120 case CHIP_RV610: 121 case CHIP_RS780: 122 case CHIP_RV620: 123 case CHIP_RS880: 124 /* wavefront size 32 */ 125 case CHIP_RV630: 126 case CHIP_RV635: 127 case CHIP_RV730: 128 case CHIP_RV710: 129 case CHIP_PALM: 130 case CHIP_CEDAR: 131 return 8; 132 133 /* wavefront size 64 */ 134 default: 135 return 4; 136 } 137} 138 139void r600_bytecode_init(struct r600_bytecode *bc, 140 enum amd_gfx_level gfx_level, 141 enum radeon_family family, 142 bool has_compressed_msaa_texturing) 143{ 144 static unsigned next_shader_id = 0; 145 146 bc->debug_id = ++next_shader_id; 147 148 if ((gfx_level == R600) && 149 (family != CHIP_RV670 && family != CHIP_RS780 && family != CHIP_RS880)) { 150 bc->ar_handling = AR_HANDLE_RV6XX; 151 152 /* Insert a nop after a relative temp write so that a read in 153 * the following instruction group gets the right value. The 154 * r600 and EG ISA specs both say that read-after-rel-write of a 155 * register in the next instr group is illegal, but apparently 156 * that's not true on all chips (see commit 157 * c96b9834032952492efbd2d1f5511fe225704918). 158 */ 159 bc->r6xx_nop_after_rel_dst = 1; 160 } else if (family == CHIP_RV770) { 161 bc->ar_handling = AR_HANDLE_NORMAL; 162 bc->r6xx_nop_after_rel_dst = 1; 163 } else { 164 bc->ar_handling = AR_HANDLE_NORMAL; 165 bc->r6xx_nop_after_rel_dst = 0; 166 } 167 168 list_inithead(&bc->cf); 169 bc->gfx_level = gfx_level; 170 bc->family = family; 171 bc->has_compressed_msaa_texturing = has_compressed_msaa_texturing; 172 bc->stack.entry_size = stack_entry_size(family); 173} 174 175int r600_bytecode_add_cf(struct r600_bytecode *bc) 176{ 177 struct r600_bytecode_cf *cf = r600_bytecode_cf(); 178 179 if (!cf) 180 return -ENOMEM; 181 list_addtail(&cf->list, &bc->cf); 182 if (bc->cf_last) { 183 cf->id = bc->cf_last->id + 2; 184 if (bc->cf_last->eg_alu_extended) { 185 /* take into account extended alu size */ 186 cf->id += 2; 187 bc->ndw += 2; 188 } 189 } 190 bc->cf_last = cf; 191 bc->ncf++; 192 bc->ndw += 2; 193 bc->force_add_cf = 0; 194 bc->ar_loaded = 0; 195 return 0; 196} 197 198int r600_bytecode_add_output(struct r600_bytecode *bc, 199 const struct r600_bytecode_output *output) 200{ 201 int r; 202 203 if (output->gpr >= bc->ngpr) 204 bc->ngpr = output->gpr + 1; 205 206 if (bc->cf_last && (bc->cf_last->op == output->op || 207 (bc->cf_last->op == CF_OP_EXPORT && 208 output->op == CF_OP_EXPORT_DONE)) && 209 output->type == bc->cf_last->output.type && 210 output->elem_size == bc->cf_last->output.elem_size && 211 output->swizzle_x == bc->cf_last->output.swizzle_x && 212 output->swizzle_y == bc->cf_last->output.swizzle_y && 213 output->swizzle_z == bc->cf_last->output.swizzle_z && 214 output->swizzle_w == bc->cf_last->output.swizzle_w && 215 output->comp_mask == bc->cf_last->output.comp_mask && 216 (output->burst_count + bc->cf_last->output.burst_count) <= 16) { 217 218 if ((output->gpr + output->burst_count) == bc->cf_last->output.gpr && 219 (output->array_base + output->burst_count) == bc->cf_last->output.array_base) { 220 221 bc->cf_last->op = bc->cf_last->output.op = output->op; 222 bc->cf_last->output.gpr = output->gpr; 223 bc->cf_last->output.array_base = output->array_base; 224 bc->cf_last->output.burst_count += output->burst_count; 225 return 0; 226 227 } else if (output->gpr == (bc->cf_last->output.gpr + bc->cf_last->output.burst_count) && 228 output->array_base == (bc->cf_last->output.array_base + bc->cf_last->output.burst_count)) { 229 230 bc->cf_last->op = bc->cf_last->output.op = output->op; 231 bc->cf_last->output.burst_count += output->burst_count; 232 return 0; 233 } 234 } 235 236 r = r600_bytecode_add_cf(bc); 237 if (r) 238 return r; 239 bc->cf_last->op = output->op; 240 memcpy(&bc->cf_last->output, output, sizeof(struct r600_bytecode_output)); 241 bc->cf_last->barrier = 1; 242 return 0; 243} 244 245int r600_bytecode_add_pending_output(struct r600_bytecode *bc, 246 const struct r600_bytecode_output *output) 247{ 248 assert(bc->n_pending_outputs + 1 < ARRAY_SIZE(bc->pending_outputs)); 249 bc->pending_outputs[bc->n_pending_outputs++] = *output; 250 251 return 0; 252} 253 254void 255r600_bytecode_add_ack(struct r600_bytecode *bc) 256{ 257 bc->need_wait_ack = true; 258} 259 260int 261r600_bytecode_wait_acks(struct r600_bytecode *bc) 262{ 263 /* Store acks are an R700+ feature. */ 264 if (bc->gfx_level < R700) 265 return 0; 266 267 if (!bc->need_wait_ack) 268 return 0; 269 270 int ret = r600_bytecode_add_cfinst(bc, CF_OP_WAIT_ACK); 271 if (ret != 0) 272 return ret; 273 274 struct r600_bytecode_cf *cf = bc->cf_last; 275 cf->barrier = 1; 276 /* Request a wait if the number of outstanding acks is > 0 */ 277 cf->cf_addr = 0; 278 279 return 0; 280} 281 282uint32_t 283r600_bytecode_write_export_ack_type(struct r600_bytecode *bc, bool indirect) 284{ 285 if (bc->gfx_level >= R700) { 286 if (indirect) 287 return V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_WRITE_IND_ACK_EG; 288 else 289 return V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_WRITE_ACK_EG; 290 } else { 291 if (indirect) 292 return V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_WRITE_IND; 293 else 294 return V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_WRITE; 295 } 296} 297 298/* alu instructions that can ony exits once per group */ 299static int is_alu_once_inst(struct r600_bytecode_alu *alu) 300{ 301 return r600_isa_alu(alu->op)->flags & (AF_KILL | AF_PRED) || alu->is_lds_idx_op || alu->op == ALU_OP0_GROUP_BARRIER; 302} 303 304static int is_alu_reduction_inst(struct r600_bytecode *bc, struct r600_bytecode_alu *alu) 305{ 306 return (r600_isa_alu(alu->op)->flags & AF_REPL) && 307 (r600_isa_alu_slots(bc->isa->hw_class, alu->op) == AF_4V); 308} 309 310static int is_alu_mova_inst(struct r600_bytecode_alu *alu) 311{ 312 return r600_isa_alu(alu->op)->flags & AF_MOVA; 313} 314 315static int alu_uses_rel(struct r600_bytecode_alu *alu) 316{ 317 unsigned num_src = r600_bytecode_get_num_operands(alu); 318 unsigned src; 319 320 if (alu->dst.rel) { 321 return 1; 322 } 323 324 for (src = 0; src < num_src; ++src) { 325 if (alu->src[src].rel) { 326 return 1; 327 } 328 } 329 return 0; 330} 331 332static int is_lds_read(int sel) 333{ 334 return sel == EG_V_SQ_ALU_SRC_LDS_OQ_A_POP || sel == EG_V_SQ_ALU_SRC_LDS_OQ_B_POP; 335} 336 337static int alu_uses_lds(struct r600_bytecode_alu *alu) 338{ 339 unsigned num_src = r600_bytecode_get_num_operands(alu); 340 unsigned src; 341 342 for (src = 0; src < num_src; ++src) { 343 if (is_lds_read(alu->src[src].sel)) { 344 return 1; 345 } 346 } 347 return 0; 348} 349 350static int is_alu_64bit_inst(struct r600_bytecode_alu *alu) 351{ 352 const struct alu_op_info *op = r600_isa_alu(alu->op); 353 return (op->flags & AF_64); 354} 355 356static int is_alu_vec_unit_inst(struct r600_bytecode *bc, struct r600_bytecode_alu *alu) 357{ 358 unsigned slots = r600_isa_alu_slots(bc->isa->hw_class, alu->op); 359 return !(slots & AF_S); 360} 361 362static int is_alu_trans_unit_inst(struct r600_bytecode *bc, struct r600_bytecode_alu *alu) 363{ 364 unsigned slots = r600_isa_alu_slots(bc->isa->hw_class, alu->op); 365 return !(slots & AF_V); 366} 367 368/* alu instructions that can execute on any unit */ 369static int is_alu_any_unit_inst(struct r600_bytecode *bc, struct r600_bytecode_alu *alu) 370{ 371 unsigned slots = r600_isa_alu_slots(bc->isa->hw_class, alu->op); 372 return slots == AF_VS; 373} 374 375static int is_nop_inst(struct r600_bytecode_alu *alu) 376{ 377 return alu->op == ALU_OP0_NOP; 378} 379 380static int assign_alu_units(struct r600_bytecode *bc, struct r600_bytecode_alu *alu_first, 381 struct r600_bytecode_alu *assignment[5]) 382{ 383 struct r600_bytecode_alu *alu; 384 unsigned i, chan, trans; 385 int max_slots = bc->gfx_level == CAYMAN ? 4 : 5; 386 387 for (i = 0; i < max_slots; i++) 388 assignment[i] = NULL; 389 390 for (alu = alu_first; alu; alu = list_entry(alu->list.next, struct r600_bytecode_alu, list)) { 391 chan = alu->dst.chan; 392 if (max_slots == 4) 393 trans = 0; 394 else if (is_alu_trans_unit_inst(bc, alu)) 395 trans = 1; 396 else if (is_alu_vec_unit_inst(bc, alu)) 397 trans = 0; 398 else if (assignment[chan]) 399 trans = 1; /* Assume ALU_INST_PREFER_VECTOR. */ 400 else 401 trans = 0; 402 403 if (trans) { 404 if (assignment[4]) { 405 assert(0); /* ALU.Trans has already been allocated. */ 406 return -1; 407 } 408 assignment[4] = alu; 409 } else { 410 if (assignment[chan]) { 411 assert(0); /* ALU.chan has already been allocated. */ 412 return -1; 413 } 414 assignment[chan] = alu; 415 } 416 417 if (alu->last) 418 break; 419 } 420 return 0; 421} 422 423struct alu_bank_swizzle { 424 int hw_gpr[NUM_OF_CYCLES][NUM_OF_COMPONENTS]; 425 int hw_cfile_addr[4]; 426 int hw_cfile_elem[4]; 427}; 428 429static const unsigned cycle_for_bank_swizzle_vec[][3] = { 430 [SQ_ALU_VEC_012] = { 0, 1, 2 }, 431 [SQ_ALU_VEC_021] = { 0, 2, 1 }, 432 [SQ_ALU_VEC_120] = { 1, 2, 0 }, 433 [SQ_ALU_VEC_102] = { 1, 0, 2 }, 434 [SQ_ALU_VEC_201] = { 2, 0, 1 }, 435 [SQ_ALU_VEC_210] = { 2, 1, 0 } 436}; 437 438static const unsigned cycle_for_bank_swizzle_scl[][3] = { 439 [SQ_ALU_SCL_210] = { 2, 1, 0 }, 440 [SQ_ALU_SCL_122] = { 1, 2, 2 }, 441 [SQ_ALU_SCL_212] = { 2, 1, 2 }, 442 [SQ_ALU_SCL_221] = { 2, 2, 1 } 443}; 444 445static void init_bank_swizzle(struct alu_bank_swizzle *bs) 446{ 447 int i, cycle, component; 448 /* set up gpr use */ 449 for (cycle = 0; cycle < NUM_OF_CYCLES; cycle++) 450 for (component = 0; component < NUM_OF_COMPONENTS; component++) 451 bs->hw_gpr[cycle][component] = -1; 452 for (i = 0; i < 4; i++) 453 bs->hw_cfile_addr[i] = -1; 454 for (i = 0; i < 4; i++) 455 bs->hw_cfile_elem[i] = -1; 456} 457 458static int reserve_gpr(struct alu_bank_swizzle *bs, unsigned sel, unsigned chan, unsigned cycle) 459{ 460 if (bs->hw_gpr[cycle][chan] == -1) 461 bs->hw_gpr[cycle][chan] = sel; 462 else if (bs->hw_gpr[cycle][chan] != (int)sel) { 463 /* Another scalar operation has already used the GPR read port for the channel. */ 464 return -1; 465 } 466 return 0; 467} 468 469static int reserve_cfile(const struct r600_bytecode *bc, 470 struct alu_bank_swizzle *bs, unsigned sel, unsigned chan) 471{ 472 int res, num_res = 4; 473 if (bc->gfx_level >= R700) { 474 num_res = 2; 475 chan /= 2; 476 } 477 for (res = 0; res < num_res; ++res) { 478 if (bs->hw_cfile_addr[res] == -1) { 479 bs->hw_cfile_addr[res] = sel; 480 bs->hw_cfile_elem[res] = chan; 481 return 0; 482 } else if (bs->hw_cfile_addr[res] == sel && 483 bs->hw_cfile_elem[res] == chan) 484 return 0; /* Read for this scalar element already reserved, nothing to do here. */ 485 } 486 /* All cfile read ports are used, cannot reference vector element. */ 487 return -1; 488} 489 490static int is_gpr(unsigned sel) 491{ 492 return (sel <= 127); 493} 494 495/* CB constants start at 512, and get translated to a kcache index when ALU 496 * clauses are constructed. Note that we handle kcache constants the same way 497 * as (the now gone) cfile constants, is that really required? */ 498static int is_kcache(unsigned sel) 499{ 500 return (sel > 511 && sel < 4607) || /* Kcache before translation. */ 501 (sel > 127 && sel < 192) || /* Kcache 0 & 1 after translation. */ 502 (sel > 256 && sel < 320); /* Kcache 2 & 3 after translation (EG). */ 503} 504 505static int is_const(int sel) 506{ 507 return is_kcache(sel) || 508 (sel >= V_SQ_ALU_SRC_0 && 509 sel <= V_SQ_ALU_SRC_LITERAL); 510} 511 512static int check_vector(const struct r600_bytecode *bc, const struct r600_bytecode_alu *alu, 513 struct alu_bank_swizzle *bs, int bank_swizzle) 514{ 515 int r, src, num_src, sel, elem, cycle; 516 517 num_src = r600_bytecode_get_num_operands(alu); 518 for (src = 0; src < num_src; src++) { 519 sel = alu->src[src].sel; 520 elem = alu->src[src].chan; 521 if (is_gpr(sel)) { 522 cycle = cycle_for_bank_swizzle_vec[bank_swizzle][src]; 523 if (src == 1 && sel == alu->src[0].sel && elem == alu->src[0].chan) 524 /* Nothing to do; special-case optimization, 525 * second source uses first source’s reservation. */ 526 continue; 527 else { 528 r = reserve_gpr(bs, sel, elem, cycle); 529 if (r) 530 return r; 531 } 532 } else if (is_kcache(sel)) { 533 r = reserve_cfile(bc, bs, (alu->src[src].kc_bank<<16) + sel, elem); 534 if (r) 535 return r; 536 } 537 /* No restrictions on PV, PS, literal or special constants. */ 538 } 539 return 0; 540} 541 542static int check_scalar(const struct r600_bytecode *bc, const struct r600_bytecode_alu *alu, 543 struct alu_bank_swizzle *bs, int bank_swizzle) 544{ 545 int r, src, num_src, const_count, sel, elem, cycle; 546 547 num_src = r600_bytecode_get_num_operands(alu); 548 for (const_count = 0, src = 0; src < num_src; ++src) { 549 sel = alu->src[src].sel; 550 elem = alu->src[src].chan; 551 if (is_const(sel)) { /* Any constant, including literal and inline constants. */ 552 if (const_count >= 2) 553 /* More than two references to a constant in 554 * transcendental operation. */ 555 return -1; 556 else 557 const_count++; 558 } 559 if (is_kcache(sel)) { 560 r = reserve_cfile(bc, bs, (alu->src[src].kc_bank<<16) + sel, elem); 561 if (r) 562 return r; 563 } 564 } 565 for (src = 0; src < num_src; ++src) { 566 sel = alu->src[src].sel; 567 elem = alu->src[src].chan; 568 if (is_gpr(sel)) { 569 cycle = cycle_for_bank_swizzle_scl[bank_swizzle][src]; 570 if (cycle < const_count) 571 /* Cycle for GPR load conflicts with 572 * constant load in transcendental operation. */ 573 return -1; 574 r = reserve_gpr(bs, sel, elem, cycle); 575 if (r) 576 return r; 577 } 578 /* PV PS restrictions */ 579 if (const_count && (sel == 254 || sel == 255)) { 580 cycle = cycle_for_bank_swizzle_scl[bank_swizzle][src]; 581 if (cycle < const_count) 582 return -1; 583 } 584 } 585 return 0; 586} 587 588static int check_and_set_bank_swizzle(const struct r600_bytecode *bc, 589 struct r600_bytecode_alu *slots[5]) 590{ 591 struct alu_bank_swizzle bs; 592 int bank_swizzle[5]; 593 int i, r = 0, forced = 1; 594 boolean scalar_only = bc->gfx_level == CAYMAN ? false : true; 595 int max_slots = bc->gfx_level == CAYMAN ? 4 : 5; 596 int max_checks = max_slots * 1000; 597 598 for (i = 0; i < max_slots; i++) { 599 if (slots[i]) { 600 if (slots[i]->bank_swizzle_force) { 601 slots[i]->bank_swizzle = slots[i]->bank_swizzle_force; 602 } else { 603 forced = 0; 604 } 605 } 606 607 if (i < 4 && slots[i]) 608 scalar_only = false; 609 } 610 if (forced) 611 return 0; 612 613 /* Just check every possible combination of bank swizzle. 614 * Not very efficent, but works on the first try in most of the cases. */ 615 for (i = 0; i < 4; i++) 616 if (!slots[i] || !slots[i]->bank_swizzle_force || slots[i]->is_lds_idx_op) 617 bank_swizzle[i] = SQ_ALU_VEC_012; 618 else 619 bank_swizzle[i] = slots[i]->bank_swizzle; 620 621 bank_swizzle[4] = SQ_ALU_SCL_210; 622 623 while(bank_swizzle[4] <= SQ_ALU_SCL_221 && max_checks--) { 624 init_bank_swizzle(&bs); 625 if (scalar_only == false) { 626 for (i = 0; i < 4; i++) { 627 if (slots[i]) { 628 r = check_vector(bc, slots[i], &bs, bank_swizzle[i]); 629 if (r) 630 break; 631 } 632 } 633 } else 634 r = 0; 635 636 if (!r && max_slots == 5 && slots[4]) { 637 r = check_scalar(bc, slots[4], &bs, bank_swizzle[4]); 638 } 639 if (!r) { 640 for (i = 0; i < max_slots; i++) { 641 if (slots[i]) 642 slots[i]->bank_swizzle = bank_swizzle[i]; 643 } 644 return 0; 645 } 646 647 if (scalar_only) { 648 bank_swizzle[4]++; 649 } else { 650 for (i = 0; i < max_slots; i++) { 651 if (!slots[i] || (!slots[i]->bank_swizzle_force && !slots[i]->is_lds_idx_op)) { 652 bank_swizzle[i]++; 653 if (bank_swizzle[i] <= SQ_ALU_VEC_210) 654 break; 655 else if (i < max_slots - 1) 656 bank_swizzle[i] = SQ_ALU_VEC_012; 657 else 658 return -1; 659 } 660 } 661 } 662 } 663 664 /* Couldn't find a working swizzle. */ 665 return -1; 666} 667 668static int replace_gpr_with_pv_ps(struct r600_bytecode *bc, 669 struct r600_bytecode_alu *slots[5], struct r600_bytecode_alu *alu_prev) 670{ 671 struct r600_bytecode_alu *prev[5]; 672 int gpr[5], chan[5]; 673 int i, j, r, src, num_src; 674 int max_slots = bc->gfx_level == CAYMAN ? 4 : 5; 675 676 r = assign_alu_units(bc, alu_prev, prev); 677 if (r) 678 return r; 679 680 for (i = 0; i < max_slots; ++i) { 681 if (prev[i] && alu_writes(prev[i]) && !prev[i]->dst.rel) { 682 683 if (is_alu_64bit_inst(prev[i])) { 684 gpr[i] = -1; 685 continue; 686 } 687 688 gpr[i] = prev[i]->dst.sel; 689 /* cube writes more than PV.X */ 690 if (is_alu_reduction_inst(bc, prev[i])) 691 chan[i] = 0; 692 else 693 chan[i] = prev[i]->dst.chan; 694 } else 695 gpr[i] = -1; 696 } 697 698 for (i = 0; i < max_slots; ++i) { 699 struct r600_bytecode_alu *alu = slots[i]; 700 if (!alu) 701 continue; 702 703 if (is_alu_64bit_inst(alu)) 704 continue; 705 num_src = r600_bytecode_get_num_operands(alu); 706 for (src = 0; src < num_src; ++src) { 707 if (!is_gpr(alu->src[src].sel) || alu->src[src].rel) 708 continue; 709 710 if (bc->gfx_level < CAYMAN) { 711 if (alu->src[src].sel == gpr[4] && 712 alu->src[src].chan == chan[4] && 713 alu_prev->pred_sel == alu->pred_sel) { 714 alu->src[src].sel = V_SQ_ALU_SRC_PS; 715 alu->src[src].chan = 0; 716 continue; 717 } 718 } 719 720 for (j = 0; j < 4; ++j) { 721 if (alu->src[src].sel == gpr[j] && 722 alu->src[src].chan == j && 723 alu_prev->pred_sel == alu->pred_sel) { 724 alu->src[src].sel = V_SQ_ALU_SRC_PV; 725 alu->src[src].chan = chan[j]; 726 break; 727 } 728 } 729 } 730 } 731 732 return 0; 733} 734 735void r600_bytecode_special_constants(uint32_t value, unsigned *sel) 736{ 737 switch(value) { 738 case 0: 739 *sel = V_SQ_ALU_SRC_0; 740 break; 741 case 1: 742 *sel = V_SQ_ALU_SRC_1_INT; 743 break; 744 case -1: 745 *sel = V_SQ_ALU_SRC_M_1_INT; 746 break; 747 case 0x3F800000: /* 1.0f */ 748 *sel = V_SQ_ALU_SRC_1; 749 break; 750 case 0x3F000000: /* 0.5f */ 751 *sel = V_SQ_ALU_SRC_0_5; 752 break; 753 default: 754 *sel = V_SQ_ALU_SRC_LITERAL; 755 break; 756 } 757} 758 759/* compute how many literal are needed */ 760static int r600_bytecode_alu_nliterals(struct r600_bytecode_alu *alu, 761 uint32_t literal[4], unsigned *nliteral) 762{ 763 unsigned num_src = r600_bytecode_get_num_operands(alu); 764 unsigned i, j; 765 766 for (i = 0; i < num_src; ++i) { 767 if (alu->src[i].sel == V_SQ_ALU_SRC_LITERAL) { 768 uint32_t value = alu->src[i].value; 769 unsigned found = 0; 770 for (j = 0; j < *nliteral; ++j) { 771 if (literal[j] == value) { 772 found = 1; 773 break; 774 } 775 } 776 if (!found) { 777 if (*nliteral >= 4) 778 return -EINVAL; 779 literal[(*nliteral)++] = value; 780 } 781 } 782 } 783 return 0; 784} 785 786static void r600_bytecode_alu_adjust_literals(struct r600_bytecode_alu *alu, 787 uint32_t literal[4], unsigned nliteral) 788{ 789 unsigned num_src = r600_bytecode_get_num_operands(alu); 790 unsigned i, j; 791 792 for (i = 0; i < num_src; ++i) { 793 if (alu->src[i].sel == V_SQ_ALU_SRC_LITERAL) { 794 uint32_t value = alu->src[i].value; 795 for (j = 0; j < nliteral; ++j) { 796 if (literal[j] == value) { 797 alu->src[i].chan = j; 798 break; 799 } 800 } 801 } 802 } 803} 804 805static int merge_inst_groups(struct r600_bytecode *bc, struct r600_bytecode_alu *slots[5], 806 struct r600_bytecode_alu *alu_prev) 807{ 808 struct r600_bytecode_alu *prev[5]; 809 struct r600_bytecode_alu *result[5] = { NULL }; 810 811 uint8_t interp_xz = 0; 812 813 uint32_t literal[4], prev_literal[4]; 814 unsigned nliteral = 0, prev_nliteral = 0; 815 816 int i, j, r, src, num_src; 817 int num_once_inst = 0; 818 int have_mova = 0, have_rel = 0; 819 int max_slots = bc->gfx_level == CAYMAN ? 4 : 5; 820 821 r = assign_alu_units(bc, alu_prev, prev); 822 if (r) 823 return r; 824 825 for (i = 0; i < max_slots; ++i) { 826 if (prev[i]) { 827 if (prev[i]->pred_sel) 828 return 0; 829 if (is_alu_once_inst(prev[i])) 830 return 0; 831 832 if (prev[i]->op == ALU_OP1_INTERP_LOAD_P0) 833 interp_xz |= 3; 834 if (prev[i]->op == ALU_OP2_INTERP_X) 835 interp_xz |= 1; 836 if (prev[i]->op == ALU_OP2_INTERP_Z) 837 interp_xz |= 2; 838 } 839 if (slots[i]) { 840 if (slots[i]->pred_sel) 841 return 0; 842 if (is_alu_once_inst(slots[i])) 843 return 0; 844 if (slots[i]->op == ALU_OP1_INTERP_LOAD_P0) 845 interp_xz |= 3; 846 if (slots[i]->op == ALU_OP2_INTERP_X) 847 interp_xz |= 1; 848 if (slots[i]->op == ALU_OP2_INTERP_Z) 849 interp_xz |= 2; 850 } 851 if (interp_xz == 3) 852 return 0; 853 } 854 855 for (i = 0; i < max_slots; ++i) { 856 struct r600_bytecode_alu *alu; 857 858 if (num_once_inst > 0) 859 return 0; 860 861 /* check number of literals */ 862 if (prev[i]) { 863 if (r600_bytecode_alu_nliterals(prev[i], literal, &nliteral)) 864 return 0; 865 if (r600_bytecode_alu_nliterals(prev[i], prev_literal, &prev_nliteral)) 866 return 0; 867 if (is_alu_mova_inst(prev[i])) { 868 if (have_rel) 869 return 0; 870 have_mova = 1; 871 } 872 873 if (alu_uses_rel(prev[i])) { 874 if (have_mova) { 875 return 0; 876 } 877 have_rel = 1; 878 } 879 if (alu_uses_lds(prev[i])) 880 return 0; 881 882 num_once_inst += is_alu_once_inst(prev[i]); 883 } 884 if (slots[i] && r600_bytecode_alu_nliterals(slots[i], literal, &nliteral)) 885 return 0; 886 887 /* Let's check used slots. */ 888 if (prev[i] && !slots[i]) { 889 result[i] = prev[i]; 890 continue; 891 } else if (prev[i] && slots[i]) { 892 if (max_slots == 5 && result[4] == NULL && prev[4] == NULL && slots[4] == NULL) { 893 /* Trans unit is still free try to use it. */ 894 if (is_alu_any_unit_inst(bc, slots[i]) && !alu_uses_lds(slots[i])) { 895 result[i] = prev[i]; 896 result[4] = slots[i]; 897 } else if (is_alu_any_unit_inst(bc, prev[i])) { 898 if (slots[i]->dst.sel == prev[i]->dst.sel && 899 alu_writes(slots[i]) && 900 alu_writes(prev[i])) 901 return 0; 902 903 result[i] = slots[i]; 904 result[4] = prev[i]; 905 } else 906 return 0; 907 } else 908 return 0; 909 } else if(!slots[i]) { 910 continue; 911 } else { 912 if (max_slots == 5 && slots[i] && prev[4] && 913 slots[i]->dst.sel == prev[4]->dst.sel && 914 slots[i]->dst.chan == prev[4]->dst.chan && 915 alu_writes(slots[i]) && 916 alu_writes(prev[4])) 917 return 0; 918 919 result[i] = slots[i]; 920 } 921 922 alu = slots[i]; 923 num_once_inst += is_alu_once_inst(alu); 924 925 /* don't reschedule NOPs */ 926 if (is_nop_inst(alu)) 927 return 0; 928 929 if (is_alu_mova_inst(alu)) { 930 if (have_rel) { 931 return 0; 932 } 933 have_mova = 1; 934 } 935 936 if (alu_uses_rel(alu)) { 937 if (have_mova) { 938 return 0; 939 } 940 have_rel = 1; 941 } 942 943 if (alu->op == ALU_OP0_SET_CF_IDX0 || 944 alu->op == ALU_OP0_SET_CF_IDX1) 945 return 0; /* data hazard with MOVA */ 946 947 /* Let's check source gprs */ 948 num_src = r600_bytecode_get_num_operands(alu); 949 for (src = 0; src < num_src; ++src) { 950 951 /* Constants don't matter. */ 952 if (!is_gpr(alu->src[src].sel)) 953 continue; 954 955 for (j = 0; j < max_slots; ++j) { 956 if (!prev[j] || !alu_writes(prev[j])) 957 continue; 958 959 /* If it's relative then we can't determin which gpr is really used. */ 960 if (prev[j]->dst.chan == alu->src[src].chan && 961 (prev[j]->dst.sel == alu->src[src].sel || 962 prev[j]->dst.rel || alu->src[src].rel)) 963 return 0; 964 } 965 } 966 } 967 968 /* more than one PRED_ or KILL_ ? */ 969 if (num_once_inst > 1) 970 return 0; 971 972 /* check if the result can still be swizzlet */ 973 r = check_and_set_bank_swizzle(bc, result); 974 if (r) 975 return 0; 976 977 /* looks like everything worked out right, apply the changes */ 978 979 /* undo adding previus literals */ 980 bc->cf_last->ndw -= align(prev_nliteral, 2); 981 982 /* sort instructions */ 983 for (i = 0; i < max_slots; ++i) { 984 slots[i] = result[i]; 985 if (result[i]) { 986 list_del(&result[i]->list); 987 result[i]->last = 0; 988 list_addtail(&result[i]->list, &bc->cf_last->alu); 989 } 990 } 991 992 /* determine new last instruction */ 993 list_entry(bc->cf_last->alu.prev, struct r600_bytecode_alu, list)->last = 1; 994 995 /* determine new first instruction */ 996 for (i = 0; i < max_slots; ++i) { 997 if (result[i]) { 998 bc->cf_last->curr_bs_head = result[i]; 999 break; 1000 } 1001 } 1002 1003 bc->cf_last->prev_bs_head = bc->cf_last->prev2_bs_head; 1004 bc->cf_last->prev2_bs_head = NULL; 1005 1006 return 0; 1007} 1008 1009/* we'll keep kcache sets sorted by bank & addr */ 1010static int r600_bytecode_alloc_kcache_line(struct r600_bytecode *bc, 1011 struct r600_bytecode_kcache *kcache, 1012 unsigned bank, unsigned line, unsigned index_mode) 1013{ 1014 int i, kcache_banks = bc->gfx_level >= EVERGREEN ? 4 : 2; 1015 1016 for (i = 0; i < kcache_banks; i++) { 1017 if (kcache[i].mode) { 1018 int d; 1019 1020 if (kcache[i].bank < bank) 1021 continue; 1022 1023 if ((kcache[i].bank == bank && kcache[i].addr > line+1) || 1024 kcache[i].bank > bank) { 1025 /* try to insert new line */ 1026 if (kcache[kcache_banks-1].mode) { 1027 /* all sets are in use */ 1028 return -ENOMEM; 1029 } 1030 1031 memmove(&kcache[i+1],&kcache[i], (kcache_banks-i-1)*sizeof(struct r600_bytecode_kcache)); 1032 kcache[i].mode = V_SQ_CF_KCACHE_LOCK_1; 1033 kcache[i].bank = bank; 1034 kcache[i].addr = line; 1035 kcache[i].index_mode = index_mode; 1036 return 0; 1037 } 1038 1039 d = line - kcache[i].addr; 1040 1041 if (d == -1) { 1042 kcache[i].addr--; 1043 if (kcache[i].mode == V_SQ_CF_KCACHE_LOCK_2) { 1044 /* we are prepending the line to the current set, 1045 * discarding the existing second line, 1046 * so we'll have to insert line+2 after it */ 1047 line += 2; 1048 continue; 1049 } else if (kcache[i].mode == V_SQ_CF_KCACHE_LOCK_1) { 1050 kcache[i].mode = V_SQ_CF_KCACHE_LOCK_2; 1051 return 0; 1052 } else { 1053 /* V_SQ_CF_KCACHE_LOCK_LOOP_INDEX is not supported */ 1054 return -ENOMEM; 1055 } 1056 } else if (d == 1) { 1057 kcache[i].mode = V_SQ_CF_KCACHE_LOCK_2; 1058 return 0; 1059 } else if (d == 0) 1060 return 0; 1061 } else { /* free kcache set - use it */ 1062 kcache[i].mode = V_SQ_CF_KCACHE_LOCK_1; 1063 kcache[i].bank = bank; 1064 kcache[i].addr = line; 1065 kcache[i].index_mode = index_mode; 1066 return 0; 1067 } 1068 } 1069 return -ENOMEM; 1070} 1071 1072static int r600_bytecode_alloc_inst_kcache_lines(struct r600_bytecode *bc, 1073 struct r600_bytecode_kcache *kcache, 1074 struct r600_bytecode_alu *alu) 1075{ 1076 int i, r; 1077 1078 for (i = 0; i < 3; i++) { 1079 unsigned bank, line, sel = alu->src[i].sel, index_mode; 1080 1081 if (sel < 512) 1082 continue; 1083 1084 bank = alu->src[i].kc_bank; 1085 assert(bank < R600_MAX_HW_CONST_BUFFERS); 1086 line = (sel-512)>>4; 1087 index_mode = alu->src[i].kc_rel ? 1 : 0; // V_SQ_CF_INDEX_0 / V_SQ_CF_INDEX_NONE 1088 1089 if ((r = r600_bytecode_alloc_kcache_line(bc, kcache, bank, line, index_mode))) 1090 return r; 1091 } 1092 return 0; 1093} 1094 1095static int r600_bytecode_assign_kcache_banks( 1096 struct r600_bytecode_alu *alu, 1097 struct r600_bytecode_kcache * kcache) 1098{ 1099 int i, j; 1100 1101 /* Alter the src operands to refer to the kcache. */ 1102 for (i = 0; i < 3; ++i) { 1103 static const unsigned int base[] = {128, 160, 256, 288}; 1104 unsigned int line, sel = alu->src[i].sel, found = 0; 1105 1106 if (sel < 512) 1107 continue; 1108 1109 sel -= 512; 1110 line = sel>>4; 1111 1112 for (j = 0; j < 4 && !found; ++j) { 1113 switch (kcache[j].mode) { 1114 case V_SQ_CF_KCACHE_NOP: 1115 case V_SQ_CF_KCACHE_LOCK_LOOP_INDEX: 1116 R600_ERR("unexpected kcache line mode\n"); 1117 return -ENOMEM; 1118 default: 1119 if (kcache[j].bank == alu->src[i].kc_bank && 1120 kcache[j].addr <= line && 1121 line < kcache[j].addr + kcache[j].mode) { 1122 alu->src[i].sel = sel - (kcache[j].addr<<4); 1123 alu->src[i].sel += base[j]; 1124 found=1; 1125 } 1126 } 1127 } 1128 } 1129 return 0; 1130} 1131 1132static int r600_bytecode_alloc_kcache_lines(struct r600_bytecode *bc, 1133 struct r600_bytecode_alu *alu, 1134 unsigned type) 1135{ 1136 struct r600_bytecode_kcache kcache_sets[4]; 1137 struct r600_bytecode_kcache *kcache = kcache_sets; 1138 int r; 1139 1140 memcpy(kcache, bc->cf_last->kcache, 4 * sizeof(struct r600_bytecode_kcache)); 1141 1142 if ((r = r600_bytecode_alloc_inst_kcache_lines(bc, kcache, alu))) { 1143 /* can't alloc, need to start new clause */ 1144 1145 /* Make sure the CF ends with an "last" instruction when 1146 * we split an ALU group because of a new CF */ 1147 if (!list_is_empty(&bc->cf_last->alu)) { 1148 struct r600_bytecode_alu *last_submitted = 1149 list_last_entry(&bc->cf_last->alu, struct r600_bytecode_alu, list); 1150 last_submitted->last = 1; 1151 } 1152 1153 if ((r = r600_bytecode_add_cf(bc))) { 1154 return r; 1155 } 1156 bc->cf_last->op = type; 1157 1158 /* retry with the new clause */ 1159 kcache = bc->cf_last->kcache; 1160 if ((r = r600_bytecode_alloc_inst_kcache_lines(bc, kcache, alu))) { 1161 /* can't alloc again- should never happen */ 1162 return r; 1163 } 1164 } else { 1165 /* update kcache sets */ 1166 memcpy(bc->cf_last->kcache, kcache, 4 * sizeof(struct r600_bytecode_kcache)); 1167 } 1168 1169 /* if we actually used more than 2 kcache sets, or have relative indexing - use ALU_EXTENDED on eg+ */ 1170 if (kcache[2].mode != V_SQ_CF_KCACHE_NOP || 1171 kcache[0].index_mode || kcache[1].index_mode || kcache[2].index_mode || kcache[3].index_mode) { 1172 if (bc->gfx_level < EVERGREEN) 1173 return -ENOMEM; 1174 bc->cf_last->eg_alu_extended = 1; 1175 } 1176 1177 return 0; 1178} 1179 1180static int insert_nop_r6xx(struct r600_bytecode *bc, int max_slots) 1181{ 1182 struct r600_bytecode_alu alu; 1183 int r, i; 1184 1185 for (i = 0; i < max_slots; i++) { 1186 memset(&alu, 0, sizeof(alu)); 1187 alu.op = ALU_OP0_NOP; 1188 alu.src[0].chan = i & 3; 1189 alu.dst.chan = i & 3; 1190 alu.last = (i == max_slots - 1); 1191 r = r600_bytecode_add_alu(bc, &alu); 1192 if (r) 1193 return r; 1194 } 1195 return 0; 1196} 1197 1198/* load AR register from gpr (bc->ar_reg) with MOVA_INT */ 1199static int load_ar_r6xx(struct r600_bytecode *bc, bool for_src) 1200{ 1201 struct r600_bytecode_alu alu; 1202 int r; 1203 1204 if (bc->ar_loaded) 1205 return 0; 1206 1207 /* hack to avoid making MOVA the last instruction in the clause */ 1208 if ((bc->cf_last->ndw>>1) >= 110) 1209 bc->force_add_cf = 1; 1210 else if (for_src) { 1211 insert_nop_r6xx(bc, 4); 1212 bc->nalu_groups++; 1213 } 1214 1215 memset(&alu, 0, sizeof(alu)); 1216 alu.op = ALU_OP1_MOVA_GPR_INT; 1217 alu.src[0].sel = bc->ar_reg; 1218 alu.src[0].chan = bc->ar_chan; 1219 alu.last = 1; 1220 alu.index_mode = INDEX_MODE_LOOP; 1221 r = r600_bytecode_add_alu(bc, &alu); 1222 if (r) 1223 return r; 1224 1225 /* no requirement to set uses waterfall on MOVA_GPR_INT */ 1226 bc->ar_loaded = 1; 1227 return 0; 1228} 1229 1230/* load AR register from gpr (bc->ar_reg) with MOVA_INT */ 1231int r600_load_ar(struct r600_bytecode *bc, bool for_src) 1232{ 1233 struct r600_bytecode_alu alu; 1234 int r; 1235 1236 if (bc->ar_handling) 1237 return load_ar_r6xx(bc, for_src); 1238 1239 if (bc->ar_loaded) 1240 return 0; 1241 1242 /* hack to avoid making MOVA the last instruction in the clause */ 1243 if ((bc->cf_last->ndw>>1) >= 110) 1244 bc->force_add_cf = 1; 1245 1246 memset(&alu, 0, sizeof(alu)); 1247 alu.op = ALU_OP1_MOVA_INT; 1248 alu.src[0].sel = bc->ar_reg; 1249 alu.src[0].chan = bc->ar_chan; 1250 alu.last = 1; 1251 r = r600_bytecode_add_alu(bc, &alu); 1252 if (r) 1253 return r; 1254 1255 bc->cf_last->r6xx_uses_waterfall = 1; 1256 bc->ar_loaded = 1; 1257 return 0; 1258} 1259 1260int r600_bytecode_add_alu_type(struct r600_bytecode *bc, 1261 const struct r600_bytecode_alu *alu, unsigned type) 1262{ 1263 struct r600_bytecode_alu *nalu = r600_bytecode_alu(); 1264 struct r600_bytecode_alu *lalu; 1265 int i, r; 1266 1267 if (!nalu) 1268 return -ENOMEM; 1269 memcpy(nalu, alu, sizeof(struct r600_bytecode_alu)); 1270 1271 if (alu->is_op3) { 1272 /* will fail later since alu does not support it. */ 1273 assert(!alu->src[0].abs && !alu->src[1].abs && !alu->src[2].abs); 1274 } 1275 1276 if (bc->cf_last != NULL && bc->cf_last->op != type) { 1277 /* check if we could add it anyway */ 1278 if ((bc->cf_last->op == CF_OP_ALU && type == CF_OP_ALU_PUSH_BEFORE) || 1279 (bc->cf_last->op == CF_OP_ALU_PUSH_BEFORE && type == CF_OP_ALU)) { 1280 LIST_FOR_EACH_ENTRY(lalu, &bc->cf_last->alu, list) { 1281 if (lalu->execute_mask) { 1282 bc->force_add_cf = 1; 1283 break; 1284 } 1285 type = CF_OP_ALU_PUSH_BEFORE; 1286 } 1287 } else 1288 bc->force_add_cf = 1; 1289 } 1290 1291 /* cf can contains only alu or only vtx or only tex */ 1292 if (bc->cf_last == NULL || bc->force_add_cf) { 1293 if (bc->cf_last && bc->cf_last->curr_bs_head) 1294 bc->cf_last->curr_bs_head->last = 1; 1295 r = r600_bytecode_add_cf(bc); 1296 if (r) { 1297 free(nalu); 1298 return r; 1299 } 1300 } 1301 bc->cf_last->op = type; 1302 1303 /* Load index register if required */ 1304 if (bc->gfx_level >= EVERGREEN) { 1305 for (i = 0; i < 3; i++) 1306 if (nalu->src[i].kc_bank && nalu->src[i].kc_rel) 1307 egcm_load_index_reg(bc, 0, true); 1308 } 1309 1310 /* Check AR usage and load it if required */ 1311 for (i = 0; i < 3; i++) 1312 if (nalu->src[i].rel && !bc->ar_loaded) 1313 r600_load_ar(bc, true); 1314 1315 if (nalu->dst.rel && !bc->ar_loaded) 1316 r600_load_ar(bc, false); 1317 1318 /* Setup the kcache for this ALU instruction. This will start a new 1319 * ALU clause if needed. */ 1320 if ((r = r600_bytecode_alloc_kcache_lines(bc, nalu, type))) { 1321 free(nalu); 1322 return r; 1323 } 1324 1325 if (!bc->cf_last->curr_bs_head) { 1326 bc->cf_last->curr_bs_head = nalu; 1327 } 1328 /* number of gpr == the last gpr used in any alu */ 1329 for (i = 0; i < 3; i++) { 1330 if (nalu->src[i].sel >= bc->ngpr && nalu->src[i].sel < 128) { 1331 bc->ngpr = nalu->src[i].sel + 1; 1332 } 1333 if (nalu->src[i].sel == V_SQ_ALU_SRC_LITERAL) 1334 r600_bytecode_special_constants(nalu->src[i].value, 1335 &nalu->src[i].sel); 1336 } 1337 if (nalu->dst.write && nalu->dst.sel >= bc->ngpr) { 1338 bc->ngpr = nalu->dst.sel + 1; 1339 } 1340 list_addtail(&nalu->list, &bc->cf_last->alu); 1341 /* each alu use 2 dwords */ 1342 bc->cf_last->ndw += 2; 1343 bc->ndw += 2; 1344 1345 /* process cur ALU instructions for bank swizzle */ 1346 if (nalu->last) { 1347 uint32_t literal[4]; 1348 unsigned nliteral; 1349 struct r600_bytecode_alu *slots[5]; 1350 int max_slots = bc->gfx_level == CAYMAN ? 4 : 5; 1351 r = assign_alu_units(bc, bc->cf_last->curr_bs_head, slots); 1352 if (r) 1353 return r; 1354 1355 if (bc->cf_last->prev_bs_head) { 1356 struct r600_bytecode_alu *cur_prev_head = bc->cf_last->prev_bs_head; 1357 r = merge_inst_groups(bc, slots, cur_prev_head); 1358 if (r) 1359 return r; 1360 if (cur_prev_head != bc->cf_last->prev_bs_head) 1361 bc->nalu_groups--; 1362 } 1363 1364 if (bc->cf_last->prev_bs_head) { 1365 r = replace_gpr_with_pv_ps(bc, slots, bc->cf_last->prev_bs_head); 1366 if (r) 1367 return r; 1368 } 1369 1370 r = check_and_set_bank_swizzle(bc, slots); 1371 if (r) 1372 return r; 1373 1374 for (i = 0, nliteral = 0; i < max_slots; i++) { 1375 if (slots[i]) { 1376 r = r600_bytecode_alu_nliterals(slots[i], literal, &nliteral); 1377 if (r) 1378 return r; 1379 } 1380 } 1381 bc->cf_last->ndw += align(nliteral, 2); 1382 1383 /* at most 128 slots, one add alu can add 5 slots + 4 constants(2 slots) 1384 * worst case */ 1385 if ((bc->cf_last->ndw >> 1) >= 120) { 1386 bc->force_add_cf = 1; 1387 } 1388 1389 bc->cf_last->prev2_bs_head = bc->cf_last->prev_bs_head; 1390 bc->cf_last->prev_bs_head = bc->cf_last->curr_bs_head; 1391 bc->cf_last->curr_bs_head = NULL; 1392 1393 bc->nalu_groups++; 1394 1395 if (bc->r6xx_nop_after_rel_dst) { 1396 for (int i = 0; i < max_slots; ++i) { 1397 if (slots[i] && slots[i]->dst.rel) { 1398 insert_nop_r6xx(bc, max_slots); 1399 bc->nalu_groups++; 1400 break; 1401 } 1402 } 1403 } 1404 } 1405 1406 /* Might need to insert spill write ops after current clause */ 1407 if (nalu->last && bc->n_pending_outputs) { 1408 while (bc->n_pending_outputs) { 1409 r = r600_bytecode_add_output(bc, &bc->pending_outputs[--bc->n_pending_outputs]); 1410 if (r) 1411 return r; 1412 } 1413 } 1414 1415 return 0; 1416} 1417 1418int r600_bytecode_add_alu(struct r600_bytecode *bc, const struct r600_bytecode_alu *alu) 1419{ 1420 return r600_bytecode_add_alu_type(bc, alu, CF_OP_ALU); 1421} 1422 1423static unsigned r600_bytecode_num_tex_and_vtx_instructions(const struct r600_bytecode *bc) 1424{ 1425 switch (bc->gfx_level) { 1426 case R600: 1427 return 8; 1428 1429 case R700: 1430 case EVERGREEN: 1431 case CAYMAN: 1432 return 16; 1433 1434 default: 1435 R600_ERR("Unknown gfx level %d.\n", bc->gfx_level); 1436 return 8; 1437 } 1438} 1439 1440static inline boolean last_inst_was_not_vtx_fetch(struct r600_bytecode *bc) 1441{ 1442 return !((r600_isa_cf(bc->cf_last->op)->flags & CF_FETCH) && 1443 bc->cf_last->op != CF_OP_GDS && 1444 (bc->gfx_level == CAYMAN || 1445 bc->cf_last->op != CF_OP_TEX)); 1446} 1447 1448static int r600_bytecode_add_vtx_internal(struct r600_bytecode *bc, const struct r600_bytecode_vtx *vtx, 1449 bool use_tc) 1450{ 1451 struct r600_bytecode_vtx *nvtx = r600_bytecode_vtx(); 1452 int r; 1453 1454 if (!nvtx) 1455 return -ENOMEM; 1456 memcpy(nvtx, vtx, sizeof(struct r600_bytecode_vtx)); 1457 1458 /* Load index register if required */ 1459 if (bc->gfx_level >= EVERGREEN) { 1460 if (vtx->buffer_index_mode) 1461 egcm_load_index_reg(bc, vtx->buffer_index_mode - 1, false); 1462 } 1463 1464 /* cf can contains only alu or only vtx or only tex */ 1465 if (bc->cf_last == NULL || 1466 last_inst_was_not_vtx_fetch(bc) || 1467 bc->force_add_cf) { 1468 r = r600_bytecode_add_cf(bc); 1469 if (r) { 1470 free(nvtx); 1471 return r; 1472 } 1473 switch (bc->gfx_level) { 1474 case R600: 1475 case R700: 1476 bc->cf_last->op = CF_OP_VTX; 1477 break; 1478 case EVERGREEN: 1479 if (use_tc) 1480 bc->cf_last->op = CF_OP_TEX; 1481 else 1482 bc->cf_last->op = CF_OP_VTX; 1483 break; 1484 case CAYMAN: 1485 bc->cf_last->op = CF_OP_TEX; 1486 break; 1487 default: 1488 R600_ERR("Unknown gfx level %d.\n", bc->gfx_level); 1489 free(nvtx); 1490 return -EINVAL; 1491 } 1492 } 1493 list_addtail(&nvtx->list, &bc->cf_last->vtx); 1494 /* each fetch use 4 dwords */ 1495 bc->cf_last->ndw += 4; 1496 bc->ndw += 4; 1497 if ((bc->cf_last->ndw / 4) >= r600_bytecode_num_tex_and_vtx_instructions(bc)) 1498 bc->force_add_cf = 1; 1499 1500 bc->ngpr = MAX2(bc->ngpr, vtx->src_gpr + 1); 1501 bc->ngpr = MAX2(bc->ngpr, vtx->dst_gpr + 1); 1502 1503 return 0; 1504} 1505 1506int r600_bytecode_add_vtx(struct r600_bytecode *bc, const struct r600_bytecode_vtx *vtx) 1507{ 1508 return r600_bytecode_add_vtx_internal(bc, vtx, false); 1509} 1510 1511int r600_bytecode_add_vtx_tc(struct r600_bytecode *bc, const struct r600_bytecode_vtx *vtx) 1512{ 1513 return r600_bytecode_add_vtx_internal(bc, vtx, true); 1514} 1515 1516int r600_bytecode_add_tex(struct r600_bytecode *bc, const struct r600_bytecode_tex *tex) 1517{ 1518 struct r600_bytecode_tex *ntex = r600_bytecode_tex(); 1519 int r; 1520 1521 if (!ntex) 1522 return -ENOMEM; 1523 memcpy(ntex, tex, sizeof(struct r600_bytecode_tex)); 1524 1525 /* Load index register if required */ 1526 if (bc->gfx_level >= EVERGREEN) { 1527 if (tex->sampler_index_mode || tex->resource_index_mode) 1528 egcm_load_index_reg(bc, 1, false); 1529 } 1530 1531 /* we can't fetch data und use it as texture lookup address in the same TEX clause */ 1532 if (bc->cf_last != NULL && 1533 bc->cf_last->op == CF_OP_TEX) { 1534 struct r600_bytecode_tex *ttex; 1535 LIST_FOR_EACH_ENTRY(ttex, &bc->cf_last->tex, list) { 1536 if (ttex->dst_gpr == ntex->src_gpr && 1537 (ttex->dst_sel_x < 4 || ttex->dst_sel_y < 4 || 1538 ttex->dst_sel_z < 4 || ttex->dst_sel_w < 4)) { 1539 bc->force_add_cf = 1; 1540 break; 1541 } 1542 } 1543 /* vtx instrs get inserted after tex, so make sure we aren't moving the tex 1544 * before (say) the instr fetching the texcoord. 1545 */ 1546 if (!list_is_empty(&bc->cf_last->vtx)) 1547 bc->force_add_cf = 1; 1548 1549 /* slight hack to make gradients always go into same cf */ 1550 if (ntex->op == FETCH_OP_SET_GRADIENTS_H) 1551 bc->force_add_cf = 1; 1552 } 1553 1554 /* cf can contains only alu or only vtx or only tex */ 1555 if (bc->cf_last == NULL || 1556 bc->cf_last->op != CF_OP_TEX || 1557 bc->force_add_cf) { 1558 r = r600_bytecode_add_cf(bc); 1559 if (r) { 1560 free(ntex); 1561 return r; 1562 } 1563 bc->cf_last->op = CF_OP_TEX; 1564 } 1565 if (ntex->src_gpr >= bc->ngpr) { 1566 bc->ngpr = ntex->src_gpr + 1; 1567 } 1568 if (ntex->dst_gpr >= bc->ngpr) { 1569 bc->ngpr = ntex->dst_gpr + 1; 1570 } 1571 list_addtail(&ntex->list, &bc->cf_last->tex); 1572 /* each texture fetch use 4 dwords */ 1573 bc->cf_last->ndw += 4; 1574 bc->ndw += 4; 1575 if ((bc->cf_last->ndw / 4) >= r600_bytecode_num_tex_and_vtx_instructions(bc)) 1576 bc->force_add_cf = 1; 1577 return 0; 1578} 1579 1580int r600_bytecode_add_gds(struct r600_bytecode *bc, const struct r600_bytecode_gds *gds) 1581{ 1582 struct r600_bytecode_gds *ngds = r600_bytecode_gds(); 1583 int r; 1584 1585 if (ngds == NULL) 1586 return -ENOMEM; 1587 memcpy(ngds, gds, sizeof(struct r600_bytecode_gds)); 1588 1589 if (bc->gfx_level >= EVERGREEN) { 1590 if (gds->uav_index_mode) 1591 egcm_load_index_reg(bc, gds->uav_index_mode - 1, false); 1592 } 1593 1594 if (bc->cf_last == NULL || 1595 bc->cf_last->op != CF_OP_GDS || 1596 bc->force_add_cf) { 1597 r = r600_bytecode_add_cf(bc); 1598 if (r) { 1599 free(ngds); 1600 return r; 1601 } 1602 bc->cf_last->op = CF_OP_GDS; 1603 } 1604 1605 list_addtail(&ngds->list, &bc->cf_last->gds); 1606 bc->cf_last->ndw += 4; /* each GDS uses 4 dwords */ 1607 if ((bc->cf_last->ndw / 4) >= r600_bytecode_num_tex_and_vtx_instructions(bc)) 1608 bc->force_add_cf = 1; 1609 return 0; 1610} 1611 1612int r600_bytecode_add_cfinst(struct r600_bytecode *bc, unsigned op) 1613{ 1614 int r; 1615 1616 /* Emit WAIT_ACK before control flow to ensure pending writes are always acked. */ 1617 if (op != CF_OP_WAIT_ACK && op != CF_OP_MEM_SCRATCH) 1618 r600_bytecode_wait_acks(bc); 1619 1620 r = r600_bytecode_add_cf(bc); 1621 if (r) 1622 return r; 1623 1624 bc->cf_last->cond = V_SQ_CF_COND_ACTIVE; 1625 bc->cf_last->op = op; 1626 return 0; 1627} 1628 1629int cm_bytecode_add_cf_end(struct r600_bytecode *bc) 1630{ 1631 return r600_bytecode_add_cfinst(bc, CF_OP_CF_END); 1632} 1633 1634/* common to all 3 families */ 1635static int r600_bytecode_vtx_build(struct r600_bytecode *bc, struct r600_bytecode_vtx *vtx, unsigned id) 1636{ 1637 if (r600_isa_fetch(vtx->op)->flags & FF_MEM) 1638 return r700_bytecode_fetch_mem_build(bc, vtx, id); 1639 bc->bytecode[id] = S_SQ_VTX_WORD0_VTX_INST(r600_isa_fetch_opcode(bc->isa->hw_class, vtx->op)) | 1640 S_SQ_VTX_WORD0_BUFFER_ID(vtx->buffer_id) | 1641 S_SQ_VTX_WORD0_FETCH_TYPE(vtx->fetch_type) | 1642 S_SQ_VTX_WORD0_SRC_GPR(vtx->src_gpr) | 1643 S_SQ_VTX_WORD0_SRC_SEL_X(vtx->src_sel_x); 1644 if (bc->gfx_level < CAYMAN) 1645 bc->bytecode[id] |= S_SQ_VTX_WORD0_MEGA_FETCH_COUNT(vtx->mega_fetch_count); 1646 id++; 1647 bc->bytecode[id++] = S_SQ_VTX_WORD1_DST_SEL_X(vtx->dst_sel_x) | 1648 S_SQ_VTX_WORD1_DST_SEL_Y(vtx->dst_sel_y) | 1649 S_SQ_VTX_WORD1_DST_SEL_Z(vtx->dst_sel_z) | 1650 S_SQ_VTX_WORD1_DST_SEL_W(vtx->dst_sel_w) | 1651 S_SQ_VTX_WORD1_USE_CONST_FIELDS(vtx->use_const_fields) | 1652 S_SQ_VTX_WORD1_DATA_FORMAT(vtx->data_format) | 1653 S_SQ_VTX_WORD1_NUM_FORMAT_ALL(vtx->num_format_all) | 1654 S_SQ_VTX_WORD1_FORMAT_COMP_ALL(vtx->format_comp_all) | 1655 S_SQ_VTX_WORD1_SRF_MODE_ALL(vtx->srf_mode_all) | 1656 S_SQ_VTX_WORD1_GPR_DST_GPR(vtx->dst_gpr); 1657 bc->bytecode[id] = S_SQ_VTX_WORD2_OFFSET(vtx->offset)| 1658 S_SQ_VTX_WORD2_ENDIAN_SWAP(vtx->endian); 1659 if (bc->gfx_level >= EVERGREEN) 1660 bc->bytecode[id] |= ((vtx->buffer_index_mode & 0x3) << 21); // S_SQ_VTX_WORD2_BIM(vtx->buffer_index_mode); 1661 if (bc->gfx_level < CAYMAN) 1662 bc->bytecode[id] |= S_SQ_VTX_WORD2_MEGA_FETCH(1); 1663 id++; 1664 bc->bytecode[id++] = 0; 1665 return 0; 1666} 1667 1668/* common to all 3 families */ 1669static int r600_bytecode_tex_build(struct r600_bytecode *bc, struct r600_bytecode_tex *tex, unsigned id) 1670{ 1671 bc->bytecode[id] = S_SQ_TEX_WORD0_TEX_INST( 1672 r600_isa_fetch_opcode(bc->isa->hw_class, tex->op)) | 1673 EG_S_SQ_TEX_WORD0_INST_MOD(tex->inst_mod) | 1674 S_SQ_TEX_WORD0_RESOURCE_ID(tex->resource_id) | 1675 S_SQ_TEX_WORD0_SRC_GPR(tex->src_gpr) | 1676 S_SQ_TEX_WORD0_SRC_REL(tex->src_rel); 1677 if (bc->gfx_level >= EVERGREEN) 1678 bc->bytecode[id] |= ((tex->sampler_index_mode & 0x3) << 27) | // S_SQ_TEX_WORD0_SIM(tex->sampler_index_mode); 1679 ((tex->resource_index_mode & 0x3) << 25); // S_SQ_TEX_WORD0_RIM(tex->resource_index_mode) 1680 id++; 1681 bc->bytecode[id++] = S_SQ_TEX_WORD1_DST_GPR(tex->dst_gpr) | 1682 S_SQ_TEX_WORD1_DST_REL(tex->dst_rel) | 1683 S_SQ_TEX_WORD1_DST_SEL_X(tex->dst_sel_x) | 1684 S_SQ_TEX_WORD1_DST_SEL_Y(tex->dst_sel_y) | 1685 S_SQ_TEX_WORD1_DST_SEL_Z(tex->dst_sel_z) | 1686 S_SQ_TEX_WORD1_DST_SEL_W(tex->dst_sel_w) | 1687 S_SQ_TEX_WORD1_LOD_BIAS(tex->lod_bias) | 1688 S_SQ_TEX_WORD1_COORD_TYPE_X(tex->coord_type_x) | 1689 S_SQ_TEX_WORD1_COORD_TYPE_Y(tex->coord_type_y) | 1690 S_SQ_TEX_WORD1_COORD_TYPE_Z(tex->coord_type_z) | 1691 S_SQ_TEX_WORD1_COORD_TYPE_W(tex->coord_type_w); 1692 bc->bytecode[id++] = S_SQ_TEX_WORD2_OFFSET_X(tex->offset_x) | 1693 S_SQ_TEX_WORD2_OFFSET_Y(tex->offset_y) | 1694 S_SQ_TEX_WORD2_OFFSET_Z(tex->offset_z) | 1695 S_SQ_TEX_WORD2_SAMPLER_ID(tex->sampler_id) | 1696 S_SQ_TEX_WORD2_SRC_SEL_X(tex->src_sel_x) | 1697 S_SQ_TEX_WORD2_SRC_SEL_Y(tex->src_sel_y) | 1698 S_SQ_TEX_WORD2_SRC_SEL_Z(tex->src_sel_z) | 1699 S_SQ_TEX_WORD2_SRC_SEL_W(tex->src_sel_w); 1700 bc->bytecode[id++] = 0; 1701 return 0; 1702} 1703 1704/* r600 only, r700/eg bits in r700_asm.c */ 1705static int r600_bytecode_alu_build(struct r600_bytecode *bc, struct r600_bytecode_alu *alu, unsigned id) 1706{ 1707 unsigned opcode = r600_isa_alu_opcode(bc->isa->hw_class, alu->op); 1708 1709 /* don't replace gpr by pv or ps for destination register */ 1710 bc->bytecode[id++] = S_SQ_ALU_WORD0_SRC0_SEL(alu->src[0].sel) | 1711 S_SQ_ALU_WORD0_SRC0_REL(alu->src[0].rel) | 1712 S_SQ_ALU_WORD0_SRC0_CHAN(alu->src[0].chan) | 1713 S_SQ_ALU_WORD0_SRC0_NEG(alu->src[0].neg) | 1714 S_SQ_ALU_WORD0_SRC1_SEL(alu->src[1].sel) | 1715 S_SQ_ALU_WORD0_SRC1_REL(alu->src[1].rel) | 1716 S_SQ_ALU_WORD0_SRC1_CHAN(alu->src[1].chan) | 1717 S_SQ_ALU_WORD0_SRC1_NEG(alu->src[1].neg) | 1718 S_SQ_ALU_WORD0_INDEX_MODE(alu->index_mode) | 1719 S_SQ_ALU_WORD0_PRED_SEL(alu->pred_sel) | 1720 S_SQ_ALU_WORD0_LAST(alu->last); 1721 1722 if (alu->is_op3) { 1723 assert(!alu->src[0].abs && !alu->src[1].abs && !alu->src[2].abs); 1724 bc->bytecode[id++] = S_SQ_ALU_WORD1_DST_GPR(alu->dst.sel) | 1725 S_SQ_ALU_WORD1_DST_CHAN(alu->dst.chan) | 1726 S_SQ_ALU_WORD1_DST_REL(alu->dst.rel) | 1727 S_SQ_ALU_WORD1_CLAMP(alu->dst.clamp) | 1728 S_SQ_ALU_WORD1_OP3_SRC2_SEL(alu->src[2].sel) | 1729 S_SQ_ALU_WORD1_OP3_SRC2_REL(alu->src[2].rel) | 1730 S_SQ_ALU_WORD1_OP3_SRC2_CHAN(alu->src[2].chan) | 1731 S_SQ_ALU_WORD1_OP3_SRC2_NEG(alu->src[2].neg) | 1732 S_SQ_ALU_WORD1_OP3_ALU_INST(opcode) | 1733 S_SQ_ALU_WORD1_BANK_SWIZZLE(alu->bank_swizzle); 1734 } else { 1735 bc->bytecode[id++] = S_SQ_ALU_WORD1_DST_GPR(alu->dst.sel) | 1736 S_SQ_ALU_WORD1_DST_CHAN(alu->dst.chan) | 1737 S_SQ_ALU_WORD1_DST_REL(alu->dst.rel) | 1738 S_SQ_ALU_WORD1_CLAMP(alu->dst.clamp) | 1739 S_SQ_ALU_WORD1_OP2_SRC0_ABS(alu->src[0].abs) | 1740 S_SQ_ALU_WORD1_OP2_SRC1_ABS(alu->src[1].abs) | 1741 S_SQ_ALU_WORD1_OP2_WRITE_MASK(alu->dst.write) | 1742 S_SQ_ALU_WORD1_OP2_OMOD(alu->omod) | 1743 S_SQ_ALU_WORD1_OP2_ALU_INST(opcode) | 1744 S_SQ_ALU_WORD1_BANK_SWIZZLE(alu->bank_swizzle) | 1745 S_SQ_ALU_WORD1_OP2_UPDATE_EXECUTE_MASK(alu->execute_mask) | 1746 S_SQ_ALU_WORD1_OP2_UPDATE_PRED(alu->update_pred); 1747 } 1748 return 0; 1749} 1750 1751static void r600_bytecode_cf_vtx_build(uint32_t *bytecode, const struct r600_bytecode_cf *cf) 1752{ 1753 *bytecode++ = S_SQ_CF_WORD0_ADDR(cf->addr >> 1); 1754 *bytecode++ = S_SQ_CF_WORD1_CF_INST(r600_isa_cf_opcode(ISA_CC_R600, cf->op)) | 1755 S_SQ_CF_WORD1_BARRIER(1) | 1756 S_SQ_CF_WORD1_COUNT((cf->ndw / 4) - 1)| 1757 S_SQ_CF_WORD1_END_OF_PROGRAM(cf->end_of_program); 1758} 1759 1760/* common for r600/r700 - eg in eg_asm.c */ 1761static int r600_bytecode_cf_build(struct r600_bytecode *bc, struct r600_bytecode_cf *cf) 1762{ 1763 unsigned id = cf->id; 1764 const struct cf_op_info *cfop = r600_isa_cf(cf->op); 1765 unsigned opcode = r600_isa_cf_opcode(bc->isa->hw_class, cf->op); 1766 1767 1768 if (cf->op == CF_NATIVE) { 1769 bc->bytecode[id++] = cf->isa[0]; 1770 bc->bytecode[id++] = cf->isa[1]; 1771 } else if (cfop->flags & CF_ALU) { 1772 bc->bytecode[id++] = S_SQ_CF_ALU_WORD0_ADDR(cf->addr >> 1) | 1773 S_SQ_CF_ALU_WORD0_KCACHE_MODE0(cf->kcache[0].mode) | 1774 S_SQ_CF_ALU_WORD0_KCACHE_BANK0(cf->kcache[0].bank) | 1775 S_SQ_CF_ALU_WORD0_KCACHE_BANK1(cf->kcache[1].bank); 1776 1777 bc->bytecode[id++] = S_SQ_CF_ALU_WORD1_CF_INST(opcode) | 1778 S_SQ_CF_ALU_WORD1_KCACHE_MODE1(cf->kcache[1].mode) | 1779 S_SQ_CF_ALU_WORD1_KCACHE_ADDR0(cf->kcache[0].addr) | 1780 S_SQ_CF_ALU_WORD1_KCACHE_ADDR1(cf->kcache[1].addr) | 1781 S_SQ_CF_ALU_WORD1_BARRIER(1) | 1782 S_SQ_CF_ALU_WORD1_USES_WATERFALL(bc->gfx_level == R600 ? cf->r6xx_uses_waterfall : 0) | 1783 S_SQ_CF_ALU_WORD1_COUNT((cf->ndw / 2) - 1); 1784 } else if (cfop->flags & CF_FETCH) { 1785 if (bc->gfx_level == R700) 1786 r700_bytecode_cf_vtx_build(&bc->bytecode[id], cf); 1787 else 1788 r600_bytecode_cf_vtx_build(&bc->bytecode[id], cf); 1789 } else if (cfop->flags & CF_EXP) { 1790 bc->bytecode[id++] = S_SQ_CF_ALLOC_EXPORT_WORD0_RW_GPR(cf->output.gpr) | 1791 S_SQ_CF_ALLOC_EXPORT_WORD0_ELEM_SIZE(cf->output.elem_size) | 1792 S_SQ_CF_ALLOC_EXPORT_WORD0_ARRAY_BASE(cf->output.array_base) | 1793 S_SQ_CF_ALLOC_EXPORT_WORD0_TYPE(cf->output.type) | 1794 S_SQ_CF_ALLOC_EXPORT_WORD0_INDEX_GPR(cf->output.index_gpr); 1795 bc->bytecode[id++] = S_SQ_CF_ALLOC_EXPORT_WORD1_BURST_COUNT(cf->output.burst_count - 1) | 1796 S_SQ_CF_ALLOC_EXPORT_WORD1_SWIZ_SEL_X(cf->output.swizzle_x) | 1797 S_SQ_CF_ALLOC_EXPORT_WORD1_SWIZ_SEL_Y(cf->output.swizzle_y) | 1798 S_SQ_CF_ALLOC_EXPORT_WORD1_SWIZ_SEL_Z(cf->output.swizzle_z) | 1799 S_SQ_CF_ALLOC_EXPORT_WORD1_SWIZ_SEL_W(cf->output.swizzle_w) | 1800 S_SQ_CF_ALLOC_EXPORT_WORD1_BARRIER(cf->barrier) | 1801 S_SQ_CF_ALLOC_EXPORT_WORD1_CF_INST(opcode) | 1802 S_SQ_CF_ALLOC_EXPORT_WORD1_END_OF_PROGRAM(cf->end_of_program); 1803 } else if (cfop->flags & CF_MEM) { 1804 bc->bytecode[id++] = S_SQ_CF_ALLOC_EXPORT_WORD0_RW_GPR(cf->output.gpr) | 1805 S_SQ_CF_ALLOC_EXPORT_WORD0_ELEM_SIZE(cf->output.elem_size) | 1806 S_SQ_CF_ALLOC_EXPORT_WORD0_ARRAY_BASE(cf->output.array_base) | 1807 S_SQ_CF_ALLOC_EXPORT_WORD0_TYPE(cf->output.type) | 1808 S_SQ_CF_ALLOC_EXPORT_WORD0_INDEX_GPR(cf->output.index_gpr); 1809 bc->bytecode[id++] = S_SQ_CF_ALLOC_EXPORT_WORD1_BURST_COUNT(cf->output.burst_count - 1) | 1810 S_SQ_CF_ALLOC_EXPORT_WORD1_BARRIER(cf->barrier) | 1811 S_SQ_CF_ALLOC_EXPORT_WORD1_CF_INST(opcode) | 1812 S_SQ_CF_ALLOC_EXPORT_WORD1_END_OF_PROGRAM(cf->end_of_program) | 1813 S_SQ_CF_ALLOC_EXPORT_WORD1_BUF_ARRAY_SIZE(cf->output.array_size) | 1814 S_SQ_CF_ALLOC_EXPORT_WORD1_BUF_COMP_MASK(cf->output.comp_mask); 1815 } else { 1816 bc->bytecode[id++] = S_SQ_CF_WORD0_ADDR(cf->cf_addr >> 1); 1817 bc->bytecode[id++] = S_SQ_CF_WORD1_CF_INST(opcode) | 1818 S_SQ_CF_WORD1_BARRIER(1) | 1819 S_SQ_CF_WORD1_COND(cf->cond) | 1820 S_SQ_CF_WORD1_POP_COUNT(cf->pop_count) | 1821 S_SQ_CF_WORD1_END_OF_PROGRAM(cf->end_of_program); 1822 } 1823 return 0; 1824} 1825 1826int r600_bytecode_build(struct r600_bytecode *bc) 1827{ 1828 struct r600_bytecode_cf *cf; 1829 struct r600_bytecode_alu *alu; 1830 struct r600_bytecode_vtx *vtx; 1831 struct r600_bytecode_tex *tex; 1832 struct r600_bytecode_gds *gds; 1833 uint32_t literal[4]; 1834 unsigned nliteral; 1835 unsigned addr; 1836 int i, r; 1837 1838 if (!bc->nstack) { // If not 0, Stack_size already provided by llvm 1839 if (bc->stack.max_entries) 1840 bc->nstack = bc->stack.max_entries; 1841 else if (bc->type == PIPE_SHADER_VERTEX || 1842 bc->type == PIPE_SHADER_TESS_EVAL || 1843 bc->type == PIPE_SHADER_TESS_CTRL) 1844 bc->nstack = 1; 1845 } 1846 1847 /* first path compute addr of each CF block */ 1848 /* addr start after all the CF instructions */ 1849 addr = bc->cf_last->id + 2; 1850 LIST_FOR_EACH_ENTRY(cf, &bc->cf, list) { 1851 if (r600_isa_cf(cf->op)->flags & CF_FETCH) { 1852 addr += 3; 1853 addr &= 0xFFFFFFFCUL; 1854 } 1855 cf->addr = addr; 1856 addr += cf->ndw; 1857 bc->ndw = cf->addr + cf->ndw; 1858 } 1859 free(bc->bytecode); 1860 bc->bytecode = calloc(4, bc->ndw); 1861 if (bc->bytecode == NULL) 1862 return -ENOMEM; 1863 LIST_FOR_EACH_ENTRY(cf, &bc->cf, list) { 1864 const struct cf_op_info *cfop = r600_isa_cf(cf->op); 1865 addr = cf->addr; 1866 if (bc->gfx_level >= EVERGREEN) 1867 r = eg_bytecode_cf_build(bc, cf); 1868 else 1869 r = r600_bytecode_cf_build(bc, cf); 1870 if (r) 1871 return r; 1872 if (cfop->flags & CF_ALU) { 1873 nliteral = 0; 1874 memset(literal, 0, sizeof(literal)); 1875 LIST_FOR_EACH_ENTRY(alu, &cf->alu, list) { 1876 r = r600_bytecode_alu_nliterals(alu, literal, &nliteral); 1877 if (r) 1878 return r; 1879 r600_bytecode_alu_adjust_literals(alu, literal, nliteral); 1880 r600_bytecode_assign_kcache_banks(alu, cf->kcache); 1881 1882 switch(bc->gfx_level) { 1883 case R600: 1884 r = r600_bytecode_alu_build(bc, alu, addr); 1885 break; 1886 case R700: 1887 r = r700_bytecode_alu_build(bc, alu, addr); 1888 break; 1889 case EVERGREEN: 1890 case CAYMAN: 1891 r = eg_bytecode_alu_build(bc, alu, addr); 1892 break; 1893 default: 1894 R600_ERR("unknown gfx level %d.\n", bc->gfx_level); 1895 return -EINVAL; 1896 } 1897 if (r) 1898 return r; 1899 addr += 2; 1900 if (alu->last) { 1901 for (i = 0; i < align(nliteral, 2); ++i) { 1902 bc->bytecode[addr++] = literal[i]; 1903 } 1904 nliteral = 0; 1905 memset(literal, 0, sizeof(literal)); 1906 } 1907 } 1908 } else if (cf->op == CF_OP_VTX) { 1909 LIST_FOR_EACH_ENTRY(vtx, &cf->vtx, list) { 1910 r = r600_bytecode_vtx_build(bc, vtx, addr); 1911 if (r) 1912 return r; 1913 addr += 4; 1914 } 1915 } else if (cf->op == CF_OP_GDS) { 1916 assert(bc->gfx_level >= EVERGREEN); 1917 LIST_FOR_EACH_ENTRY(gds, &cf->gds, list) { 1918 r = eg_bytecode_gds_build(bc, gds, addr); 1919 if (r) 1920 return r; 1921 addr += 4; 1922 } 1923 } else if (cf->op == CF_OP_TEX) { 1924 LIST_FOR_EACH_ENTRY(vtx, &cf->vtx, list) { 1925 assert(bc->gfx_level >= EVERGREEN); 1926 r = r600_bytecode_vtx_build(bc, vtx, addr); 1927 if (r) 1928 return r; 1929 addr += 4; 1930 } 1931 LIST_FOR_EACH_ENTRY(tex, &cf->tex, list) { 1932 r = r600_bytecode_tex_build(bc, tex, addr); 1933 if (r) 1934 return r; 1935 addr += 4; 1936 } 1937 } 1938 } 1939 return 0; 1940} 1941 1942void r600_bytecode_clear(struct r600_bytecode *bc) 1943{ 1944 struct r600_bytecode_cf *cf = NULL, *next_cf; 1945 1946 free(bc->bytecode); 1947 bc->bytecode = NULL; 1948 1949 LIST_FOR_EACH_ENTRY_SAFE(cf, next_cf, &bc->cf, list) { 1950 struct r600_bytecode_alu *alu = NULL, *next_alu; 1951 struct r600_bytecode_tex *tex = NULL, *next_tex; 1952 struct r600_bytecode_tex *vtx = NULL, *next_vtx; 1953 struct r600_bytecode_gds *gds = NULL, *next_gds; 1954 1955 LIST_FOR_EACH_ENTRY_SAFE(alu, next_alu, &cf->alu, list) { 1956 free(alu); 1957 } 1958 1959 list_inithead(&cf->alu); 1960 1961 LIST_FOR_EACH_ENTRY_SAFE(tex, next_tex, &cf->tex, list) { 1962 free(tex); 1963 } 1964 1965 list_inithead(&cf->tex); 1966 1967 LIST_FOR_EACH_ENTRY_SAFE(vtx, next_vtx, &cf->vtx, list) { 1968 free(vtx); 1969 } 1970 1971 list_inithead(&cf->vtx); 1972 1973 LIST_FOR_EACH_ENTRY_SAFE(gds, next_gds, &cf->gds, list) { 1974 free(gds); 1975 } 1976 1977 list_inithead(&cf->gds); 1978 1979 free(cf); 1980 } 1981 1982 list_inithead(&cf->list); 1983} 1984 1985static int print_swizzle(unsigned swz) 1986{ 1987 const char * swzchars = "xyzw01?_"; 1988 assert(swz<8 && swz != 6); 1989 return fprintf(stderr, "%c", swzchars[swz]); 1990} 1991 1992static int print_sel(unsigned sel, unsigned rel, unsigned index_mode, 1993 unsigned need_brackets) 1994{ 1995 int o = 0; 1996 if (rel && index_mode >= 5 && sel < 128) 1997 o += fprintf(stderr, "G"); 1998 if (rel || need_brackets) { 1999 o += fprintf(stderr, "["); 2000 } 2001 o += fprintf(stderr, "%d", sel); 2002 if (rel) { 2003 if (index_mode == 0 || index_mode == 6) 2004 o += fprintf(stderr, "+AR"); 2005 else if (index_mode == 4) 2006 o += fprintf(stderr, "+AL"); 2007 } 2008 if (rel || need_brackets) { 2009 o += fprintf(stderr, "]"); 2010 } 2011 return o; 2012} 2013 2014static int print_dst(struct r600_bytecode_alu *alu) 2015{ 2016 int o = 0; 2017 unsigned sel = alu->dst.sel; 2018 char reg_char = 'R'; 2019 if (sel > 128 - 4) { /* clause temporary gpr */ 2020 sel -= 128 - 4; 2021 reg_char = 'T'; 2022 } 2023 2024 if (alu_writes(alu)) { 2025 o += fprintf(stderr, "%c", reg_char); 2026 o += print_sel(alu->dst.sel, alu->dst.rel, alu->index_mode, 0); 2027 } else { 2028 o += fprintf(stderr, "__"); 2029 } 2030 o += fprintf(stderr, "."); 2031 o += print_swizzle(alu->dst.chan); 2032 return o; 2033} 2034 2035static int print_src(struct r600_bytecode_alu *alu, unsigned idx) 2036{ 2037 int o = 0; 2038 struct r600_bytecode_alu_src *src = &alu->src[idx]; 2039 unsigned sel = src->sel, need_sel = 1, need_chan = 1, need_brackets = 0; 2040 2041 if (src->neg) 2042 o += fprintf(stderr,"-"); 2043 if (src->abs) 2044 o += fprintf(stderr,"|"); 2045 2046 if (sel < 128 - 4) { 2047 o += fprintf(stderr, "R"); 2048 } else if (sel < 128) { 2049 o += fprintf(stderr, "T"); 2050 sel -= 128 - 4; 2051 } else if (sel < 160) { 2052 o += fprintf(stderr, "KC0"); 2053 need_brackets = 1; 2054 sel -= 128; 2055 } else if (sel < 192) { 2056 o += fprintf(stderr, "KC1"); 2057 need_brackets = 1; 2058 sel -= 160; 2059 } else if (sel >= 512) { 2060 o += fprintf(stderr, "C%d", src->kc_bank); 2061 need_brackets = 1; 2062 sel -= 512; 2063 } else if (sel >= 448) { 2064 o += fprintf(stderr, "Param"); 2065 sel -= 448; 2066 need_chan = 0; 2067 } else if (sel >= 288) { 2068 o += fprintf(stderr, "KC3"); 2069 need_brackets = 1; 2070 sel -= 288; 2071 } else if (sel >= 256) { 2072 o += fprintf(stderr, "KC2"); 2073 need_brackets = 1; 2074 sel -= 256; 2075 } else { 2076 need_sel = 0; 2077 need_chan = 0; 2078 switch (sel) { 2079 case EG_V_SQ_ALU_SRC_LDS_DIRECT_A: 2080 o += fprintf(stderr, "LDS_A[0x%08X]", src->value); 2081 break; 2082 case EG_V_SQ_ALU_SRC_LDS_DIRECT_B: 2083 o += fprintf(stderr, "LDS_B[0x%08X]", src->value); 2084 break; 2085 case EG_V_SQ_ALU_SRC_LDS_OQ_A: 2086 o += fprintf(stderr, "LDS_OQ_A"); 2087 need_chan = 1; 2088 break; 2089 case EG_V_SQ_ALU_SRC_LDS_OQ_B: 2090 o += fprintf(stderr, "LDS_OQ_B"); 2091 need_chan = 1; 2092 break; 2093 case EG_V_SQ_ALU_SRC_LDS_OQ_A_POP: 2094 o += fprintf(stderr, "LDS_OQ_A_POP"); 2095 need_chan = 1; 2096 break; 2097 case EG_V_SQ_ALU_SRC_LDS_OQ_B_POP: 2098 o += fprintf(stderr, "LDS_OQ_B_POP"); 2099 need_chan = 1; 2100 break; 2101 case EG_V_SQ_ALU_SRC_TIME_LO: 2102 o += fprintf(stderr, "TIME_LO"); 2103 break; 2104 case EG_V_SQ_ALU_SRC_TIME_HI: 2105 o += fprintf(stderr, "TIME_HI"); 2106 break; 2107 case EG_V_SQ_ALU_SRC_SE_ID: 2108 o += fprintf(stderr, "SE_ID"); 2109 break; 2110 case EG_V_SQ_ALU_SRC_SIMD_ID: 2111 o += fprintf(stderr, "SIMD_ID"); 2112 break; 2113 case EG_V_SQ_ALU_SRC_HW_WAVE_ID: 2114 o += fprintf(stderr, "HW_WAVE_ID"); 2115 break; 2116 case V_SQ_ALU_SRC_PS: 2117 o += fprintf(stderr, "PS"); 2118 break; 2119 case V_SQ_ALU_SRC_PV: 2120 o += fprintf(stderr, "PV"); 2121 need_chan = 1; 2122 break; 2123 case V_SQ_ALU_SRC_LITERAL: 2124 o += fprintf(stderr, "[0x%08X %f]", src->value, u_bitcast_u2f(src->value)); 2125 break; 2126 case V_SQ_ALU_SRC_0_5: 2127 o += fprintf(stderr, "0.5"); 2128 break; 2129 case V_SQ_ALU_SRC_M_1_INT: 2130 o += fprintf(stderr, "-1"); 2131 break; 2132 case V_SQ_ALU_SRC_1_INT: 2133 o += fprintf(stderr, "1"); 2134 break; 2135 case V_SQ_ALU_SRC_1: 2136 o += fprintf(stderr, "1.0"); 2137 break; 2138 case V_SQ_ALU_SRC_0: 2139 o += fprintf(stderr, "0"); 2140 break; 2141 default: 2142 o += fprintf(stderr, "??IMM_%d", sel); 2143 break; 2144 } 2145 } 2146 2147 if (need_sel) 2148 o += print_sel(sel, src->rel, alu->index_mode, need_brackets); 2149 2150 if (need_chan) { 2151 o += fprintf(stderr, "."); 2152 o += print_swizzle(src->chan); 2153 } 2154 2155 if (src->abs) 2156 o += fprintf(stderr,"|"); 2157 2158 return o; 2159} 2160 2161static int print_indent(int p, int c) 2162{ 2163 int o = 0; 2164 while (p++ < c) 2165 o += fprintf(stderr, " "); 2166 return o; 2167} 2168 2169void r600_bytecode_disasm(struct r600_bytecode *bc) 2170{ 2171 const char *index_mode[] = {"CF_INDEX_NONE", "CF_INDEX_0", "CF_INDEX_1"}; 2172 static int index = 0; 2173 struct r600_bytecode_cf *cf = NULL; 2174 struct r600_bytecode_alu *alu = NULL; 2175 struct r600_bytecode_vtx *vtx = NULL; 2176 struct r600_bytecode_tex *tex = NULL; 2177 struct r600_bytecode_gds *gds = NULL; 2178 2179 unsigned i, id, ngr = 0, last; 2180 uint32_t literal[4]; 2181 unsigned nliteral; 2182 char chip = '6'; 2183 2184 switch (bc->gfx_level) { 2185 case R700: 2186 chip = '7'; 2187 break; 2188 case EVERGREEN: 2189 chip = 'E'; 2190 break; 2191 case CAYMAN: 2192 chip = 'C'; 2193 break; 2194 case R600: 2195 default: 2196 chip = '6'; 2197 break; 2198 } 2199 fprintf(stderr, "bytecode %d dw -- %d gprs -- %d nstack -------------\n", 2200 bc->ndw, bc->ngpr, bc->nstack); 2201 fprintf(stderr, "shader %d -- %c\n", index++, chip); 2202 2203 LIST_FOR_EACH_ENTRY(cf, &bc->cf, list) { 2204 id = cf->id; 2205 if (cf->op == CF_NATIVE) { 2206 fprintf(stderr, "%04d %08X %08X CF_NATIVE\n", id, bc->bytecode[id], 2207 bc->bytecode[id + 1]); 2208 } else { 2209 const struct cf_op_info *cfop = r600_isa_cf(cf->op); 2210 if (cfop->flags & CF_ALU) { 2211 if (cf->eg_alu_extended) { 2212 fprintf(stderr, "%04d %08X %08X %s\n", id, bc->bytecode[id], 2213 bc->bytecode[id + 1], "ALU_EXT"); 2214 id += 2; 2215 } 2216 fprintf(stderr, "%04d %08X %08X %s ", id, bc->bytecode[id], 2217 bc->bytecode[id + 1], cfop->name); 2218 fprintf(stderr, "%d @%d ", cf->ndw / 2, cf->addr); 2219 for (i = 0; i < 4; ++i) { 2220 if (cf->kcache[i].mode) { 2221 int c_start = (cf->kcache[i].addr << 4); 2222 int c_end = c_start + (cf->kcache[i].mode << 4); 2223 fprintf(stderr, "KC%d[CB%d:%d-%d%s%s] ", 2224 i, cf->kcache[i].bank, c_start, c_end, 2225 cf->kcache[i].index_mode ? " " : "", 2226 cf->kcache[i].index_mode ? index_mode[cf->kcache[i].index_mode] : ""); 2227 } 2228 } 2229 fprintf(stderr, "\n"); 2230 } else if (cfop->flags & CF_FETCH) { 2231 fprintf(stderr, "%04d %08X %08X %s ", id, bc->bytecode[id], 2232 bc->bytecode[id + 1], cfop->name); 2233 fprintf(stderr, "%d @%d ", cf->ndw / 4, cf->addr); 2234 if (cf->vpm) 2235 fprintf(stderr, "VPM "); 2236 if (cf->end_of_program) 2237 fprintf(stderr, "EOP "); 2238 fprintf(stderr, "\n"); 2239 2240 } else if (cfop->flags & CF_EXP) { 2241 int o = 0; 2242 const char *exp_type[] = {"PIXEL", "POS ", "PARAM"}; 2243 o += fprintf(stderr, "%04d %08X %08X %s ", id, bc->bytecode[id], 2244 bc->bytecode[id + 1], cfop->name); 2245 o += print_indent(o, 43); 2246 o += fprintf(stderr, "%s ", exp_type[cf->output.type]); 2247 if (cf->output.burst_count > 1) { 2248 o += fprintf(stderr, "%d-%d ", cf->output.array_base, 2249 cf->output.array_base + cf->output.burst_count - 1); 2250 2251 o += print_indent(o, 55); 2252 o += fprintf(stderr, "R%d-%d.", cf->output.gpr, 2253 cf->output.gpr + cf->output.burst_count - 1); 2254 } else { 2255 o += fprintf(stderr, "%d ", cf->output.array_base); 2256 o += print_indent(o, 55); 2257 o += fprintf(stderr, "R%d.", cf->output.gpr); 2258 } 2259 2260 o += print_swizzle(cf->output.swizzle_x); 2261 o += print_swizzle(cf->output.swizzle_y); 2262 o += print_swizzle(cf->output.swizzle_z); 2263 o += print_swizzle(cf->output.swizzle_w); 2264 2265 print_indent(o, 67); 2266 2267 fprintf(stderr, " ES:%X ", cf->output.elem_size); 2268 if (cf->mark) 2269 fprintf(stderr, "MARK "); 2270 if (!cf->barrier) 2271 fprintf(stderr, "NO_BARRIER "); 2272 if (cf->end_of_program) 2273 fprintf(stderr, "EOP "); 2274 fprintf(stderr, "\n"); 2275 } else if (r600_isa_cf(cf->op)->flags & CF_MEM) { 2276 int o = 0; 2277 const char *exp_type[] = {"WRITE", "WRITE_IND", "WRITE_ACK", 2278 "WRITE_IND_ACK"}; 2279 o += fprintf(stderr, "%04d %08X %08X %s ", id, 2280 bc->bytecode[id], bc->bytecode[id + 1], cfop->name); 2281 o += print_indent(o, 43); 2282 o += fprintf(stderr, "%s ", exp_type[cf->output.type]); 2283 2284 if (r600_isa_cf(cf->op)->flags & CF_RAT) { 2285 o += fprintf(stderr, "RAT%d", cf->rat.id); 2286 if (cf->rat.index_mode) { 2287 o += fprintf(stderr, "[IDX%d]", cf->rat.index_mode - 1); 2288 } 2289 o += fprintf(stderr, " INST: %d ", cf->rat.inst); 2290 } 2291 2292 if (cf->output.burst_count > 1) { 2293 o += fprintf(stderr, "%d-%d ", cf->output.array_base, 2294 cf->output.array_base + cf->output.burst_count - 1); 2295 o += print_indent(o, 55); 2296 o += fprintf(stderr, "R%d-%d.", cf->output.gpr, 2297 cf->output.gpr + cf->output.burst_count - 1); 2298 } else { 2299 o += fprintf(stderr, "%d ", cf->output.array_base); 2300 o += print_indent(o, 55); 2301 o += fprintf(stderr, "R%d.", cf->output.gpr); 2302 } 2303 for (i = 0; i < 4; ++i) { 2304 if (cf->output.comp_mask & (1 << i)) 2305 o += print_swizzle(i); 2306 else 2307 o += print_swizzle(7); 2308 } 2309 2310 if (cf->output.type == V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_WRITE_IND || 2311 cf->output.type == V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_READ_IND) 2312 o += fprintf(stderr, " R%d", cf->output.index_gpr); 2313 2314 o += print_indent(o, 67); 2315 2316 fprintf(stderr, " ES:%i ", cf->output.elem_size); 2317 if (cf->output.array_size != 0xFFF) 2318 fprintf(stderr, "AS:%i ", cf->output.array_size); 2319 if (cf->mark) 2320 fprintf(stderr, "MARK "); 2321 if (!cf->barrier) 2322 fprintf(stderr, "NO_BARRIER "); 2323 if (cf->end_of_program) 2324 fprintf(stderr, "EOP "); 2325 2326 if (cf->output.mark) 2327 fprintf(stderr, "MARK "); 2328 2329 fprintf(stderr, "\n"); 2330 } else { 2331 fprintf(stderr, "%04d %08X %08X %s ", id, bc->bytecode[id], 2332 bc->bytecode[id + 1], cfop->name); 2333 fprintf(stderr, "@%d ", cf->cf_addr); 2334 if (cf->cond) 2335 fprintf(stderr, "CND:%X ", cf->cond); 2336 if (cf->pop_count) 2337 fprintf(stderr, "POP:%X ", cf->pop_count); 2338 if (cf->count && (cfop->flags & CF_EMIT)) 2339 fprintf(stderr, "STREAM%d ", cf->count); 2340 if (cf->vpm) 2341 fprintf(stderr, "VPM "); 2342 if (cf->end_of_program) 2343 fprintf(stderr, "EOP "); 2344 fprintf(stderr, "\n"); 2345 } 2346 } 2347 2348 id = cf->addr; 2349 nliteral = 0; 2350 last = 1; 2351 LIST_FOR_EACH_ENTRY(alu, &cf->alu, list) { 2352 const char *omod_str[] = {"","*2","*4","/2"}; 2353 const struct alu_op_info *aop = r600_isa_alu(alu->op); 2354 int o = 0; 2355 2356 r600_bytecode_alu_nliterals(alu, literal, &nliteral); 2357 o += fprintf(stderr, " %04d %08X %08X ", id, bc->bytecode[id], bc->bytecode[id+1]); 2358 if (last) 2359 o += fprintf(stderr, "%4d ", ++ngr); 2360 else 2361 o += fprintf(stderr, " "); 2362 o += fprintf(stderr, "%c%c %c ", alu->execute_mask ? 'M':' ', 2363 alu->update_pred ? 'P':' ', 2364 alu->pred_sel ? alu->pred_sel==2 ? '0':'1':' '); 2365 2366 o += fprintf(stderr, "%s%s%s ", aop->name, 2367 omod_str[alu->omod], alu->dst.clamp ? "_sat":""); 2368 2369 o += print_indent(o,60); 2370 o += print_dst(alu); 2371 for (i = 0; i < aop->src_count; ++i) { 2372 o += fprintf(stderr, i == 0 ? ", ": ", "); 2373 o += print_src(alu, i); 2374 } 2375 2376 if (alu->bank_swizzle) { 2377 o += print_indent(o,75); 2378 o += fprintf(stderr, " BS:%d", alu->bank_swizzle); 2379 } 2380 2381 fprintf(stderr, "\n"); 2382 id += 2; 2383 2384 if (alu->last) { 2385 for (i = 0; i < nliteral; i++, id++) { 2386 float *f = (float*)(bc->bytecode + id); 2387 o = fprintf(stderr, " %04d %08X", id, bc->bytecode[id]); 2388 print_indent(o, 60); 2389 fprintf(stderr, " %f (%d)\n", *f, *(bc->bytecode + id)); 2390 } 2391 id += nliteral & 1; 2392 nliteral = 0; 2393 } 2394 last = alu->last; 2395 } 2396 2397 LIST_FOR_EACH_ENTRY(tex, &cf->tex, list) { 2398 int o = 0; 2399 o += fprintf(stderr, " %04d %08X %08X %08X ", id, bc->bytecode[id], 2400 bc->bytecode[id + 1], bc->bytecode[id + 2]); 2401 2402 o += fprintf(stderr, "%s ", r600_isa_fetch(tex->op)->name); 2403 2404 o += print_indent(o, 50); 2405 2406 o += fprintf(stderr, "R%d.", tex->dst_gpr); 2407 o += print_swizzle(tex->dst_sel_x); 2408 o += print_swizzle(tex->dst_sel_y); 2409 o += print_swizzle(tex->dst_sel_z); 2410 o += print_swizzle(tex->dst_sel_w); 2411 2412 o += fprintf(stderr, ", R%d.", tex->src_gpr); 2413 o += print_swizzle(tex->src_sel_x); 2414 o += print_swizzle(tex->src_sel_y); 2415 o += print_swizzle(tex->src_sel_z); 2416 o += print_swizzle(tex->src_sel_w); 2417 2418 o += fprintf(stderr, ", RID:%d", tex->resource_id); 2419 o += fprintf(stderr, ", SID:%d ", tex->sampler_id); 2420 2421 if (tex->sampler_index_mode) 2422 fprintf(stderr, "SQ_%s ", index_mode[tex->sampler_index_mode]); 2423 2424 if (tex->lod_bias) 2425 fprintf(stderr, "LB:%d ", tex->lod_bias); 2426 2427 fprintf(stderr, "CT:%c%c%c%c ", 2428 tex->coord_type_x ? 'N' : 'U', 2429 tex->coord_type_y ? 'N' : 'U', 2430 tex->coord_type_z ? 'N' : 'U', 2431 tex->coord_type_w ? 'N' : 'U'); 2432 2433 if (tex->offset_x) 2434 fprintf(stderr, "OX:%d ", tex->offset_x); 2435 if (tex->offset_y) 2436 fprintf(stderr, "OY:%d ", tex->offset_y); 2437 if (tex->offset_z) 2438 fprintf(stderr, "OZ:%d ", tex->offset_z); 2439 2440 id += 4; 2441 fprintf(stderr, "\n"); 2442 } 2443 2444 LIST_FOR_EACH_ENTRY(vtx, &cf->vtx, list) { 2445 int o = 0; 2446 const char * fetch_type[] = {"VERTEX", "INSTANCE", ""}; 2447 o += fprintf(stderr, " %04d %08X %08X %08X ", id, bc->bytecode[id], 2448 bc->bytecode[id + 1], bc->bytecode[id + 2]); 2449 2450 o += fprintf(stderr, "%s ", r600_isa_fetch(vtx->op)->name); 2451 2452 o += print_indent(o, 50); 2453 2454 o += fprintf(stderr, "R%d.", vtx->dst_gpr); 2455 o += print_swizzle(vtx->dst_sel_x); 2456 o += print_swizzle(vtx->dst_sel_y); 2457 o += print_swizzle(vtx->dst_sel_z); 2458 o += print_swizzle(vtx->dst_sel_w); 2459 2460 o += fprintf(stderr, ", R%d.", vtx->src_gpr); 2461 o += print_swizzle(vtx->src_sel_x); 2462 if (r600_isa_fetch(vtx->op)->flags & FF_MEM) 2463 o += print_swizzle(vtx->src_sel_y); 2464 2465 if (vtx->offset) 2466 fprintf(stderr, " +%db", vtx->offset); 2467 2468 o += print_indent(o, 55); 2469 2470 fprintf(stderr, ", RID:%d ", vtx->buffer_id); 2471 2472 fprintf(stderr, "%s ", fetch_type[vtx->fetch_type]); 2473 2474 if (bc->gfx_level < CAYMAN && vtx->mega_fetch_count) 2475 fprintf(stderr, "MFC:%d ", vtx->mega_fetch_count); 2476 2477 if (bc->gfx_level >= EVERGREEN && vtx->buffer_index_mode) 2478 fprintf(stderr, "SQ_%s ", index_mode[vtx->buffer_index_mode]); 2479 2480 if (r600_isa_fetch(vtx->op)->flags & FF_MEM) { 2481 if (vtx->uncached) 2482 fprintf(stderr, "UNCACHED "); 2483 if (vtx->indexed) 2484 fprintf(stderr, "INDEXED:%d ", vtx->indexed); 2485 2486 fprintf(stderr, "ELEM_SIZE:%d ", vtx->elem_size); 2487 if (vtx->burst_count) 2488 fprintf(stderr, "BURST_COUNT:%d ", vtx->burst_count); 2489 fprintf(stderr, "ARRAY_BASE:%d ", vtx->array_base); 2490 fprintf(stderr, "ARRAY_SIZE:%d ", vtx->array_size); 2491 } 2492 2493 fprintf(stderr, "UCF:%d ", vtx->use_const_fields); 2494 fprintf(stderr, "FMT(DTA:%d ", vtx->data_format); 2495 fprintf(stderr, "NUM:%d ", vtx->num_format_all); 2496 fprintf(stderr, "COMP:%d ", vtx->format_comp_all); 2497 fprintf(stderr, "MODE:%d)\n", vtx->srf_mode_all); 2498 2499 id += 4; 2500 } 2501 2502 LIST_FOR_EACH_ENTRY(gds, &cf->gds, list) { 2503 int o = 0; 2504 o += fprintf(stderr, " %04d %08X %08X %08X ", id, bc->bytecode[id], 2505 bc->bytecode[id + 1], bc->bytecode[id + 2]); 2506 2507 o += fprintf(stderr, "%s ", r600_isa_fetch(gds->op)->name); 2508 2509 if (gds->op != FETCH_OP_TF_WRITE) { 2510 o += fprintf(stderr, "R%d.", gds->dst_gpr); 2511 o += print_swizzle(gds->dst_sel_x); 2512 o += print_swizzle(gds->dst_sel_y); 2513 o += print_swizzle(gds->dst_sel_z); 2514 o += print_swizzle(gds->dst_sel_w); 2515 } 2516 2517 o += fprintf(stderr, ", R%d.", gds->src_gpr); 2518 o += print_swizzle(gds->src_sel_x); 2519 o += print_swizzle(gds->src_sel_y); 2520 o += print_swizzle(gds->src_sel_z); 2521 2522 if (gds->op != FETCH_OP_TF_WRITE) { 2523 o += fprintf(stderr, ", R%d.", gds->src_gpr2); 2524 } 2525 if (gds->alloc_consume) { 2526 o += fprintf(stderr, " UAV: %d", gds->uav_id); 2527 if (gds->uav_index_mode) 2528 o += fprintf(stderr, "[%s]", index_mode[gds->uav_index_mode]); 2529 } 2530 fprintf(stderr, "\n"); 2531 id += 4; 2532 } 2533 } 2534 2535 fprintf(stderr, "--------------------------------------\n"); 2536} 2537 2538void r600_vertex_data_type(enum pipe_format pformat, 2539 unsigned *format, 2540 unsigned *num_format, unsigned *format_comp, unsigned *endian) 2541{ 2542 const struct util_format_description *desc; 2543 unsigned i; 2544 2545 *format = 0; 2546 *num_format = 0; 2547 *format_comp = 0; 2548 *endian = ENDIAN_NONE; 2549 2550 if (pformat == PIPE_FORMAT_R11G11B10_FLOAT) { 2551 *format = FMT_10_11_11_FLOAT; 2552 *endian = r600_endian_swap(32); 2553 return; 2554 } 2555 2556 if (pformat == PIPE_FORMAT_B5G6R5_UNORM) { 2557 *format = FMT_5_6_5; 2558 *endian = r600_endian_swap(16); 2559 return; 2560 } 2561 2562 if (pformat == PIPE_FORMAT_B5G5R5A1_UNORM) { 2563 *format = FMT_1_5_5_5; 2564 *endian = r600_endian_swap(16); 2565 return; 2566 } 2567 2568 if (pformat == PIPE_FORMAT_A1B5G5R5_UNORM) { 2569 *format = FMT_5_5_5_1; 2570 return; 2571 } 2572 2573 desc = util_format_description(pformat); 2574 if (desc->layout != UTIL_FORMAT_LAYOUT_PLAIN) { 2575 goto out_unknown; 2576 } 2577 2578 /* Find the first non-VOID channel. */ 2579 for (i = 0; i < 4; i++) { 2580 if (desc->channel[i].type != UTIL_FORMAT_TYPE_VOID) { 2581 break; 2582 } 2583 } 2584 2585 *endian = r600_endian_swap(desc->channel[i].size); 2586 2587 switch (desc->channel[i].type) { 2588 /* Half-floats, floats, ints */ 2589 case UTIL_FORMAT_TYPE_FLOAT: 2590 switch (desc->channel[i].size) { 2591 case 16: 2592 switch (desc->nr_channels) { 2593 case 1: 2594 *format = FMT_16_FLOAT; 2595 break; 2596 case 2: 2597 *format = FMT_16_16_FLOAT; 2598 break; 2599 case 3: 2600 case 4: 2601 *format = FMT_16_16_16_16_FLOAT; 2602 break; 2603 } 2604 break; 2605 case 32: 2606 switch (desc->nr_channels) { 2607 case 1: 2608 *format = FMT_32_FLOAT; 2609 break; 2610 case 2: 2611 *format = FMT_32_32_FLOAT; 2612 break; 2613 case 3: 2614 *format = FMT_32_32_32_FLOAT; 2615 break; 2616 case 4: 2617 *format = FMT_32_32_32_32_FLOAT; 2618 break; 2619 } 2620 break; 2621 default: 2622 goto out_unknown; 2623 } 2624 break; 2625 /* Unsigned ints */ 2626 case UTIL_FORMAT_TYPE_UNSIGNED: 2627 /* Signed ints */ 2628 case UTIL_FORMAT_TYPE_SIGNED: 2629 switch (desc->channel[i].size) { 2630 case 4: 2631 switch (desc->nr_channels) { 2632 case 2: 2633 *format = FMT_4_4; 2634 break; 2635 case 4: 2636 *format = FMT_4_4_4_4; 2637 break; 2638 } 2639 break; 2640 case 8: 2641 switch (desc->nr_channels) { 2642 case 1: 2643 *format = FMT_8; 2644 break; 2645 case 2: 2646 *format = FMT_8_8; 2647 break; 2648 case 3: 2649 case 4: 2650 *format = FMT_8_8_8_8; 2651 break; 2652 } 2653 break; 2654 case 10: 2655 if (desc->nr_channels != 4) 2656 goto out_unknown; 2657 2658 *format = FMT_2_10_10_10; 2659 break; 2660 case 16: 2661 switch (desc->nr_channels) { 2662 case 1: 2663 *format = FMT_16; 2664 break; 2665 case 2: 2666 *format = FMT_16_16; 2667 break; 2668 case 3: 2669 case 4: 2670 *format = FMT_16_16_16_16; 2671 break; 2672 } 2673 break; 2674 case 32: 2675 switch (desc->nr_channels) { 2676 case 1: 2677 *format = FMT_32; 2678 break; 2679 case 2: 2680 *format = FMT_32_32; 2681 break; 2682 case 3: 2683 *format = FMT_32_32_32; 2684 break; 2685 case 4: 2686 *format = FMT_32_32_32_32; 2687 break; 2688 } 2689 break; 2690 default: 2691 goto out_unknown; 2692 } 2693 break; 2694 default: 2695 goto out_unknown; 2696 } 2697 2698 if (desc->channel[i].type == UTIL_FORMAT_TYPE_SIGNED) { 2699 *format_comp = 1; 2700 } 2701 2702 *num_format = 0; 2703 if (desc->channel[i].type == UTIL_FORMAT_TYPE_UNSIGNED || 2704 desc->channel[i].type == UTIL_FORMAT_TYPE_SIGNED) { 2705 if (!desc->channel[i].normalized) { 2706 if (desc->channel[i].pure_integer) 2707 *num_format = 1; 2708 else 2709 *num_format = 2; 2710 } 2711 } 2712 return; 2713out_unknown: 2714 R600_ERR("unsupported vertex format %s\n", util_format_name(pformat)); 2715} 2716 2717void *r600_create_vertex_fetch_shader(struct pipe_context *ctx, 2718 unsigned count, 2719 const struct pipe_vertex_element *elements) 2720{ 2721 struct r600_context *rctx = (struct r600_context *)ctx; 2722 struct r600_bytecode bc; 2723 struct r600_bytecode_vtx vtx; 2724 const struct util_format_description *desc; 2725 unsigned fetch_resource_start = rctx->b.gfx_level >= EVERGREEN ? 0 : 160; 2726 unsigned format, num_format, format_comp, endian; 2727 uint32_t *bytecode; 2728 int i, j, r, fs_size; 2729 struct r600_fetch_shader *shader; 2730 unsigned no_sb = rctx->screen->b.debug_flags & DBG_NO_SB || 2731 (rctx->screen->b.debug_flags & DBG_NIR); 2732 unsigned sb_disasm = !no_sb || (rctx->screen->b.debug_flags & DBG_SB_DISASM); 2733 2734 assert(count < 32); 2735 2736 memset(&bc, 0, sizeof(bc)); 2737 r600_bytecode_init(&bc, rctx->b.gfx_level, rctx->b.family, 2738 rctx->screen->has_compressed_msaa_texturing); 2739 2740 bc.isa = rctx->isa; 2741 2742 for (i = 0; i < count; i++) { 2743 if (elements[i].instance_divisor > 1) { 2744 if (rctx->b.gfx_level == CAYMAN) { 2745 for (j = 0; j < 4; j++) { 2746 struct r600_bytecode_alu alu; 2747 memset(&alu, 0, sizeof(alu)); 2748 alu.op = ALU_OP2_MULHI_UINT; 2749 alu.src[0].sel = 0; 2750 alu.src[0].chan = 3; 2751 alu.src[1].sel = V_SQ_ALU_SRC_LITERAL; 2752 alu.src[1].value = (1ll << 32) / elements[i].instance_divisor + 1; 2753 alu.dst.sel = i + 1; 2754 alu.dst.chan = j; 2755 alu.dst.write = j == 3; 2756 alu.last = j == 3; 2757 if ((r = r600_bytecode_add_alu(&bc, &alu))) { 2758 r600_bytecode_clear(&bc); 2759 return NULL; 2760 } 2761 } 2762 } else { 2763 struct r600_bytecode_alu alu; 2764 memset(&alu, 0, sizeof(alu)); 2765 alu.op = ALU_OP2_MULHI_UINT; 2766 alu.src[0].sel = 0; 2767 alu.src[0].chan = 3; 2768 alu.src[1].sel = V_SQ_ALU_SRC_LITERAL; 2769 alu.src[1].value = (1ll << 32) / elements[i].instance_divisor + 1; 2770 alu.dst.sel = i + 1; 2771 alu.dst.chan = 3; 2772 alu.dst.write = 1; 2773 alu.last = 1; 2774 if ((r = r600_bytecode_add_alu(&bc, &alu))) { 2775 r600_bytecode_clear(&bc); 2776 return NULL; 2777 } 2778 } 2779 } 2780 } 2781 2782 for (i = 0; i < count; i++) { 2783 r600_vertex_data_type(elements[i].src_format, 2784 &format, &num_format, &format_comp, &endian); 2785 2786 desc = util_format_description(elements[i].src_format); 2787 2788 if (elements[i].src_offset > 65535) { 2789 r600_bytecode_clear(&bc); 2790 R600_ERR("too big src_offset: %u\n", elements[i].src_offset); 2791 return NULL; 2792 } 2793 2794 memset(&vtx, 0, sizeof(vtx)); 2795 vtx.buffer_id = elements[i].vertex_buffer_index + fetch_resource_start; 2796 vtx.fetch_type = elements[i].instance_divisor ? SQ_VTX_FETCH_INSTANCE_DATA : SQ_VTX_FETCH_VERTEX_DATA; 2797 vtx.src_gpr = elements[i].instance_divisor > 1 ? i + 1 : 0; 2798 vtx.src_sel_x = elements[i].instance_divisor ? 3 : 0; 2799 vtx.mega_fetch_count = 0x1F; 2800 vtx.dst_gpr = i + 1; 2801 vtx.dst_sel_x = desc->swizzle[0]; 2802 vtx.dst_sel_y = desc->swizzle[1]; 2803 vtx.dst_sel_z = desc->swizzle[2]; 2804 vtx.dst_sel_w = desc->swizzle[3]; 2805 vtx.data_format = format; 2806 vtx.num_format_all = num_format; 2807 vtx.format_comp_all = format_comp; 2808 vtx.offset = elements[i].src_offset; 2809 vtx.endian = endian; 2810 2811 if ((r = r600_bytecode_add_vtx(&bc, &vtx))) { 2812 r600_bytecode_clear(&bc); 2813 return NULL; 2814 } 2815 } 2816 2817 r600_bytecode_add_cfinst(&bc, CF_OP_RET); 2818 2819 if ((r = r600_bytecode_build(&bc))) { 2820 r600_bytecode_clear(&bc); 2821 return NULL; 2822 } 2823 2824 if (rctx->screen->b.debug_flags & DBG_FS) { 2825 fprintf(stderr, "--------------------------------------------------------------\n"); 2826 fprintf(stderr, "Vertex elements state:\n"); 2827 for (i = 0; i < count; i++) { 2828 fprintf(stderr, " "); 2829 util_dump_vertex_element(stderr, elements+i); 2830 fprintf(stderr, "\n"); 2831 } 2832 2833 if (!sb_disasm) { 2834 r600_bytecode_disasm(&bc); 2835 2836 fprintf(stderr, "______________________________________________________________\n"); 2837 } else { 2838 r600_sb_bytecode_process(rctx, &bc, NULL, 1 /*dump*/, 0 /*optimize*/); 2839 } 2840 } 2841 2842 fs_size = bc.ndw*4; 2843 2844 /* Allocate the CSO. */ 2845 shader = CALLOC_STRUCT(r600_fetch_shader); 2846 if (!shader) { 2847 r600_bytecode_clear(&bc); 2848 return NULL; 2849 } 2850 2851 u_suballocator_alloc(&rctx->allocator_fetch_shader, fs_size, 256, 2852 &shader->offset, 2853 (struct pipe_resource**)&shader->buffer); 2854 if (!shader->buffer) { 2855 r600_bytecode_clear(&bc); 2856 FREE(shader); 2857 return NULL; 2858 } 2859 2860 bytecode = r600_buffer_map_sync_with_rings 2861 (&rctx->b, shader->buffer, 2862 PIPE_MAP_WRITE | PIPE_MAP_UNSYNCHRONIZED | RADEON_MAP_TEMPORARY); 2863 bytecode += shader->offset / 4; 2864 2865 if (R600_BIG_ENDIAN) { 2866 for (i = 0; i < fs_size / 4; ++i) { 2867 bytecode[i] = util_cpu_to_le32(bc.bytecode[i]); 2868 } 2869 } else { 2870 memcpy(bytecode, bc.bytecode, fs_size); 2871 } 2872 rctx->b.ws->buffer_unmap(rctx->b.ws, shader->buffer->buf); 2873 2874 r600_bytecode_clear(&bc); 2875 return shader; 2876} 2877 2878void r600_bytecode_alu_read(struct r600_bytecode *bc, 2879 struct r600_bytecode_alu *alu, uint32_t word0, uint32_t word1) 2880{ 2881 /* WORD0 */ 2882 alu->src[0].sel = G_SQ_ALU_WORD0_SRC0_SEL(word0); 2883 alu->src[0].rel = G_SQ_ALU_WORD0_SRC0_REL(word0); 2884 alu->src[0].chan = G_SQ_ALU_WORD0_SRC0_CHAN(word0); 2885 alu->src[0].neg = G_SQ_ALU_WORD0_SRC0_NEG(word0); 2886 alu->src[1].sel = G_SQ_ALU_WORD0_SRC1_SEL(word0); 2887 alu->src[1].rel = G_SQ_ALU_WORD0_SRC1_REL(word0); 2888 alu->src[1].chan = G_SQ_ALU_WORD0_SRC1_CHAN(word0); 2889 alu->src[1].neg = G_SQ_ALU_WORD0_SRC1_NEG(word0); 2890 alu->index_mode = G_SQ_ALU_WORD0_INDEX_MODE(word0); 2891 alu->pred_sel = G_SQ_ALU_WORD0_PRED_SEL(word0); 2892 alu->last = G_SQ_ALU_WORD0_LAST(word0); 2893 2894 /* WORD1 */ 2895 alu->bank_swizzle = G_SQ_ALU_WORD1_BANK_SWIZZLE(word1); 2896 if (alu->bank_swizzle) 2897 alu->bank_swizzle_force = alu->bank_swizzle; 2898 alu->dst.sel = G_SQ_ALU_WORD1_DST_GPR(word1); 2899 alu->dst.rel = G_SQ_ALU_WORD1_DST_REL(word1); 2900 alu->dst.chan = G_SQ_ALU_WORD1_DST_CHAN(word1); 2901 alu->dst.clamp = G_SQ_ALU_WORD1_CLAMP(word1); 2902 if (G_SQ_ALU_WORD1_ENCODING(word1)) /*ALU_DWORD1_OP3*/ 2903 { 2904 alu->is_op3 = 1; 2905 alu->src[2].sel = G_SQ_ALU_WORD1_OP3_SRC2_SEL(word1); 2906 alu->src[2].rel = G_SQ_ALU_WORD1_OP3_SRC2_REL(word1); 2907 alu->src[2].chan = G_SQ_ALU_WORD1_OP3_SRC2_CHAN(word1); 2908 alu->src[2].neg = G_SQ_ALU_WORD1_OP3_SRC2_NEG(word1); 2909 alu->op = r600_isa_alu_by_opcode(bc->isa, 2910 G_SQ_ALU_WORD1_OP3_ALU_INST(word1), /* is_op3 = */ 1); 2911 2912 } 2913 else /*ALU_DWORD1_OP2*/ 2914 { 2915 alu->src[0].abs = G_SQ_ALU_WORD1_OP2_SRC0_ABS(word1); 2916 alu->src[1].abs = G_SQ_ALU_WORD1_OP2_SRC1_ABS(word1); 2917 alu->op = r600_isa_alu_by_opcode(bc->isa, 2918 G_SQ_ALU_WORD1_OP2_ALU_INST(word1), /* is_op3 = */ 0); 2919 alu->omod = G_SQ_ALU_WORD1_OP2_OMOD(word1); 2920 alu->dst.write = G_SQ_ALU_WORD1_OP2_WRITE_MASK(word1); 2921 alu->update_pred = G_SQ_ALU_WORD1_OP2_UPDATE_PRED(word1); 2922 alu->execute_mask = 2923 G_SQ_ALU_WORD1_OP2_UPDATE_EXECUTE_MASK(word1); 2924 } 2925} 2926 2927#if 0 2928void r600_bytecode_export_read(struct r600_bytecode *bc, 2929 struct r600_bytecode_output *output, uint32_t word0, uint32_t word1) 2930{ 2931 output->array_base = G_SQ_CF_ALLOC_EXPORT_WORD0_ARRAY_BASE(word0); 2932 output->type = G_SQ_CF_ALLOC_EXPORT_WORD0_TYPE(word0); 2933 output->gpr = G_SQ_CF_ALLOC_EXPORT_WORD0_RW_GPR(word0); 2934 output->elem_size = G_SQ_CF_ALLOC_EXPORT_WORD0_ELEM_SIZE(word0); 2935 2936 output->swizzle_x = G_SQ_CF_ALLOC_EXPORT_WORD1_SWIZ_SEL_X(word1); 2937 output->swizzle_y = G_SQ_CF_ALLOC_EXPORT_WORD1_SWIZ_SEL_Y(word1); 2938 output->swizzle_z = G_SQ_CF_ALLOC_EXPORT_WORD1_SWIZ_SEL_Z(word1); 2939 output->swizzle_w = G_SQ_CF_ALLOC_EXPORT_WORD1_SWIZ_SEL_W(word1); 2940 output->burst_count = G_SQ_CF_ALLOC_EXPORT_WORD1_BURST_COUNT(word1); 2941 output->end_of_program = G_SQ_CF_ALLOC_EXPORT_WORD1_END_OF_PROGRAM(word1); 2942 output->op = r600_isa_cf_by_opcode(bc->isa, 2943 G_SQ_CF_ALLOC_EXPORT_WORD1_CF_INST(word1), 0); 2944 output->barrier = G_SQ_CF_ALLOC_EXPORT_WORD1_BARRIER(word1); 2945 output->array_size = G_SQ_CF_ALLOC_EXPORT_WORD1_BUF_ARRAY_SIZE(word1); 2946 output->comp_mask = G_SQ_CF_ALLOC_EXPORT_WORD1_BUF_COMP_MASK(word1); 2947} 2948#endif 2949