1/* 2 * Copyright © 2014 Intel Corporation 3 * 4 * Permission is hereby granted, free of charge, to any person obtaining a 5 * copy of this software and associated documentation files (the "Software"), 6 * to deal in the Software without restriction, including without limitation 7 * the rights to use, copy, modify, merge, publish, distribute, sublicense, 8 * and/or sell copies of the Software, and to permit persons to whom the 9 * Software is furnished to do so, subject to the following conditions: 10 * 11 * The above copyright notice and this permission notice (including the next 12 * paragraph) shall be included in all copies or substantial portions of the 13 * Software. 14 * 15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS 21 * IN THE SOFTWARE. 22 */ 23 24/** @file brw_fs_combine_constants.cpp 25 * 26 * This file contains the opt_combine_constants() pass that runs after the 27 * regular optimization loop. It passes over the instruction list and 28 * selectively promotes immediate values to registers by emitting a mov(1) 29 * instruction. 30 * 31 * This is useful on Gen 7 particularly, because a few instructions can be 32 * coissued (i.e., issued in the same cycle as another thread on the same EU 33 * issues an instruction) under some circumstances, one of which is that they 34 * cannot use immediate values. 35 */ 36 37#include "brw_fs.h" 38#include "brw_cfg.h" 39#include "util/half_float.h" 40 41using namespace brw; 42 43static const bool debug = false; 44 45/* Returns whether an instruction could co-issue if its immediate source were 46 * replaced with a GRF source. 47 */ 48static bool 49could_coissue(const struct intel_device_info *devinfo, const fs_inst *inst) 50{ 51 if (devinfo->ver != 7) 52 return false; 53 54 switch (inst->opcode) { 55 case BRW_OPCODE_MOV: 56 case BRW_OPCODE_CMP: 57 case BRW_OPCODE_ADD: 58 case BRW_OPCODE_MUL: 59 /* Only float instructions can coissue. We don't have a great 60 * understanding of whether or not something like float(int(a) + int(b)) 61 * would be considered float (based on the destination type) or integer 62 * (based on the source types), so we take the conservative choice of 63 * only promoting when both destination and source are float. 64 */ 65 return inst->dst.type == BRW_REGISTER_TYPE_F && 66 inst->src[0].type == BRW_REGISTER_TYPE_F; 67 default: 68 return false; 69 } 70} 71 72/** 73 * Returns true for instructions that don't support immediate sources. 74 */ 75static bool 76must_promote_imm(const struct intel_device_info *devinfo, const fs_inst *inst) 77{ 78 switch (inst->opcode) { 79 case SHADER_OPCODE_POW: 80 return devinfo->ver < 8; 81 case BRW_OPCODE_MAD: 82 case BRW_OPCODE_ADD3: 83 case BRW_OPCODE_LRP: 84 return true; 85 default: 86 return false; 87 } 88} 89 90/** A box for putting fs_regs in a linked list. */ 91struct reg_link { 92 DECLARE_RALLOC_CXX_OPERATORS(reg_link) 93 94 reg_link(fs_reg *reg) : reg(reg) {} 95 96 struct exec_node link; 97 fs_reg *reg; 98}; 99 100static struct exec_node * 101link(void *mem_ctx, fs_reg *reg) 102{ 103 reg_link *l = new(mem_ctx) reg_link(reg); 104 return &l->link; 105} 106 107/** 108 * Information about an immediate value. 109 */ 110struct imm { 111 /** The common ancestor of all blocks using this immediate value. */ 112 bblock_t *block; 113 114 /** 115 * The instruction generating the immediate value, if all uses are contained 116 * within a single basic block. Otherwise, NULL. 117 */ 118 fs_inst *inst; 119 120 /** 121 * A list of fs_regs that refer to this immediate. If we promote it, we'll 122 * have to patch these up to refer to the new GRF. 123 */ 124 exec_list *uses; 125 126 /** The immediate value */ 127 union { 128 char bytes[8]; 129 double df; 130 int64_t d64; 131 float f; 132 int32_t d; 133 int16_t w; 134 }; 135 uint8_t size; 136 137 /** When promoting half-float we need to account for certain restrictions */ 138 bool is_half_float; 139 140 /** 141 * The GRF register and subregister number where we've decided to store the 142 * constant value. 143 */ 144 uint8_t subreg_offset; 145 uint16_t nr; 146 147 /** The number of coissuable instructions using this immediate. */ 148 uint16_t uses_by_coissue; 149 150 /** 151 * Whether this constant is used by an instruction that can't handle an 152 * immediate source (and already has to be promoted to a GRF). 153 */ 154 bool must_promote; 155 156 uint16_t first_use_ip; 157 uint16_t last_use_ip; 158}; 159 160/** The working set of information about immediates. */ 161struct table { 162 struct imm *imm; 163 int size; 164 int len; 165}; 166 167static struct imm * 168find_imm(struct table *table, void *data, uint8_t size) 169{ 170 for (int i = 0; i < table->len; i++) { 171 if (table->imm[i].size == size && 172 !memcmp(table->imm[i].bytes, data, size)) { 173 return &table->imm[i]; 174 } 175 } 176 return NULL; 177} 178 179static struct imm * 180new_imm(struct table *table, void *mem_ctx) 181{ 182 if (table->len == table->size) { 183 table->size *= 2; 184 table->imm = reralloc(mem_ctx, table->imm, struct imm, table->size); 185 } 186 return &table->imm[table->len++]; 187} 188 189/** 190 * Comparator used for sorting an array of imm structures. 191 * 192 * We sort by basic block number, then last use IP, then first use IP (least 193 * to greatest). This sorting causes immediates live in the same area to be 194 * allocated to the same register in the hopes that all values will be dead 195 * about the same time and the register can be reused. 196 */ 197static int 198compare(const void *_a, const void *_b) 199{ 200 const struct imm *a = (const struct imm *)_a, 201 *b = (const struct imm *)_b; 202 203 int block_diff = a->block->num - b->block->num; 204 if (block_diff) 205 return block_diff; 206 207 int end_diff = a->last_use_ip - b->last_use_ip; 208 if (end_diff) 209 return end_diff; 210 211 return a->first_use_ip - b->first_use_ip; 212} 213 214static bool 215get_constant_value(const struct intel_device_info *devinfo, 216 const fs_inst *inst, uint32_t src_idx, 217 void *out, brw_reg_type *out_type) 218{ 219 const bool can_do_source_mods = inst->can_do_source_mods(devinfo); 220 const fs_reg *src = &inst->src[src_idx]; 221 222 *out_type = src->type; 223 224 switch (*out_type) { 225 case BRW_REGISTER_TYPE_DF: { 226 double val = !can_do_source_mods ? src->df : fabs(src->df); 227 memcpy(out, &val, 8); 228 break; 229 } 230 case BRW_REGISTER_TYPE_F: { 231 float val = !can_do_source_mods ? src->f : fabsf(src->f); 232 memcpy(out, &val, 4); 233 break; 234 } 235 case BRW_REGISTER_TYPE_HF: { 236 uint16_t val = src->d & 0xffffu; 237 if (can_do_source_mods) 238 val = _mesa_float_to_half(fabsf(_mesa_half_to_float(val))); 239 memcpy(out, &val, 2); 240 break; 241 } 242 case BRW_REGISTER_TYPE_Q: { 243 int64_t val = !can_do_source_mods ? src->d64 : llabs(src->d64); 244 memcpy(out, &val, 8); 245 break; 246 } 247 case BRW_REGISTER_TYPE_UQ: 248 memcpy(out, &src->u64, 8); 249 break; 250 case BRW_REGISTER_TYPE_D: { 251 int32_t val = !can_do_source_mods ? src->d : abs(src->d); 252 memcpy(out, &val, 4); 253 break; 254 } 255 case BRW_REGISTER_TYPE_UD: 256 memcpy(out, &src->ud, 4); 257 break; 258 case BRW_REGISTER_TYPE_W: { 259 int16_t val = src->d & 0xffffu; 260 if (can_do_source_mods) 261 val = abs(val); 262 memcpy(out, &val, 2); 263 break; 264 } 265 case BRW_REGISTER_TYPE_UW: 266 memcpy(out, &src->ud, 2); 267 break; 268 default: 269 return false; 270 }; 271 272 return true; 273} 274 275static struct brw_reg 276build_imm_reg_for_copy(struct imm *imm) 277{ 278 switch (imm->size) { 279 case 8: 280 return brw_imm_d(imm->d64); 281 case 4: 282 return brw_imm_d(imm->d); 283 case 2: 284 return brw_imm_w(imm->w); 285 default: 286 unreachable("not implemented"); 287 } 288} 289 290static inline uint32_t 291get_alignment_for_imm(const struct imm *imm) 292{ 293 if (imm->is_half_float) 294 return 4; /* At least MAD seems to require this */ 295 else 296 return imm->size; 297} 298 299static bool 300needs_negate(const fs_reg *reg, const struct imm *imm) 301{ 302 switch (reg->type) { 303 case BRW_REGISTER_TYPE_DF: 304 return signbit(reg->df) != signbit(imm->df); 305 case BRW_REGISTER_TYPE_F: 306 return signbit(reg->f) != signbit(imm->f); 307 case BRW_REGISTER_TYPE_Q: 308 return (reg->d64 < 0) != (imm->d64 < 0); 309 case BRW_REGISTER_TYPE_D: 310 return (reg->d < 0) != (imm->d < 0); 311 case BRW_REGISTER_TYPE_HF: 312 return (reg->d & 0x8000u) != (imm->w & 0x8000u); 313 case BRW_REGISTER_TYPE_W: 314 return ((int16_t)reg->d < 0) != (imm->w < 0); 315 case BRW_REGISTER_TYPE_UQ: 316 case BRW_REGISTER_TYPE_UD: 317 case BRW_REGISTER_TYPE_UW: 318 return false; 319 default: 320 unreachable("not implemented"); 321 }; 322} 323 324static bool 325representable_as_hf(float f, uint16_t *hf) 326{ 327 union fi u; 328 uint16_t h = _mesa_float_to_half(f); 329 u.f = _mesa_half_to_float(h); 330 331 if (u.f == f) { 332 *hf = h; 333 return true; 334 } 335 336 return false; 337} 338 339static bool 340representable_as_w(int d, int16_t *w) 341{ 342 int res = ((d & 0xffff8000) + 0x8000) & 0xffff7fff; 343 if (!res) { 344 *w = d; 345 return true; 346 } 347 348 return false; 349} 350 351static bool 352representable_as_uw(unsigned ud, uint16_t *uw) 353{ 354 if (!(ud & 0xffff0000)) { 355 *uw = ud; 356 return true; 357 } 358 359 return false; 360} 361 362static bool 363supports_src_as_imm(const struct intel_device_info *devinfo, enum opcode op) 364{ 365 switch (op) { 366 case BRW_OPCODE_ADD3: 367 return devinfo->verx10 >= 125; 368 case BRW_OPCODE_MAD: 369 return devinfo->ver == 12 && devinfo->verx10 < 125; 370 default: 371 return false; 372 } 373} 374 375static bool 376can_promote_src_as_imm(const struct intel_device_info *devinfo, fs_inst *inst, 377 unsigned src_idx) 378{ 379 bool can_promote = false; 380 381 /* Experiment shows that we can only support src0 as immediate */ 382 if (src_idx != 0) 383 return false; 384 385 if (!supports_src_as_imm(devinfo, inst->opcode)) 386 return false; 387 388 /* TODO - Fix the codepath below to use a bfloat16 immediate on XeHP, 389 * since HF/F mixed mode has been removed from the hardware. 390 */ 391 switch (inst->src[src_idx].type) { 392 case BRW_REGISTER_TYPE_F: { 393 uint16_t hf; 394 if (representable_as_hf(inst->src[src_idx].f, &hf)) { 395 inst->src[src_idx] = retype(brw_imm_uw(hf), BRW_REGISTER_TYPE_HF); 396 can_promote = true; 397 } 398 break; 399 } 400 case BRW_REGISTER_TYPE_W: { 401 int16_t w; 402 if (representable_as_w(inst->src[src_idx].d, &w)) { 403 inst->src[src_idx] = brw_imm_w(w); 404 can_promote = true; 405 } 406 break; 407 } 408 case BRW_REGISTER_TYPE_UW: { 409 uint16_t uw; 410 if (representable_as_uw(inst->src[src_idx].ud, &uw)) { 411 inst->src[src_idx] = brw_imm_uw(uw); 412 can_promote = true; 413 } 414 break; 415 } 416 default: 417 break; 418 } 419 420 return can_promote; 421} 422 423bool 424fs_visitor::opt_combine_constants() 425{ 426 void *const_ctx = ralloc_context(NULL); 427 428 struct table table; 429 table.size = 8; 430 table.len = 0; 431 table.imm = ralloc_array(const_ctx, struct imm, table.size); 432 433 const brw::idom_tree &idom = idom_analysis.require(); 434 unsigned ip = -1; 435 436 /* Make a pass through all instructions and count the number of times each 437 * constant is used by coissueable instructions or instructions that cannot 438 * take immediate arguments. 439 */ 440 foreach_block_and_inst(block, fs_inst, inst, cfg) { 441 ip++; 442 443 if (!could_coissue(devinfo, inst) && !must_promote_imm(devinfo, inst)) 444 continue; 445 446 for (int i = 0; i < inst->sources; i++) { 447 if (inst->src[i].file != IMM) 448 continue; 449 450 if (can_promote_src_as_imm(devinfo, inst, i)) 451 continue; 452 453 char data[8]; 454 brw_reg_type type; 455 if (!get_constant_value(devinfo, inst, i, data, &type)) 456 continue; 457 458 uint8_t size = type_sz(type); 459 460 struct imm *imm = find_imm(&table, data, size); 461 462 if (imm) { 463 bblock_t *intersection = idom.intersect(block, imm->block); 464 if (intersection != imm->block) 465 imm->inst = NULL; 466 imm->block = intersection; 467 imm->uses->push_tail(link(const_ctx, &inst->src[i])); 468 imm->uses_by_coissue += could_coissue(devinfo, inst); 469 imm->must_promote = imm->must_promote || must_promote_imm(devinfo, inst); 470 imm->last_use_ip = ip; 471 if (type == BRW_REGISTER_TYPE_HF) 472 imm->is_half_float = true; 473 } else { 474 imm = new_imm(&table, const_ctx); 475 imm->block = block; 476 imm->inst = inst; 477 imm->uses = new(const_ctx) exec_list(); 478 imm->uses->push_tail(link(const_ctx, &inst->src[i])); 479 memcpy(imm->bytes, data, size); 480 imm->size = size; 481 imm->is_half_float = type == BRW_REGISTER_TYPE_HF; 482 imm->uses_by_coissue = could_coissue(devinfo, inst); 483 imm->must_promote = must_promote_imm(devinfo, inst); 484 imm->first_use_ip = ip; 485 imm->last_use_ip = ip; 486 } 487 } 488 } 489 490 /* Remove constants from the table that don't have enough uses to make them 491 * profitable to store in a register. 492 */ 493 for (int i = 0; i < table.len;) { 494 struct imm *imm = &table.imm[i]; 495 496 if (!imm->must_promote && imm->uses_by_coissue < 4) { 497 table.imm[i] = table.imm[table.len - 1]; 498 table.len--; 499 continue; 500 } 501 i++; 502 } 503 if (table.len == 0) { 504 ralloc_free(const_ctx); 505 return false; 506 } 507 if (cfg->num_blocks != 1) 508 qsort(table.imm, table.len, sizeof(struct imm), compare); 509 510 /* Insert MOVs to load the constant values into GRFs. */ 511 fs_reg reg(VGRF, alloc.allocate(1)); 512 reg.stride = 0; 513 for (int i = 0; i < table.len; i++) { 514 struct imm *imm = &table.imm[i]; 515 /* Insert it either before the instruction that generated the immediate 516 * or after the last non-control flow instruction of the common ancestor. 517 */ 518 exec_node *n = (imm->inst ? imm->inst : 519 imm->block->last_non_control_flow_inst()->next); 520 521 /* From the BDW and CHV PRM, 3D Media GPGPU, Special Restrictions: 522 * 523 * "In Align16 mode, the channel selects and channel enables apply to a 524 * pair of half-floats, because these parameters are defined for DWord 525 * elements ONLY. This is applicable when both source and destination 526 * are half-floats." 527 * 528 * This means that Align16 instructions that use promoted HF immediates 529 * and use a <0,1,0>:HF region would read 2 HF slots instead of 530 * replicating the single one we want. To avoid this, we always populate 531 * both HF slots within a DWord with the constant. 532 */ 533 const uint32_t width = devinfo->ver == 8 && imm->is_half_float ? 2 : 1; 534 const fs_builder ibld = bld.at(imm->block, n).exec_all().group(width, 0); 535 536 /* Put the immediate in an offset aligned to its size. Some instructions 537 * seem to have additional alignment requirements, so account for that 538 * too. 539 */ 540 reg.offset = ALIGN(reg.offset, get_alignment_for_imm(imm)); 541 542 /* Ensure we have enough space in the register to copy the immediate */ 543 struct brw_reg imm_reg = build_imm_reg_for_copy(imm); 544 if (reg.offset + type_sz(imm_reg.type) * width > REG_SIZE) { 545 reg.nr = alloc.allocate(1); 546 reg.offset = 0; 547 } 548 549 ibld.MOV(retype(reg, imm_reg.type), imm_reg); 550 imm->nr = reg.nr; 551 imm->subreg_offset = reg.offset; 552 553 reg.offset += imm->size * width; 554 } 555 shader_stats.promoted_constants = table.len; 556 557 /* Rewrite the immediate sources to refer to the new GRFs. */ 558 for (int i = 0; i < table.len; i++) { 559 foreach_list_typed(reg_link, link, link, table.imm[i].uses) { 560 fs_reg *reg = link->reg; 561#ifdef DEBUG 562 switch (reg->type) { 563 case BRW_REGISTER_TYPE_DF: 564 assert((isnan(reg->df) && isnan(table.imm[i].df)) || 565 (fabs(reg->df) == fabs(table.imm[i].df))); 566 break; 567 case BRW_REGISTER_TYPE_F: 568 assert((isnan(reg->f) && isnan(table.imm[i].f)) || 569 (fabsf(reg->f) == fabsf(table.imm[i].f))); 570 break; 571 case BRW_REGISTER_TYPE_HF: 572 assert((isnan(_mesa_half_to_float(reg->d & 0xffffu)) && 573 isnan(_mesa_half_to_float(table.imm[i].w))) || 574 (fabsf(_mesa_half_to_float(reg->d & 0xffffu)) == 575 fabsf(_mesa_half_to_float(table.imm[i].w)))); 576 break; 577 case BRW_REGISTER_TYPE_Q: 578 assert(abs(reg->d64) == abs(table.imm[i].d64)); 579 break; 580 case BRW_REGISTER_TYPE_UQ: 581 assert(reg->d64 == table.imm[i].d64); 582 break; 583 case BRW_REGISTER_TYPE_D: 584 assert(abs(reg->d) == abs(table.imm[i].d)); 585 break; 586 case BRW_REGISTER_TYPE_UD: 587 assert(reg->d == table.imm[i].d); 588 break; 589 case BRW_REGISTER_TYPE_W: 590 assert(abs((int16_t) (reg->d & 0xffff)) == table.imm[i].w); 591 break; 592 case BRW_REGISTER_TYPE_UW: 593 assert((reg->ud & 0xffffu) == (uint16_t) table.imm[i].w); 594 break; 595 default: 596 break; 597 } 598#endif 599 600 reg->file = VGRF; 601 reg->offset = table.imm[i].subreg_offset; 602 reg->stride = 0; 603 reg->negate = needs_negate(reg, &table.imm[i]); 604 reg->nr = table.imm[i].nr; 605 } 606 } 607 608 if (debug) { 609 for (int i = 0; i < table.len; i++) { 610 struct imm *imm = &table.imm[i]; 611 612 printf("0x%016" PRIx64 " - block %3d, reg %3d sub %2d, " 613 "Uses: (%2d, %2d), IP: %4d to %4d, length %4d\n", 614 (uint64_t)(imm->d & BITFIELD64_MASK(imm->size * 8)), 615 imm->block->num, 616 imm->nr, 617 imm->subreg_offset, 618 imm->must_promote, 619 imm->uses_by_coissue, 620 imm->first_use_ip, 621 imm->last_use_ip, 622 imm->last_use_ip - imm->first_use_ip); 623 } 624 } 625 626 ralloc_free(const_ctx); 627 invalidate_analysis(DEPENDENCY_INSTRUCTIONS | DEPENDENCY_VARIABLES); 628 629 return true; 630} 631