1/* 2 * Copyright (C) 2018-2019 Alyssa Rosenzweig <alyssa@rosenzweig.io> 3 * Copyright (C) 2019-2020 Collabora, Ltd. 4 * 5 * Permission is hereby granted, free of charge, to any person obtaining a 6 * copy of this software and associated documentation files (the "Software"), 7 * to deal in the Software without restriction, including without limitation 8 * the rights to use, copy, modify, merge, publish, distribute, sublicense, 9 * and/or sell copies of the Software, and to permit persons to whom the 10 * Software is furnished to do so, subject to the following conditions: 11 * 12 * The above copyright notice and this permission notice (including the next 13 * paragraph) shall be included in all copies or substantial portions of the 14 * Software. 15 * 16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 19 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 22 * SOFTWARE. 23 */ 24 25#include <sys/types.h> 26#include <sys/stat.h> 27#include <sys/mman.h> 28#include <fcntl.h> 29#include <stdint.h> 30#include <stdlib.h> 31#include <stdio.h> 32#include <err.h> 33 34#include "compiler/glsl/glsl_to_nir.h" 35#include "compiler/nir_types.h" 36#include "compiler/nir/nir_builder.h" 37#include "util/half_float.h" 38#include "util/u_math.h" 39#include "util/u_debug.h" 40#include "util/u_dynarray.h" 41#include "util/list.h" 42 43#include "midgard.h" 44#include "midgard_nir.h" 45#include "midgard_compile.h" 46#include "midgard_ops.h" 47#include "helpers.h" 48#include "compiler.h" 49#include "midgard_quirks.h" 50#include "panfrost/util/pan_lower_framebuffer.h" 51 52#include "disassemble.h" 53 54static const struct debug_named_value midgard_debug_options[] = { 55 {"msgs", MIDGARD_DBG_MSGS, "Print debug messages"}, 56 {"shaders", MIDGARD_DBG_SHADERS, "Dump shaders in NIR and MIR"}, 57 {"shaderdb", MIDGARD_DBG_SHADERDB, "Prints shader-db statistics"}, 58 {"inorder", MIDGARD_DBG_INORDER, "Disables out-of-order scheduling"}, 59 {"verbose", MIDGARD_DBG_VERBOSE, "Dump shaders verbosely"}, 60 {"internal", MIDGARD_DBG_INTERNAL, "Dump internal shaders"}, 61 DEBUG_NAMED_VALUE_END 62}; 63 64DEBUG_GET_ONCE_FLAGS_OPTION(midgard_debug, "MIDGARD_MESA_DEBUG", midgard_debug_options, 0) 65 66int midgard_debug = 0; 67 68#define DBG(fmt, ...) 
\ 69 do { if (midgard_debug & MIDGARD_DBG_MSGS) \ 70 fprintf(stderr, "%s:%d: "fmt, \ 71 __FUNCTION__, __LINE__, ##__VA_ARGS__); } while (0) 72static midgard_block * 73create_empty_block(compiler_context *ctx) 74{ 75 midgard_block *blk = rzalloc(ctx, midgard_block); 76 77 blk->base.predecessors = _mesa_set_create(blk, 78 _mesa_hash_pointer, 79 _mesa_key_pointer_equal); 80 81 blk->base.name = ctx->block_source_count++; 82 83 return blk; 84} 85 86static void 87schedule_barrier(compiler_context *ctx) 88{ 89 midgard_block *temp = ctx->after_block; 90 ctx->after_block = create_empty_block(ctx); 91 ctx->block_count++; 92 list_addtail(&ctx->after_block->base.link, &ctx->blocks); 93 list_inithead(&ctx->after_block->base.instructions); 94 pan_block_add_successor(&ctx->current_block->base, &ctx->after_block->base); 95 ctx->current_block = ctx->after_block; 96 ctx->after_block = temp; 97} 98 99/* Helpers to generate midgard_instruction's using macro magic, since every 100 * driver seems to do it that way */ 101 102#define EMIT(op, ...) emit_mir_instruction(ctx, v_##op(__VA_ARGS__)); 103 104#define M_LOAD_STORE(name, store, T) \ 105 static midgard_instruction m_##name(unsigned ssa, unsigned address) { \ 106 midgard_instruction i = { \ 107 .type = TAG_LOAD_STORE_4, \ 108 .mask = 0xF, \ 109 .dest = ~0, \ 110 .src = { ~0, ~0, ~0, ~0 }, \ 111 .swizzle = SWIZZLE_IDENTITY_4, \ 112 .op = midgard_op_##name, \ 113 .load_store = { \ 114 .signed_offset = address \ 115 } \ 116 }; \ 117 \ 118 if (store) { \ 119 i.src[0] = ssa; \ 120 i.src_types[0] = T; \ 121 i.dest_type = T; \ 122 } else { \ 123 i.dest = ssa; \ 124 i.dest_type = T; \ 125 } \ 126 return i; \ 127 } 128 129#define M_LOAD(name, T) M_LOAD_STORE(name, false, T) 130#define M_STORE(name, T) M_LOAD_STORE(name, true, T) 131 132M_LOAD(ld_attr_32, nir_type_uint32); 133M_LOAD(ld_vary_32, nir_type_uint32); 134M_LOAD(ld_ubo_u8, nir_type_uint32); /* mandatory extension to 32-bit */ 135M_LOAD(ld_ubo_u16, nir_type_uint32); 136M_LOAD(ld_ubo_32, nir_type_uint32); 137M_LOAD(ld_ubo_64, nir_type_uint32); 138M_LOAD(ld_ubo_128, nir_type_uint32); 139M_LOAD(ld_u8, nir_type_uint8); 140M_LOAD(ld_u16, nir_type_uint16); 141M_LOAD(ld_32, nir_type_uint32); 142M_LOAD(ld_64, nir_type_uint32); 143M_LOAD(ld_128, nir_type_uint32); 144M_STORE(st_u8, nir_type_uint8); 145M_STORE(st_u16, nir_type_uint16); 146M_STORE(st_32, nir_type_uint32); 147M_STORE(st_64, nir_type_uint32); 148M_STORE(st_128, nir_type_uint32); 149M_LOAD(ld_tilebuffer_raw, nir_type_uint32); 150M_LOAD(ld_tilebuffer_16f, nir_type_float16); 151M_LOAD(ld_tilebuffer_32f, nir_type_float32); 152M_STORE(st_vary_32, nir_type_uint32); 153M_LOAD(ld_cubemap_coords, nir_type_uint32); 154M_LOAD(ldst_mov, nir_type_uint32); 155M_LOAD(ld_image_32f, nir_type_float32); 156M_LOAD(ld_image_16f, nir_type_float16); 157M_LOAD(ld_image_32u, nir_type_uint32); 158M_LOAD(ld_image_32i, nir_type_int32); 159M_STORE(st_image_32f, nir_type_float32); 160M_STORE(st_image_16f, nir_type_float16); 161M_STORE(st_image_32u, nir_type_uint32); 162M_STORE(st_image_32i, nir_type_int32); 163M_LOAD(lea_image, nir_type_uint64); 164 165#define M_IMAGE(op) \ 166static midgard_instruction \ 167op ## _image(nir_alu_type type, unsigned val, unsigned address) \ 168{ \ 169 switch (type) { \ 170 case nir_type_float32: \ 171 return m_ ## op ## _image_32f(val, address); \ 172 case nir_type_float16: \ 173 return m_ ## op ## _image_16f(val, address); \ 174 case nir_type_uint32: \ 175 return m_ ## op ## _image_32u(val, address); \ 176 case nir_type_int32: \ 177 return m_ ## op ## 
_image_32i(val, address); \ 178 default: \ 179 unreachable("Invalid image type"); \ 180 } \ 181} 182 183M_IMAGE(ld); 184M_IMAGE(st); 185 186static midgard_instruction 187v_branch(bool conditional, bool invert) 188{ 189 midgard_instruction ins = { 190 .type = TAG_ALU_4, 191 .unit = ALU_ENAB_BRANCH, 192 .compact_branch = true, 193 .branch = { 194 .conditional = conditional, 195 .invert_conditional = invert 196 }, 197 .dest = ~0, 198 .src = { ~0, ~0, ~0, ~0 }, 199 }; 200 201 return ins; 202} 203 204static void 205attach_constants(compiler_context *ctx, midgard_instruction *ins, void *constants, int name) 206{ 207 ins->has_constants = true; 208 memcpy(&ins->constants, constants, 16); 209} 210 211static int 212glsl_type_size(const struct glsl_type *type, bool bindless) 213{ 214 return glsl_count_attribute_slots(type, false); 215} 216 217static bool 218midgard_nir_lower_global_load_instr(nir_builder *b, nir_instr *instr, void *data) 219{ 220 if (instr->type != nir_instr_type_intrinsic) 221 return false; 222 223 nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr); 224 if (intr->intrinsic != nir_intrinsic_load_global && 225 intr->intrinsic != nir_intrinsic_load_shared) 226 return false; 227 228 unsigned compsz = nir_dest_bit_size(intr->dest); 229 unsigned totalsz = compsz * nir_dest_num_components(intr->dest); 230 /* 8, 16, 32, 64 and 128 bit loads don't need to be lowered */ 231 if (util_bitcount(totalsz) < 2 && totalsz <= 128) 232 return false; 233 234 b->cursor = nir_before_instr(instr); 235 236 assert(intr->src[0].is_ssa); 237 nir_ssa_def *addr = intr->src[0].ssa; 238 239 nir_ssa_def *comps[MIR_VEC_COMPONENTS]; 240 unsigned ncomps = 0; 241 242 while (totalsz) { 243 unsigned loadsz = MIN2(1 << (util_last_bit(totalsz) - 1), 128); 244 unsigned loadncomps = loadsz / compsz; 245 246 nir_ssa_def *load; 247 if (intr->intrinsic == nir_intrinsic_load_global) { 248 load = nir_load_global(b, addr, compsz / 8, loadncomps, compsz); 249 } else { 250 assert(intr->intrinsic == nir_intrinsic_load_shared); 251 nir_intrinsic_instr *shared_load = 252 nir_intrinsic_instr_create(b->shader, nir_intrinsic_load_shared); 253 shared_load->num_components = loadncomps; 254 shared_load->src[0] = nir_src_for_ssa(addr); 255 nir_intrinsic_set_align(shared_load, compsz / 8, 0); 256 nir_intrinsic_set_base(shared_load, nir_intrinsic_base(intr)); 257 nir_ssa_dest_init(&shared_load->instr, &shared_load->dest, 258 shared_load->num_components, compsz, NULL); 259 nir_builder_instr_insert(b, &shared_load->instr); 260 load = &shared_load->dest.ssa; 261 } 262 263 for (unsigned i = 0; i < loadncomps; i++) 264 comps[ncomps++] = nir_channel(b, load, i); 265 266 totalsz -= loadsz; 267 addr = nir_iadd(b, addr, nir_imm_intN_t(b, loadsz / 8, addr->bit_size)); 268 } 269 270 assert(ncomps == nir_dest_num_components(intr->dest)); 271 nir_ssa_def_rewrite_uses(&intr->dest.ssa, nir_vec(b, comps, ncomps)); 272 273 return true; 274} 275 276static bool 277midgard_nir_lower_global_load(nir_shader *shader) 278{ 279 return nir_shader_instructions_pass(shader, 280 midgard_nir_lower_global_load_instr, 281 nir_metadata_block_index | nir_metadata_dominance, 282 NULL); 283} 284 285static bool 286mdg_should_scalarize(const nir_instr *instr, const void *_unused) 287{ 288 const nir_alu_instr *alu = nir_instr_as_alu(instr); 289 290 if (nir_src_bit_size(alu->src[0].src) == 64) 291 return true; 292 293 if (nir_dest_bit_size(alu->dest.dest) == 64) 294 return true; 295 296 switch (alu->op) { 297 case nir_op_fdot2: 298 case nir_op_umul_high: 299 case 
nir_op_imul_high: 300 case nir_op_pack_half_2x16: 301 case nir_op_unpack_half_2x16: 302 return true; 303 default: 304 return false; 305 } 306} 307 308/* Only vectorize int64 up to vec2 */ 309static uint8_t 310midgard_vectorize_filter(const nir_instr *instr, const void *data) 311{ 312 if (instr->type != nir_instr_type_alu) 313 return 0; 314 315 const nir_alu_instr *alu = nir_instr_as_alu(instr); 316 int src_bit_size = nir_src_bit_size(alu->src[0].src); 317 int dst_bit_size = nir_dest_bit_size(alu->dest.dest); 318 319 if (src_bit_size == 64 || dst_bit_size == 64) 320 return 2; 321 322 return 4; 323} 324 325static void 326optimise_nir(nir_shader *nir, unsigned quirks, bool is_blend, bool is_blit) 327{ 328 bool progress; 329 unsigned lower_flrp = 330 (nir->options->lower_flrp16 ? 16 : 0) | 331 (nir->options->lower_flrp32 ? 32 : 0) | 332 (nir->options->lower_flrp64 ? 64 : 0); 333 334 NIR_PASS(progress, nir, nir_lower_regs_to_ssa); 335 nir_lower_idiv_options idiv_options = { 336 .imprecise_32bit_lowering = true, 337 .allow_fp16 = true, 338 }; 339 NIR_PASS(progress, nir, nir_lower_idiv, &idiv_options); 340 341 nir_lower_tex_options lower_tex_options = { 342 .lower_txs_lod = true, 343 .lower_txp = ~0, 344 .lower_tg4_broadcom_swizzle = true, 345 /* TODO: we have native gradient.. */ 346 .lower_txd = true, 347 .lower_invalid_implicit_lod = true, 348 }; 349 350 NIR_PASS(progress, nir, nir_lower_tex, &lower_tex_options); 351 352 353 /* TEX_GRAD fails to apply sampler descriptor settings on some 354 * implementations, requiring a lowering. However, blit shaders do not 355 * use the affected settings and should skip the workaround. 356 */ 357 if ((quirks & MIDGARD_BROKEN_LOD) && !is_blit) 358 NIR_PASS_V(nir, midgard_nir_lod_errata); 359 360 /* Midgard image ops coordinates are 16-bit instead of 32-bit */ 361 NIR_PASS(progress, nir, midgard_nir_lower_image_bitsize); 362 NIR_PASS(progress, nir, midgard_nir_lower_helper_writes); 363 NIR_PASS(progress, nir, pan_lower_helper_invocation); 364 NIR_PASS(progress, nir, pan_lower_sample_pos); 365 366 if (nir->xfb_info != NULL && nir->info.has_transform_feedback_varyings) { 367 NIR_PASS_V(nir, nir_io_add_const_offset_to_base, 368 nir_var_shader_in | nir_var_shader_out); 369 NIR_PASS_V(nir, nir_io_add_intrinsic_xfb_info); 370 NIR_PASS_V(nir, pan_lower_xfb); 371 } 372 373 NIR_PASS(progress, nir, midgard_nir_lower_algebraic_early); 374 NIR_PASS_V(nir, nir_lower_alu_to_scalar, mdg_should_scalarize, NULL); 375 376 do { 377 progress = false; 378 379 NIR_PASS(progress, nir, nir_lower_var_copies); 380 NIR_PASS(progress, nir, nir_lower_vars_to_ssa); 381 382 NIR_PASS(progress, nir, nir_copy_prop); 383 NIR_PASS(progress, nir, nir_opt_remove_phis); 384 NIR_PASS(progress, nir, nir_opt_dce); 385 NIR_PASS(progress, nir, nir_opt_dead_cf); 386 NIR_PASS(progress, nir, nir_opt_cse); 387 NIR_PASS(progress, nir, nir_opt_peephole_select, 64, false, true); 388 NIR_PASS(progress, nir, nir_opt_algebraic); 389 NIR_PASS(progress, nir, nir_opt_constant_folding); 390 391 if (lower_flrp != 0) { 392 bool lower_flrp_progress = false; 393 NIR_PASS(lower_flrp_progress, 394 nir, 395 nir_lower_flrp, 396 lower_flrp, 397 false /* always_precise */); 398 if (lower_flrp_progress) { 399 NIR_PASS(progress, nir, 400 nir_opt_constant_folding); 401 progress = true; 402 } 403 404 /* Nothing should rematerialize any flrps, so we only 405 * need to do this lowering once. 
406 */ 407 lower_flrp = 0; 408 } 409 410 NIR_PASS(progress, nir, nir_opt_undef); 411 NIR_PASS(progress, nir, nir_lower_undef_to_zero); 412 413 NIR_PASS(progress, nir, nir_opt_loop_unroll); 414 415 NIR_PASS(progress, nir, nir_opt_vectorize, 416 midgard_vectorize_filter, NULL); 417 } while (progress); 418 419 NIR_PASS_V(nir, nir_lower_alu_to_scalar, mdg_should_scalarize, NULL); 420 421 /* Run after opts so it can hit more */ 422 if (!is_blend) 423 NIR_PASS(progress, nir, nir_fuse_io_16); 424 425 /* Must be run at the end to prevent creation of fsin/fcos ops */ 426 NIR_PASS(progress, nir, midgard_nir_scale_trig); 427 428 do { 429 progress = false; 430 431 NIR_PASS(progress, nir, nir_opt_dce); 432 NIR_PASS(progress, nir, nir_opt_algebraic); 433 NIR_PASS(progress, nir, nir_opt_constant_folding); 434 NIR_PASS(progress, nir, nir_copy_prop); 435 } while (progress); 436 437 NIR_PASS(progress, nir, nir_opt_algebraic_late); 438 NIR_PASS(progress, nir, nir_opt_algebraic_distribute_src_mods); 439 440 /* We implement booleans as 32-bit 0/~0 */ 441 NIR_PASS(progress, nir, nir_lower_bool_to_int32); 442 443 /* Now that booleans are lowered, we can run out late opts */ 444 NIR_PASS(progress, nir, midgard_nir_lower_algebraic_late); 445 NIR_PASS(progress, nir, midgard_nir_cancel_inot); 446 447 NIR_PASS(progress, nir, nir_copy_prop); 448 NIR_PASS(progress, nir, nir_opt_dce); 449 450 /* Backend scheduler is purely local, so do some global optimizations 451 * to reduce register pressure. */ 452 nir_move_options move_all = 453 nir_move_const_undef | nir_move_load_ubo | nir_move_load_input | 454 nir_move_comparisons | nir_move_copies | nir_move_load_ssbo; 455 456 NIR_PASS_V(nir, nir_opt_sink, move_all); 457 NIR_PASS_V(nir, nir_opt_move, move_all); 458 459 /* Take us out of SSA */ 460 NIR_PASS(progress, nir, nir_lower_locals_to_regs); 461 NIR_PASS(progress, nir, nir_convert_from_ssa, true); 462 463 /* We are a vector architecture; write combine where possible */ 464 NIR_PASS(progress, nir, nir_move_vec_src_uses_to_dest); 465 NIR_PASS(progress, nir, nir_lower_vec_to_movs, NULL, NULL); 466 467 NIR_PASS(progress, nir, nir_opt_dce); 468} 469 470/* Do not actually emit a load; instead, cache the constant for inlining */ 471 472static void 473emit_load_const(compiler_context *ctx, nir_load_const_instr *instr) 474{ 475 nir_ssa_def def = instr->def; 476 477 midgard_constants *consts = rzalloc(ctx, midgard_constants); 478 479 assert(instr->def.num_components * instr->def.bit_size <= sizeof(*consts) * 8); 480 481#define RAW_CONST_COPY(bits) \ 482 nir_const_value_to_array(consts->u##bits, instr->value, \ 483 instr->def.num_components, u##bits) 484 485 switch (instr->def.bit_size) { 486 case 64: 487 RAW_CONST_COPY(64); 488 break; 489 case 32: 490 RAW_CONST_COPY(32); 491 break; 492 case 16: 493 RAW_CONST_COPY(16); 494 break; 495 case 8: 496 RAW_CONST_COPY(8); 497 break; 498 default: 499 unreachable("Invalid bit_size for load_const instruction\n"); 500 } 501 502 /* Shifted for SSA, +1 for off-by-one */ 503 _mesa_hash_table_u64_insert(ctx->ssa_constants, (def.index << 1) + 1, consts); 504} 505 506/* Normally constants are embedded implicitly, but for I/O and such we have to 507 * explicitly emit a move with the constant source */ 508 509static void 510emit_explicit_constant(compiler_context *ctx, unsigned node, unsigned to) 511{ 512 void *constant_value = _mesa_hash_table_u64_search(ctx->ssa_constants, node + 1); 513 514 if (constant_value) { 515 midgard_instruction ins = v_mov(SSA_FIXED_REGISTER(REGISTER_CONSTANT), to); 516 
attach_constants(ctx, &ins, constant_value, node + 1); 517 emit_mir_instruction(ctx, ins); 518 } 519} 520 521static bool 522nir_is_non_scalar_swizzle(nir_alu_src *src, unsigned nr_components) 523{ 524 unsigned comp = src->swizzle[0]; 525 526 for (unsigned c = 1; c < nr_components; ++c) { 527 if (src->swizzle[c] != comp) 528 return true; 529 } 530 531 return false; 532} 533 534#define ATOMIC_CASE_IMPL(ctx, instr, nir, op, is_shared) \ 535 case nir_intrinsic_##nir: \ 536 emit_atomic(ctx, instr, is_shared, midgard_op_##op, ~0); \ 537 break; 538 539#define ATOMIC_CASE(ctx, instr, nir, op) \ 540 ATOMIC_CASE_IMPL(ctx, instr, shared_atomic_##nir, atomic_##op, true); \ 541 ATOMIC_CASE_IMPL(ctx, instr, global_atomic_##nir, atomic_##op, false); 542 543#define IMAGE_ATOMIC_CASE(ctx, instr, nir, op) \ 544 case nir_intrinsic_image_atomic_##nir: { \ 545 midgard_instruction ins = emit_image_op(ctx, instr, true); \ 546 emit_atomic(ctx, instr, false, midgard_op_atomic_##op, ins.dest); \ 547 break; \ 548 } 549 550#define ALU_CASE(nir, _op) \ 551 case nir_op_##nir: \ 552 op = midgard_alu_op_##_op; \ 553 assert(src_bitsize == dst_bitsize); \ 554 break; 555 556#define ALU_CASE_RTZ(nir, _op) \ 557 case nir_op_##nir: \ 558 op = midgard_alu_op_##_op; \ 559 roundmode = MIDGARD_RTZ; \ 560 break; 561 562#define ALU_CHECK_CMP() \ 563 assert(src_bitsize == 16 || src_bitsize == 32 || src_bitsize == 64); \ 564 assert(dst_bitsize == 16 || dst_bitsize == 32); \ 565 566#define ALU_CASE_BCAST(nir, _op, count) \ 567 case nir_op_##nir: \ 568 op = midgard_alu_op_##_op; \ 569 broadcast_swizzle = count; \ 570 ALU_CHECK_CMP(); \ 571 break; 572 573#define ALU_CASE_CMP(nir, _op) \ 574 case nir_op_##nir: \ 575 op = midgard_alu_op_##_op; \ 576 ALU_CHECK_CMP(); \ 577 break; 578 579/* Compare mir_lower_invert */ 580static bool 581nir_accepts_inot(nir_op op, unsigned src) 582{ 583 switch (op) { 584 case nir_op_ior: 585 case nir_op_iand: /* TODO: b2f16 */ 586 case nir_op_ixor: 587 return true; 588 case nir_op_b32csel: 589 /* Only the condition */ 590 return (src == 0); 591 default: 592 return false; 593 } 594} 595 596static bool 597mir_accept_dest_mod(compiler_context *ctx, nir_dest **dest, nir_op op) 598{ 599 if (pan_has_dest_mod(dest, op)) { 600 assert((*dest)->is_ssa); 601 BITSET_SET(ctx->already_emitted, (*dest)->ssa.index); 602 return true; 603 } 604 605 return false; 606} 607 608/* Look for floating point mods. We have the mods clamp_m1_1, clamp_0_1, 609 * and clamp_0_inf. We also have the relations (note 3 * 2 = 6 cases): 610 * 611 * clamp_0_1(clamp_0_inf(x)) = clamp_m1_1(x) 612 * clamp_0_1(clamp_m1_1(x)) = clamp_m1_1(x) 613 * clamp_0_inf(clamp_0_1(x)) = clamp_m1_1(x) 614 * clamp_0_inf(clamp_m1_1(x)) = clamp_m1_1(x) 615 * clamp_m1_1(clamp_0_1(x)) = clamp_m1_1(x) 616 * clamp_m1_1(clamp_0_inf(x)) = clamp_m1_1(x) 617 * 618 * So by cases any composition of output modifiers is equivalent to 619 * clamp_m1_1 alone. 620 */ 621static unsigned 622mir_determine_float_outmod(compiler_context *ctx, nir_dest **dest, unsigned prior_outmod) 623{ 624 bool clamp_0_inf = mir_accept_dest_mod(ctx, dest, nir_op_fclamp_pos_mali); 625 bool clamp_0_1 = mir_accept_dest_mod(ctx, dest, nir_op_fsat); 626 bool clamp_m1_1 = mir_accept_dest_mod(ctx, dest, nir_op_fsat_signed_mali); 627 bool prior = (prior_outmod != midgard_outmod_none); 628 int count = (int) prior + (int) clamp_0_inf + (int) clamp_0_1 + (int) clamp_m1_1; 629 630 return ((count > 1) || clamp_0_1) ? midgard_outmod_clamp_0_1 : 631 clamp_0_inf ? midgard_outmod_clamp_0_inf : 632 clamp_m1_1 ? 
midgard_outmod_clamp_m1_1 : 633 prior_outmod; 634} 635 636static void 637mir_copy_src(midgard_instruction *ins, nir_alu_instr *instr, unsigned i, unsigned to, bool *abs, bool *neg, bool *not, enum midgard_roundmode *roundmode, bool is_int, unsigned bcast_count) 638{ 639 nir_alu_src src = instr->src[i]; 640 641 if (!is_int) { 642 if (pan_has_source_mod(&src, nir_op_fneg)) 643 *neg = !(*neg); 644 645 if (pan_has_source_mod(&src, nir_op_fabs)) 646 *abs = true; 647 } 648 649 if (nir_accepts_inot(instr->op, i) && pan_has_source_mod(&src, nir_op_inot)) 650 *not = true; 651 652 if (roundmode) { 653 if (pan_has_source_mod(&src, nir_op_fround_even)) 654 *roundmode = MIDGARD_RTE; 655 656 if (pan_has_source_mod(&src, nir_op_ftrunc)) 657 *roundmode = MIDGARD_RTZ; 658 659 if (pan_has_source_mod(&src, nir_op_ffloor)) 660 *roundmode = MIDGARD_RTN; 661 662 if (pan_has_source_mod(&src, nir_op_fceil)) 663 *roundmode = MIDGARD_RTP; 664 } 665 666 unsigned bits = nir_src_bit_size(src.src); 667 668 ins->src[to] = nir_src_index(NULL, &src.src); 669 ins->src_types[to] = nir_op_infos[instr->op].input_types[i] | bits; 670 671 for (unsigned c = 0; c < NIR_MAX_VEC_COMPONENTS; ++c) { 672 ins->swizzle[to][c] = src.swizzle[ 673 (!bcast_count || c < bcast_count) ? c : 674 (bcast_count - 1)]; 675 } 676} 677 678/* Midgard features both fcsel and icsel, depending on whether you want int or 679 * float modifiers. NIR's csel is typeless, so we want a heuristic to guess if 680 * we should emit an int or float csel depending on what modifiers could be 681 * placed. In the absense of modifiers, this is probably arbitrary. */ 682 683static bool 684mir_is_bcsel_float(nir_alu_instr *instr) 685{ 686 nir_op intmods[] = { 687 nir_op_i2i8, nir_op_i2i16, 688 nir_op_i2i32, nir_op_i2i64 689 }; 690 691 nir_op floatmods[] = { 692 nir_op_fabs, nir_op_fneg, 693 nir_op_f2f16, nir_op_f2f32, 694 nir_op_f2f64 695 }; 696 697 nir_op floatdestmods[] = { 698 nir_op_fsat, nir_op_fsat_signed_mali, nir_op_fclamp_pos_mali, 699 nir_op_f2f16, nir_op_f2f32 700 }; 701 702 signed score = 0; 703 704 for (unsigned i = 1; i < 3; ++i) { 705 nir_alu_src s = instr->src[i]; 706 for (unsigned q = 0; q < ARRAY_SIZE(intmods); ++q) { 707 if (pan_has_source_mod(&s, intmods[q])) 708 score--; 709 } 710 } 711 712 for (unsigned i = 1; i < 3; ++i) { 713 nir_alu_src s = instr->src[i]; 714 for (unsigned q = 0; q < ARRAY_SIZE(floatmods); ++q) { 715 if (pan_has_source_mod(&s, floatmods[q])) 716 score++; 717 } 718 } 719 720 for (unsigned q = 0; q < ARRAY_SIZE(floatdestmods); ++q) { 721 nir_dest *dest = &instr->dest.dest; 722 if (pan_has_dest_mod(&dest, floatdestmods[q])) 723 score++; 724 } 725 726 return (score > 0); 727} 728 729static void 730emit_alu(compiler_context *ctx, nir_alu_instr *instr) 731{ 732 nir_dest *dest = &instr->dest.dest; 733 734 if (dest->is_ssa && BITSET_TEST(ctx->already_emitted, dest->ssa.index)) 735 return; 736 737 /* Derivatives end up emitted on the texture pipe, not the ALUs. This 738 * is handled elsewhere */ 739 740 if (instr->op == nir_op_fddx || instr->op == nir_op_fddy) { 741 midgard_emit_derivatives(ctx, instr); 742 return; 743 } 744 745 bool is_ssa = dest->is_ssa; 746 747 unsigned nr_components = nir_dest_num_components(*dest); 748 unsigned nr_inputs = nir_op_infos[instr->op].num_inputs; 749 unsigned op = 0; 750 751 /* Number of components valid to check for the instruction (the rest 752 * will be forced to the last), or 0 to use as-is. 
Relevant as 753 * ball-type instructions have a channel count in NIR but are all vec4 754 * in Midgard */ 755 756 unsigned broadcast_swizzle = 0; 757 758 /* Should we swap arguments? */ 759 bool flip_src12 = false; 760 761 ASSERTED unsigned src_bitsize = nir_src_bit_size(instr->src[0].src); 762 ASSERTED unsigned dst_bitsize = nir_dest_bit_size(*dest); 763 764 enum midgard_roundmode roundmode = MIDGARD_RTE; 765 766 switch (instr->op) { 767 ALU_CASE(fadd, fadd); 768 ALU_CASE(fmul, fmul); 769 ALU_CASE(fmin, fmin); 770 ALU_CASE(fmax, fmax); 771 ALU_CASE(imin, imin); 772 ALU_CASE(imax, imax); 773 ALU_CASE(umin, umin); 774 ALU_CASE(umax, umax); 775 ALU_CASE(ffloor, ffloor); 776 ALU_CASE(fround_even, froundeven); 777 ALU_CASE(ftrunc, ftrunc); 778 ALU_CASE(fceil, fceil); 779 ALU_CASE(fdot3, fdot3); 780 ALU_CASE(fdot4, fdot4); 781 ALU_CASE(iadd, iadd); 782 ALU_CASE(isub, isub); 783 ALU_CASE(iadd_sat, iaddsat); 784 ALU_CASE(isub_sat, isubsat); 785 ALU_CASE(uadd_sat, uaddsat); 786 ALU_CASE(usub_sat, usubsat); 787 ALU_CASE(imul, imul); 788 ALU_CASE(imul_high, imul); 789 ALU_CASE(umul_high, imul); 790 ALU_CASE(uclz, iclz); 791 792 /* Zero shoved as second-arg */ 793 ALU_CASE(iabs, iabsdiff); 794 795 ALU_CASE(uabs_isub, iabsdiff); 796 ALU_CASE(uabs_usub, uabsdiff); 797 798 ALU_CASE(mov, imov); 799 800 ALU_CASE_CMP(feq32, feq); 801 ALU_CASE_CMP(fneu32, fne); 802 ALU_CASE_CMP(flt32, flt); 803 ALU_CASE_CMP(ieq32, ieq); 804 ALU_CASE_CMP(ine32, ine); 805 ALU_CASE_CMP(ilt32, ilt); 806 ALU_CASE_CMP(ult32, ult); 807 808 /* We don't have a native b2f32 instruction. Instead, like many 809 * GPUs, we exploit booleans as 0/~0 for false/true, and 810 * correspondingly AND 811 * by 1.0 to do the type conversion. For the moment, prime us 812 * to emit: 813 * 814 * iand [whatever], #0 815 * 816 * At the end of emit_alu (as MIR), we'll fix-up the constant 817 */ 818 819 ALU_CASE_CMP(b2f32, iand); 820 ALU_CASE_CMP(b2f16, iand); 821 ALU_CASE_CMP(b2i32, iand); 822 823 /* Likewise, we don't have a dedicated f2b32 instruction, but 824 * we can do a "not equal to 0.0" test. 
*/ 825 826 ALU_CASE_CMP(f2b32, fne); 827 ALU_CASE_CMP(i2b32, ine); 828 829 ALU_CASE(frcp, frcp); 830 ALU_CASE(frsq, frsqrt); 831 ALU_CASE(fsqrt, fsqrt); 832 ALU_CASE(fexp2, fexp2); 833 ALU_CASE(flog2, flog2); 834 835 ALU_CASE_RTZ(f2i64, f2i_rte); 836 ALU_CASE_RTZ(f2u64, f2u_rte); 837 ALU_CASE_RTZ(i2f64, i2f_rte); 838 ALU_CASE_RTZ(u2f64, u2f_rte); 839 840 ALU_CASE_RTZ(f2i32, f2i_rte); 841 ALU_CASE_RTZ(f2u32, f2u_rte); 842 ALU_CASE_RTZ(i2f32, i2f_rte); 843 ALU_CASE_RTZ(u2f32, u2f_rte); 844 845 ALU_CASE_RTZ(f2i8, f2i_rte); 846 ALU_CASE_RTZ(f2u8, f2u_rte); 847 848 ALU_CASE_RTZ(f2i16, f2i_rte); 849 ALU_CASE_RTZ(f2u16, f2u_rte); 850 ALU_CASE_RTZ(i2f16, i2f_rte); 851 ALU_CASE_RTZ(u2f16, u2f_rte); 852 853 ALU_CASE(fsin, fsinpi); 854 ALU_CASE(fcos, fcospi); 855 856 /* We'll get 0 in the second arg, so: 857 * ~a = ~(a | 0) = nor(a, 0) */ 858 ALU_CASE(inot, inor); 859 ALU_CASE(iand, iand); 860 ALU_CASE(ior, ior); 861 ALU_CASE(ixor, ixor); 862 ALU_CASE(ishl, ishl); 863 ALU_CASE(ishr, iasr); 864 ALU_CASE(ushr, ilsr); 865 866 ALU_CASE_BCAST(b32all_fequal2, fball_eq, 2); 867 ALU_CASE_BCAST(b32all_fequal3, fball_eq, 3); 868 ALU_CASE_CMP(b32all_fequal4, fball_eq); 869 870 ALU_CASE_BCAST(b32any_fnequal2, fbany_neq, 2); 871 ALU_CASE_BCAST(b32any_fnequal3, fbany_neq, 3); 872 ALU_CASE_CMP(b32any_fnequal4, fbany_neq); 873 874 ALU_CASE_BCAST(b32all_iequal2, iball_eq, 2); 875 ALU_CASE_BCAST(b32all_iequal3, iball_eq, 3); 876 ALU_CASE_CMP(b32all_iequal4, iball_eq); 877 878 ALU_CASE_BCAST(b32any_inequal2, ibany_neq, 2); 879 ALU_CASE_BCAST(b32any_inequal3, ibany_neq, 3); 880 ALU_CASE_CMP(b32any_inequal4, ibany_neq); 881 882 /* Source mods will be shoved in later */ 883 ALU_CASE(fabs, fmov); 884 ALU_CASE(fneg, fmov); 885 ALU_CASE(fsat, fmov); 886 ALU_CASE(fsat_signed_mali, fmov); 887 ALU_CASE(fclamp_pos_mali, fmov); 888 889 /* For size conversion, we use a move. Ideally though we would squash 890 * these ops together; maybe that has to happen after in NIR as part of 891 * propagation...? An earlier algebraic pass ensured we step down by 892 * only / exactly one size. If stepping down, we use a dest override to 893 * reduce the size; if stepping up, we use a larger-sized move with a 894 * half source and a sign/zero-extension modifier */ 895 896 case nir_op_i2i8: 897 case nir_op_i2i16: 898 case nir_op_i2i32: 899 case nir_op_i2i64: 900 case nir_op_u2u8: 901 case nir_op_u2u16: 902 case nir_op_u2u32: 903 case nir_op_u2u64: 904 case nir_op_f2f16: 905 case nir_op_f2f32: 906 case nir_op_f2f64: { 907 if (instr->op == nir_op_f2f16 || instr->op == nir_op_f2f32 || 908 instr->op == nir_op_f2f64) 909 op = midgard_alu_op_fmov; 910 else 911 op = midgard_alu_op_imov; 912 913 break; 914 } 915 916 /* For greater-or-equal, we lower to less-or-equal and flip the 917 * arguments */ 918 919 case nir_op_fge: 920 case nir_op_fge32: 921 case nir_op_ige32: 922 case nir_op_uge32: { 923 op = 924 instr->op == nir_op_fge ? midgard_alu_op_fle : 925 instr->op == nir_op_fge32 ? midgard_alu_op_fle : 926 instr->op == nir_op_ige32 ? midgard_alu_op_ile : 927 instr->op == nir_op_uge32 ? midgard_alu_op_ule : 928 0; 929 930 flip_src12 = true; 931 ALU_CHECK_CMP(); 932 break; 933 } 934 935 case nir_op_b32csel: { 936 bool mixed = nir_is_non_scalar_swizzle(&instr->src[0], nr_components); 937 bool is_float = mir_is_bcsel_float(instr); 938 op = is_float ? 939 (mixed ? midgard_alu_op_fcsel_v : midgard_alu_op_fcsel) : 940 (mixed ? 
midgard_alu_op_icsel_v : midgard_alu_op_icsel); 941 942 break; 943 } 944 945 case nir_op_unpack_32_2x16: 946 case nir_op_unpack_32_4x8: 947 case nir_op_pack_32_2x16: 948 case nir_op_pack_32_4x8: { 949 op = midgard_alu_op_imov; 950 break; 951 } 952 953 default: 954 mesa_loge("Unhandled ALU op %s\n", nir_op_infos[instr->op].name); 955 assert(0); 956 return; 957 } 958 959 /* Promote imov to fmov if it might help inline a constant */ 960 if (op == midgard_alu_op_imov && nir_src_is_const(instr->src[0].src) 961 && nir_src_bit_size(instr->src[0].src) == 32 962 && nir_is_same_comp_swizzle(instr->src[0].swizzle, 963 nir_src_num_components(instr->src[0].src))) { 964 op = midgard_alu_op_fmov; 965 } 966 967 /* Midgard can perform certain modifiers on output of an ALU op */ 968 969 unsigned outmod = 0; 970 bool is_int = midgard_is_integer_op(op); 971 972 if (instr->op == nir_op_umul_high || instr->op == nir_op_imul_high) { 973 outmod = midgard_outmod_keephi; 974 } else if (midgard_is_integer_out_op(op)) { 975 outmod = midgard_outmod_keeplo; 976 } else if (instr->op == nir_op_fsat) { 977 outmod = midgard_outmod_clamp_0_1; 978 } else if (instr->op == nir_op_fsat_signed_mali) { 979 outmod = midgard_outmod_clamp_m1_1; 980 } else if (instr->op == nir_op_fclamp_pos_mali) { 981 outmod = midgard_outmod_clamp_0_inf; 982 } 983 984 /* Fetch unit, quirks, etc information */ 985 unsigned opcode_props = alu_opcode_props[op].props; 986 bool quirk_flipped_r24 = opcode_props & QUIRK_FLIPPED_R24; 987 988 if (!midgard_is_integer_out_op(op)) { 989 outmod = mir_determine_float_outmod(ctx, &dest, outmod); 990 } 991 992 midgard_instruction ins = { 993 .type = TAG_ALU_4, 994 .dest = nir_dest_index(dest), 995 .dest_type = nir_op_infos[instr->op].output_type 996 | nir_dest_bit_size(*dest), 997 .roundmode = roundmode, 998 }; 999 1000 enum midgard_roundmode *roundptr = (opcode_props & MIDGARD_ROUNDS) ? 1001 &ins.roundmode : NULL; 1002 1003 for (unsigned i = nr_inputs; i < ARRAY_SIZE(ins.src); ++i) 1004 ins.src[i] = ~0; 1005 1006 if (quirk_flipped_r24) { 1007 ins.src[0] = ~0; 1008 mir_copy_src(&ins, instr, 0, 1, &ins.src_abs[1], &ins.src_neg[1], &ins.src_invert[1], roundptr, is_int, broadcast_swizzle); 1009 } else { 1010 for (unsigned i = 0; i < nr_inputs; ++i) { 1011 unsigned to = i; 1012 1013 if (instr->op == nir_op_b32csel) { 1014 /* The condition is the first argument; move 1015 * the other arguments up one to be a binary 1016 * instruction for Midgard with the condition 1017 * last */ 1018 1019 if (i == 0) 1020 to = 2; 1021 else if (flip_src12) 1022 to = 2 - i; 1023 else 1024 to = i - 1; 1025 } else if (flip_src12) { 1026 to = 1 - to; 1027 } 1028 1029 mir_copy_src(&ins, instr, i, to, &ins.src_abs[to], &ins.src_neg[to], &ins.src_invert[to], roundptr, is_int, broadcast_swizzle); 1030 1031 /* (!c) ? a : b = c ? b : a */ 1032 if (instr->op == nir_op_b32csel && ins.src_invert[2]) { 1033 ins.src_invert[2] = false; 1034 flip_src12 ^= true; 1035 } 1036 } 1037 } 1038 1039 if (instr->op == nir_op_fneg || instr->op == nir_op_fabs) { 1040 /* Lowered to move */ 1041 if (instr->op == nir_op_fneg) 1042 ins.src_neg[1] ^= true; 1043 1044 if (instr->op == nir_op_fabs) 1045 ins.src_abs[1] = true; 1046 } 1047 1048 ins.mask = mask_of(nr_components); 1049 1050 /* Apply writemask if non-SSA, keeping in mind that we can't write to 1051 * components that don't exist. 
Note modifier => SSA => !reg => no 1052 * writemask, so we don't have to worry about writemasks here.*/ 1053 1054 if (!is_ssa) 1055 ins.mask &= instr->dest.write_mask; 1056 1057 ins.op = op; 1058 ins.outmod = outmod; 1059 1060 /* Late fixup for emulated instructions */ 1061 1062 if (instr->op == nir_op_b2f32 || instr->op == nir_op_b2i32) { 1063 /* Presently, our second argument is an inline #0 constant. 1064 * Switch over to an embedded 1.0 constant (that can't fit 1065 * inline, since we're 32-bit, not 16-bit like the inline 1066 * constants) */ 1067 1068 ins.has_inline_constant = false; 1069 ins.src[1] = SSA_FIXED_REGISTER(REGISTER_CONSTANT); 1070 ins.src_types[1] = nir_type_float32; 1071 ins.has_constants = true; 1072 1073 if (instr->op == nir_op_b2f32) 1074 ins.constants.f32[0] = 1.0f; 1075 else 1076 ins.constants.i32[0] = 1; 1077 1078 for (unsigned c = 0; c < 16; ++c) 1079 ins.swizzle[1][c] = 0; 1080 } else if (instr->op == nir_op_b2f16) { 1081 ins.src[1] = SSA_FIXED_REGISTER(REGISTER_CONSTANT); 1082 ins.src_types[1] = nir_type_float16; 1083 ins.has_constants = true; 1084 ins.constants.i16[0] = _mesa_float_to_half(1.0); 1085 1086 for (unsigned c = 0; c < 16; ++c) 1087 ins.swizzle[1][c] = 0; 1088 } else if (nr_inputs == 1 && !quirk_flipped_r24) { 1089 /* Lots of instructions need a 0 plonked in */ 1090 ins.has_inline_constant = false; 1091 ins.src[1] = SSA_FIXED_REGISTER(REGISTER_CONSTANT); 1092 ins.src_types[1] = ins.src_types[0]; 1093 ins.has_constants = true; 1094 ins.constants.u32[0] = 0; 1095 1096 for (unsigned c = 0; c < 16; ++c) 1097 ins.swizzle[1][c] = 0; 1098 } else if (instr->op == nir_op_pack_32_2x16) { 1099 ins.dest_type = nir_type_uint16; 1100 ins.mask = mask_of(nr_components * 2); 1101 ins.is_pack = true; 1102 } else if (instr->op == nir_op_pack_32_4x8) { 1103 ins.dest_type = nir_type_uint8; 1104 ins.mask = mask_of(nr_components * 4); 1105 ins.is_pack = true; 1106 } else if (instr->op == nir_op_unpack_32_2x16) { 1107 ins.dest_type = nir_type_uint32; 1108 ins.mask = mask_of(nr_components >> 1); 1109 ins.is_pack = true; 1110 } else if (instr->op == nir_op_unpack_32_4x8) { 1111 ins.dest_type = nir_type_uint32; 1112 ins.mask = mask_of(nr_components >> 2); 1113 ins.is_pack = true; 1114 } 1115 1116 if ((opcode_props & UNITS_ALL) == UNIT_VLUT) { 1117 /* To avoid duplicating the lookup tables (probably), true LUT 1118 * instructions can only operate as if they were scalars. Lower 1119 * them here by changing the component. 
*/ 1120 1121 unsigned orig_mask = ins.mask; 1122 1123 unsigned swizzle_back[MIR_VEC_COMPONENTS]; 1124 memcpy(&swizzle_back, ins.swizzle[0], sizeof(swizzle_back)); 1125 1126 midgard_instruction ins_split[MIR_VEC_COMPONENTS]; 1127 unsigned ins_count = 0; 1128 1129 for (int i = 0; i < nr_components; ++i) { 1130 /* Mask the associated component, dropping the 1131 * instruction if needed */ 1132 1133 ins.mask = 1 << i; 1134 ins.mask &= orig_mask; 1135 1136 for (unsigned j = 0; j < ins_count; ++j) { 1137 if (swizzle_back[i] == ins_split[j].swizzle[0][0]) { 1138 ins_split[j].mask |= ins.mask; 1139 ins.mask = 0; 1140 break; 1141 } 1142 } 1143 1144 if (!ins.mask) 1145 continue; 1146 1147 for (unsigned j = 0; j < MIR_VEC_COMPONENTS; ++j) 1148 ins.swizzle[0][j] = swizzle_back[i]; /* Pull from the correct component */ 1149 1150 ins_split[ins_count] = ins; 1151 1152 ++ins_count; 1153 } 1154 1155 for (unsigned i = 0; i < ins_count; ++i) { 1156 emit_mir_instruction(ctx, ins_split[i]); 1157 } 1158 } else { 1159 emit_mir_instruction(ctx, ins); 1160 } 1161} 1162 1163#undef ALU_CASE 1164 1165static void 1166mir_set_intr_mask(nir_instr *instr, midgard_instruction *ins, bool is_read) 1167{ 1168 nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr); 1169 unsigned nir_mask = 0; 1170 unsigned dsize = 0; 1171 1172 if (is_read) { 1173 nir_mask = mask_of(nir_intrinsic_dest_components(intr)); 1174 1175 /* Extension is mandatory for 8/16-bit loads */ 1176 dsize = nir_dest_bit_size(intr->dest) == 64 ? 64 : 32; 1177 } else { 1178 nir_mask = nir_intrinsic_write_mask(intr); 1179 dsize = OP_IS_COMMON_STORE(ins->op) ? 1180 nir_src_bit_size(intr->src[0]) : 32; 1181 } 1182 1183 /* Once we have the NIR mask, we need to normalize to work in 32-bit space */ 1184 unsigned bytemask = pan_to_bytemask(dsize, nir_mask); 1185 ins->dest_type = nir_type_uint | dsize; 1186 mir_set_bytemask(ins, bytemask); 1187} 1188 1189/* Uniforms and UBOs use a shared code path, as uniforms are just (slightly 1190 * optimized) versions of UBO #0 */ 1191 1192static midgard_instruction * 1193emit_ubo_read( 1194 compiler_context *ctx, 1195 nir_instr *instr, 1196 unsigned dest, 1197 unsigned offset, 1198 nir_src *indirect_offset, 1199 unsigned indirect_shift, 1200 unsigned index, 1201 unsigned nr_comps) 1202{ 1203 midgard_instruction ins; 1204 1205 unsigned dest_size = (instr->type == nir_instr_type_intrinsic) ? 
1206 nir_dest_bit_size(nir_instr_as_intrinsic(instr)->dest) : 32; 1207 1208 unsigned bitsize = dest_size * nr_comps; 1209 1210 /* Pick the smallest intrinsic to avoid out-of-bounds reads */ 1211 if (bitsize <= 8) 1212 ins = m_ld_ubo_u8(dest, 0); 1213 else if (bitsize <= 16) 1214 ins = m_ld_ubo_u16(dest, 0); 1215 else if (bitsize <= 32) 1216 ins = m_ld_ubo_32(dest, 0); 1217 else if (bitsize <= 64) 1218 ins = m_ld_ubo_64(dest, 0); 1219 else if (bitsize <= 128) 1220 ins = m_ld_ubo_128(dest, 0); 1221 else 1222 unreachable("Invalid UBO read size"); 1223 1224 ins.constants.u32[0] = offset; 1225 1226 if (instr->type == nir_instr_type_intrinsic) 1227 mir_set_intr_mask(instr, &ins, true); 1228 1229 if (indirect_offset) { 1230 ins.src[2] = nir_src_index(ctx, indirect_offset); 1231 ins.src_types[2] = nir_type_uint32; 1232 ins.load_store.index_shift = indirect_shift; 1233 1234 /* X component for the whole swizzle to prevent register 1235 * pressure from ballooning from the extra components */ 1236 for (unsigned i = 0; i < ARRAY_SIZE(ins.swizzle[2]); ++i) 1237 ins.swizzle[2][i] = 0; 1238 } else { 1239 ins.load_store.index_reg = REGISTER_LDST_ZERO; 1240 } 1241 1242 if (indirect_offset && indirect_offset->is_ssa && !indirect_shift) 1243 mir_set_ubo_offset(&ins, indirect_offset, offset); 1244 1245 midgard_pack_ubo_index_imm(&ins.load_store, index); 1246 1247 return emit_mir_instruction(ctx, ins); 1248} 1249 1250/* Globals are like UBOs if you squint. And shared memory is like globals if 1251 * you squint even harder */ 1252 1253static void 1254emit_global( 1255 compiler_context *ctx, 1256 nir_instr *instr, 1257 bool is_read, 1258 unsigned srcdest, 1259 nir_src *offset, 1260 unsigned seg) 1261{ 1262 midgard_instruction ins; 1263 1264 nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr); 1265 if (is_read) { 1266 unsigned bitsize = nir_dest_bit_size(intr->dest) * 1267 nir_dest_num_components(intr->dest); 1268 1269 switch (bitsize) { 1270 case 8: ins = m_ld_u8(srcdest, 0); break; 1271 case 16: ins = m_ld_u16(srcdest, 0); break; 1272 case 32: ins = m_ld_32(srcdest, 0); break; 1273 case 64: ins = m_ld_64(srcdest, 0); break; 1274 case 128: ins = m_ld_128(srcdest, 0); break; 1275 default: unreachable("Invalid global read size"); 1276 } 1277 1278 mir_set_intr_mask(instr, &ins, is_read); 1279 1280 /* For anything not aligned on 32bit, make sure we write full 1281 * 32 bits registers. 
*/ 1282 if (bitsize & 31) { 1283 unsigned comps_per_32b = 32 / nir_dest_bit_size(intr->dest); 1284 1285 for (unsigned c = 0; c < 4 * comps_per_32b; c += comps_per_32b) { 1286 if (!(ins.mask & BITFIELD_RANGE(c, comps_per_32b))) 1287 continue; 1288 1289 unsigned base = ~0; 1290 for (unsigned i = 0; i < comps_per_32b; i++) { 1291 if (ins.mask & BITFIELD_BIT(c + i)) { 1292 base = ins.swizzle[0][c + i]; 1293 break; 1294 } 1295 } 1296 1297 assert(base != ~0); 1298 1299 for (unsigned i = 0; i < comps_per_32b; i++) { 1300 if (!(ins.mask & BITFIELD_BIT(c + i))) { 1301 ins.swizzle[0][c + i] = base + i; 1302 ins.mask |= BITFIELD_BIT(c + i); 1303 } 1304 assert(ins.swizzle[0][c + i] == base + i); 1305 } 1306 } 1307 1308 } 1309 } else { 1310 unsigned bitsize = nir_src_bit_size(intr->src[0]) * 1311 nir_src_num_components(intr->src[0]); 1312 1313 if (bitsize == 8) 1314 ins = m_st_u8(srcdest, 0); 1315 else if (bitsize == 16) 1316 ins = m_st_u16(srcdest, 0); 1317 else if (bitsize <= 32) 1318 ins = m_st_32(srcdest, 0); 1319 else if (bitsize <= 64) 1320 ins = m_st_64(srcdest, 0); 1321 else if (bitsize <= 128) 1322 ins = m_st_128(srcdest, 0); 1323 else 1324 unreachable("Invalid global store size"); 1325 1326 mir_set_intr_mask(instr, &ins, is_read); 1327 } 1328 1329 mir_set_offset(ctx, &ins, offset, seg); 1330 1331 /* Set a valid swizzle for masked out components */ 1332 assert(ins.mask); 1333 unsigned first_component = __builtin_ffs(ins.mask) - 1; 1334 1335 for (unsigned i = 0; i < ARRAY_SIZE(ins.swizzle[0]); ++i) { 1336 if (!(ins.mask & (1 << i))) 1337 ins.swizzle[0][i] = first_component; 1338 } 1339 1340 emit_mir_instruction(ctx, ins); 1341} 1342 1343/* If is_shared is off, the only other possible value are globals, since 1344 * SSBO's are being lowered to globals through a NIR pass. 1345 * `image_direct_address` should be ~0 when instr is not an image_atomic 1346 * and the destination register of a lea_image op when it is an image_atomic. */ 1347static void 1348emit_atomic( 1349 compiler_context *ctx, 1350 nir_intrinsic_instr *instr, 1351 bool is_shared, 1352 midgard_load_store_op op, 1353 unsigned image_direct_address) 1354{ 1355 nir_alu_type type = 1356 (op == midgard_op_atomic_imin || op == midgard_op_atomic_imax) ? 1357 nir_type_int : nir_type_uint; 1358 1359 bool is_image = image_direct_address != ~0; 1360 1361 unsigned dest = nir_dest_index(&instr->dest); 1362 unsigned val_src = is_image ? 3 : 1; 1363 unsigned val = nir_src_index(ctx, &instr->src[val_src]); 1364 unsigned bitsize = nir_src_bit_size(instr->src[val_src]); 1365 emit_explicit_constant(ctx, val, val); 1366 1367 midgard_instruction ins = { 1368 .type = TAG_LOAD_STORE_4, 1369 .mask = 0xF, 1370 .dest = dest, 1371 .src = { ~0, ~0, ~0, val }, 1372 .src_types = { 0, 0, 0, type | bitsize }, 1373 .op = op 1374 }; 1375 1376 nir_src *src_offset = nir_get_io_offset_src(instr); 1377 1378 if (op == midgard_op_atomic_cmpxchg) { 1379 unsigned xchg_val_src = is_image ? 4 : 2; 1380 unsigned xchg_val = nir_src_index(ctx, &instr->src[xchg_val_src]); 1381 emit_explicit_constant(ctx, xchg_val, xchg_val); 1382 1383 ins.src[2] = val; 1384 ins.src_types[2] = type | bitsize; 1385 ins.src[3] = xchg_val; 1386 1387 if (is_shared) { 1388 ins.load_store.arg_reg = REGISTER_LDST_LOCAL_STORAGE_PTR; 1389 ins.load_store.arg_comp = COMPONENT_Z; 1390 ins.load_store.bitsize_toggle = true; 1391 } else { 1392 for(unsigned i = 0; i < 2; ++i) 1393 ins.swizzle[1][i] = i; 1394 1395 ins.src[1] = is_image ? 
image_direct_address : 1396 nir_src_index(ctx, src_offset); 1397 ins.src_types[1] = nir_type_uint64; 1398 } 1399 } else if (is_image) { 1400 for(unsigned i = 0; i < 2; ++i) 1401 ins.swizzle[2][i] = i; 1402 1403 ins.src[2] = image_direct_address; 1404 ins.src_types[2] = nir_type_uint64; 1405 1406 ins.load_store.arg_reg = REGISTER_LDST_ZERO; 1407 ins.load_store.bitsize_toggle = true; 1408 ins.load_store.index_format = midgard_index_address_u64; 1409 } else 1410 mir_set_offset(ctx, &ins, src_offset, is_shared ? LDST_SHARED : LDST_GLOBAL); 1411 1412 mir_set_intr_mask(&instr->instr, &ins, true); 1413 1414 emit_mir_instruction(ctx, ins); 1415} 1416 1417static void 1418emit_varying_read( 1419 compiler_context *ctx, 1420 unsigned dest, unsigned offset, 1421 unsigned nr_comp, unsigned component, 1422 nir_src *indirect_offset, nir_alu_type type, bool flat) 1423{ 1424 /* XXX: Half-floats? */ 1425 /* TODO: swizzle, mask */ 1426 1427 midgard_instruction ins = m_ld_vary_32(dest, PACK_LDST_ATTRIB_OFS(offset)); 1428 ins.mask = mask_of(nr_comp); 1429 ins.dest_type = type; 1430 1431 if (type == nir_type_float16) { 1432 /* Ensure we are aligned so we can pack it later */ 1433 ins.mask = mask_of(ALIGN_POT(nr_comp, 2)); 1434 } 1435 1436 for (unsigned i = 0; i < ARRAY_SIZE(ins.swizzle[0]); ++i) 1437 ins.swizzle[0][i] = MIN2(i + component, COMPONENT_W); 1438 1439 1440 midgard_varying_params p = { 1441 .flat_shading = flat, 1442 .perspective_correction = 1, 1443 .interpolate_sample = true, 1444 }; 1445 midgard_pack_varying_params(&ins.load_store, p); 1446 1447 if (indirect_offset) { 1448 ins.src[2] = nir_src_index(ctx, indirect_offset); 1449 ins.src_types[2] = nir_type_uint32; 1450 } else 1451 ins.load_store.index_reg = REGISTER_LDST_ZERO; 1452 1453 ins.load_store.arg_reg = REGISTER_LDST_ZERO; 1454 ins.load_store.index_format = midgard_index_address_u32; 1455 1456 /* Use the type appropriate load */ 1457 switch (type) { 1458 case nir_type_uint32: 1459 case nir_type_bool32: 1460 ins.op = midgard_op_ld_vary_32u; 1461 break; 1462 case nir_type_int32: 1463 ins.op = midgard_op_ld_vary_32i; 1464 break; 1465 case nir_type_float32: 1466 ins.op = midgard_op_ld_vary_32; 1467 break; 1468 case nir_type_float16: 1469 ins.op = midgard_op_ld_vary_16; 1470 break; 1471 default: 1472 unreachable("Attempted to load unknown type"); 1473 break; 1474 } 1475 1476 emit_mir_instruction(ctx, ins); 1477} 1478 1479 1480/* If `is_atomic` is true, we emit a `lea_image` since midgard doesn't not have special 1481 * image_atomic opcodes. The caller can then use that address to emit a normal atomic opcode. */ 1482static midgard_instruction 1483emit_image_op(compiler_context *ctx, nir_intrinsic_instr *instr, bool is_atomic) 1484{ 1485 enum glsl_sampler_dim dim = nir_intrinsic_image_dim(instr); 1486 unsigned nr_attr = ctx->stage == MESA_SHADER_VERTEX ? 
1487 util_bitcount64(ctx->nir->info.inputs_read) : 0; 1488 unsigned nr_dim = glsl_get_sampler_dim_coordinate_components(dim); 1489 bool is_array = nir_intrinsic_image_array(instr); 1490 bool is_store = instr->intrinsic == nir_intrinsic_image_store; 1491 1492 /* TODO: MSAA */ 1493 assert(dim != GLSL_SAMPLER_DIM_MS && "MSAA'd images not supported"); 1494 1495 unsigned coord_reg = nir_src_index(ctx, &instr->src[1]); 1496 emit_explicit_constant(ctx, coord_reg, coord_reg); 1497 1498 nir_src *index = &instr->src[0]; 1499 bool is_direct = nir_src_is_const(*index); 1500 1501 /* For image opcodes, address is used as an index into the attribute descriptor */ 1502 unsigned address = nr_attr; 1503 if (is_direct) 1504 address += nir_src_as_uint(*index); 1505 1506 midgard_instruction ins; 1507 if (is_store) { /* emit st_image_* */ 1508 unsigned val = nir_src_index(ctx, &instr->src[3]); 1509 emit_explicit_constant(ctx, val, val); 1510 1511 nir_alu_type type = nir_intrinsic_src_type(instr); 1512 ins = st_image(type, val, PACK_LDST_ATTRIB_OFS(address)); 1513 nir_alu_type base_type = nir_alu_type_get_base_type(type); 1514 ins.src_types[0] = base_type | nir_src_bit_size(instr->src[3]); 1515 } else if (is_atomic) { /* emit lea_image */ 1516 unsigned dest = make_compiler_temp_reg(ctx); 1517 ins = m_lea_image(dest, PACK_LDST_ATTRIB_OFS(address)); 1518 ins.mask = mask_of(2); /* 64-bit memory address */ 1519 } else { /* emit ld_image_* */ 1520 nir_alu_type type = nir_intrinsic_dest_type(instr); 1521 ins = ld_image(type, nir_dest_index(&instr->dest), PACK_LDST_ATTRIB_OFS(address)); 1522 ins.mask = mask_of(nir_intrinsic_dest_components(instr)); 1523 ins.dest_type = type; 1524 } 1525 1526 /* Coord reg */ 1527 ins.src[1] = coord_reg; 1528 ins.src_types[1] = nir_type_uint16; 1529 if (nr_dim == 3 || is_array) { 1530 ins.load_store.bitsize_toggle = true; 1531 } 1532 1533 /* Image index reg */ 1534 if (!is_direct) { 1535 ins.src[2] = nir_src_index(ctx, index); 1536 ins.src_types[2] = nir_type_uint32; 1537 } else 1538 ins.load_store.index_reg = REGISTER_LDST_ZERO; 1539 1540 emit_mir_instruction(ctx, ins); 1541 1542 return ins; 1543} 1544 1545static void 1546emit_attr_read( 1547 compiler_context *ctx, 1548 unsigned dest, unsigned offset, 1549 unsigned nr_comp, nir_alu_type t) 1550{ 1551 midgard_instruction ins = m_ld_attr_32(dest, PACK_LDST_ATTRIB_OFS(offset)); 1552 ins.load_store.arg_reg = REGISTER_LDST_ZERO; 1553 ins.load_store.index_reg = REGISTER_LDST_ZERO; 1554 ins.mask = mask_of(nr_comp); 1555 1556 /* Use the type appropriate load */ 1557 switch (t) { 1558 case nir_type_uint: 1559 case nir_type_bool: 1560 ins.op = midgard_op_ld_attr_32u; 1561 break; 1562 case nir_type_int: 1563 ins.op = midgard_op_ld_attr_32i; 1564 break; 1565 case nir_type_float: 1566 ins.op = midgard_op_ld_attr_32; 1567 break; 1568 default: 1569 unreachable("Attempted to load unknown type"); 1570 break; 1571 } 1572 1573 emit_mir_instruction(ctx, ins); 1574} 1575 1576static void 1577emit_sysval_read(compiler_context *ctx, nir_instr *instr, 1578 unsigned nr_components, unsigned offset) 1579{ 1580 nir_dest nir_dest; 1581 1582 /* Figure out which uniform this is */ 1583 unsigned sysval_ubo = ctx->inputs->fixed_sysval_ubo >= 0 ? 
1584 ctx->inputs->fixed_sysval_ubo : 1585 ctx->nir->info.num_ubos; 1586 int sysval = panfrost_sysval_for_instr(instr, &nir_dest); 1587 unsigned dest = nir_dest_index(&nir_dest); 1588 unsigned uniform = 1589 pan_lookup_sysval(ctx->sysval_to_id, &ctx->info->sysvals, sysval); 1590 1591 /* Emit the read itself -- this is never indirect */ 1592 midgard_instruction *ins = 1593 emit_ubo_read(ctx, instr, dest, (uniform * 16) + offset, NULL, 0, 1594 sysval_ubo, nr_components); 1595 1596 ins->mask = mask_of(nr_components); 1597} 1598 1599static unsigned 1600compute_builtin_arg(nir_intrinsic_op op) 1601{ 1602 switch (op) { 1603 case nir_intrinsic_load_workgroup_id: 1604 return REGISTER_LDST_GROUP_ID; 1605 case nir_intrinsic_load_local_invocation_id: 1606 return REGISTER_LDST_LOCAL_THREAD_ID; 1607 case nir_intrinsic_load_global_invocation_id: 1608 case nir_intrinsic_load_global_invocation_id_zero_base: 1609 return REGISTER_LDST_GLOBAL_THREAD_ID; 1610 default: 1611 unreachable("Invalid compute paramater loaded"); 1612 } 1613} 1614 1615static void 1616emit_fragment_store(compiler_context *ctx, unsigned src, unsigned src_z, unsigned src_s, 1617 enum midgard_rt_id rt, unsigned sample_iter) 1618{ 1619 assert(rt < ARRAY_SIZE(ctx->writeout_branch)); 1620 assert(sample_iter < ARRAY_SIZE(ctx->writeout_branch[0])); 1621 1622 midgard_instruction *br = ctx->writeout_branch[rt][sample_iter]; 1623 1624 assert(!br); 1625 1626 emit_explicit_constant(ctx, src, src); 1627 1628 struct midgard_instruction ins = 1629 v_branch(false, false); 1630 1631 bool depth_only = (rt == MIDGARD_ZS_RT); 1632 1633 ins.writeout = depth_only ? 0 : PAN_WRITEOUT_C; 1634 1635 /* Add dependencies */ 1636 ins.src[0] = src; 1637 ins.src_types[0] = nir_type_uint32; 1638 1639 if (depth_only) 1640 ins.constants.u32[0] = 0xFF; 1641 else 1642 ins.constants.u32[0] = ((rt - MIDGARD_COLOR_RT0) << 8) | sample_iter; 1643 1644 for (int i = 0; i < 4; ++i) 1645 ins.swizzle[0][i] = i; 1646 1647 if (~src_z) { 1648 emit_explicit_constant(ctx, src_z, src_z); 1649 ins.src[2] = src_z; 1650 ins.src_types[2] = nir_type_uint32; 1651 ins.writeout |= PAN_WRITEOUT_Z; 1652 } 1653 if (~src_s) { 1654 emit_explicit_constant(ctx, src_s, src_s); 1655 ins.src[3] = src_s; 1656 ins.src_types[3] = nir_type_uint32; 1657 ins.writeout |= PAN_WRITEOUT_S; 1658 } 1659 1660 /* Emit the branch */ 1661 br = emit_mir_instruction(ctx, ins); 1662 schedule_barrier(ctx); 1663 ctx->writeout_branch[rt][sample_iter] = br; 1664 1665 /* Push our current location = current block count - 1 = where we'll 1666 * jump to. 
Maybe a bit too clever for my own good */ 1667 1668 br->branch.target_block = ctx->block_count - 1; 1669} 1670 1671static void 1672emit_compute_builtin(compiler_context *ctx, nir_intrinsic_instr *instr) 1673{ 1674 unsigned reg = nir_dest_index(&instr->dest); 1675 midgard_instruction ins = m_ldst_mov(reg, 0); 1676 ins.mask = mask_of(3); 1677 ins.swizzle[0][3] = COMPONENT_X; /* xyzx */ 1678 ins.load_store.arg_reg = compute_builtin_arg(instr->intrinsic); 1679 emit_mir_instruction(ctx, ins); 1680} 1681 1682static unsigned 1683vertex_builtin_arg(nir_intrinsic_op op) 1684{ 1685 switch (op) { 1686 case nir_intrinsic_load_vertex_id_zero_base: 1687 return PAN_VERTEX_ID; 1688 case nir_intrinsic_load_instance_id: 1689 return PAN_INSTANCE_ID; 1690 default: 1691 unreachable("Invalid vertex builtin"); 1692 } 1693} 1694 1695static void 1696emit_vertex_builtin(compiler_context *ctx, nir_intrinsic_instr *instr) 1697{ 1698 unsigned reg = nir_dest_index(&instr->dest); 1699 emit_attr_read(ctx, reg, vertex_builtin_arg(instr->intrinsic), 1, nir_type_int); 1700} 1701 1702static void 1703emit_special(compiler_context *ctx, nir_intrinsic_instr *instr, unsigned idx) 1704{ 1705 unsigned reg = nir_dest_index(&instr->dest); 1706 1707 midgard_instruction ld = m_ld_tilebuffer_raw(reg, 0); 1708 ld.op = midgard_op_ld_special_32u; 1709 ld.load_store.signed_offset = PACK_LDST_SELECTOR_OFS(idx); 1710 ld.load_store.index_reg = REGISTER_LDST_ZERO; 1711 1712 for (int i = 0; i < 4; ++i) 1713 ld.swizzle[0][i] = COMPONENT_X; 1714 1715 emit_mir_instruction(ctx, ld); 1716} 1717 1718static void 1719emit_control_barrier(compiler_context *ctx) 1720{ 1721 midgard_instruction ins = { 1722 .type = TAG_TEXTURE_4, 1723 .dest = ~0, 1724 .src = { ~0, ~0, ~0, ~0 }, 1725 .op = midgard_tex_op_barrier, 1726 }; 1727 1728 emit_mir_instruction(ctx, ins); 1729} 1730 1731static unsigned 1732mir_get_branch_cond(nir_src *src, bool *invert) 1733{ 1734 /* Wrap it. 
No swizzle since it's a scalar */ 1735 1736 nir_alu_src alu = { 1737 .src = *src 1738 }; 1739 1740 *invert = pan_has_source_mod(&alu, nir_op_inot); 1741 return nir_src_index(NULL, &alu.src); 1742} 1743 1744static uint8_t 1745output_load_rt_addr(compiler_context *ctx, nir_intrinsic_instr *instr) 1746{ 1747 if (ctx->inputs->is_blend) 1748 return MIDGARD_COLOR_RT0 + ctx->inputs->blend.rt; 1749 1750 const nir_variable *var; 1751 var = nir_find_variable_with_driver_location(ctx->nir, nir_var_shader_out, nir_intrinsic_base(instr)); 1752 assert(var); 1753 1754 unsigned loc = var->data.location; 1755 1756 if (loc >= FRAG_RESULT_DATA0) 1757 return loc - FRAG_RESULT_DATA0; 1758 1759 if (loc == FRAG_RESULT_DEPTH) 1760 return 0x1F; 1761 if (loc == FRAG_RESULT_STENCIL) 1762 return 0x1E; 1763 1764 unreachable("Invalid RT to load from"); 1765} 1766 1767static void 1768emit_intrinsic(compiler_context *ctx, nir_intrinsic_instr *instr) 1769{ 1770 unsigned offset = 0, reg; 1771 1772 switch (instr->intrinsic) { 1773 case nir_intrinsic_discard_if: 1774 case nir_intrinsic_discard: { 1775 bool conditional = instr->intrinsic == nir_intrinsic_discard_if; 1776 struct midgard_instruction discard = v_branch(conditional, false); 1777 discard.branch.target_type = TARGET_DISCARD; 1778 1779 if (conditional) { 1780 discard.src[0] = mir_get_branch_cond(&instr->src[0], 1781 &discard.branch.invert_conditional); 1782 discard.src_types[0] = nir_type_uint32; 1783 } 1784 1785 emit_mir_instruction(ctx, discard); 1786 schedule_barrier(ctx); 1787 1788 break; 1789 } 1790 1791 case nir_intrinsic_image_load: 1792 case nir_intrinsic_image_store: 1793 emit_image_op(ctx, instr, false); 1794 break; 1795 1796 case nir_intrinsic_image_size: { 1797 unsigned nr_comp = nir_intrinsic_dest_components(instr); 1798 emit_sysval_read(ctx, &instr->instr, nr_comp, 0); 1799 break; 1800 } 1801 1802 case nir_intrinsic_load_ubo: 1803 case nir_intrinsic_load_global: 1804 case nir_intrinsic_load_global_constant: 1805 case nir_intrinsic_load_shared: 1806 case nir_intrinsic_load_scratch: 1807 case nir_intrinsic_load_input: 1808 case nir_intrinsic_load_kernel_input: 1809 case nir_intrinsic_load_interpolated_input: { 1810 bool is_ubo = instr->intrinsic == nir_intrinsic_load_ubo; 1811 bool is_global = instr->intrinsic == nir_intrinsic_load_global || 1812 instr->intrinsic == nir_intrinsic_load_global_constant; 1813 bool is_shared = instr->intrinsic == nir_intrinsic_load_shared; 1814 bool is_scratch = instr->intrinsic == nir_intrinsic_load_scratch; 1815 bool is_flat = instr->intrinsic == nir_intrinsic_load_input; 1816 bool is_kernel = instr->intrinsic == nir_intrinsic_load_kernel_input; 1817 bool is_interp = instr->intrinsic == nir_intrinsic_load_interpolated_input; 1818 1819 /* Get the base type of the intrinsic */ 1820 /* TODO: Infer type? Does it matter? */ 1821 nir_alu_type t = 1822 (is_interp) ? nir_type_float : 1823 (is_flat) ? nir_intrinsic_dest_type(instr) : 1824 nir_type_uint; 1825 1826 t = nir_alu_type_get_base_type(t); 1827 1828 if (!(is_ubo || is_global || is_scratch)) { 1829 offset = nir_intrinsic_base(instr); 1830 } 1831 1832 unsigned nr_comp = nir_intrinsic_dest_components(instr); 1833 1834 nir_src *src_offset = nir_get_io_offset_src(instr); 1835 1836 bool direct = nir_src_is_const(*src_offset); 1837 nir_src *indirect_offset = direct ? NULL : src_offset; 1838 1839 if (direct) 1840 offset += nir_src_as_uint(*src_offset); 1841 1842 /* We may need to apply a fractional offset */ 1843 int component = (is_flat || is_interp) ? 
1844 nir_intrinsic_component(instr) : 0; 1845 reg = nir_dest_index(&instr->dest); 1846 1847 if (is_kernel) { 1848 emit_ubo_read(ctx, &instr->instr, reg, offset, indirect_offset, 0, 0, nr_comp); 1849 } else if (is_ubo) { 1850 nir_src index = instr->src[0]; 1851 1852 /* TODO: Is indirect block number possible? */ 1853 assert(nir_src_is_const(index)); 1854 1855 uint32_t uindex = nir_src_as_uint(index); 1856 emit_ubo_read(ctx, &instr->instr, reg, offset, indirect_offset, 0, uindex, nr_comp); 1857 } else if (is_global || is_shared || is_scratch) { 1858 unsigned seg = is_global ? LDST_GLOBAL : (is_shared ? LDST_SHARED : LDST_SCRATCH); 1859 emit_global(ctx, &instr->instr, true, reg, src_offset, seg); 1860 } else if (ctx->stage == MESA_SHADER_FRAGMENT && !ctx->inputs->is_blend) { 1861 emit_varying_read(ctx, reg, offset, nr_comp, component, indirect_offset, t | nir_dest_bit_size(instr->dest), is_flat); 1862 } else if (ctx->inputs->is_blend) { 1863 /* ctx->blend_input will be precoloured to r0/r2, where 1864 * the input is preloaded */ 1865 1866 unsigned *input = offset ? &ctx->blend_src1 : &ctx->blend_input; 1867 1868 if (*input == ~0) 1869 *input = reg; 1870 else 1871 emit_mir_instruction(ctx, v_mov(*input, reg)); 1872 } else if (ctx->stage == MESA_SHADER_VERTEX) { 1873 emit_attr_read(ctx, reg, offset, nr_comp, t); 1874 } else { 1875 DBG("Unknown load\n"); 1876 assert(0); 1877 } 1878 1879 break; 1880 } 1881 1882 /* Handled together with load_interpolated_input */ 1883 case nir_intrinsic_load_barycentric_pixel: 1884 case nir_intrinsic_load_barycentric_centroid: 1885 case nir_intrinsic_load_barycentric_sample: 1886 break; 1887 1888 /* Reads 128-bit value raw off the tilebuffer during blending, tasty */ 1889 1890 case nir_intrinsic_load_raw_output_pan: { 1891 reg = nir_dest_index(&instr->dest); 1892 1893 /* T720 and below use different blend opcodes with slightly 1894 * different semantics than T760 and up */ 1895 1896 midgard_instruction ld = m_ld_tilebuffer_raw(reg, 0); 1897 1898 unsigned target = output_load_rt_addr(ctx, instr); 1899 ld.load_store.index_comp = target & 0x3; 1900 ld.load_store.index_reg = target >> 2; 1901 1902 if (nir_src_is_const(instr->src[0])) { 1903 unsigned sample = nir_src_as_uint(instr->src[0]); 1904 ld.load_store.arg_comp = sample & 0x3; 1905 ld.load_store.arg_reg = sample >> 2; 1906 } else { 1907 /* Enable sample index via register. 
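Setting the low bit of signed_offset switches to the register-indexed form, with the index itself passed as a 32-bit integer in src[1].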
*/ 1908 ld.load_store.signed_offset |= 1; 1909 ld.src[1] = nir_src_index(ctx, &instr->src[0]); 1910 ld.src_types[1] = nir_type_int32; 1911 } 1912 1913 if (ctx->quirks & MIDGARD_OLD_BLEND) { 1914 ld.op = midgard_op_ld_special_32u; 1915 ld.load_store.signed_offset = PACK_LDST_SELECTOR_OFS(16); 1916 ld.load_store.index_reg = REGISTER_LDST_ZERO; 1917 } 1918 1919 emit_mir_instruction(ctx, ld); 1920 break; 1921 } 1922 1923 case nir_intrinsic_load_output: { 1924 reg = nir_dest_index(&instr->dest); 1925 1926 unsigned bits = nir_dest_bit_size(instr->dest); 1927 1928 midgard_instruction ld; 1929 if (bits == 16) 1930 ld = m_ld_tilebuffer_16f(reg, 0); 1931 else 1932 ld = m_ld_tilebuffer_32f(reg, 0); 1933 1934 unsigned index = output_load_rt_addr(ctx, instr); 1935 ld.load_store.index_comp = index & 0x3; 1936 ld.load_store.index_reg = index >> 2; 1937 1938 for (unsigned c = 4; c < 16; ++c) 1939 ld.swizzle[0][c] = 0; 1940 1941 if (ctx->quirks & MIDGARD_OLD_BLEND) { 1942 if (bits == 16) 1943 ld.op = midgard_op_ld_special_16f; 1944 else 1945 ld.op = midgard_op_ld_special_32f; 1946 ld.load_store.signed_offset = PACK_LDST_SELECTOR_OFS(1); 1947 ld.load_store.index_reg = REGISTER_LDST_ZERO; 1948 } 1949 1950 emit_mir_instruction(ctx, ld); 1951 break; 1952 } 1953 1954 case nir_intrinsic_store_output: 1955 case nir_intrinsic_store_combined_output_pan: 1956 assert(nir_src_is_const(instr->src[1]) && "no indirect outputs"); 1957 1958 reg = nir_src_index(ctx, &instr->src[0]); 1959 1960 if (ctx->stage == MESA_SHADER_FRAGMENT) { 1961 bool combined = instr->intrinsic == 1962 nir_intrinsic_store_combined_output_pan; 1963 1964 enum midgard_rt_id rt; 1965 1966 unsigned reg_z = ~0, reg_s = ~0, reg_2 = ~0; 1967 unsigned writeout = PAN_WRITEOUT_C; 1968 if (combined) { 1969 writeout = nir_intrinsic_component(instr); 1970 if (writeout & PAN_WRITEOUT_Z) 1971 reg_z = nir_src_index(ctx, &instr->src[2]); 1972 if (writeout & PAN_WRITEOUT_S) 1973 reg_s = nir_src_index(ctx, &instr->src[3]); 1974 if (writeout & PAN_WRITEOUT_2) 1975 reg_2 = nir_src_index(ctx, &instr->src[4]); 1976 } 1977 1978 if (writeout & PAN_WRITEOUT_C) { 1979 const nir_variable *var = 1980 nir_find_variable_with_driver_location(ctx->nir, nir_var_shader_out, 1981 nir_intrinsic_base(instr)); 1982 1983 assert(var != NULL); 1984 assert(var->data.location >= FRAG_RESULT_DATA0); 1985 1986 rt = MIDGARD_COLOR_RT0 + var->data.location - 1987 FRAG_RESULT_DATA0; 1988 } else { 1989 rt = MIDGARD_ZS_RT; 1990 } 1991 1992 /* Dual-source blend writeout is done by leaving the 1993 * value in r2 for the blend shader to use. */ 1994 if (~reg_2) { 1995 if (instr->src[4].is_ssa) { 1996 emit_explicit_constant(ctx, reg_2, reg_2); 1997 1998 unsigned out = make_compiler_temp(ctx); 1999 2000 midgard_instruction ins = v_mov(reg_2, out); 2001 emit_mir_instruction(ctx, ins); 2002 2003 ctx->blend_src1 = out; 2004 } else { 2005 ctx->blend_src1 = reg_2; 2006 } 2007 } 2008 2009 emit_fragment_store(ctx, reg, reg_z, reg_s, rt, 0); 2010 } else if (ctx->stage == MESA_SHADER_VERTEX) { 2011 assert(instr->intrinsic == nir_intrinsic_store_output); 2012 2013 /* We should have been vectorized, though we don't 2014 * currently check that st_vary is emitted only once 2015 * per slot (this is relevant, since there's not a mask 2016 * parameter available on the store [set to 0 by the 2017 * blob]). We do respect the component by adjusting the 2018 * swizzle. If this is a constant source, we'll need to 2019 * emit that explicitly. 
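* As an example of the swizzle adjustment below: a vec2 stored at destination component 1 ends up with swizzle (x, x, y, y), so lanes 1 and 2 read source .xy while the remaining lanes just repeat the nearest source component.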
*/ 2020 2021 emit_explicit_constant(ctx, reg, reg); 2022 2023 offset = nir_intrinsic_base(instr) + nir_src_as_uint(instr->src[1]); 2024 2025 unsigned dst_component = nir_intrinsic_component(instr); 2026 unsigned nr_comp = nir_src_num_components(instr->src[0]); 2027 2028 midgard_instruction st = m_st_vary_32(reg, PACK_LDST_ATTRIB_OFS(offset)); 2029 st.load_store.arg_reg = REGISTER_LDST_ZERO; 2030 st.load_store.index_format = midgard_index_address_u32; 2031 st.load_store.index_reg = REGISTER_LDST_ZERO; 2032 2033 switch (nir_alu_type_get_base_type(nir_intrinsic_src_type(instr))) { 2034 case nir_type_uint: 2035 case nir_type_bool: 2036 st.op = midgard_op_st_vary_32u; 2037 break; 2038 case nir_type_int: 2039 st.op = midgard_op_st_vary_32i; 2040 break; 2041 case nir_type_float: 2042 st.op = midgard_op_st_vary_32; 2043 break; 2044 default: 2045 unreachable("Attempted to store unknown type"); 2046 break; 2047 } 2048 2049 /* nir_intrinsic_component(store_intr) encodes the 2050 * destination component start. Source component offset 2051 * adjustment is taken care of in 2052 * install_registers_instr(), when offset_swizzle() is 2053 * called. 2054 */ 2055 unsigned src_component = COMPONENT_X; 2056 2057 assert(nr_comp > 0); 2058 for (unsigned i = 0; i < ARRAY_SIZE(st.swizzle); ++i) { 2059 st.swizzle[0][i] = src_component; 2060 if (i >= dst_component && i < dst_component + nr_comp - 1) 2061 src_component++; 2062 } 2063 2064 emit_mir_instruction(ctx, st); 2065 } else { 2066 DBG("Unknown store\n"); 2067 assert(0); 2068 } 2069 2070 break; 2071 2072 /* Special case of store_output for lowered blend shaders */ 2073 case nir_intrinsic_store_raw_output_pan: 2074 assert (ctx->stage == MESA_SHADER_FRAGMENT); 2075 reg = nir_src_index(ctx, &instr->src[0]); 2076 for (unsigned s = 0; s < ctx->blend_sample_iterations; s++) 2077 emit_fragment_store(ctx, reg, ~0, ~0, 2078 ctx->inputs->blend.rt + MIDGARD_COLOR_RT0, 2079 s); 2080 break; 2081 2082 case nir_intrinsic_store_global: 2083 case nir_intrinsic_store_shared: 2084 case nir_intrinsic_store_scratch: 2085 reg = nir_src_index(ctx, &instr->src[0]); 2086 emit_explicit_constant(ctx, reg, reg); 2087 2088 unsigned seg; 2089 if (instr->intrinsic == nir_intrinsic_store_global) 2090 seg = LDST_GLOBAL; 2091 else if (instr->intrinsic == nir_intrinsic_store_shared) 2092 seg = LDST_SHARED; 2093 else 2094 seg = LDST_SCRATCH; 2095 2096 emit_global(ctx, &instr->instr, false, reg, &instr->src[1], seg); 2097 break; 2098 2099 case nir_intrinsic_load_ssbo_address: 2100 case nir_intrinsic_load_xfb_address: 2101 emit_sysval_read(ctx, &instr->instr, 2, 0); 2102 break; 2103 2104 case nir_intrinsic_load_first_vertex: 2105 case nir_intrinsic_load_work_dim: 2106 case nir_intrinsic_load_num_vertices: 2107 emit_sysval_read(ctx, &instr->instr, 1, 0); 2108 break; 2109 2110 case nir_intrinsic_load_base_vertex: 2111 emit_sysval_read(ctx, &instr->instr, 1, 4); 2112 break; 2113 2114 case nir_intrinsic_load_base_instance: 2115 case nir_intrinsic_get_ssbo_size: 2116 emit_sysval_read(ctx, &instr->instr, 1, 8); 2117 break; 2118 2119 case nir_intrinsic_load_sample_positions_pan: 2120 emit_sysval_read(ctx, &instr->instr, 2, 0); 2121 break; 2122 2123 case nir_intrinsic_load_viewport_scale: 2124 case nir_intrinsic_load_viewport_offset: 2125 case nir_intrinsic_load_num_workgroups: 2126 case nir_intrinsic_load_sampler_lod_parameters_pan: 2127 case nir_intrinsic_load_workgroup_size: 2128 emit_sysval_read(ctx, &instr->instr, 3, 0); 2129 break; 2130 2131 case nir_intrinsic_load_blend_const_color_rgba: 2132 
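/* The blend constant colour is a single vec4 sysval, hence four components */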
emit_sysval_read(ctx, &instr->instr, 4, 0); 2133 break; 2134 2135 case nir_intrinsic_load_workgroup_id: 2136 case nir_intrinsic_load_local_invocation_id: 2137 case nir_intrinsic_load_global_invocation_id: 2138 case nir_intrinsic_load_global_invocation_id_zero_base: 2139 emit_compute_builtin(ctx, instr); 2140 break; 2141 2142 case nir_intrinsic_load_vertex_id_zero_base: 2143 case nir_intrinsic_load_instance_id: 2144 emit_vertex_builtin(ctx, instr); 2145 break; 2146 2147 case nir_intrinsic_load_sample_mask_in: 2148 emit_special(ctx, instr, 96); 2149 break; 2150 2151 case nir_intrinsic_load_sample_id: 2152 emit_special(ctx, instr, 97); 2153 break; 2154 2155 /* Midgard doesn't seem to want special handling */ 2156 case nir_intrinsic_memory_barrier: 2157 case nir_intrinsic_memory_barrier_buffer: 2158 case nir_intrinsic_memory_barrier_image: 2159 case nir_intrinsic_memory_barrier_shared: 2160 case nir_intrinsic_group_memory_barrier: 2161 break; 2162 2163 case nir_intrinsic_control_barrier: 2164 schedule_barrier(ctx); 2165 emit_control_barrier(ctx); 2166 schedule_barrier(ctx); 2167 break; 2168 2169 ATOMIC_CASE(ctx, instr, add, add); 2170 ATOMIC_CASE(ctx, instr, and, and); 2171 ATOMIC_CASE(ctx, instr, comp_swap, cmpxchg); 2172 ATOMIC_CASE(ctx, instr, exchange, xchg); 2173 ATOMIC_CASE(ctx, instr, imax, imax); 2174 ATOMIC_CASE(ctx, instr, imin, imin); 2175 ATOMIC_CASE(ctx, instr, or, or); 2176 ATOMIC_CASE(ctx, instr, umax, umax); 2177 ATOMIC_CASE(ctx, instr, umin, umin); 2178 ATOMIC_CASE(ctx, instr, xor, xor); 2179 2180 IMAGE_ATOMIC_CASE(ctx, instr, add, add); 2181 IMAGE_ATOMIC_CASE(ctx, instr, and, and); 2182 IMAGE_ATOMIC_CASE(ctx, instr, comp_swap, cmpxchg); 2183 IMAGE_ATOMIC_CASE(ctx, instr, exchange, xchg); 2184 IMAGE_ATOMIC_CASE(ctx, instr, imax, imax); 2185 IMAGE_ATOMIC_CASE(ctx, instr, imin, imin); 2186 IMAGE_ATOMIC_CASE(ctx, instr, or, or); 2187 IMAGE_ATOMIC_CASE(ctx, instr, umax, umax); 2188 IMAGE_ATOMIC_CASE(ctx, instr, umin, umin); 2189 IMAGE_ATOMIC_CASE(ctx, instr, xor, xor); 2190 2191 default: 2192 fprintf(stderr, "Unhandled intrinsic %s\n", nir_intrinsic_infos[instr->intrinsic].name); 2193 assert(0); 2194 break; 2195 } 2196} 2197 2198/* Returns dimension with 0 special casing cubemaps */ 2199static unsigned 2200midgard_tex_format(enum glsl_sampler_dim dim) 2201{ 2202 switch (dim) { 2203 case GLSL_SAMPLER_DIM_1D: 2204 case GLSL_SAMPLER_DIM_BUF: 2205 return 1; 2206 2207 case GLSL_SAMPLER_DIM_2D: 2208 case GLSL_SAMPLER_DIM_MS: 2209 case GLSL_SAMPLER_DIM_EXTERNAL: 2210 case GLSL_SAMPLER_DIM_RECT: 2211 return 2; 2212 2213 case GLSL_SAMPLER_DIM_3D: 2214 return 3; 2215 2216 case GLSL_SAMPLER_DIM_CUBE: 2217 return 0; 2218 2219 default: 2220 DBG("Unknown sampler dim type\n"); 2221 assert(0); 2222 return 0; 2223 } 2224} 2225 2226/* Tries to attach an explicit LOD or bias as a constant. 
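The encoding is effectively signed 8.8 fixed point: the integer part goes in bias_int and the always-non-negative fraction is scaled to a byte in bias.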
Returns whether this 2227 * was successful */ 2228 2229static bool 2230pan_attach_constant_bias( 2231 compiler_context *ctx, 2232 nir_src lod, 2233 midgard_texture_word *word) 2234{ 2235 /* To attach as constant, it has to *be* constant */ 2236 2237 if (!nir_src_is_const(lod)) 2238 return false; 2239 2240 float f = nir_src_as_float(lod); 2241 2242 /* Break into fixed-point */ 2243 signed lod_int = f; 2244 float lod_frac = f - lod_int; 2245 2246 /* Carry over negative fractions */ 2247 if (lod_frac < 0.0) { 2248 lod_int--; 2249 lod_frac += 1.0; 2250 } 2251 2252 /* Encode */ 2253 word->bias = float_to_ubyte(lod_frac); 2254 word->bias_int = lod_int; 2255 2256 return true; 2257} 2258 2259static enum mali_texture_mode 2260mdg_texture_mode(nir_tex_instr *instr) 2261{ 2262 if (instr->op == nir_texop_tg4 && instr->is_shadow) 2263 return TEXTURE_GATHER_SHADOW; 2264 else if (instr->op == nir_texop_tg4) 2265 return TEXTURE_GATHER_X + instr->component; 2266 else if (instr->is_shadow) 2267 return TEXTURE_SHADOW; 2268 else 2269 return TEXTURE_NORMAL; 2270} 2271 2272static void 2273set_tex_coord(compiler_context *ctx, nir_tex_instr *instr, 2274 midgard_instruction *ins) 2275{ 2276 int coord_idx = nir_tex_instr_src_index(instr, nir_tex_src_coord); 2277 2278 assert(coord_idx >= 0); 2279 2280 int comparator_idx = nir_tex_instr_src_index(instr, nir_tex_src_comparator); 2281 int ms_idx = nir_tex_instr_src_index(instr, nir_tex_src_ms_index); 2282 assert(comparator_idx < 0 || ms_idx < 0); 2283 int ms_or_comparator_idx = ms_idx >= 0 ? ms_idx : comparator_idx; 2284 2285 unsigned coords = nir_src_index(ctx, &instr->src[coord_idx].src); 2286 2287 emit_explicit_constant(ctx, coords, coords); 2288 2289 ins->src_types[1] = nir_tex_instr_src_type(instr, coord_idx) | 2290 nir_src_bit_size(instr->src[coord_idx].src); 2291 2292 unsigned nr_comps = instr->coord_components; 2293 unsigned written_mask = 0, write_mask = 0; 2294 2295 /* Initialize all components to coord.x which is expected to always be 2296 * present. Swizzle is updated below based on the texture dimension 2297 * and extra attributes that are packed in the coordinate argument. 2298 */ 2299 for (unsigned c = 0; c < MIR_VEC_COMPONENTS; c++) 2300 ins->swizzle[1][c] = COMPONENT_X; 2301 2302 /* Shadow ref value is part of the coordinates if there's no comparator 2303 * source, in that case it's always placed in the last component. 2304 * Midgard wants the ref value in coord.z. 2305 */ 2306 if (instr->is_shadow && comparator_idx < 0) { 2307 ins->swizzle[1][COMPONENT_Z] = --nr_comps; 2308 write_mask |= 1 << COMPONENT_Z; 2309 } 2310 2311 /* The array index is the last component if there's no shadow ref value 2312 * or second last if there's one. We already decremented the number of 2313 * components to account for the shadow ref value above. 2314 * Midgard wants the array index in coord.w. 
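* For example, an array shadow lookup with the ref packed in the coordinates arrives as (x, y, layer, ref): the ref was routed to coord.z above, and the layer, now the last remaining component, is swizzled into coord.w here.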
2315 */ 2316 if (instr->is_array) { 2317 ins->swizzle[1][COMPONENT_W] = --nr_comps; 2318 write_mask |= 1 << COMPONENT_W; 2319 } 2320 2321 if (instr->sampler_dim == GLSL_SAMPLER_DIM_CUBE) { 2322 /* texelFetch is undefined on samplerCube */ 2323 assert(ins->op != midgard_tex_op_fetch); 2324 2325 ins->src[1] = make_compiler_temp_reg(ctx); 2326 2327 /* For cubemaps, we use a special ld/st op to select the face 2328 * and copy the xy into the texture register 2329 */ 2330 midgard_instruction ld = m_ld_cubemap_coords(ins->src[1], 0); 2331 ld.src[1] = coords; 2332 ld.src_types[1] = ins->src_types[1]; 2333 ld.mask = 0x3; /* xy */ 2334 ld.load_store.bitsize_toggle = true; 2335 ld.swizzle[1][3] = COMPONENT_X; 2336 emit_mir_instruction(ctx, ld); 2337 2338 /* We packed cube coordiates (X,Y,Z) into (X,Y), update the 2339 * written mask accordingly and decrement the number of 2340 * components 2341 */ 2342 nr_comps--; 2343 written_mask |= 3; 2344 } 2345 2346 /* Now flag tex coord components that have not been written yet */ 2347 write_mask |= mask_of(nr_comps) & ~written_mask; 2348 for (unsigned c = 0; c < nr_comps; c++) 2349 ins->swizzle[1][c] = c; 2350 2351 /* Sample index and shadow ref are expected in coord.z */ 2352 if (ms_or_comparator_idx >= 0) { 2353 assert(!((write_mask | written_mask) & (1 << COMPONENT_Z))); 2354 2355 unsigned sample_or_ref = 2356 nir_src_index(ctx, &instr->src[ms_or_comparator_idx].src); 2357 2358 emit_explicit_constant(ctx, sample_or_ref, sample_or_ref); 2359 2360 if (ins->src[1] == ~0) 2361 ins->src[1] = make_compiler_temp_reg(ctx); 2362 2363 midgard_instruction mov = v_mov(sample_or_ref, ins->src[1]); 2364 2365 for (unsigned c = 0; c < MIR_VEC_COMPONENTS; c++) 2366 mov.swizzle[1][c] = COMPONENT_X; 2367 2368 mov.mask = 1 << COMPONENT_Z; 2369 written_mask |= 1 << COMPONENT_Z; 2370 ins->swizzle[1][COMPONENT_Z] = COMPONENT_Z; 2371 emit_mir_instruction(ctx, mov); 2372 } 2373 2374 /* Texelfetch coordinates uses all four elements (xyz/index) regardless 2375 * of texture dimensionality, which means it's necessary to zero the 2376 * unused components to keep everything happy. 
2377 */ 2378 if (ins->op == midgard_tex_op_fetch && 2379 (written_mask | write_mask) != 0xF) { 2380 if (ins->src[1] == ~0) 2381 ins->src[1] = make_compiler_temp_reg(ctx); 2382 2383 /* mov index.zw, #0, or generalized */ 2384 midgard_instruction mov = 2385 v_mov(SSA_FIXED_REGISTER(REGISTER_CONSTANT), ins->src[1]); 2386 mov.has_constants = true; 2387 mov.mask = (written_mask | write_mask) ^ 0xF; 2388 emit_mir_instruction(ctx, mov); 2389 for (unsigned c = 0; c < MIR_VEC_COMPONENTS; c++) { 2390 if (mov.mask & (1 << c)) 2391 ins->swizzle[1][c] = c; 2392 } 2393 } 2394 2395 if (ins->src[1] == ~0) { 2396 /* No temporary reg created, use the src coords directly */ 2397 ins->src[1] = coords; 2398 } else if (write_mask) { 2399 /* Move the remaining coordinates to the temporary reg */ 2400 midgard_instruction mov = v_mov(coords, ins->src[1]); 2401 2402 for (unsigned c = 0; c < MIR_VEC_COMPONENTS; c++) { 2403 if ((1 << c) & write_mask) { 2404 mov.swizzle[1][c] = ins->swizzle[1][c]; 2405 ins->swizzle[1][c] = c; 2406 } else { 2407 mov.swizzle[1][c] = COMPONENT_X; 2408 } 2409 } 2410 2411 mov.mask = write_mask; 2412 emit_mir_instruction(ctx, mov); 2413 } 2414} 2415 2416static void 2417emit_texop_native(compiler_context *ctx, nir_tex_instr *instr, 2418 unsigned midgard_texop) 2419{ 2420 nir_dest *dest = &instr->dest; 2421 2422 int texture_index = instr->texture_index; 2423 int sampler_index = instr->sampler_index; 2424 2425 nir_alu_type dest_base = nir_alu_type_get_base_type(instr->dest_type); 2426 2427 /* texture instructions support float outmods */ 2428 unsigned outmod = midgard_outmod_none; 2429 if (dest_base == nir_type_float) { 2430 outmod = mir_determine_float_outmod(ctx, &dest, 0); 2431 } 2432 2433 midgard_instruction ins = { 2434 .type = TAG_TEXTURE_4, 2435 .mask = 0xF, 2436 .dest = nir_dest_index(dest), 2437 .src = { ~0, ~0, ~0, ~0 }, 2438 .dest_type = instr->dest_type, 2439 .swizzle = SWIZZLE_IDENTITY_4, 2440 .outmod = outmod, 2441 .op = midgard_texop, 2442 .texture = { 2443 .format = midgard_tex_format(instr->sampler_dim), 2444 .texture_handle = texture_index, 2445 .sampler_handle = sampler_index, 2446 .mode = mdg_texture_mode(instr) 2447 } 2448 }; 2449 2450 if (instr->is_shadow && !instr->is_new_style_shadow && instr->op != nir_texop_tg4) 2451 for (int i = 0; i < 4; ++i) 2452 ins.swizzle[0][i] = COMPONENT_X; 2453 2454 for (unsigned i = 0; i < instr->num_srcs; ++i) { 2455 int index = nir_src_index(ctx, &instr->src[i].src); 2456 unsigned sz = nir_src_bit_size(instr->src[i].src); 2457 nir_alu_type T = nir_tex_instr_src_type(instr, i) | sz; 2458 2459 switch (instr->src[i].src_type) { 2460 case nir_tex_src_coord: 2461 set_tex_coord(ctx, instr, &ins); 2462 break; 2463 2464 case nir_tex_src_bias: 2465 case nir_tex_src_lod: { 2466 /* Try as a constant if we can */ 2467 2468 bool is_txf = midgard_texop == midgard_tex_op_fetch; 2469 if (!is_txf && pan_attach_constant_bias(ctx, instr->src[i].src, &ins.texture)) 2470 break; 2471 2472 ins.texture.lod_register = true; 2473 ins.src[2] = index; 2474 ins.src_types[2] = T; 2475 2476 for (unsigned c = 0; c < MIR_VEC_COMPONENTS; ++c) 2477 ins.swizzle[2][c] = COMPONENT_X; 2478 2479 emit_explicit_constant(ctx, index, index); 2480 2481 break; 2482 }; 2483 2484 case nir_tex_src_offset: { 2485 ins.texture.offset_register = true; 2486 ins.src[3] = index; 2487 ins.src_types[3] = T; 2488 2489 for (unsigned c = 0; c < MIR_VEC_COMPONENTS; ++c) 2490 ins.swizzle[3][c] = (c > COMPONENT_Z) ? 
0 : c; 2491 2492 emit_explicit_constant(ctx, index, index); 2493 break; 2494 }; 2495 2496 case nir_tex_src_comparator: 2497 case nir_tex_src_ms_index: 2498 /* Nothing to do, handled in set_tex_coord() */ 2499 break; 2500 2501 default: { 2502 fprintf(stderr, "Unknown texture source type: %d\n", instr->src[i].src_type); 2503 assert(0); 2504 } 2505 } 2506 } 2507 2508 emit_mir_instruction(ctx, ins); 2509} 2510 2511static void 2512emit_tex(compiler_context *ctx, nir_tex_instr *instr) 2513{ 2514 switch (instr->op) { 2515 case nir_texop_tex: 2516 case nir_texop_txb: 2517 emit_texop_native(ctx, instr, midgard_tex_op_normal); 2518 break; 2519 case nir_texop_txl: 2520 case nir_texop_tg4: 2521 emit_texop_native(ctx, instr, midgard_tex_op_gradient); 2522 break; 2523 case nir_texop_txf: 2524 case nir_texop_txf_ms: 2525 emit_texop_native(ctx, instr, midgard_tex_op_fetch); 2526 break; 2527 case nir_texop_txs: 2528 emit_sysval_read(ctx, &instr->instr, 4, 0); 2529 break; 2530 default: { 2531 fprintf(stderr, "Unhandled texture op: %d\n", instr->op); 2532 assert(0); 2533 } 2534 } 2535} 2536 2537static void 2538emit_jump(compiler_context *ctx, nir_jump_instr *instr) 2539{ 2540 switch (instr->type) { 2541 case nir_jump_break: { 2542 /* Emit a branch out of the loop */ 2543 struct midgard_instruction br = v_branch(false, false); 2544 br.branch.target_type = TARGET_BREAK; 2545 br.branch.target_break = ctx->current_loop_depth; 2546 emit_mir_instruction(ctx, br); 2547 break; 2548 } 2549 2550 default: 2551 unreachable("Unhandled jump"); 2552 } 2553} 2554 2555static void 2556emit_instr(compiler_context *ctx, struct nir_instr *instr) 2557{ 2558 switch (instr->type) { 2559 case nir_instr_type_load_const: 2560 emit_load_const(ctx, nir_instr_as_load_const(instr)); 2561 break; 2562 2563 case nir_instr_type_intrinsic: 2564 emit_intrinsic(ctx, nir_instr_as_intrinsic(instr)); 2565 break; 2566 2567 case nir_instr_type_alu: 2568 emit_alu(ctx, nir_instr_as_alu(instr)); 2569 break; 2570 2571 case nir_instr_type_tex: 2572 emit_tex(ctx, nir_instr_as_tex(instr)); 2573 break; 2574 2575 case nir_instr_type_jump: 2576 emit_jump(ctx, nir_instr_as_jump(instr)); 2577 break; 2578 2579 case nir_instr_type_ssa_undef: 2580 /* Spurious */ 2581 break; 2582 2583 default: 2584 DBG("Unhandled instruction type\n"); 2585 break; 2586 } 2587} 2588 2589 2590/* ALU instructions can inline or embed constants, which decreases register 2591 * pressure and saves space. */ 2592 2593#define CONDITIONAL_ATTACH(idx) { \ 2594 void *entry = _mesa_hash_table_u64_search(ctx->ssa_constants, alu->src[idx] + 1); \ 2595\ 2596 if (entry) { \ 2597 attach_constants(ctx, alu, entry, alu->src[idx] + 1); \ 2598 alu->src[idx] = SSA_FIXED_REGISTER(REGISTER_CONSTANT); \ 2599 } \ 2600} 2601 2602static void 2603inline_alu_constants(compiler_context *ctx, midgard_block *block) 2604{ 2605 mir_foreach_instr_in_block(block, alu) { 2606 /* Other instructions cannot inline constants */ 2607 if (alu->type != TAG_ALU_4) continue; 2608 if (alu->compact_branch) continue; 2609 2610 /* If there is already a constant here, we can do nothing */ 2611 if (alu->has_constants) continue; 2612 2613 CONDITIONAL_ATTACH(0); 2614 2615 if (!alu->has_constants) { 2616 CONDITIONAL_ATTACH(1) 2617 } else if (!alu->inline_constant) { 2618 /* Corner case: _two_ vec4 constants, for instance with a 2619 * csel. For this case, we can only use a constant 2620 * register for one, we'll have to emit a move for the 2621 * other. 
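* Concretely, the code below copies src[1]'s constant out of the constant register with a v_mov into a scratch temporary and rewrites src[1] to read that temporary instead.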
*/ 2622 2623 void *entry = _mesa_hash_table_u64_search(ctx->ssa_constants, alu->src[1] + 1); 2624 unsigned scratch = make_compiler_temp(ctx); 2625 2626 if (entry) { 2627 midgard_instruction ins = v_mov(SSA_FIXED_REGISTER(REGISTER_CONSTANT), scratch); 2628 attach_constants(ctx, &ins, entry, alu->src[1] + 1); 2629 2630 /* Set the source */ 2631 alu->src[1] = scratch; 2632 2633 /* Inject us -before- the last instruction which set r31 */ 2634 mir_insert_instruction_before(ctx, mir_prev_op(alu), ins); 2635 } 2636 } 2637 } 2638} 2639 2640unsigned 2641max_bitsize_for_alu(midgard_instruction *ins) 2642{ 2643 unsigned max_bitsize = 0; 2644 for (int i = 0; i < MIR_SRC_COUNT; i++) { 2645 if (ins->src[i] == ~0) continue; 2646 unsigned src_bitsize = nir_alu_type_get_type_size(ins->src_types[i]); 2647 max_bitsize = MAX2(src_bitsize, max_bitsize); 2648 } 2649 unsigned dst_bitsize = nir_alu_type_get_type_size(ins->dest_type); 2650 max_bitsize = MAX2(dst_bitsize, max_bitsize); 2651 2652 /* We don't have fp16 LUTs, so we'll want to emit code like: 2653 * 2654 * vlut.fsinr hr0, hr0 2655 * 2656 * where both input and output are 16-bit but the operation is carried 2657 * out in 32-bit 2658 */ 2659 2660 switch (ins->op) { 2661 case midgard_alu_op_fsqrt: 2662 case midgard_alu_op_frcp: 2663 case midgard_alu_op_frsqrt: 2664 case midgard_alu_op_fsinpi: 2665 case midgard_alu_op_fcospi: 2666 case midgard_alu_op_fexp2: 2667 case midgard_alu_op_flog2: 2668 max_bitsize = MAX2(max_bitsize, 32); 2669 break; 2670 2671 default: 2672 break; 2673 } 2674 2675 /* High implies computing at a higher bitsize, e.g umul_high of 32-bit 2676 * requires computing at 64-bit */ 2677 if (midgard_is_integer_out_op(ins->op) && ins->outmod == midgard_outmod_keephi) { 2678 max_bitsize *= 2; 2679 assert(max_bitsize <= 64); 2680 } 2681 2682 return max_bitsize; 2683} 2684 2685midgard_reg_mode 2686reg_mode_for_bitsize(unsigned bitsize) 2687{ 2688 switch (bitsize) { 2689 /* use 16 pipe for 8 since we don't support vec16 yet */ 2690 case 8: 2691 case 16: 2692 return midgard_reg_mode_16; 2693 case 32: 2694 return midgard_reg_mode_32; 2695 case 64: 2696 return midgard_reg_mode_64; 2697 default: 2698 unreachable("invalid bit size"); 2699 } 2700} 2701 2702/* Midgard supports two types of constants, embedded constants (128-bit) and 2703 * inline constants (16-bit). Sometimes, especially with scalar ops, embedded 2704 * constants can be demoted to inline constants, for space savings and 2705 * sometimes a performance boost */ 2706 2707static void 2708embedded_to_inline_constant(compiler_context *ctx, midgard_block *block) 2709{ 2710 mir_foreach_instr_in_block(block, ins) { 2711 if (!ins->has_constants) continue; 2712 if (ins->has_inline_constant) continue; 2713 2714 unsigned max_bitsize = max_bitsize_for_alu(ins); 2715 2716 /* We can inline 32-bit (sometimes) or 16-bit (usually) */ 2717 bool is_16 = max_bitsize == 16; 2718 bool is_32 = max_bitsize == 32; 2719 2720 if (!(is_16 || is_32)) 2721 continue; 2722 2723 /* src1 cannot be an inline constant due to encoding 2724 * restrictions. So, if possible we try to flip the arguments 2725 * in that case */ 2726 2727 int op = ins->op; 2728 2729 if (ins->src[0] == SSA_FIXED_REGISTER(REGISTER_CONSTANT) && 2730 alu_opcode_props[op].props & OP_COMMUTES) { 2731 mir_flip(ins); 2732 } 2733 2734 if (ins->src[1] == SSA_FIXED_REGISTER(REGISTER_CONSTANT)) { 2735 /* Component is from the swizzle. 
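(An inline constant is a single 16-bit scalar, so the lane that matters is whichever constant component the first enabled writemask bit selects.)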
Take a nonzero component */ 2736 assert(ins->mask); 2737 unsigned first_comp = ffs(ins->mask) - 1; 2738 unsigned component = ins->swizzle[1][first_comp]; 2739 2740 /* Scale constant appropriately, if we can legally */ 2741 int16_t scaled_constant = 0; 2742 2743 if (is_16) { 2744 scaled_constant = ins->constants.u16[component]; 2745 } else if (midgard_is_integer_op(op)) { 2746 scaled_constant = ins->constants.u32[component]; 2747 2748 /* Constant overflow after resize */ 2749 if (scaled_constant != ins->constants.u32[component]) 2750 continue; 2751 } else { 2752 float original = ins->constants.f32[component]; 2753 scaled_constant = _mesa_float_to_half(original); 2754 2755 /* Check for loss of precision. If this is 2756 * mediump, we don't care, but for a highp 2757 * shader, we need to pay attention. NIR 2758 * doesn't yet tell us which mode we're in! 2759 * Practically this prevents most constants 2760 * from being inlined, sadly. */ 2761 2762 float fp32 = _mesa_half_to_float(scaled_constant); 2763 2764 if (fp32 != original) 2765 continue; 2766 } 2767 2768 /* Should've been const folded */ 2769 if (ins->src_abs[1] || ins->src_neg[1]) 2770 continue; 2771 2772 /* Make sure that the constant is not itself a vector 2773 * by checking if all accessed values are the same. */ 2774 2775 const midgard_constants *cons = &ins->constants; 2776 uint32_t value = is_16 ? cons->u16[component] : cons->u32[component]; 2777 2778 bool is_vector = false; 2779 unsigned mask = effective_writemask(ins->op, ins->mask); 2780 2781 for (unsigned c = 0; c < MIR_VEC_COMPONENTS; ++c) { 2782 /* We only care if this component is actually used */ 2783 if (!(mask & (1 << c))) 2784 continue; 2785 2786 uint32_t test = is_16 ? 2787 cons->u16[ins->swizzle[1][c]] : 2788 cons->u32[ins->swizzle[1][c]]; 2789 2790 if (test != value) { 2791 is_vector = true; 2792 break; 2793 } 2794 } 2795 2796 if (is_vector) 2797 continue; 2798 2799 /* Get rid of the embedded constant */ 2800 ins->has_constants = false; 2801 ins->src[1] = ~0; 2802 ins->has_inline_constant = true; 2803 ins->inline_constant = scaled_constant; 2804 } 2805 } 2806} 2807 2808/* Dead code elimination for branches at the end of a block - only one branch 2809 * per block is legal semantically */ 2810 2811static void 2812midgard_cull_dead_branch(compiler_context *ctx, midgard_block *block) 2813{ 2814 bool branched = false; 2815 2816 mir_foreach_instr_in_block_safe(block, ins) { 2817 if (!midgard_is_branch_unit(ins->unit)) continue; 2818 2819 if (branched) 2820 mir_remove_instruction(ins); 2821 2822 branched = true; 2823 } 2824} 2825 2826/* We want to force the invert on AND/OR to the second slot to legalize into 2827 * iandnot/iornot. The relevant patterns are for AND (and OR respectively) 2828 * 2829 * ~a & #b = ~a & ~(#~b) 2830 * ~a & b = b & ~a 2831 */ 2832 2833static void 2834midgard_legalize_invert(compiler_context *ctx, midgard_block *block) 2835{ 2836 mir_foreach_instr_in_block(block, ins) { 2837 if (ins->type != TAG_ALU_4) continue; 2838 2839 if (ins->op != midgard_alu_op_iand && 2840 ins->op != midgard_alu_op_ior) continue; 2841 2842 if (ins->src_invert[1] || !ins->src_invert[0]) continue; 2843 2844 if (ins->has_inline_constant) { 2845 /* ~(#~a) = ~(~#a) = a, so valid, and forces both 2846 * inverts on */ 2847 ins->inline_constant = ~ins->inline_constant; 2848 ins->src_invert[1] = true; 2849 } else { 2850 /* Flip to the right invert order. Note 2851 * has_inline_constant false by assumption on the 2852 * branch, so flipping makes sense. 
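* This is the ~a & b = b & ~a rewrite from the comment above: flipping moves the inverted operand into the second slot, where it can legalize to iandnot/iornot.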
*/ 2853 mir_flip(ins); 2854 } 2855 } 2856} 2857 2858static unsigned 2859emit_fragment_epilogue(compiler_context *ctx, unsigned rt, unsigned sample_iter) 2860{ 2861 /* Loop to ourselves */ 2862 midgard_instruction *br = ctx->writeout_branch[rt][sample_iter]; 2863 struct midgard_instruction ins = v_branch(false, false); 2864 ins.writeout = br->writeout; 2865 ins.branch.target_block = ctx->block_count - 1; 2866 ins.constants.u32[0] = br->constants.u32[0]; 2867 memcpy(&ins.src_types, &br->src_types, sizeof(ins.src_types)); 2868 emit_mir_instruction(ctx, ins); 2869 2870 ctx->current_block->epilogue = true; 2871 schedule_barrier(ctx); 2872 return ins.branch.target_block; 2873} 2874 2875static midgard_block * 2876emit_block_init(compiler_context *ctx) 2877{ 2878 midgard_block *this_block = ctx->after_block; 2879 ctx->after_block = NULL; 2880 2881 if (!this_block) 2882 this_block = create_empty_block(ctx); 2883 2884 list_addtail(&this_block->base.link, &ctx->blocks); 2885 2886 this_block->scheduled = false; 2887 ++ctx->block_count; 2888 2889 /* Set up current block */ 2890 list_inithead(&this_block->base.instructions); 2891 ctx->current_block = this_block; 2892 2893 return this_block; 2894} 2895 2896static midgard_block * 2897emit_block(compiler_context *ctx, nir_block *block) 2898{ 2899 midgard_block *this_block = emit_block_init(ctx); 2900 2901 nir_foreach_instr(instr, block) { 2902 emit_instr(ctx, instr); 2903 ++ctx->instruction_count; 2904 } 2905 2906 return this_block; 2907} 2908 2909static midgard_block *emit_cf_list(struct compiler_context *ctx, struct exec_list *list); 2910 2911static void 2912emit_if(struct compiler_context *ctx, nir_if *nif) 2913{ 2914 midgard_block *before_block = ctx->current_block; 2915 2916 /* Speculatively emit the branch, but we can't fill it in until later */ 2917 bool inv = false; 2918 EMIT(branch, true, true); 2919 midgard_instruction *then_branch = mir_last_in_block(ctx->current_block); 2920 then_branch->src[0] = mir_get_branch_cond(&nif->condition, &inv); 2921 then_branch->src_types[0] = nir_type_uint32; 2922 then_branch->branch.invert_conditional = !inv; 2923 2924 /* Emit the two subblocks. 
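The then and else lists are emitted back to back; the speculative branches get their targets patched afterwards, once we know whether the else side produced any instructions.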
*/ 2925 midgard_block *then_block = emit_cf_list(ctx, &nif->then_list); 2926 midgard_block *end_then_block = ctx->current_block; 2927 2928 /* Emit a jump from the end of the then block to the end of the else */ 2929 EMIT(branch, false, false); 2930 midgard_instruction *then_exit = mir_last_in_block(ctx->current_block); 2931 2932 /* Emit second block, and check if it's empty */ 2933 2934 int else_idx = ctx->block_count; 2935 int count_in = ctx->instruction_count; 2936 midgard_block *else_block = emit_cf_list(ctx, &nif->else_list); 2937 midgard_block *end_else_block = ctx->current_block; 2938 int after_else_idx = ctx->block_count; 2939 2940 /* Now that we have the subblocks emitted, fix up the branches */ 2941 2942 assert(then_block); 2943 assert(else_block); 2944 2945 if (ctx->instruction_count == count_in) { 2946 /* The else block is empty, so don't emit an exit jump */ 2947 mir_remove_instruction(then_exit); 2948 then_branch->branch.target_block = after_else_idx; 2949 } else { 2950 then_branch->branch.target_block = else_idx; 2951 then_exit->branch.target_block = after_else_idx; 2952 } 2953 2954 /* Wire up the successors */ 2955 2956 ctx->after_block = create_empty_block(ctx); 2957 2958 pan_block_add_successor(&before_block->base, &then_block->base); 2959 pan_block_add_successor(&before_block->base, &else_block->base); 2960 2961 pan_block_add_successor(&end_then_block->base, &ctx->after_block->base); 2962 pan_block_add_successor(&end_else_block->base, &ctx->after_block->base); 2963} 2964 2965static void 2966emit_loop(struct compiler_context *ctx, nir_loop *nloop) 2967{ 2968 /* Remember where we are */ 2969 midgard_block *start_block = ctx->current_block; 2970 2971 /* Allocate a loop number, growing the current inner loop depth */ 2972 int loop_idx = ++ctx->current_loop_depth; 2973 2974 /* Get index from before the body so we can loop back later */ 2975 int start_idx = ctx->block_count; 2976 2977 /* Emit the body itself */ 2978 midgard_block *loop_block = emit_cf_list(ctx, &nloop->body); 2979 2980 /* Branch back to loop back */ 2981 struct midgard_instruction br_back = v_branch(false, false); 2982 br_back.branch.target_block = start_idx; 2983 emit_mir_instruction(ctx, br_back); 2984 2985 /* Mark down that branch in the graph. */ 2986 pan_block_add_successor(&start_block->base, &loop_block->base); 2987 pan_block_add_successor(&ctx->current_block->base, &loop_block->base); 2988 2989 /* Find the index of the block about to follow us (note: we don't add 2990 * one; blocks are 0-indexed so we get a fencepost problem) */ 2991 int break_block_idx = ctx->block_count; 2992 2993 /* Fix up the break statements we emitted to point to the right place, 2994 * now that we can allocate a block number for them */ 2995 ctx->after_block = create_empty_block(ctx); 2996 2997 mir_foreach_block_from(ctx, start_block, _block) { 2998 mir_foreach_instr_in_block(((midgard_block *) _block), ins) { 2999 if (ins->type != TAG_ALU_4) continue; 3000 if (!ins->compact_branch) continue; 3001 3002 /* We found a branch -- check the type to see if we need to do anything */ 3003 if (ins->branch.target_type != TARGET_BREAK) continue; 3004 3005 /* It's a break! Check if it's our break */ 3006 if (ins->branch.target_break != loop_idx) continue; 3007 3008 /* Okay, cool, we're breaking out of this loop. 
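* Breaks from nested loops were already rewritten to gotos when those loops finished, so a surviving TARGET_BREAK with our loop_idx must belong to this loop.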
3009 * Rewrite from a break to a goto */ 3010 3011 ins->branch.target_type = TARGET_GOTO; 3012 ins->branch.target_block = break_block_idx; 3013 3014 pan_block_add_successor(_block, &ctx->after_block->base); 3015 } 3016 } 3017 3018 /* Now that we've finished emitting the loop, free up the depth again 3019 * so we play nice with recursion amid nested loops */ 3020 --ctx->current_loop_depth; 3021 3022 /* Dump loop stats */ 3023 ++ctx->loop_count; 3024} 3025 3026static midgard_block * 3027emit_cf_list(struct compiler_context *ctx, struct exec_list *list) 3028{ 3029 midgard_block *start_block = NULL; 3030 3031 foreach_list_typed(nir_cf_node, node, node, list) { 3032 switch (node->type) { 3033 case nir_cf_node_block: { 3034 midgard_block *block = emit_block(ctx, nir_cf_node_as_block(node)); 3035 3036 if (!start_block) 3037 start_block = block; 3038 3039 break; 3040 } 3041 3042 case nir_cf_node_if: 3043 emit_if(ctx, nir_cf_node_as_if(node)); 3044 break; 3045 3046 case nir_cf_node_loop: 3047 emit_loop(ctx, nir_cf_node_as_loop(node)); 3048 break; 3049 3050 case nir_cf_node_function: 3051 assert(0); 3052 break; 3053 } 3054 } 3055 3056 return start_block; 3057} 3058 3059/* Due to lookahead, we need to report the first tag executed in the command 3060 * stream and in branch targets. An initial block might be empty, so iterate 3061 * until we find one that 'works' */ 3062 3063unsigned 3064midgard_get_first_tag_from_block(compiler_context *ctx, unsigned block_idx) 3065{ 3066 midgard_block *initial_block = mir_get_block(ctx, block_idx); 3067 3068 mir_foreach_block_from(ctx, initial_block, _v) { 3069 midgard_block *v = (midgard_block *) _v; 3070 if (v->quadword_count) { 3071 midgard_bundle *initial_bundle = 3072 util_dynarray_element(&v->bundles, midgard_bundle, 0); 3073 3074 return initial_bundle->tag; 3075 } 3076 } 3077 3078 /* Default to a tag 1 which will break from the shader, in case we jump 3079 * to the exit block (i.e. 
`return` in a compute shader) */ 3080 3081 return 1; 3082} 3083 3084/* For each fragment writeout instruction, generate a writeout loop to 3085 * associate with it */ 3086 3087static void 3088mir_add_writeout_loops(compiler_context *ctx) 3089{ 3090 for (unsigned rt = 0; rt < ARRAY_SIZE(ctx->writeout_branch); ++rt) { 3091 for (unsigned s = 0; s < MIDGARD_MAX_SAMPLE_ITER; ++s) { 3092 midgard_instruction *br = ctx->writeout_branch[rt][s]; 3093 if (!br) continue; 3094 3095 unsigned popped = br->branch.target_block; 3096 pan_block_add_successor(&(mir_get_block(ctx, popped - 1)->base), 3097 &ctx->current_block->base); 3098 br->branch.target_block = emit_fragment_epilogue(ctx, rt, s); 3099 br->branch.target_type = TARGET_GOTO; 3100 3101 /* If we have more RTs, we'll need to restore back after our 3102 * loop terminates */ 3103 midgard_instruction *next_br = NULL; 3104 3105 if ((s + 1) < MIDGARD_MAX_SAMPLE_ITER) 3106 next_br = ctx->writeout_branch[rt][s + 1]; 3107 3108 if (!next_br && (rt + 1) < ARRAY_SIZE(ctx->writeout_branch)) 3109 next_br = ctx->writeout_branch[rt + 1][0]; 3110 3111 if (next_br) { 3112 midgard_instruction uncond = v_branch(false, false); 3113 uncond.branch.target_block = popped; 3114 uncond.branch.target_type = TARGET_GOTO; 3115 emit_mir_instruction(ctx, uncond); 3116 pan_block_add_successor(&ctx->current_block->base, 3117 &(mir_get_block(ctx, popped)->base)); 3118 schedule_barrier(ctx); 3119 } else { 3120 /* We're last, so we can terminate here */ 3121 br->last_writeout = true; 3122 } 3123 } 3124 } 3125} 3126 3127void 3128midgard_compile_shader_nir(nir_shader *nir, 3129 const struct panfrost_compile_inputs *inputs, 3130 struct util_dynarray *binary, 3131 struct pan_shader_info *info) 3132{ 3133 midgard_debug = debug_get_option_midgard_debug(); 3134 3135 /* TODO: Bound against what? 
*/ 3136 compiler_context *ctx = rzalloc(NULL, compiler_context); 3137 ctx->sysval_to_id = panfrost_init_sysvals(&info->sysvals, 3138 inputs->fixed_sysval_layout, 3139 ctx); 3140 3141 ctx->inputs = inputs; 3142 ctx->nir = nir; 3143 ctx->info = info; 3144 ctx->stage = nir->info.stage; 3145 3146 if (inputs->is_blend) { 3147 unsigned nr_samples = MAX2(inputs->blend.nr_samples, 1); 3148 const struct util_format_description *desc = 3149 util_format_description(inputs->rt_formats[inputs->blend.rt]); 3150 3151 /* We have to split writeout in 128 bit chunks */ 3152 ctx->blend_sample_iterations = 3153 DIV_ROUND_UP(desc->block.bits * nr_samples, 128); 3154 } 3155 ctx->blend_input = ~0; 3156 ctx->blend_src1 = ~0; 3157 ctx->quirks = midgard_get_quirks(inputs->gpu_id); 3158 3159 /* Initialize at a global (not block) level hash tables */ 3160 3161 ctx->ssa_constants = _mesa_hash_table_u64_create(ctx); 3162 3163 /* Lower gl_Position pre-optimisation, but after lowering vars to ssa 3164 * (so we don't accidentally duplicate the epilogue since mesa/st has 3165 * messed with our I/O quite a bit already) */ 3166 3167 NIR_PASS_V(nir, nir_lower_vars_to_ssa); 3168 3169 if (ctx->stage == MESA_SHADER_VERTEX) { 3170 NIR_PASS_V(nir, nir_lower_viewport_transform); 3171 NIR_PASS_V(nir, nir_lower_point_size, 1.0, 0.0); 3172 } 3173 3174 NIR_PASS_V(nir, nir_lower_var_copies); 3175 NIR_PASS_V(nir, nir_lower_vars_to_ssa); 3176 NIR_PASS_V(nir, nir_split_var_copies); 3177 NIR_PASS_V(nir, nir_lower_var_copies); 3178 NIR_PASS_V(nir, nir_lower_global_vars_to_local); 3179 NIR_PASS_V(nir, nir_lower_var_copies); 3180 NIR_PASS_V(nir, nir_lower_vars_to_ssa); 3181 3182 NIR_PASS_V(nir, pan_lower_framebuffer, 3183 inputs->rt_formats, inputs->raw_fmt_mask, 3184 inputs->is_blend, ctx->quirks & MIDGARD_BROKEN_BLEND_LOADS); 3185 3186 NIR_PASS_V(nir, nir_lower_io, nir_var_shader_in | nir_var_shader_out, 3187 glsl_type_size, 0); 3188 NIR_PASS_V(nir, nir_lower_ssbo); 3189 NIR_PASS_V(nir, pan_nir_lower_zs_store); 3190 3191 NIR_PASS_V(nir, pan_nir_lower_64bit_intrin); 3192 3193 NIR_PASS_V(nir, midgard_nir_lower_global_load); 3194 3195 /* Optimisation passes */ 3196 3197 optimise_nir(nir, ctx->quirks, inputs->is_blend, inputs->is_blit); 3198 3199 bool skip_internal = nir->info.internal; 3200 skip_internal &= !(midgard_debug & MIDGARD_DBG_INTERNAL); 3201 3202 if (midgard_debug & MIDGARD_DBG_SHADERS && !skip_internal) 3203 nir_print_shader(nir, stdout); 3204 3205 info->tls_size = nir->scratch_size; 3206 3207 nir_foreach_function(func, nir) { 3208 if (!func->impl) 3209 continue; 3210 3211 list_inithead(&ctx->blocks); 3212 ctx->block_count = 0; 3213 ctx->func = func; 3214 ctx->already_emitted = calloc(BITSET_WORDS(func->impl->ssa_alloc), sizeof(BITSET_WORD)); 3215 3216 if (nir->info.outputs_read && !inputs->is_blend) { 3217 emit_block_init(ctx); 3218 3219 struct midgard_instruction wait = v_branch(false, false); 3220 wait.branch.target_type = TARGET_TILEBUF_WAIT; 3221 3222 emit_mir_instruction(ctx, wait); 3223 3224 ++ctx->instruction_count; 3225 } 3226 3227 emit_cf_list(ctx, &func->impl->body); 3228 free(ctx->already_emitted); 3229 break; /* TODO: Multi-function shaders */ 3230 } 3231 3232 /* Per-block lowering before opts */ 3233 3234 mir_foreach_block(ctx, _block) { 3235 midgard_block *block = (midgard_block *) _block; 3236 inline_alu_constants(ctx, block); 3237 embedded_to_inline_constant(ctx, block); 3238 } 3239 /* MIR-level optimizations */ 3240 3241 bool progress = false; 3242 3243 do { 3244 progress = false; 3245 progress |= 
midgard_opt_dead_code_eliminate(ctx); 3246 3247 mir_foreach_block(ctx, _block) { 3248 midgard_block *block = (midgard_block *) _block; 3249 progress |= midgard_opt_copy_prop(ctx, block); 3250 progress |= midgard_opt_combine_projection(ctx, block); 3251 progress |= midgard_opt_varying_projection(ctx, block); 3252 } 3253 } while (progress); 3254 3255 mir_foreach_block(ctx, _block) { 3256 midgard_block *block = (midgard_block *) _block; 3257 midgard_lower_derivatives(ctx, block); 3258 midgard_legalize_invert(ctx, block); 3259 midgard_cull_dead_branch(ctx, block); 3260 } 3261 3262 if (ctx->stage == MESA_SHADER_FRAGMENT) 3263 mir_add_writeout_loops(ctx); 3264 3265 /* Analyze now that the code is known but before scheduling creates 3266 * pipeline registers which are harder to track */ 3267 mir_analyze_helper_requirements(ctx); 3268 3269 if (midgard_debug & MIDGARD_DBG_SHADERS && !skip_internal) 3270 mir_print_shader(ctx); 3271 3272 /* Schedule! */ 3273 midgard_schedule_program(ctx); 3274 mir_ra(ctx); 3275 3276 if (midgard_debug & MIDGARD_DBG_SHADERS && !skip_internal) 3277 mir_print_shader(ctx); 3278 3279 /* Analyze after scheduling since this is order-dependent */ 3280 mir_analyze_helper_terminate(ctx); 3281 3282 /* Emit flat binary from the instruction arrays. Iterate each block in 3283 * sequence. Save instruction boundaries such that lookahead tags can 3284 * be assigned easily */ 3285 3286 /* Cache _all_ bundles in source order for lookahead across failed branches */ 3287 3288 int bundle_count = 0; 3289 mir_foreach_block(ctx, _block) { 3290 midgard_block *block = (midgard_block *) _block; 3291 bundle_count += block->bundles.size / sizeof(midgard_bundle); 3292 } 3293 midgard_bundle **source_order_bundles = malloc(sizeof(midgard_bundle *) * bundle_count); 3294 int bundle_idx = 0; 3295 mir_foreach_block(ctx, _block) { 3296 midgard_block *block = (midgard_block *) _block; 3297 util_dynarray_foreach(&block->bundles, midgard_bundle, bundle) { 3298 source_order_bundles[bundle_idx++] = bundle; 3299 } 3300 } 3301 3302 int current_bundle = 0; 3303 3304 /* Midgard prefetches instruction types, so during emission we 3305 * need to lookahead. Unless this is the last instruction, in 3306 * which we return 1. */ 3307 3308 mir_foreach_block(ctx, _block) { 3309 midgard_block *block = (midgard_block *) _block; 3310 mir_foreach_bundle_in_block(block, bundle) { 3311 int lookahead = 1; 3312 3313 if (!bundle->last_writeout && (current_bundle + 1 < bundle_count)) 3314 lookahead = source_order_bundles[current_bundle + 1]->tag; 3315 3316 emit_binary_bundle(ctx, block, bundle, binary, lookahead); 3317 ++current_bundle; 3318 } 3319 3320 /* TODO: Free deeper */ 3321 //util_dynarray_fini(&block->instructions); 3322 } 3323 3324 free(source_order_bundles); 3325 3326 /* Report the very first tag executed */ 3327 info->midgard.first_tag = midgard_get_first_tag_from_block(ctx, 0); 3328 3329 info->ubo_mask = ctx->ubo_mask & ((1 << ctx->nir->info.num_ubos) - 1); 3330 3331 if (midgard_debug & MIDGARD_DBG_SHADERS && !skip_internal) { 3332 disassemble_midgard(stdout, binary->data, 3333 binary->size, inputs->gpu_id, 3334 midgard_debug & MIDGARD_DBG_VERBOSE); 3335 fflush(stdout); 3336 } 3337 3338 /* A shader ending on a 16MB boundary causes INSTR_INVALID_PC faults, 3339 * workaround by adding some padding to the end of the shader. (The 3340 * kernel makes sure shader BOs can't cross 16MB boundaries.) 
*/ 3341 if (binary->size) 3342 memset(util_dynarray_grow(binary, uint8_t, 16), 0, 16); 3343 3344 if ((midgard_debug & MIDGARD_DBG_SHADERDB || inputs->shaderdb) && 3345 !nir->info.internal) { 3346 unsigned nr_bundles = 0, nr_ins = 0; 3347 3348 /* Count instructions and bundles */ 3349 3350 mir_foreach_block(ctx, _block) { 3351 midgard_block *block = (midgard_block *) _block; 3352 nr_bundles += util_dynarray_num_elements( 3353 &block->bundles, midgard_bundle); 3354 3355 mir_foreach_bundle_in_block(block, bun) 3356 nr_ins += bun->instruction_count; 3357 } 3358 3359 /* Calculate thread count. There are certain cutoffs by 3360 * register count for thread count */ 3361 3362 unsigned nr_registers = info->work_reg_count; 3363 3364 unsigned nr_threads = 3365 (nr_registers <= 4) ? 4 : 3366 (nr_registers <= 8) ? 2 : 3367 1; 3368 3369 /* Dump stats */ 3370 3371 fprintf(stderr, "%s - %s shader: " 3372 "%u inst, %u bundles, %u quadwords, " 3373 "%u registers, %u threads, %u loops, " 3374 "%u:%u spills:fills\n", 3375 ctx->nir->info.label ?: "", 3376 ctx->inputs->is_blend ? "PAN_SHADER_BLEND" : 3377 gl_shader_stage_name(ctx->stage), 3378 nr_ins, nr_bundles, ctx->quadword_count, 3379 nr_registers, nr_threads, 3380 ctx->loop_count, 3381 ctx->spills, ctx->fills); 3382 } 3383 3384 _mesa_hash_table_u64_destroy(ctx->ssa_constants); 3385 _mesa_hash_table_u64_destroy(ctx->sysval_to_id); 3386 3387 ralloc_free(ctx); 3388} 3389