/*
 * Copyright (C) 2021 Alyssa Rosenzweig <alyssa@rosenzweig.io>
 * Copyright (C) 2020 Collabora Ltd.
 * Copyright © 2016 Broadcom
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
24 */ 25 26#include "main/glheader.h" 27#include "compiler/nir_types.h" 28#include "compiler/nir/nir_builder.h" 29#include "util/u_debug.h" 30#include "util/fast_idiv_by_const.h" 31#include "agx_compile.h" 32#include "agx_compiler.h" 33#include "agx_builder.h" 34 35static const struct debug_named_value agx_debug_options[] = { 36 {"msgs", AGX_DBG_MSGS, "Print debug messages"}, 37 {"shaders", AGX_DBG_SHADERS, "Dump shaders in NIR and AIR"}, 38 {"shaderdb", AGX_DBG_SHADERDB, "Print statistics"}, 39 {"verbose", AGX_DBG_VERBOSE, "Disassemble verbosely"}, 40 {"internal", AGX_DBG_INTERNAL, "Dump even internal shaders"}, 41 {"novalidate",AGX_DBG_NOVALIDATE,"Skip IR validation in debug builds"}, 42 DEBUG_NAMED_VALUE_END 43}; 44 45DEBUG_GET_ONCE_FLAGS_OPTION(agx_debug, "AGX_MESA_DEBUG", agx_debug_options, 0) 46 47int agx_debug = 0; 48 49#define DBG(fmt, ...) \ 50 do { if (agx_debug & AGX_DBG_MSGS) \ 51 fprintf(stderr, "%s:%d: "fmt, \ 52 __FUNCTION__, __LINE__, ##__VA_ARGS__); } while (0) 53 54/* Builds a 64-bit hash table key for an index */ 55static uint64_t 56agx_index_to_key(agx_index idx) 57{ 58 STATIC_ASSERT(sizeof(idx) <= sizeof(uint64_t)); 59 60 uint64_t key = 0; 61 memcpy(&key, &idx, sizeof(idx)); 62 return key; 63} 64 65/* 66 * Extract a single channel out of a vector source. We split vectors with 67 * p_split so we can use the split components directly, without emitting a 68 * machine instruction. This has advantages of RA, as the split can usually be 69 * optimized away. 
70 */ 71static agx_index 72agx_emit_extract(agx_builder *b, agx_index vec, unsigned channel) 73{ 74 agx_index *components = _mesa_hash_table_u64_search(b->shader->allocated_vec, 75 agx_index_to_key(vec)); 76 77 assert(components != NULL && "missing agx_emit_combine_to"); 78 79 return components[channel]; 80} 81 82static void 83agx_cache_combine(agx_builder *b, agx_index dst, 84 agx_index s0, agx_index s1, agx_index s2, agx_index s3) 85{ 86 /* Lifetime of a hash table entry has to be at least as long as the table */ 87 agx_index *channels = ralloc_array(b->shader, agx_index, 4); 88 89 channels[0] = s0; 90 channels[1] = s1; 91 channels[2] = s2; 92 channels[3] = s3; 93 94 _mesa_hash_table_u64_insert(b->shader->allocated_vec, agx_index_to_key(dst), 95 channels); 96} 97 98/* 99 * Combine multiple scalars into a vector destination. This corresponds to 100 * p_combine, lowered to moves (a shuffle in general) after register allocation. 101 * 102 * To optimize vector extractions, we record the individual channels 103 */ 104static agx_instr * 105agx_emit_combine_to(agx_builder *b, agx_index dst, 106 agx_index s0, agx_index s1, agx_index s2, agx_index s3) 107{ 108 agx_cache_combine(b, dst, s0, s1, s2, s3); 109 return agx_p_combine_to(b, dst, s0, s1, s2, s3); 110} 111 112static void 113agx_block_add_successor(agx_block *block, agx_block *successor) 114{ 115 assert(block != NULL && successor != NULL); 116 117 /* Cull impossible edges */ 118 if (block->unconditional_jumps) 119 return; 120 121 for (unsigned i = 0; i < ARRAY_SIZE(block->successors); ++i) { 122 if (block->successors[i]) { 123 if (block->successors[i] == successor) 124 return; 125 else 126 continue; 127 } 128 129 block->successors[i] = successor; 130 util_dynarray_append(&successor->predecessors, agx_block *, block); 131 return; 132 } 133 134 unreachable("Too many successors"); 135} 136 137/* 138 * Splits an n-component vector (vec) into n scalar destinations (dests) using a 139 * split pseudo-instruction. 
140 * 141 * Pre-condition: dests is filled with agx_null(). 142 */ 143static void 144agx_emit_split(agx_builder *b, agx_index *dests, agx_index vec, unsigned n) 145{ 146 /* Setup the destinations */ 147 for (unsigned i = 0; i < n; ++i) { 148 dests[i] = agx_temp(b->shader, vec.size); 149 } 150 151 /* Emit the split */ 152 agx_p_split_to(b, dests[0], dests[1], dests[2], dests[3], vec); 153} 154 155static void 156agx_emit_cached_split(agx_builder *b, agx_index vec, unsigned n) 157{ 158 agx_index dests[4] = { agx_null(), agx_null(), agx_null(), agx_null() }; 159 agx_emit_split(b, dests, vec, n); 160 agx_cache_combine(b, vec, dests[0], dests[1], dests[2], dests[3]); 161} 162 163static void 164agx_emit_load_const(agx_builder *b, nir_load_const_instr *instr) 165{ 166 /* Ensure we've been scalarized and bit size lowered */ 167 unsigned bit_size = instr->def.bit_size; 168 assert(instr->def.num_components == 1); 169 assert(bit_size == 1 || bit_size == 16 || bit_size == 32); 170 171 /* Emit move, later passes can inline/push if useful */ 172 agx_mov_imm_to(b, 173 agx_get_index(instr->def.index, agx_size_for_bits(bit_size)), 174 nir_const_value_as_uint(instr->value[0], bit_size)); 175} 176 177/* Emit code dividing P by Q */ 178static agx_index 179agx_udiv_const(agx_builder *b, agx_index P, uint32_t Q) 180{ 181 /* P / 1 = P */ 182 if (Q == 1) { 183 return P; 184 } 185 186 /* P / UINT32_MAX = 0, unless P = UINT32_MAX when it's one */ 187 if (Q == UINT32_MAX) { 188 agx_index max = agx_mov_imm(b, 32, UINT32_MAX); 189 agx_index one = agx_mov_imm(b, 32, 1); 190 return agx_icmpsel(b, P, max, one, agx_zero(), AGX_ICOND_UEQ); 191 } 192 193 /* P / 2^N = P >> N */ 194 if (util_is_power_of_two_or_zero(Q)) { 195 return agx_ushr(b, P, agx_mov_imm(b, 32, util_logbase2(Q))); 196 } 197 198 /* Fall back on multiplication by a magic number */ 199 struct util_fast_udiv_info info = util_compute_fast_udiv_info(Q, 32, 32); 200 agx_index preshift = agx_mov_imm(b, 32, info.pre_shift); 201 agx_index 
increment = agx_mov_imm(b, 32, info.increment); 202 agx_index postshift = agx_mov_imm(b, 32, info.post_shift); 203 agx_index multiplier = agx_mov_imm(b, 32, info.multiplier); 204 agx_index multiplied = agx_temp(b->shader, AGX_SIZE_64); 205 agx_index n = P; 206 207 if (info.pre_shift != 0) n = agx_ushr(b, n, preshift); 208 if (info.increment != 0) n = agx_iadd(b, n, increment, 0); 209 210 /* 64-bit multiplication, zero extending 32-bit x 32-bit, get the top word */ 211 agx_imad_to(b, multiplied, agx_abs(n), agx_abs(multiplier), agx_zero(), 0); 212 n = agx_temp(b->shader, AGX_SIZE_32); 213 agx_p_extract_to(b, n, multiplied, 1); 214 215 if (info.post_shift != 0) n = agx_ushr(b, n, postshift); 216 217 return n; 218} 219 220/* AGX appears to lack support for vertex attributes. Lower to global loads. */ 221static void 222agx_emit_load_attr(agx_builder *b, agx_index *dests, nir_intrinsic_instr *instr) 223{ 224 nir_src *offset_src = nir_get_io_offset_src(instr); 225 assert(nir_src_is_const(*offset_src) && "no attribute indirects"); 226 unsigned index = nir_intrinsic_base(instr) + 227 nir_src_as_uint(*offset_src); 228 229 struct agx_shader_key *key = b->shader->key; 230 struct agx_attribute attrib = key->vs.attributes[index]; 231 232 /* address = base + (stride * vertex_id) + src_offset */ 233 unsigned buf = attrib.buf; 234 unsigned stride = key->vs.vbuf_strides[buf]; 235 unsigned shift = agx_format_shift(attrib.format); 236 237 agx_index shifted_stride = agx_mov_imm(b, 32, stride >> shift); 238 agx_index src_offset = agx_mov_imm(b, 32, attrib.src_offset); 239 240 agx_index vertex_id = agx_register(10, AGX_SIZE_32); 241 agx_index instance_id = agx_register(12, AGX_SIZE_32); 242 243 /* A nonzero divisor requires dividing the instance ID. A zero divisor 244 * specifies per-instance data. */ 245 agx_index element_id = (attrib.divisor == 0) ? 
vertex_id : 246 agx_udiv_const(b, instance_id, attrib.divisor); 247 248 agx_index offset = agx_imad(b, element_id, shifted_stride, src_offset, 0); 249 250 /* Each VBO has a 64-bit = 4 x 16-bit address, lookup the base address as a sysval */ 251 unsigned num_vbos = key->vs.num_vbufs; 252 unsigned base_length = (num_vbos * 4); 253 agx_index base = agx_indexed_sysval(b->shader, 254 AGX_PUSH_VBO_BASES, AGX_SIZE_64, buf * 4, base_length); 255 256 /* Load the data */ 257 assert(instr->num_components <= 4); 258 259 unsigned actual_comps = (attrib.nr_comps_minus_1 + 1); 260 agx_index vec = agx_vec_for_dest(b->shader, &instr->dest); 261 agx_device_load_to(b, vec, base, offset, attrib.format, 262 BITFIELD_MASK(attrib.nr_comps_minus_1 + 1), 0); 263 agx_wait(b, 0); 264 265 agx_emit_split(b, dests, vec, actual_comps); 266 267 agx_index one = agx_mov_imm(b, 32, fui(1.0)); 268 agx_index zero = agx_mov_imm(b, 32, 0); 269 agx_index default_value[4] = { zero, zero, zero, one }; 270 271 for (unsigned i = actual_comps; i < instr->num_components; ++i) 272 dests[i] = default_value[i]; 273} 274 275static void 276agx_emit_load_vary_flat(agx_builder *b, agx_index *dests, nir_intrinsic_instr *instr) 277{ 278 unsigned components = instr->num_components; 279 assert(components >= 1 && components <= 4); 280 281 nir_src *offset = nir_get_io_offset_src(instr); 282 assert(nir_src_is_const(*offset) && "no indirects"); 283 unsigned imm_index = b->shader->varyings[nir_intrinsic_base(instr)]; 284 imm_index += nir_src_as_uint(*offset); 285 286 assert(nir_dest_bit_size(instr->dest) == 32 && "no 16-bit flat shading"); 287 288 for (unsigned i = 0; i < components; ++i) { 289 /* vec3 for each vertex, unknown what first 2 channels are for */ 290 agx_index values = agx_ld_vary_flat(b, agx_immediate(imm_index + i), 1); 291 dests[i] = agx_p_extract(b, values, 2); 292 } 293} 294 295static void 296agx_emit_load_vary(agx_builder *b, agx_index *dests, nir_intrinsic_instr *instr) 297{ 298 ASSERTED unsigned 
components = instr->num_components; 299 ASSERTED nir_intrinsic_instr *parent = nir_src_as_intrinsic(instr->src[0]); 300 301 assert(components >= 1 && components <= 4); 302 assert(parent); 303 304 /* TODO: Interpolation modes */ 305 assert(parent->intrinsic == nir_intrinsic_load_barycentric_pixel); 306 307 nir_src *offset = nir_get_io_offset_src(instr); 308 assert(nir_src_is_const(*offset) && "no indirects"); 309 unsigned imm_index = b->shader->varyings[nir_intrinsic_base(instr)]; 310 imm_index += nir_src_as_uint(*offset) * 4; 311 312 agx_index vec = agx_vec_for_intr(b->shader, instr); 313 agx_ld_vary_to(b, vec, agx_immediate(imm_index), components, true); 314 agx_emit_split(b, dests, vec, components); 315} 316 317static agx_instr * 318agx_emit_store_vary(agx_builder *b, nir_intrinsic_instr *instr) 319{ 320 nir_src *offset = nir_get_io_offset_src(instr); 321 assert(nir_src_is_const(*offset) && "todo: indirects"); 322 unsigned imm_index = b->shader->varyings[nir_intrinsic_base(instr)]; 323 imm_index += nir_intrinsic_component(instr); 324 imm_index += nir_src_as_uint(*offset); 325 326 /* nir_lower_io_to_scalar */ 327 assert(nir_intrinsic_write_mask(instr) == 0x1); 328 329 return agx_st_vary(b, 330 agx_immediate(imm_index), 331 agx_src_index(&instr->src[0])); 332} 333 334static agx_instr * 335agx_emit_fragment_out(agx_builder *b, nir_intrinsic_instr *instr) 336{ 337 const nir_variable *var = 338 nir_find_variable_with_driver_location(b->shader->nir, 339 nir_var_shader_out, nir_intrinsic_base(instr)); 340 assert(var); 341 342 unsigned loc = var->data.location; 343 assert(var->data.index == 0 && "todo: dual-source blending"); 344 assert(loc == FRAG_RESULT_DATA0 && "todo: MRT"); 345 unsigned rt = (loc - FRAG_RESULT_DATA0); 346 347 /* TODO: Reverse-engineer interactions with MRT */ 348 if (b->shader->nir->info.internal) { 349 /* clear */ 350 } else if (b->shader->did_writeout) { 351 agx_writeout(b, 0x0004); 352 } else { 353 agx_writeout(b, 0xC200); 354 agx_writeout(b, 
0x000C); 355 } 356 357 if (b->shader->nir->info.fs.uses_discard) { 358 /* If the shader uses discard, the sample mask must be written by the 359 * shader on all exeuction paths. If we've reached the end of the shader, 360 * we are therefore still active and need to write a full sample mask. 361 * TODO: interactions with MSAA and gl_SampleMask writes 362 */ 363 agx_sample_mask(b, agx_immediate(1)); 364 } 365 366 b->shader->did_writeout = true; 367 return agx_st_tile(b, agx_src_index(&instr->src[0]), 368 b->shader->key->fs.tib_formats[rt]); 369} 370 371static void 372agx_emit_load_tile(agx_builder *b, agx_index *dests, nir_intrinsic_instr *instr) 373{ 374 const nir_variable *var = 375 nir_find_variable_with_driver_location(b->shader->nir, 376 nir_var_shader_out, nir_intrinsic_base(instr)); 377 assert(var); 378 379 unsigned loc = var->data.location; 380 assert(var->data.index == 0 && "todo: dual-source blending"); 381 assert(loc == FRAG_RESULT_DATA0 && "todo: MRT"); 382 unsigned rt = (loc - FRAG_RESULT_DATA0); 383 384 /* TODO: Reverse-engineer interactions with MRT */ 385 agx_writeout(b, 0xC200); 386 agx_writeout(b, 0x0008); 387 b->shader->did_writeout = true; 388 b->shader->out->reads_tib = true; 389 390 agx_index vec = agx_vec_for_dest(b->shader, &instr->dest); 391 agx_ld_tile_to(b, vec, b->shader->key->fs.tib_formats[rt]); 392 agx_emit_split(b, dests, vec, 4); 393} 394 395static enum agx_format 396agx_format_for_bits(unsigned bits) 397{ 398 switch (bits) { 399 case 8: return AGX_FORMAT_I8; 400 case 16: return AGX_FORMAT_I16; 401 case 32: return AGX_FORMAT_I32; 402 default: unreachable("Invalid bit size for load/store"); 403 } 404} 405 406static agx_instr * 407agx_emit_load_ubo(agx_builder *b, agx_index dst, nir_intrinsic_instr *instr) 408{ 409 bool kernel_input = (instr->intrinsic == nir_intrinsic_load_kernel_input); 410 nir_src *offset = nir_get_io_offset_src(instr); 411 412 if (!kernel_input && !nir_src_is_const(instr->src[0])) 413 unreachable("todo: indirect UBO 
access"); 414 415 /* UBO blocks are specified (kernel inputs are always 0) */ 416 uint32_t block = kernel_input ? 0 : nir_src_as_uint(instr->src[0]); 417 418 /* Each UBO has a 64-bit = 4 x 16-bit address */ 419 unsigned num_ubos = b->shader->nir->info.num_ubos; 420 unsigned base_length = (num_ubos * 4); 421 unsigned index = block * 4; /* 16 bit units */ 422 423 /* Lookup the base address (TODO: indirection) */ 424 agx_index base = agx_indexed_sysval(b->shader, 425 AGX_PUSH_UBO_BASES, AGX_SIZE_64, 426 index, base_length); 427 428 /* Load the data */ 429 assert(instr->num_components <= 4); 430 431 agx_device_load_to(b, dst, base, agx_src_index(offset), 432 agx_format_for_bits(nir_dest_bit_size(instr->dest)), 433 BITFIELD_MASK(instr->num_components), 0); 434 agx_wait(b, 0); 435 agx_emit_cached_split(b, dst, instr->num_components); 436 437 return NULL; 438} 439 440static void 441agx_emit_load_frag_coord(agx_builder *b, agx_index *dests, nir_intrinsic_instr *instr) 442{ 443 /* xy */ 444 for (unsigned i = 0; i < 2; ++i) { 445 dests[i] = agx_fadd(b, agx_convert(b, agx_immediate(AGX_CONVERT_U32_TO_F), 446 agx_get_sr(b, 32, AGX_SR_THREAD_POSITION_IN_GRID_X + i), 447 AGX_ROUND_RTE), agx_immediate_f(0.5f)); 448 } 449 450 dests[2] = agx_ld_vary(b, agx_immediate(1), 1, false); /* z */ 451 dests[3] = agx_ld_vary(b, agx_immediate(0), 1, false); /* w */ 452} 453 454static agx_instr * 455agx_blend_const(agx_builder *b, agx_index dst, unsigned comp) 456{ 457 agx_index val = agx_indexed_sysval(b->shader, 458 AGX_PUSH_BLEND_CONST, AGX_SIZE_32, comp * 2, 4 * 2); 459 460 return agx_mov_to(b, dst, val); 461} 462 463/* 464 * Demoting a helper invocation is logically equivalent to zeroing the sample 465 * mask. Metal implement discard as such. 466 * 467 * XXX: Actually, Metal's "discard" is a demote, and what is implemented here 468 * is a demote. There might be a better way to implement this to get correct 469 * helper invocation semantics. For now, I'm kicking the can down the road. 
470 */ 471static agx_instr * 472agx_emit_discard(agx_builder *b, nir_intrinsic_instr *instr) 473{ 474 agx_writeout(b, 0xC200); 475 agx_writeout(b, 0x0001); 476 b->shader->did_writeout = true; 477 478 b->shader->out->writes_sample_mask = true; 479 return agx_sample_mask(b, agx_immediate(0)); 480} 481 482static agx_instr * 483agx_emit_intrinsic(agx_builder *b, nir_intrinsic_instr *instr) 484{ 485 agx_index dst = nir_intrinsic_infos[instr->intrinsic].has_dest ? 486 agx_dest_index(&instr->dest) : agx_null(); 487 gl_shader_stage stage = b->shader->stage; 488 agx_index dests[4] = { agx_null() }; 489 490 switch (instr->intrinsic) { 491 case nir_intrinsic_load_barycentric_pixel: 492 case nir_intrinsic_load_barycentric_centroid: 493 case nir_intrinsic_load_barycentric_sample: 494 case nir_intrinsic_load_barycentric_at_sample: 495 case nir_intrinsic_load_barycentric_at_offset: 496 /* handled later via load_vary */ 497 return NULL; 498 case nir_intrinsic_load_interpolated_input: 499 assert(stage == MESA_SHADER_FRAGMENT); 500 agx_emit_load_vary(b, dests, instr); 501 break; 502 503 case nir_intrinsic_load_input: 504 if (stage == MESA_SHADER_FRAGMENT) 505 agx_emit_load_vary_flat(b, dests, instr); 506 else if (stage == MESA_SHADER_VERTEX) 507 agx_emit_load_attr(b, dests, instr); 508 else 509 unreachable("Unsupported shader stage"); 510 511 break; 512 513 case nir_intrinsic_store_output: 514 if (stage == MESA_SHADER_FRAGMENT) 515 return agx_emit_fragment_out(b, instr); 516 else if (stage == MESA_SHADER_VERTEX) 517 return agx_emit_store_vary(b, instr); 518 else 519 unreachable("Unsupported shader stage"); 520 521 case nir_intrinsic_load_output: 522 assert(stage == MESA_SHADER_FRAGMENT); 523 agx_emit_load_tile(b, dests, instr); 524 break; 525 526 case nir_intrinsic_load_ubo: 527 case nir_intrinsic_load_kernel_input: 528 return agx_emit_load_ubo(b, dst, instr); 529 530 case nir_intrinsic_load_frag_coord: 531 agx_emit_load_frag_coord(b, dests, instr); 532 break; 533 534 case 
nir_intrinsic_discard: 535 return agx_emit_discard(b, instr); 536 537 case nir_intrinsic_load_back_face_agx: 538 return agx_get_sr_to(b, dst, AGX_SR_BACKFACING); 539 540 case nir_intrinsic_load_vertex_id: 541 return agx_mov_to(b, dst, agx_abs(agx_register(10, AGX_SIZE_32))); 542 543 case nir_intrinsic_load_instance_id: 544 return agx_mov_to(b, dst, agx_abs(agx_register(12, AGX_SIZE_32))); 545 546 case nir_intrinsic_load_blend_const_color_r_float: return agx_blend_const(b, dst, 0); 547 case nir_intrinsic_load_blend_const_color_g_float: return agx_blend_const(b, dst, 1); 548 case nir_intrinsic_load_blend_const_color_b_float: return agx_blend_const(b, dst, 2); 549 case nir_intrinsic_load_blend_const_color_a_float: return agx_blend_const(b, dst, 3); 550 551 default: 552 fprintf(stderr, "Unhandled intrinsic %s\n", nir_intrinsic_infos[instr->intrinsic].name); 553 unreachable("Unhandled intrinsic"); 554 } 555 556 /* If we got here, there is a vector destination for the intrinsic composed 557 * of separate scalars. Its components are specified separately in the dests 558 * array. We need to combine them so the vector destination itself is valid. 559 * If only individual components are accessed, this combine will be dead code 560 * eliminated. 
561 */ 562 return agx_emit_combine_to(b, dst, dests[0], dests[1], dests[2], dests[3]); 563} 564 565static agx_index 566agx_alu_src_index(agx_builder *b, nir_alu_src src) 567{ 568 /* Check well-formedness of the input NIR */ 569 ASSERTED unsigned bitsize = nir_src_bit_size(src.src); 570 unsigned comps = nir_src_num_components(src.src); 571 unsigned channel = src.swizzle[0]; 572 573 assert(bitsize == 1 || bitsize == 16 || bitsize == 32 || bitsize == 64); 574 assert(!(src.negate || src.abs)); 575 assert(channel < comps); 576 577 agx_index idx = agx_src_index(&src.src); 578 579 /* We only deal with scalars, extract a single scalar if needed */ 580 if (comps > 1) 581 return agx_emit_extract(b, idx, channel); 582 else 583 return idx; 584} 585 586static agx_instr * 587agx_emit_alu_bool(agx_builder *b, nir_op op, 588 agx_index dst, agx_index s0, agx_index s1, agx_index s2) 589{ 590 /* Handle 1-bit bools as zero/nonzero rather than specifically 0/1 or 0/~0. 591 * This will give the optimizer flexibility. 
*/ 592 agx_index f = agx_immediate(0); 593 agx_index t = agx_immediate(0x1); 594 595 switch (op) { 596 case nir_op_feq: return agx_fcmpsel_to(b, dst, s0, s1, t, f, AGX_FCOND_EQ); 597 case nir_op_flt: return agx_fcmpsel_to(b, dst, s0, s1, t, f, AGX_FCOND_LT); 598 case nir_op_fge: return agx_fcmpsel_to(b, dst, s0, s1, t, f, AGX_FCOND_GE); 599 case nir_op_fneu: return agx_fcmpsel_to(b, dst, s0, s1, f, t, AGX_FCOND_EQ); 600 601 case nir_op_ieq: return agx_icmpsel_to(b, dst, s0, s1, t, f, AGX_ICOND_UEQ); 602 case nir_op_ine: return agx_icmpsel_to(b, dst, s0, s1, f, t, AGX_ICOND_UEQ); 603 case nir_op_ilt: return agx_icmpsel_to(b, dst, s0, s1, t, f, AGX_ICOND_SLT); 604 case nir_op_ige: return agx_icmpsel_to(b, dst, s0, s1, f, t, AGX_ICOND_SLT); 605 case nir_op_ult: return agx_icmpsel_to(b, dst, s0, s1, t, f, AGX_ICOND_ULT); 606 case nir_op_uge: return agx_icmpsel_to(b, dst, s0, s1, f, t, AGX_ICOND_ULT); 607 608 case nir_op_mov: return agx_mov_to(b, dst, s0); 609 case nir_op_iand: return agx_and_to(b, dst, s0, s1); 610 case nir_op_ior: return agx_or_to(b, dst, s0, s1); 611 case nir_op_ixor: return agx_xor_to(b, dst, s0, s1); 612 case nir_op_inot: return agx_xor_to(b, dst, s0, t); 613 614 case nir_op_f2b1: return agx_fcmpsel_to(b, dst, s0, f, f, t, AGX_FCOND_EQ); 615 case nir_op_i2b1: return agx_icmpsel_to(b, dst, s0, f, f, t, AGX_ICOND_UEQ); 616 case nir_op_b2b1: return agx_icmpsel_to(b, dst, s0, f, f, t, AGX_ICOND_UEQ); 617 618 case nir_op_bcsel: 619 return agx_icmpsel_to(b, dst, s0, f, s2, s1, AGX_ICOND_UEQ); 620 621 default: 622 fprintf(stderr, "Unhandled ALU op %s\n", nir_op_infos[op].name); 623 unreachable("Unhandled boolean ALU instruction"); 624 } 625} 626 627static agx_instr * 628agx_emit_alu(agx_builder *b, nir_alu_instr *instr) 629{ 630 unsigned srcs = nir_op_infos[instr->op].num_inputs; 631 unsigned sz = nir_dest_bit_size(instr->dest.dest); 632 unsigned src_sz = srcs ? 
nir_src_bit_size(instr->src[0].src) : 0; 633 ASSERTED unsigned comps = nir_dest_num_components(instr->dest.dest); 634 635 assert(comps == 1 || nir_op_is_vec(instr->op)); 636 assert(sz == 1 || sz == 16 || sz == 32 || sz == 64); 637 638 agx_index dst = agx_dest_index(&instr->dest.dest); 639 agx_index s0 = srcs > 0 ? agx_alu_src_index(b, instr->src[0]) : agx_null(); 640 agx_index s1 = srcs > 1 ? agx_alu_src_index(b, instr->src[1]) : agx_null(); 641 agx_index s2 = srcs > 2 ? agx_alu_src_index(b, instr->src[2]) : agx_null(); 642 agx_index s3 = srcs > 3 ? agx_alu_src_index(b, instr->src[3]) : agx_null(); 643 644 /* 1-bit bools are a bit special, only handle with select ops */ 645 if (sz == 1) 646 return agx_emit_alu_bool(b, instr->op, dst, s0, s1, s2); 647 648#define UNOP(nop, aop) \ 649 case nir_op_ ## nop: return agx_ ## aop ## _to(b, dst, s0); 650#define BINOP(nop, aop) \ 651 case nir_op_ ## nop: return agx_ ## aop ## _to(b, dst, s0, s1); 652#define TRIOP(nop, aop) \ 653 case nir_op_ ## nop: return agx_ ## aop ## _to(b, dst, s0, s1, s2); 654 655 switch (instr->op) { 656 BINOP(fadd, fadd); 657 BINOP(fmul, fmul); 658 TRIOP(ffma, fma); 659 660 UNOP(f2f16, fmov); 661 UNOP(f2f32, fmov); 662 UNOP(fround_even, roundeven); 663 UNOP(ftrunc, trunc); 664 UNOP(ffloor, floor); 665 UNOP(fceil, ceil); 666 UNOP(frcp, rcp); 667 UNOP(frsq, rsqrt); 668 UNOP(flog2, log2); 669 UNOP(fexp2, exp2); 670 671 UNOP(fddx, dfdx); 672 UNOP(fddx_coarse, dfdx); 673 UNOP(fddx_fine, dfdx); 674 675 UNOP(fddy, dfdy); 676 UNOP(fddy_coarse, dfdy); 677 UNOP(fddy_fine, dfdy); 678 679 UNOP(mov, mov); 680 UNOP(u2u16, mov); 681 UNOP(u2u32, mov); 682 UNOP(inot, not); 683 BINOP(iand, and); 684 BINOP(ior, or); 685 BINOP(ixor, xor); 686 687 case nir_op_fsqrt: return agx_fmul_to(b, dst, s0, agx_srsqrt(b, s0)); 688 case nir_op_fsub: return agx_fadd_to(b, dst, s0, agx_neg(s1)); 689 case nir_op_fabs: return agx_fmov_to(b, dst, agx_abs(s0)); 690 case nir_op_fneg: return agx_fmov_to(b, dst, agx_neg(s0)); 691 692 case 
nir_op_fmin: return agx_fcmpsel_to(b, dst, s0, s1, s0, s1, AGX_FCOND_LTN); 693 case nir_op_fmax: return agx_fcmpsel_to(b, dst, s0, s1, s0, s1, AGX_FCOND_GTN); 694 case nir_op_imin: return agx_icmpsel_to(b, dst, s0, s1, s0, s1, AGX_ICOND_SLT); 695 case nir_op_imax: return agx_icmpsel_to(b, dst, s0, s1, s0, s1, AGX_ICOND_SGT); 696 case nir_op_umin: return agx_icmpsel_to(b, dst, s0, s1, s0, s1, AGX_ICOND_ULT); 697 case nir_op_umax: return agx_icmpsel_to(b, dst, s0, s1, s0, s1, AGX_ICOND_UGT); 698 699 case nir_op_iadd: return agx_iadd_to(b, dst, s0, s1, 0); 700 case nir_op_isub: return agx_iadd_to(b, dst, s0, agx_neg(s1), 0); 701 case nir_op_ineg: return agx_iadd_to(b, dst, agx_zero(), agx_neg(s0), 0); 702 case nir_op_imul: return agx_imad_to(b, dst, s0, s1, agx_zero(), 0); 703 704 case nir_op_ishl: return agx_bfi_to(b, dst, agx_zero(), s0, s1, 0); 705 case nir_op_ushr: return agx_ushr_to(b, dst, s0, s1); 706 case nir_op_ishr: return agx_asr_to(b, dst, s0, s1); 707 708 case nir_op_bcsel: 709 return agx_icmpsel_to(b, dst, s0, agx_zero(), s2, s1, AGX_ICOND_UEQ); 710 711 case nir_op_b2i32: 712 case nir_op_b2i16: 713 return agx_icmpsel_to(b, dst, s0, agx_zero(), agx_zero(), agx_immediate(1), AGX_ICOND_UEQ); 714 715 case nir_op_b2f16: 716 case nir_op_b2f32: 717 { 718 /* At this point, boolean is just zero/nonzero, so compare with zero */ 719 agx_index one = (sz == 16) ? 
720 agx_mov_imm(b, 16, _mesa_float_to_half(1.0)) : 721 agx_mov_imm(b, 32, fui(1.0)); 722 723 agx_index zero = agx_zero(); 724 725 return agx_fcmpsel_to(b, dst, s0, zero, zero, one, AGX_FCOND_EQ); 726 } 727 728 case nir_op_i2i32: 729 { 730 if (s0.size != AGX_SIZE_16) 731 unreachable("todo: more conversions"); 732 733 return agx_iadd_to(b, dst, s0, agx_zero(), 0); 734 } 735 736 case nir_op_i2i16: 737 { 738 if (s0.size != AGX_SIZE_32) 739 unreachable("todo: more conversions"); 740 741 return agx_iadd_to(b, dst, s0, agx_zero(), 0); 742 } 743 744 case nir_op_iadd_sat: 745 { 746 agx_instr *I = agx_iadd_to(b, dst, s0, s1, 0); 747 I->saturate = true; 748 return I; 749 } 750 751 case nir_op_isub_sat: 752 { 753 agx_instr *I = agx_iadd_to(b, dst, s0, agx_neg(s1), 0); 754 I->saturate = true; 755 return I; 756 } 757 758 case nir_op_uadd_sat: 759 { 760 agx_instr *I = agx_iadd_to(b, dst, agx_abs(s0), agx_abs(s1), 0); 761 I->saturate = true; 762 return I; 763 } 764 765 case nir_op_usub_sat: 766 { 767 agx_instr *I = agx_iadd_to(b, dst, agx_abs(s0), agx_neg(agx_abs(s1)), 0); 768 I->saturate = true; 769 return I; 770 } 771 772 case nir_op_fsat: 773 { 774 agx_instr *I = agx_fadd_to(b, dst, s0, agx_negzero()); 775 I->saturate = true; 776 return I; 777 } 778 779 case nir_op_fsin_agx: 780 { 781 agx_index fixup = agx_sin_pt_1(b, s0); 782 agx_index sinc = agx_sin_pt_2(b, fixup); 783 return agx_fmul_to(b, dst, sinc, fixup); 784 } 785 786 case nir_op_f2i16: 787 return agx_convert_to(b, dst, 788 agx_immediate(AGX_CONVERT_F_TO_S16), s0, AGX_ROUND_RTZ); 789 790 case nir_op_f2i32: 791 return agx_convert_to(b, dst, 792 agx_immediate(AGX_CONVERT_F_TO_S32), s0, AGX_ROUND_RTZ); 793 794 case nir_op_f2u16: 795 return agx_convert_to(b, dst, 796 agx_immediate(AGX_CONVERT_F_TO_U16), s0, AGX_ROUND_RTZ); 797 798 case nir_op_f2u32: 799 return agx_convert_to(b, dst, 800 agx_immediate(AGX_CONVERT_F_TO_U32), s0, AGX_ROUND_RTZ); 801 802 case nir_op_u2f16: 803 case nir_op_u2f32: 804 { 805 if (src_sz == 64) 806 
unreachable("64-bit conversions unimplemented"); 807 808 enum agx_convert mode = 809 (src_sz == 32) ? AGX_CONVERT_U32_TO_F : 810 (src_sz == 16) ? AGX_CONVERT_U16_TO_F : 811 AGX_CONVERT_U8_TO_F; 812 813 return agx_convert_to(b, dst, agx_immediate(mode), s0, AGX_ROUND_RTE); 814 } 815 816 case nir_op_i2f16: 817 case nir_op_i2f32: 818 { 819 if (src_sz == 64) 820 unreachable("64-bit conversions unimplemented"); 821 822 enum agx_convert mode = 823 (src_sz == 32) ? AGX_CONVERT_S32_TO_F : 824 (src_sz == 16) ? AGX_CONVERT_S16_TO_F : 825 AGX_CONVERT_S8_TO_F; 826 827 return agx_convert_to(b, dst, agx_immediate(mode), s0, AGX_ROUND_RTE); 828 } 829 830 case nir_op_vec2: 831 case nir_op_vec3: 832 case nir_op_vec4: 833 return agx_emit_combine_to(b, dst, s0, s1, s2, s3); 834 835 case nir_op_vec8: 836 case nir_op_vec16: 837 unreachable("should've been lowered"); 838 839 default: 840 fprintf(stderr, "Unhandled ALU op %s\n", nir_op_infos[instr->op].name); 841 unreachable("Unhandled ALU instruction"); 842 } 843} 844 845static enum agx_dim 846agx_tex_dim(enum glsl_sampler_dim dim, bool array) 847{ 848 switch (dim) { 849 case GLSL_SAMPLER_DIM_1D: 850 case GLSL_SAMPLER_DIM_BUF: 851 return array ? AGX_DIM_TEX_1D_ARRAY : AGX_DIM_TEX_1D; 852 853 case GLSL_SAMPLER_DIM_2D: 854 case GLSL_SAMPLER_DIM_RECT: 855 case GLSL_SAMPLER_DIM_EXTERNAL: 856 return array ? AGX_DIM_TEX_2D_ARRAY : AGX_DIM_TEX_2D; 857 858 case GLSL_SAMPLER_DIM_MS: 859 assert(!array && "multisampled arrays unsupported"); 860 return AGX_DIM_TEX_2D_MS; 861 862 case GLSL_SAMPLER_DIM_3D: 863 assert(!array && "3D arrays unsupported"); 864 return AGX_DIM_TEX_3D; 865 866 case GLSL_SAMPLER_DIM_CUBE: 867 return array ? 
AGX_DIM_TEX_CUBE_ARRAY : AGX_DIM_TEX_CUBE; 868 869 default: 870 unreachable("Invalid sampler dim\n"); 871 } 872} 873 874static enum agx_lod_mode 875agx_lod_mode_for_nir(nir_texop op) 876{ 877 switch (op) { 878 case nir_texop_tex: return AGX_LOD_MODE_AUTO_LOD; 879 case nir_texop_txb: return AGX_LOD_MODE_AUTO_LOD_BIAS; 880 case nir_texop_txl: return AGX_LOD_MODE_LOD_MIN; 881 default: unreachable("Unhandled texture op"); 882 } 883} 884 885static void 886agx_emit_tex(agx_builder *b, nir_tex_instr *instr) 887{ 888 switch (instr->op) { 889 case nir_texop_tex: 890 case nir_texop_txl: 891 case nir_texop_txb: 892 break; 893 default: 894 unreachable("Unhandled texture op"); 895 } 896 897 agx_index coords = agx_null(), 898 texture = agx_immediate(instr->texture_index), 899 sampler = agx_immediate(instr->sampler_index), 900 lod = agx_immediate(0), 901 offset = agx_null(); 902 903 for (unsigned i = 0; i < instr->num_srcs; ++i) { 904 agx_index index = agx_src_index(&instr->src[i].src); 905 906 switch (instr->src[i].src_type) { 907 case nir_tex_src_coord: 908 coords = index; 909 910 /* Array textures are indexed by a floating-point in NIR, but by an 911 * integer in AGX. Convert the array index from float-to-int for array 912 * textures. The array index is the last source in NIR. 
The conversion 913 * is according to the rule from 8.9 ("Texture Functions") of the GLSL 914 * ES 3.20 specification: 915 * 916 * max(0, min(d - 1, floor(layer + 0.5))) = 917 * max(0, min(d - 1, f32_to_u32(layer + 0.5))) = 918 * min(d - 1, f32_to_u32(layer + 0.5)) 919 */ 920 if (instr->is_array) { 921 unsigned nr = nir_src_num_components(instr->src[i].src); 922 agx_index channels[4] = {}; 923 924 for (unsigned i = 0; i < nr; ++i) 925 channels[i] = agx_emit_extract(b, index, i); 926 927 agx_index layer = agx_fadd(b, channels[nr - 1], 928 agx_immediate_f(0.5f)); 929 930 agx_index d1 = agx_indexed_sysval(b->shader, 931 AGX_PUSH_ARRAY_SIZE_MINUS_1, AGX_SIZE_16, 932 instr->texture_index, 1); 933 934 layer = agx_convert(b, agx_immediate(AGX_CONVERT_F_TO_U32), layer, 935 AGX_ROUND_RTZ); 936 937 agx_index layer16 = agx_temp(b->shader, AGX_SIZE_16); 938 agx_mov_to(b, layer16, layer); 939 940 layer = agx_icmpsel(b, layer16, d1, layer16, d1, AGX_ICOND_ULT); 941 942 agx_index layer32 = agx_temp(b->shader, AGX_SIZE_32); 943 agx_mov_to(b, layer32, layer); 944 945 channels[nr - 1] = layer32; 946 coords = agx_p_combine(b, channels[0], channels[1], channels[2], channels[3]); 947 } else { 948 coords = index; 949 } 950 951 break; 952 953 case nir_tex_src_lod: 954 case nir_tex_src_bias: 955 lod = index; 956 break; 957 958 case nir_tex_src_ms_index: 959 case nir_tex_src_offset: 960 case nir_tex_src_comparator: 961 case nir_tex_src_texture_offset: 962 case nir_tex_src_sampler_offset: 963 default: 964 unreachable("todo"); 965 } 966 } 967 968 agx_index dst = agx_dest_index(&instr->dest); 969 agx_texture_sample_to(b, dst, coords, lod, texture, sampler, offset, 970 agx_tex_dim(instr->sampler_dim, instr->is_array), 971 agx_lod_mode_for_nir(instr->op), 972 0xF, /* TODO: wrmask */ 973 0); 974 975 agx_wait(b, 0); 976 agx_emit_cached_split(b, dst, 4); 977} 978 979/* 980 * Mark the logical end of the current block by emitting a p_logical_end marker. 
981 * Note if an unconditional jump is emitted (for instance, to break out of a 982 * loop from inside an if), the block has already reached its logical end so we 983 * don't re-emit p_logical_end. The validator checks this, and correct register 984 * allocation depends on it. 985 */ 986static void 987agx_emit_logical_end(agx_builder *b) 988{ 989 if (!b->shader->current_block->unconditional_jumps) 990 agx_p_logical_end(b); 991} 992 993/* NIR loops are treated as a pair of AGX loops: 994 * 995 * do { 996 * do { 997 * ... 998 * } while (0); 999 * } while (cond); 1000 * 1001 * By manipulating the nesting counter (r0l), we may break out of nested loops, 1002 * so under the model, both break and continue may be implemented as breaks, 1003 * where break breaks out of the outer loop (2 layers) and continue breaks out 1004 * of the inner loop (1 layer). 1005 * 1006 * After manipulating the nesting counter directly, pop_exec #0 must be used to 1007 * flush the update to the execution mask. 1008 */ 1009 1010static void 1011agx_emit_jump(agx_builder *b, nir_jump_instr *instr) 1012{ 1013 agx_context *ctx = b->shader; 1014 assert (instr->type == nir_jump_break || instr->type == nir_jump_continue); 1015 1016 /* Break out of either one or two loops */ 1017 unsigned nestings = b->shader->loop_nesting; 1018 1019 if (instr->type == nir_jump_continue) { 1020 nestings += 1; 1021 agx_block_add_successor(ctx->current_block, ctx->continue_block); 1022 } else if (instr->type == nir_jump_break) { 1023 nestings += 2; 1024 agx_block_add_successor(ctx->current_block, ctx->break_block); 1025 } 1026 1027 /* Update the counter and flush */ 1028 agx_index r0l = agx_register(0, false); 1029 agx_mov_to(b, r0l, agx_immediate(nestings)); 1030 1031 /* Jumps must come at the end of a block */ 1032 agx_emit_logical_end(b); 1033 agx_pop_exec(b, 0); 1034 1035 ctx->current_block->unconditional_jumps = true; 1036} 1037 1038static void 1039agx_emit_phi(agx_builder *b, nir_phi_instr *instr) 1040{ 1041 
agx_instr *I = agx_phi_to(b, agx_dest_index(&instr->dest)); 1042 1043 /* Deferred */ 1044 I->phi = instr; 1045} 1046 1047/* Look up the AGX block corresponding to a given NIR block. Used when 1048 * translating phi nodes after emitting all blocks. 1049 */ 1050static agx_block * 1051agx_from_nir_block(agx_context *ctx, nir_block *block) 1052{ 1053 return ctx->indexed_nir_blocks[block->index]; 1054} 1055 1056static void 1057agx_emit_phi_deferred(agx_context *ctx, agx_block *block, agx_instr *I) 1058{ 1059 nir_phi_instr *phi = I->phi; 1060 1061 /* Guaranteed by lower_phis_to_scalar */ 1062 assert(phi->dest.ssa.num_components == 1); 1063 1064 I->nr_srcs = exec_list_length(&phi->srcs); 1065 I->src = rzalloc_array(I, agx_index, I->nr_srcs); 1066 1067 nir_foreach_phi_src(src, phi) { 1068 agx_block *pred = agx_from_nir_block(ctx, src->pred); 1069 unsigned i = agx_predecessor_index(block, pred); 1070 assert(i < I->nr_srcs); 1071 1072 I->src[i] = agx_src_index(&src->src); 1073 } 1074} 1075 1076static void 1077agx_emit_phis_deferred(agx_context *ctx) 1078{ 1079 agx_foreach_block(ctx, block) { 1080 agx_foreach_instr_in_block(block, I) { 1081 if (I->op == AGX_OPCODE_PHI) 1082 agx_emit_phi_deferred(ctx, block, I); 1083 } 1084 } 1085} 1086 1087static void 1088agx_emit_instr(agx_builder *b, struct nir_instr *instr) 1089{ 1090 switch (instr->type) { 1091 case nir_instr_type_load_const: 1092 agx_emit_load_const(b, nir_instr_as_load_const(instr)); 1093 break; 1094 1095 case nir_instr_type_intrinsic: 1096 agx_emit_intrinsic(b, nir_instr_as_intrinsic(instr)); 1097 break; 1098 1099 case nir_instr_type_alu: 1100 agx_emit_alu(b, nir_instr_as_alu(instr)); 1101 break; 1102 1103 case nir_instr_type_tex: 1104 agx_emit_tex(b, nir_instr_as_tex(instr)); 1105 break; 1106 1107 case nir_instr_type_jump: 1108 agx_emit_jump(b, nir_instr_as_jump(instr)); 1109 break; 1110 1111 case nir_instr_type_phi: 1112 agx_emit_phi(b, nir_instr_as_phi(instr)); 1113 break; 1114 1115 default: 1116 
unreachable("should've been lowered"); 1117 } 1118} 1119 1120static agx_block * 1121agx_create_block(agx_context *ctx) 1122{ 1123 agx_block *blk = rzalloc(ctx, agx_block); 1124 1125 util_dynarray_init(&blk->predecessors, blk); 1126 1127 return blk; 1128} 1129 1130static agx_block * 1131emit_block(agx_context *ctx, nir_block *block) 1132{ 1133 if (ctx->after_block) { 1134 ctx->current_block = ctx->after_block; 1135 ctx->after_block = NULL; 1136 } else { 1137 ctx->current_block = agx_create_block(ctx); 1138 } 1139 1140 agx_block *blk = ctx->current_block; 1141 list_addtail(&blk->link, &ctx->blocks); 1142 list_inithead(&blk->instructions); 1143 1144 ctx->indexed_nir_blocks[block->index] = blk; 1145 1146 agx_builder _b = agx_init_builder(ctx, agx_after_block(blk)); 1147 1148 nir_foreach_instr(instr, block) { 1149 agx_emit_instr(&_b, instr); 1150 } 1151 1152 return blk; 1153} 1154 1155static agx_block * 1156emit_cf_list(agx_context *ctx, struct exec_list *list); 1157 1158/* Emit if-else as 1159 * 1160 * if_icmp cond != 0 1161 * ... 1162 * else_icmp cond == 0 1163 * ... 1164 * pop_exec 1165 * 1166 * If the else is empty, we can omit the else_icmp. This happens elsewhere, as 1167 * an empty else block can become nonempty after RA due to phi lowering. This is 1168 * not usually optimal, but it's a start. 1169 */ 1170 1171static void 1172emit_if(agx_context *ctx, nir_if *nif) 1173{ 1174 agx_block *first_block = ctx->current_block; 1175 agx_builder _b = agx_init_builder(ctx, agx_after_block(first_block)); 1176 agx_index cond = agx_src_index(&nif->condition); 1177 1178 agx_emit_logical_end(&_b); 1179 agx_if_icmp(&_b, cond, agx_zero(), 1, AGX_ICOND_UEQ, true); 1180 ctx->loop_nesting++; 1181 1182 /* Emit the two subblocks. 
*/ 1183 agx_block *if_block = emit_cf_list(ctx, &nif->then_list); 1184 agx_block *end_then = ctx->current_block; 1185 1186 _b.cursor = agx_after_block(ctx->current_block); 1187 agx_emit_logical_end(&_b); 1188 agx_else_icmp(&_b, cond, agx_zero(), 1, AGX_ICOND_UEQ, false); 1189 1190 agx_block *else_block = emit_cf_list(ctx, &nif->else_list); 1191 agx_block *end_else = ctx->current_block; 1192 1193 ctx->after_block = agx_create_block(ctx); 1194 1195 agx_block_add_successor(first_block, if_block); 1196 agx_block_add_successor(first_block, else_block); 1197 agx_block_add_successor(end_then, ctx->after_block); 1198 agx_block_add_successor(end_else, ctx->after_block); 1199 1200 _b.cursor = agx_after_block(ctx->current_block); 1201 agx_emit_logical_end(&_b); 1202 agx_pop_exec(&_b, 1); 1203 ctx->loop_nesting--; 1204} 1205 1206static void 1207emit_loop(agx_context *ctx, nir_loop *nloop) 1208{ 1209 /* We only track nesting within the innermost loop, so push and reset */ 1210 unsigned pushed_nesting = ctx->loop_nesting; 1211 ctx->loop_nesting = 0; 1212 1213 agx_block *popped_break = ctx->break_block; 1214 agx_block *popped_continue = ctx->continue_block; 1215 1216 ctx->break_block = agx_create_block(ctx); 1217 ctx->continue_block = agx_create_block(ctx); 1218 1219 /* Make room for break/continue nesting (TODO: skip if no divergent CF) */ 1220 agx_builder _b = agx_init_builder(ctx, agx_after_block(ctx->current_block)); 1221 agx_emit_logical_end(&_b); 1222 agx_push_exec(&_b, 2); 1223 1224 /* Fallthrough to body */ 1225 agx_block_add_successor(ctx->current_block, ctx->continue_block); 1226 1227 /* Emit the body */ 1228 ctx->after_block = ctx->continue_block; 1229 agx_block *start_block = emit_cf_list(ctx, &nloop->body); 1230 1231 /* Fix up the nesting counter via an always true while_icmp, and branch back 1232 * to start of loop if any lanes are active */ 1233 _b.cursor = agx_after_block(ctx->current_block); 1234 agx_emit_logical_end(&_b); 1235 agx_while_icmp(&_b, agx_zero(), 
agx_zero(), 2, AGX_ICOND_UEQ, false); 1236 agx_jmp_exec_any(&_b, start_block); 1237 agx_pop_exec(&_b, 2); 1238 agx_block_add_successor(ctx->current_block, ctx->continue_block); 1239 1240 /* Pop off */ 1241 ctx->after_block = ctx->break_block; 1242 ctx->break_block = popped_break; 1243 ctx->continue_block = popped_continue; 1244 1245 /* Update shader-db stats */ 1246 ++ctx->loop_count; 1247 1248 /* All nested control flow must have finished */ 1249 assert(ctx->loop_nesting == 0); 1250 1251 /* Restore loop nesting (we might be inside an if inside an outer loop) */ 1252 ctx->loop_nesting = pushed_nesting; 1253} 1254 1255/* Before the first control flow structure, the nesting counter (r0l) needs to 1256 * be zeroed for correct operation. This only happens at most once, since by 1257 * definition this occurs at the end of the first block, which dominates the 1258 * rest of the program. */ 1259 1260static void 1261emit_first_cf(agx_context *ctx) 1262{ 1263 if (ctx->any_cf) 1264 return; 1265 1266 agx_builder _b = agx_init_builder(ctx, agx_after_block(ctx->current_block)); 1267 agx_index r0l = agx_register(0, false); 1268 1269 agx_mov_to(&_b, r0l, agx_immediate(0)); 1270 ctx->any_cf = true; 1271} 1272 1273static agx_block * 1274emit_cf_list(agx_context *ctx, struct exec_list *list) 1275{ 1276 agx_block *start_block = NULL; 1277 1278 foreach_list_typed(nir_cf_node, node, node, list) { 1279 switch (node->type) { 1280 case nir_cf_node_block: { 1281 agx_block *block = emit_block(ctx, nir_cf_node_as_block(node)); 1282 1283 if (!start_block) 1284 start_block = block; 1285 1286 break; 1287 } 1288 1289 case nir_cf_node_if: 1290 emit_first_cf(ctx); 1291 emit_if(ctx, nir_cf_node_as_if(node)); 1292 break; 1293 1294 case nir_cf_node_loop: 1295 emit_first_cf(ctx); 1296 emit_loop(ctx, nir_cf_node_as_loop(node)); 1297 break; 1298 1299 default: 1300 unreachable("Unknown control flow"); 1301 } 1302 } 1303 1304 return start_block; 1305} 1306 1307static void 
1308agx_set_st_vary_final(agx_context *ctx) 1309{ 1310 agx_foreach_instr_global_rev(ctx, I) { 1311 if (I->op == AGX_OPCODE_ST_VARY) { 1312 I->last = true; 1313 return; 1314 } 1315 } 1316} 1317 1318static void 1319agx_print_stats(agx_context *ctx, unsigned size, FILE *fp) 1320{ 1321 unsigned nr_ins = 0, max_reg = 0; 1322 1323 agx_foreach_instr_global(ctx, I) { 1324 /* Count instructions */ 1325 nr_ins++; 1326 1327 /* Count registers */ 1328 agx_foreach_dest(I, d) { 1329 if (I->dest[d].type == AGX_INDEX_REGISTER) { 1330 max_reg = MAX2(max_reg, 1331 I->dest[d].value + agx_write_registers(I, d) - 1); 1332 } 1333 } 1334 } 1335 1336 /* TODO: Pipe through occupancy */ 1337 unsigned nr_threads = 1; 1338 1339 fprintf(stderr, "%s - %s shader: %u inst, %u bytes, %u halfregs, %u threads, " 1340 "%u loops, %u:%u spills:fills\n", 1341 ctx->nir->info.label ?: "", 1342 gl_shader_stage_name(ctx->stage), 1343 nr_ins, size, max_reg, nr_threads, ctx->loop_count, 1344 ctx->spills, ctx->fills); 1345} 1346 1347static int 1348glsl_type_size(const struct glsl_type *type, bool bindless) 1349{ 1350 return glsl_count_attribute_slots(type, false); 1351} 1352 1353static bool 1354agx_lower_sincos_filter(const nir_instr *instr, UNUSED const void *_) 1355{ 1356 if (instr->type != nir_instr_type_alu) 1357 return false; 1358 1359 nir_alu_instr *alu = nir_instr_as_alu(instr); 1360 return alu->op == nir_op_fsin || alu->op == nir_op_fcos; 1361} 1362 1363/* Sine and cosine are implemented via the sin_pt_1 and sin_pt_2 opcodes for 1364 * heavy lifting. sin_pt_2 implements sinc in the first quadrant, expressed in 1365 * turns (sin (tau x) / x), while sin_pt_1 implements a piecewise sign/offset 1366 * fixup to transform a quadrant angle [0, 4] to [-1, 1]. The NIR opcode 1367 * fsin_agx models the fixup, sinc, and multiply to obtain sine, so we just 1368 * need to change units from radians to quadrants modulo turns. Cosine is 1369 * implemented by shifting by one quadrant: cos(x) = sin(x + tau/4). 
1370 */ 1371 1372static nir_ssa_def * 1373agx_lower_sincos_impl(struct nir_builder *b, nir_instr *instr, UNUSED void *_) 1374{ 1375 nir_alu_instr *alu = nir_instr_as_alu(instr); 1376 nir_ssa_def *x = nir_mov_alu(b, alu->src[0], 1); 1377 nir_ssa_def *turns = nir_fmul_imm(b, x, M_1_PI * 0.5f); 1378 1379 if (alu->op == nir_op_fcos) 1380 turns = nir_fadd_imm(b, turns, 0.25f); 1381 1382 nir_ssa_def *quadrants = nir_fmul_imm(b, nir_ffract(b, turns), 4.0); 1383 return nir_fsin_agx(b, quadrants); 1384} 1385 1386static bool 1387agx_lower_sincos(nir_shader *shader) 1388{ 1389 return nir_shader_lower_instructions(shader, 1390 agx_lower_sincos_filter, agx_lower_sincos_impl, NULL); 1391} 1392 1393static bool 1394agx_lower_front_face(struct nir_builder *b, 1395 nir_instr *instr, UNUSED void *data) 1396{ 1397 if (instr->type != nir_instr_type_intrinsic) 1398 return false; 1399 1400 nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr); 1401 if (intr->intrinsic != nir_intrinsic_load_front_face) 1402 return false; 1403 1404 assert(intr->dest.is_ssa); 1405 nir_ssa_def *def = &intr->dest.ssa; 1406 assert(def->bit_size == 1); 1407 1408 b->cursor = nir_before_instr(&intr->instr); 1409 nir_ssa_def_rewrite_uses(def, nir_inot(b, nir_load_back_face_agx(b, 1))); 1410 return true; 1411} 1412 1413static bool 1414agx_lower_aligned_offsets(struct nir_builder *b, 1415 nir_instr *instr, UNUSED void *data) 1416{ 1417 if (instr->type != nir_instr_type_intrinsic) 1418 return false; 1419 1420 nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr); 1421 if (intr->intrinsic != nir_intrinsic_load_ubo) 1422 return false; 1423 1424 b->cursor = nir_before_instr(&intr->instr); 1425 1426 unsigned bytes = nir_dest_bit_size(intr->dest) / 8; 1427 assert(util_is_power_of_two_or_zero(bytes) && bytes != 0); 1428 1429 nir_src *offset = &intr->src[1]; 1430 1431 unsigned shift = util_logbase2(bytes); 1432 1433 nir_ssa_def *old = nir_ssa_for_src(b, *offset, 1); 1434 nir_ssa_def *new = nir_ishr_imm(b, old, 
shift); 1435 1436 nir_instr_rewrite_src_ssa(instr, offset, new); 1437 return true; 1438} 1439 1440static void 1441agx_optimize_nir(nir_shader *nir) 1442{ 1443 bool progress; 1444 1445 nir_lower_idiv_options idiv_options = { 1446 .imprecise_32bit_lowering = true, 1447 .allow_fp16 = true, 1448 }; 1449 1450 NIR_PASS_V(nir, nir_lower_regs_to_ssa); 1451 NIR_PASS_V(nir, nir_lower_int64); 1452 NIR_PASS_V(nir, nir_lower_idiv, &idiv_options); 1453 NIR_PASS_V(nir, nir_lower_alu_to_scalar, NULL, NULL); 1454 NIR_PASS_V(nir, nir_lower_load_const_to_scalar); 1455 NIR_PASS_V(nir, nir_lower_flrp, 16 | 32 | 64, false); 1456 NIR_PASS_V(nir, agx_lower_sincos); 1457 NIR_PASS_V(nir, nir_shader_instructions_pass, 1458 agx_lower_front_face, 1459 nir_metadata_block_index | nir_metadata_dominance, NULL); 1460 1461 do { 1462 progress = false; 1463 1464 NIR_PASS(progress, nir, nir_lower_var_copies); 1465 NIR_PASS(progress, nir, nir_lower_vars_to_ssa); 1466 1467 NIR_PASS(progress, nir, nir_copy_prop); 1468 NIR_PASS(progress, nir, nir_opt_remove_phis); 1469 NIR_PASS(progress, nir, nir_lower_phis_to_scalar, true); 1470 NIR_PASS(progress, nir, nir_opt_dce); 1471 NIR_PASS(progress, nir, nir_opt_dead_cf); 1472 NIR_PASS(progress, nir, nir_opt_cse); 1473 NIR_PASS(progress, nir, nir_opt_peephole_select, 64, false, true); 1474 NIR_PASS(progress, nir, nir_opt_algebraic); 1475 NIR_PASS(progress, nir, nir_opt_constant_folding); 1476 1477 NIR_PASS(progress, nir, nir_opt_undef); 1478 NIR_PASS(progress, nir, nir_lower_undef_to_zero); 1479 1480 NIR_PASS(progress, nir, nir_opt_loop_unroll); 1481 } while (progress); 1482 1483 NIR_PASS_V(nir, nir_opt_algebraic_late); 1484 NIR_PASS_V(nir, nir_opt_constant_folding); 1485 NIR_PASS_V(nir, nir_copy_prop); 1486 NIR_PASS_V(nir, nir_opt_dce); 1487 NIR_PASS_V(nir, nir_opt_cse); 1488 NIR_PASS_V(nir, nir_lower_alu_to_scalar, NULL, NULL); 1489 NIR_PASS_V(nir, nir_lower_load_const_to_scalar); 1490 1491 /* Cleanup optimizations */ 1492 nir_move_options move_all = 1493 
nir_move_const_undef | nir_move_load_ubo | nir_move_load_input | 1494 nir_move_comparisons | nir_move_copies | nir_move_load_ssbo; 1495 1496 NIR_PASS_V(nir, nir_opt_sink, move_all); 1497 NIR_PASS_V(nir, nir_opt_move, move_all); 1498 NIR_PASS_V(nir, nir_lower_phis_to_scalar, true); 1499} 1500 1501/* ABI: position first, then user, then psiz */ 1502static void 1503agx_remap_varyings_vs(nir_shader *nir, struct agx_varyings *varyings, 1504 unsigned *remap) 1505{ 1506 unsigned base = 0; 1507 1508 nir_variable *pos = nir_find_variable_with_location(nir, nir_var_shader_out, VARYING_SLOT_POS); 1509 if (pos) { 1510 assert(pos->data.driver_location < AGX_MAX_VARYINGS); 1511 remap[pos->data.driver_location] = base; 1512 base += 4; 1513 } 1514 1515 nir_foreach_shader_out_variable(var, nir) { 1516 unsigned loc = var->data.location; 1517 1518 if(loc == VARYING_SLOT_POS || loc == VARYING_SLOT_PSIZ) { 1519 continue; 1520 } 1521 1522 assert(var->data.driver_location < AGX_MAX_VARYINGS); 1523 remap[var->data.driver_location] = base; 1524 base += 4; 1525 } 1526 1527 nir_variable *psiz = nir_find_variable_with_location(nir, nir_var_shader_out, VARYING_SLOT_PSIZ); 1528 if (psiz) { 1529 assert(psiz->data.driver_location < AGX_MAX_VARYINGS); 1530 remap[psiz->data.driver_location] = base; 1531 base += 1; 1532 } 1533 1534 varyings->nr_slots = base; 1535} 1536 1537static void 1538agx_remap_varyings_fs(nir_shader *nir, struct agx_varyings *varyings, 1539 unsigned *remap) 1540{ 1541 struct agx_varying_packed *packed = varyings->packed; 1542 unsigned base = 0; 1543 1544 agx_pack(packed, VARYING, cfg) { 1545 cfg.type = AGX_VARYING_TYPE_FRAGCOORD_W; 1546 cfg.components = 1; 1547 cfg.triangle_slot = cfg.point_slot = base; 1548 } 1549 1550 base++; 1551 packed++; 1552 1553 agx_pack(packed, VARYING, cfg) { 1554 cfg.type = AGX_VARYING_TYPE_FRAGCOORD_Z; 1555 cfg.components = 1; 1556 cfg.triangle_slot = cfg.point_slot = base; 1557 } 1558 1559 base++; 1560 packed++; 1561 1562 unsigned comps[MAX_VARYING] 
= { 0 }; 1563 1564 nir_foreach_shader_in_variable(var, nir) { 1565 unsigned loc = var->data.driver_location; 1566 const struct glsl_type *column = 1567 glsl_without_array_or_matrix(var->type); 1568 unsigned chan = glsl_get_components(column); 1569 1570 /* If we have a fractional location added, we need to increase the size 1571 * so it will fit, i.e. a vec3 in YZW requires us to allocate a vec4. 1572 * We could do better but this is an edge case as it is, normally 1573 * packed varyings will be aligned. 1574 */ 1575 chan += var->data.location_frac; 1576 comps[loc] = MAX2(comps[loc], chan); 1577 } 1578 1579 nir_foreach_shader_in_variable(var, nir) { 1580 unsigned loc = var->data.driver_location; 1581 unsigned sz = glsl_count_attribute_slots(var->type, FALSE); 1582 unsigned channels = comps[loc]; 1583 1584 assert(var->data.driver_location <= AGX_MAX_VARYINGS); 1585 remap[var->data.driver_location] = base; 1586 1587 for (int c = 0; c < sz; ++c) { 1588 agx_pack(packed, VARYING, cfg) { 1589 cfg.type = (var->data.location == VARYING_SLOT_PNTC) ? 1590 AGX_VARYING_TYPE_POINT_COORDINATES : 1591 (var->data.interpolation == INTERP_MODE_FLAT) ? 1592 AGX_VARYING_TYPE_FLAT_LAST : 1593 AGX_VARYING_TYPE_SMOOTH; 1594 1595 cfg.components = channels; 1596 cfg.triangle_slot = cfg.point_slot = base; 1597 } 1598 1599 base += channels; 1600 packed++; 1601 } 1602 } 1603 1604 varyings->nr_descs = (packed - varyings->packed); 1605 varyings->nr_slots = base; 1606} 1607 1608/* 1609 * Build a bit mask of varyings (by location) that are flatshaded. This 1610 * information is needed by lower_mediump_io. 
1611 */ 1612static uint64_t 1613agx_flat_varying_mask(nir_shader *nir) 1614{ 1615 uint64_t mask = 0; 1616 1617 assert(nir->info.stage == MESA_SHADER_FRAGMENT); 1618 1619 nir_foreach_shader_in_variable(var, nir) { 1620 if (var->data.interpolation == INTERP_MODE_FLAT) 1621 mask |= BITFIELD64_BIT(var->data.location); 1622 } 1623 1624 return mask; 1625} 1626 1627void 1628agx_compile_shader_nir(nir_shader *nir, 1629 struct agx_shader_key *key, 1630 struct util_dynarray *binary, 1631 struct agx_shader_info *out) 1632{ 1633 agx_debug = debug_get_option_agx_debug(); 1634 1635 agx_context *ctx = rzalloc(NULL, agx_context); 1636 ctx->nir = nir; 1637 ctx->out = out; 1638 ctx->key = key; 1639 ctx->stage = nir->info.stage; 1640 list_inithead(&ctx->blocks); 1641 1642 if (ctx->stage == MESA_SHADER_VERTEX) { 1643 out->writes_psiz = nir->info.outputs_written & 1644 BITFIELD_BIT(VARYING_SLOT_PSIZ); 1645 } 1646 1647 NIR_PASS_V(nir, nir_lower_vars_to_ssa); 1648 1649 /* Lower large arrays to scratch and small arrays to csel */ 1650 NIR_PASS_V(nir, nir_lower_vars_to_scratch, nir_var_function_temp, 16, 1651 glsl_get_natural_size_align_bytes); 1652 NIR_PASS_V(nir, nir_lower_indirect_derefs, nir_var_function_temp, ~0); 1653 1654 if (ctx->stage == MESA_SHADER_VERTEX) { 1655 /* Lower from OpenGL [-1, 1] to [0, 1] if half-z is not set */ 1656 if (!key->vs.clip_halfz) 1657 NIR_PASS_V(nir, nir_lower_clip_halfz); 1658 } 1659 1660 NIR_PASS_V(nir, nir_split_var_copies); 1661 NIR_PASS_V(nir, nir_lower_global_vars_to_local); 1662 NIR_PASS_V(nir, nir_lower_var_copies); 1663 NIR_PASS_V(nir, nir_lower_vars_to_ssa); 1664 NIR_PASS_V(nir, nir_lower_io, nir_var_shader_in | nir_var_shader_out, 1665 glsl_type_size, 0); 1666 if (ctx->stage == MESA_SHADER_FRAGMENT) { 1667 /* Interpolate varyings at fp16 and write to the tilebuffer at fp16. As an 1668 * exception, interpolate flat shaded at fp32. This works around a 1669 * hardware limitation. 
The resulting code (with an extra f2f16 at the end 1670 * if needed) matches what Metal produces. 1671 */ 1672 NIR_PASS_V(nir, nir_lower_mediump_io, 1673 nir_var_shader_in | nir_var_shader_out, 1674 ~agx_flat_varying_mask(nir), false); 1675 } 1676 NIR_PASS_V(nir, nir_shader_instructions_pass, 1677 agx_lower_aligned_offsets, 1678 nir_metadata_block_index | nir_metadata_dominance, NULL); 1679 1680 NIR_PASS_V(nir, nir_lower_ssbo); 1681 1682 /* Varying output is scalar, other I/O is vector */ 1683 if (ctx->stage == MESA_SHADER_VERTEX) { 1684 NIR_PASS_V(nir, nir_lower_io_to_scalar, nir_var_shader_out); 1685 } 1686 1687 nir_lower_tex_options lower_tex_options = { 1688 .lower_txs_lod = true, 1689 .lower_txp = ~0, 1690 .lower_invalid_implicit_lod = true, 1691 }; 1692 1693 nir_tex_src_type_constraints tex_constraints = { 1694 [nir_tex_src_lod] = { true, 16 }, 1695 [nir_tex_src_bias] = { true, 16 }, 1696 }; 1697 1698 NIR_PASS_V(nir, nir_lower_tex, &lower_tex_options); 1699 NIR_PASS_V(nir, nir_legalize_16bit_sampler_srcs, tex_constraints); 1700 1701 agx_optimize_nir(nir); 1702 1703 /* Implement conditional discard with real control flow like Metal */ 1704 NIR_PASS_V(nir, nir_lower_discard_if); 1705 1706 /* Must be last since NIR passes can remap driver_location freely */ 1707 if (ctx->stage == MESA_SHADER_VERTEX) { 1708 agx_remap_varyings_vs(nir, &out->varyings, ctx->varyings); 1709 } else if (ctx->stage == MESA_SHADER_FRAGMENT) { 1710 agx_remap_varyings_fs(nir, &out->varyings, ctx->varyings); 1711 } 1712 1713 bool skip_internal = nir->info.internal; 1714 skip_internal &= !(agx_debug & AGX_DBG_INTERNAL); 1715 1716 if (agx_debug & AGX_DBG_SHADERS && !skip_internal) { 1717 nir_print_shader(nir, stdout); 1718 } 1719 1720 ctx->allocated_vec = _mesa_hash_table_u64_create(ctx); 1721 1722 nir_foreach_function(func, nir) { 1723 if (!func->impl) 1724 continue; 1725 1726 nir_index_blocks(func->impl); 1727 1728 ctx->indexed_nir_blocks = 1729 rzalloc_array(ctx, agx_block *, 
func->impl->num_blocks); 1730 1731 ctx->alloc += func->impl->ssa_alloc; 1732 emit_cf_list(ctx, &func->impl->body); 1733 agx_emit_phis_deferred(ctx); 1734 break; /* TODO: Multi-function shaders */ 1735 } 1736 1737 /* Terminate the shader after the exit block */ 1738 agx_block *last_block = list_last_entry(&ctx->blocks, agx_block, link); 1739 agx_builder _b = agx_init_builder(ctx, agx_after_block(last_block)); 1740 agx_stop(&_b); 1741 1742 /* Also add traps to match the blob, unsure what the function is */ 1743 for (unsigned i = 0; i < 8; ++i) 1744 agx_trap(&_b); 1745 1746 /* Index blocks now that we're done emitting so the order is consistent */ 1747 agx_foreach_block(ctx, block) 1748 block->index = ctx->num_blocks++; 1749 1750 agx_validate(ctx, "IR translation"); 1751 1752 if (agx_debug & AGX_DBG_SHADERS && !skip_internal) 1753 agx_print_shader(ctx, stdout); 1754 1755 agx_optimizer(ctx); 1756 agx_dce(ctx); 1757 agx_validate(ctx, "Optimization"); 1758 1759 if (agx_debug & AGX_DBG_SHADERS && !skip_internal) 1760 agx_print_shader(ctx, stdout); 1761 1762 agx_ra(ctx); 1763 1764 if (ctx->stage == MESA_SHADER_VERTEX) 1765 agx_set_st_vary_final(ctx); 1766 1767 if (agx_debug & AGX_DBG_SHADERS && !skip_internal) 1768 agx_print_shader(ctx, stdout); 1769 1770 agx_lower_pseudo(ctx); 1771 1772 agx_pack_binary(ctx, binary); 1773 1774 if ((agx_debug & AGX_DBG_SHADERDB) && !skip_internal) 1775 agx_print_stats(ctx, binary->size, stderr); 1776 1777 ralloc_free(ctx); 1778} 1779