1/* 2 * Copyright (C) 2020 Collabora, Ltd. 3 * 4 * Permission is hereby granted, free of charge, to any person obtaining a 5 * copy of this software and associated documentation files (the "Software"), 6 * to deal in the Software without restriction, including without limitation 7 * the rights to use, copy, modify, merge, publish, distribute, sublicense, 8 * and/or sell copies of the Software, and to permit persons to whom the 9 * Software is furnished to do so, subject to the following conditions: 10 * 11 * The above copyright notice and this permission notice (including the next 12 * paragraph) shall be included in all copies or substantial portions of the 13 * Software. 14 * 15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 * SOFTWARE. 22 */ 23 24#ifndef __PAN_IR_H 25#define __PAN_IR_H 26 27#include <stdint.h> 28#include "compiler/nir/nir.h" 29#include "util/u_dynarray.h" 30#include "util/hash_table.h" 31 32/* On Valhall, the driver gives the hardware a table of resource tables. 33 * Resources are addressed as the index of the table together with the index of 34 * the resource within the table. For simplicity, we put one type of resource 35 * in each table and fix the numbering of the tables. 36 * 37 * This numbering is arbitrary. It is a software ABI between the 38 * Gallium driver and the Valhall compiler. 39 */ 40enum pan_resource_table { 41 PAN_TABLE_UBO = 0, 42 PAN_TABLE_ATTRIBUTE, 43 PAN_TABLE_ATTRIBUTE_BUFFER, 44 PAN_TABLE_SAMPLER, 45 PAN_TABLE_TEXTURE, 46 PAN_TABLE_IMAGE, 47 48 PAN_NUM_RESOURCE_TABLES 49}; 50 51/* Indices for named (non-XFB) varyings that are present. These are packed 52 * tightly so they correspond to a bitfield present (P) indexed by (1 << 53 * PAN_VARY_*). This has the nice property that you can lookup the buffer index 54 * of a given special field given a shift S by: 55 * 56 * idx = popcount(P & ((1 << S) - 1)) 57 * 58 * That is... look at all of the varyings that come earlier and count them, the 59 * count is the new index since plus one. Likewise, the total number of special 60 * buffers required is simply popcount(P) 61 */ 62 63enum pan_special_varying { 64 PAN_VARY_GENERAL = 0, 65 PAN_VARY_POSITION = 1, 66 PAN_VARY_PSIZ = 2, 67 PAN_VARY_PNTCOORD = 3, 68 PAN_VARY_FACE = 4, 69 PAN_VARY_FRAGCOORD = 5, 70 71 /* Keep last */ 72 PAN_VARY_MAX, 73}; 74 75/* Maximum number of attribute descriptors required for varyings. These include 76 * up to MAX_VARYING source level varyings plus a descriptor each non-GENERAL 77 * special varying */ 78#define PAN_MAX_VARYINGS (MAX_VARYING + PAN_VARY_MAX - 1) 79 80/* Define the general compiler entry point */ 81 82#define MAX_SYSVAL_COUNT 32 83 84/* Allow 2D of sysval IDs, while allowing nonparametric sysvals to equal 85 * their class for equal comparison */ 86 87#define PAN_SYSVAL(type, no) (((no) << 16) | PAN_SYSVAL_##type) 88#define PAN_SYSVAL_TYPE(sysval) ((sysval) & 0xffff) 89#define PAN_SYSVAL_ID(sysval) ((sysval) >> 16) 90 91/* Define some common types. We start at one for easy indexing of hash 92 * tables internal to the compiler */ 93 94enum { 95 PAN_SYSVAL_VIEWPORT_SCALE = 1, 96 PAN_SYSVAL_VIEWPORT_OFFSET = 2, 97 PAN_SYSVAL_TEXTURE_SIZE = 3, 98 PAN_SYSVAL_SSBO = 4, 99 PAN_SYSVAL_NUM_WORK_GROUPS = 5, 100 PAN_SYSVAL_SAMPLER = 7, 101 PAN_SYSVAL_LOCAL_GROUP_SIZE = 8, 102 PAN_SYSVAL_WORK_DIM = 9, 103 PAN_SYSVAL_IMAGE_SIZE = 10, 104 PAN_SYSVAL_SAMPLE_POSITIONS = 11, 105 PAN_SYSVAL_MULTISAMPLED = 12, 106 PAN_SYSVAL_RT_CONVERSION = 13, 107 PAN_SYSVAL_VERTEX_INSTANCE_OFFSETS = 14, 108 PAN_SYSVAL_DRAWID = 15, 109 PAN_SYSVAL_BLEND_CONSTANTS = 16, 110 PAN_SYSVAL_XFB = 17, 111 PAN_SYSVAL_NUM_VERTICES = 18, 112}; 113 114#define PAN_TXS_SYSVAL_ID(texidx, dim, is_array) \ 115 ((texidx) | ((dim) << 7) | ((is_array) ? (1 << 9) : 0)) 116 117#define PAN_SYSVAL_ID_TO_TXS_TEX_IDX(id) ((id) & 0x7f) 118#define PAN_SYSVAL_ID_TO_TXS_DIM(id) (((id) >> 7) & 0x3) 119#define PAN_SYSVAL_ID_TO_TXS_IS_ARRAY(id) !!((id) & (1 << 9)) 120 121/* Special attribute slots for vertex builtins. Sort of arbitrary but let's be 122 * consistent with the blob so we can compare traces easier. */ 123 124enum { 125 PAN_VERTEX_ID = 16, 126 PAN_INSTANCE_ID = 17, 127 PAN_MAX_ATTRIBUTE 128}; 129 130struct panfrost_sysvals { 131 /* The mapping of sysvals to uniforms, the count, and the off-by-one inverse */ 132 unsigned sysvals[MAX_SYSVAL_COUNT]; 133 unsigned sysval_count; 134}; 135 136/* Architecturally, Bifrost/Valhall can address 128 FAU slots of 64-bits each. 137 * In practice, the maximum number of FAU slots is limited by implementation. 138 * All known Bifrost and Valhall devices limit to 64 FAU slots. Therefore the 139 * maximum number of 32-bit words is 128, since there are 2 words per FAU slot. 140 * 141 * Midgard can push at most 92 words, so this bound suffices. The Midgard 142 * compiler pushes less than this, as Midgard uses register-mapped uniforms 143 * instead of FAU, preventing large numbers of uniforms to be pushed for 144 * nontrivial programs. 145 */ 146#define PAN_MAX_PUSH 128 147 148/* Architectural invariants (Midgard and Bifrost): UBO must be <= 2^16 bytes so 149 * an offset to a word must be < 2^16. There are less than 2^8 UBOs */ 150 151struct panfrost_ubo_word { 152 uint16_t ubo; 153 uint16_t offset; 154}; 155 156struct panfrost_ubo_push { 157 unsigned count; 158 struct panfrost_ubo_word words[PAN_MAX_PUSH]; 159}; 160 161/* Helper for searching the above. Note this is O(N) to the number of pushed 162 * constants, do not run in the draw call hot path */ 163 164unsigned 165pan_lookup_pushed_ubo(struct panfrost_ubo_push *push, unsigned ubo, unsigned offs); 166 167struct hash_table_u64 * 168panfrost_init_sysvals(struct panfrost_sysvals *sysvals, 169 struct panfrost_sysvals *fixed_sysvals, 170 void *memctx); 171 172unsigned 173pan_lookup_sysval(struct hash_table_u64 *sysval_to_id, 174 struct panfrost_sysvals *sysvals, 175 int sysval); 176 177int 178panfrost_sysval_for_instr(nir_instr *instr, nir_dest *dest); 179 180struct panfrost_compile_inputs { 181 unsigned gpu_id; 182 bool is_blend, is_blit; 183 struct { 184 unsigned rt; 185 unsigned nr_samples; 186 uint64_t bifrost_blend_desc; 187 } blend; 188 int fixed_sysval_ubo; 189 struct panfrost_sysvals *fixed_sysval_layout; 190 bool shaderdb; 191 bool no_idvs; 192 bool no_ubo_to_push; 193 194 enum pipe_format rt_formats[8]; 195 uint8_t raw_fmt_mask; 196 unsigned nr_cbufs; 197 198 /* Used on Valhall. 199 * 200 * Bit mask of special desktop-only varyings (e.g VARYING_SLOT_TEX0) 201 * written by the previous stage (fragment shader) or written by this 202 * stage (vertex shader). Bits are slots from gl_varying_slot. 203 * 204 * For modern APIs (GLES or VK), this should be 0. 205 */ 206 uint32_t fixed_varying_mask; 207 208 union { 209 struct { 210 bool static_rt_conv; 211 uint32_t rt_conv[8]; 212 } bifrost; 213 }; 214}; 215 216struct pan_shader_varying { 217 gl_varying_slot location; 218 enum pipe_format format; 219}; 220 221struct bifrost_shader_blend_info { 222 nir_alu_type type; 223 uint32_t return_offset; 224 225 /* mali_bifrost_register_file_format corresponding to nir_alu_type */ 226 unsigned format; 227}; 228 229/* 230 * Unpacked form of a v7 message preload descriptor, produced by the compiler's 231 * message preload optimization. By splitting out this struct, the compiler does 232 * not need to know about data structure packing, avoiding a dependency on 233 * GenXML. 234 */ 235struct bifrost_message_preload { 236 /* Whether to preload this message */ 237 bool enabled; 238 239 /* Varying to load from */ 240 unsigned varying_index; 241 242 /* Register type, FP32 otherwise */ 243 bool fp16; 244 245 /* Number of components, ignored if texturing */ 246 unsigned num_components; 247 248 /* If texture is set, performs a texture instruction according to 249 * texture_index, skip, and zero_lod. If texture is unset, only the 250 * varying load is performed. 251 */ 252 bool texture, skip, zero_lod; 253 unsigned texture_index; 254}; 255 256struct bifrost_shader_info { 257 struct bifrost_shader_blend_info blend[8]; 258 nir_alu_type blend_src1_type; 259 bool wait_6, wait_7; 260 struct bifrost_message_preload messages[2]; 261 262 /* Whether any flat varyings are loaded. This may disable optimizations 263 * that change the provoking vertex, since that would load incorrect 264 * values for flat varyings. 265 */ 266 bool uses_flat_shading; 267}; 268 269struct midgard_shader_info { 270 unsigned first_tag; 271}; 272 273struct pan_shader_info { 274 gl_shader_stage stage; 275 unsigned work_reg_count; 276 unsigned tls_size; 277 unsigned wls_size; 278 279 /* Bit mask of preloaded registers */ 280 uint64_t preload; 281 282 union { 283 struct { 284 bool reads_frag_coord; 285 bool reads_point_coord; 286 bool reads_face; 287 bool can_discard; 288 bool writes_depth; 289 bool writes_stencil; 290 bool writes_coverage; 291 bool sidefx; 292 bool sample_shading; 293 bool early_fragment_tests; 294 bool can_early_z, can_fpk; 295 BITSET_WORD outputs_read; 296 BITSET_WORD outputs_written; 297 } fs; 298 299 struct { 300 bool writes_point_size; 301 302 /* If the primary shader writes point size, the Valhall 303 * driver may need a variant that does not write point 304 * size. Offset to such a shader in the program binary. 305 * 306 * Zero if no such variant is required. 307 * 308 * Only used with IDVS on Valhall. 309 */ 310 unsigned no_psiz_offset; 311 312 /* Set if Index-Driven Vertex Shading is in use */ 313 bool idvs; 314 315 /* If IDVS is used, whether a varying shader is used */ 316 bool secondary_enable; 317 318 /* If a varying shader is used, the varying shader's 319 * offset in the program binary 320 */ 321 unsigned secondary_offset; 322 323 /* If IDVS is in use, number of work registers used by 324 * the varying shader 325 */ 326 unsigned secondary_work_reg_count; 327 328 /* If IDVS is in use, bit mask of preloaded registers 329 * used by the varying shader 330 */ 331 uint64_t secondary_preload; 332 } vs; 333 334 struct { 335 /* Is it legal to merge workgroups? This is true if the 336 * shader uses neither barriers nor shared memory. 337 * 338 * Used by the Valhall hardware. 339 */ 340 bool allow_merging_workgroups; 341 } cs; 342 }; 343 344 /* Does the shader contains a barrier? or (for fragment shaders) does it 345 * require helper invocations, which demand the same ordering guarantees 346 * of the hardware? These notions are unified in the hardware, so we 347 * unify them here as well. 348 */ 349 bool contains_barrier; 350 bool separable; 351 bool writes_global; 352 uint64_t outputs_written; 353 354 unsigned sampler_count; 355 unsigned texture_count; 356 unsigned ubo_count; 357 unsigned attributes_read_count; 358 unsigned attribute_count; 359 unsigned attributes_read; 360 361 struct { 362 unsigned input_count; 363 struct pan_shader_varying input[PAN_MAX_VARYINGS]; 364 unsigned output_count; 365 struct pan_shader_varying output[PAN_MAX_VARYINGS]; 366 } varyings; 367 368 struct panfrost_sysvals sysvals; 369 370 /* UBOs to push to Register Mapped Uniforms (Midgard) or Fast Access 371 * Uniforms (Bifrost) */ 372 struct panfrost_ubo_push push; 373 374 uint32_t ubo_mask; 375 376 union { 377 struct bifrost_shader_info bifrost; 378 struct midgard_shader_info midgard; 379 }; 380}; 381 382typedef struct pan_block { 383 /* Link to next block. Must be first for mir_get_block */ 384 struct list_head link; 385 386 /* List of instructions emitted for the current block */ 387 struct list_head instructions; 388 389 /* Index of the block in source order */ 390 unsigned name; 391 392 /* Control flow graph */ 393 struct pan_block *successors[2]; 394 struct set *predecessors; 395 bool unconditional_jumps; 396 397 /* In liveness analysis, these are live masks (per-component) for 398 * indices for the block. Scalar compilers have the luxury of using 399 * simple bit fields, but for us, liveness is a vector idea. */ 400 uint16_t *live_in; 401 uint16_t *live_out; 402} pan_block; 403 404struct pan_instruction { 405 struct list_head link; 406}; 407 408#define pan_foreach_instr_in_block_rev(block, v) \ 409 list_for_each_entry_rev(struct pan_instruction, v, &block->instructions, link) 410 411#define pan_foreach_successor(blk, v) \ 412 pan_block *v; \ 413 pan_block **_v; \ 414 for (_v = (pan_block **) &blk->successors[0], \ 415 v = *_v; \ 416 v != NULL && _v < (pan_block **) &blk->successors[2]; \ 417 _v++, v = *_v) \ 418 419#define pan_foreach_predecessor(blk, v) \ 420 struct set_entry *_entry_##v; \ 421 struct pan_block *v; \ 422 for (_entry_##v = _mesa_set_next_entry(blk->predecessors, NULL), \ 423 v = (struct pan_block *) (_entry_##v ? _entry_##v->key : NULL); \ 424 _entry_##v != NULL; \ 425 _entry_##v = _mesa_set_next_entry(blk->predecessors, _entry_##v), \ 426 v = (struct pan_block *) (_entry_##v ? _entry_##v->key : NULL)) 427 428static inline pan_block * 429pan_exit_block(struct list_head *blocks) 430{ 431 pan_block *last = list_last_entry(blocks, pan_block, link); 432 assert(!last->successors[0] && !last->successors[1]); 433 return last; 434} 435 436typedef void (*pan_liveness_update)(uint16_t *, void *, unsigned max); 437 438void pan_liveness_gen(uint16_t *live, unsigned node, unsigned max, uint16_t mask); 439void pan_liveness_kill(uint16_t *live, unsigned node, unsigned max, uint16_t mask); 440bool pan_liveness_get(uint16_t *live, unsigned node, uint16_t max); 441 442void pan_compute_liveness(struct list_head *blocks, 443 unsigned temp_count, 444 pan_liveness_update callback); 445 446void pan_free_liveness(struct list_head *blocks); 447 448uint16_t 449pan_to_bytemask(unsigned bytes, unsigned mask); 450 451void pan_block_add_successor(pan_block *block, pan_block *successor); 452 453/* IR indexing */ 454#define PAN_IS_REG (1) 455 456static inline unsigned 457pan_ssa_index(nir_ssa_def *ssa) 458{ 459 /* Off-by-one ensures BIR_NO_ARG is skipped */ 460 return ((ssa->index + 1) << 1) | 0; 461} 462 463static inline unsigned 464pan_src_index(nir_src *src) 465{ 466 if (src->is_ssa) 467 return pan_ssa_index(src->ssa); 468 else { 469 assert(!src->reg.indirect); 470 return (src->reg.reg->index << 1) | PAN_IS_REG; 471 } 472} 473 474static inline unsigned 475pan_dest_index(nir_dest *dst) 476{ 477 if (dst->is_ssa) 478 return pan_ssa_index(&dst->ssa); 479 else { 480 assert(!dst->reg.indirect); 481 return (dst->reg.reg->index << 1) | PAN_IS_REG; 482 } 483} 484 485/* IR printing helpers */ 486void pan_print_alu_type(nir_alu_type t, FILE *fp); 487 488/* Until it can be upstreamed.. */ 489bool pan_has_source_mod(nir_alu_src *src, nir_op op); 490bool pan_has_dest_mod(nir_dest **dest, nir_op op); 491 492/* NIR passes to do some backend-specific lowering */ 493 494#define PAN_WRITEOUT_C 1 495#define PAN_WRITEOUT_Z 2 496#define PAN_WRITEOUT_S 4 497#define PAN_WRITEOUT_2 8 498 499bool pan_nir_lower_zs_store(nir_shader *nir); 500 501bool pan_nir_lower_64bit_intrin(nir_shader *shader); 502 503bool pan_lower_helper_invocation(nir_shader *shader); 504bool pan_lower_sample_pos(nir_shader *shader); 505bool pan_lower_xfb(nir_shader *nir); 506 507/* 508 * Helper returning the subgroup size. Generally, this is equal to the number of 509 * threads in a warp. For Midgard (including warping models), this returns 1, as 510 * subgroups are not supported. 511 */ 512static inline unsigned 513pan_subgroup_size(unsigned arch) 514{ 515 if (arch >= 9) 516 return 16; 517 else if (arch >= 7) 518 return 8; 519 else if (arch >= 6) 520 return 4; 521 else 522 return 1; 523} 524 525#endif 526