/*
 * Copyright © 2016 Broadcom
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#ifndef V3D_COMPILER_H
#define V3D_COMPILER_H

#include <assert.h>
#include <stdio.h>
#include <stdlib.h>
#include <stdbool.h>
#include <stdint.h>
#include <string.h>

#include "util/macros.h"
#include "common/v3d_debug.h"
#include "common/v3d_device_info.h"
#include "common/v3d_limits.h"
#include "compiler/nir/nir.h"
#include "util/list.h"
#include "util/u_math.h"

#include "qpu/qpu_instr.h"
#include "pipe/p_state.h"

/**
 * Maximum number of outstanding TMU operations we can queue for execution.
 *
 * This is mostly limited by the size of the TMU fifos. The Input and Config
 * fifos can stall, but we prefer that to injecting TMU flushes manually
 * in the driver, so we can ignore these, but we can't overflow the Output
 * fifo, which has 16 / threads per-thread entries, meaning that the maximum
 * number of outstanding LDTMUs we can ever have is 8, for a 2-way threaded
 * shader. This means that at most we can have 8 outstanding TMU loads, if
 * each load is just one component.
 *
 * NOTE: we could actually have a larger value here because TMU stores don't
 * consume any entries in the Output fifo (so we could have any number of
 * outstanding stores) and the driver keeps track of used Output fifo entries
 * and will flush if we ever need more than 8, but since loads are much more
 * common than stores, it is probably not worth it.
 */
#define MAX_TMU_QUEUE_SIZE 8

/**
 * Maximum offset distance in bytes between two consecutive constant UBO loads
 * for the same UBO where we would favor updating the unifa address by emitting
 * dummy ldunifa instructions to avoid writing the unifa register.
 */
#define MAX_UNIFA_SKIP_DISTANCE 16
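/*
 * Illustrative note on the trade-off above (assuming each ldunifa reads 32
 * bits and advances the unifa address by 4 bytes): with a skip distance of
 * 16 bytes we would emit at most 16 / 4 = 4 dummy ldunifa instructions,
 * which we trade against a fresh unifa write and its delay slots (see the
 * current_unifa_* tracking in struct v3d_compile below).
 */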
struct nir_builder;

struct v3d_fs_inputs {
        /**
         * Array of the meanings of the VPM inputs this shader needs.
         *
         * It doesn't include those that aren't part of the VPM, like
         * point/line coordinates.
         */
        struct v3d_varying_slot *input_slots;
        uint32_t num_inputs;
};

enum qfile {
        /** An unused source or destination register. */
        QFILE_NULL,

        /** A physical register, such as the W coordinate payload. */
        QFILE_REG,
        /** One of the registers for fixed function interactions. */
        QFILE_MAGIC,

        /**
         * A virtual register that will be allocated to actual accumulator
         * or physical registers later.
         */
        QFILE_TEMP,

        /**
         * VPM reads use this with an index value to say what part of the VPM
         * is being read.
         */
        QFILE_VPM,

        /**
         * Stores an immediate value in the index field that will be used
         * directly by qpu_load_imm().
         */
        QFILE_LOAD_IMM,

        /**
         * Stores an immediate value in the index field that can be turned
         * into a small immediate field by qpu_encode_small_immediate().
         */
        QFILE_SMALL_IMM,
};

/**
 * A reference to a QPU register or a virtual temp register.
 */
struct qreg {
        enum qfile file;
        uint32_t index;
};

static inline struct qreg vir_reg(enum qfile file, uint32_t index)
{
        return (struct qreg){file, index};
}

static inline struct qreg vir_magic_reg(uint32_t index)
{
        return (struct qreg){QFILE_MAGIC, index};
}

static inline struct qreg vir_nop_reg(void)
{
        return (struct qreg){QFILE_NULL, 0};
}

/**
 * A reference to an actual register at the QPU level, for register
 * allocation.
 */
struct qpu_reg {
        bool magic;
        bool smimm;
        int index;
};

struct qinst {
        /** Entry in qblock->instructions */
        struct list_head link;

        /**
         * The instruction being wrapped. Its condition codes, pack flags,
         * signals, etc. will all be used, with just the register references
         * being replaced by the contents of qinst->dst and qinst->src[].
         */
        struct v3d_qpu_instr qpu;

        /* Pre-register-allocation references to src/dst registers */
        struct qreg dst;
        struct qreg src[3];
        bool is_last_thrsw;

        /* If the instruction reads a uniform (other than through src[i].file
         * == QFILE_UNIF), that uniform's index in c->uniform_contents. ~0
         * otherwise.
         */
        int uniform;

        /* If this is a TLB Z write */
        bool is_tlb_z_write;

        /* Position of this instruction in the program. Filled in during
         * register allocation.
         */
        int32_t ip;
};
enum quniform_contents {
        /**
         * Indicates that a constant 32-bit value is copied from the program's
         * uniform contents.
         */
        QUNIFORM_CONSTANT,
        /**
         * Indicates that the program's uniform contents are used as an index
         * into the GL uniform storage.
         */
        QUNIFORM_UNIFORM,

        /** @{
         * Scaling factors from clip coordinates to coordinates relative to
         * the viewport center.
         *
         * This is used by the coordinate and vertex shaders to produce the
         * 32-bit entry consisting of 2 16-bit fields with 12.4 signed fixed
         * point offsets from the viewport center.
         */
        QUNIFORM_VIEWPORT_X_SCALE,
        QUNIFORM_VIEWPORT_Y_SCALE,
        /** @} */

        QUNIFORM_VIEWPORT_Z_OFFSET,
        QUNIFORM_VIEWPORT_Z_SCALE,

        QUNIFORM_USER_CLIP_PLANE,

        /**
         * A reference to a V3D 3.x texture config parameter 0 uniform.
         *
         * This is a uniform implicitly loaded with a QPU_W_TMU* write, which
         * defines texture type, miplevels, and such. It will be found as a
         * parameter to the first QOP_TEX_[STRB] instruction in a sequence.
         */
        QUNIFORM_TEXTURE_CONFIG_P0_0,
        QUNIFORM_TEXTURE_CONFIG_P0_1,
        QUNIFORM_TEXTURE_CONFIG_P0_2,
        QUNIFORM_TEXTURE_CONFIG_P0_3,
        QUNIFORM_TEXTURE_CONFIG_P0_4,
        QUNIFORM_TEXTURE_CONFIG_P0_5,
        QUNIFORM_TEXTURE_CONFIG_P0_6,
        QUNIFORM_TEXTURE_CONFIG_P0_7,
        QUNIFORM_TEXTURE_CONFIG_P0_8,
        QUNIFORM_TEXTURE_CONFIG_P0_9,
        QUNIFORM_TEXTURE_CONFIG_P0_10,
        QUNIFORM_TEXTURE_CONFIG_P0_11,
        QUNIFORM_TEXTURE_CONFIG_P0_12,
        QUNIFORM_TEXTURE_CONFIG_P0_13,
        QUNIFORM_TEXTURE_CONFIG_P0_14,
        QUNIFORM_TEXTURE_CONFIG_P0_15,
        QUNIFORM_TEXTURE_CONFIG_P0_16,
        QUNIFORM_TEXTURE_CONFIG_P0_17,
        QUNIFORM_TEXTURE_CONFIG_P0_18,
        QUNIFORM_TEXTURE_CONFIG_P0_19,
        QUNIFORM_TEXTURE_CONFIG_P0_20,
        QUNIFORM_TEXTURE_CONFIG_P0_21,
        QUNIFORM_TEXTURE_CONFIG_P0_22,
        QUNIFORM_TEXTURE_CONFIG_P0_23,
        QUNIFORM_TEXTURE_CONFIG_P0_24,
        QUNIFORM_TEXTURE_CONFIG_P0_25,
        QUNIFORM_TEXTURE_CONFIG_P0_26,
        QUNIFORM_TEXTURE_CONFIG_P0_27,
        QUNIFORM_TEXTURE_CONFIG_P0_28,
        QUNIFORM_TEXTURE_CONFIG_P0_29,
        QUNIFORM_TEXTURE_CONFIG_P0_30,
        QUNIFORM_TEXTURE_CONFIG_P0_31,
        QUNIFORM_TEXTURE_CONFIG_P0_32,

        /**
         * A reference to a V3D 3.x texture config parameter 1 uniform.
         *
         * This is a uniform implicitly loaded with a QPU_W_TMU* write, which
         * has the pointer to the indirect texture state. Our data[] field
         * will have a packed p1 value, but the address field will be just
         * which texture unit's texture should be referenced.
         */
        QUNIFORM_TEXTURE_CONFIG_P1,

        /* A V3D 4.x texture config parameter. The high 8 bits will be
         * which texture or sampler is being sampled, and the driver must
         * replace the address field with the appropriate address.
         */
        QUNIFORM_TMU_CONFIG_P0,
        QUNIFORM_TMU_CONFIG_P1,

        QUNIFORM_IMAGE_TMU_CONFIG_P0,

        QUNIFORM_TEXTURE_FIRST_LEVEL,

        QUNIFORM_TEXTURE_WIDTH,
        QUNIFORM_TEXTURE_HEIGHT,
        QUNIFORM_TEXTURE_DEPTH,
        QUNIFORM_TEXTURE_ARRAY_SIZE,
        QUNIFORM_TEXTURE_LEVELS,
        QUNIFORM_TEXTURE_SAMPLES,

        QUNIFORM_UBO_ADDR,

        QUNIFORM_TEXRECT_SCALE_X,
        QUNIFORM_TEXRECT_SCALE_Y,

        /* Returns the base offset of the SSBO given by the data value. */
        QUNIFORM_SSBO_OFFSET,

        /* Returns the size of the SSBO or UBO given by the data value. */
        QUNIFORM_GET_SSBO_SIZE,
        QUNIFORM_GET_UBO_SIZE,

        /* Sizes (in pixels) of a shader image given by the data value. */
        QUNIFORM_IMAGE_WIDTH,
        QUNIFORM_IMAGE_HEIGHT,
        QUNIFORM_IMAGE_DEPTH,
        QUNIFORM_IMAGE_ARRAY_SIZE,

        QUNIFORM_LINE_WIDTH,

        /* The line width sent to hardware. This includes the expanded width
         * when anti-aliasing is enabled.
         */
        QUNIFORM_AA_LINE_WIDTH,

        /* Number of workgroups passed to glDispatchCompute in the dimension
         * selected by the data value.
         */
        QUNIFORM_NUM_WORK_GROUPS,

        /* Base workgroup offset passed to vkCmdDispatchBase in the dimension
         * selected by the data value.
         */
        QUNIFORM_WORK_GROUP_BASE,

        /**
         * Returns the offset of the scratch buffer for register spilling.
         */
        QUNIFORM_SPILL_OFFSET,
        QUNIFORM_SPILL_SIZE_PER_THREAD,

        /**
         * Returns the offset of the shared memory for compute shaders.
         *
         * This will be accessed using TMU general memory operations, so the
         * L2T cache will effectively be the shared memory area.
         */
        QUNIFORM_SHARED_OFFSET,

        /**
         * Returns the number of layers in the framebuffer.
         *
         * This is used to cap gl_Layer in geometry shaders to avoid
         * out-of-bounds accesses into the tile state during binning.
         */
        QUNIFORM_FB_LAYERS,

        /**
         * Current value of gl_ViewIndex for Multiview rendering.
         */
        QUNIFORM_VIEW_INDEX,

        /**
         * Inline uniform buffers
         */
        QUNIFORM_INLINE_UBO_0,
        QUNIFORM_INLINE_UBO_1,
        QUNIFORM_INLINE_UBO_2,
        QUNIFORM_INLINE_UBO_3,
};
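/*
 * Illustrative note: each uniform in the stream is described by a
 * (quniform_contents, data) pair. For example, vir_uniform(c,
 * QUNIFORM_CONSTANT, 0x3f800000) (declared further below) emits a literal
 * 32-bit value, while something like QUNIFORM_VIEWPORT_X_SCALE leaves the
 * actual value to be filled in by the driver when it writes the uniform
 * stream at draw time.
 */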
static inline uint32_t v3d_unit_data_create(uint32_t unit, uint32_t value)
{
        assert(value < (1 << 24));
        return unit << 24 | value;
}

static inline uint32_t v3d_unit_data_get_unit(uint32_t data)
{
        return data >> 24;
}

static inline uint32_t v3d_unit_data_get_offset(uint32_t data)
{
        return data & 0xffffff;
}
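/*
 * Example (illustrative, arbitrary values): the helpers above pack a unit
 * index in the top 8 bits and an offset in the low 24 bits of a single
 * uniform data word, so the packing round-trips:
 *
 *    uint32_t data = v3d_unit_data_create(2, 0x100);
 *    assert(v3d_unit_data_get_unit(data) == 2);
 *    assert(v3d_unit_data_get_offset(data) == 0x100);
 */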
struct v3d_varying_slot {
        uint8_t slot_and_component;
};

static inline struct v3d_varying_slot
v3d_slot_from_slot_and_component(uint8_t slot, uint8_t component)
{
        assert(slot < 255 / 4);
        return (struct v3d_varying_slot){ (slot << 2) + component };
}

static inline uint8_t v3d_slot_get_slot(struct v3d_varying_slot slot)
{
        return slot.slot_and_component >> 2;
}

static inline uint8_t v3d_slot_get_component(struct v3d_varying_slot slot)
{
        return slot.slot_and_component & 3;
}
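/*
 * Example (illustrative, arbitrary slot/component numbers): a varying is
 * stored as slot * 4 + component in a single byte, so:
 *
 *    struct v3d_varying_slot v = v3d_slot_from_slot_and_component(12, 3);
 *    assert(v3d_slot_get_slot(v) == 12);
 *    assert(v3d_slot_get_component(v) == 3);
 */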
enum v3d_execution_environment {
        V3D_ENVIRONMENT_OPENGL = 0,
        V3D_ENVIRONMENT_VULKAN,
};

struct v3d_key {
        void *shader_state;
        struct {
                uint8_t swizzle[4];
        } tex[V3D_MAX_TEXTURE_SAMPLERS];
        struct {
                uint8_t return_size;
                uint8_t return_channels;
        } sampler[V3D_MAX_TEXTURE_SAMPLERS];

        uint8_t num_tex_used;
        uint8_t num_samplers_used;
        uint8_t ucp_enables;
        bool is_last_geometry_stage;
        bool robust_buffer_access;

        enum v3d_execution_environment environment;
};

struct v3d_fs_key {
        struct v3d_key base;
        bool is_points;
        bool is_lines;
        bool line_smoothing;
        bool point_coord_upper_left;
        bool msaa;
        bool sample_coverage;
        bool sample_alpha_to_coverage;
        bool sample_alpha_to_one;
        /* Mask of which color render targets are present. */
        uint8_t cbufs;
        uint8_t swap_color_rb;
        /* Mask of which render targets need to be written as 32-bit floats */
        uint8_t f32_color_rb;
        /* Masks of which render targets need to be written as ints/uints.
         * Used by gallium to work around lost information in TGSI.
         */
        uint8_t int_color_rb;
        uint8_t uint_color_rb;

        /* Color format information per render target. Only set when logic
         * operations are enabled.
         */
        struct {
                enum pipe_format format;
                uint8_t swizzle[4];
        } color_fmt[V3D_MAX_DRAW_BUFFERS];

        uint8_t logicop_func;
        uint32_t point_sprite_mask;

        struct pipe_rt_blend_state blend;

        /* If the fragment shader reads gl_PrimitiveID then we have 2 scenarios:
         *
         * - If there is a geometry shader, then gl_PrimitiveID must be written
         *   by it and the fragment shader loads it as a regular explicit input
         *   varying. This is the only valid use case in GLES 3.1.
         *
         * - If there is not a geometry shader (allowed since GLES 3.2 and
         *   Vulkan 1.0), then gl_PrimitiveID must be implicitly written by
         *   hardware and is considered an implicit input varying in the
         *   fragment shader.
         */
        bool has_gs;
};

struct v3d_gs_key {
        struct v3d_key base;

        struct v3d_varying_slot used_outputs[V3D_MAX_FS_INPUTS];
        uint8_t num_used_outputs;

        bool is_coord;
        bool per_vertex_point_size;
};

struct v3d_vs_key {
        struct v3d_key base;

        struct v3d_varying_slot used_outputs[V3D_MAX_ANY_STAGE_INPUTS];
        uint8_t num_used_outputs;

        /* A bit-mask indicating if we need to swap the R/B channels for
         * vertex attributes. Since the hardware doesn't provide any
         * means to swizzle vertex attributes we need to do it in the shader.
         */
        uint32_t va_swap_rb_mask;

        bool is_coord;
        bool per_vertex_point_size;
        bool clamp_color;
};

/** A basic block of VIR instructions. */
struct qblock {
        struct list_head link;

        struct list_head instructions;

        struct set *predecessors;
        struct qblock *successors[2];

        int index;

        /* Instruction IPs for the first and last instruction of the block.
         * Set by qpu_schedule.c.
         */
        uint32_t start_qpu_ip;
        uint32_t end_qpu_ip;

        /* Instruction IP for the branch instruction of the block. Set by
         * qpu_schedule.c.
         */
        uint32_t branch_qpu_ip;

        /** Offset within the uniform stream at the start of the block. */
        uint32_t start_uniform;
        /** Offset within the uniform stream of the branch instruction */
        uint32_t branch_uniform;

        /**
         * Has the terminating branch of this block already been emitted
         * by a break or continue?
         */
        bool branch_emitted;

        /** @{ used by v3d_vir_live_variables.c */
        BITSET_WORD *def;
        BITSET_WORD *defin;
        BITSET_WORD *defout;
        BITSET_WORD *use;
        BITSET_WORD *live_in;
        BITSET_WORD *live_out;
        int start_ip, end_ip;
        /** @} */
};

/** Which util/list.h add mode we should use when inserting an instruction. */
enum vir_cursor_mode {
        vir_cursor_add,
        vir_cursor_addtail,
};

/**
 * Tracking structure for where new instructions should be inserted. Create
 * with one of the vir_after_inst()-style helper functions.
 *
 * This does not protect against removal of the block or instruction, so we
 * have an assert in instruction removal to try to catch it.
 */
struct vir_cursor {
        enum vir_cursor_mode mode;
        struct list_head *link;
};

static inline struct vir_cursor
vir_before_inst(struct qinst *inst)
{
        return (struct vir_cursor){ vir_cursor_addtail, &inst->link };
}

static inline struct vir_cursor
vir_after_inst(struct qinst *inst)
{
        return (struct vir_cursor){ vir_cursor_add, &inst->link };
}

static inline struct vir_cursor
vir_before_block(struct qblock *block)
{
        return (struct vir_cursor){ vir_cursor_add, &block->instructions };
}

static inline struct vir_cursor
vir_after_block(struct qblock *block)
{
        return (struct vir_cursor){ vir_cursor_addtail, &block->instructions };
}
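/*
 * Usage sketch (illustrative): code that wants to emit instructions at a
 * specific point repoints the compile state's cursor first, e.g.
 *
 *    c->cursor = vir_after_inst(inst);
 *
 * so that subsequently emitted instructions are inserted right after 'inst'
 * instead of at the end of the current block.
 */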
enum v3d_compilation_result {
        V3D_COMPILATION_SUCCEEDED,
        V3D_COMPILATION_FAILED_REGISTER_ALLOCATION,
        V3D_COMPILATION_FAILED,
};

/**
 * Compiler state saved across compiler invocations, for any expensive global
 * setup.
 */
struct v3d_compiler {
        const struct v3d_device_info *devinfo;
        uint32_t max_inline_uniform_buffers;
        struct ra_regs *regs;
        struct ra_class *reg_class_any[3];
        struct ra_class *reg_class_r5[3];
        struct ra_class *reg_class_phys[3];
        struct ra_class *reg_class_phys_or_acc[3];
};

/**
 * This holds partially interpolated inputs as provided by hardware
 * (The Vp = A*(x - x0) + B*(y - y0) term), as well as the C coefficient
 * required to compute the final interpolated value.
 */
struct v3d_interp_input {
        struct qreg vp;
        struct qreg C;
        unsigned mode; /* interpolation mode */
};

struct v3d_ra_node_info {
        struct {
                uint32_t priority;
                uint8_t class_bits;
        } *info;
        uint32_t alloc_count;
};

struct v3d_compile {
        const struct v3d_device_info *devinfo;
        nir_shader *s;
        nir_function_impl *impl;
        struct exec_list *cf_node_list;
        const struct v3d_compiler *compiler;

        void (*debug_output)(const char *msg,
                             void *debug_output_data);
        void *debug_output_data;

        /**
         * Mapping from nir_register * or nir_ssa_def * to array of struct
         * qreg for the values.
         */
        struct hash_table *def_ht;

        /* For each temp, the instruction generating its value. */
        struct qinst **defs;
        uint32_t defs_array_size;

        /* TMU pipelining tracking */
        struct {
                /* NIR registers that have been updated with a TMU operation
                 * that has not been flushed yet.
                 */
                struct set *outstanding_regs;

                uint32_t output_fifo_size;

                struct {
                        nir_dest *dest;
                        uint8_t num_components;
                        uint8_t component_mask;
                } flush[MAX_TMU_QUEUE_SIZE];
                uint32_t flush_count;
        } tmu;

        /**
         * Inputs to the shader, arranged by TGSI declaration order.
         *
         * Not all fragment shader QFILE_VARY reads are present in this array.
         */
        struct qreg *inputs;
        /**
         * Partially interpolated inputs to the shader.
         */
        struct v3d_interp_input *interp;
        struct qreg *outputs;
        bool msaa_per_sample_output;
        struct qreg color_reads[V3D_MAX_DRAW_BUFFERS * V3D_MAX_SAMPLES * 4];
        struct qreg sample_colors[V3D_MAX_DRAW_BUFFERS * V3D_MAX_SAMPLES * 4];
        uint32_t inputs_array_size;
        uint32_t outputs_array_size;
        uint32_t uniforms_array_size;

        /* Booleans for whether the corresponding QFILE_VARY[i] is
         * flat-shaded. This includes gl_FragColor flat-shading, which is
         * customized based on the shademodel_flat shader key.
         */
        uint32_t flat_shade_flags[BITSET_WORDS(V3D_MAX_FS_INPUTS)];

        uint32_t noperspective_flags[BITSET_WORDS(V3D_MAX_FS_INPUTS)];

        uint32_t centroid_flags[BITSET_WORDS(V3D_MAX_FS_INPUTS)];

        bool uses_center_w;
        bool writes_z;
        bool writes_z_from_fep;
        bool reads_z;
        bool uses_implicit_point_line_varyings;

        /* True if a fragment shader reads gl_PrimitiveID */
        bool fs_uses_primitive_id;

        /* If the fragment shader does anything that requires forcing
         * per-sample MSAA, such as reading gl_SampleID.
         */
        bool force_per_sample_msaa;

        /* Whether we are using the fallback scheduler. This will be set after
         * register allocation has failed once.
         */
        bool fallback_scheduler;

        /* Disable TMU pipelining. This may increase the chances of being able
         * to compile shaders with high register pressure that require emitting
         * TMU spills.
         */
        bool disable_tmu_pipelining;
        bool pipelined_any_tmu;

        /* Disable sorting of UBO loads with constant offset. This may
         * increase the chances of being able to compile shaders with high
         * register pressure.
         */
        bool disable_constant_ubo_load_sorting;
        bool sorted_any_ubo_loads;

        /* Emits ldunif for each new uniform, even if the uniform was already
         * emitted in the same block. Useful to compile shaders with high
         * register pressure or to disable the optimization during uniform
         * spills.
         */
        bool disable_ldunif_opt;

        /* Disables loop unrolling to reduce register pressure. */
        bool disable_loop_unrolling;
        bool unrolled_any_loops;

        /* Disables scheduling of general TMU loads (and unfiltered image
         * loads).
         */
        bool disable_general_tmu_sched;
        bool has_general_tmu_load;

        /* Minimum number of threads we are willing to use to register allocate
         * a shader with the current compilation strategy. This only prevents
         * us from lowering the thread count to register allocate successfully,
         * which can be useful when we prefer doing other changes to the
         * compilation strategy before dropping thread count.
         */
        uint32_t min_threads_for_reg_alloc;

        /* Maximum number of TMU spills allowed. If this is 0 it may cause
         * register allocation to fail. We set this to favor other compilation
         * strategies that can reduce register pressure and hopefully reduce or
         * eliminate TMU spills in the shader.
         */
        uint32_t max_tmu_spills;

        uint32_t compile_strategy_idx;

        /* The UBO index and block used with the last unifa load, as well as the
         * current unifa offset *after* emitting that load. This is used to skip
         * unifa writes (and their 3 delay slots) when the next UBO load reads
         * right after the previous one in the same block.
         */
        struct qblock *current_unifa_block;
        int32_t current_unifa_index;
        uint32_t current_unifa_offset;
        bool current_unifa_is_ubo;

        /* State for whether we're executing on each channel currently. 0 if
         * yes, otherwise a block number + 1 that the channel jumped to.
         */
        struct qreg execute;
        bool in_control_flow;

        struct qreg line_x, point_x, point_y, primitive_id;

        /**
         * Instance ID, which comes in before the vertex attribute payload if
         * the shader record requests it.
         */
        struct qreg iid;

        /**
         * Base Instance ID, which comes in before the vertex attribute payload
         * (after Instance ID) if the shader record requests it.
         */
        struct qreg biid;

        /**
         * Vertex ID, which comes in before the vertex attribute payload
         * (after Base Instance) if the shader record requests it.
         */
        struct qreg vid;

        /* Fragment shader payload regs. */
        struct qreg payload_w, payload_w_centroid, payload_z;

        struct qreg cs_payload[2];
        struct qreg cs_shared_offset;
        int local_invocation_index_bits;

        /* If the shader uses subgroup functionality */
        bool has_subgroups;

        uint8_t vattr_sizes[V3D_MAX_VS_INPUTS / 4];
        uint32_t vpm_output_size;
        /* Size in bytes of registers that have been spilled. This is how much
         * space needs to be available in the spill BO per thread per QPU.
         */
        uint32_t spill_size;
        /* Shader-db stats */
        uint32_t spills, fills, loops;

        /* Whether we are in the process of spilling registers for
         * register allocation
         */
        bool spilling;

        /**
         * Register spilling's per-thread base address, shared between each
         * spill/fill's addressing calculations (also used for scratch
         * access).
         */
        struct qreg spill_base;

        /* Bit vector of which temps may be spilled */
        BITSET_WORD *spillable;

        /* Used during register allocation */
        int thread_index;
        struct v3d_ra_node_info nodes;
        struct ra_graph *g;

        /**
         * Array of the VARYING_SLOT_* of all FS QFILE_VARY reads.
         *
         * This includes those that aren't part of the VPM varyings, like
         * point/line coordinates.
         */
        struct v3d_varying_slot input_slots[V3D_MAX_FS_INPUTS];

        /**
         * An entry per outputs[] in the VS indicating what the VARYING_SLOT_*
         * of the output is. Used to emit from the VS in the order that the
         * FS needs.
         */
        struct v3d_varying_slot *output_slots;

        struct pipe_shader_state *shader_state;
        struct v3d_key *key;
        struct v3d_fs_key *fs_key;
        struct v3d_gs_key *gs_key;
        struct v3d_vs_key *vs_key;

        /* Live ranges of temps. */
        int *temp_start, *temp_end;
        bool live_intervals_valid;

        uint32_t *uniform_data;
        enum quniform_contents *uniform_contents;
        uint32_t uniform_array_size;
        uint32_t num_uniforms;
        uint32_t output_position_index;
        nir_variable *output_color_var[V3D_MAX_DRAW_BUFFERS];
        uint32_t output_sample_mask_index;

        struct qreg undef;
        uint32_t num_temps;
        /* Number of temps in the program right before we spill a new temp. We
         * use this to know which temps existed before a spill and which were
         * added with the spill itself.
         */
        uint32_t spill_start_num_temps;

        struct vir_cursor cursor;
        struct list_head blocks;
        int next_block_index;
        struct qblock *cur_block;
        struct qblock *loop_cont_block;
        struct qblock *loop_break_block;
        /**
         * Which temp, if any, do we currently have in the flags?
         * This is set when processing a comparison instruction, and
         * reset to -1 by anything else that touches the flags.
         */
        int32_t flags_temp;
        enum v3d_qpu_cond flags_cond;

        uint64_t *qpu_insts;
        uint32_t qpu_inst_count;
        uint32_t qpu_inst_size;
        uint32_t qpu_inst_stalled_count;
        uint32_t nop_count;

        /* For the FS, the number of varying inputs not counting the
         * point/line varyings payload
         */
        uint32_t num_inputs;

        uint32_t program_id;
        uint32_t variant_id;

        /* Set to compile program in 1x, 2x, or 4x threaded mode, where
         * SIG_THREAD_SWITCH is used to hide texturing latency at the cost of
         * limiting ourselves to only part of the physical reg space.
         *
         * On V3D 3.x, 2x or 4x divide the physical reg space by 2x or 4x. On
         * V3D 4.x, all shaders are 2x threaded, and 4x only divides the
         * physical reg space in half.
         */
        uint8_t threads;
        struct qinst *last_thrsw;
        bool last_thrsw_at_top_level;

        bool emitted_tlb_load;
        bool lock_scoreboard_on_first_thrsw;

        enum v3d_compilation_result compilation_result;

        bool tmu_dirty_rcl;
        bool has_global_address;
};

struct v3d_uniform_list {
        enum quniform_contents *contents;
        uint32_t *data;
        uint32_t count;
};

struct v3d_prog_data {
        struct v3d_uniform_list uniforms;

        uint32_t spill_size;
        uint32_t tmu_spills;
        uint32_t tmu_fills;

        uint32_t qpu_read_stalls;

        uint8_t compile_strategy_idx;

        uint8_t threads;

        /* For threads > 1, whether the program should be dispatched in the
         * after-final-THRSW state.
         */
        bool single_seg;

        bool tmu_dirty_rcl;

        bool has_control_barrier;

        bool has_global_address;
};
struct v3d_vs_prog_data {
        struct v3d_prog_data base;

        bool uses_iid, uses_biid, uses_vid;

        /* Number of components read from each vertex attribute. */
        uint8_t vattr_sizes[V3D_MAX_VS_INPUTS / 4];

        /* Total number of components read, for the shader state record. */
        uint32_t vpm_input_size;

        /* Total number of components written, for the shader state record. */
        uint32_t vpm_output_size;

        /* Set if there should be separate VPM segments for input and output.
         * If unset, vpm_input_size will be 0.
         */
        bool separate_segments;

        /* Value to be programmed in VCM_CACHE_SIZE. */
        uint8_t vcm_cache_size;

        /* Maps a nir->data.location to its nir->data.driver_location. In
         * general we use the driver location as the index (like vattr_sizes
         * above), so this map is useful when what we have is the location.
         *
         * An entry of -1 means the location is not used.
         */
        int32_t driver_location_map[V3D_MAX_VS_INPUTS];
};

struct v3d_gs_prog_data {
        struct v3d_prog_data base;

        /* Whether the program reads gl_PrimitiveIDIn */
        bool uses_pid;

        /* Number of components read from each input varying. */
        uint8_t input_sizes[V3D_MAX_GS_INPUTS / 4];

        /* Number of inputs */
        uint8_t num_inputs;
        struct v3d_varying_slot input_slots[V3D_MAX_GS_INPUTS];

        /* Total number of components written, for the shader state record. */
        uint32_t vpm_output_size;

        /* Maximum SIMD dispatch width to not exceed VPM output size limits
         * in the geometry shader. Notice that the final dispatch width has to
         * be decided at draw time and could be lower based on the VPM pressure
         * added by other shader stages.
         */
        uint8_t simd_width;

        /* Output primitive type */
        uint8_t out_prim_type;

        /* Number of GS invocations */
        uint8_t num_invocations;

        bool writes_psiz;
};

struct v3d_fs_prog_data {
        struct v3d_prog_data base;

        /* Whether the program reads gl_PrimitiveID */
        bool uses_pid;

        struct v3d_varying_slot input_slots[V3D_MAX_FS_INPUTS];

        /* Array of flat shade flags.
         *
         * Each entry is only 24 bits (high 8 bits 0), to match the hardware
         * packet layout.
         */
        uint32_t flat_shade_flags[((V3D_MAX_FS_INPUTS - 1) / 24) + 1];

        uint32_t noperspective_flags[((V3D_MAX_FS_INPUTS - 1) / 24) + 1];

        uint32_t centroid_flags[((V3D_MAX_FS_INPUTS - 1) / 24) + 1];
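        /*
         * Illustrative note (indexing assumed from the 24-bits-per-entry
         * layout described above): the flag for FS input 'i' in any of the
         * three arrays would be looked up as
         *
         *    flags[i / 24] & (1 << (i % 24))
         */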
        uint8_t num_inputs;
        bool writes_z;
        bool writes_z_from_fep;
        bool disable_ez;
        bool uses_center_w;
        bool uses_implicit_point_line_varyings;
        bool lock_scoreboard_on_first_thrsw;
        bool force_per_sample_msaa;
};

struct v3d_compute_prog_data {
        struct v3d_prog_data base;
        /* Size in bytes of the workgroup's shared space. */
        uint32_t shared_size;
        uint16_t local_size[3];
        /* If the shader uses subgroup functionality */
        bool has_subgroups;
};

struct vpm_config {
        uint32_t As;
        uint32_t Vc;
        uint32_t Gs;
        uint32_t Gd;
        uint32_t Gv;
        uint32_t Ve;
        uint32_t gs_width;
};

bool
v3d_compute_vpm_config(struct v3d_device_info *devinfo,
                       struct v3d_vs_prog_data *vs_bin,
                       struct v3d_vs_prog_data *vs,
                       struct v3d_gs_prog_data *gs_bin,
                       struct v3d_gs_prog_data *gs,
                       struct vpm_config *vpm_cfg_bin,
                       struct vpm_config *vpm_cfg);

static inline bool
vir_has_uniform(struct qinst *inst)
{
        return inst->uniform != ~0;
}

const struct v3d_compiler *v3d_compiler_init(const struct v3d_device_info *devinfo,
                                             uint32_t max_inline_uniform_buffers);
void v3d_compiler_free(const struct v3d_compiler *compiler);
void v3d_optimize_nir(struct v3d_compile *c, struct nir_shader *s);

uint64_t *v3d_compile(const struct v3d_compiler *compiler,
                      struct v3d_key *key,
                      struct v3d_prog_data **prog_data,
                      nir_shader *s,
                      void (*debug_output)(const char *msg,
                                           void *debug_output_data),
                      void *debug_output_data,
                      int program_id, int variant_id,
                      uint32_t *final_assembly_size);

uint32_t v3d_prog_data_size(gl_shader_stage stage);
void v3d_nir_to_vir(struct v3d_compile *c);

void vir_compile_destroy(struct v3d_compile *c);
const char *vir_get_stage_name(struct v3d_compile *c);
struct qblock *vir_new_block(struct v3d_compile *c);
void vir_set_emit_block(struct v3d_compile *c, struct qblock *block);
void vir_link_blocks(struct qblock *predecessor, struct qblock *successor);
struct qblock *vir_entry_block(struct v3d_compile *c);
struct qblock *vir_exit_block(struct v3d_compile *c);
struct qinst *vir_add_inst(enum v3d_qpu_add_op op, struct qreg dst,
                           struct qreg src0, struct qreg src1);
struct qinst *vir_mul_inst(enum v3d_qpu_mul_op op, struct qreg dst,
                           struct qreg src0, struct qreg src1);
struct qinst *vir_branch_inst(struct v3d_compile *c,
                              enum v3d_qpu_branch_cond cond);
void vir_remove_instruction(struct v3d_compile *c, struct qinst *qinst);
uint32_t vir_get_uniform_index(struct v3d_compile *c,
                               enum quniform_contents contents,
                               uint32_t data);
struct qreg vir_uniform(struct v3d_compile *c,
                        enum quniform_contents contents,
                        uint32_t data);
void vir_schedule_instructions(struct v3d_compile *c);
void v3d_setup_spill_base(struct v3d_compile *c);
struct v3d_qpu_instr v3d_qpu_nop(void);

struct qreg vir_emit_def(struct v3d_compile *c, struct qinst *inst);
struct qinst *vir_emit_nondef(struct v3d_compile *c, struct qinst *inst);
void vir_set_cond(struct qinst *inst, enum v3d_qpu_cond cond);
enum v3d_qpu_cond vir_get_cond(struct qinst *inst);
void vir_set_pf(struct v3d_compile *c, struct qinst *inst, enum v3d_qpu_pf pf);
void vir_set_uf(struct v3d_compile *c, struct qinst *inst, enum v3d_qpu_uf uf);
void vir_set_unpack(struct qinst *inst, int src,
                    enum v3d_qpu_input_unpack unpack);
void vir_set_pack(struct qinst *inst, enum v3d_qpu_output_pack pack);
struct qreg vir_get_temp(struct v3d_compile *c);
void vir_calculate_live_intervals(struct v3d_compile *c);
int vir_get_nsrc(struct qinst *inst);
bool vir_has_side_effects(struct v3d_compile *c, struct qinst *inst);
bool vir_get_add_op(struct qinst *inst, enum v3d_qpu_add_op *op);
bool vir_get_mul_op(struct qinst *inst, enum v3d_qpu_mul_op *op);
bool vir_is_raw_mov(struct qinst *inst);
bool vir_is_tex(const struct v3d_device_info *devinfo, struct qinst *inst);
bool vir_is_add(struct qinst *inst);
bool vir_is_mul(struct qinst *inst);
bool vir_writes_r3(const struct v3d_device_info *devinfo, struct qinst *inst);
bool vir_writes_r4(const struct v3d_device_info *devinfo, struct qinst *inst);
struct qreg vir_follow_movs(struct v3d_compile *c, struct qreg reg);
uint8_t vir_channels_written(struct qinst *inst);
struct qreg ntq_get_src(struct v3d_compile *c, nir_src src, int i);
void ntq_store_dest(struct v3d_compile *c, nir_dest *dest, int chan,
                    struct qreg result);
bool ntq_tmu_fifo_overflow(struct v3d_compile *c, uint32_t components);
void ntq_add_pending_tmu_flush(struct v3d_compile *c, nir_dest *dest,
                               uint32_t component_mask);
void ntq_flush_tmu(struct v3d_compile *c);
void vir_emit_thrsw(struct v3d_compile *c);

void vir_dump(struct v3d_compile *c);
void vir_dump_inst(struct v3d_compile *c, struct qinst *inst);
void vir_dump_uniform(enum quniform_contents contents, uint32_t data);

void vir_validate(struct v3d_compile *c);

void vir_optimize(struct v3d_compile *c);
bool vir_opt_algebraic(struct v3d_compile *c);
bool vir_opt_constant_folding(struct v3d_compile *c);
bool vir_opt_copy_propagate(struct v3d_compile *c);
bool vir_opt_dead_code(struct v3d_compile *c);
bool vir_opt_peephole_sf(struct v3d_compile *c);
bool vir_opt_redundant_flags(struct v3d_compile *c);
bool vir_opt_small_immediates(struct v3d_compile *c);
bool vir_opt_vpm(struct v3d_compile *c);
bool vir_opt_constant_alu(struct v3d_compile *c);
bool v3d_nir_lower_io(nir_shader *s, struct v3d_compile *c);
bool v3d_nir_lower_line_smooth(nir_shader *shader);
bool v3d_nir_lower_logic_ops(nir_shader *s, struct v3d_compile *c);
bool v3d_nir_lower_robust_buffer_access(nir_shader *shader, struct v3d_compile *c);
bool v3d_nir_lower_scratch(nir_shader *s);
bool v3d_nir_lower_txf_ms(nir_shader *s, struct v3d_compile *c);
bool v3d_nir_lower_image_load_store(nir_shader *s);
bool v3d_nir_lower_load_store_bitsize(nir_shader *s, struct v3d_compile *c);

void v3d33_vir_vpm_read_setup(struct v3d_compile *c, int num_components);
void v3d33_vir_vpm_write_setup(struct v3d_compile *c);
void v3d33_vir_emit_tex(struct v3d_compile *c, nir_tex_instr *instr);
void v3d40_vir_emit_tex(struct v3d_compile *c, nir_tex_instr *instr);
void v3d40_vir_emit_image_load_store(struct v3d_compile *c,
                                     nir_intrinsic_instr *instr);

void v3d_vir_to_qpu(struct v3d_compile *c, struct qpu_reg *temp_registers);
uint32_t v3d_qpu_schedule_instructions(struct v3d_compile *c);
void qpu_validate(struct v3d_compile *c);
struct qpu_reg *v3d_register_allocate(struct v3d_compile *c);
bool vir_init_reg_sets(struct v3d_compiler *compiler);

int v3d_shaderdb_dump(struct v3d_compile *c, char **shaderdb_str);

bool v3d_gl_format_is_return_32(enum pipe_format format);

uint32_t
v3d_get_op_for_atomic_add(nir_intrinsic_instr *instr, unsigned src);

static inline bool
quniform_contents_is_texture_p0(enum quniform_contents contents)
{
        return (contents >= QUNIFORM_TEXTURE_CONFIG_P0_0 &&
                contents < (QUNIFORM_TEXTURE_CONFIG_P0_0 +
                            V3D_MAX_TEXTURE_SAMPLERS));
}

static inline bool
vir_in_nonuniform_control_flow(struct v3d_compile *c)
{
        return c->execute.file != QFILE_NULL;
}

static inline struct qreg
vir_uniform_ui(struct v3d_compile *c, uint32_t ui)
{
        return vir_uniform(c, QUNIFORM_CONSTANT, ui);
}

static inline struct qreg
vir_uniform_f(struct v3d_compile *c, float f)
{
        return vir_uniform(c, QUNIFORM_CONSTANT, fui(f));
}
#define VIR_ALU0(name, vir_inst, op)                                      \
static inline struct qreg                                                 \
vir_##name(struct v3d_compile *c)                                         \
{                                                                         \
        return vir_emit_def(c, vir_inst(op, c->undef,                     \
                                        c->undef, c->undef));             \
}                                                                         \
static inline struct qinst *                                              \
vir_##name##_dest(struct v3d_compile *c, struct qreg dest)                \
{                                                                         \
        return vir_emit_nondef(c, vir_inst(op, dest,                      \
                                           c->undef, c->undef));          \
}

#define VIR_ALU1(name, vir_inst, op)                                      \
static inline struct qreg                                                 \
vir_##name(struct v3d_compile *c, struct qreg a)                          \
{                                                                         \
        return vir_emit_def(c, vir_inst(op, c->undef,                     \
                                        a, c->undef));                    \
}                                                                         \
static inline struct qinst *                                              \
vir_##name##_dest(struct v3d_compile *c, struct qreg dest,                \
                  struct qreg a)                                          \
{                                                                         \
        return vir_emit_nondef(c, vir_inst(op, dest, a,                   \
                                           c->undef));                    \
}

#define VIR_ALU2(name, vir_inst, op)                                      \
static inline struct qreg                                                 \
vir_##name(struct v3d_compile *c, struct qreg a, struct qreg b)           \
{                                                                         \
        return vir_emit_def(c, vir_inst(op, c->undef, a, b));             \
}                                                                         \
static inline struct qinst *                                              \
vir_##name##_dest(struct v3d_compile *c, struct qreg dest,                \
                  struct qreg a, struct qreg b)                           \
{                                                                         \
        return vir_emit_nondef(c, vir_inst(op, dest, a, b));              \
}

#define VIR_NODST_0(name, vir_inst, op)                                   \
static inline struct qinst *                                              \
vir_##name(struct v3d_compile *c)                                         \
{                                                                         \
        return vir_emit_nondef(c, vir_inst(op, c->undef,                  \
                                           c->undef, c->undef));          \
}

#define VIR_NODST_1(name, vir_inst, op)                                   \
static inline struct qinst *                                              \
vir_##name(struct v3d_compile *c, struct qreg a)                          \
{                                                                         \
        return vir_emit_nondef(c, vir_inst(op, c->undef,                  \
                                           a, c->undef));                 \
}

#define VIR_NODST_2(name, vir_inst, op)                                   \
static inline struct qinst *                                              \
vir_##name(struct v3d_compile *c, struct qreg a, struct qreg b)           \
{                                                                         \
        return vir_emit_nondef(c, vir_inst(op, c->undef,                  \
                                           a, b));                        \
}

#define VIR_SFU(name)                                                     \
static inline struct qreg                                                 \
vir_##name(struct v3d_compile *c, struct qreg a)                          \
{                                                                         \
        if (c->devinfo->ver >= 41) {                                      \
                return vir_emit_def(c, vir_add_inst(V3D_QPU_A_##name,     \
                                                    c->undef,             \
                                                    a, c->undef));        \
        } else {                                                          \
                vir_FMOV_dest(c, vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_##name), a); \
                return vir_FMOV(c, vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_R4)); \
        }                                                                 \
}                                                                         \
static inline struct qinst *                                              \
vir_##name##_dest(struct v3d_compile *c, struct qreg dest,                \
                  struct qreg a)                                          \
{                                                                         \
        if (c->devinfo->ver >= 41) {                                      \
                return vir_emit_nondef(c, vir_add_inst(V3D_QPU_A_##name,  \
                                                       dest,              \
                                                       a, c->undef));     \
        } else {                                                          \
                vir_FMOV_dest(c, vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_##name), a); \
                return vir_FMOV_dest(c, dest, vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_R4)); \
        }                                                                 \
}

#define VIR_A_ALU2(name) VIR_ALU2(name, vir_add_inst, V3D_QPU_A_##name)
#define VIR_M_ALU2(name) VIR_ALU2(name, vir_mul_inst, V3D_QPU_M_##name)
#define VIR_A_ALU1(name) VIR_ALU1(name, vir_add_inst, V3D_QPU_A_##name)
#define VIR_M_ALU1(name) VIR_ALU1(name, vir_mul_inst, V3D_QPU_M_##name)
#define VIR_A_ALU0(name) VIR_ALU0(name, vir_add_inst, V3D_QPU_A_##name)
#define VIR_M_ALU0(name) VIR_ALU0(name, vir_mul_inst, V3D_QPU_M_##name)
#define VIR_A_NODST_2(name) VIR_NODST_2(name, vir_add_inst, V3D_QPU_A_##name)
#define VIR_M_NODST_2(name) VIR_NODST_2(name, vir_mul_inst, V3D_QPU_M_##name)
#define VIR_A_NODST_1(name) VIR_NODST_1(name, vir_add_inst, V3D_QPU_A_##name)
#define VIR_M_NODST_1(name) VIR_NODST_1(name, vir_mul_inst, V3D_QPU_M_##name)
#define VIR_A_NODST_0(name) VIR_NODST_0(name, vir_add_inst, V3D_QPU_A_##name)
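/*
 * Each instantiation below expands to a pair of inline emit helpers: for
 * example, VIR_A_ALU2(FADD) defines vir_FADD(c, a, b), which emits the add
 * and returns a qreg holding its result, and vir_FADD_dest(c, dest, a, b),
 * which writes into a caller-provided destination.
 */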
VIR_A_ALU2(FADD)
VIR_A_ALU2(VFPACK)
VIR_A_ALU2(FSUB)
VIR_A_ALU2(FMIN)
VIR_A_ALU2(FMAX)

VIR_A_ALU2(ADD)
VIR_A_ALU2(SUB)
VIR_A_ALU2(SHL)
VIR_A_ALU2(SHR)
VIR_A_ALU2(ASR)
VIR_A_ALU2(ROR)
VIR_A_ALU2(MIN)
VIR_A_ALU2(MAX)
VIR_A_ALU2(UMIN)
VIR_A_ALU2(UMAX)
VIR_A_ALU2(AND)
VIR_A_ALU2(OR)
VIR_A_ALU2(XOR)
VIR_A_ALU2(VADD)
VIR_A_ALU2(VSUB)
VIR_A_NODST_2(STVPMV)
VIR_A_NODST_2(STVPMD)
VIR_A_ALU1(NOT)
VIR_A_ALU1(NEG)
VIR_A_ALU1(FLAPUSH)
VIR_A_ALU1(FLBPUSH)
VIR_A_ALU1(FLPOP)
VIR_A_ALU0(FLAFIRST)
VIR_A_ALU0(FLNAFIRST)
VIR_A_ALU1(SETMSF)
VIR_A_ALU1(SETREVF)
VIR_A_ALU0(TIDX)
VIR_A_ALU0(EIDX)
VIR_A_ALU1(LDVPMV_IN)
VIR_A_ALU1(LDVPMV_OUT)
VIR_A_ALU1(LDVPMD_IN)
VIR_A_ALU1(LDVPMD_OUT)
VIR_A_ALU2(LDVPMG_IN)
VIR_A_ALU2(LDVPMG_OUT)
VIR_A_ALU0(TMUWT)

VIR_A_ALU0(IID)
VIR_A_ALU0(FXCD)
VIR_A_ALU0(XCD)
VIR_A_ALU0(FYCD)
VIR_A_ALU0(YCD)
VIR_A_ALU0(MSF)
VIR_A_ALU0(REVF)
VIR_A_ALU0(BARRIERID)
VIR_A_ALU0(SAMPID)
VIR_A_NODST_1(VPMSETUP)
VIR_A_NODST_0(VPMWT)
VIR_A_ALU2(FCMP)
VIR_A_ALU2(VFMAX)

VIR_A_ALU1(FROUND)
VIR_A_ALU1(FTOIN)
VIR_A_ALU1(FTRUNC)
VIR_A_ALU1(FTOIZ)
VIR_A_ALU1(FFLOOR)
VIR_A_ALU1(FTOUZ)
VIR_A_ALU1(FCEIL)
VIR_A_ALU1(FTOC)

VIR_A_ALU1(FDX)
VIR_A_ALU1(FDY)

VIR_A_ALU1(ITOF)
VIR_A_ALU1(CLZ)
VIR_A_ALU1(UTOF)

VIR_M_ALU2(UMUL24)
VIR_M_ALU2(FMUL)
VIR_M_ALU2(SMUL24)
VIR_M_NODST_2(MULTOP)

VIR_M_ALU1(MOV)
VIR_M_ALU1(FMOV)

VIR_SFU(RECIP)
VIR_SFU(RSQRT)
VIR_SFU(EXP)
VIR_SFU(LOG)
VIR_SFU(SIN)
VIR_SFU(RSQRT2)

static inline struct qinst *
vir_MOV_cond(struct v3d_compile *c, enum v3d_qpu_cond cond,
             struct qreg dest, struct qreg src)
{
        struct qinst *mov = vir_MOV_dest(c, dest, src);
        vir_set_cond(mov, cond);
        return mov;
}

static inline struct qreg
vir_SEL(struct v3d_compile *c, enum v3d_qpu_cond cond,
        struct qreg src0, struct qreg src1)
{
        struct qreg t = vir_get_temp(c);
        vir_MOV_dest(c, t, src1);
        vir_MOV_cond(c, cond, t, src0);
        return t;
}
static inline struct qinst *
vir_NOP(struct v3d_compile *c)
{
        return vir_emit_nondef(c, vir_add_inst(V3D_QPU_A_NOP,
                                               c->undef, c->undef, c->undef));
}

static inline struct qreg
vir_LDTMU(struct v3d_compile *c)
{
        if (c->devinfo->ver >= 41) {
                struct qinst *ldtmu = vir_add_inst(V3D_QPU_A_NOP, c->undef,
                                                   c->undef, c->undef);
                ldtmu->qpu.sig.ldtmu = true;

                return vir_emit_def(c, ldtmu);
        } else {
                vir_NOP(c)->qpu.sig.ldtmu = true;
                return vir_MOV(c, vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_R4));
        }
}

static inline struct qreg
vir_UMUL(struct v3d_compile *c, struct qreg src0, struct qreg src1)
{
        vir_MULTOP(c, src0, src1);
        return vir_UMUL24(c, src0, src1);
}

static inline struct qreg
vir_TLBU_COLOR_READ(struct v3d_compile *c, uint32_t config)
{
        assert(c->devinfo->ver >= 41); /* XXX */
        assert((config & 0xffffff00) == 0xffffff00);

        struct qinst *ldtlb = vir_add_inst(V3D_QPU_A_NOP, c->undef,
                                           c->undef, c->undef);
        ldtlb->qpu.sig.ldtlbu = true;
        ldtlb->uniform = vir_get_uniform_index(c, QUNIFORM_CONSTANT, config);
        return vir_emit_def(c, ldtlb);
}

static inline struct qreg
vir_TLB_COLOR_READ(struct v3d_compile *c)
{
        assert(c->devinfo->ver >= 41); /* XXX */

        struct qinst *ldtlb = vir_add_inst(V3D_QPU_A_NOP, c->undef,
                                           c->undef, c->undef);
        ldtlb->qpu.sig.ldtlb = true;
        return vir_emit_def(c, ldtlb);
}

static inline struct qinst *
vir_BRANCH(struct v3d_compile *c, enum v3d_qpu_branch_cond cond)
{
        /* The actual uniform_data value will be set at scheduling time */
        return vir_emit_nondef(c, vir_branch_inst(c, cond));
}

#define vir_for_each_block(block, c)                                    \
        list_for_each_entry(struct qblock, block, &c->blocks, link)

#define vir_for_each_block_rev(block, c)                                \
        list_for_each_entry_rev(struct qblock, block, &c->blocks, link)

/* Loop over the non-NULL members of the successors array. */
#define vir_for_each_successor(succ, block)                             \
        for (struct qblock *succ = block->successors[0];                \
             succ != NULL;                                              \
             succ = (succ == block->successors[1] ? NULL :              \
                     block->successors[1]))

#define vir_for_each_inst(inst, block)                                  \
        list_for_each_entry(struct qinst, inst, &block->instructions, link)

#define vir_for_each_inst_rev(inst, block)                              \
        list_for_each_entry_rev(struct qinst, inst, &block->instructions, link)

#define vir_for_each_inst_safe(inst, block)                             \
        list_for_each_entry_safe(struct qinst, inst, &block->instructions, link)

#define vir_for_each_inst_inorder(inst, c)                              \
        vir_for_each_block(_block, c)                                   \
                vir_for_each_inst(inst, _block)

#define vir_for_each_inst_inorder_safe(inst, c)                         \
        vir_for_each_block(_block, c)                                   \
                vir_for_each_inst_safe(inst, _block)

#endif /* V3D_COMPILER_H */