1/* 2 * Copyright (C) 2022 Collabora Ltd. 3 * 4 * Permission is hereby granted, free of charge, to any person obtaining a 5 * copy of this software and associated documentation files (the "Software"), 6 * to deal in the Software without restriction, including without limitation 7 * the rights to use, copy, modify, merge, publish, distribute, sublicense, 8 * and/or sell copies of the Software, and to permit persons to whom the 9 * Software is furnished to do so, subject to the following conditions: 10 * 11 * The above copyright notice and this permission notice (including the next 12 * paragraph) shall be included in all copies or substantial portions of the 13 * Software. 14 * 15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 * SOFTWARE. 22 */ 23 24#include "va_compiler.h" 25#include "valhall_enums.h" 26#include "bi_builder.h" 27 28/* 29 * Merge NOPs with flow control with nearby instructions to eliminate the NOPs, 30 * according to the following rules: 31 * 32 * 1. Waits may be combined by waiting on a union of the slots. 33 * 2. Waits may be moved up as far as the first (last) asynchronous instruction 34 * writing to a slot waited on, at a performance cost. 35 * 3. Discard may be moved down, at a performance cost. 36 * 4. Reconverge must be on the last instruction of the block. 37 * 5. End must be on the last instruction of the block. 38 * 39 * For simplicity, this pass only merges within a single basic block. This 40 * should be sufficient. The algorithm works as follows. 41 * 42 * Because reconverge/end must be on the last instruction, there may be only 43 * one of these in a block. First check for a NOP at the end, and if it has 44 * reconverge/end flow control, merge with the penultimate instruction. Now we 45 * only have to worry about waits and discard. 46 * 47 * The cost of moving a wait up is assumed to be greater than the cost of moving 48 * a discard down, so we next move waits while we have more freedom. For each 49 * wait, merge with an instruction above that either is free or has another 50 * wait, and merge the flow control, aborting if we hit a message signaling the 51 * relevant slot. By maintaining the candidate instruction at all steps, this is 52 * implemented in a linear time walk. 53 * 54 * Finally, all that's left are discards. Move discards down to merge with a 55 * free instruction with a similar linear time walk. 56 * 57 * Since discarding helper threads is an optimization (not required for 58 * correctness), we may remove discards if we believe them harmful to the 59 * performance of the shader. Keeping with the local structure of the pass, we 60 * remove discards only if they are at the end of the last block and cannot be 61 * merged with any other instruction. Such a condition means every following 62 * instruction has incompatible flow control (wait and end). In practice, this 63 * means BLEND and ATEST run for helper threads in certain shaders to save a 64 * NOP, but BLEND is a no-op for helper threads anyway. 65 * 66 * One case we don't handle is merging "foo, bar, wait, reconverge" to 67 * "foo.wait, bar.reconverge". This sequence is rarely generated by the dataflow 68 * analysis, so we don't care for the complexity. 69 */ 70 71/* 72 * If the last instruction has reconverge or end, try to merge with the 73 * second to last instruction. 74 * 75 * Precondition: block has at least 2 instructions. 76 */ 77static void 78merge_end_reconverge(bi_block *block) 79{ 80 bi_instr *last = list_last_entry(&block->instructions, bi_instr, link); 81 bi_instr *penult = bi_prev_op(last); 82 83 if (last->op != BI_OPCODE_NOP) return; 84 if (last->flow != VA_FLOW_RECONVERGE && last->flow != VA_FLOW_END) return; 85 86 /* End implies all other flow control except for waiting on barriers (slot 87 * #7, with VA_FLOW_WAIT), so remove blocking flow control. 88 */ 89 if (last->flow == VA_FLOW_END) { 90 while (penult->op == BI_OPCODE_NOP && penult->flow != VA_FLOW_WAIT) { 91 bi_remove_instruction(penult); 92 93 /* There may be nothing left */ 94 if (list_is_singular(&block->instructions)) 95 return; 96 97 penult = bi_prev_op(last); 98 } 99 } 100 101 /* If there is blocking flow control, we can't merge */ 102 if (penult->flow != VA_FLOW_NONE) return; 103 104 /* Else, merge */ 105 penult->flow = last->flow; 106 bi_remove_instruction(last); 107} 108 109/* 110 * Calculate the union of two waits. We may wait on any combination of slots #0, 111 * #1, #2 or the entirety of 0126 and 01267. If we wait on the entirety, the 112 * union is trivial. If we do not wait on slot #6 (by extension slot #7), we 113 * wait only on slots #0, #1, and #2, for which the waits are encoded as a 114 * bitset and the union is just a bitwise OR. 115 */ 116static enum va_flow 117union_waits(enum va_flow x, enum va_flow y) 118{ 119 assert(va_flow_is_wait_or_none(x) && va_flow_is_wait_or_none(y)); 120 121 if ((x == VA_FLOW_WAIT) || (y == VA_FLOW_WAIT)) 122 return VA_FLOW_WAIT; 123 else if ((x == VA_FLOW_WAIT0126) || (y == VA_FLOW_WAIT0126)) 124 return VA_FLOW_WAIT0126; 125 else 126 return x | y; 127} 128 129static void 130merge_waits(bi_block *block) 131{ 132 /* Most recent instruction with which we can merge, or NULL if none */ 133 bi_instr *last_free = NULL; 134 135 bi_foreach_instr_in_block_safe(block, I) { 136 if (last_free != NULL && 137 I->op == BI_OPCODE_NOP && va_flow_is_wait_or_none(I->flow)) { 138 139 /* Merge waits with compatible instructions */ 140 last_free->flow = union_waits(last_free->flow, I->flow); 141 bi_remove_instruction(I); 142 continue; 143 } 144 145 /* Don't move waits past async instructions, since they might be what 146 * we're waiting for. If we wanted to optimize this case, we could check 147 * the signaled slots. 148 */ 149 if (bi_opcode_props[I->op].message) 150 last_free = NULL; 151 152 /* We can only merge with instructions whose flow control is a wait. 153 * This includes such an instruction after merging in a wait. It also 154 * includes async instructions. 155 */ 156 if (va_flow_is_wait_or_none(I->flow)) 157 last_free = I; 158 } 159} 160 161static bool 162bi_is_first_instr(bi_block *block, bi_instr *I) 163{ 164 return block->instructions.next == &I->link; 165} 166 167static void 168merge_discard(bi_block *block) 169{ 170 bi_instr *last_free = NULL; 171 172 bi_foreach_instr_in_block_safe_rev(block, I) { 173 if ((I->op == BI_OPCODE_NOP) && (I->flow == VA_FLOW_DISCARD)) { 174 /* Try to merge with the instruction *preceding* discard, because 175 * because flow control happens at the end of an instruction and 176 * discard is a NOP. 177 */ 178 if (!bi_is_first_instr(block, I)) { 179 bi_instr *prev = bi_prev_op(I); 180 181 if (prev->flow == VA_FLOW_NONE) { 182 prev->flow = VA_FLOW_DISCARD; 183 bi_remove_instruction(I); 184 continue; 185 } 186 } 187 188 /* Or try to merge with the next instruction with no flow control */ 189 if (last_free != NULL) { 190 last_free->flow = VA_FLOW_DISCARD; 191 bi_remove_instruction(I); 192 continue; 193 } 194 195 /* If there's nowhere to merge and this is the end of the shader, just 196 * remove the discard. 197 */ 198 if (!block->successors[0] && !block->successors[1]) { 199 bi_remove_instruction(I); 200 continue; 201 } 202 } 203 204 /* Allow merging into free instructions */ 205 if (I->flow == VA_FLOW_NONE) 206 last_free = I; 207 } 208} 209 210void 211va_merge_flow(bi_context *ctx) 212{ 213 bi_foreach_block(ctx, block) { 214 /* If there are less than 2 instructions, there's nothing to merge */ 215 if (list_is_empty(&block->instructions)) continue; 216 if (list_is_singular(&block->instructions)) continue; 217 218 merge_end_reconverge(block); 219 merge_waits(block); 220 221 if (ctx->stage == MESA_SHADER_FRAGMENT && !ctx->inputs->is_blend) 222 merge_discard(block); 223 } 224} 225