1bf215546Sopenharmony_ci/* 2bf215546Sopenharmony_ci * Copyright (C) 2019 Google, Inc. 3bf215546Sopenharmony_ci * 4bf215546Sopenharmony_ci * Permission is hereby granted, free of charge, to any person obtaining a 5bf215546Sopenharmony_ci * copy of this software and associated documentation files (the "Software"), 6bf215546Sopenharmony_ci * to deal in the Software without restriction, including without limitation 7bf215546Sopenharmony_ci * the rights to use, copy, modify, merge, publish, distribute, sublicense, 8bf215546Sopenharmony_ci * and/or sell copies of the Software, and to permit persons to whom the 9bf215546Sopenharmony_ci * Software is furnished to do so, subject to the following conditions: 10bf215546Sopenharmony_ci * 11bf215546Sopenharmony_ci * The above copyright notice and this permission notice (including the next 12bf215546Sopenharmony_ci * paragraph) shall be included in all copies or substantial portions of the 13bf215546Sopenharmony_ci * Software. 14bf215546Sopenharmony_ci * 15bf215546Sopenharmony_ci * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16bf215546Sopenharmony_ci * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17bf215546Sopenharmony_ci * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 18bf215546Sopenharmony_ci * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19bf215546Sopenharmony_ci * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20bf215546Sopenharmony_ci * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21bf215546Sopenharmony_ci * SOFTWARE. 22bf215546Sopenharmony_ci * 23bf215546Sopenharmony_ci * Authors: 24bf215546Sopenharmony_ci * Rob Clark <robclark@freedesktop.org> 25bf215546Sopenharmony_ci */ 26bf215546Sopenharmony_ci 27bf215546Sopenharmony_ci#include "ir3.h" 28bf215546Sopenharmony_ci 29bf215546Sopenharmony_ci/* The maximum number of nop's we may need to insert between two instructions. 30bf215546Sopenharmony_ci */ 31bf215546Sopenharmony_ci#define MAX_NOPS 6 32bf215546Sopenharmony_ci 33bf215546Sopenharmony_ci/* 34bf215546Sopenharmony_ci * Helpers to figure out the necessary delay slots between instructions. Used 35bf215546Sopenharmony_ci * both in scheduling pass(es) and the final pass to insert any required nop's 36bf215546Sopenharmony_ci * so that the shader program is valid. 37bf215546Sopenharmony_ci * 38bf215546Sopenharmony_ci * Note that this needs to work both pre and post RA, so we can't assume ssa 39bf215546Sopenharmony_ci * src iterators work. 40bf215546Sopenharmony_ci */ 41bf215546Sopenharmony_ci 42bf215546Sopenharmony_ci/* calculate required # of delay slots between the instruction that 43bf215546Sopenharmony_ci * assigns a value and the one that consumes 44bf215546Sopenharmony_ci */ 45bf215546Sopenharmony_ciint 46bf215546Sopenharmony_ciir3_delayslots(struct ir3_instruction *assigner, 47bf215546Sopenharmony_ci struct ir3_instruction *consumer, unsigned n, bool soft) 48bf215546Sopenharmony_ci{ 49bf215546Sopenharmony_ci /* generally don't count false dependencies, since this can just be 50bf215546Sopenharmony_ci * something like a barrier, or SSBO store. 51bf215546Sopenharmony_ci */ 52bf215546Sopenharmony_ci if (__is_false_dep(consumer, n)) 53bf215546Sopenharmony_ci return 0; 54bf215546Sopenharmony_ci 55bf215546Sopenharmony_ci /* worst case is cat1-3 (alu) -> cat4/5 needing 6 cycles, normal 56bf215546Sopenharmony_ci * alu -> alu needs 3 cycles, cat4 -> alu and texture fetch 57bf215546Sopenharmony_ci * handled with sync bits 58bf215546Sopenharmony_ci */ 59bf215546Sopenharmony_ci 60bf215546Sopenharmony_ci if (is_meta(assigner) || is_meta(consumer)) 61bf215546Sopenharmony_ci return 0; 62bf215546Sopenharmony_ci 63bf215546Sopenharmony_ci if (writes_addr0(assigner) || writes_addr1(assigner)) 64bf215546Sopenharmony_ci return 6; 65bf215546Sopenharmony_ci 66bf215546Sopenharmony_ci if (soft && is_ss_producer(assigner)) 67bf215546Sopenharmony_ci return soft_ss_delay(assigner); 68bf215546Sopenharmony_ci 69bf215546Sopenharmony_ci /* handled via sync flags: */ 70bf215546Sopenharmony_ci if (is_ss_producer(assigner) || is_sy_producer(assigner)) 71bf215546Sopenharmony_ci return 0; 72bf215546Sopenharmony_ci 73bf215546Sopenharmony_ci /* As far as we know, shader outputs don't need any delay. */ 74bf215546Sopenharmony_ci if (consumer->opc == OPC_END || consumer->opc == OPC_CHMASK) 75bf215546Sopenharmony_ci return 0; 76bf215546Sopenharmony_ci 77bf215546Sopenharmony_ci /* assigner must be alu: */ 78bf215546Sopenharmony_ci if (is_flow(consumer) || is_sfu(consumer) || is_tex(consumer) || 79bf215546Sopenharmony_ci is_mem(consumer)) { 80bf215546Sopenharmony_ci return 6; 81bf215546Sopenharmony_ci } else { 82bf215546Sopenharmony_ci /* In mergedregs mode, there is an extra 2-cycle penalty when half of 83bf215546Sopenharmony_ci * a full-reg is read as a half-reg or when a half-reg is read as a 84bf215546Sopenharmony_ci * full-reg. 85bf215546Sopenharmony_ci */ 86bf215546Sopenharmony_ci bool mismatched_half = (assigner->dsts[0]->flags & IR3_REG_HALF) != 87bf215546Sopenharmony_ci (consumer->srcs[n]->flags & IR3_REG_HALF); 88bf215546Sopenharmony_ci unsigned penalty = mismatched_half ? 3 : 0; 89bf215546Sopenharmony_ci if ((is_mad(consumer->opc) || is_madsh(consumer->opc)) && (n == 2)) { 90bf215546Sopenharmony_ci /* special case, 3rd src to cat3 not required on first cycle */ 91bf215546Sopenharmony_ci return 1 + penalty; 92bf215546Sopenharmony_ci } else { 93bf215546Sopenharmony_ci return 3 + penalty; 94bf215546Sopenharmony_ci } 95bf215546Sopenharmony_ci } 96bf215546Sopenharmony_ci} 97bf215546Sopenharmony_ci 98bf215546Sopenharmony_cistatic bool 99bf215546Sopenharmony_cicount_instruction(struct ir3_instruction *n) 100bf215546Sopenharmony_ci{ 101bf215546Sopenharmony_ci /* NOTE: don't count branch/jump since we don't know yet if they will 102bf215546Sopenharmony_ci * be eliminated later in resolve_jumps().. really should do that 103bf215546Sopenharmony_ci * earlier so we don't have this constraint. 104bf215546Sopenharmony_ci */ 105bf215546Sopenharmony_ci return is_alu(n) || 106bf215546Sopenharmony_ci (is_flow(n) && (n->opc != OPC_JUMP) && (n->opc != OPC_B)); 107bf215546Sopenharmony_ci} 108bf215546Sopenharmony_ci 109bf215546Sopenharmony_ci/* Post-RA, we don't have arrays any more, so we have to be a bit careful here 110bf215546Sopenharmony_ci * and have to handle relative accesses specially. 111bf215546Sopenharmony_ci */ 112bf215546Sopenharmony_ci 113bf215546Sopenharmony_cistatic unsigned 114bf215546Sopenharmony_cipost_ra_reg_elems(struct ir3_register *reg) 115bf215546Sopenharmony_ci{ 116bf215546Sopenharmony_ci if (reg->flags & IR3_REG_RELATIV) 117bf215546Sopenharmony_ci return reg->size; 118bf215546Sopenharmony_ci return reg_elems(reg); 119bf215546Sopenharmony_ci} 120bf215546Sopenharmony_ci 121bf215546Sopenharmony_cistatic unsigned 122bf215546Sopenharmony_cipost_ra_reg_num(struct ir3_register *reg) 123bf215546Sopenharmony_ci{ 124bf215546Sopenharmony_ci if (reg->flags & IR3_REG_RELATIV) 125bf215546Sopenharmony_ci return reg->array.base; 126bf215546Sopenharmony_ci return reg->num; 127bf215546Sopenharmony_ci} 128bf215546Sopenharmony_ci 129bf215546Sopenharmony_ciunsigned 130bf215546Sopenharmony_ciir3_delayslots_with_repeat(struct ir3_instruction *assigner, 131bf215546Sopenharmony_ci struct ir3_instruction *consumer, 132bf215546Sopenharmony_ci unsigned assigner_n, unsigned consumer_n) 133bf215546Sopenharmony_ci{ 134bf215546Sopenharmony_ci unsigned delay = ir3_delayslots(assigner, consumer, consumer_n, false); 135bf215546Sopenharmony_ci 136bf215546Sopenharmony_ci struct ir3_register *src = consumer->srcs[consumer_n]; 137bf215546Sopenharmony_ci struct ir3_register *dst = assigner->dsts[assigner_n]; 138bf215546Sopenharmony_ci 139bf215546Sopenharmony_ci if (assigner->repeat == 0 && consumer->repeat == 0) 140bf215546Sopenharmony_ci return delay; 141bf215546Sopenharmony_ci 142bf215546Sopenharmony_ci unsigned src_start = post_ra_reg_num(src) * reg_elem_size(src); 143bf215546Sopenharmony_ci unsigned dst_start = post_ra_reg_num(dst) * reg_elem_size(dst); 144bf215546Sopenharmony_ci 145bf215546Sopenharmony_ci /* If either side is a relative access, we can't really apply most of the 146bf215546Sopenharmony_ci * reasoning below because we don't know which component aliases which. 147bf215546Sopenharmony_ci * Just bail in this case. 148bf215546Sopenharmony_ci */ 149bf215546Sopenharmony_ci if ((src->flags & IR3_REG_RELATIV) || (dst->flags & IR3_REG_RELATIV)) 150bf215546Sopenharmony_ci return delay; 151bf215546Sopenharmony_ci 152bf215546Sopenharmony_ci /* MOVMSK seems to require that all users wait until the entire 153bf215546Sopenharmony_ci * instruction is finished, so just bail here. 154bf215546Sopenharmony_ci */ 155bf215546Sopenharmony_ci if (assigner->opc == OPC_MOVMSK) 156bf215546Sopenharmony_ci return delay; 157bf215546Sopenharmony_ci 158bf215546Sopenharmony_ci bool mismatched_half = 159bf215546Sopenharmony_ci (src->flags & IR3_REG_HALF) != (dst->flags & IR3_REG_HALF); 160bf215546Sopenharmony_ci 161bf215546Sopenharmony_ci /* TODO: Handle the combination of (rpt) and different component sizes 162bf215546Sopenharmony_ci * better like below. This complicates things significantly because the 163bf215546Sopenharmony_ci * components don't line up. 164bf215546Sopenharmony_ci */ 165bf215546Sopenharmony_ci if (mismatched_half) 166bf215546Sopenharmony_ci return delay; 167bf215546Sopenharmony_ci 168bf215546Sopenharmony_ci /* If an instruction has a (rpt), then it acts as a sequence of 169bf215546Sopenharmony_ci * instructions, reading its non-(r) sources at each cycle. First, get the 170bf215546Sopenharmony_ci * register num for the first instruction where they interfere: 171bf215546Sopenharmony_ci */ 172bf215546Sopenharmony_ci 173bf215546Sopenharmony_ci unsigned first_num = MAX2(src_start, dst_start) / reg_elem_size(dst); 174bf215546Sopenharmony_ci 175bf215546Sopenharmony_ci /* Now, for that first conflicting half/full register, figure out the 176bf215546Sopenharmony_ci * sub-instruction within assigner/consumer it corresponds to. For (r) 177bf215546Sopenharmony_ci * sources, this should already return the correct answer of 0. However we 178bf215546Sopenharmony_ci * have to special-case the multi-mov instructions, where the 179bf215546Sopenharmony_ci * sub-instructions sometimes come from the src/dst indices instead. 180bf215546Sopenharmony_ci */ 181bf215546Sopenharmony_ci unsigned first_src_instr; 182bf215546Sopenharmony_ci if (consumer->opc == OPC_SWZ || consumer->opc == OPC_GAT) 183bf215546Sopenharmony_ci first_src_instr = consumer_n; 184bf215546Sopenharmony_ci else 185bf215546Sopenharmony_ci first_src_instr = first_num - src->num; 186bf215546Sopenharmony_ci 187bf215546Sopenharmony_ci unsigned first_dst_instr; 188bf215546Sopenharmony_ci if (assigner->opc == OPC_SWZ || assigner->opc == OPC_SCT) 189bf215546Sopenharmony_ci first_dst_instr = assigner_n; 190bf215546Sopenharmony_ci else 191bf215546Sopenharmony_ci first_dst_instr = first_num - dst->num; 192bf215546Sopenharmony_ci 193bf215546Sopenharmony_ci /* The delay we return is relative to the *end* of assigner and the 194bf215546Sopenharmony_ci * *beginning* of consumer, because it's the number of nops (or other 195bf215546Sopenharmony_ci * things) needed between them. Any instructions after first_dst_instr 196bf215546Sopenharmony_ci * subtract from the delay, and so do any instructions before 197bf215546Sopenharmony_ci * first_src_instr. Calculate an offset to subtract from the non-rpt-aware 198bf215546Sopenharmony_ci * delay to account for that. 199bf215546Sopenharmony_ci * 200bf215546Sopenharmony_ci * Now, a priori, we need to go through this process for every 201bf215546Sopenharmony_ci * conflicting regnum and take the minimum of the offsets to make sure 202bf215546Sopenharmony_ci * that the appropriate number of nop's is inserted for every conflicting 203bf215546Sopenharmony_ci * pair of sub-instructions. However, as we go to the next conflicting 204bf215546Sopenharmony_ci * regnum (if any), the number of instructions after first_dst_instr 205bf215546Sopenharmony_ci * decreases by 1 and the number of source instructions before 206bf215546Sopenharmony_ci * first_src_instr correspondingly increases by 1, so the offset stays the 207bf215546Sopenharmony_ci * same for all conflicting registers. 208bf215546Sopenharmony_ci */ 209bf215546Sopenharmony_ci unsigned offset = first_src_instr + (assigner->repeat - first_dst_instr); 210bf215546Sopenharmony_ci return offset > delay ? 0 : delay - offset; 211bf215546Sopenharmony_ci} 212bf215546Sopenharmony_ci 213bf215546Sopenharmony_cistatic unsigned 214bf215546Sopenharmony_cidelay_calc_srcn(struct ir3_instruction *assigner, 215bf215546Sopenharmony_ci struct ir3_instruction *consumer, unsigned assigner_n, 216bf215546Sopenharmony_ci unsigned consumer_n, bool mergedregs) 217bf215546Sopenharmony_ci{ 218bf215546Sopenharmony_ci struct ir3_register *src = consumer->srcs[consumer_n]; 219bf215546Sopenharmony_ci struct ir3_register *dst = assigner->dsts[assigner_n]; 220bf215546Sopenharmony_ci bool mismatched_half = 221bf215546Sopenharmony_ci (src->flags & IR3_REG_HALF) != (dst->flags & IR3_REG_HALF); 222bf215546Sopenharmony_ci 223bf215546Sopenharmony_ci /* In the mergedregs case or when the register is a special register, 224bf215546Sopenharmony_ci * half-registers do not alias with full registers. 225bf215546Sopenharmony_ci */ 226bf215546Sopenharmony_ci if ((!mergedregs || is_reg_special(src) || is_reg_special(dst)) && 227bf215546Sopenharmony_ci mismatched_half) 228bf215546Sopenharmony_ci return 0; 229bf215546Sopenharmony_ci 230bf215546Sopenharmony_ci unsigned src_start = post_ra_reg_num(src) * reg_elem_size(src); 231bf215546Sopenharmony_ci unsigned src_end = src_start + post_ra_reg_elems(src) * reg_elem_size(src); 232bf215546Sopenharmony_ci unsigned dst_start = post_ra_reg_num(dst) * reg_elem_size(dst); 233bf215546Sopenharmony_ci unsigned dst_end = dst_start + post_ra_reg_elems(dst) * reg_elem_size(dst); 234bf215546Sopenharmony_ci 235bf215546Sopenharmony_ci if (dst_start >= src_end || src_start >= dst_end) 236bf215546Sopenharmony_ci return 0; 237bf215546Sopenharmony_ci 238bf215546Sopenharmony_ci return ir3_delayslots_with_repeat(assigner, consumer, assigner_n, consumer_n); 239bf215546Sopenharmony_ci} 240bf215546Sopenharmony_ci 241bf215546Sopenharmony_cistatic unsigned 242bf215546Sopenharmony_cidelay_calc(struct ir3_block *block, struct ir3_instruction *start, 243bf215546Sopenharmony_ci struct ir3_instruction *consumer, unsigned distance, 244bf215546Sopenharmony_ci regmask_t *in_mask, bool mergedregs) 245bf215546Sopenharmony_ci{ 246bf215546Sopenharmony_ci regmask_t mask; 247bf215546Sopenharmony_ci memcpy(&mask, in_mask, sizeof(mask)); 248bf215546Sopenharmony_ci 249bf215546Sopenharmony_ci unsigned delay = 0; 250bf215546Sopenharmony_ci /* Search backwards starting at the instruction before start, unless it's 251bf215546Sopenharmony_ci * NULL then search backwards from the block end. 252bf215546Sopenharmony_ci */ 253bf215546Sopenharmony_ci struct list_head *start_list = 254bf215546Sopenharmony_ci start ? start->node.prev : block->instr_list.prev; 255bf215546Sopenharmony_ci list_for_each_entry_from_rev (struct ir3_instruction, assigner, start_list, 256bf215546Sopenharmony_ci &block->instr_list, node) { 257bf215546Sopenharmony_ci if (count_instruction(assigner)) 258bf215546Sopenharmony_ci distance += assigner->nop; 259bf215546Sopenharmony_ci 260bf215546Sopenharmony_ci if (distance + delay >= MAX_NOPS) 261bf215546Sopenharmony_ci return delay; 262bf215546Sopenharmony_ci 263bf215546Sopenharmony_ci if (is_meta(assigner)) 264bf215546Sopenharmony_ci continue; 265bf215546Sopenharmony_ci 266bf215546Sopenharmony_ci unsigned new_delay = 0; 267bf215546Sopenharmony_ci 268bf215546Sopenharmony_ci foreach_dst_n (dst, dst_n, assigner) { 269bf215546Sopenharmony_ci if (dst->wrmask == 0) 270bf215546Sopenharmony_ci continue; 271bf215546Sopenharmony_ci if (!regmask_get(&mask, dst)) 272bf215546Sopenharmony_ci continue; 273bf215546Sopenharmony_ci foreach_src_n (src, src_n, consumer) { 274bf215546Sopenharmony_ci if (src->flags & (IR3_REG_IMMED | IR3_REG_CONST)) 275bf215546Sopenharmony_ci continue; 276bf215546Sopenharmony_ci 277bf215546Sopenharmony_ci unsigned src_delay = delay_calc_srcn( 278bf215546Sopenharmony_ci assigner, consumer, dst_n, src_n, mergedregs); 279bf215546Sopenharmony_ci new_delay = MAX2(new_delay, src_delay); 280bf215546Sopenharmony_ci } 281bf215546Sopenharmony_ci regmask_clear(&mask, dst); 282bf215546Sopenharmony_ci } 283bf215546Sopenharmony_ci 284bf215546Sopenharmony_ci new_delay = new_delay > distance ? new_delay - distance : 0; 285bf215546Sopenharmony_ci delay = MAX2(delay, new_delay); 286bf215546Sopenharmony_ci 287bf215546Sopenharmony_ci if (count_instruction(assigner)) 288bf215546Sopenharmony_ci distance += 1 + assigner->repeat; 289bf215546Sopenharmony_ci } 290bf215546Sopenharmony_ci 291bf215546Sopenharmony_ci /* Note: this allows recursion into "block" if it has already been 292bf215546Sopenharmony_ci * visited, but *not* recursion into its predecessors. We may have to 293bf215546Sopenharmony_ci * visit the original block twice, for the loop case where we have to 294bf215546Sopenharmony_ci * consider definititons in an earlier iterations of the same loop: 295bf215546Sopenharmony_ci * 296bf215546Sopenharmony_ci * while (...) { 297bf215546Sopenharmony_ci * mov.u32u32 ..., r0.x 298bf215546Sopenharmony_ci * ... 299bf215546Sopenharmony_ci * mov.u32u32 r0.x, ... 300bf215546Sopenharmony_ci * } 301bf215546Sopenharmony_ci * 302bf215546Sopenharmony_ci * However any other recursion would be unnecessary. 303bf215546Sopenharmony_ci */ 304bf215546Sopenharmony_ci 305bf215546Sopenharmony_ci if (block->data != block) { 306bf215546Sopenharmony_ci block->data = block; 307bf215546Sopenharmony_ci 308bf215546Sopenharmony_ci for (unsigned i = 0; i < block->predecessors_count; i++) { 309bf215546Sopenharmony_ci struct ir3_block *pred = block->predecessors[i]; 310bf215546Sopenharmony_ci unsigned pred_delay = delay_calc(pred, NULL, consumer, distance, 311bf215546Sopenharmony_ci &mask, mergedregs); 312bf215546Sopenharmony_ci delay = MAX2(delay, pred_delay); 313bf215546Sopenharmony_ci } 314bf215546Sopenharmony_ci 315bf215546Sopenharmony_ci block->data = NULL; 316bf215546Sopenharmony_ci } 317bf215546Sopenharmony_ci 318bf215546Sopenharmony_ci return delay; 319bf215546Sopenharmony_ci} 320bf215546Sopenharmony_ci 321bf215546Sopenharmony_ci/** 322bf215546Sopenharmony_ci * Calculate delay for nop insertion. This must exactly match hardware 323bf215546Sopenharmony_ci * requirements, including recursing into predecessor blocks. 324bf215546Sopenharmony_ci */ 325bf215546Sopenharmony_ciunsigned 326bf215546Sopenharmony_ciir3_delay_calc(struct ir3_block *block, struct ir3_instruction *instr, 327bf215546Sopenharmony_ci bool mergedregs) 328bf215546Sopenharmony_ci{ 329bf215546Sopenharmony_ci regmask_t mask; 330bf215546Sopenharmony_ci regmask_init(&mask, mergedregs); 331bf215546Sopenharmony_ci foreach_src (src, instr) { 332bf215546Sopenharmony_ci if (!(src->flags & (IR3_REG_IMMED | IR3_REG_CONST))) 333bf215546Sopenharmony_ci regmask_set(&mask, src); 334bf215546Sopenharmony_ci } 335bf215546Sopenharmony_ci 336bf215546Sopenharmony_ci return delay_calc(block, NULL, instr, 0, &mask, mergedregs); 337bf215546Sopenharmony_ci} 338