1bf215546Sopenharmony_ci/* 2bf215546Sopenharmony_ci * Copyright © 2010 Intel Corporation 3bf215546Sopenharmony_ci * Copyright © 2014-2017 Broadcom 4bf215546Sopenharmony_ci * 5bf215546Sopenharmony_ci * Permission is hereby granted, free of charge, to any person obtaining a 6bf215546Sopenharmony_ci * copy of this software and associated documentation files (the "Software"), 7bf215546Sopenharmony_ci * to deal in the Software without restriction, including without limitation 8bf215546Sopenharmony_ci * the rights to use, copy, modify, merge, publish, distribute, sublicense, 9bf215546Sopenharmony_ci * and/or sell copies of the Software, and to permit persons to whom the 10bf215546Sopenharmony_ci * Software is furnished to do so, subject to the following conditions: 11bf215546Sopenharmony_ci * 12bf215546Sopenharmony_ci * The above copyright notice and this permission notice (including the next 13bf215546Sopenharmony_ci * paragraph) shall be included in all copies or substantial portions of the 14bf215546Sopenharmony_ci * Software. 15bf215546Sopenharmony_ci * 16bf215546Sopenharmony_ci * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 17bf215546Sopenharmony_ci * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 18bf215546Sopenharmony_ci * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 19bf215546Sopenharmony_ci * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 20bf215546Sopenharmony_ci * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 21bf215546Sopenharmony_ci * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS 22bf215546Sopenharmony_ci * IN THE SOFTWARE. 
23bf215546Sopenharmony_ci */ 24bf215546Sopenharmony_ci 25bf215546Sopenharmony_ci/** 26bf215546Sopenharmony_ci * @file 27bf215546Sopenharmony_ci * 28bf215546Sopenharmony_ci * The basic model of the list scheduler is to take a basic block, compute a 29bf215546Sopenharmony_ci * DAG of the dependencies, and make a list of the DAG heads. Heuristically 30bf215546Sopenharmony_ci * pick a DAG head, then put all the children that are now DAG heads into the 31bf215546Sopenharmony_ci * list of things to schedule. 32bf215546Sopenharmony_ci * 33bf215546Sopenharmony_ci * The goal of scheduling here is to pack pairs of operations together in a 34bf215546Sopenharmony_ci * single QPU instruction. 35bf215546Sopenharmony_ci */ 36bf215546Sopenharmony_ci 37bf215546Sopenharmony_ci#include "qpu/qpu_disasm.h" 38bf215546Sopenharmony_ci#include "v3d_compiler.h" 39bf215546Sopenharmony_ci#include "util/ralloc.h" 40bf215546Sopenharmony_ci#include "util/dag.h" 41bf215546Sopenharmony_ci 42bf215546Sopenharmony_cistatic bool debug; 43bf215546Sopenharmony_ci 44bf215546Sopenharmony_cistruct schedule_node_child; 45bf215546Sopenharmony_ci 46bf215546Sopenharmony_cistruct schedule_node { 47bf215546Sopenharmony_ci struct dag_node dag; 48bf215546Sopenharmony_ci struct list_head link; 49bf215546Sopenharmony_ci struct qinst *inst; 50bf215546Sopenharmony_ci 51bf215546Sopenharmony_ci /* Longest cycles + instruction_latency() of any parent of this node. */ 52bf215546Sopenharmony_ci uint32_t unblocked_time; 53bf215546Sopenharmony_ci 54bf215546Sopenharmony_ci /** 55bf215546Sopenharmony_ci * Minimum number of cycles from scheduling this instruction until the 56bf215546Sopenharmony_ci * end of the program, based on the slowest dependency chain through 57bf215546Sopenharmony_ci * the children. 
58bf215546Sopenharmony_ci */ 59bf215546Sopenharmony_ci uint32_t delay; 60bf215546Sopenharmony_ci 61bf215546Sopenharmony_ci /** 62bf215546Sopenharmony_ci * cycles between this instruction being scheduled and when its result 63bf215546Sopenharmony_ci * can be consumed. 64bf215546Sopenharmony_ci */ 65bf215546Sopenharmony_ci uint32_t latency; 66bf215546Sopenharmony_ci}; 67bf215546Sopenharmony_ci 68bf215546Sopenharmony_ci/* When walking the instructions in reverse, we need to swap before/after in 69bf215546Sopenharmony_ci * add_dep(). 70bf215546Sopenharmony_ci */ 71bf215546Sopenharmony_cienum direction { F, R }; 72bf215546Sopenharmony_ci 73bf215546Sopenharmony_cistruct schedule_state { 74bf215546Sopenharmony_ci const struct v3d_device_info *devinfo; 75bf215546Sopenharmony_ci struct dag *dag; 76bf215546Sopenharmony_ci struct schedule_node *last_r[6]; 77bf215546Sopenharmony_ci struct schedule_node *last_rf[64]; 78bf215546Sopenharmony_ci struct schedule_node *last_sf; 79bf215546Sopenharmony_ci struct schedule_node *last_vpm_read; 80bf215546Sopenharmony_ci struct schedule_node *last_tmu_write; 81bf215546Sopenharmony_ci struct schedule_node *last_tmu_config; 82bf215546Sopenharmony_ci struct schedule_node *last_tmu_read; 83bf215546Sopenharmony_ci struct schedule_node *last_tlb; 84bf215546Sopenharmony_ci struct schedule_node *last_vpm; 85bf215546Sopenharmony_ci struct schedule_node *last_unif; 86bf215546Sopenharmony_ci struct schedule_node *last_rtop; 87bf215546Sopenharmony_ci struct schedule_node *last_unifa; 88bf215546Sopenharmony_ci enum direction dir; 89bf215546Sopenharmony_ci /* Estimated cycle when the current instruction would start. 
*/ 90bf215546Sopenharmony_ci uint32_t time; 91bf215546Sopenharmony_ci}; 92bf215546Sopenharmony_ci 93bf215546Sopenharmony_cistatic void 94bf215546Sopenharmony_ciadd_dep(struct schedule_state *state, 95bf215546Sopenharmony_ci struct schedule_node *before, 96bf215546Sopenharmony_ci struct schedule_node *after, 97bf215546Sopenharmony_ci bool write) 98bf215546Sopenharmony_ci{ 99bf215546Sopenharmony_ci bool write_after_read = !write && state->dir == R; 100bf215546Sopenharmony_ci uintptr_t edge_data = write_after_read; 101bf215546Sopenharmony_ci 102bf215546Sopenharmony_ci if (!before || !after) 103bf215546Sopenharmony_ci return; 104bf215546Sopenharmony_ci 105bf215546Sopenharmony_ci assert(before != after); 106bf215546Sopenharmony_ci 107bf215546Sopenharmony_ci if (state->dir == F) 108bf215546Sopenharmony_ci dag_add_edge(&before->dag, &after->dag, edge_data); 109bf215546Sopenharmony_ci else 110bf215546Sopenharmony_ci dag_add_edge(&after->dag, &before->dag, edge_data); 111bf215546Sopenharmony_ci} 112bf215546Sopenharmony_ci 113bf215546Sopenharmony_cistatic void 114bf215546Sopenharmony_ciadd_read_dep(struct schedule_state *state, 115bf215546Sopenharmony_ci struct schedule_node *before, 116bf215546Sopenharmony_ci struct schedule_node *after) 117bf215546Sopenharmony_ci{ 118bf215546Sopenharmony_ci add_dep(state, before, after, false); 119bf215546Sopenharmony_ci} 120bf215546Sopenharmony_ci 121bf215546Sopenharmony_cistatic void 122bf215546Sopenharmony_ciadd_write_dep(struct schedule_state *state, 123bf215546Sopenharmony_ci struct schedule_node **before, 124bf215546Sopenharmony_ci struct schedule_node *after) 125bf215546Sopenharmony_ci{ 126bf215546Sopenharmony_ci add_dep(state, *before, after, true); 127bf215546Sopenharmony_ci *before = after; 128bf215546Sopenharmony_ci} 129bf215546Sopenharmony_ci 130bf215546Sopenharmony_cistatic bool 131bf215546Sopenharmony_ciqpu_inst_is_tlb(const struct v3d_qpu_instr *inst) 132bf215546Sopenharmony_ci{ 133bf215546Sopenharmony_ci if (inst->sig.ldtlb 
|| inst->sig.ldtlbu) 134bf215546Sopenharmony_ci return true; 135bf215546Sopenharmony_ci 136bf215546Sopenharmony_ci if (inst->type != V3D_QPU_INSTR_TYPE_ALU) 137bf215546Sopenharmony_ci return false; 138bf215546Sopenharmony_ci 139bf215546Sopenharmony_ci if (inst->alu.add.magic_write && 140bf215546Sopenharmony_ci (inst->alu.add.waddr == V3D_QPU_WADDR_TLB || 141bf215546Sopenharmony_ci inst->alu.add.waddr == V3D_QPU_WADDR_TLBU)) 142bf215546Sopenharmony_ci return true; 143bf215546Sopenharmony_ci 144bf215546Sopenharmony_ci if (inst->alu.mul.magic_write && 145bf215546Sopenharmony_ci (inst->alu.mul.waddr == V3D_QPU_WADDR_TLB || 146bf215546Sopenharmony_ci inst->alu.mul.waddr == V3D_QPU_WADDR_TLBU)) 147bf215546Sopenharmony_ci return true; 148bf215546Sopenharmony_ci 149bf215546Sopenharmony_ci return false; 150bf215546Sopenharmony_ci} 151bf215546Sopenharmony_ci 152bf215546Sopenharmony_cistatic void 153bf215546Sopenharmony_ciprocess_mux_deps(struct schedule_state *state, struct schedule_node *n, 154bf215546Sopenharmony_ci enum v3d_qpu_mux mux) 155bf215546Sopenharmony_ci{ 156bf215546Sopenharmony_ci switch (mux) { 157bf215546Sopenharmony_ci case V3D_QPU_MUX_A: 158bf215546Sopenharmony_ci add_read_dep(state, state->last_rf[n->inst->qpu.raddr_a], n); 159bf215546Sopenharmony_ci break; 160bf215546Sopenharmony_ci case V3D_QPU_MUX_B: 161bf215546Sopenharmony_ci if (!n->inst->qpu.sig.small_imm) { 162bf215546Sopenharmony_ci add_read_dep(state, 163bf215546Sopenharmony_ci state->last_rf[n->inst->qpu.raddr_b], n); 164bf215546Sopenharmony_ci } 165bf215546Sopenharmony_ci break; 166bf215546Sopenharmony_ci default: 167bf215546Sopenharmony_ci add_read_dep(state, state->last_r[mux - V3D_QPU_MUX_R0], n); 168bf215546Sopenharmony_ci break; 169bf215546Sopenharmony_ci } 170bf215546Sopenharmony_ci} 171bf215546Sopenharmony_ci 172bf215546Sopenharmony_cistatic bool 173bf215546Sopenharmony_citmu_write_is_sequence_terminator(uint32_t waddr) 174bf215546Sopenharmony_ci{ 175bf215546Sopenharmony_ci switch (waddr) 
{ 176bf215546Sopenharmony_ci case V3D_QPU_WADDR_TMUS: 177bf215546Sopenharmony_ci case V3D_QPU_WADDR_TMUSCM: 178bf215546Sopenharmony_ci case V3D_QPU_WADDR_TMUSF: 179bf215546Sopenharmony_ci case V3D_QPU_WADDR_TMUSLOD: 180bf215546Sopenharmony_ci case V3D_QPU_WADDR_TMUA: 181bf215546Sopenharmony_ci case V3D_QPU_WADDR_TMUAU: 182bf215546Sopenharmony_ci return true; 183bf215546Sopenharmony_ci default: 184bf215546Sopenharmony_ci return false; 185bf215546Sopenharmony_ci } 186bf215546Sopenharmony_ci} 187bf215546Sopenharmony_ci 188bf215546Sopenharmony_cistatic bool 189bf215546Sopenharmony_cican_reorder_tmu_write(const struct v3d_device_info *devinfo, uint32_t waddr) 190bf215546Sopenharmony_ci{ 191bf215546Sopenharmony_ci if (devinfo->ver < 40) 192bf215546Sopenharmony_ci return false; 193bf215546Sopenharmony_ci 194bf215546Sopenharmony_ci if (tmu_write_is_sequence_terminator(waddr)) 195bf215546Sopenharmony_ci return false; 196bf215546Sopenharmony_ci 197bf215546Sopenharmony_ci if (waddr == V3D_QPU_WADDR_TMUD) 198bf215546Sopenharmony_ci return false; 199bf215546Sopenharmony_ci 200bf215546Sopenharmony_ci return true; 201bf215546Sopenharmony_ci} 202bf215546Sopenharmony_ci 203bf215546Sopenharmony_cistatic void 204bf215546Sopenharmony_ciprocess_waddr_deps(struct schedule_state *state, struct schedule_node *n, 205bf215546Sopenharmony_ci uint32_t waddr, bool magic) 206bf215546Sopenharmony_ci{ 207bf215546Sopenharmony_ci if (!magic) { 208bf215546Sopenharmony_ci add_write_dep(state, &state->last_rf[waddr], n); 209bf215546Sopenharmony_ci } else if (v3d_qpu_magic_waddr_is_tmu(state->devinfo, waddr)) { 210bf215546Sopenharmony_ci if (can_reorder_tmu_write(state->devinfo, waddr)) 211bf215546Sopenharmony_ci add_read_dep(state, state->last_tmu_write, n); 212bf215546Sopenharmony_ci else 213bf215546Sopenharmony_ci add_write_dep(state, &state->last_tmu_write, n); 214bf215546Sopenharmony_ci 215bf215546Sopenharmony_ci if (tmu_write_is_sequence_terminator(waddr)) 216bf215546Sopenharmony_ci 
add_write_dep(state, &state->last_tmu_config, n); 217bf215546Sopenharmony_ci } else if (v3d_qpu_magic_waddr_is_sfu(waddr)) { 218bf215546Sopenharmony_ci /* Handled by v3d_qpu_writes_r4() check. */ 219bf215546Sopenharmony_ci } else { 220bf215546Sopenharmony_ci switch (waddr) { 221bf215546Sopenharmony_ci case V3D_QPU_WADDR_R0: 222bf215546Sopenharmony_ci case V3D_QPU_WADDR_R1: 223bf215546Sopenharmony_ci case V3D_QPU_WADDR_R2: 224bf215546Sopenharmony_ci add_write_dep(state, 225bf215546Sopenharmony_ci &state->last_r[waddr - V3D_QPU_WADDR_R0], 226bf215546Sopenharmony_ci n); 227bf215546Sopenharmony_ci break; 228bf215546Sopenharmony_ci case V3D_QPU_WADDR_R3: 229bf215546Sopenharmony_ci case V3D_QPU_WADDR_R4: 230bf215546Sopenharmony_ci case V3D_QPU_WADDR_R5: 231bf215546Sopenharmony_ci /* Handled by v3d_qpu_writes_r*() checks below. */ 232bf215546Sopenharmony_ci break; 233bf215546Sopenharmony_ci 234bf215546Sopenharmony_ci case V3D_QPU_WADDR_VPM: 235bf215546Sopenharmony_ci case V3D_QPU_WADDR_VPMU: 236bf215546Sopenharmony_ci add_write_dep(state, &state->last_vpm, n); 237bf215546Sopenharmony_ci break; 238bf215546Sopenharmony_ci 239bf215546Sopenharmony_ci case V3D_QPU_WADDR_TLB: 240bf215546Sopenharmony_ci case V3D_QPU_WADDR_TLBU: 241bf215546Sopenharmony_ci add_write_dep(state, &state->last_tlb, n); 242bf215546Sopenharmony_ci break; 243bf215546Sopenharmony_ci 244bf215546Sopenharmony_ci case V3D_QPU_WADDR_SYNC: 245bf215546Sopenharmony_ci case V3D_QPU_WADDR_SYNCB: 246bf215546Sopenharmony_ci case V3D_QPU_WADDR_SYNCU: 247bf215546Sopenharmony_ci /* For CS barrier(): Sync against any other memory 248bf215546Sopenharmony_ci * accesses. There doesn't appear to be any need for 249bf215546Sopenharmony_ci * barriers to affect ALU operations. 
250bf215546Sopenharmony_ci */ 251bf215546Sopenharmony_ci add_write_dep(state, &state->last_tmu_write, n); 252bf215546Sopenharmony_ci add_write_dep(state, &state->last_tmu_read, n); 253bf215546Sopenharmony_ci break; 254bf215546Sopenharmony_ci 255bf215546Sopenharmony_ci case V3D_QPU_WADDR_UNIFA: 256bf215546Sopenharmony_ci if (state->devinfo->ver >= 40) 257bf215546Sopenharmony_ci add_write_dep(state, &state->last_unifa, n); 258bf215546Sopenharmony_ci break; 259bf215546Sopenharmony_ci 260bf215546Sopenharmony_ci case V3D_QPU_WADDR_NOP: 261bf215546Sopenharmony_ci break; 262bf215546Sopenharmony_ci 263bf215546Sopenharmony_ci default: 264bf215546Sopenharmony_ci fprintf(stderr, "Unknown waddr %d\n", waddr); 265bf215546Sopenharmony_ci abort(); 266bf215546Sopenharmony_ci } 267bf215546Sopenharmony_ci } 268bf215546Sopenharmony_ci} 269bf215546Sopenharmony_ci 270bf215546Sopenharmony_ci/** 271bf215546Sopenharmony_ci * Common code for dependencies that need to be tracked both forward and 272bf215546Sopenharmony_ci * backward. 273bf215546Sopenharmony_ci * 274bf215546Sopenharmony_ci * This is for things like "all reads of r4 have to happen between the r4 275bf215546Sopenharmony_ci * writes that surround them". 276bf215546Sopenharmony_ci */ 277bf215546Sopenharmony_cistatic void 278bf215546Sopenharmony_cicalculate_deps(struct schedule_state *state, struct schedule_node *n) 279bf215546Sopenharmony_ci{ 280bf215546Sopenharmony_ci const struct v3d_device_info *devinfo = state->devinfo; 281bf215546Sopenharmony_ci struct qinst *qinst = n->inst; 282bf215546Sopenharmony_ci struct v3d_qpu_instr *inst = &qinst->qpu; 283bf215546Sopenharmony_ci /* If the input and output segments are shared, then all VPM reads to 284bf215546Sopenharmony_ci * a location need to happen before all writes. We handle this by 285bf215546Sopenharmony_ci * serializing all VPM operations for now. 
286bf215546Sopenharmony_ci */ 287bf215546Sopenharmony_ci bool separate_vpm_segment = false; 288bf215546Sopenharmony_ci 289bf215546Sopenharmony_ci if (inst->type == V3D_QPU_INSTR_TYPE_BRANCH) { 290bf215546Sopenharmony_ci if (inst->branch.cond != V3D_QPU_BRANCH_COND_ALWAYS) 291bf215546Sopenharmony_ci add_read_dep(state, state->last_sf, n); 292bf215546Sopenharmony_ci 293bf215546Sopenharmony_ci /* XXX: BDI */ 294bf215546Sopenharmony_ci /* XXX: BDU */ 295bf215546Sopenharmony_ci /* XXX: ub */ 296bf215546Sopenharmony_ci /* XXX: raddr_a */ 297bf215546Sopenharmony_ci 298bf215546Sopenharmony_ci add_write_dep(state, &state->last_unif, n); 299bf215546Sopenharmony_ci return; 300bf215546Sopenharmony_ci } 301bf215546Sopenharmony_ci 302bf215546Sopenharmony_ci assert(inst->type == V3D_QPU_INSTR_TYPE_ALU); 303bf215546Sopenharmony_ci 304bf215546Sopenharmony_ci /* XXX: LOAD_IMM */ 305bf215546Sopenharmony_ci 306bf215546Sopenharmony_ci if (v3d_qpu_add_op_num_src(inst->alu.add.op) > 0) 307bf215546Sopenharmony_ci process_mux_deps(state, n, inst->alu.add.a); 308bf215546Sopenharmony_ci if (v3d_qpu_add_op_num_src(inst->alu.add.op) > 1) 309bf215546Sopenharmony_ci process_mux_deps(state, n, inst->alu.add.b); 310bf215546Sopenharmony_ci 311bf215546Sopenharmony_ci if (v3d_qpu_mul_op_num_src(inst->alu.mul.op) > 0) 312bf215546Sopenharmony_ci process_mux_deps(state, n, inst->alu.mul.a); 313bf215546Sopenharmony_ci if (v3d_qpu_mul_op_num_src(inst->alu.mul.op) > 1) 314bf215546Sopenharmony_ci process_mux_deps(state, n, inst->alu.mul.b); 315bf215546Sopenharmony_ci 316bf215546Sopenharmony_ci switch (inst->alu.add.op) { 317bf215546Sopenharmony_ci case V3D_QPU_A_VPMSETUP: 318bf215546Sopenharmony_ci /* Could distinguish read/write by unpacking the uniform. 
*/ 319bf215546Sopenharmony_ci add_write_dep(state, &state->last_vpm, n); 320bf215546Sopenharmony_ci add_write_dep(state, &state->last_vpm_read, n); 321bf215546Sopenharmony_ci break; 322bf215546Sopenharmony_ci 323bf215546Sopenharmony_ci case V3D_QPU_A_STVPMV: 324bf215546Sopenharmony_ci case V3D_QPU_A_STVPMD: 325bf215546Sopenharmony_ci case V3D_QPU_A_STVPMP: 326bf215546Sopenharmony_ci add_write_dep(state, &state->last_vpm, n); 327bf215546Sopenharmony_ci break; 328bf215546Sopenharmony_ci 329bf215546Sopenharmony_ci case V3D_QPU_A_LDVPMV_IN: 330bf215546Sopenharmony_ci case V3D_QPU_A_LDVPMD_IN: 331bf215546Sopenharmony_ci case V3D_QPU_A_LDVPMG_IN: 332bf215546Sopenharmony_ci case V3D_QPU_A_LDVPMP: 333bf215546Sopenharmony_ci if (!separate_vpm_segment) 334bf215546Sopenharmony_ci add_write_dep(state, &state->last_vpm, n); 335bf215546Sopenharmony_ci break; 336bf215546Sopenharmony_ci 337bf215546Sopenharmony_ci case V3D_QPU_A_VPMWT: 338bf215546Sopenharmony_ci add_read_dep(state, state->last_vpm, n); 339bf215546Sopenharmony_ci break; 340bf215546Sopenharmony_ci 341bf215546Sopenharmony_ci case V3D_QPU_A_MSF: 342bf215546Sopenharmony_ci add_read_dep(state, state->last_tlb, n); 343bf215546Sopenharmony_ci break; 344bf215546Sopenharmony_ci 345bf215546Sopenharmony_ci case V3D_QPU_A_SETMSF: 346bf215546Sopenharmony_ci case V3D_QPU_A_SETREVF: 347bf215546Sopenharmony_ci add_write_dep(state, &state->last_tlb, n); 348bf215546Sopenharmony_ci break; 349bf215546Sopenharmony_ci 350bf215546Sopenharmony_ci default: 351bf215546Sopenharmony_ci break; 352bf215546Sopenharmony_ci } 353bf215546Sopenharmony_ci 354bf215546Sopenharmony_ci switch (inst->alu.mul.op) { 355bf215546Sopenharmony_ci case V3D_QPU_M_MULTOP: 356bf215546Sopenharmony_ci case V3D_QPU_M_UMUL24: 357bf215546Sopenharmony_ci /* MULTOP sets rtop, and UMUL24 implicitly reads rtop and 358bf215546Sopenharmony_ci * resets it to 0. 
We could possibly reorder umul24s relative 359bf215546Sopenharmony_ci * to each other, but for now just keep all the MUL parts in 360bf215546Sopenharmony_ci * order. 361bf215546Sopenharmony_ci */ 362bf215546Sopenharmony_ci add_write_dep(state, &state->last_rtop, n); 363bf215546Sopenharmony_ci break; 364bf215546Sopenharmony_ci default: 365bf215546Sopenharmony_ci break; 366bf215546Sopenharmony_ci } 367bf215546Sopenharmony_ci 368bf215546Sopenharmony_ci if (inst->alu.add.op != V3D_QPU_A_NOP) { 369bf215546Sopenharmony_ci process_waddr_deps(state, n, inst->alu.add.waddr, 370bf215546Sopenharmony_ci inst->alu.add.magic_write); 371bf215546Sopenharmony_ci } 372bf215546Sopenharmony_ci if (inst->alu.mul.op != V3D_QPU_M_NOP) { 373bf215546Sopenharmony_ci process_waddr_deps(state, n, inst->alu.mul.waddr, 374bf215546Sopenharmony_ci inst->alu.mul.magic_write); 375bf215546Sopenharmony_ci } 376bf215546Sopenharmony_ci if (v3d_qpu_sig_writes_address(devinfo, &inst->sig)) { 377bf215546Sopenharmony_ci process_waddr_deps(state, n, inst->sig_addr, 378bf215546Sopenharmony_ci inst->sig_magic); 379bf215546Sopenharmony_ci } 380bf215546Sopenharmony_ci 381bf215546Sopenharmony_ci if (v3d_qpu_writes_r3(devinfo, inst)) 382bf215546Sopenharmony_ci add_write_dep(state, &state->last_r[3], n); 383bf215546Sopenharmony_ci if (v3d_qpu_writes_r4(devinfo, inst)) 384bf215546Sopenharmony_ci add_write_dep(state, &state->last_r[4], n); 385bf215546Sopenharmony_ci if (v3d_qpu_writes_r5(devinfo, inst)) 386bf215546Sopenharmony_ci add_write_dep(state, &state->last_r[5], n); 387bf215546Sopenharmony_ci 388bf215546Sopenharmony_ci /* If we add any more dependencies here we should consider whether we 389bf215546Sopenharmony_ci * also need to update qpu_inst_after_thrsw_valid_in_delay_slot. 390bf215546Sopenharmony_ci */ 391bf215546Sopenharmony_ci if (inst->sig.thrsw) { 392bf215546Sopenharmony_ci /* All accumulator contents and flags are undefined after the 393bf215546Sopenharmony_ci * switch. 
394bf215546Sopenharmony_ci */ 395bf215546Sopenharmony_ci for (int i = 0; i < ARRAY_SIZE(state->last_r); i++) 396bf215546Sopenharmony_ci add_write_dep(state, &state->last_r[i], n); 397bf215546Sopenharmony_ci add_write_dep(state, &state->last_sf, n); 398bf215546Sopenharmony_ci add_write_dep(state, &state->last_rtop, n); 399bf215546Sopenharmony_ci 400bf215546Sopenharmony_ci /* Scoreboard-locking operations have to stay after the last 401bf215546Sopenharmony_ci * thread switch. 402bf215546Sopenharmony_ci */ 403bf215546Sopenharmony_ci add_write_dep(state, &state->last_tlb, n); 404bf215546Sopenharmony_ci 405bf215546Sopenharmony_ci add_write_dep(state, &state->last_tmu_write, n); 406bf215546Sopenharmony_ci add_write_dep(state, &state->last_tmu_config, n); 407bf215546Sopenharmony_ci } 408bf215546Sopenharmony_ci 409bf215546Sopenharmony_ci if (v3d_qpu_waits_on_tmu(inst)) { 410bf215546Sopenharmony_ci /* TMU loads are coming from a FIFO, so ordering is important. 411bf215546Sopenharmony_ci */ 412bf215546Sopenharmony_ci add_write_dep(state, &state->last_tmu_read, n); 413bf215546Sopenharmony_ci /* Keep TMU loads after their TMU lookup terminator */ 414bf215546Sopenharmony_ci add_read_dep(state, state->last_tmu_config, n); 415bf215546Sopenharmony_ci } 416bf215546Sopenharmony_ci 417bf215546Sopenharmony_ci /* Allow wrtmuc to be reordered with other instructions in the 418bf215546Sopenharmony_ci * same TMU sequence by using a read dependency on the last TMU 419bf215546Sopenharmony_ci * sequence terminator. 
420bf215546Sopenharmony_ci */ 421bf215546Sopenharmony_ci if (inst->sig.wrtmuc) 422bf215546Sopenharmony_ci add_read_dep(state, state->last_tmu_config, n); 423bf215546Sopenharmony_ci 424bf215546Sopenharmony_ci if (inst->sig.ldtlb | inst->sig.ldtlbu) 425bf215546Sopenharmony_ci add_write_dep(state, &state->last_tlb, n); 426bf215546Sopenharmony_ci 427bf215546Sopenharmony_ci if (inst->sig.ldvpm) { 428bf215546Sopenharmony_ci add_write_dep(state, &state->last_vpm_read, n); 429bf215546Sopenharmony_ci 430bf215546Sopenharmony_ci /* At least for now, we're doing shared I/O segments, so queue 431bf215546Sopenharmony_ci * all writes after all reads. 432bf215546Sopenharmony_ci */ 433bf215546Sopenharmony_ci if (!separate_vpm_segment) 434bf215546Sopenharmony_ci add_write_dep(state, &state->last_vpm, n); 435bf215546Sopenharmony_ci } 436bf215546Sopenharmony_ci 437bf215546Sopenharmony_ci /* inst->sig.ldunif or sideband uniform read */ 438bf215546Sopenharmony_ci if (vir_has_uniform(qinst)) 439bf215546Sopenharmony_ci add_write_dep(state, &state->last_unif, n); 440bf215546Sopenharmony_ci 441bf215546Sopenharmony_ci /* Both unifa and ldunifa must preserve ordering */ 442bf215546Sopenharmony_ci if (inst->sig.ldunifa || inst->sig.ldunifarf) 443bf215546Sopenharmony_ci add_write_dep(state, &state->last_unifa, n); 444bf215546Sopenharmony_ci 445bf215546Sopenharmony_ci if (v3d_qpu_reads_flags(inst)) 446bf215546Sopenharmony_ci add_read_dep(state, state->last_sf, n); 447bf215546Sopenharmony_ci if (v3d_qpu_writes_flags(inst)) 448bf215546Sopenharmony_ci add_write_dep(state, &state->last_sf, n); 449bf215546Sopenharmony_ci} 450bf215546Sopenharmony_ci 451bf215546Sopenharmony_cistatic void 452bf215546Sopenharmony_cicalculate_forward_deps(struct v3d_compile *c, struct dag *dag, 453bf215546Sopenharmony_ci struct list_head *schedule_list) 454bf215546Sopenharmony_ci{ 455bf215546Sopenharmony_ci struct schedule_state state; 456bf215546Sopenharmony_ci 457bf215546Sopenharmony_ci memset(&state, 0, sizeof(state)); 
458bf215546Sopenharmony_ci state.dag = dag; 459bf215546Sopenharmony_ci state.devinfo = c->devinfo; 460bf215546Sopenharmony_ci state.dir = F; 461bf215546Sopenharmony_ci 462bf215546Sopenharmony_ci list_for_each_entry(struct schedule_node, node, schedule_list, link) 463bf215546Sopenharmony_ci calculate_deps(&state, node); 464bf215546Sopenharmony_ci} 465bf215546Sopenharmony_ci 466bf215546Sopenharmony_cistatic void 467bf215546Sopenharmony_cicalculate_reverse_deps(struct v3d_compile *c, struct dag *dag, 468bf215546Sopenharmony_ci struct list_head *schedule_list) 469bf215546Sopenharmony_ci{ 470bf215546Sopenharmony_ci struct schedule_state state; 471bf215546Sopenharmony_ci 472bf215546Sopenharmony_ci memset(&state, 0, sizeof(state)); 473bf215546Sopenharmony_ci state.dag = dag; 474bf215546Sopenharmony_ci state.devinfo = c->devinfo; 475bf215546Sopenharmony_ci state.dir = R; 476bf215546Sopenharmony_ci 477bf215546Sopenharmony_ci list_for_each_entry_rev(struct schedule_node, node, schedule_list, 478bf215546Sopenharmony_ci link) { 479bf215546Sopenharmony_ci calculate_deps(&state, (struct schedule_node *)node); 480bf215546Sopenharmony_ci } 481bf215546Sopenharmony_ci} 482bf215546Sopenharmony_ci 483bf215546Sopenharmony_cistruct choose_scoreboard { 484bf215546Sopenharmony_ci struct dag *dag; 485bf215546Sopenharmony_ci int tick; 486bf215546Sopenharmony_ci int last_magic_sfu_write_tick; 487bf215546Sopenharmony_ci int last_stallable_sfu_reg; 488bf215546Sopenharmony_ci int last_stallable_sfu_tick; 489bf215546Sopenharmony_ci int last_ldvary_tick; 490bf215546Sopenharmony_ci int last_unifa_write_tick; 491bf215546Sopenharmony_ci int last_uniforms_reset_tick; 492bf215546Sopenharmony_ci int last_thrsw_tick; 493bf215546Sopenharmony_ci int last_branch_tick; 494bf215546Sopenharmony_ci int last_setmsf_tick; 495bf215546Sopenharmony_ci bool first_thrsw_emitted; 496bf215546Sopenharmony_ci bool last_thrsw_emitted; 497bf215546Sopenharmony_ci bool fixup_ldvary; 498bf215546Sopenharmony_ci int 
ldvary_count; 499bf215546Sopenharmony_ci}; 500bf215546Sopenharmony_ci 501bf215546Sopenharmony_cistatic bool 502bf215546Sopenharmony_cimux_reads_too_soon(struct choose_scoreboard *scoreboard, 503bf215546Sopenharmony_ci const struct v3d_qpu_instr *inst, enum v3d_qpu_mux mux) 504bf215546Sopenharmony_ci{ 505bf215546Sopenharmony_ci switch (mux) { 506bf215546Sopenharmony_ci case V3D_QPU_MUX_R4: 507bf215546Sopenharmony_ci if (scoreboard->tick - scoreboard->last_magic_sfu_write_tick <= 2) 508bf215546Sopenharmony_ci return true; 509bf215546Sopenharmony_ci break; 510bf215546Sopenharmony_ci 511bf215546Sopenharmony_ci case V3D_QPU_MUX_R5: 512bf215546Sopenharmony_ci if (scoreboard->tick - scoreboard->last_ldvary_tick <= 1) 513bf215546Sopenharmony_ci return true; 514bf215546Sopenharmony_ci break; 515bf215546Sopenharmony_ci default: 516bf215546Sopenharmony_ci break; 517bf215546Sopenharmony_ci } 518bf215546Sopenharmony_ci 519bf215546Sopenharmony_ci return false; 520bf215546Sopenharmony_ci} 521bf215546Sopenharmony_ci 522bf215546Sopenharmony_cistatic bool 523bf215546Sopenharmony_cireads_too_soon_after_write(struct choose_scoreboard *scoreboard, 524bf215546Sopenharmony_ci struct qinst *qinst) 525bf215546Sopenharmony_ci{ 526bf215546Sopenharmony_ci const struct v3d_qpu_instr *inst = &qinst->qpu; 527bf215546Sopenharmony_ci 528bf215546Sopenharmony_ci /* XXX: Branching off of raddr. 
*/ 529bf215546Sopenharmony_ci if (inst->type == V3D_QPU_INSTR_TYPE_BRANCH) 530bf215546Sopenharmony_ci return false; 531bf215546Sopenharmony_ci 532bf215546Sopenharmony_ci assert(inst->type == V3D_QPU_INSTR_TYPE_ALU); 533bf215546Sopenharmony_ci 534bf215546Sopenharmony_ci if (inst->alu.add.op != V3D_QPU_A_NOP) { 535bf215546Sopenharmony_ci if (v3d_qpu_add_op_num_src(inst->alu.add.op) > 0 && 536bf215546Sopenharmony_ci mux_reads_too_soon(scoreboard, inst, inst->alu.add.a)) { 537bf215546Sopenharmony_ci return true; 538bf215546Sopenharmony_ci } 539bf215546Sopenharmony_ci if (v3d_qpu_add_op_num_src(inst->alu.add.op) > 1 && 540bf215546Sopenharmony_ci mux_reads_too_soon(scoreboard, inst, inst->alu.add.b)) { 541bf215546Sopenharmony_ci return true; 542bf215546Sopenharmony_ci } 543bf215546Sopenharmony_ci } 544bf215546Sopenharmony_ci 545bf215546Sopenharmony_ci if (inst->alu.mul.op != V3D_QPU_M_NOP) { 546bf215546Sopenharmony_ci if (v3d_qpu_mul_op_num_src(inst->alu.mul.op) > 0 && 547bf215546Sopenharmony_ci mux_reads_too_soon(scoreboard, inst, inst->alu.mul.a)) { 548bf215546Sopenharmony_ci return true; 549bf215546Sopenharmony_ci } 550bf215546Sopenharmony_ci if (v3d_qpu_mul_op_num_src(inst->alu.mul.op) > 1 && 551bf215546Sopenharmony_ci mux_reads_too_soon(scoreboard, inst, inst->alu.mul.b)) { 552bf215546Sopenharmony_ci return true; 553bf215546Sopenharmony_ci } 554bf215546Sopenharmony_ci } 555bf215546Sopenharmony_ci 556bf215546Sopenharmony_ci /* XXX: imm */ 557bf215546Sopenharmony_ci 558bf215546Sopenharmony_ci return false; 559bf215546Sopenharmony_ci} 560bf215546Sopenharmony_ci 561bf215546Sopenharmony_cistatic bool 562bf215546Sopenharmony_ciwrites_too_soon_after_write(const struct v3d_device_info *devinfo, 563bf215546Sopenharmony_ci struct choose_scoreboard *scoreboard, 564bf215546Sopenharmony_ci struct qinst *qinst) 565bf215546Sopenharmony_ci{ 566bf215546Sopenharmony_ci const struct v3d_qpu_instr *inst = &qinst->qpu; 567bf215546Sopenharmony_ci 568bf215546Sopenharmony_ci /* Don't 
schedule any other r4 write too soon after an SFU write. 569bf215546Sopenharmony_ci * This would normally be prevented by dependency tracking, but might 570bf215546Sopenharmony_ci * occur if a dead SFU computation makes it to scheduling. 571bf215546Sopenharmony_ci */ 572bf215546Sopenharmony_ci if (scoreboard->tick - scoreboard->last_magic_sfu_write_tick < 2 && 573bf215546Sopenharmony_ci v3d_qpu_writes_r4(devinfo, inst)) 574bf215546Sopenharmony_ci return true; 575bf215546Sopenharmony_ci 576bf215546Sopenharmony_ci return false; 577bf215546Sopenharmony_ci} 578bf215546Sopenharmony_ci 579bf215546Sopenharmony_cistatic bool 580bf215546Sopenharmony_ciscoreboard_is_locked(struct choose_scoreboard *scoreboard, 581bf215546Sopenharmony_ci bool lock_scoreboard_on_first_thrsw) 582bf215546Sopenharmony_ci{ 583bf215546Sopenharmony_ci if (lock_scoreboard_on_first_thrsw) { 584bf215546Sopenharmony_ci return scoreboard->first_thrsw_emitted && 585bf215546Sopenharmony_ci scoreboard->tick - scoreboard->last_thrsw_tick >= 3; 586bf215546Sopenharmony_ci } 587bf215546Sopenharmony_ci 588bf215546Sopenharmony_ci return scoreboard->last_thrsw_emitted && 589bf215546Sopenharmony_ci scoreboard->tick - scoreboard->last_thrsw_tick >= 3; 590bf215546Sopenharmony_ci} 591bf215546Sopenharmony_ci 592bf215546Sopenharmony_cistatic bool 593bf215546Sopenharmony_cipixel_scoreboard_too_soon(struct v3d_compile *c, 594bf215546Sopenharmony_ci struct choose_scoreboard *scoreboard, 595bf215546Sopenharmony_ci const struct v3d_qpu_instr *inst) 596bf215546Sopenharmony_ci{ 597bf215546Sopenharmony_ci return qpu_inst_is_tlb(inst) && 598bf215546Sopenharmony_ci !scoreboard_is_locked(scoreboard, 599bf215546Sopenharmony_ci c->lock_scoreboard_on_first_thrsw); 600bf215546Sopenharmony_ci} 601bf215546Sopenharmony_ci 602bf215546Sopenharmony_cistatic bool 603bf215546Sopenharmony_ciqpu_instruction_uses_rf(const struct v3d_qpu_instr *inst, 604bf215546Sopenharmony_ci uint32_t waddr) { 605bf215546Sopenharmony_ci 
606bf215546Sopenharmony_ci if (inst->type != V3D_QPU_INSTR_TYPE_ALU) 607bf215546Sopenharmony_ci return false; 608bf215546Sopenharmony_ci 609bf215546Sopenharmony_ci if (v3d_qpu_uses_mux(inst, V3D_QPU_MUX_A) && 610bf215546Sopenharmony_ci inst->raddr_a == waddr) 611bf215546Sopenharmony_ci return true; 612bf215546Sopenharmony_ci 613bf215546Sopenharmony_ci if (v3d_qpu_uses_mux(inst, V3D_QPU_MUX_B) && 614bf215546Sopenharmony_ci !inst->sig.small_imm && (inst->raddr_b == waddr)) 615bf215546Sopenharmony_ci return true; 616bf215546Sopenharmony_ci 617bf215546Sopenharmony_ci return false; 618bf215546Sopenharmony_ci} 619bf215546Sopenharmony_ci 620bf215546Sopenharmony_cistatic bool 621bf215546Sopenharmony_cimux_read_stalls(struct choose_scoreboard *scoreboard, 622bf215546Sopenharmony_ci const struct v3d_qpu_instr *inst) 623bf215546Sopenharmony_ci{ 624bf215546Sopenharmony_ci return scoreboard->tick == scoreboard->last_stallable_sfu_tick + 1 && 625bf215546Sopenharmony_ci qpu_instruction_uses_rf(inst, 626bf215546Sopenharmony_ci scoreboard->last_stallable_sfu_reg); 627bf215546Sopenharmony_ci} 628bf215546Sopenharmony_ci 629bf215546Sopenharmony_ci/* We define a max schedule priority to allow negative priorities as result of 630bf215546Sopenharmony_ci * substracting this max when an instruction stalls. So instructions that 631bf215546Sopenharmony_ci * stall have lower priority than regular instructions. */ 632bf215546Sopenharmony_ci#define MAX_SCHEDULE_PRIORITY 16 633bf215546Sopenharmony_ci 634bf215546Sopenharmony_cistatic int 635bf215546Sopenharmony_ciget_instruction_priority(const struct v3d_device_info *devinfo, 636bf215546Sopenharmony_ci const struct v3d_qpu_instr *inst) 637bf215546Sopenharmony_ci{ 638bf215546Sopenharmony_ci uint32_t baseline_score; 639bf215546Sopenharmony_ci uint32_t next_score = 0; 640bf215546Sopenharmony_ci 641bf215546Sopenharmony_ci /* Schedule TLB operations as late as possible, to get more 642bf215546Sopenharmony_ci * parallelism between shaders. 
643bf215546Sopenharmony_ci */ 644bf215546Sopenharmony_ci if (qpu_inst_is_tlb(inst)) 645bf215546Sopenharmony_ci return next_score; 646bf215546Sopenharmony_ci next_score++; 647bf215546Sopenharmony_ci 648bf215546Sopenharmony_ci /* Empirical testing shows that using priorities to hide latency of 649bf215546Sopenharmony_ci * TMU operations when scheduling QPU leads to slightly worse 650bf215546Sopenharmony_ci * performance, even at 2 threads. We think this is because the thread 651bf215546Sopenharmony_ci * switching is already quite effective at hiding latency and NIR 652bf215546Sopenharmony_ci * scheduling (and possibly TMU pipelining too) are sufficient to hide 653bf215546Sopenharmony_ci * TMU latency, so piling up on that here doesn't provide any benefits 654bf215546Sopenharmony_ci * and instead may cause us to postpone critical paths that depend on 655bf215546Sopenharmony_ci * the TMU results. 656bf215546Sopenharmony_ci */ 657bf215546Sopenharmony_ci#if 0 658bf215546Sopenharmony_ci /* Schedule texture read results collection late to hide latency. */ 659bf215546Sopenharmony_ci if (v3d_qpu_waits_on_tmu(inst)) 660bf215546Sopenharmony_ci return next_score; 661bf215546Sopenharmony_ci next_score++; 662bf215546Sopenharmony_ci#endif 663bf215546Sopenharmony_ci 664bf215546Sopenharmony_ci /* Default score for things that aren't otherwise special. */ 665bf215546Sopenharmony_ci baseline_score = next_score; 666bf215546Sopenharmony_ci next_score++; 667bf215546Sopenharmony_ci 668bf215546Sopenharmony_ci#if 0 669bf215546Sopenharmony_ci /* Schedule texture read setup early to hide their latency better. 
*/ 670bf215546Sopenharmony_ci if (v3d_qpu_writes_tmu(devinfo, inst)) 671bf215546Sopenharmony_ci return next_score; 672bf215546Sopenharmony_ci next_score++; 673bf215546Sopenharmony_ci#endif 674bf215546Sopenharmony_ci 675bf215546Sopenharmony_ci /* We should increase the maximum if we assert here */ 676bf215546Sopenharmony_ci assert(next_score < MAX_SCHEDULE_PRIORITY); 677bf215546Sopenharmony_ci 678bf215546Sopenharmony_ci return baseline_score; 679bf215546Sopenharmony_ci} 680bf215546Sopenharmony_ci 681bf215546Sopenharmony_cienum { 682bf215546Sopenharmony_ci V3D_PERIPHERAL_VPM_READ = (1 << 0), 683bf215546Sopenharmony_ci V3D_PERIPHERAL_VPM_WRITE = (1 << 1), 684bf215546Sopenharmony_ci V3D_PERIPHERAL_VPM_WAIT = (1 << 2), 685bf215546Sopenharmony_ci V3D_PERIPHERAL_SFU = (1 << 3), 686bf215546Sopenharmony_ci V3D_PERIPHERAL_TMU_WRITE = (1 << 4), 687bf215546Sopenharmony_ci V3D_PERIPHERAL_TMU_READ = (1 << 5), 688bf215546Sopenharmony_ci V3D_PERIPHERAL_TMU_WAIT = (1 << 6), 689bf215546Sopenharmony_ci V3D_PERIPHERAL_TMU_WRTMUC_SIG = (1 << 7), 690bf215546Sopenharmony_ci V3D_PERIPHERAL_TSY = (1 << 8), 691bf215546Sopenharmony_ci V3D_PERIPHERAL_TLB = (1 << 9), 692bf215546Sopenharmony_ci}; 693bf215546Sopenharmony_ci 694bf215546Sopenharmony_cistatic uint32_t 695bf215546Sopenharmony_ciqpu_peripherals(const struct v3d_device_info *devinfo, 696bf215546Sopenharmony_ci const struct v3d_qpu_instr *inst) 697bf215546Sopenharmony_ci{ 698bf215546Sopenharmony_ci uint32_t result = 0; 699bf215546Sopenharmony_ci if (v3d_qpu_reads_vpm(inst)) 700bf215546Sopenharmony_ci result |= V3D_PERIPHERAL_VPM_READ; 701bf215546Sopenharmony_ci if (v3d_qpu_writes_vpm(inst)) 702bf215546Sopenharmony_ci result |= V3D_PERIPHERAL_VPM_WRITE; 703bf215546Sopenharmony_ci if (v3d_qpu_waits_vpm(inst)) 704bf215546Sopenharmony_ci result |= V3D_PERIPHERAL_VPM_WAIT; 705bf215546Sopenharmony_ci 706bf215546Sopenharmony_ci if (v3d_qpu_writes_tmu(devinfo, inst)) 707bf215546Sopenharmony_ci result |= V3D_PERIPHERAL_TMU_WRITE; 
708bf215546Sopenharmony_ci if (inst->sig.ldtmu) 709bf215546Sopenharmony_ci result |= V3D_PERIPHERAL_TMU_READ; 710bf215546Sopenharmony_ci if (inst->sig.wrtmuc) 711bf215546Sopenharmony_ci result |= V3D_PERIPHERAL_TMU_WRTMUC_SIG; 712bf215546Sopenharmony_ci 713bf215546Sopenharmony_ci if (v3d_qpu_uses_sfu(inst)) 714bf215546Sopenharmony_ci result |= V3D_PERIPHERAL_SFU; 715bf215546Sopenharmony_ci 716bf215546Sopenharmony_ci if (v3d_qpu_uses_tlb(inst)) 717bf215546Sopenharmony_ci result |= V3D_PERIPHERAL_TLB; 718bf215546Sopenharmony_ci 719bf215546Sopenharmony_ci if (inst->type == V3D_QPU_INSTR_TYPE_ALU) { 720bf215546Sopenharmony_ci if (inst->alu.add.op != V3D_QPU_A_NOP && 721bf215546Sopenharmony_ci inst->alu.add.magic_write && 722bf215546Sopenharmony_ci v3d_qpu_magic_waddr_is_tsy(inst->alu.add.waddr)) { 723bf215546Sopenharmony_ci result |= V3D_PERIPHERAL_TSY; 724bf215546Sopenharmony_ci } 725bf215546Sopenharmony_ci 726bf215546Sopenharmony_ci if (inst->alu.add.op == V3D_QPU_A_TMUWT) 727bf215546Sopenharmony_ci result |= V3D_PERIPHERAL_TMU_WAIT; 728bf215546Sopenharmony_ci } 729bf215546Sopenharmony_ci 730bf215546Sopenharmony_ci return result; 731bf215546Sopenharmony_ci} 732bf215546Sopenharmony_ci 733bf215546Sopenharmony_cistatic bool 734bf215546Sopenharmony_ciqpu_compatible_peripheral_access(const struct v3d_device_info *devinfo, 735bf215546Sopenharmony_ci const struct v3d_qpu_instr *a, 736bf215546Sopenharmony_ci const struct v3d_qpu_instr *b) 737bf215546Sopenharmony_ci{ 738bf215546Sopenharmony_ci const uint32_t a_peripherals = qpu_peripherals(devinfo, a); 739bf215546Sopenharmony_ci const uint32_t b_peripherals = qpu_peripherals(devinfo, b); 740bf215546Sopenharmony_ci 741bf215546Sopenharmony_ci /* We can always do one peripheral access per instruction. 
*/ 742bf215546Sopenharmony_ci if (util_bitcount(a_peripherals) + util_bitcount(b_peripherals) <= 1) 743bf215546Sopenharmony_ci return true; 744bf215546Sopenharmony_ci 745bf215546Sopenharmony_ci if (devinfo->ver < 41) 746bf215546Sopenharmony_ci return false; 747bf215546Sopenharmony_ci 748bf215546Sopenharmony_ci /* V3D 4.1+ allow WRTMUC signal with TMU register write (other than 749bf215546Sopenharmony_ci * tmuc). 750bf215546Sopenharmony_ci */ 751bf215546Sopenharmony_ci if (a_peripherals == V3D_PERIPHERAL_TMU_WRTMUC_SIG && 752bf215546Sopenharmony_ci b_peripherals == V3D_PERIPHERAL_TMU_WRITE) { 753bf215546Sopenharmony_ci return v3d_qpu_writes_tmu_not_tmuc(devinfo, b); 754bf215546Sopenharmony_ci } 755bf215546Sopenharmony_ci 756bf215546Sopenharmony_ci if (a_peripherals == V3D_PERIPHERAL_TMU_WRITE && 757bf215546Sopenharmony_ci b_peripherals == V3D_PERIPHERAL_TMU_WRTMUC_SIG) { 758bf215546Sopenharmony_ci return v3d_qpu_writes_tmu_not_tmuc(devinfo, a); 759bf215546Sopenharmony_ci } 760bf215546Sopenharmony_ci 761bf215546Sopenharmony_ci /* V3D 4.1+ allows TMU read with VPM read/write. */ 762bf215546Sopenharmony_ci if (a_peripherals == V3D_PERIPHERAL_TMU_READ && 763bf215546Sopenharmony_ci (b_peripherals == V3D_PERIPHERAL_VPM_READ || 764bf215546Sopenharmony_ci b_peripherals == V3D_PERIPHERAL_VPM_WRITE)) { 765bf215546Sopenharmony_ci return true; 766bf215546Sopenharmony_ci } 767bf215546Sopenharmony_ci if (b_peripherals == V3D_PERIPHERAL_TMU_READ && 768bf215546Sopenharmony_ci (a_peripherals == V3D_PERIPHERAL_VPM_READ || 769bf215546Sopenharmony_ci a_peripherals == V3D_PERIPHERAL_VPM_WRITE)) { 770bf215546Sopenharmony_ci return true; 771bf215546Sopenharmony_ci } 772bf215546Sopenharmony_ci 773bf215546Sopenharmony_ci return false; 774bf215546Sopenharmony_ci} 775bf215546Sopenharmony_ci 776bf215546Sopenharmony_ci/* Compute a bitmask of which rf registers are used between 777bf215546Sopenharmony_ci * the two instructions. 
778bf215546Sopenharmony_ci */ 779bf215546Sopenharmony_cistatic uint64_t 780bf215546Sopenharmony_ciqpu_raddrs_used(const struct v3d_qpu_instr *a, 781bf215546Sopenharmony_ci const struct v3d_qpu_instr *b) 782bf215546Sopenharmony_ci{ 783bf215546Sopenharmony_ci assert(a->type == V3D_QPU_INSTR_TYPE_ALU); 784bf215546Sopenharmony_ci assert(b->type == V3D_QPU_INSTR_TYPE_ALU); 785bf215546Sopenharmony_ci 786bf215546Sopenharmony_ci uint64_t raddrs_used = 0; 787bf215546Sopenharmony_ci if (v3d_qpu_uses_mux(a, V3D_QPU_MUX_A)) 788bf215546Sopenharmony_ci raddrs_used |= (1ll << a->raddr_a); 789bf215546Sopenharmony_ci if (!a->sig.small_imm && v3d_qpu_uses_mux(a, V3D_QPU_MUX_B)) 790bf215546Sopenharmony_ci raddrs_used |= (1ll << a->raddr_b); 791bf215546Sopenharmony_ci if (v3d_qpu_uses_mux(b, V3D_QPU_MUX_A)) 792bf215546Sopenharmony_ci raddrs_used |= (1ll << b->raddr_a); 793bf215546Sopenharmony_ci if (!b->sig.small_imm && v3d_qpu_uses_mux(b, V3D_QPU_MUX_B)) 794bf215546Sopenharmony_ci raddrs_used |= (1ll << b->raddr_b); 795bf215546Sopenharmony_ci 796bf215546Sopenharmony_ci return raddrs_used; 797bf215546Sopenharmony_ci} 798bf215546Sopenharmony_ci 799bf215546Sopenharmony_ci/* Take two instructions and attempt to merge their raddr fields 800bf215546Sopenharmony_ci * into one merged instruction. Returns false if the two instructions 801bf215546Sopenharmony_ci * access more than two different rf registers between them, or more 802bf215546Sopenharmony_ci * than one rf register and one small immediate. 
803bf215546Sopenharmony_ci */ 804bf215546Sopenharmony_cistatic bool 805bf215546Sopenharmony_ciqpu_merge_raddrs(struct v3d_qpu_instr *result, 806bf215546Sopenharmony_ci const struct v3d_qpu_instr *add_instr, 807bf215546Sopenharmony_ci const struct v3d_qpu_instr *mul_instr) 808bf215546Sopenharmony_ci{ 809bf215546Sopenharmony_ci uint64_t raddrs_used = qpu_raddrs_used(add_instr, mul_instr); 810bf215546Sopenharmony_ci int naddrs = util_bitcount64(raddrs_used); 811bf215546Sopenharmony_ci 812bf215546Sopenharmony_ci if (naddrs > 2) 813bf215546Sopenharmony_ci return false; 814bf215546Sopenharmony_ci 815bf215546Sopenharmony_ci if ((add_instr->sig.small_imm || mul_instr->sig.small_imm)) { 816bf215546Sopenharmony_ci if (naddrs > 1) 817bf215546Sopenharmony_ci return false; 818bf215546Sopenharmony_ci 819bf215546Sopenharmony_ci if (add_instr->sig.small_imm && mul_instr->sig.small_imm) 820bf215546Sopenharmony_ci if (add_instr->raddr_b != mul_instr->raddr_b) 821bf215546Sopenharmony_ci return false; 822bf215546Sopenharmony_ci 823bf215546Sopenharmony_ci result->sig.small_imm = true; 824bf215546Sopenharmony_ci result->raddr_b = add_instr->sig.small_imm ? 
825bf215546Sopenharmony_ci add_instr->raddr_b : mul_instr->raddr_b; 826bf215546Sopenharmony_ci } 827bf215546Sopenharmony_ci 828bf215546Sopenharmony_ci if (naddrs == 0) 829bf215546Sopenharmony_ci return true; 830bf215546Sopenharmony_ci 831bf215546Sopenharmony_ci int raddr_a = ffsll(raddrs_used) - 1; 832bf215546Sopenharmony_ci raddrs_used &= ~(1ll << raddr_a); 833bf215546Sopenharmony_ci result->raddr_a = raddr_a; 834bf215546Sopenharmony_ci 835bf215546Sopenharmony_ci if (!result->sig.small_imm) { 836bf215546Sopenharmony_ci if (v3d_qpu_uses_mux(add_instr, V3D_QPU_MUX_B) && 837bf215546Sopenharmony_ci raddr_a == add_instr->raddr_b) { 838bf215546Sopenharmony_ci if (add_instr->alu.add.a == V3D_QPU_MUX_B) 839bf215546Sopenharmony_ci result->alu.add.a = V3D_QPU_MUX_A; 840bf215546Sopenharmony_ci if (add_instr->alu.add.b == V3D_QPU_MUX_B && 841bf215546Sopenharmony_ci v3d_qpu_add_op_num_src(add_instr->alu.add.op) > 1) { 842bf215546Sopenharmony_ci result->alu.add.b = V3D_QPU_MUX_A; 843bf215546Sopenharmony_ci } 844bf215546Sopenharmony_ci } 845bf215546Sopenharmony_ci if (v3d_qpu_uses_mux(mul_instr, V3D_QPU_MUX_B) && 846bf215546Sopenharmony_ci raddr_a == mul_instr->raddr_b) { 847bf215546Sopenharmony_ci if (mul_instr->alu.mul.a == V3D_QPU_MUX_B) 848bf215546Sopenharmony_ci result->alu.mul.a = V3D_QPU_MUX_A; 849bf215546Sopenharmony_ci if (mul_instr->alu.mul.b == V3D_QPU_MUX_B && 850bf215546Sopenharmony_ci v3d_qpu_mul_op_num_src(mul_instr->alu.mul.op) > 1) { 851bf215546Sopenharmony_ci result->alu.mul.b = V3D_QPU_MUX_A; 852bf215546Sopenharmony_ci } 853bf215546Sopenharmony_ci } 854bf215546Sopenharmony_ci } 855bf215546Sopenharmony_ci if (!raddrs_used) 856bf215546Sopenharmony_ci return true; 857bf215546Sopenharmony_ci 858bf215546Sopenharmony_ci int raddr_b = ffsll(raddrs_used) - 1; 859bf215546Sopenharmony_ci result->raddr_b = raddr_b; 860bf215546Sopenharmony_ci if (v3d_qpu_uses_mux(add_instr, V3D_QPU_MUX_A) && 861bf215546Sopenharmony_ci raddr_b == add_instr->raddr_a) { 
862bf215546Sopenharmony_ci if (add_instr->alu.add.a == V3D_QPU_MUX_A) 863bf215546Sopenharmony_ci result->alu.add.a = V3D_QPU_MUX_B; 864bf215546Sopenharmony_ci if (add_instr->alu.add.b == V3D_QPU_MUX_A && 865bf215546Sopenharmony_ci v3d_qpu_add_op_num_src(add_instr->alu.add.op) > 1) { 866bf215546Sopenharmony_ci result->alu.add.b = V3D_QPU_MUX_B; 867bf215546Sopenharmony_ci } 868bf215546Sopenharmony_ci } 869bf215546Sopenharmony_ci if (v3d_qpu_uses_mux(mul_instr, V3D_QPU_MUX_A) && 870bf215546Sopenharmony_ci raddr_b == mul_instr->raddr_a) { 871bf215546Sopenharmony_ci if (mul_instr->alu.mul.a == V3D_QPU_MUX_A) 872bf215546Sopenharmony_ci result->alu.mul.a = V3D_QPU_MUX_B; 873bf215546Sopenharmony_ci if (mul_instr->alu.mul.b == V3D_QPU_MUX_A && 874bf215546Sopenharmony_ci v3d_qpu_mul_op_num_src(mul_instr->alu.mul.op) > 1) { 875bf215546Sopenharmony_ci result->alu.mul.b = V3D_QPU_MUX_B; 876bf215546Sopenharmony_ci } 877bf215546Sopenharmony_ci } 878bf215546Sopenharmony_ci 879bf215546Sopenharmony_ci return true; 880bf215546Sopenharmony_ci} 881bf215546Sopenharmony_ci 882bf215546Sopenharmony_cistatic bool 883bf215546Sopenharmony_cican_do_add_as_mul(enum v3d_qpu_add_op op) 884bf215546Sopenharmony_ci{ 885bf215546Sopenharmony_ci switch (op) { 886bf215546Sopenharmony_ci case V3D_QPU_A_ADD: 887bf215546Sopenharmony_ci case V3D_QPU_A_SUB: 888bf215546Sopenharmony_ci return true; 889bf215546Sopenharmony_ci default: 890bf215546Sopenharmony_ci return false; 891bf215546Sopenharmony_ci } 892bf215546Sopenharmony_ci} 893bf215546Sopenharmony_ci 894bf215546Sopenharmony_cistatic enum v3d_qpu_mul_op 895bf215546Sopenharmony_ciadd_op_as_mul_op(enum v3d_qpu_add_op op) 896bf215546Sopenharmony_ci{ 897bf215546Sopenharmony_ci switch (op) { 898bf215546Sopenharmony_ci case V3D_QPU_A_ADD: 899bf215546Sopenharmony_ci return V3D_QPU_M_ADD; 900bf215546Sopenharmony_ci case V3D_QPU_A_SUB: 901bf215546Sopenharmony_ci return V3D_QPU_M_SUB; 902bf215546Sopenharmony_ci default: 903bf215546Sopenharmony_ci 
unreachable("unexpected add opcode"); 904bf215546Sopenharmony_ci } 905bf215546Sopenharmony_ci} 906bf215546Sopenharmony_ci 907bf215546Sopenharmony_cistatic void 908bf215546Sopenharmony_ciqpu_convert_add_to_mul(struct v3d_qpu_instr *inst) 909bf215546Sopenharmony_ci{ 910bf215546Sopenharmony_ci STATIC_ASSERT(sizeof(inst->alu.mul) == sizeof(inst->alu.add)); 911bf215546Sopenharmony_ci assert(inst->alu.add.op != V3D_QPU_A_NOP); 912bf215546Sopenharmony_ci assert(inst->alu.mul.op == V3D_QPU_M_NOP); 913bf215546Sopenharmony_ci 914bf215546Sopenharmony_ci memcpy(&inst->alu.mul, &inst->alu.add, sizeof(inst->alu.mul)); 915bf215546Sopenharmony_ci inst->alu.mul.op = add_op_as_mul_op(inst->alu.add.op); 916bf215546Sopenharmony_ci inst->alu.add.op = V3D_QPU_A_NOP; 917bf215546Sopenharmony_ci 918bf215546Sopenharmony_ci inst->flags.mc = inst->flags.ac; 919bf215546Sopenharmony_ci inst->flags.mpf = inst->flags.apf; 920bf215546Sopenharmony_ci inst->flags.muf = inst->flags.auf; 921bf215546Sopenharmony_ci inst->flags.ac = V3D_QPU_COND_NONE; 922bf215546Sopenharmony_ci inst->flags.apf = V3D_QPU_PF_NONE; 923bf215546Sopenharmony_ci inst->flags.auf = V3D_QPU_UF_NONE; 924bf215546Sopenharmony_ci 925bf215546Sopenharmony_ci inst->alu.mul.output_pack = inst->alu.add.output_pack; 926bf215546Sopenharmony_ci inst->alu.mul.a_unpack = inst->alu.add.a_unpack; 927bf215546Sopenharmony_ci inst->alu.mul.b_unpack = inst->alu.add.b_unpack; 928bf215546Sopenharmony_ci inst->alu.add.output_pack = V3D_QPU_PACK_NONE; 929bf215546Sopenharmony_ci inst->alu.add.a_unpack = V3D_QPU_UNPACK_NONE; 930bf215546Sopenharmony_ci inst->alu.add.b_unpack = V3D_QPU_UNPACK_NONE; 931bf215546Sopenharmony_ci} 932bf215546Sopenharmony_ci 933bf215546Sopenharmony_cistatic bool 934bf215546Sopenharmony_ciqpu_merge_inst(const struct v3d_device_info *devinfo, 935bf215546Sopenharmony_ci struct v3d_qpu_instr *result, 936bf215546Sopenharmony_ci const struct v3d_qpu_instr *a, 937bf215546Sopenharmony_ci const struct v3d_qpu_instr *b) 
938bf215546Sopenharmony_ci{ 939bf215546Sopenharmony_ci if (a->type != V3D_QPU_INSTR_TYPE_ALU || 940bf215546Sopenharmony_ci b->type != V3D_QPU_INSTR_TYPE_ALU) { 941bf215546Sopenharmony_ci return false; 942bf215546Sopenharmony_ci } 943bf215546Sopenharmony_ci 944bf215546Sopenharmony_ci if (!qpu_compatible_peripheral_access(devinfo, a, b)) 945bf215546Sopenharmony_ci return false; 946bf215546Sopenharmony_ci 947bf215546Sopenharmony_ci struct v3d_qpu_instr merge = *a; 948bf215546Sopenharmony_ci const struct v3d_qpu_instr *add_instr = NULL, *mul_instr = NULL; 949bf215546Sopenharmony_ci 950bf215546Sopenharmony_ci struct v3d_qpu_instr mul_inst; 951bf215546Sopenharmony_ci if (b->alu.add.op != V3D_QPU_A_NOP) { 952bf215546Sopenharmony_ci if (a->alu.add.op == V3D_QPU_A_NOP) { 953bf215546Sopenharmony_ci merge.alu.add = b->alu.add; 954bf215546Sopenharmony_ci 955bf215546Sopenharmony_ci merge.flags.ac = b->flags.ac; 956bf215546Sopenharmony_ci merge.flags.apf = b->flags.apf; 957bf215546Sopenharmony_ci merge.flags.auf = b->flags.auf; 958bf215546Sopenharmony_ci 959bf215546Sopenharmony_ci add_instr = b; 960bf215546Sopenharmony_ci mul_instr = a; 961bf215546Sopenharmony_ci } 962bf215546Sopenharmony_ci /* If a's add op is used but its mul op is not, then see if we 963bf215546Sopenharmony_ci * can convert either a's add op or b's add op to a mul op 964bf215546Sopenharmony_ci * so we can merge. 
965bf215546Sopenharmony_ci */ 966bf215546Sopenharmony_ci else if (a->alu.mul.op == V3D_QPU_M_NOP && 967bf215546Sopenharmony_ci can_do_add_as_mul(b->alu.add.op)) { 968bf215546Sopenharmony_ci mul_inst = *b; 969bf215546Sopenharmony_ci qpu_convert_add_to_mul(&mul_inst); 970bf215546Sopenharmony_ci 971bf215546Sopenharmony_ci merge.alu.mul = mul_inst.alu.mul; 972bf215546Sopenharmony_ci 973bf215546Sopenharmony_ci merge.flags.mc = b->flags.ac; 974bf215546Sopenharmony_ci merge.flags.mpf = b->flags.apf; 975bf215546Sopenharmony_ci merge.flags.muf = b->flags.auf; 976bf215546Sopenharmony_ci 977bf215546Sopenharmony_ci add_instr = a; 978bf215546Sopenharmony_ci mul_instr = &mul_inst; 979bf215546Sopenharmony_ci } else if (a->alu.mul.op == V3D_QPU_M_NOP && 980bf215546Sopenharmony_ci can_do_add_as_mul(a->alu.add.op)) { 981bf215546Sopenharmony_ci mul_inst = *a; 982bf215546Sopenharmony_ci qpu_convert_add_to_mul(&mul_inst); 983bf215546Sopenharmony_ci 984bf215546Sopenharmony_ci merge = mul_inst; 985bf215546Sopenharmony_ci merge.alu.add = b->alu.add; 986bf215546Sopenharmony_ci 987bf215546Sopenharmony_ci merge.flags.ac = b->flags.ac; 988bf215546Sopenharmony_ci merge.flags.apf = b->flags.apf; 989bf215546Sopenharmony_ci merge.flags.auf = b->flags.auf; 990bf215546Sopenharmony_ci 991bf215546Sopenharmony_ci add_instr = b; 992bf215546Sopenharmony_ci mul_instr = &mul_inst; 993bf215546Sopenharmony_ci } else { 994bf215546Sopenharmony_ci return false; 995bf215546Sopenharmony_ci } 996bf215546Sopenharmony_ci } 997bf215546Sopenharmony_ci 998bf215546Sopenharmony_ci if (b->alu.mul.op != V3D_QPU_M_NOP) { 999bf215546Sopenharmony_ci if (a->alu.mul.op != V3D_QPU_M_NOP) 1000bf215546Sopenharmony_ci return false; 1001bf215546Sopenharmony_ci merge.alu.mul = b->alu.mul; 1002bf215546Sopenharmony_ci 1003bf215546Sopenharmony_ci merge.flags.mc = b->flags.mc; 1004bf215546Sopenharmony_ci merge.flags.mpf = b->flags.mpf; 1005bf215546Sopenharmony_ci merge.flags.muf = b->flags.muf; 1006bf215546Sopenharmony_ci 
1007bf215546Sopenharmony_ci mul_instr = b; 1008bf215546Sopenharmony_ci add_instr = a; 1009bf215546Sopenharmony_ci } 1010bf215546Sopenharmony_ci 1011bf215546Sopenharmony_ci if (add_instr && mul_instr && 1012bf215546Sopenharmony_ci !qpu_merge_raddrs(&merge, add_instr, mul_instr)) { 1013bf215546Sopenharmony_ci return false; 1014bf215546Sopenharmony_ci } 1015bf215546Sopenharmony_ci 1016bf215546Sopenharmony_ci merge.sig.thrsw |= b->sig.thrsw; 1017bf215546Sopenharmony_ci merge.sig.ldunif |= b->sig.ldunif; 1018bf215546Sopenharmony_ci merge.sig.ldunifrf |= b->sig.ldunifrf; 1019bf215546Sopenharmony_ci merge.sig.ldunifa |= b->sig.ldunifa; 1020bf215546Sopenharmony_ci merge.sig.ldunifarf |= b->sig.ldunifarf; 1021bf215546Sopenharmony_ci merge.sig.ldtmu |= b->sig.ldtmu; 1022bf215546Sopenharmony_ci merge.sig.ldvary |= b->sig.ldvary; 1023bf215546Sopenharmony_ci merge.sig.ldvpm |= b->sig.ldvpm; 1024bf215546Sopenharmony_ci merge.sig.small_imm |= b->sig.small_imm; 1025bf215546Sopenharmony_ci merge.sig.ldtlb |= b->sig.ldtlb; 1026bf215546Sopenharmony_ci merge.sig.ldtlbu |= b->sig.ldtlbu; 1027bf215546Sopenharmony_ci merge.sig.ucb |= b->sig.ucb; 1028bf215546Sopenharmony_ci merge.sig.rotate |= b->sig.rotate; 1029bf215546Sopenharmony_ci merge.sig.wrtmuc |= b->sig.wrtmuc; 1030bf215546Sopenharmony_ci 1031bf215546Sopenharmony_ci if (v3d_qpu_sig_writes_address(devinfo, &a->sig) && 1032bf215546Sopenharmony_ci v3d_qpu_sig_writes_address(devinfo, &b->sig)) 1033bf215546Sopenharmony_ci return false; 1034bf215546Sopenharmony_ci merge.sig_addr |= b->sig_addr; 1035bf215546Sopenharmony_ci merge.sig_magic |= b->sig_magic; 1036bf215546Sopenharmony_ci 1037bf215546Sopenharmony_ci uint64_t packed; 1038bf215546Sopenharmony_ci bool ok = v3d_qpu_instr_pack(devinfo, &merge, &packed); 1039bf215546Sopenharmony_ci 1040bf215546Sopenharmony_ci *result = merge; 1041bf215546Sopenharmony_ci /* No modifying the real instructions on failure. 
*/ 1042bf215546Sopenharmony_ci assert(ok || (a != result && b != result)); 1043bf215546Sopenharmony_ci 1044bf215546Sopenharmony_ci return ok; 1045bf215546Sopenharmony_ci} 1046bf215546Sopenharmony_ci 1047bf215546Sopenharmony_cistatic inline bool 1048bf215546Sopenharmony_citry_skip_for_ldvary_pipelining(const struct v3d_qpu_instr *inst) 1049bf215546Sopenharmony_ci{ 1050bf215546Sopenharmony_ci return inst->sig.ldunif || inst->sig.ldunifrf; 1051bf215546Sopenharmony_ci} 1052bf215546Sopenharmony_ci 1053bf215546Sopenharmony_cistatic bool 1054bf215546Sopenharmony_ciqpu_inst_after_thrsw_valid_in_delay_slot(struct v3d_compile *c, 1055bf215546Sopenharmony_ci struct choose_scoreboard *scoreboard, 1056bf215546Sopenharmony_ci const struct qinst *qinst); 1057bf215546Sopenharmony_ci 1058bf215546Sopenharmony_cistatic struct schedule_node * 1059bf215546Sopenharmony_cichoose_instruction_to_schedule(struct v3d_compile *c, 1060bf215546Sopenharmony_ci struct choose_scoreboard *scoreboard, 1061bf215546Sopenharmony_ci struct schedule_node *prev_inst) 1062bf215546Sopenharmony_ci{ 1063bf215546Sopenharmony_ci struct schedule_node *chosen = NULL; 1064bf215546Sopenharmony_ci int chosen_prio = 0; 1065bf215546Sopenharmony_ci 1066bf215546Sopenharmony_ci /* Don't pair up anything with a thread switch signal -- emit_thrsw() 1067bf215546Sopenharmony_ci * will handle pairing it along with filling the delay slots. 
1068bf215546Sopenharmony_ci */ 1069bf215546Sopenharmony_ci if (prev_inst) { 1070bf215546Sopenharmony_ci if (prev_inst->inst->qpu.sig.thrsw) 1071bf215546Sopenharmony_ci return NULL; 1072bf215546Sopenharmony_ci } 1073bf215546Sopenharmony_ci 1074bf215546Sopenharmony_ci bool ldvary_pipelining = c->s->info.stage == MESA_SHADER_FRAGMENT && 1075bf215546Sopenharmony_ci scoreboard->ldvary_count < c->num_inputs; 1076bf215546Sopenharmony_ci bool skipped_insts_for_ldvary_pipelining = false; 1077bf215546Sopenharmony_ciretry: 1078bf215546Sopenharmony_ci list_for_each_entry(struct schedule_node, n, &scoreboard->dag->heads, 1079bf215546Sopenharmony_ci dag.link) { 1080bf215546Sopenharmony_ci const struct v3d_qpu_instr *inst = &n->inst->qpu; 1081bf215546Sopenharmony_ci 1082bf215546Sopenharmony_ci if (ldvary_pipelining && try_skip_for_ldvary_pipelining(inst)) { 1083bf215546Sopenharmony_ci skipped_insts_for_ldvary_pipelining = true; 1084bf215546Sopenharmony_ci continue; 1085bf215546Sopenharmony_ci } 1086bf215546Sopenharmony_ci 1087bf215546Sopenharmony_ci /* Don't choose the branch instruction until it's the last one 1088bf215546Sopenharmony_ci * left. We'll move it up to fit its delay slots after we 1089bf215546Sopenharmony_ci * choose it. 1090bf215546Sopenharmony_ci */ 1091bf215546Sopenharmony_ci if (inst->type == V3D_QPU_INSTR_TYPE_BRANCH && 1092bf215546Sopenharmony_ci !list_is_singular(&scoreboard->dag->heads)) { 1093bf215546Sopenharmony_ci continue; 1094bf215546Sopenharmony_ci } 1095bf215546Sopenharmony_ci 1096bf215546Sopenharmony_ci /* We need to have 3 delay slots between a write to unifa and 1097bf215546Sopenharmony_ci * a follow-up ldunifa. 
1098bf215546Sopenharmony_ci */ 1099bf215546Sopenharmony_ci if ((inst->sig.ldunifa || inst->sig.ldunifarf) && 1100bf215546Sopenharmony_ci scoreboard->tick - scoreboard->last_unifa_write_tick <= 3) 1101bf215546Sopenharmony_ci continue; 1102bf215546Sopenharmony_ci 1103bf215546Sopenharmony_ci /* "An instruction must not read from a location in physical 1104bf215546Sopenharmony_ci * regfile A or B that was written to by the previous 1105bf215546Sopenharmony_ci * instruction." 1106bf215546Sopenharmony_ci */ 1107bf215546Sopenharmony_ci if (reads_too_soon_after_write(scoreboard, n->inst)) 1108bf215546Sopenharmony_ci continue; 1109bf215546Sopenharmony_ci 1110bf215546Sopenharmony_ci if (writes_too_soon_after_write(c->devinfo, scoreboard, n->inst)) 1111bf215546Sopenharmony_ci continue; 1112bf215546Sopenharmony_ci 1113bf215546Sopenharmony_ci /* "Before doing a TLB access a scoreboard wait must have been 1114bf215546Sopenharmony_ci * done. This happens either on the first or last thread 1115bf215546Sopenharmony_ci * switch, depending on a setting (scb_wait_on_first_thrsw) in 1116bf215546Sopenharmony_ci * the shader state." 1117bf215546Sopenharmony_ci */ 1118bf215546Sopenharmony_ci if (pixel_scoreboard_too_soon(c, scoreboard, inst)) 1119bf215546Sopenharmony_ci continue; 1120bf215546Sopenharmony_ci 1121bf215546Sopenharmony_ci /* ldunif and ldvary both write r5, but ldunif does so a tick 1122bf215546Sopenharmony_ci * sooner. If the ldvary's r5 wasn't used, then ldunif might 1123bf215546Sopenharmony_ci * otherwise get scheduled so ldunif and ldvary try to update 1124bf215546Sopenharmony_ci * r5 in the same tick. 
1125bf215546Sopenharmony_ci */ 1126bf215546Sopenharmony_ci if ((inst->sig.ldunif || inst->sig.ldunifa) && 1127bf215546Sopenharmony_ci scoreboard->tick == scoreboard->last_ldvary_tick + 1) { 1128bf215546Sopenharmony_ci continue; 1129bf215546Sopenharmony_ci } 1130bf215546Sopenharmony_ci 1131bf215546Sopenharmony_ci /* If we are in a thrsw delay slot check that this instruction 1132bf215546Sopenharmony_ci * is valid for that. 1133bf215546Sopenharmony_ci */ 1134bf215546Sopenharmony_ci if (scoreboard->last_thrsw_tick + 2 >= scoreboard->tick && 1135bf215546Sopenharmony_ci !qpu_inst_after_thrsw_valid_in_delay_slot(c, scoreboard, 1136bf215546Sopenharmony_ci n->inst)) { 1137bf215546Sopenharmony_ci continue; 1138bf215546Sopenharmony_ci } 1139bf215546Sopenharmony_ci 1140bf215546Sopenharmony_ci if (inst->type == V3D_QPU_INSTR_TYPE_BRANCH) { 1141bf215546Sopenharmony_ci /* Don't try to put a branch in the delay slots of another 1142bf215546Sopenharmony_ci * branch or a unifa write. 1143bf215546Sopenharmony_ci */ 1144bf215546Sopenharmony_ci if (scoreboard->last_branch_tick + 3 >= scoreboard->tick) 1145bf215546Sopenharmony_ci continue; 1146bf215546Sopenharmony_ci if (scoreboard->last_unifa_write_tick + 3 >= scoreboard->tick) 1147bf215546Sopenharmony_ci continue; 1148bf215546Sopenharmony_ci 1149bf215546Sopenharmony_ci /* No branch with cond != 0,2,3 and msfign != 0 after 1150bf215546Sopenharmony_ci * setmsf. 
1151bf215546Sopenharmony_ci */ 1152bf215546Sopenharmony_ci if (scoreboard->last_setmsf_tick == scoreboard->tick - 1 && 1153bf215546Sopenharmony_ci inst->branch.msfign != V3D_QPU_MSFIGN_NONE && 1154bf215546Sopenharmony_ci inst->branch.cond != V3D_QPU_BRANCH_COND_ALWAYS && 1155bf215546Sopenharmony_ci inst->branch.cond != V3D_QPU_BRANCH_COND_A0 && 1156bf215546Sopenharmony_ci inst->branch.cond != V3D_QPU_BRANCH_COND_NA0) { 1157bf215546Sopenharmony_ci continue; 1158bf215546Sopenharmony_ci } 1159bf215546Sopenharmony_ci } 1160bf215546Sopenharmony_ci 1161bf215546Sopenharmony_ci /* If we're trying to pair with another instruction, check 1162bf215546Sopenharmony_ci * that they're compatible. 1163bf215546Sopenharmony_ci */ 1164bf215546Sopenharmony_ci if (prev_inst) { 1165bf215546Sopenharmony_ci /* Don't pair up a thread switch signal -- we'll 1166bf215546Sopenharmony_ci * handle pairing it when we pick it on its own. 1167bf215546Sopenharmony_ci */ 1168bf215546Sopenharmony_ci if (inst->sig.thrsw) 1169bf215546Sopenharmony_ci continue; 1170bf215546Sopenharmony_ci 1171bf215546Sopenharmony_ci if (prev_inst->inst->uniform != -1 && 1172bf215546Sopenharmony_ci n->inst->uniform != -1) 1173bf215546Sopenharmony_ci continue; 1174bf215546Sopenharmony_ci 1175bf215546Sopenharmony_ci /* Simulator complains if we have two uniforms loaded in 1176bf215546Sopenharmony_ci * the the same instruction, which could happen if we 1177bf215546Sopenharmony_ci * have a ldunif or sideband uniform and we pair that 1178bf215546Sopenharmony_ci * with ldunifa. 
1179bf215546Sopenharmony_ci */ 1180bf215546Sopenharmony_ci if (vir_has_uniform(prev_inst->inst) && 1181bf215546Sopenharmony_ci (inst->sig.ldunifa || inst->sig.ldunifarf)) { 1182bf215546Sopenharmony_ci continue; 1183bf215546Sopenharmony_ci } 1184bf215546Sopenharmony_ci 1185bf215546Sopenharmony_ci if ((prev_inst->inst->qpu.sig.ldunifa || 1186bf215546Sopenharmony_ci prev_inst->inst->qpu.sig.ldunifarf) && 1187bf215546Sopenharmony_ci vir_has_uniform(n->inst)) { 1188bf215546Sopenharmony_ci continue; 1189bf215546Sopenharmony_ci } 1190bf215546Sopenharmony_ci 1191bf215546Sopenharmony_ci /* Don't merge TLB instructions before we have acquired 1192bf215546Sopenharmony_ci * the scoreboard lock. 1193bf215546Sopenharmony_ci */ 1194bf215546Sopenharmony_ci if (pixel_scoreboard_too_soon(c, scoreboard, inst)) 1195bf215546Sopenharmony_ci continue; 1196bf215546Sopenharmony_ci 1197bf215546Sopenharmony_ci /* When we succesfully pair up an ldvary we then try 1198bf215546Sopenharmony_ci * to merge it into the previous instruction if 1199bf215546Sopenharmony_ci * possible to improve pipelining. Don't pick up the 1200bf215546Sopenharmony_ci * ldvary now if the follow-up fixup would place 1201bf215546Sopenharmony_ci * it in the delay slots of a thrsw, which is not 1202bf215546Sopenharmony_ci * allowed and would prevent the fixup from being 1203bf215546Sopenharmony_ci * successul. 
1204bf215546Sopenharmony_ci */ 1205bf215546Sopenharmony_ci if (inst->sig.ldvary && 1206bf215546Sopenharmony_ci scoreboard->last_thrsw_tick + 2 >= scoreboard->tick - 1) { 1207bf215546Sopenharmony_ci continue; 1208bf215546Sopenharmony_ci } 1209bf215546Sopenharmony_ci 1210bf215546Sopenharmony_ci struct v3d_qpu_instr merged_inst; 1211bf215546Sopenharmony_ci if (!qpu_merge_inst(c->devinfo, &merged_inst, 1212bf215546Sopenharmony_ci &prev_inst->inst->qpu, inst)) { 1213bf215546Sopenharmony_ci continue; 1214bf215546Sopenharmony_ci } 1215bf215546Sopenharmony_ci } 1216bf215546Sopenharmony_ci 1217bf215546Sopenharmony_ci int prio = get_instruction_priority(c->devinfo, inst); 1218bf215546Sopenharmony_ci 1219bf215546Sopenharmony_ci if (mux_read_stalls(scoreboard, inst)) { 1220bf215546Sopenharmony_ci /* Don't merge an instruction that stalls */ 1221bf215546Sopenharmony_ci if (prev_inst) 1222bf215546Sopenharmony_ci continue; 1223bf215546Sopenharmony_ci else { 1224bf215546Sopenharmony_ci /* Any instruction that don't stall will have 1225bf215546Sopenharmony_ci * higher scheduling priority */ 1226bf215546Sopenharmony_ci prio -= MAX_SCHEDULE_PRIORITY; 1227bf215546Sopenharmony_ci assert(prio < 0); 1228bf215546Sopenharmony_ci } 1229bf215546Sopenharmony_ci } 1230bf215546Sopenharmony_ci 1231bf215546Sopenharmony_ci /* Found a valid instruction. If nothing better comes along, 1232bf215546Sopenharmony_ci * this one works. 
1233bf215546Sopenharmony_ci */ 1234bf215546Sopenharmony_ci if (!chosen) { 1235bf215546Sopenharmony_ci chosen = n; 1236bf215546Sopenharmony_ci chosen_prio = prio; 1237bf215546Sopenharmony_ci continue; 1238bf215546Sopenharmony_ci } 1239bf215546Sopenharmony_ci 1240bf215546Sopenharmony_ci if (prio > chosen_prio) { 1241bf215546Sopenharmony_ci chosen = n; 1242bf215546Sopenharmony_ci chosen_prio = prio; 1243bf215546Sopenharmony_ci } else if (prio < chosen_prio) { 1244bf215546Sopenharmony_ci continue; 1245bf215546Sopenharmony_ci } 1246bf215546Sopenharmony_ci 1247bf215546Sopenharmony_ci if (n->delay > chosen->delay) { 1248bf215546Sopenharmony_ci chosen = n; 1249bf215546Sopenharmony_ci chosen_prio = prio; 1250bf215546Sopenharmony_ci } else if (n->delay < chosen->delay) { 1251bf215546Sopenharmony_ci continue; 1252bf215546Sopenharmony_ci } 1253bf215546Sopenharmony_ci } 1254bf215546Sopenharmony_ci 1255bf215546Sopenharmony_ci /* If we did not find any instruction to schedule but we discarded 1256bf215546Sopenharmony_ci * some of them to prioritize ldvary pipelining, try again. 1257bf215546Sopenharmony_ci */ 1258bf215546Sopenharmony_ci if (!chosen && !prev_inst && skipped_insts_for_ldvary_pipelining) { 1259bf215546Sopenharmony_ci skipped_insts_for_ldvary_pipelining = false; 1260bf215546Sopenharmony_ci ldvary_pipelining = false; 1261bf215546Sopenharmony_ci goto retry; 1262bf215546Sopenharmony_ci } 1263bf215546Sopenharmony_ci 1264bf215546Sopenharmony_ci if (chosen && chosen->inst->qpu.sig.ldvary) { 1265bf215546Sopenharmony_ci scoreboard->ldvary_count++; 1266bf215546Sopenharmony_ci /* If we are pairing an ldvary, flag it so we can fix it up for 1267bf215546Sopenharmony_ci * optimal pipelining of ldvary sequences. 
1268bf215546Sopenharmony_ci */ 1269bf215546Sopenharmony_ci if (prev_inst) 1270bf215546Sopenharmony_ci scoreboard->fixup_ldvary = true; 1271bf215546Sopenharmony_ci } 1272bf215546Sopenharmony_ci 1273bf215546Sopenharmony_ci return chosen; 1274bf215546Sopenharmony_ci} 1275bf215546Sopenharmony_ci 1276bf215546Sopenharmony_cistatic void 1277bf215546Sopenharmony_ciupdate_scoreboard_for_magic_waddr(struct choose_scoreboard *scoreboard, 1278bf215546Sopenharmony_ci enum v3d_qpu_waddr waddr, 1279bf215546Sopenharmony_ci const struct v3d_device_info *devinfo) 1280bf215546Sopenharmony_ci{ 1281bf215546Sopenharmony_ci if (v3d_qpu_magic_waddr_is_sfu(waddr)) 1282bf215546Sopenharmony_ci scoreboard->last_magic_sfu_write_tick = scoreboard->tick; 1283bf215546Sopenharmony_ci else if (devinfo->ver >= 40 && waddr == V3D_QPU_WADDR_UNIFA) 1284bf215546Sopenharmony_ci scoreboard->last_unifa_write_tick = scoreboard->tick; 1285bf215546Sopenharmony_ci} 1286bf215546Sopenharmony_ci 1287bf215546Sopenharmony_cistatic void 1288bf215546Sopenharmony_ciupdate_scoreboard_for_sfu_stall_waddr(struct choose_scoreboard *scoreboard, 1289bf215546Sopenharmony_ci const struct v3d_qpu_instr *inst) 1290bf215546Sopenharmony_ci{ 1291bf215546Sopenharmony_ci if (v3d_qpu_instr_is_sfu(inst)) { 1292bf215546Sopenharmony_ci scoreboard->last_stallable_sfu_reg = inst->alu.add.waddr; 1293bf215546Sopenharmony_ci scoreboard->last_stallable_sfu_tick = scoreboard->tick; 1294bf215546Sopenharmony_ci } 1295bf215546Sopenharmony_ci} 1296bf215546Sopenharmony_ci 1297bf215546Sopenharmony_cistatic void 1298bf215546Sopenharmony_ciupdate_scoreboard_for_chosen(struct choose_scoreboard *scoreboard, 1299bf215546Sopenharmony_ci const struct v3d_qpu_instr *inst, 1300bf215546Sopenharmony_ci const struct v3d_device_info *devinfo) 1301bf215546Sopenharmony_ci{ 1302bf215546Sopenharmony_ci if (inst->type == V3D_QPU_INSTR_TYPE_BRANCH) 1303bf215546Sopenharmony_ci return; 1304bf215546Sopenharmony_ci 1305bf215546Sopenharmony_ci assert(inst->type == 
V3D_QPU_INSTR_TYPE_ALU); 1306bf215546Sopenharmony_ci 1307bf215546Sopenharmony_ci if (inst->alu.add.op != V3D_QPU_A_NOP) { 1308bf215546Sopenharmony_ci if (inst->alu.add.magic_write) { 1309bf215546Sopenharmony_ci update_scoreboard_for_magic_waddr(scoreboard, 1310bf215546Sopenharmony_ci inst->alu.add.waddr, 1311bf215546Sopenharmony_ci devinfo); 1312bf215546Sopenharmony_ci } else { 1313bf215546Sopenharmony_ci update_scoreboard_for_sfu_stall_waddr(scoreboard, 1314bf215546Sopenharmony_ci inst); 1315bf215546Sopenharmony_ci } 1316bf215546Sopenharmony_ci 1317bf215546Sopenharmony_ci if (inst->alu.add.op == V3D_QPU_A_SETMSF) 1318bf215546Sopenharmony_ci scoreboard->last_setmsf_tick = scoreboard->tick; 1319bf215546Sopenharmony_ci } 1320bf215546Sopenharmony_ci 1321bf215546Sopenharmony_ci if (inst->alu.mul.op != V3D_QPU_M_NOP) { 1322bf215546Sopenharmony_ci if (inst->alu.mul.magic_write) { 1323bf215546Sopenharmony_ci update_scoreboard_for_magic_waddr(scoreboard, 1324bf215546Sopenharmony_ci inst->alu.mul.waddr, 1325bf215546Sopenharmony_ci devinfo); 1326bf215546Sopenharmony_ci } 1327bf215546Sopenharmony_ci } 1328bf215546Sopenharmony_ci 1329bf215546Sopenharmony_ci if (inst->sig.ldvary) 1330bf215546Sopenharmony_ci scoreboard->last_ldvary_tick = scoreboard->tick; 1331bf215546Sopenharmony_ci} 1332bf215546Sopenharmony_ci 1333bf215546Sopenharmony_cistatic void 1334bf215546Sopenharmony_cidump_state(const struct v3d_device_info *devinfo, struct dag *dag) 1335bf215546Sopenharmony_ci{ 1336bf215546Sopenharmony_ci list_for_each_entry(struct schedule_node, n, &dag->heads, dag.link) { 1337bf215546Sopenharmony_ci fprintf(stderr, " t=%4d: ", n->unblocked_time); 1338bf215546Sopenharmony_ci v3d_qpu_dump(devinfo, &n->inst->qpu); 1339bf215546Sopenharmony_ci fprintf(stderr, "\n"); 1340bf215546Sopenharmony_ci 1341bf215546Sopenharmony_ci util_dynarray_foreach(&n->dag.edges, struct dag_edge, edge) { 1342bf215546Sopenharmony_ci struct schedule_node *child = 1343bf215546Sopenharmony_ci (struct schedule_node 
*)edge->child; 1344bf215546Sopenharmony_ci if (!child) 1345bf215546Sopenharmony_ci continue; 1346bf215546Sopenharmony_ci 1347bf215546Sopenharmony_ci fprintf(stderr, " - "); 1348bf215546Sopenharmony_ci v3d_qpu_dump(devinfo, &child->inst->qpu); 1349bf215546Sopenharmony_ci fprintf(stderr, " (%d parents, %c)\n", 1350bf215546Sopenharmony_ci child->dag.parent_count, 1351bf215546Sopenharmony_ci edge->data ? 'w' : 'r'); 1352bf215546Sopenharmony_ci } 1353bf215546Sopenharmony_ci } 1354bf215546Sopenharmony_ci} 1355bf215546Sopenharmony_ci 1356bf215546Sopenharmony_cistatic uint32_t magic_waddr_latency(const struct v3d_device_info *devinfo, 1357bf215546Sopenharmony_ci enum v3d_qpu_waddr waddr, 1358bf215546Sopenharmony_ci const struct v3d_qpu_instr *after) 1359bf215546Sopenharmony_ci{ 1360bf215546Sopenharmony_ci /* Apply some huge latency between texture fetch requests and getting 1361bf215546Sopenharmony_ci * their results back. 1362bf215546Sopenharmony_ci * 1363bf215546Sopenharmony_ci * FIXME: This is actually pretty bogus. If we do: 1364bf215546Sopenharmony_ci * 1365bf215546Sopenharmony_ci * mov tmu0_s, a 1366bf215546Sopenharmony_ci * <a bit of math> 1367bf215546Sopenharmony_ci * mov tmu0_s, b 1368bf215546Sopenharmony_ci * load_tmu0 1369bf215546Sopenharmony_ci * <more math> 1370bf215546Sopenharmony_ci * load_tmu0 1371bf215546Sopenharmony_ci * 1372bf215546Sopenharmony_ci * we count that as worse than 1373bf215546Sopenharmony_ci * 1374bf215546Sopenharmony_ci * mov tmu0_s, a 1375bf215546Sopenharmony_ci * mov tmu0_s, b 1376bf215546Sopenharmony_ci * <lots of math> 1377bf215546Sopenharmony_ci * load_tmu0 1378bf215546Sopenharmony_ci * <more math> 1379bf215546Sopenharmony_ci * load_tmu0 1380bf215546Sopenharmony_ci * 1381bf215546Sopenharmony_ci * because we associate the first load_tmu0 with the *second* tmu0_s. 
1382bf215546Sopenharmony_ci */ 1383bf215546Sopenharmony_ci if (v3d_qpu_magic_waddr_is_tmu(devinfo, waddr) && 1384bf215546Sopenharmony_ci v3d_qpu_waits_on_tmu(after)) { 1385bf215546Sopenharmony_ci return 100; 1386bf215546Sopenharmony_ci } 1387bf215546Sopenharmony_ci 1388bf215546Sopenharmony_ci /* Assume that anything depending on us is consuming the SFU result. */ 1389bf215546Sopenharmony_ci if (v3d_qpu_magic_waddr_is_sfu(waddr)) 1390bf215546Sopenharmony_ci return 3; 1391bf215546Sopenharmony_ci 1392bf215546Sopenharmony_ci return 1; 1393bf215546Sopenharmony_ci} 1394bf215546Sopenharmony_ci 1395bf215546Sopenharmony_cistatic uint32_t 1396bf215546Sopenharmony_ciinstruction_latency(const struct v3d_device_info *devinfo, 1397bf215546Sopenharmony_ci struct schedule_node *before, struct schedule_node *after) 1398bf215546Sopenharmony_ci{ 1399bf215546Sopenharmony_ci const struct v3d_qpu_instr *before_inst = &before->inst->qpu; 1400bf215546Sopenharmony_ci const struct v3d_qpu_instr *after_inst = &after->inst->qpu; 1401bf215546Sopenharmony_ci uint32_t latency = 1; 1402bf215546Sopenharmony_ci 1403bf215546Sopenharmony_ci if (before_inst->type != V3D_QPU_INSTR_TYPE_ALU || 1404bf215546Sopenharmony_ci after_inst->type != V3D_QPU_INSTR_TYPE_ALU) 1405bf215546Sopenharmony_ci return latency; 1406bf215546Sopenharmony_ci 1407bf215546Sopenharmony_ci if (before_inst->alu.add.magic_write) { 1408bf215546Sopenharmony_ci latency = MAX2(latency, 1409bf215546Sopenharmony_ci magic_waddr_latency(devinfo, 1410bf215546Sopenharmony_ci before_inst->alu.add.waddr, 1411bf215546Sopenharmony_ci after_inst)); 1412bf215546Sopenharmony_ci } 1413bf215546Sopenharmony_ci 1414bf215546Sopenharmony_ci if (before_inst->alu.mul.magic_write) { 1415bf215546Sopenharmony_ci latency = MAX2(latency, 1416bf215546Sopenharmony_ci magic_waddr_latency(devinfo, 1417bf215546Sopenharmony_ci before_inst->alu.mul.waddr, 1418bf215546Sopenharmony_ci after_inst)); 1419bf215546Sopenharmony_ci } 1420bf215546Sopenharmony_ci 
1421bf215546Sopenharmony_ci if (v3d_qpu_instr_is_sfu(before_inst)) 1422bf215546Sopenharmony_ci return 2; 1423bf215546Sopenharmony_ci 1424bf215546Sopenharmony_ci return latency; 1425bf215546Sopenharmony_ci} 1426bf215546Sopenharmony_ci 1427bf215546Sopenharmony_ci/** Recursive computation of the delay member of a node. */ 1428bf215546Sopenharmony_cistatic void 1429bf215546Sopenharmony_cicompute_delay(struct dag_node *node, void *state) 1430bf215546Sopenharmony_ci{ 1431bf215546Sopenharmony_ci struct schedule_node *n = (struct schedule_node *)node; 1432bf215546Sopenharmony_ci struct v3d_compile *c = (struct v3d_compile *) state; 1433bf215546Sopenharmony_ci 1434bf215546Sopenharmony_ci n->delay = 1; 1435bf215546Sopenharmony_ci 1436bf215546Sopenharmony_ci util_dynarray_foreach(&n->dag.edges, struct dag_edge, edge) { 1437bf215546Sopenharmony_ci struct schedule_node *child = 1438bf215546Sopenharmony_ci (struct schedule_node *)edge->child; 1439bf215546Sopenharmony_ci 1440bf215546Sopenharmony_ci n->delay = MAX2(n->delay, (child->delay + 1441bf215546Sopenharmony_ci instruction_latency(c->devinfo, n, 1442bf215546Sopenharmony_ci child))); 1443bf215546Sopenharmony_ci } 1444bf215546Sopenharmony_ci} 1445bf215546Sopenharmony_ci 1446bf215546Sopenharmony_ci/* Removes a DAG head, but removing only the WAR edges. (dag_prune_head() 1447bf215546Sopenharmony_ci * should be called on it later to finish pruning the other edges). 
1448bf215546Sopenharmony_ci */ 1449bf215546Sopenharmony_cistatic void 1450bf215546Sopenharmony_cipre_remove_head(struct dag *dag, struct schedule_node *n) 1451bf215546Sopenharmony_ci{ 1452bf215546Sopenharmony_ci list_delinit(&n->dag.link); 1453bf215546Sopenharmony_ci 1454bf215546Sopenharmony_ci util_dynarray_foreach(&n->dag.edges, struct dag_edge, edge) { 1455bf215546Sopenharmony_ci if (edge->data) 1456bf215546Sopenharmony_ci dag_remove_edge(dag, edge); 1457bf215546Sopenharmony_ci } 1458bf215546Sopenharmony_ci} 1459bf215546Sopenharmony_ci 1460bf215546Sopenharmony_cistatic void 1461bf215546Sopenharmony_cimark_instruction_scheduled(const struct v3d_device_info *devinfo, 1462bf215546Sopenharmony_ci struct dag *dag, 1463bf215546Sopenharmony_ci uint32_t time, 1464bf215546Sopenharmony_ci struct schedule_node *node) 1465bf215546Sopenharmony_ci{ 1466bf215546Sopenharmony_ci if (!node) 1467bf215546Sopenharmony_ci return; 1468bf215546Sopenharmony_ci 1469bf215546Sopenharmony_ci util_dynarray_foreach(&node->dag.edges, struct dag_edge, edge) { 1470bf215546Sopenharmony_ci struct schedule_node *child = 1471bf215546Sopenharmony_ci (struct schedule_node *)edge->child; 1472bf215546Sopenharmony_ci 1473bf215546Sopenharmony_ci if (!child) 1474bf215546Sopenharmony_ci continue; 1475bf215546Sopenharmony_ci 1476bf215546Sopenharmony_ci uint32_t latency = instruction_latency(devinfo, node, child); 1477bf215546Sopenharmony_ci 1478bf215546Sopenharmony_ci child->unblocked_time = MAX2(child->unblocked_time, 1479bf215546Sopenharmony_ci time + latency); 1480bf215546Sopenharmony_ci } 1481bf215546Sopenharmony_ci dag_prune_head(dag, &node->dag); 1482bf215546Sopenharmony_ci} 1483bf215546Sopenharmony_ci 1484bf215546Sopenharmony_cistatic void 1485bf215546Sopenharmony_ciinsert_scheduled_instruction(struct v3d_compile *c, 1486bf215546Sopenharmony_ci struct qblock *block, 1487bf215546Sopenharmony_ci struct choose_scoreboard *scoreboard, 1488bf215546Sopenharmony_ci struct qinst *inst) 
1489bf215546Sopenharmony_ci{ 1490bf215546Sopenharmony_ci list_addtail(&inst->link, &block->instructions); 1491bf215546Sopenharmony_ci 1492bf215546Sopenharmony_ci update_scoreboard_for_chosen(scoreboard, &inst->qpu, c->devinfo); 1493bf215546Sopenharmony_ci c->qpu_inst_count++; 1494bf215546Sopenharmony_ci scoreboard->tick++; 1495bf215546Sopenharmony_ci} 1496bf215546Sopenharmony_ci 1497bf215546Sopenharmony_cistatic struct qinst * 1498bf215546Sopenharmony_civir_nop() 1499bf215546Sopenharmony_ci{ 1500bf215546Sopenharmony_ci struct qreg undef = vir_nop_reg(); 1501bf215546Sopenharmony_ci struct qinst *qinst = vir_add_inst(V3D_QPU_A_NOP, undef, undef, undef); 1502bf215546Sopenharmony_ci 1503bf215546Sopenharmony_ci return qinst; 1504bf215546Sopenharmony_ci} 1505bf215546Sopenharmony_ci 1506bf215546Sopenharmony_cistatic void 1507bf215546Sopenharmony_ciemit_nop(struct v3d_compile *c, struct qblock *block, 1508bf215546Sopenharmony_ci struct choose_scoreboard *scoreboard) 1509bf215546Sopenharmony_ci{ 1510bf215546Sopenharmony_ci insert_scheduled_instruction(c, block, scoreboard, vir_nop()); 1511bf215546Sopenharmony_ci} 1512bf215546Sopenharmony_ci 1513bf215546Sopenharmony_cistatic bool 1514bf215546Sopenharmony_ciqpu_inst_valid_in_thrend_slot(struct v3d_compile *c, 1515bf215546Sopenharmony_ci const struct qinst *qinst, int slot) 1516bf215546Sopenharmony_ci{ 1517bf215546Sopenharmony_ci const struct v3d_qpu_instr *inst = &qinst->qpu; 1518bf215546Sopenharmony_ci 1519bf215546Sopenharmony_ci if (slot == 2 && qinst->is_tlb_z_write) 1520bf215546Sopenharmony_ci return false; 1521bf215546Sopenharmony_ci 1522bf215546Sopenharmony_ci if (slot > 0 && qinst->uniform != ~0) 1523bf215546Sopenharmony_ci return false; 1524bf215546Sopenharmony_ci 1525bf215546Sopenharmony_ci if (v3d_qpu_waits_vpm(inst)) 1526bf215546Sopenharmony_ci return false; 1527bf215546Sopenharmony_ci 1528bf215546Sopenharmony_ci if (inst->sig.ldvary) 1529bf215546Sopenharmony_ci return false; 1530bf215546Sopenharmony_ci 
1531bf215546Sopenharmony_ci if (inst->type == V3D_QPU_INSTR_TYPE_ALU) { 1532bf215546Sopenharmony_ci /* GFXH-1625: TMUWT not allowed in the final instruction. */ 1533bf215546Sopenharmony_ci if (slot == 2 && inst->alu.add.op == V3D_QPU_A_TMUWT) 1534bf215546Sopenharmony_ci return false; 1535bf215546Sopenharmony_ci 1536bf215546Sopenharmony_ci /* No writing physical registers at the end. */ 1537bf215546Sopenharmony_ci if (!inst->alu.add.magic_write || 1538bf215546Sopenharmony_ci !inst->alu.mul.magic_write) { 1539bf215546Sopenharmony_ci return false; 1540bf215546Sopenharmony_ci } 1541bf215546Sopenharmony_ci 1542bf215546Sopenharmony_ci if (v3d_qpu_sig_writes_address(c->devinfo, &inst->sig) && 1543bf215546Sopenharmony_ci !inst->sig_magic) { 1544bf215546Sopenharmony_ci return false; 1545bf215546Sopenharmony_ci } 1546bf215546Sopenharmony_ci 1547bf215546Sopenharmony_ci if (c->devinfo->ver < 40 && inst->alu.add.op == V3D_QPU_A_SETMSF) 1548bf215546Sopenharmony_ci return false; 1549bf215546Sopenharmony_ci 1550bf215546Sopenharmony_ci /* RF0-2 might be overwritten during the delay slots by 1551bf215546Sopenharmony_ci * fragment shader setup. 
1552bf215546Sopenharmony_ci */ 1553bf215546Sopenharmony_ci if (inst->raddr_a < 3 && 1554bf215546Sopenharmony_ci (inst->alu.add.a == V3D_QPU_MUX_A || 1555bf215546Sopenharmony_ci inst->alu.add.b == V3D_QPU_MUX_A || 1556bf215546Sopenharmony_ci inst->alu.mul.a == V3D_QPU_MUX_A || 1557bf215546Sopenharmony_ci inst->alu.mul.b == V3D_QPU_MUX_A)) { 1558bf215546Sopenharmony_ci return false; 1559bf215546Sopenharmony_ci } 1560bf215546Sopenharmony_ci 1561bf215546Sopenharmony_ci if (inst->raddr_b < 3 && 1562bf215546Sopenharmony_ci !inst->sig.small_imm && 1563bf215546Sopenharmony_ci (inst->alu.add.a == V3D_QPU_MUX_B || 1564bf215546Sopenharmony_ci inst->alu.add.b == V3D_QPU_MUX_B || 1565bf215546Sopenharmony_ci inst->alu.mul.a == V3D_QPU_MUX_B || 1566bf215546Sopenharmony_ci inst->alu.mul.b == V3D_QPU_MUX_B)) { 1567bf215546Sopenharmony_ci return false; 1568bf215546Sopenharmony_ci } 1569bf215546Sopenharmony_ci } 1570bf215546Sopenharmony_ci 1571bf215546Sopenharmony_ci return true; 1572bf215546Sopenharmony_ci} 1573bf215546Sopenharmony_ci 1574bf215546Sopenharmony_ci/** 1575bf215546Sopenharmony_ci * This is called when trying to merge a thrsw back into the instruction stream 1576bf215546Sopenharmony_ci * of instructions that were scheduled *before* the thrsw signal to fill its 1577bf215546Sopenharmony_ci * delay slots. Because the actual execution of the thrsw happens after the 1578bf215546Sopenharmony_ci * delay slots, it is usually safe to do this, but there are some cases that 1579bf215546Sopenharmony_ci * need special care. 1580bf215546Sopenharmony_ci */ 1581bf215546Sopenharmony_cistatic bool 1582bf215546Sopenharmony_ciqpu_inst_before_thrsw_valid_in_delay_slot(struct v3d_compile *c, 1583bf215546Sopenharmony_ci const struct qinst *qinst, 1584bf215546Sopenharmony_ci uint32_t slot) 1585bf215546Sopenharmony_ci{ 1586bf215546Sopenharmony_ci /* No scheduling SFU when the result would land in the other 1587bf215546Sopenharmony_ci * thread. 
The simulator complains for safety, though it 1588bf215546Sopenharmony_ci * would only occur for dead code in our case. 1589bf215546Sopenharmony_ci */ 1590bf215546Sopenharmony_ci if (slot > 0 && 1591bf215546Sopenharmony_ci qinst->qpu.type == V3D_QPU_INSTR_TYPE_ALU && 1592bf215546Sopenharmony_ci (v3d_qpu_magic_waddr_is_sfu(qinst->qpu.alu.add.waddr) || 1593bf215546Sopenharmony_ci v3d_qpu_magic_waddr_is_sfu(qinst->qpu.alu.mul.waddr))) { 1594bf215546Sopenharmony_ci return false; 1595bf215546Sopenharmony_ci } 1596bf215546Sopenharmony_ci 1597bf215546Sopenharmony_ci if (slot > 0 && qinst->qpu.sig.ldvary) 1598bf215546Sopenharmony_ci return false; 1599bf215546Sopenharmony_ci 1600bf215546Sopenharmony_ci /* unifa and the following 3 instructions can't overlap a 1601bf215546Sopenharmony_ci * thread switch/end. The docs further clarify that this means 1602bf215546Sopenharmony_ci * the cycle at which the actual thread switch/end happens 1603bf215546Sopenharmony_ci * and not when the thrsw instruction is processed, which would 1604bf215546Sopenharmony_ci * be after the 2 delay slots following the thrsw instruction. 
1605bf215546Sopenharmony_ci * This means that we can move up a thrsw up to the instruction 1606bf215546Sopenharmony_ci * right after unifa: 1607bf215546Sopenharmony_ci * 1608bf215546Sopenharmony_ci * unifa, r5 1609bf215546Sopenharmony_ci * thrsw 1610bf215546Sopenharmony_ci * delay slot 1 1611bf215546Sopenharmony_ci * delay slot 2 1612bf215546Sopenharmony_ci * Thread switch happens here, 4 instructions away from unifa 1613bf215546Sopenharmony_ci */ 1614bf215546Sopenharmony_ci if (v3d_qpu_writes_unifa(c->devinfo, &qinst->qpu)) 1615bf215546Sopenharmony_ci return false; 1616bf215546Sopenharmony_ci 1617bf215546Sopenharmony_ci return true; 1618bf215546Sopenharmony_ci} 1619bf215546Sopenharmony_ci 1620bf215546Sopenharmony_ci/** 1621bf215546Sopenharmony_ci * This is called for instructions scheduled *after* a thrsw signal that may 1622bf215546Sopenharmony_ci * land in the delay slots of the thrsw. Because these instructions were 1623bf215546Sopenharmony_ci * scheduled after the thrsw, we need to be careful when placing them into 1624bf215546Sopenharmony_ci * the delay slots, since that means that we are moving them ahead of the 1625bf215546Sopenharmony_ci * thread switch and we need to ensure that is not a problem. 1626bf215546Sopenharmony_ci */ 1627bf215546Sopenharmony_cistatic bool 1628bf215546Sopenharmony_ciqpu_inst_after_thrsw_valid_in_delay_slot(struct v3d_compile *c, 1629bf215546Sopenharmony_ci struct choose_scoreboard *scoreboard, 1630bf215546Sopenharmony_ci const struct qinst *qinst) 1631bf215546Sopenharmony_ci{ 1632bf215546Sopenharmony_ci const uint32_t slot = scoreboard->tick - scoreboard->last_thrsw_tick; 1633bf215546Sopenharmony_ci assert(slot <= 2); 1634bf215546Sopenharmony_ci 1635bf215546Sopenharmony_ci /* We merge thrsw instructions back into the instruction stream 1636bf215546Sopenharmony_ci * manually, so any instructions scheduled after a thrsw shold be 1637bf215546Sopenharmony_ci * in the actual delay slots and not in the same slot as the thrsw. 
1638bf215546Sopenharmony_ci */ 1639bf215546Sopenharmony_ci assert(slot >= 1); 1640bf215546Sopenharmony_ci 1641bf215546Sopenharmony_ci /* No emitting a thrsw while the previous thrsw hasn't happened yet. */ 1642bf215546Sopenharmony_ci if (qinst->qpu.sig.thrsw) 1643bf215546Sopenharmony_ci return false; 1644bf215546Sopenharmony_ci 1645bf215546Sopenharmony_ci /* The restrictions for instructions scheduled before the the thrsw 1646bf215546Sopenharmony_ci * also apply to instructions scheduled after the thrsw that we want 1647bf215546Sopenharmony_ci * to place in its delay slots. 1648bf215546Sopenharmony_ci */ 1649bf215546Sopenharmony_ci if (!qpu_inst_before_thrsw_valid_in_delay_slot(c, qinst, slot)) 1650bf215546Sopenharmony_ci return false; 1651bf215546Sopenharmony_ci 1652bf215546Sopenharmony_ci /* TLB access is disallowed until scoreboard wait is executed, which 1653bf215546Sopenharmony_ci * we do on the last thread switch. 1654bf215546Sopenharmony_ci */ 1655bf215546Sopenharmony_ci if (qpu_inst_is_tlb(&qinst->qpu)) 1656bf215546Sopenharmony_ci return false; 1657bf215546Sopenharmony_ci 1658bf215546Sopenharmony_ci /* Instruction sequence restrictions: Branch is not allowed in delay 1659bf215546Sopenharmony_ci * slots of a thrsw. 1660bf215546Sopenharmony_ci */ 1661bf215546Sopenharmony_ci if (qinst->qpu.type == V3D_QPU_INSTR_TYPE_BRANCH) 1662bf215546Sopenharmony_ci return false; 1663bf215546Sopenharmony_ci 1664bf215546Sopenharmony_ci /* Miscellaneous restrictions: At the point of a thrsw we need to have 1665bf215546Sopenharmony_ci * at least one outstanding lookup or TSY wait. 1666bf215546Sopenharmony_ci * 1667bf215546Sopenharmony_ci * So avoid placing TMU instructions scheduled after the thrsw into 1668bf215546Sopenharmony_ci * its delay slots or we may be compromising the integrity of our TMU 1669bf215546Sopenharmony_ci * sequences. 
Also, notice that if we moved these instructions into 1670bf215546Sopenharmony_ci * the delay slots of a previous thrsw we could overflow our TMU output 1671bf215546Sopenharmony_ci * fifo, since we could be effectively pipelining a lookup scheduled 1672bf215546Sopenharmony_ci * after the thrsw into the sequence before the thrsw. 1673bf215546Sopenharmony_ci */ 1674bf215546Sopenharmony_ci if (v3d_qpu_writes_tmu(c->devinfo, &qinst->qpu) || 1675bf215546Sopenharmony_ci qinst->qpu.sig.wrtmuc) { 1676bf215546Sopenharmony_ci return false; 1677bf215546Sopenharmony_ci } 1678bf215546Sopenharmony_ci 1679bf215546Sopenharmony_ci /* Don't move instructions that wait on the TMU before the thread switch 1680bf215546Sopenharmony_ci * happens since that would make the current thread stall before the 1681bf215546Sopenharmony_ci * switch, which is exactly what we want to avoid with the thrsw 1682bf215546Sopenharmony_ci * instruction. 1683bf215546Sopenharmony_ci */ 1684bf215546Sopenharmony_ci if (v3d_qpu_waits_on_tmu(&qinst->qpu)) 1685bf215546Sopenharmony_ci return false; 1686bf215546Sopenharmony_ci 1687bf215546Sopenharmony_ci /* A thread switch invalidates all accumulators, so don't place any 1688bf215546Sopenharmony_ci * instructions that write accumulators into the delay slots. 1689bf215546Sopenharmony_ci */ 1690bf215546Sopenharmony_ci if (v3d_qpu_writes_accum(c->devinfo, &qinst->qpu)) 1691bf215546Sopenharmony_ci return false; 1692bf215546Sopenharmony_ci 1693bf215546Sopenharmony_ci /* Multop has an implicit write to the rtop register which is an 1694bf215546Sopenharmony_ci * specialized accumulator that is only used with this instruction. 1695bf215546Sopenharmony_ci */ 1696bf215546Sopenharmony_ci if (qinst->qpu.alu.mul.op == V3D_QPU_M_MULTOP) 1697bf215546Sopenharmony_ci return false; 1698bf215546Sopenharmony_ci 1699bf215546Sopenharmony_ci /* Flags are invalidated across a thread switch, so dont' place 1700bf215546Sopenharmony_ci * instructions that write flags into delay slots. 
1701bf215546Sopenharmony_ci */ 1702bf215546Sopenharmony_ci if (v3d_qpu_writes_flags(&qinst->qpu)) 1703bf215546Sopenharmony_ci return false; 1704bf215546Sopenharmony_ci 1705bf215546Sopenharmony_ci /* TSY sync ops materialize at the point of the next thread switch, 1706bf215546Sopenharmony_ci * therefore, if we have a TSY sync right after a thread switch, we 1707bf215546Sopenharmony_ci * cannot place it in its delay slots, or we would be moving the sync 1708bf215546Sopenharmony_ci * to the thrsw before it instead. 1709bf215546Sopenharmony_ci */ 1710bf215546Sopenharmony_ci if (qinst->qpu.alu.add.op == V3D_QPU_A_BARRIERID) 1711bf215546Sopenharmony_ci return false; 1712bf215546Sopenharmony_ci 1713bf215546Sopenharmony_ci return true; 1714bf215546Sopenharmony_ci} 1715bf215546Sopenharmony_ci 1716bf215546Sopenharmony_cistatic bool 1717bf215546Sopenharmony_civalid_thrsw_sequence(struct v3d_compile *c, struct choose_scoreboard *scoreboard, 1718bf215546Sopenharmony_ci struct qinst *qinst, int instructions_in_sequence, 1719bf215546Sopenharmony_ci bool is_thrend) 1720bf215546Sopenharmony_ci{ 1721bf215546Sopenharmony_ci /* No emitting our thrsw while the previous thrsw hasn't happened yet. 
*/ 1722bf215546Sopenharmony_ci if (scoreboard->last_thrsw_tick + 3 > 1723bf215546Sopenharmony_ci scoreboard->tick - instructions_in_sequence) { 1724bf215546Sopenharmony_ci return false; 1725bf215546Sopenharmony_ci } 1726bf215546Sopenharmony_ci 1727bf215546Sopenharmony_ci for (int slot = 0; slot < instructions_in_sequence; slot++) { 1728bf215546Sopenharmony_ci if (!qpu_inst_before_thrsw_valid_in_delay_slot(c, qinst, slot)) 1729bf215546Sopenharmony_ci return false; 1730bf215546Sopenharmony_ci 1731bf215546Sopenharmony_ci if (is_thrend && 1732bf215546Sopenharmony_ci !qpu_inst_valid_in_thrend_slot(c, qinst, slot)) { 1733bf215546Sopenharmony_ci return false; 1734bf215546Sopenharmony_ci } 1735bf215546Sopenharmony_ci 1736bf215546Sopenharmony_ci /* Note that the list is circular, so we can only do this up 1737bf215546Sopenharmony_ci * to instructions_in_sequence. 1738bf215546Sopenharmony_ci */ 1739bf215546Sopenharmony_ci qinst = (struct qinst *)qinst->link.next; 1740bf215546Sopenharmony_ci } 1741bf215546Sopenharmony_ci 1742bf215546Sopenharmony_ci return true; 1743bf215546Sopenharmony_ci} 1744bf215546Sopenharmony_ci 1745bf215546Sopenharmony_ci/** 1746bf215546Sopenharmony_ci * Emits a THRSW signal in the stream, trying to move it up to pair with 1747bf215546Sopenharmony_ci * another instruction. 1748bf215546Sopenharmony_ci */ 1749bf215546Sopenharmony_cistatic int 1750bf215546Sopenharmony_ciemit_thrsw(struct v3d_compile *c, 1751bf215546Sopenharmony_ci struct qblock *block, 1752bf215546Sopenharmony_ci struct choose_scoreboard *scoreboard, 1753bf215546Sopenharmony_ci struct qinst *inst, 1754bf215546Sopenharmony_ci bool is_thrend) 1755bf215546Sopenharmony_ci{ 1756bf215546Sopenharmony_ci int time = 0; 1757bf215546Sopenharmony_ci 1758bf215546Sopenharmony_ci /* There should be nothing in a thrsw inst being scheduled other than 1759bf215546Sopenharmony_ci * the signal bits. 
1760bf215546Sopenharmony_ci */ 1761bf215546Sopenharmony_ci assert(inst->qpu.type == V3D_QPU_INSTR_TYPE_ALU); 1762bf215546Sopenharmony_ci assert(inst->qpu.alu.add.op == V3D_QPU_A_NOP); 1763bf215546Sopenharmony_ci assert(inst->qpu.alu.mul.op == V3D_QPU_M_NOP); 1764bf215546Sopenharmony_ci 1765bf215546Sopenharmony_ci /* Don't try to emit a thrsw in the delay slots of a previous thrsw 1766bf215546Sopenharmony_ci * or branch. 1767bf215546Sopenharmony_ci */ 1768bf215546Sopenharmony_ci while (scoreboard->last_thrsw_tick + 2 >= scoreboard->tick) { 1769bf215546Sopenharmony_ci emit_nop(c, block, scoreboard); 1770bf215546Sopenharmony_ci time++; 1771bf215546Sopenharmony_ci } 1772bf215546Sopenharmony_ci while (scoreboard->last_branch_tick + 3 >= scoreboard->tick) { 1773bf215546Sopenharmony_ci emit_nop(c, block, scoreboard); 1774bf215546Sopenharmony_ci time++; 1775bf215546Sopenharmony_ci } 1776bf215546Sopenharmony_ci 1777bf215546Sopenharmony_ci /* Find how far back into previous instructions we can put the THRSW. 
*/ 1778bf215546Sopenharmony_ci int slots_filled = 0; 1779bf215546Sopenharmony_ci int invalid_sig_count = 0; 1780bf215546Sopenharmony_ci bool last_thrsw_after_invalid_ok = false; 1781bf215546Sopenharmony_ci struct qinst *merge_inst = NULL; 1782bf215546Sopenharmony_ci vir_for_each_inst_rev(prev_inst, block) { 1783bf215546Sopenharmony_ci if (!valid_thrsw_sequence(c, scoreboard, 1784bf215546Sopenharmony_ci prev_inst, slots_filled + 1, 1785bf215546Sopenharmony_ci is_thrend)) { 1786bf215546Sopenharmony_ci break; 1787bf215546Sopenharmony_ci } 1788bf215546Sopenharmony_ci 1789bf215546Sopenharmony_ci struct v3d_qpu_sig sig = prev_inst->qpu.sig; 1790bf215546Sopenharmony_ci sig.thrsw = true; 1791bf215546Sopenharmony_ci uint32_t packed_sig; 1792bf215546Sopenharmony_ci if (!v3d_qpu_sig_pack(c->devinfo, &sig, &packed_sig)) { 1793bf215546Sopenharmony_ci /* If we can't merge the thrsw here because of signal 1794bf215546Sopenharmony_ci * incompatibility, keep going, we might be able to 1795bf215546Sopenharmony_ci * merge it in an earlier instruction. 1796bf215546Sopenharmony_ci */ 1797bf215546Sopenharmony_ci invalid_sig_count++; 1798bf215546Sopenharmony_ci goto cont_block; 1799bf215546Sopenharmony_ci } 1800bf215546Sopenharmony_ci 1801bf215546Sopenharmony_ci /* For last thrsw we need 2 consecutive slots that are 1802bf215546Sopenharmony_ci * thrsw compatible, so if we have previously jumped over 1803bf215546Sopenharmony_ci * an incompatible signal, flag that we have found the first 1804bf215546Sopenharmony_ci * valid slot here and keep going. 
1805bf215546Sopenharmony_ci */ 1806bf215546Sopenharmony_ci if (inst->is_last_thrsw && invalid_sig_count > 0 && 1807bf215546Sopenharmony_ci !last_thrsw_after_invalid_ok) { 1808bf215546Sopenharmony_ci last_thrsw_after_invalid_ok = true; 1809bf215546Sopenharmony_ci invalid_sig_count++; 1810bf215546Sopenharmony_ci goto cont_block; 1811bf215546Sopenharmony_ci } 1812bf215546Sopenharmony_ci 1813bf215546Sopenharmony_ci last_thrsw_after_invalid_ok = false; 1814bf215546Sopenharmony_ci invalid_sig_count = 0; 1815bf215546Sopenharmony_ci merge_inst = prev_inst; 1816bf215546Sopenharmony_ci 1817bf215546Sopenharmony_cicont_block: 1818bf215546Sopenharmony_ci if (++slots_filled == 3) 1819bf215546Sopenharmony_ci break; 1820bf215546Sopenharmony_ci } 1821bf215546Sopenharmony_ci 1822bf215546Sopenharmony_ci /* If we jumped over a signal incompatibility and did not manage to 1823bf215546Sopenharmony_ci * merge the thrsw in the end, we need to adjust slots filled to match 1824bf215546Sopenharmony_ci * the last valid merge point. 
1825bf215546Sopenharmony_ci */ 1826bf215546Sopenharmony_ci assert(invalid_sig_count == 0 || slots_filled >= invalid_sig_count); 1827bf215546Sopenharmony_ci if (invalid_sig_count > 0) 1828bf215546Sopenharmony_ci slots_filled -= invalid_sig_count; 1829bf215546Sopenharmony_ci 1830bf215546Sopenharmony_ci bool needs_free = false; 1831bf215546Sopenharmony_ci if (merge_inst) { 1832bf215546Sopenharmony_ci merge_inst->qpu.sig.thrsw = true; 1833bf215546Sopenharmony_ci needs_free = true; 1834bf215546Sopenharmony_ci scoreboard->last_thrsw_tick = scoreboard->tick - slots_filled; 1835bf215546Sopenharmony_ci } else { 1836bf215546Sopenharmony_ci scoreboard->last_thrsw_tick = scoreboard->tick; 1837bf215546Sopenharmony_ci insert_scheduled_instruction(c, block, scoreboard, inst); 1838bf215546Sopenharmony_ci time++; 1839bf215546Sopenharmony_ci slots_filled++; 1840bf215546Sopenharmony_ci merge_inst = inst; 1841bf215546Sopenharmony_ci } 1842bf215546Sopenharmony_ci 1843bf215546Sopenharmony_ci scoreboard->first_thrsw_emitted = true; 1844bf215546Sopenharmony_ci 1845bf215546Sopenharmony_ci /* If we're emitting the last THRSW (other than program end), then 1846bf215546Sopenharmony_ci * signal that to the HW by emitting two THRSWs in a row. 
1847bf215546Sopenharmony_ci */ 1848bf215546Sopenharmony_ci if (inst->is_last_thrsw) { 1849bf215546Sopenharmony_ci if (slots_filled <= 1) { 1850bf215546Sopenharmony_ci emit_nop(c, block, scoreboard); 1851bf215546Sopenharmony_ci time++; 1852bf215546Sopenharmony_ci } 1853bf215546Sopenharmony_ci struct qinst *second_inst = 1854bf215546Sopenharmony_ci (struct qinst *)merge_inst->link.next; 1855bf215546Sopenharmony_ci second_inst->qpu.sig.thrsw = true; 1856bf215546Sopenharmony_ci scoreboard->last_thrsw_emitted = true; 1857bf215546Sopenharmony_ci } 1858bf215546Sopenharmony_ci 1859bf215546Sopenharmony_ci /* Make sure the thread end executes within the program lifespan */ 1860bf215546Sopenharmony_ci if (is_thrend) { 1861bf215546Sopenharmony_ci for (int i = 0; i < 3 - slots_filled; i++) { 1862bf215546Sopenharmony_ci emit_nop(c, block, scoreboard); 1863bf215546Sopenharmony_ci time++; 1864bf215546Sopenharmony_ci } 1865bf215546Sopenharmony_ci } 1866bf215546Sopenharmony_ci 1867bf215546Sopenharmony_ci /* If we put our THRSW into another instruction, free up the 1868bf215546Sopenharmony_ci * instruction that didn't end up scheduled into the list. 
1869bf215546Sopenharmony_ci */ 1870bf215546Sopenharmony_ci if (needs_free) 1871bf215546Sopenharmony_ci free(inst); 1872bf215546Sopenharmony_ci 1873bf215546Sopenharmony_ci return time; 1874bf215546Sopenharmony_ci} 1875bf215546Sopenharmony_ci 1876bf215546Sopenharmony_cistatic bool 1877bf215546Sopenharmony_ciqpu_inst_valid_in_branch_delay_slot(struct v3d_compile *c, struct qinst *inst) 1878bf215546Sopenharmony_ci{ 1879bf215546Sopenharmony_ci if (inst->qpu.type == V3D_QPU_INSTR_TYPE_BRANCH) 1880bf215546Sopenharmony_ci return false; 1881bf215546Sopenharmony_ci 1882bf215546Sopenharmony_ci if (inst->qpu.sig.thrsw) 1883bf215546Sopenharmony_ci return false; 1884bf215546Sopenharmony_ci 1885bf215546Sopenharmony_ci if (v3d_qpu_writes_unifa(c->devinfo, &inst->qpu)) 1886bf215546Sopenharmony_ci return false; 1887bf215546Sopenharmony_ci 1888bf215546Sopenharmony_ci if (vir_has_uniform(inst)) 1889bf215546Sopenharmony_ci return false; 1890bf215546Sopenharmony_ci 1891bf215546Sopenharmony_ci return true; 1892bf215546Sopenharmony_ci} 1893bf215546Sopenharmony_ci 1894bf215546Sopenharmony_cistatic void 1895bf215546Sopenharmony_ciemit_branch(struct v3d_compile *c, 1896bf215546Sopenharmony_ci struct qblock *block, 1897bf215546Sopenharmony_ci struct choose_scoreboard *scoreboard, 1898bf215546Sopenharmony_ci struct qinst *inst) 1899bf215546Sopenharmony_ci{ 1900bf215546Sopenharmony_ci assert(inst->qpu.type == V3D_QPU_INSTR_TYPE_BRANCH); 1901bf215546Sopenharmony_ci 1902bf215546Sopenharmony_ci /* We should've not picked up a branch for the delay slots of a previous 1903bf215546Sopenharmony_ci * thrsw, branch or unifa write instruction. 
1904bf215546Sopenharmony_ci */ 1905bf215546Sopenharmony_ci int branch_tick = scoreboard->tick; 1906bf215546Sopenharmony_ci assert(scoreboard->last_thrsw_tick + 2 < branch_tick); 1907bf215546Sopenharmony_ci assert(scoreboard->last_branch_tick + 3 < branch_tick); 1908bf215546Sopenharmony_ci assert(scoreboard->last_unifa_write_tick + 3 < branch_tick); 1909bf215546Sopenharmony_ci 1910bf215546Sopenharmony_ci /* Can't place a branch with msfign != 0 and cond != 0,2,3 after 1911bf215546Sopenharmony_ci * setmsf. 1912bf215546Sopenharmony_ci */ 1913bf215546Sopenharmony_ci bool is_safe_msf_branch = 1914bf215546Sopenharmony_ci inst->qpu.branch.msfign == V3D_QPU_MSFIGN_NONE || 1915bf215546Sopenharmony_ci inst->qpu.branch.cond == V3D_QPU_BRANCH_COND_ALWAYS || 1916bf215546Sopenharmony_ci inst->qpu.branch.cond == V3D_QPU_BRANCH_COND_A0 || 1917bf215546Sopenharmony_ci inst->qpu.branch.cond == V3D_QPU_BRANCH_COND_NA0; 1918bf215546Sopenharmony_ci assert(scoreboard->last_setmsf_tick != branch_tick - 1 || 1919bf215546Sopenharmony_ci is_safe_msf_branch); 1920bf215546Sopenharmony_ci 1921bf215546Sopenharmony_ci /* Insert the branch instruction */ 1922bf215546Sopenharmony_ci insert_scheduled_instruction(c, block, scoreboard, inst); 1923bf215546Sopenharmony_ci 1924bf215546Sopenharmony_ci /* Now see if we can move the branch instruction back into the 1925bf215546Sopenharmony_ci * instruction stream to fill its delay slots 1926bf215546Sopenharmony_ci */ 1927bf215546Sopenharmony_ci int slots_filled = 0; 1928bf215546Sopenharmony_ci while (slots_filled < 3 && block->instructions.next != &inst->link) { 1929bf215546Sopenharmony_ci struct qinst *prev_inst = (struct qinst *) inst->link.prev; 1930bf215546Sopenharmony_ci assert(prev_inst->qpu.type != V3D_QPU_INSTR_TYPE_BRANCH); 1931bf215546Sopenharmony_ci 1932bf215546Sopenharmony_ci /* Can't move the branch instruction if that would place it 1933bf215546Sopenharmony_ci * in the delay slots of other instructions. 
1934bf215546Sopenharmony_ci */ 1935bf215546Sopenharmony_ci if (scoreboard->last_branch_tick + 3 >= 1936bf215546Sopenharmony_ci branch_tick - slots_filled - 1) { 1937bf215546Sopenharmony_ci break; 1938bf215546Sopenharmony_ci } 1939bf215546Sopenharmony_ci 1940bf215546Sopenharmony_ci if (scoreboard->last_thrsw_tick + 2 >= 1941bf215546Sopenharmony_ci branch_tick - slots_filled - 1) { 1942bf215546Sopenharmony_ci break; 1943bf215546Sopenharmony_ci } 1944bf215546Sopenharmony_ci 1945bf215546Sopenharmony_ci if (scoreboard->last_unifa_write_tick + 3 >= 1946bf215546Sopenharmony_ci branch_tick - slots_filled - 1) { 1947bf215546Sopenharmony_ci break; 1948bf215546Sopenharmony_ci } 1949bf215546Sopenharmony_ci 1950bf215546Sopenharmony_ci /* Do not move up a branch if it can disrupt an ldvary sequence 1951bf215546Sopenharmony_ci * as that can cause stomping of the r5 register. 1952bf215546Sopenharmony_ci */ 1953bf215546Sopenharmony_ci if (scoreboard->last_ldvary_tick + 2 >= 1954bf215546Sopenharmony_ci branch_tick - slots_filled) { 1955bf215546Sopenharmony_ci break; 1956bf215546Sopenharmony_ci } 1957bf215546Sopenharmony_ci 1958bf215546Sopenharmony_ci /* Can't move a conditional branch before the instruction 1959bf215546Sopenharmony_ci * that writes the flags for its condition. 
1960bf215546Sopenharmony_ci */ 1961bf215546Sopenharmony_ci if (v3d_qpu_writes_flags(&prev_inst->qpu) && 1962bf215546Sopenharmony_ci inst->qpu.branch.cond != V3D_QPU_BRANCH_COND_ALWAYS) { 1963bf215546Sopenharmony_ci break; 1964bf215546Sopenharmony_ci } 1965bf215546Sopenharmony_ci 1966bf215546Sopenharmony_ci if (!qpu_inst_valid_in_branch_delay_slot(c, prev_inst)) 1967bf215546Sopenharmony_ci break; 1968bf215546Sopenharmony_ci 1969bf215546Sopenharmony_ci if (!is_safe_msf_branch) { 1970bf215546Sopenharmony_ci struct qinst *prev_prev_inst = 1971bf215546Sopenharmony_ci (struct qinst *) prev_inst->link.prev; 1972bf215546Sopenharmony_ci if (prev_prev_inst->qpu.type == V3D_QPU_INSTR_TYPE_ALU && 1973bf215546Sopenharmony_ci prev_prev_inst->qpu.alu.add.op == V3D_QPU_A_SETMSF) { 1974bf215546Sopenharmony_ci break; 1975bf215546Sopenharmony_ci } 1976bf215546Sopenharmony_ci } 1977bf215546Sopenharmony_ci 1978bf215546Sopenharmony_ci list_del(&prev_inst->link); 1979bf215546Sopenharmony_ci list_add(&prev_inst->link, &inst->link); 1980bf215546Sopenharmony_ci slots_filled++; 1981bf215546Sopenharmony_ci } 1982bf215546Sopenharmony_ci 1983bf215546Sopenharmony_ci block->branch_qpu_ip = c->qpu_inst_count - 1 - slots_filled; 1984bf215546Sopenharmony_ci scoreboard->last_branch_tick = branch_tick - slots_filled; 1985bf215546Sopenharmony_ci 1986bf215546Sopenharmony_ci /* Fill any remaining delay slots. 1987bf215546Sopenharmony_ci * 1988bf215546Sopenharmony_ci * For unconditional branches we'll try to fill these with the 1989bf215546Sopenharmony_ci * first instructions in the successor block after scheduling 1990bf215546Sopenharmony_ci * all blocks when setting up branch targets. 
1991bf215546Sopenharmony_ci */ 1992bf215546Sopenharmony_ci for (int i = 0; i < 3 - slots_filled; i++) 1993bf215546Sopenharmony_ci emit_nop(c, block, scoreboard); 1994bf215546Sopenharmony_ci} 1995bf215546Sopenharmony_ci 1996bf215546Sopenharmony_cistatic bool 1997bf215546Sopenharmony_cialu_reads_register(struct v3d_qpu_instr *inst, 1998bf215546Sopenharmony_ci bool add, bool magic, uint32_t index) 1999bf215546Sopenharmony_ci{ 2000bf215546Sopenharmony_ci uint32_t num_src; 2001bf215546Sopenharmony_ci enum v3d_qpu_mux mux_a, mux_b; 2002bf215546Sopenharmony_ci 2003bf215546Sopenharmony_ci if (add) { 2004bf215546Sopenharmony_ci num_src = v3d_qpu_add_op_num_src(inst->alu.add.op); 2005bf215546Sopenharmony_ci mux_a = inst->alu.add.a; 2006bf215546Sopenharmony_ci mux_b = inst->alu.add.b; 2007bf215546Sopenharmony_ci } else { 2008bf215546Sopenharmony_ci num_src = v3d_qpu_mul_op_num_src(inst->alu.mul.op); 2009bf215546Sopenharmony_ci mux_a = inst->alu.mul.a; 2010bf215546Sopenharmony_ci mux_b = inst->alu.mul.b; 2011bf215546Sopenharmony_ci } 2012bf215546Sopenharmony_ci 2013bf215546Sopenharmony_ci for (int i = 0; i < num_src; i++) { 2014bf215546Sopenharmony_ci if (magic) { 2015bf215546Sopenharmony_ci if (i == 0 && mux_a == index) 2016bf215546Sopenharmony_ci return true; 2017bf215546Sopenharmony_ci if (i == 1 && mux_b == index) 2018bf215546Sopenharmony_ci return true; 2019bf215546Sopenharmony_ci } else { 2020bf215546Sopenharmony_ci if (i == 0 && mux_a == V3D_QPU_MUX_A && 2021bf215546Sopenharmony_ci inst->raddr_a == index) { 2022bf215546Sopenharmony_ci return true; 2023bf215546Sopenharmony_ci } 2024bf215546Sopenharmony_ci if (i == 0 && mux_a == V3D_QPU_MUX_B && 2025bf215546Sopenharmony_ci inst->raddr_b == index) { 2026bf215546Sopenharmony_ci return true; 2027bf215546Sopenharmony_ci } 2028bf215546Sopenharmony_ci if (i == 1 && mux_b == V3D_QPU_MUX_A && 2029bf215546Sopenharmony_ci inst->raddr_a == index) { 2030bf215546Sopenharmony_ci return true; 2031bf215546Sopenharmony_ci } 
2032bf215546Sopenharmony_ci if (i == 1 && mux_b == V3D_QPU_MUX_B && 2033bf215546Sopenharmony_ci inst->raddr_b == index) { 2034bf215546Sopenharmony_ci return true; 2035bf215546Sopenharmony_ci } 2036bf215546Sopenharmony_ci } 2037bf215546Sopenharmony_ci } 2038bf215546Sopenharmony_ci 2039bf215546Sopenharmony_ci return false; 2040bf215546Sopenharmony_ci} 2041bf215546Sopenharmony_ci 2042bf215546Sopenharmony_ci/** 2043bf215546Sopenharmony_ci * This takes and ldvary signal merged into 'inst' and tries to move it up to 2044bf215546Sopenharmony_ci * the previous instruction to get good pipelining of ldvary sequences, 2045bf215546Sopenharmony_ci * transforming this: 2046bf215546Sopenharmony_ci * 2047bf215546Sopenharmony_ci * nop ; nop ; ldvary.r4 2048bf215546Sopenharmony_ci * nop ; fmul r0, r4, rf0 ; 2049bf215546Sopenharmony_ci * fadd rf13, r0, r5 ; nop; ; ldvary.r1 <-- inst 2050bf215546Sopenharmony_ci * 2051bf215546Sopenharmony_ci * into: 2052bf215546Sopenharmony_ci * 2053bf215546Sopenharmony_ci * nop ; nop ; ldvary.r4 2054bf215546Sopenharmony_ci * nop ; fmul r0, r4, rf0 ; ldvary.r1 2055bf215546Sopenharmony_ci * fadd rf13, r0, r5 ; nop; ; <-- inst 2056bf215546Sopenharmony_ci * 2057bf215546Sopenharmony_ci * If we manage to do this successfully (we return true here), then flagging 2058bf215546Sopenharmony_ci * the ldvary as "scheduled" may promote the follow-up fmul to a DAG head that 2059bf215546Sopenharmony_ci * we will be able to pick up to merge into 'inst', leading to code like this: 2060bf215546Sopenharmony_ci * 2061bf215546Sopenharmony_ci * nop ; nop ; ldvary.r4 2062bf215546Sopenharmony_ci * nop ; fmul r0, r4, rf0 ; ldvary.r1 2063bf215546Sopenharmony_ci * fadd rf13, r0, r5 ; fmul r2, r1, rf0 ; <-- inst 2064bf215546Sopenharmony_ci */ 2065bf215546Sopenharmony_cistatic bool 2066bf215546Sopenharmony_cifixup_pipelined_ldvary(struct v3d_compile *c, 2067bf215546Sopenharmony_ci struct choose_scoreboard *scoreboard, 2068bf215546Sopenharmony_ci struct qblock *block, 
2069bf215546Sopenharmony_ci struct v3d_qpu_instr *inst) 2070bf215546Sopenharmony_ci{ 2071bf215546Sopenharmony_ci /* We only call this if we have successfuly merged an ldvary into a 2072bf215546Sopenharmony_ci * previous instruction. 2073bf215546Sopenharmony_ci */ 2074bf215546Sopenharmony_ci assert(inst->type == V3D_QPU_INSTR_TYPE_ALU); 2075bf215546Sopenharmony_ci assert(inst->sig.ldvary); 2076bf215546Sopenharmony_ci uint32_t ldvary_magic = inst->sig_magic; 2077bf215546Sopenharmony_ci uint32_t ldvary_index = inst->sig_addr; 2078bf215546Sopenharmony_ci 2079bf215546Sopenharmony_ci /* The instruction in which we merged the ldvary cannot read 2080bf215546Sopenharmony_ci * the ldvary destination, if it does, then moving the ldvary before 2081bf215546Sopenharmony_ci * it would overwrite it. 2082bf215546Sopenharmony_ci */ 2083bf215546Sopenharmony_ci if (alu_reads_register(inst, true, ldvary_magic, ldvary_index)) 2084bf215546Sopenharmony_ci return false; 2085bf215546Sopenharmony_ci if (alu_reads_register(inst, false, ldvary_magic, ldvary_index)) 2086bf215546Sopenharmony_ci return false; 2087bf215546Sopenharmony_ci 2088bf215546Sopenharmony_ci /* The implicit ldvary destination may not be written to by a signal 2089bf215546Sopenharmony_ci * in the instruction following ldvary. Since we are planning to move 2090bf215546Sopenharmony_ci * ldvary to the previous instruction, this means we need to check if 2091bf215546Sopenharmony_ci * the current instruction has any other signal that could create this 2092bf215546Sopenharmony_ci * conflict. The only other signal that can write to the implicit 2093bf215546Sopenharmony_ci * ldvary destination that is compatible with ldvary in the same 2094bf215546Sopenharmony_ci * instruction is ldunif. 
2095bf215546Sopenharmony_ci */ 2096bf215546Sopenharmony_ci if (inst->sig.ldunif) 2097bf215546Sopenharmony_ci return false; 2098bf215546Sopenharmony_ci 2099bf215546Sopenharmony_ci /* The previous instruction can't write to the same destination as the 2100bf215546Sopenharmony_ci * ldvary. 2101bf215546Sopenharmony_ci */ 2102bf215546Sopenharmony_ci struct qinst *prev = (struct qinst *) block->instructions.prev; 2103bf215546Sopenharmony_ci if (!prev || prev->qpu.type != V3D_QPU_INSTR_TYPE_ALU) 2104bf215546Sopenharmony_ci return false; 2105bf215546Sopenharmony_ci 2106bf215546Sopenharmony_ci if (prev->qpu.alu.add.op != V3D_QPU_A_NOP) { 2107bf215546Sopenharmony_ci if (prev->qpu.alu.add.magic_write == ldvary_magic && 2108bf215546Sopenharmony_ci prev->qpu.alu.add.waddr == ldvary_index) { 2109bf215546Sopenharmony_ci return false; 2110bf215546Sopenharmony_ci } 2111bf215546Sopenharmony_ci } 2112bf215546Sopenharmony_ci 2113bf215546Sopenharmony_ci if (prev->qpu.alu.mul.op != V3D_QPU_M_NOP) { 2114bf215546Sopenharmony_ci if (prev->qpu.alu.mul.magic_write == ldvary_magic && 2115bf215546Sopenharmony_ci prev->qpu.alu.mul.waddr == ldvary_index) { 2116bf215546Sopenharmony_ci return false; 2117bf215546Sopenharmony_ci } 2118bf215546Sopenharmony_ci } 2119bf215546Sopenharmony_ci 2120bf215546Sopenharmony_ci /* The previous instruction cannot have a conflicting signal */ 2121bf215546Sopenharmony_ci if (v3d_qpu_sig_writes_address(c->devinfo, &prev->qpu.sig)) 2122bf215546Sopenharmony_ci return false; 2123bf215546Sopenharmony_ci 2124bf215546Sopenharmony_ci uint32_t sig; 2125bf215546Sopenharmony_ci struct v3d_qpu_sig new_sig = prev->qpu.sig; 2126bf215546Sopenharmony_ci new_sig.ldvary = true; 2127bf215546Sopenharmony_ci if (!v3d_qpu_sig_pack(c->devinfo, &new_sig, &sig)) 2128bf215546Sopenharmony_ci return false; 2129bf215546Sopenharmony_ci 2130bf215546Sopenharmony_ci /* The previous instruction cannot use flags since ldvary uses the 2131bf215546Sopenharmony_ci * 'cond' instruction field to store 
the destination. 2132bf215546Sopenharmony_ci */ 2133bf215546Sopenharmony_ci if (v3d_qpu_writes_flags(&prev->qpu)) 2134bf215546Sopenharmony_ci return false; 2135bf215546Sopenharmony_ci if (v3d_qpu_reads_flags(&prev->qpu)) 2136bf215546Sopenharmony_ci return false; 2137bf215546Sopenharmony_ci 2138bf215546Sopenharmony_ci /* We can't put an ldvary in the delay slots of a thrsw. We should've 2139bf215546Sopenharmony_ci * prevented this when pairing up the ldvary with another instruction 2140bf215546Sopenharmony_ci * and flagging it for a fixup. 2141bf215546Sopenharmony_ci */ 2142bf215546Sopenharmony_ci assert(scoreboard->last_thrsw_tick + 2 < scoreboard->tick - 1); 2143bf215546Sopenharmony_ci 2144bf215546Sopenharmony_ci /* Move the ldvary to the previous instruction and remove it from the 2145bf215546Sopenharmony_ci * current one. 2146bf215546Sopenharmony_ci */ 2147bf215546Sopenharmony_ci prev->qpu.sig.ldvary = true; 2148bf215546Sopenharmony_ci prev->qpu.sig_magic = ldvary_magic; 2149bf215546Sopenharmony_ci prev->qpu.sig_addr = ldvary_index; 2150bf215546Sopenharmony_ci scoreboard->last_ldvary_tick = scoreboard->tick - 1; 2151bf215546Sopenharmony_ci 2152bf215546Sopenharmony_ci inst->sig.ldvary = false; 2153bf215546Sopenharmony_ci inst->sig_magic = false; 2154bf215546Sopenharmony_ci inst->sig_addr = 0; 2155bf215546Sopenharmony_ci 2156bf215546Sopenharmony_ci /* By moving ldvary to the previous instruction we make it update 2157bf215546Sopenharmony_ci * r5 in the current one, so nothing else in it should write r5. 2158bf215546Sopenharmony_ci * This should've been prevented by our depedency tracking, which 2159bf215546Sopenharmony_ci * would not allow ldvary to be paired up with an instruction that 2160bf215546Sopenharmony_ci * writes r5 (since our dependency tracking doesn't know that the 2161bf215546Sopenharmony_ci * ldvary write r5 happens in the next instruction). 
2162bf215546Sopenharmony_ci */ 2163bf215546Sopenharmony_ci assert(!v3d_qpu_writes_r5(c->devinfo, inst)); 2164bf215546Sopenharmony_ci 2165bf215546Sopenharmony_ci return true; 2166bf215546Sopenharmony_ci} 2167bf215546Sopenharmony_ci 2168bf215546Sopenharmony_cistatic uint32_t 2169bf215546Sopenharmony_cischedule_instructions(struct v3d_compile *c, 2170bf215546Sopenharmony_ci struct choose_scoreboard *scoreboard, 2171bf215546Sopenharmony_ci struct qblock *block, 2172bf215546Sopenharmony_ci enum quniform_contents *orig_uniform_contents, 2173bf215546Sopenharmony_ci uint32_t *orig_uniform_data, 2174bf215546Sopenharmony_ci uint32_t *next_uniform) 2175bf215546Sopenharmony_ci{ 2176bf215546Sopenharmony_ci const struct v3d_device_info *devinfo = c->devinfo; 2177bf215546Sopenharmony_ci uint32_t time = 0; 2178bf215546Sopenharmony_ci 2179bf215546Sopenharmony_ci while (!list_is_empty(&scoreboard->dag->heads)) { 2180bf215546Sopenharmony_ci struct schedule_node *chosen = 2181bf215546Sopenharmony_ci choose_instruction_to_schedule(c, scoreboard, NULL); 2182bf215546Sopenharmony_ci struct schedule_node *merge = NULL; 2183bf215546Sopenharmony_ci 2184bf215546Sopenharmony_ci /* If there are no valid instructions to schedule, drop a NOP 2185bf215546Sopenharmony_ci * in. 2186bf215546Sopenharmony_ci */ 2187bf215546Sopenharmony_ci struct qinst *qinst = chosen ? 
chosen->inst : vir_nop(); 2188bf215546Sopenharmony_ci struct v3d_qpu_instr *inst = &qinst->qpu; 2189bf215546Sopenharmony_ci 2190bf215546Sopenharmony_ci if (debug) { 2191bf215546Sopenharmony_ci fprintf(stderr, "t=%4d: current list:\n", 2192bf215546Sopenharmony_ci time); 2193bf215546Sopenharmony_ci dump_state(devinfo, scoreboard->dag); 2194bf215546Sopenharmony_ci fprintf(stderr, "t=%4d: chose: ", time); 2195bf215546Sopenharmony_ci v3d_qpu_dump(devinfo, inst); 2196bf215546Sopenharmony_ci fprintf(stderr, "\n"); 2197bf215546Sopenharmony_ci } 2198bf215546Sopenharmony_ci 2199bf215546Sopenharmony_ci /* We can't mark_instruction_scheduled() the chosen inst until 2200bf215546Sopenharmony_ci * we're done identifying instructions to merge, so put the 2201bf215546Sopenharmony_ci * merged instructions on a list for a moment. 2202bf215546Sopenharmony_ci */ 2203bf215546Sopenharmony_ci struct list_head merged_list; 2204bf215546Sopenharmony_ci list_inithead(&merged_list); 2205bf215546Sopenharmony_ci 2206bf215546Sopenharmony_ci /* Schedule this instruction onto the QPU list. Also try to 2207bf215546Sopenharmony_ci * find an instruction to pair with it. 
2208bf215546Sopenharmony_ci */ 2209bf215546Sopenharmony_ci if (chosen) { 2210bf215546Sopenharmony_ci time = MAX2(chosen->unblocked_time, time); 2211bf215546Sopenharmony_ci pre_remove_head(scoreboard->dag, chosen); 2212bf215546Sopenharmony_ci 2213bf215546Sopenharmony_ci while ((merge = 2214bf215546Sopenharmony_ci choose_instruction_to_schedule(c, scoreboard, 2215bf215546Sopenharmony_ci chosen))) { 2216bf215546Sopenharmony_ci time = MAX2(merge->unblocked_time, time); 2217bf215546Sopenharmony_ci pre_remove_head(scoreboard->dag, merge); 2218bf215546Sopenharmony_ci list_addtail(&merge->link, &merged_list); 2219bf215546Sopenharmony_ci (void)qpu_merge_inst(devinfo, inst, 2220bf215546Sopenharmony_ci inst, &merge->inst->qpu); 2221bf215546Sopenharmony_ci if (merge->inst->uniform != -1) { 2222bf215546Sopenharmony_ci chosen->inst->uniform = 2223bf215546Sopenharmony_ci merge->inst->uniform; 2224bf215546Sopenharmony_ci } 2225bf215546Sopenharmony_ci 2226bf215546Sopenharmony_ci if (debug) { 2227bf215546Sopenharmony_ci fprintf(stderr, "t=%4d: merging: ", 2228bf215546Sopenharmony_ci time); 2229bf215546Sopenharmony_ci v3d_qpu_dump(devinfo, &merge->inst->qpu); 2230bf215546Sopenharmony_ci fprintf(stderr, "\n"); 2231bf215546Sopenharmony_ci fprintf(stderr, " result: "); 2232bf215546Sopenharmony_ci v3d_qpu_dump(devinfo, inst); 2233bf215546Sopenharmony_ci fprintf(stderr, "\n"); 2234bf215546Sopenharmony_ci } 2235bf215546Sopenharmony_ci 2236bf215546Sopenharmony_ci if (scoreboard->fixup_ldvary) { 2237bf215546Sopenharmony_ci scoreboard->fixup_ldvary = false; 2238bf215546Sopenharmony_ci if (fixup_pipelined_ldvary(c, scoreboard, block, inst)) { 2239bf215546Sopenharmony_ci /* Flag the ldvary as scheduled 2240bf215546Sopenharmony_ci * now so we can try to merge the 2241bf215546Sopenharmony_ci * follow-up instruction in the 2242bf215546Sopenharmony_ci * the ldvary sequence into the 2243bf215546Sopenharmony_ci * current instruction. 
2244bf215546Sopenharmony_ci */ 2245bf215546Sopenharmony_ci mark_instruction_scheduled( 2246bf215546Sopenharmony_ci devinfo, scoreboard->dag, 2247bf215546Sopenharmony_ci time, merge); 2248bf215546Sopenharmony_ci } 2249bf215546Sopenharmony_ci } 2250bf215546Sopenharmony_ci } 2251bf215546Sopenharmony_ci if (mux_read_stalls(scoreboard, inst)) 2252bf215546Sopenharmony_ci c->qpu_inst_stalled_count++; 2253bf215546Sopenharmony_ci } 2254bf215546Sopenharmony_ci 2255bf215546Sopenharmony_ci /* Update the uniform index for the rewritten location -- 2256bf215546Sopenharmony_ci * branch target updating will still need to change 2257bf215546Sopenharmony_ci * c->uniform_data[] using this index. 2258bf215546Sopenharmony_ci */ 2259bf215546Sopenharmony_ci if (qinst->uniform != -1) { 2260bf215546Sopenharmony_ci if (inst->type == V3D_QPU_INSTR_TYPE_BRANCH) 2261bf215546Sopenharmony_ci block->branch_uniform = *next_uniform; 2262bf215546Sopenharmony_ci 2263bf215546Sopenharmony_ci c->uniform_data[*next_uniform] = 2264bf215546Sopenharmony_ci orig_uniform_data[qinst->uniform]; 2265bf215546Sopenharmony_ci c->uniform_contents[*next_uniform] = 2266bf215546Sopenharmony_ci orig_uniform_contents[qinst->uniform]; 2267bf215546Sopenharmony_ci qinst->uniform = *next_uniform; 2268bf215546Sopenharmony_ci (*next_uniform)++; 2269bf215546Sopenharmony_ci } 2270bf215546Sopenharmony_ci 2271bf215546Sopenharmony_ci if (debug) { 2272bf215546Sopenharmony_ci fprintf(stderr, "\n"); 2273bf215546Sopenharmony_ci } 2274bf215546Sopenharmony_ci 2275bf215546Sopenharmony_ci /* Now that we've scheduled a new instruction, some of its 2276bf215546Sopenharmony_ci * children can be promoted to the list of instructions ready to 2277bf215546Sopenharmony_ci * be scheduled. Update the children's unblocked time for this 2278bf215546Sopenharmony_ci * DAG edge as we do so. 
2279bf215546Sopenharmony_ci */ 2280bf215546Sopenharmony_ci mark_instruction_scheduled(devinfo, scoreboard->dag, time, chosen); 2281bf215546Sopenharmony_ci list_for_each_entry(struct schedule_node, merge, &merged_list, 2282bf215546Sopenharmony_ci link) { 2283bf215546Sopenharmony_ci mark_instruction_scheduled(devinfo, scoreboard->dag, time, merge); 2284bf215546Sopenharmony_ci 2285bf215546Sopenharmony_ci /* The merged VIR instruction doesn't get re-added to the 2286bf215546Sopenharmony_ci * block, so free it now. 2287bf215546Sopenharmony_ci */ 2288bf215546Sopenharmony_ci free(merge->inst); 2289bf215546Sopenharmony_ci } 2290bf215546Sopenharmony_ci 2291bf215546Sopenharmony_ci if (inst->sig.thrsw) { 2292bf215546Sopenharmony_ci time += emit_thrsw(c, block, scoreboard, qinst, false); 2293bf215546Sopenharmony_ci } else if (inst->type == V3D_QPU_INSTR_TYPE_BRANCH) { 2294bf215546Sopenharmony_ci emit_branch(c, block, scoreboard, qinst); 2295bf215546Sopenharmony_ci } else { 2296bf215546Sopenharmony_ci insert_scheduled_instruction(c, block, 2297bf215546Sopenharmony_ci scoreboard, qinst); 2298bf215546Sopenharmony_ci } 2299bf215546Sopenharmony_ci } 2300bf215546Sopenharmony_ci 2301bf215546Sopenharmony_ci return time; 2302bf215546Sopenharmony_ci} 2303bf215546Sopenharmony_ci 2304bf215546Sopenharmony_cistatic uint32_t 2305bf215546Sopenharmony_ciqpu_schedule_instructions_block(struct v3d_compile *c, 2306bf215546Sopenharmony_ci struct choose_scoreboard *scoreboard, 2307bf215546Sopenharmony_ci struct qblock *block, 2308bf215546Sopenharmony_ci enum quniform_contents *orig_uniform_contents, 2309bf215546Sopenharmony_ci uint32_t *orig_uniform_data, 2310bf215546Sopenharmony_ci uint32_t *next_uniform) 2311bf215546Sopenharmony_ci{ 2312bf215546Sopenharmony_ci void *mem_ctx = ralloc_context(NULL); 2313bf215546Sopenharmony_ci scoreboard->dag = dag_create(mem_ctx); 2314bf215546Sopenharmony_ci struct list_head setup_list; 2315bf215546Sopenharmony_ci 2316bf215546Sopenharmony_ci 
list_inithead(&setup_list); 2317bf215546Sopenharmony_ci 2318bf215546Sopenharmony_ci /* Wrap each instruction in a scheduler structure. */ 2319bf215546Sopenharmony_ci while (!list_is_empty(&block->instructions)) { 2320bf215546Sopenharmony_ci struct qinst *qinst = (struct qinst *)block->instructions.next; 2321bf215546Sopenharmony_ci struct schedule_node *n = 2322bf215546Sopenharmony_ci rzalloc(mem_ctx, struct schedule_node); 2323bf215546Sopenharmony_ci 2324bf215546Sopenharmony_ci dag_init_node(scoreboard->dag, &n->dag); 2325bf215546Sopenharmony_ci n->inst = qinst; 2326bf215546Sopenharmony_ci 2327bf215546Sopenharmony_ci list_del(&qinst->link); 2328bf215546Sopenharmony_ci list_addtail(&n->link, &setup_list); 2329bf215546Sopenharmony_ci } 2330bf215546Sopenharmony_ci 2331bf215546Sopenharmony_ci calculate_forward_deps(c, scoreboard->dag, &setup_list); 2332bf215546Sopenharmony_ci calculate_reverse_deps(c, scoreboard->dag, &setup_list); 2333bf215546Sopenharmony_ci 2334bf215546Sopenharmony_ci dag_traverse_bottom_up(scoreboard->dag, compute_delay, c); 2335bf215546Sopenharmony_ci 2336bf215546Sopenharmony_ci uint32_t cycles = schedule_instructions(c, scoreboard, block, 2337bf215546Sopenharmony_ci orig_uniform_contents, 2338bf215546Sopenharmony_ci orig_uniform_data, 2339bf215546Sopenharmony_ci next_uniform); 2340bf215546Sopenharmony_ci 2341bf215546Sopenharmony_ci ralloc_free(mem_ctx); 2342bf215546Sopenharmony_ci scoreboard->dag = NULL; 2343bf215546Sopenharmony_ci 2344bf215546Sopenharmony_ci return cycles; 2345bf215546Sopenharmony_ci} 2346bf215546Sopenharmony_ci 2347bf215546Sopenharmony_cistatic void 2348bf215546Sopenharmony_ciqpu_set_branch_targets(struct v3d_compile *c) 2349bf215546Sopenharmony_ci{ 2350bf215546Sopenharmony_ci vir_for_each_block(block, c) { 2351bf215546Sopenharmony_ci /* The end block of the program has no branch. 
*/ 2352bf215546Sopenharmony_ci if (!block->successors[0]) 2353bf215546Sopenharmony_ci continue; 2354bf215546Sopenharmony_ci 2355bf215546Sopenharmony_ci /* If there was no branch instruction, then the successor 2356bf215546Sopenharmony_ci * block must follow immediately after this one. 2357bf215546Sopenharmony_ci */ 2358bf215546Sopenharmony_ci if (block->branch_qpu_ip == ~0) { 2359bf215546Sopenharmony_ci assert(block->end_qpu_ip + 1 == 2360bf215546Sopenharmony_ci block->successors[0]->start_qpu_ip); 2361bf215546Sopenharmony_ci continue; 2362bf215546Sopenharmony_ci } 2363bf215546Sopenharmony_ci 2364bf215546Sopenharmony_ci /* Walk back through the delay slots to find the branch 2365bf215546Sopenharmony_ci * instr. 2366bf215546Sopenharmony_ci */ 2367bf215546Sopenharmony_ci struct qinst *branch = NULL; 2368bf215546Sopenharmony_ci struct list_head *entry = block->instructions.prev; 2369bf215546Sopenharmony_ci int32_t delay_slot_count = -1; 2370bf215546Sopenharmony_ci struct qinst *delay_slots_start = NULL; 2371bf215546Sopenharmony_ci for (int i = 0; i < 3; i++) { 2372bf215546Sopenharmony_ci entry = entry->prev; 2373bf215546Sopenharmony_ci struct qinst *inst = 2374bf215546Sopenharmony_ci container_of(entry, struct qinst, link); 2375bf215546Sopenharmony_ci 2376bf215546Sopenharmony_ci if (delay_slot_count == -1) { 2377bf215546Sopenharmony_ci if (!v3d_qpu_is_nop(&inst->qpu)) 2378bf215546Sopenharmony_ci delay_slot_count = i; 2379bf215546Sopenharmony_ci else 2380bf215546Sopenharmony_ci delay_slots_start = inst; 2381bf215546Sopenharmony_ci } 2382bf215546Sopenharmony_ci 2383bf215546Sopenharmony_ci if (inst->qpu.type == V3D_QPU_INSTR_TYPE_BRANCH) { 2384bf215546Sopenharmony_ci branch = inst; 2385bf215546Sopenharmony_ci break; 2386bf215546Sopenharmony_ci } 2387bf215546Sopenharmony_ci } 2388bf215546Sopenharmony_ci assert(branch && branch->qpu.type == V3D_QPU_INSTR_TYPE_BRANCH); 2389bf215546Sopenharmony_ci assert(delay_slot_count >= 0 && delay_slot_count <= 3); 
2390bf215546Sopenharmony_ci assert(delay_slot_count == 0 || delay_slots_start != NULL); 2391bf215546Sopenharmony_ci 2392bf215546Sopenharmony_ci /* Make sure that the if-we-don't-jump 2393bf215546Sopenharmony_ci * successor was scheduled just after the 2394bf215546Sopenharmony_ci * delay slots. 2395bf215546Sopenharmony_ci */ 2396bf215546Sopenharmony_ci assert(!block->successors[1] || 2397bf215546Sopenharmony_ci block->successors[1]->start_qpu_ip == 2398bf215546Sopenharmony_ci block->branch_qpu_ip + 4); 2399bf215546Sopenharmony_ci 2400bf215546Sopenharmony_ci branch->qpu.branch.offset = 2401bf215546Sopenharmony_ci ((block->successors[0]->start_qpu_ip - 2402bf215546Sopenharmony_ci (block->branch_qpu_ip + 4)) * 2403bf215546Sopenharmony_ci sizeof(uint64_t)); 2404bf215546Sopenharmony_ci 2405bf215546Sopenharmony_ci /* Set up the relative offset to jump in the 2406bf215546Sopenharmony_ci * uniform stream. 2407bf215546Sopenharmony_ci * 2408bf215546Sopenharmony_ci * Use a temporary here, because 2409bf215546Sopenharmony_ci * uniform_data[inst->uniform] may be shared 2410bf215546Sopenharmony_ci * between multiple instructions. 2411bf215546Sopenharmony_ci */ 2412bf215546Sopenharmony_ci assert(c->uniform_contents[branch->uniform] == QUNIFORM_CONSTANT); 2413bf215546Sopenharmony_ci c->uniform_data[branch->uniform] = 2414bf215546Sopenharmony_ci (block->successors[0]->start_uniform - 2415bf215546Sopenharmony_ci (block->branch_uniform + 1)) * 4; 2416bf215546Sopenharmony_ci 2417bf215546Sopenharmony_ci /* If this is an unconditional branch, try to fill any remaining 2418bf215546Sopenharmony_ci * delay slots with the initial instructions of the successor 2419bf215546Sopenharmony_ci * block. 2420bf215546Sopenharmony_ci * 2421bf215546Sopenharmony_ci * FIXME: we can do the same for conditional branches if we 2422bf215546Sopenharmony_ci * predicate the instructions to match the branch condition. 
2423bf215546Sopenharmony_ci */ 2424bf215546Sopenharmony_ci if (branch->qpu.branch.cond == V3D_QPU_BRANCH_COND_ALWAYS) { 2425bf215546Sopenharmony_ci struct list_head *successor_insts = 2426bf215546Sopenharmony_ci &block->successors[0]->instructions; 2427bf215546Sopenharmony_ci delay_slot_count = MIN2(delay_slot_count, 2428bf215546Sopenharmony_ci list_length(successor_insts)); 2429bf215546Sopenharmony_ci struct qinst *s_inst = 2430bf215546Sopenharmony_ci (struct qinst *) successor_insts->next; 2431bf215546Sopenharmony_ci struct qinst *slot = delay_slots_start; 2432bf215546Sopenharmony_ci int slots_filled = 0; 2433bf215546Sopenharmony_ci while (slots_filled < delay_slot_count && 2434bf215546Sopenharmony_ci qpu_inst_valid_in_branch_delay_slot(c, s_inst)) { 2435bf215546Sopenharmony_ci memcpy(&slot->qpu, &s_inst->qpu, 2436bf215546Sopenharmony_ci sizeof(slot->qpu)); 2437bf215546Sopenharmony_ci s_inst = (struct qinst *) s_inst->link.next; 2438bf215546Sopenharmony_ci slot = (struct qinst *) slot->link.next; 2439bf215546Sopenharmony_ci slots_filled++; 2440bf215546Sopenharmony_ci } 2441bf215546Sopenharmony_ci branch->qpu.branch.offset += 2442bf215546Sopenharmony_ci slots_filled * sizeof(uint64_t); 2443bf215546Sopenharmony_ci } 2444bf215546Sopenharmony_ci } 2445bf215546Sopenharmony_ci}

/**
 * Runs the QPU scheduler over every block of the program.
 *
 * The compile's uniform arrays are swapped for newly allocated ones of
 * c->num_uniforms entries; the previous arrays are handed to
 * qpu_schedule_instructions_block() together with a running uniform index.
 * For each block the starting instruction index, starting uniform index and
 * final instruction index are recorded, and branch_qpu_ip is reset to ~0
 * before the block is scheduled.
 *
 * Once all blocks are done, a THRSW NOP is emitted on the last block in
 * c->blocks, qpu_set_branch_targets() is called, and the function asserts
 * that the running uniform index reached c->num_uniforms.
 *
 * \param c  Compile whose blocks are scheduled.
 * \return   Sum of the values returned by qpu_schedule_instructions_block()
 *           for all blocks (accumulated as a cycle count).
 */
uint32_t
v3d_qpu_schedule_instructions(struct v3d_compile *c)
{
        struct qblock *last_block = list_last_entry(&c->blocks,
                                                    struct qblock, link);

        /* Scheduling reorders the uniforms, so stash the current arrays
         * and install new ones of the same length in their place.
         */
        enum quniform_contents *old_contents = c->uniform_contents;
        uint32_t *old_data = c->uniform_data;
        c->uniform_contents = ralloc_array(c, enum quniform_contents,
                                           c->num_uniforms);
        c->uniform_data = ralloc_array(c, uint32_t, c->num_uniforms);
        c->uniform_array_size = c->num_uniforms;
        uint32_t uniform_cursor = 0;

        /* NOTE(review): -10 presumably puts every "last event" tick far
         * enough in the past that no hazard applies to the first
         * instructions -- confirm against the scoreboard checks.
         */
        const int initial_tick = -10;
        struct choose_scoreboard scoreboard;
        memset(&scoreboard, 0, sizeof(scoreboard));
        scoreboard.last_ldvary_tick = initial_tick;
        scoreboard.last_unifa_write_tick = initial_tick;
        scoreboard.last_magic_sfu_write_tick = initial_tick;
        scoreboard.last_uniforms_reset_tick = initial_tick;
        scoreboard.last_thrsw_tick = initial_tick;
        scoreboard.last_branch_tick = initial_tick;
        scoreboard.last_setmsf_tick = initial_tick;
        scoreboard.last_stallable_sfu_tick = initial_tick;

        /* Optional dump of the program as it looks before scheduling. */
        if (debug) {
                fprintf(stderr, "Pre-schedule instructions\n");
                vir_for_each_block(blk, c) {
                        fprintf(stderr, "BLOCK %d\n", blk->index);
                        list_for_each_entry(struct qinst, inst,
                                            &blk->instructions, link) {
                                v3d_qpu_dump(c->devinfo, &inst->qpu);
                                fprintf(stderr, "\n");
                        }
                }
                fprintf(stderr, "\n");
        }

        uint32_t total_cycles = 0;
        vir_for_each_block(blk, c) {
                /* Record where this block begins in both the instruction
                 * and the uniform streams; no branch has been seen yet.
                 */
                blk->start_uniform = uniform_cursor;
                blk->start_qpu_ip = c->qpu_inst_count;
                blk->branch_qpu_ip = ~0;

                total_cycles += qpu_schedule_instructions_block(c,
                                                                &scoreboard,
                                                                blk,
                                                                old_contents,
                                                                old_data,
                                                                &uniform_cursor);

                blk->end_qpu_ip = c->qpu_inst_count - 1;
        }

        /* Emit the program-end THRSW instruction. */
        struct qinst *end_thrsw = vir_nop();
        end_thrsw->qpu.sig.thrsw = true;
        emit_thrsw(c, last_block, &scoreboard, end_thrsw, true);

        qpu_set_branch_targets(c);

        /* Every uniform must have been consumed by the per-block passes. */
        assert(uniform_cursor == c->num_uniforms);

        return total_cycles;
}