1bf215546Sopenharmony_ci/* 2bf215546Sopenharmony_ci * Copyright © 2018 Valve Corporation 3bf215546Sopenharmony_ci * 4bf215546Sopenharmony_ci * Permission is hereby granted, free of charge, to any person obtaining a 5bf215546Sopenharmony_ci * copy of this software and associated documentation files (the "Software"), 6bf215546Sopenharmony_ci * to deal in the Software without restriction, including without limitation 7bf215546Sopenharmony_ci * the rights to use, copy, modify, merge, publish, distribute, sublicense, 8bf215546Sopenharmony_ci * and/or sell copies of the Software, and to permit persons to whom the 9bf215546Sopenharmony_ci * Software is furnished to do so, subject to the following conditions: 10bf215546Sopenharmony_ci * 11bf215546Sopenharmony_ci * The above copyright notice and this permission notice (including the next 12bf215546Sopenharmony_ci * paragraph) shall be included in all copies or substantial portions of the 13bf215546Sopenharmony_ci * Software. 14bf215546Sopenharmony_ci * 15bf215546Sopenharmony_ci * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16bf215546Sopenharmony_ci * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17bf215546Sopenharmony_ci * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 18bf215546Sopenharmony_ci * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19bf215546Sopenharmony_ci * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 20bf215546Sopenharmony_ci * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS 21bf215546Sopenharmony_ci * IN THE SOFTWARE. 22bf215546Sopenharmony_ci * 23bf215546Sopenharmony_ci */ 24bf215546Sopenharmony_ci 25bf215546Sopenharmony_ci#include "aco_builder.h" 26bf215546Sopenharmony_ci#include "aco_ir.h" 27bf215546Sopenharmony_ci 28bf215546Sopenharmony_ci#include "common/amdgfxregs.h" 29bf215546Sopenharmony_ci 30bf215546Sopenharmony_ci#include <algorithm> 31bf215546Sopenharmony_ci#include <unordered_set> 32bf215546Sopenharmony_ci#include <vector> 33bf215546Sopenharmony_ci 34bf215546Sopenharmony_ci#define SMEM_WINDOW_SIZE (350 - ctx.num_waves * 35) 35bf215546Sopenharmony_ci#define VMEM_WINDOW_SIZE (1024 - ctx.num_waves * 64) 36bf215546Sopenharmony_ci#define POS_EXP_WINDOW_SIZE 512 37bf215546Sopenharmony_ci#define SMEM_MAX_MOVES (64 - ctx.num_waves * 4) 38bf215546Sopenharmony_ci#define VMEM_MAX_MOVES (256 - ctx.num_waves * 16) 39bf215546Sopenharmony_ci/* creating clauses decreases def-use distances, so make it less aggressive the lower num_waves is */ 40bf215546Sopenharmony_ci#define VMEM_CLAUSE_MAX_GRAB_DIST (ctx.num_waves * 2) 41bf215546Sopenharmony_ci#define POS_EXP_MAX_MOVES 512 42bf215546Sopenharmony_ci 43bf215546Sopenharmony_cinamespace aco { 44bf215546Sopenharmony_ci 45bf215546Sopenharmony_cienum MoveResult { 46bf215546Sopenharmony_ci move_success, 47bf215546Sopenharmony_ci move_fail_ssa, 48bf215546Sopenharmony_ci move_fail_rar, 49bf215546Sopenharmony_ci move_fail_pressure, 50bf215546Sopenharmony_ci}; 51bf215546Sopenharmony_ci 52bf215546Sopenharmony_ci/** 53bf215546Sopenharmony_ci * Cursor for downwards moves, where a single instruction is moved towards 54bf215546Sopenharmony_ci * or below a group of instruction that hardware can execute as a clause. 55bf215546Sopenharmony_ci */ 56bf215546Sopenharmony_cistruct DownwardsCursor { 57bf215546Sopenharmony_ci int source_idx; /* Current instruction to consider for moving */ 58bf215546Sopenharmony_ci 59bf215546Sopenharmony_ci int insert_idx_clause; /* First clause instruction */ 60bf215546Sopenharmony_ci int insert_idx; /* First instruction *after* the clause */ 61bf215546Sopenharmony_ci 62bf215546Sopenharmony_ci /* Maximum demand of all clause instructions, 63bf215546Sopenharmony_ci * i.e. from insert_idx_clause (inclusive) to insert_idx (exclusive) */ 64bf215546Sopenharmony_ci RegisterDemand clause_demand; 65bf215546Sopenharmony_ci /* Maximum demand of instructions from source_idx to insert_idx_clause (both exclusive) */ 66bf215546Sopenharmony_ci RegisterDemand total_demand; 67bf215546Sopenharmony_ci 68bf215546Sopenharmony_ci DownwardsCursor(int current_idx, RegisterDemand initial_clause_demand) 69bf215546Sopenharmony_ci : source_idx(current_idx - 1), insert_idx_clause(current_idx), insert_idx(current_idx + 1), 70bf215546Sopenharmony_ci clause_demand(initial_clause_demand) 71bf215546Sopenharmony_ci {} 72bf215546Sopenharmony_ci 73bf215546Sopenharmony_ci void verify_invariants(const RegisterDemand* register_demand); 74bf215546Sopenharmony_ci}; 75bf215546Sopenharmony_ci 76bf215546Sopenharmony_ci/** 77bf215546Sopenharmony_ci * Cursor for upwards moves, where a single instruction is moved below 78bf215546Sopenharmony_ci * another instruction. 79bf215546Sopenharmony_ci */ 80bf215546Sopenharmony_cistruct UpwardsCursor { 81bf215546Sopenharmony_ci int source_idx; /* Current instruction to consider for moving */ 82bf215546Sopenharmony_ci int insert_idx; /* Instruction to move in front of */ 83bf215546Sopenharmony_ci 84bf215546Sopenharmony_ci /* Maximum demand of instructions from insert_idx (inclusive) to source_idx (exclusive) */ 85bf215546Sopenharmony_ci RegisterDemand total_demand; 86bf215546Sopenharmony_ci 87bf215546Sopenharmony_ci UpwardsCursor(int source_idx_) : source_idx(source_idx_) 88bf215546Sopenharmony_ci { 89bf215546Sopenharmony_ci insert_idx = -1; /* to be initialized later */ 90bf215546Sopenharmony_ci } 91bf215546Sopenharmony_ci 92bf215546Sopenharmony_ci bool has_insert_idx() const { return insert_idx != -1; } 93bf215546Sopenharmony_ci void verify_invariants(const RegisterDemand* register_demand); 94bf215546Sopenharmony_ci}; 95bf215546Sopenharmony_ci 96bf215546Sopenharmony_cistruct MoveState { 97bf215546Sopenharmony_ci RegisterDemand max_registers; 98bf215546Sopenharmony_ci 99bf215546Sopenharmony_ci Block* block; 100bf215546Sopenharmony_ci Instruction* current; 101bf215546Sopenharmony_ci RegisterDemand* register_demand; /* demand per instruction */ 102bf215546Sopenharmony_ci bool improved_rar; 103bf215546Sopenharmony_ci 104bf215546Sopenharmony_ci std::vector<bool> depends_on; 105bf215546Sopenharmony_ci /* Two are needed because, for downwards VMEM scheduling, one needs to 106bf215546Sopenharmony_ci * exclude the instructions in the clause, since new instructions in the 107bf215546Sopenharmony_ci * clause are not moved past any other instructions in the clause. */ 108bf215546Sopenharmony_ci std::vector<bool> RAR_dependencies; 109bf215546Sopenharmony_ci std::vector<bool> RAR_dependencies_clause; 110bf215546Sopenharmony_ci 111bf215546Sopenharmony_ci /* for moving instructions before the current instruction to after it */ 112bf215546Sopenharmony_ci DownwardsCursor downwards_init(int current_idx, bool improved_rar, bool may_form_clauses); 113bf215546Sopenharmony_ci MoveResult downwards_move(DownwardsCursor&, bool clause); 114bf215546Sopenharmony_ci void downwards_skip(DownwardsCursor&); 115bf215546Sopenharmony_ci 116bf215546Sopenharmony_ci /* for moving instructions after the first use of the current instruction upwards */ 117bf215546Sopenharmony_ci UpwardsCursor upwards_init(int source_idx, bool improved_rar); 118bf215546Sopenharmony_ci bool upwards_check_deps(UpwardsCursor&); 119bf215546Sopenharmony_ci void upwards_update_insert_idx(UpwardsCursor&); 120bf215546Sopenharmony_ci MoveResult upwards_move(UpwardsCursor&); 121bf215546Sopenharmony_ci void upwards_skip(UpwardsCursor&); 122bf215546Sopenharmony_ci}; 123bf215546Sopenharmony_ci 124bf215546Sopenharmony_cistruct sched_ctx { 125bf215546Sopenharmony_ci int16_t num_waves; 126bf215546Sopenharmony_ci int16_t last_SMEM_stall; 127bf215546Sopenharmony_ci int last_SMEM_dep_idx; 128bf215546Sopenharmony_ci MoveState mv; 129bf215546Sopenharmony_ci bool schedule_pos_exports = true; 130bf215546Sopenharmony_ci unsigned schedule_pos_export_div = 1; 131bf215546Sopenharmony_ci}; 132bf215546Sopenharmony_ci 133bf215546Sopenharmony_ci/* This scheduler is a simple bottom-up pass based on ideas from 134bf215546Sopenharmony_ci * "A Novel Lightweight Instruction Scheduling Algorithm for Just-In-Time Compiler" 135bf215546Sopenharmony_ci * from Xiaohua Shi and Peng Guo. 136bf215546Sopenharmony_ci * The basic approach is to iterate over all instructions. When a memory instruction 137bf215546Sopenharmony_ci * is encountered it tries to move independent instructions from above and below 138bf215546Sopenharmony_ci * between the memory instruction and it's first user. 139bf215546Sopenharmony_ci * The novelty is that this scheduler cares for the current register pressure: 140bf215546Sopenharmony_ci * Instructions will only be moved if the register pressure won't exceed a certain bound. 141bf215546Sopenharmony_ci */ 142bf215546Sopenharmony_ci 143bf215546Sopenharmony_citemplate <typename T> 144bf215546Sopenharmony_civoid 145bf215546Sopenharmony_cimove_element(T begin_it, size_t idx, size_t before) 146bf215546Sopenharmony_ci{ 147bf215546Sopenharmony_ci if (idx < before) { 148bf215546Sopenharmony_ci auto begin = std::next(begin_it, idx); 149bf215546Sopenharmony_ci auto end = std::next(begin_it, before); 150bf215546Sopenharmony_ci std::rotate(begin, begin + 1, end); 151bf215546Sopenharmony_ci } else if (idx > before) { 152bf215546Sopenharmony_ci auto begin = std::next(begin_it, before); 153bf215546Sopenharmony_ci auto end = std::next(begin_it, idx + 1); 154bf215546Sopenharmony_ci std::rotate(begin, end - 1, end); 155bf215546Sopenharmony_ci } 156bf215546Sopenharmony_ci} 157bf215546Sopenharmony_ci 158bf215546Sopenharmony_civoid 159bf215546Sopenharmony_ciDownwardsCursor::verify_invariants(const RegisterDemand* register_demand) 160bf215546Sopenharmony_ci{ 161bf215546Sopenharmony_ci assert(source_idx < insert_idx_clause); 162bf215546Sopenharmony_ci assert(insert_idx_clause < insert_idx); 163bf215546Sopenharmony_ci 164bf215546Sopenharmony_ci#ifndef NDEBUG 165bf215546Sopenharmony_ci RegisterDemand reference_demand; 166bf215546Sopenharmony_ci for (int i = source_idx + 1; i < insert_idx_clause; ++i) { 167bf215546Sopenharmony_ci reference_demand.update(register_demand[i]); 168bf215546Sopenharmony_ci } 169bf215546Sopenharmony_ci assert(total_demand == reference_demand); 170bf215546Sopenharmony_ci 171bf215546Sopenharmony_ci reference_demand = {}; 172bf215546Sopenharmony_ci for (int i = insert_idx_clause; i < insert_idx; ++i) { 173bf215546Sopenharmony_ci reference_demand.update(register_demand[i]); 174bf215546Sopenharmony_ci } 175bf215546Sopenharmony_ci assert(clause_demand == reference_demand); 176bf215546Sopenharmony_ci#endif 177bf215546Sopenharmony_ci} 178bf215546Sopenharmony_ci 179bf215546Sopenharmony_ciDownwardsCursor 180bf215546Sopenharmony_ciMoveState::downwards_init(int current_idx, bool improved_rar_, bool may_form_clauses) 181bf215546Sopenharmony_ci{ 182bf215546Sopenharmony_ci improved_rar = improved_rar_; 183bf215546Sopenharmony_ci 184bf215546Sopenharmony_ci std::fill(depends_on.begin(), depends_on.end(), false); 185bf215546Sopenharmony_ci if (improved_rar) { 186bf215546Sopenharmony_ci std::fill(RAR_dependencies.begin(), RAR_dependencies.end(), false); 187bf215546Sopenharmony_ci if (may_form_clauses) 188bf215546Sopenharmony_ci std::fill(RAR_dependencies_clause.begin(), RAR_dependencies_clause.end(), false); 189bf215546Sopenharmony_ci } 190bf215546Sopenharmony_ci 191bf215546Sopenharmony_ci for (const Operand& op : current->operands) { 192bf215546Sopenharmony_ci if (op.isTemp()) { 193bf215546Sopenharmony_ci depends_on[op.tempId()] = true; 194bf215546Sopenharmony_ci if (improved_rar && op.isFirstKill()) 195bf215546Sopenharmony_ci RAR_dependencies[op.tempId()] = true; 196bf215546Sopenharmony_ci } 197bf215546Sopenharmony_ci } 198bf215546Sopenharmony_ci 199bf215546Sopenharmony_ci DownwardsCursor cursor(current_idx, register_demand[current_idx]); 200bf215546Sopenharmony_ci cursor.verify_invariants(register_demand); 201bf215546Sopenharmony_ci return cursor; 202bf215546Sopenharmony_ci} 203bf215546Sopenharmony_ci 204bf215546Sopenharmony_ci/* If add_to_clause is true, the current clause is extended by moving the 205bf215546Sopenharmony_ci * instruction at source_idx in front of the clause. Otherwise, the instruction 206bf215546Sopenharmony_ci * is moved past the end of the clause without extending it */ 207bf215546Sopenharmony_ciMoveResult 208bf215546Sopenharmony_ciMoveState::downwards_move(DownwardsCursor& cursor, bool add_to_clause) 209bf215546Sopenharmony_ci{ 210bf215546Sopenharmony_ci aco_ptr<Instruction>& instr = block->instructions[cursor.source_idx]; 211bf215546Sopenharmony_ci 212bf215546Sopenharmony_ci for (const Definition& def : instr->definitions) 213bf215546Sopenharmony_ci if (def.isTemp() && depends_on[def.tempId()]) 214bf215546Sopenharmony_ci return move_fail_ssa; 215bf215546Sopenharmony_ci 216bf215546Sopenharmony_ci /* check if one of candidate's operands is killed by depending instruction */ 217bf215546Sopenharmony_ci std::vector<bool>& RAR_deps = 218bf215546Sopenharmony_ci improved_rar ? (add_to_clause ? RAR_dependencies_clause : RAR_dependencies) : depends_on; 219bf215546Sopenharmony_ci for (const Operand& op : instr->operands) { 220bf215546Sopenharmony_ci if (op.isTemp() && RAR_deps[op.tempId()]) { 221bf215546Sopenharmony_ci // FIXME: account for difference in register pressure 222bf215546Sopenharmony_ci return move_fail_rar; 223bf215546Sopenharmony_ci } 224bf215546Sopenharmony_ci } 225bf215546Sopenharmony_ci 226bf215546Sopenharmony_ci if (add_to_clause) { 227bf215546Sopenharmony_ci for (const Operand& op : instr->operands) { 228bf215546Sopenharmony_ci if (op.isTemp()) { 229bf215546Sopenharmony_ci depends_on[op.tempId()] = true; 230bf215546Sopenharmony_ci if (op.isFirstKill()) 231bf215546Sopenharmony_ci RAR_dependencies[op.tempId()] = true; 232bf215546Sopenharmony_ci } 233bf215546Sopenharmony_ci } 234bf215546Sopenharmony_ci } 235bf215546Sopenharmony_ci 236bf215546Sopenharmony_ci const int dest_insert_idx = add_to_clause ? cursor.insert_idx_clause : cursor.insert_idx; 237bf215546Sopenharmony_ci RegisterDemand register_pressure = cursor.total_demand; 238bf215546Sopenharmony_ci if (!add_to_clause) { 239bf215546Sopenharmony_ci register_pressure.update(cursor.clause_demand); 240bf215546Sopenharmony_ci } 241bf215546Sopenharmony_ci 242bf215546Sopenharmony_ci /* Check the new demand of the instructions being moved over */ 243bf215546Sopenharmony_ci const RegisterDemand candidate_diff = get_live_changes(instr); 244bf215546Sopenharmony_ci if (RegisterDemand(register_pressure - candidate_diff).exceeds(max_registers)) 245bf215546Sopenharmony_ci return move_fail_pressure; 246bf215546Sopenharmony_ci 247bf215546Sopenharmony_ci /* New demand for the moved instruction */ 248bf215546Sopenharmony_ci const RegisterDemand temp = get_temp_registers(instr); 249bf215546Sopenharmony_ci const RegisterDemand temp2 = get_temp_registers(block->instructions[dest_insert_idx - 1]); 250bf215546Sopenharmony_ci const RegisterDemand new_demand = register_demand[dest_insert_idx - 1] - temp2 + temp; 251bf215546Sopenharmony_ci if (new_demand.exceeds(max_registers)) 252bf215546Sopenharmony_ci return move_fail_pressure; 253bf215546Sopenharmony_ci 254bf215546Sopenharmony_ci /* move the candidate below the memory load */ 255bf215546Sopenharmony_ci move_element(block->instructions.begin(), cursor.source_idx, dest_insert_idx); 256bf215546Sopenharmony_ci 257bf215546Sopenharmony_ci /* update register pressure */ 258bf215546Sopenharmony_ci move_element(register_demand, cursor.source_idx, dest_insert_idx); 259bf215546Sopenharmony_ci for (int i = cursor.source_idx; i < dest_insert_idx - 1; i++) 260bf215546Sopenharmony_ci register_demand[i] -= candidate_diff; 261bf215546Sopenharmony_ci register_demand[dest_insert_idx - 1] = new_demand; 262bf215546Sopenharmony_ci cursor.insert_idx_clause--; 263bf215546Sopenharmony_ci if (cursor.source_idx != cursor.insert_idx_clause) { 264bf215546Sopenharmony_ci /* Update demand if we moved over any instructions before the clause */ 265bf215546Sopenharmony_ci cursor.total_demand -= candidate_diff; 266bf215546Sopenharmony_ci } else { 267bf215546Sopenharmony_ci assert(cursor.total_demand == RegisterDemand{}); 268bf215546Sopenharmony_ci } 269bf215546Sopenharmony_ci if (add_to_clause) { 270bf215546Sopenharmony_ci cursor.clause_demand.update(new_demand); 271bf215546Sopenharmony_ci } else { 272bf215546Sopenharmony_ci cursor.clause_demand -= candidate_diff; 273bf215546Sopenharmony_ci cursor.insert_idx--; 274bf215546Sopenharmony_ci } 275bf215546Sopenharmony_ci 276bf215546Sopenharmony_ci cursor.source_idx--; 277bf215546Sopenharmony_ci cursor.verify_invariants(register_demand); 278bf215546Sopenharmony_ci return move_success; 279bf215546Sopenharmony_ci} 280bf215546Sopenharmony_ci 281bf215546Sopenharmony_civoid 282bf215546Sopenharmony_ciMoveState::downwards_skip(DownwardsCursor& cursor) 283bf215546Sopenharmony_ci{ 284bf215546Sopenharmony_ci aco_ptr<Instruction>& instr = block->instructions[cursor.source_idx]; 285bf215546Sopenharmony_ci 286bf215546Sopenharmony_ci for (const Operand& op : instr->operands) { 287bf215546Sopenharmony_ci if (op.isTemp()) { 288bf215546Sopenharmony_ci depends_on[op.tempId()] = true; 289bf215546Sopenharmony_ci if (improved_rar && op.isFirstKill()) { 290bf215546Sopenharmony_ci RAR_dependencies[op.tempId()] = true; 291bf215546Sopenharmony_ci RAR_dependencies_clause[op.tempId()] = true; 292bf215546Sopenharmony_ci } 293bf215546Sopenharmony_ci } 294bf215546Sopenharmony_ci } 295bf215546Sopenharmony_ci cursor.total_demand.update(register_demand[cursor.source_idx]); 296bf215546Sopenharmony_ci cursor.source_idx--; 297bf215546Sopenharmony_ci cursor.verify_invariants(register_demand); 298bf215546Sopenharmony_ci} 299bf215546Sopenharmony_ci 300bf215546Sopenharmony_civoid 301bf215546Sopenharmony_ciUpwardsCursor::verify_invariants(const RegisterDemand* register_demand) 302bf215546Sopenharmony_ci{ 303bf215546Sopenharmony_ci#ifndef NDEBUG 304bf215546Sopenharmony_ci if (!has_insert_idx()) { 305bf215546Sopenharmony_ci return; 306bf215546Sopenharmony_ci } 307bf215546Sopenharmony_ci 308bf215546Sopenharmony_ci assert(insert_idx < source_idx); 309bf215546Sopenharmony_ci 310bf215546Sopenharmony_ci RegisterDemand reference_demand; 311bf215546Sopenharmony_ci for (int i = insert_idx; i < source_idx; ++i) { 312bf215546Sopenharmony_ci reference_demand.update(register_demand[i]); 313bf215546Sopenharmony_ci } 314bf215546Sopenharmony_ci assert(total_demand == reference_demand); 315bf215546Sopenharmony_ci#endif 316bf215546Sopenharmony_ci} 317bf215546Sopenharmony_ci 318bf215546Sopenharmony_ciUpwardsCursor 319bf215546Sopenharmony_ciMoveState::upwards_init(int source_idx, bool improved_rar_) 320bf215546Sopenharmony_ci{ 321bf215546Sopenharmony_ci improved_rar = improved_rar_; 322bf215546Sopenharmony_ci 323bf215546Sopenharmony_ci std::fill(depends_on.begin(), depends_on.end(), false); 324bf215546Sopenharmony_ci std::fill(RAR_dependencies.begin(), RAR_dependencies.end(), false); 325bf215546Sopenharmony_ci 326bf215546Sopenharmony_ci for (const Definition& def : current->definitions) { 327bf215546Sopenharmony_ci if (def.isTemp()) 328bf215546Sopenharmony_ci depends_on[def.tempId()] = true; 329bf215546Sopenharmony_ci } 330bf215546Sopenharmony_ci 331bf215546Sopenharmony_ci return UpwardsCursor(source_idx); 332bf215546Sopenharmony_ci} 333bf215546Sopenharmony_ci 334bf215546Sopenharmony_cibool 335bf215546Sopenharmony_ciMoveState::upwards_check_deps(UpwardsCursor& cursor) 336bf215546Sopenharmony_ci{ 337bf215546Sopenharmony_ci aco_ptr<Instruction>& instr = block->instructions[cursor.source_idx]; 338bf215546Sopenharmony_ci for (const Operand& op : instr->operands) { 339bf215546Sopenharmony_ci if (op.isTemp() && depends_on[op.tempId()]) 340bf215546Sopenharmony_ci return false; 341bf215546Sopenharmony_ci } 342bf215546Sopenharmony_ci return true; 343bf215546Sopenharmony_ci} 344bf215546Sopenharmony_ci 345bf215546Sopenharmony_civoid 346bf215546Sopenharmony_ciMoveState::upwards_update_insert_idx(UpwardsCursor& cursor) 347bf215546Sopenharmony_ci{ 348bf215546Sopenharmony_ci cursor.insert_idx = cursor.source_idx; 349bf215546Sopenharmony_ci cursor.total_demand = register_demand[cursor.insert_idx]; 350bf215546Sopenharmony_ci} 351bf215546Sopenharmony_ci 352bf215546Sopenharmony_ciMoveResult 353bf215546Sopenharmony_ciMoveState::upwards_move(UpwardsCursor& cursor) 354bf215546Sopenharmony_ci{ 355bf215546Sopenharmony_ci assert(cursor.has_insert_idx()); 356bf215546Sopenharmony_ci 357bf215546Sopenharmony_ci aco_ptr<Instruction>& instr = block->instructions[cursor.source_idx]; 358bf215546Sopenharmony_ci for (const Operand& op : instr->operands) { 359bf215546Sopenharmony_ci if (op.isTemp() && depends_on[op.tempId()]) 360bf215546Sopenharmony_ci return move_fail_ssa; 361bf215546Sopenharmony_ci } 362bf215546Sopenharmony_ci 363bf215546Sopenharmony_ci /* check if candidate uses/kills an operand which is used by a dependency */ 364bf215546Sopenharmony_ci for (const Operand& op : instr->operands) { 365bf215546Sopenharmony_ci if (op.isTemp() && (!improved_rar || op.isFirstKill()) && RAR_dependencies[op.tempId()]) 366bf215546Sopenharmony_ci return move_fail_rar; 367bf215546Sopenharmony_ci } 368bf215546Sopenharmony_ci 369bf215546Sopenharmony_ci /* check if register pressure is low enough: the diff is negative if register pressure is 370bf215546Sopenharmony_ci * decreased */ 371bf215546Sopenharmony_ci const RegisterDemand candidate_diff = get_live_changes(instr); 372bf215546Sopenharmony_ci const RegisterDemand temp = get_temp_registers(instr); 373bf215546Sopenharmony_ci if (RegisterDemand(cursor.total_demand + candidate_diff).exceeds(max_registers)) 374bf215546Sopenharmony_ci return move_fail_pressure; 375bf215546Sopenharmony_ci const RegisterDemand temp2 = get_temp_registers(block->instructions[cursor.insert_idx - 1]); 376bf215546Sopenharmony_ci const RegisterDemand new_demand = 377bf215546Sopenharmony_ci register_demand[cursor.insert_idx - 1] - temp2 + candidate_diff + temp; 378bf215546Sopenharmony_ci if (new_demand.exceeds(max_registers)) 379bf215546Sopenharmony_ci return move_fail_pressure; 380bf215546Sopenharmony_ci 381bf215546Sopenharmony_ci /* move the candidate above the insert_idx */ 382bf215546Sopenharmony_ci move_element(block->instructions.begin(), cursor.source_idx, cursor.insert_idx); 383bf215546Sopenharmony_ci 384bf215546Sopenharmony_ci /* update register pressure */ 385bf215546Sopenharmony_ci move_element(register_demand, cursor.source_idx, cursor.insert_idx); 386bf215546Sopenharmony_ci register_demand[cursor.insert_idx] = new_demand; 387bf215546Sopenharmony_ci for (int i = cursor.insert_idx + 1; i <= cursor.source_idx; i++) 388bf215546Sopenharmony_ci register_demand[i] += candidate_diff; 389bf215546Sopenharmony_ci cursor.total_demand += candidate_diff; 390bf215546Sopenharmony_ci 391bf215546Sopenharmony_ci cursor.total_demand.update(register_demand[cursor.source_idx]); 392bf215546Sopenharmony_ci 393bf215546Sopenharmony_ci cursor.insert_idx++; 394bf215546Sopenharmony_ci cursor.source_idx++; 395bf215546Sopenharmony_ci 396bf215546Sopenharmony_ci cursor.verify_invariants(register_demand); 397bf215546Sopenharmony_ci 398bf215546Sopenharmony_ci return move_success; 399bf215546Sopenharmony_ci} 400bf215546Sopenharmony_ci 401bf215546Sopenharmony_civoid 402bf215546Sopenharmony_ciMoveState::upwards_skip(UpwardsCursor& cursor) 403bf215546Sopenharmony_ci{ 404bf215546Sopenharmony_ci if (cursor.has_insert_idx()) { 405bf215546Sopenharmony_ci aco_ptr<Instruction>& instr = block->instructions[cursor.source_idx]; 406bf215546Sopenharmony_ci for (const Definition& def : instr->definitions) { 407bf215546Sopenharmony_ci if (def.isTemp()) 408bf215546Sopenharmony_ci depends_on[def.tempId()] = true; 409bf215546Sopenharmony_ci } 410bf215546Sopenharmony_ci for (const Operand& op : instr->operands) { 411bf215546Sopenharmony_ci if (op.isTemp()) 412bf215546Sopenharmony_ci RAR_dependencies[op.tempId()] = true; 413bf215546Sopenharmony_ci } 414bf215546Sopenharmony_ci cursor.total_demand.update(register_demand[cursor.source_idx]); 415bf215546Sopenharmony_ci } 416bf215546Sopenharmony_ci 417bf215546Sopenharmony_ci cursor.source_idx++; 418bf215546Sopenharmony_ci 419bf215546Sopenharmony_ci cursor.verify_invariants(register_demand); 420bf215546Sopenharmony_ci} 421bf215546Sopenharmony_ci 422bf215546Sopenharmony_cibool 423bf215546Sopenharmony_ciis_gs_or_done_sendmsg(const Instruction* instr) 424bf215546Sopenharmony_ci{ 425bf215546Sopenharmony_ci if (instr->opcode == aco_opcode::s_sendmsg) { 426bf215546Sopenharmony_ci uint16_t imm = instr->sopp().imm; 427bf215546Sopenharmony_ci return (imm & sendmsg_id_mask) == _sendmsg_gs || (imm & sendmsg_id_mask) == _sendmsg_gs_done; 428bf215546Sopenharmony_ci } 429bf215546Sopenharmony_ci return false; 430bf215546Sopenharmony_ci} 431bf215546Sopenharmony_ci 432bf215546Sopenharmony_cibool 433bf215546Sopenharmony_ciis_done_sendmsg(const Instruction* instr) 434bf215546Sopenharmony_ci{ 435bf215546Sopenharmony_ci if (instr->opcode == aco_opcode::s_sendmsg) 436bf215546Sopenharmony_ci return (instr->sopp().imm & sendmsg_id_mask) == _sendmsg_gs_done; 437bf215546Sopenharmony_ci return false; 438bf215546Sopenharmony_ci} 439bf215546Sopenharmony_ci 440bf215546Sopenharmony_cimemory_sync_info 441bf215546Sopenharmony_ciget_sync_info_with_hack(const Instruction* instr) 442bf215546Sopenharmony_ci{ 443bf215546Sopenharmony_ci memory_sync_info sync = get_sync_info(instr); 444bf215546Sopenharmony_ci if (instr->isSMEM() && !instr->operands.empty() && instr->operands[0].bytes() == 16) { 445bf215546Sopenharmony_ci // FIXME: currently, it doesn't seem beneficial to omit this due to how our scheduler works 446bf215546Sopenharmony_ci sync.storage = (storage_class)(sync.storage | storage_buffer); 447bf215546Sopenharmony_ci sync.semantics = 448bf215546Sopenharmony_ci (memory_semantics)((sync.semantics | semantic_private) & ~semantic_can_reorder); 449bf215546Sopenharmony_ci } 450bf215546Sopenharmony_ci return sync; 451bf215546Sopenharmony_ci} 452bf215546Sopenharmony_ci 453bf215546Sopenharmony_cistruct memory_event_set { 454bf215546Sopenharmony_ci bool has_control_barrier; 455bf215546Sopenharmony_ci 456bf215546Sopenharmony_ci unsigned bar_acquire; 457bf215546Sopenharmony_ci unsigned bar_release; 458bf215546Sopenharmony_ci unsigned bar_classes; 459bf215546Sopenharmony_ci 460bf215546Sopenharmony_ci unsigned access_acquire; 461bf215546Sopenharmony_ci unsigned access_release; 462bf215546Sopenharmony_ci unsigned access_relaxed; 463bf215546Sopenharmony_ci unsigned access_atomic; 464bf215546Sopenharmony_ci}; 465bf215546Sopenharmony_ci 466bf215546Sopenharmony_cistruct hazard_query { 467bf215546Sopenharmony_ci bool contains_spill; 468bf215546Sopenharmony_ci bool contains_sendmsg; 469bf215546Sopenharmony_ci bool uses_exec; 470bf215546Sopenharmony_ci memory_event_set mem_events; 471bf215546Sopenharmony_ci unsigned aliasing_storage; /* storage classes which are accessed (non-SMEM) */ 472bf215546Sopenharmony_ci unsigned aliasing_storage_smem; /* storage classes which are accessed (SMEM) */ 473bf215546Sopenharmony_ci}; 474bf215546Sopenharmony_ci 475bf215546Sopenharmony_civoid 476bf215546Sopenharmony_ciinit_hazard_query(hazard_query* query) 477bf215546Sopenharmony_ci{ 478bf215546Sopenharmony_ci query->contains_spill = false; 479bf215546Sopenharmony_ci query->contains_sendmsg = false; 480bf215546Sopenharmony_ci query->uses_exec = false; 481bf215546Sopenharmony_ci memset(&query->mem_events, 0, sizeof(query->mem_events)); 482bf215546Sopenharmony_ci query->aliasing_storage = 0; 483bf215546Sopenharmony_ci query->aliasing_storage_smem = 0; 484bf215546Sopenharmony_ci} 485bf215546Sopenharmony_ci 486bf215546Sopenharmony_civoid 487bf215546Sopenharmony_ciadd_memory_event(memory_event_set* set, Instruction* instr, memory_sync_info* sync) 488bf215546Sopenharmony_ci{ 489bf215546Sopenharmony_ci set->has_control_barrier |= is_done_sendmsg(instr); 490bf215546Sopenharmony_ci if (instr->opcode == aco_opcode::p_barrier) { 491bf215546Sopenharmony_ci Pseudo_barrier_instruction& bar = instr->barrier(); 492bf215546Sopenharmony_ci if (bar.sync.semantics & semantic_acquire) 493bf215546Sopenharmony_ci set->bar_acquire |= bar.sync.storage; 494bf215546Sopenharmony_ci if (bar.sync.semantics & semantic_release) 495bf215546Sopenharmony_ci set->bar_release |= bar.sync.storage; 496bf215546Sopenharmony_ci set->bar_classes |= bar.sync.storage; 497bf215546Sopenharmony_ci 498bf215546Sopenharmony_ci set->has_control_barrier |= bar.exec_scope > scope_invocation; 499bf215546Sopenharmony_ci } 500bf215546Sopenharmony_ci 501bf215546Sopenharmony_ci if (!sync->storage) 502bf215546Sopenharmony_ci return; 503bf215546Sopenharmony_ci 504bf215546Sopenharmony_ci if (sync->semantics & semantic_acquire) 505bf215546Sopenharmony_ci set->access_acquire |= sync->storage; 506bf215546Sopenharmony_ci if (sync->semantics & semantic_release) 507bf215546Sopenharmony_ci set->access_release |= sync->storage; 508bf215546Sopenharmony_ci 509bf215546Sopenharmony_ci if (!(sync->semantics & semantic_private)) { 510bf215546Sopenharmony_ci if (sync->semantics & semantic_atomic) 511bf215546Sopenharmony_ci set->access_atomic |= sync->storage; 512bf215546Sopenharmony_ci else 513bf215546Sopenharmony_ci set->access_relaxed |= sync->storage; 514bf215546Sopenharmony_ci } 515bf215546Sopenharmony_ci} 516bf215546Sopenharmony_ci 517bf215546Sopenharmony_civoid 518bf215546Sopenharmony_ciadd_to_hazard_query(hazard_query* query, Instruction* instr) 519bf215546Sopenharmony_ci{ 520bf215546Sopenharmony_ci if (instr->opcode == aco_opcode::p_spill || instr->opcode == aco_opcode::p_reload) 521bf215546Sopenharmony_ci query->contains_spill = true; 522bf215546Sopenharmony_ci query->contains_sendmsg |= instr->opcode == aco_opcode::s_sendmsg; 523bf215546Sopenharmony_ci query->uses_exec |= needs_exec_mask(instr); 524bf215546Sopenharmony_ci 525bf215546Sopenharmony_ci memory_sync_info sync = get_sync_info_with_hack(instr); 526bf215546Sopenharmony_ci 527bf215546Sopenharmony_ci add_memory_event(&query->mem_events, instr, &sync); 528bf215546Sopenharmony_ci 529bf215546Sopenharmony_ci if (!(sync.semantics & semantic_can_reorder)) { 530bf215546Sopenharmony_ci unsigned storage = sync.storage; 531bf215546Sopenharmony_ci /* images and buffer/global memory can alias */ // TODO: more precisely, buffer images and 532bf215546Sopenharmony_ci // buffer/global memory can alias 533bf215546Sopenharmony_ci if (storage & (storage_buffer | storage_image)) 534bf215546Sopenharmony_ci storage |= storage_buffer | storage_image; 535bf215546Sopenharmony_ci if (instr->isSMEM()) 536bf215546Sopenharmony_ci query->aliasing_storage_smem |= storage; 537bf215546Sopenharmony_ci else 538bf215546Sopenharmony_ci query->aliasing_storage |= storage; 539bf215546Sopenharmony_ci } 540bf215546Sopenharmony_ci} 541bf215546Sopenharmony_ci 542bf215546Sopenharmony_cienum HazardResult { 543bf215546Sopenharmony_ci hazard_success, 544bf215546Sopenharmony_ci hazard_fail_reorder_vmem_smem, 545bf215546Sopenharmony_ci hazard_fail_reorder_ds, 546bf215546Sopenharmony_ci hazard_fail_reorder_sendmsg, 547bf215546Sopenharmony_ci hazard_fail_spill, 548bf215546Sopenharmony_ci hazard_fail_export, 549bf215546Sopenharmony_ci hazard_fail_barrier, 550bf215546Sopenharmony_ci /* Must stop at these failures. The hazard query code doesn't consider them 551bf215546Sopenharmony_ci * when added. */ 552bf215546Sopenharmony_ci hazard_fail_exec, 553bf215546Sopenharmony_ci hazard_fail_unreorderable, 554bf215546Sopenharmony_ci}; 555bf215546Sopenharmony_ci 556bf215546Sopenharmony_ciHazardResult 557bf215546Sopenharmony_ciperform_hazard_query(hazard_query* query, Instruction* instr, bool upwards) 558bf215546Sopenharmony_ci{ 559bf215546Sopenharmony_ci /* don't schedule discards downwards */ 560bf215546Sopenharmony_ci if (!upwards && instr->opcode == aco_opcode::p_exit_early_if) 561bf215546Sopenharmony_ci return hazard_fail_unreorderable; 562bf215546Sopenharmony_ci 563bf215546Sopenharmony_ci if (query->uses_exec) { 564bf215546Sopenharmony_ci for (const Definition& def : instr->definitions) { 565bf215546Sopenharmony_ci if (def.isFixed() && def.physReg() == exec) 566bf215546Sopenharmony_ci return hazard_fail_exec; 567bf215546Sopenharmony_ci } 568bf215546Sopenharmony_ci } 569bf215546Sopenharmony_ci 570bf215546Sopenharmony_ci /* don't move exports so that they stay closer together */ 571bf215546Sopenharmony_ci if (instr->isEXP()) 572bf215546Sopenharmony_ci return hazard_fail_export; 573bf215546Sopenharmony_ci 574bf215546Sopenharmony_ci /* don't move non-reorderable instructions */ 575bf215546Sopenharmony_ci if (instr->opcode == aco_opcode::s_memtime || instr->opcode == aco_opcode::s_memrealtime || 576bf215546Sopenharmony_ci instr->opcode == aco_opcode::s_setprio || instr->opcode == aco_opcode::s_getreg_b32 || 577bf215546Sopenharmony_ci instr->opcode == aco_opcode::p_init_scratch || instr->opcode == aco_opcode::p_jump_to_epilog) 578bf215546Sopenharmony_ci return hazard_fail_unreorderable; 579bf215546Sopenharmony_ci 580bf215546Sopenharmony_ci memory_event_set instr_set; 581bf215546Sopenharmony_ci memset(&instr_set, 0, sizeof(instr_set)); 582bf215546Sopenharmony_ci memory_sync_info sync = get_sync_info_with_hack(instr); 583bf215546Sopenharmony_ci add_memory_event(&instr_set, instr, &sync); 584bf215546Sopenharmony_ci 585bf215546Sopenharmony_ci memory_event_set* first = &instr_set; 586bf215546Sopenharmony_ci memory_event_set* second = &query->mem_events; 587bf215546Sopenharmony_ci if (upwards) 588bf215546Sopenharmony_ci std::swap(first, second); 589bf215546Sopenharmony_ci 590bf215546Sopenharmony_ci /* everything after barrier(acquire) happens after the atomics/control_barriers before 591bf215546Sopenharmony_ci * everything after load(acquire) happens after the load 592bf215546Sopenharmony_ci */ 593bf215546Sopenharmony_ci if ((first->has_control_barrier || first->access_atomic) && second->bar_acquire) 594bf215546Sopenharmony_ci return hazard_fail_barrier; 595bf215546Sopenharmony_ci if (((first->access_acquire || first->bar_acquire) && second->bar_classes) || 596bf215546Sopenharmony_ci ((first->access_acquire | first->bar_acquire) & 597bf215546Sopenharmony_ci (second->access_relaxed | second->access_atomic))) 598bf215546Sopenharmony_ci return hazard_fail_barrier; 599bf215546Sopenharmony_ci 600bf215546Sopenharmony_ci /* everything before barrier(release) happens before the atomics/control_barriers after * 601bf215546Sopenharmony_ci * everything before store(release) happens before the store 602bf215546Sopenharmony_ci */ 603bf215546Sopenharmony_ci if (first->bar_release && (second->has_control_barrier || second->access_atomic)) 604bf215546Sopenharmony_ci return hazard_fail_barrier; 605bf215546Sopenharmony_ci if ((first->bar_classes && (second->bar_release || second->access_release)) || 606bf215546Sopenharmony_ci ((first->access_relaxed | first->access_atomic) & 607bf215546Sopenharmony_ci (second->bar_release | second->access_release))) 608bf215546Sopenharmony_ci return hazard_fail_barrier; 609bf215546Sopenharmony_ci 610bf215546Sopenharmony_ci /* don't move memory barriers around other memory barriers */ 611bf215546Sopenharmony_ci if (first->bar_classes && second->bar_classes) 612bf215546Sopenharmony_ci return hazard_fail_barrier; 613bf215546Sopenharmony_ci 614bf215546Sopenharmony_ci /* Don't move memory accesses to before control barriers. I don't think 615bf215546Sopenharmony_ci * this is necessary for the Vulkan memory model, but it might be for GLSL450. */ 616bf215546Sopenharmony_ci unsigned control_classes = 617bf215546Sopenharmony_ci storage_buffer | storage_atomic_counter | storage_image | storage_shared | 618bf215546Sopenharmony_ci storage_task_payload; 619bf215546Sopenharmony_ci if (first->has_control_barrier && 620bf215546Sopenharmony_ci ((second->access_atomic | second->access_relaxed) & control_classes)) 621bf215546Sopenharmony_ci return hazard_fail_barrier; 622bf215546Sopenharmony_ci 623bf215546Sopenharmony_ci /* don't move memory loads/stores past potentially aliasing loads/stores */ 624bf215546Sopenharmony_ci unsigned aliasing_storage = 625bf215546Sopenharmony_ci instr->isSMEM() ? query->aliasing_storage_smem : query->aliasing_storage; 626bf215546Sopenharmony_ci if ((sync.storage & aliasing_storage) && !(sync.semantics & semantic_can_reorder)) { 627bf215546Sopenharmony_ci unsigned intersect = sync.storage & aliasing_storage; 628bf215546Sopenharmony_ci if (intersect & storage_shared) 629bf215546Sopenharmony_ci return hazard_fail_reorder_ds; 630bf215546Sopenharmony_ci return hazard_fail_reorder_vmem_smem; 631bf215546Sopenharmony_ci } 632bf215546Sopenharmony_ci 633bf215546Sopenharmony_ci if ((instr->opcode == aco_opcode::p_spill || instr->opcode == aco_opcode::p_reload) && 634bf215546Sopenharmony_ci query->contains_spill) 635bf215546Sopenharmony_ci return hazard_fail_spill; 636bf215546Sopenharmony_ci 637bf215546Sopenharmony_ci if (instr->opcode == aco_opcode::s_sendmsg && query->contains_sendmsg) 638bf215546Sopenharmony_ci return hazard_fail_reorder_sendmsg; 639bf215546Sopenharmony_ci 640bf215546Sopenharmony_ci return hazard_success; 641bf215546Sopenharmony_ci} 642bf215546Sopenharmony_ci 643bf215546Sopenharmony_civoid 644bf215546Sopenharmony_cischedule_SMEM(sched_ctx& ctx, Block* block, std::vector<RegisterDemand>& register_demand, 645bf215546Sopenharmony_ci Instruction* current, int idx) 646bf215546Sopenharmony_ci{ 647bf215546Sopenharmony_ci assert(idx != 0); 648bf215546Sopenharmony_ci int window_size = SMEM_WINDOW_SIZE; 649bf215546Sopenharmony_ci int max_moves = SMEM_MAX_MOVES; 650bf215546Sopenharmony_ci int16_t k = 0; 651bf215546Sopenharmony_ci 652bf215546Sopenharmony_ci /* don't move s_memtime/s_memrealtime */ 653bf215546Sopenharmony_ci if (current->opcode == aco_opcode::s_memtime || current->opcode == aco_opcode::s_memrealtime) 654bf215546Sopenharmony_ci return; 655bf215546Sopenharmony_ci 656bf215546Sopenharmony_ci /* first, check if we have instructions before current to move down */ 657bf215546Sopenharmony_ci hazard_query hq; 658bf215546Sopenharmony_ci init_hazard_query(&hq); 659bf215546Sopenharmony_ci add_to_hazard_query(&hq, current); 660bf215546Sopenharmony_ci 661bf215546Sopenharmony_ci DownwardsCursor cursor = ctx.mv.downwards_init(idx, false, false); 662bf215546Sopenharmony_ci 663bf215546Sopenharmony_ci for (int candidate_idx = idx - 1; k < max_moves && candidate_idx > (int)idx - window_size; 664bf215546Sopenharmony_ci candidate_idx--) { 665bf215546Sopenharmony_ci assert(candidate_idx >= 0); 666bf215546Sopenharmony_ci assert(candidate_idx == cursor.source_idx); 667bf215546Sopenharmony_ci aco_ptr<Instruction>& candidate = block->instructions[candidate_idx]; 668bf215546Sopenharmony_ci 669bf215546Sopenharmony_ci /* break if we'd make the previous SMEM instruction stall */ 670bf215546Sopenharmony_ci bool can_stall_prev_smem = 671bf215546Sopenharmony_ci idx <= ctx.last_SMEM_dep_idx && candidate_idx < ctx.last_SMEM_dep_idx; 672bf215546Sopenharmony_ci if (can_stall_prev_smem && ctx.last_SMEM_stall >= 0) 673bf215546Sopenharmony_ci break; 674bf215546Sopenharmony_ci 675bf215546Sopenharmony_ci /* break when encountering another MEM instruction, logical_start or barriers */ 676bf215546Sopenharmony_ci if (candidate->opcode == aco_opcode::p_logical_start) 677bf215546Sopenharmony_ci break; 678bf215546Sopenharmony_ci /* only move VMEM instructions below descriptor loads. be more aggressive at higher num_waves 679bf215546Sopenharmony_ci * to help create more vmem clauses */ 680bf215546Sopenharmony_ci if ((candidate->isVMEM() || candidate->isFlatLike()) && 681bf215546Sopenharmony_ci (cursor.insert_idx - cursor.source_idx > (ctx.num_waves * 4) || 682bf215546Sopenharmony_ci current->operands[0].size() == 4)) 683bf215546Sopenharmony_ci break; 684bf215546Sopenharmony_ci /* don't move descriptor loads below buffer loads */ 685bf215546Sopenharmony_ci if (candidate->format == Format::SMEM && current->operands[0].size() == 4 && 686bf215546Sopenharmony_ci candidate->operands[0].size() == 2) 687bf215546Sopenharmony_ci break; 688bf215546Sopenharmony_ci 689bf215546Sopenharmony_ci bool can_move_down = true; 690bf215546Sopenharmony_ci 691bf215546Sopenharmony_ci HazardResult haz = perform_hazard_query(&hq, candidate.get(), false); 692bf215546Sopenharmony_ci if (haz == hazard_fail_reorder_ds || haz == hazard_fail_spill || 693bf215546Sopenharmony_ci haz == hazard_fail_reorder_sendmsg || haz == hazard_fail_barrier || 694bf215546Sopenharmony_ci haz == hazard_fail_export) 695bf215546Sopenharmony_ci can_move_down = false; 696bf215546Sopenharmony_ci else if (haz != hazard_success) 697bf215546Sopenharmony_ci break; 698bf215546Sopenharmony_ci 699bf215546Sopenharmony_ci /* don't use LDS/GDS instructions to hide latency since it can 700bf215546Sopenharmony_ci * significanly worsen LDS scheduling */ 701bf215546Sopenharmony_ci if (candidate->isDS() || !can_move_down) { 702bf215546Sopenharmony_ci add_to_hazard_query(&hq, candidate.get()); 703bf215546Sopenharmony_ci ctx.mv.downwards_skip(cursor); 704bf215546Sopenharmony_ci continue; 705bf215546Sopenharmony_ci } 706bf215546Sopenharmony_ci 707bf215546Sopenharmony_ci MoveResult res = ctx.mv.downwards_move(cursor, false); 708bf215546Sopenharmony_ci if (res == move_fail_ssa || res == move_fail_rar) { 709bf215546Sopenharmony_ci add_to_hazard_query(&hq, candidate.get()); 710bf215546Sopenharmony_ci ctx.mv.downwards_skip(cursor); 711bf215546Sopenharmony_ci continue; 712bf215546Sopenharmony_ci } else if (res == move_fail_pressure) { 713bf215546Sopenharmony_ci break; 714bf215546Sopenharmony_ci } 715bf215546Sopenharmony_ci 716bf215546Sopenharmony_ci if (candidate_idx < ctx.last_SMEM_dep_idx) 717bf215546Sopenharmony_ci ctx.last_SMEM_stall++; 718bf215546Sopenharmony_ci k++; 719bf215546Sopenharmony_ci } 720bf215546Sopenharmony_ci 721bf215546Sopenharmony_ci /* find the first instruction depending on current or find another MEM */ 722bf215546Sopenharmony_ci UpwardsCursor up_cursor = ctx.mv.upwards_init(idx + 1, false); 723bf215546Sopenharmony_ci 724bf215546Sopenharmony_ci bool found_dependency = false; 725bf215546Sopenharmony_ci /* second, check if we have instructions after current to move up */ 726bf215546Sopenharmony_ci for (int candidate_idx = idx + 1; k < max_moves && candidate_idx < (int)idx + window_size; 727bf215546Sopenharmony_ci candidate_idx++) { 728bf215546Sopenharmony_ci assert(candidate_idx == up_cursor.source_idx); 729bf215546Sopenharmony_ci assert(candidate_idx < (int)block->instructions.size()); 730bf215546Sopenharmony_ci aco_ptr<Instruction>& candidate = block->instructions[candidate_idx]; 731bf215546Sopenharmony_ci 732bf215546Sopenharmony_ci if (candidate->opcode == aco_opcode::p_logical_end) 733bf215546Sopenharmony_ci break; 734bf215546Sopenharmony_ci 735bf215546Sopenharmony_ci /* check if candidate depends on current */ 736bf215546Sopenharmony_ci bool is_dependency = !found_dependency && !ctx.mv.upwards_check_deps(up_cursor); 737bf215546Sopenharmony_ci /* no need to steal from following VMEM instructions */ 738bf215546Sopenharmony_ci if (is_dependency && (candidate->isVMEM() || candidate->isFlatLike())) 739bf215546Sopenharmony_ci break; 740bf215546Sopenharmony_ci 741bf215546Sopenharmony_ci if (found_dependency) { 742bf215546Sopenharmony_ci HazardResult haz = perform_hazard_query(&hq, candidate.get(), true); 743bf215546Sopenharmony_ci if (haz == hazard_fail_reorder_ds || haz == hazard_fail_spill || 744bf215546Sopenharmony_ci haz == hazard_fail_reorder_sendmsg || haz == hazard_fail_barrier || 745bf215546Sopenharmony_ci haz == hazard_fail_export) 746bf215546Sopenharmony_ci is_dependency = true; 747bf215546Sopenharmony_ci else if (haz != hazard_success) 748bf215546Sopenharmony_ci break; 749bf215546Sopenharmony_ci } 750bf215546Sopenharmony_ci 751bf215546Sopenharmony_ci if (is_dependency) { 752bf215546Sopenharmony_ci if (!found_dependency) { 753bf215546Sopenharmony_ci ctx.mv.upwards_update_insert_idx(up_cursor); 754bf215546Sopenharmony_ci init_hazard_query(&hq); 755bf215546Sopenharmony_ci found_dependency = true; 756bf215546Sopenharmony_ci } 757bf215546Sopenharmony_ci } 758bf215546Sopenharmony_ci 759bf215546Sopenharmony_ci if (is_dependency || !found_dependency) { 760bf215546Sopenharmony_ci if (found_dependency) 761bf215546Sopenharmony_ci add_to_hazard_query(&hq, candidate.get()); 762bf215546Sopenharmony_ci else 763bf215546Sopenharmony_ci k++; 764bf215546Sopenharmony_ci ctx.mv.upwards_skip(up_cursor); 765bf215546Sopenharmony_ci continue; 766bf215546Sopenharmony_ci } 767bf215546Sopenharmony_ci 768bf215546Sopenharmony_ci MoveResult res = ctx.mv.upwards_move(up_cursor); 769bf215546Sopenharmony_ci if (res == move_fail_ssa || res == move_fail_rar) { 770bf215546Sopenharmony_ci /* no need to steal from following VMEM instructions */ 771bf215546Sopenharmony_ci if (res == move_fail_ssa && (candidate->isVMEM() || candidate->isFlatLike())) 772bf215546Sopenharmony_ci break; 773bf215546Sopenharmony_ci add_to_hazard_query(&hq, candidate.get()); 774bf215546Sopenharmony_ci ctx.mv.upwards_skip(up_cursor); 775bf215546Sopenharmony_ci continue; 776bf215546Sopenharmony_ci } else if (res == move_fail_pressure) { 777bf215546Sopenharmony_ci break; 778bf215546Sopenharmony_ci } 779bf215546Sopenharmony_ci k++; 780bf215546Sopenharmony_ci } 781bf215546Sopenharmony_ci 782bf215546Sopenharmony_ci ctx.last_SMEM_dep_idx = found_dependency ? up_cursor.insert_idx : 0; 783bf215546Sopenharmony_ci ctx.last_SMEM_stall = 10 - ctx.num_waves - k; 784bf215546Sopenharmony_ci} 785bf215546Sopenharmony_ci 786bf215546Sopenharmony_civoid 787bf215546Sopenharmony_cischedule_VMEM(sched_ctx& ctx, Block* block, std::vector<RegisterDemand>& register_demand, 788bf215546Sopenharmony_ci Instruction* current, int idx) 789bf215546Sopenharmony_ci{ 790bf215546Sopenharmony_ci assert(idx != 0); 791bf215546Sopenharmony_ci int window_size = VMEM_WINDOW_SIZE; 792bf215546Sopenharmony_ci int max_moves = VMEM_MAX_MOVES; 793bf215546Sopenharmony_ci int clause_max_grab_dist = VMEM_CLAUSE_MAX_GRAB_DIST; 794bf215546Sopenharmony_ci bool only_clauses = false; 795bf215546Sopenharmony_ci int16_t k = 0; 796bf215546Sopenharmony_ci 797bf215546Sopenharmony_ci /* first, check if we have instructions before current to move down */ 798bf215546Sopenharmony_ci hazard_query indep_hq; 799bf215546Sopenharmony_ci hazard_query clause_hq; 800bf215546Sopenharmony_ci init_hazard_query(&indep_hq); 801bf215546Sopenharmony_ci init_hazard_query(&clause_hq); 802bf215546Sopenharmony_ci add_to_hazard_query(&indep_hq, current); 803bf215546Sopenharmony_ci 804bf215546Sopenharmony_ci DownwardsCursor cursor = ctx.mv.downwards_init(idx, true, true); 805bf215546Sopenharmony_ci 806bf215546Sopenharmony_ci for (int candidate_idx = idx - 1; k < max_moves && candidate_idx > (int)idx - window_size; 807bf215546Sopenharmony_ci candidate_idx--) { 808bf215546Sopenharmony_ci assert(candidate_idx == cursor.source_idx); 809bf215546Sopenharmony_ci assert(candidate_idx >= 0); 810bf215546Sopenharmony_ci aco_ptr<Instruction>& candidate = block->instructions[candidate_idx]; 811bf215546Sopenharmony_ci bool is_vmem = candidate->isVMEM() || candidate->isFlatLike(); 812bf215546Sopenharmony_ci 813bf215546Sopenharmony_ci /* break when encountering another VMEM instruction, logical_start or barriers */ 814bf215546Sopenharmony_ci if (candidate->opcode == aco_opcode::p_logical_start) 815bf215546Sopenharmony_ci break; 816bf215546Sopenharmony_ci 817bf215546Sopenharmony_ci /* break if we'd make the previous SMEM instruction stall */ 818bf215546Sopenharmony_ci bool can_stall_prev_smem = 819bf215546Sopenharmony_ci idx <= ctx.last_SMEM_dep_idx && candidate_idx < ctx.last_SMEM_dep_idx; 820bf215546Sopenharmony_ci if (can_stall_prev_smem && ctx.last_SMEM_stall >= 0) 821bf215546Sopenharmony_ci break; 822bf215546Sopenharmony_ci 823bf215546Sopenharmony_ci bool part_of_clause = false; 824bf215546Sopenharmony_ci if (current->isVMEM() == candidate->isVMEM()) { 825bf215546Sopenharmony_ci int grab_dist = cursor.insert_idx_clause - candidate_idx; 826bf215546Sopenharmony_ci /* We can't easily tell how much this will decrease the def-to-use 827bf215546Sopenharmony_ci * distances, so just use how far it will be moved as a heuristic. */ 828bf215546Sopenharmony_ci part_of_clause = 829bf215546Sopenharmony_ci grab_dist < clause_max_grab_dist + k && should_form_clause(current, candidate.get()); 830bf215546Sopenharmony_ci } 831bf215546Sopenharmony_ci 832bf215546Sopenharmony_ci /* if current depends on candidate, add additional dependencies and continue */ 833bf215546Sopenharmony_ci bool can_move_down = !is_vmem || part_of_clause || candidate->definitions.empty(); 834bf215546Sopenharmony_ci if (only_clauses) { 835bf215546Sopenharmony_ci /* In case of high register pressure, only try to form clauses, 836bf215546Sopenharmony_ci * and only if the previous clause is not larger 837bf215546Sopenharmony_ci * than the current one will be. 838bf215546Sopenharmony_ci */ 839bf215546Sopenharmony_ci if (part_of_clause) { 840bf215546Sopenharmony_ci int clause_size = cursor.insert_idx - cursor.insert_idx_clause; 841bf215546Sopenharmony_ci int prev_clause_size = 1; 842bf215546Sopenharmony_ci while (should_form_clause(current, 843bf215546Sopenharmony_ci block->instructions[candidate_idx - prev_clause_size].get())) 844bf215546Sopenharmony_ci prev_clause_size++; 845bf215546Sopenharmony_ci if (prev_clause_size > clause_size + 1) 846bf215546Sopenharmony_ci break; 847bf215546Sopenharmony_ci } else { 848bf215546Sopenharmony_ci can_move_down = false; 849bf215546Sopenharmony_ci } 850bf215546Sopenharmony_ci } 851bf215546Sopenharmony_ci HazardResult haz = 852bf215546Sopenharmony_ci perform_hazard_query(part_of_clause ? &clause_hq : &indep_hq, candidate.get(), false); 853bf215546Sopenharmony_ci if (haz == hazard_fail_reorder_ds || haz == hazard_fail_spill || 854bf215546Sopenharmony_ci haz == hazard_fail_reorder_sendmsg || haz == hazard_fail_barrier || 855bf215546Sopenharmony_ci haz == hazard_fail_export) 856bf215546Sopenharmony_ci can_move_down = false; 857bf215546Sopenharmony_ci else if (haz != hazard_success) 858bf215546Sopenharmony_ci break; 859bf215546Sopenharmony_ci 860bf215546Sopenharmony_ci if (!can_move_down) { 861bf215546Sopenharmony_ci if (part_of_clause) 862bf215546Sopenharmony_ci break; 863bf215546Sopenharmony_ci add_to_hazard_query(&indep_hq, candidate.get()); 864bf215546Sopenharmony_ci add_to_hazard_query(&clause_hq, candidate.get()); 865bf215546Sopenharmony_ci ctx.mv.downwards_skip(cursor); 866bf215546Sopenharmony_ci continue; 867bf215546Sopenharmony_ci } 868bf215546Sopenharmony_ci 869bf215546Sopenharmony_ci Instruction* candidate_ptr = candidate.get(); 870bf215546Sopenharmony_ci MoveResult res = ctx.mv.downwards_move(cursor, part_of_clause); 871bf215546Sopenharmony_ci if (res == move_fail_ssa || res == move_fail_rar) { 872bf215546Sopenharmony_ci if (part_of_clause) 873bf215546Sopenharmony_ci break; 874bf215546Sopenharmony_ci add_to_hazard_query(&indep_hq, candidate.get()); 875bf215546Sopenharmony_ci add_to_hazard_query(&clause_hq, candidate.get()); 876bf215546Sopenharmony_ci ctx.mv.downwards_skip(cursor); 877bf215546Sopenharmony_ci continue; 878bf215546Sopenharmony_ci } else if (res == move_fail_pressure) { 879bf215546Sopenharmony_ci only_clauses = true; 880bf215546Sopenharmony_ci if (part_of_clause) 881bf215546Sopenharmony_ci break; 882bf215546Sopenharmony_ci add_to_hazard_query(&indep_hq, candidate.get()); 883bf215546Sopenharmony_ci add_to_hazard_query(&clause_hq, candidate.get()); 884bf215546Sopenharmony_ci ctx.mv.downwards_skip(cursor); 885bf215546Sopenharmony_ci continue; 886bf215546Sopenharmony_ci } 887bf215546Sopenharmony_ci if (part_of_clause) 888bf215546Sopenharmony_ci add_to_hazard_query(&indep_hq, candidate_ptr); 889bf215546Sopenharmony_ci else 890bf215546Sopenharmony_ci k++; 891bf215546Sopenharmony_ci if (candidate_idx < ctx.last_SMEM_dep_idx) 892bf215546Sopenharmony_ci ctx.last_SMEM_stall++; 893bf215546Sopenharmony_ci } 894bf215546Sopenharmony_ci 895bf215546Sopenharmony_ci /* find the first instruction depending on current or find another VMEM */ 896bf215546Sopenharmony_ci UpwardsCursor up_cursor = ctx.mv.upwards_init(idx + 1, true); 897bf215546Sopenharmony_ci 898bf215546Sopenharmony_ci bool found_dependency = false; 899bf215546Sopenharmony_ci /* second, check if we have instructions after current to move up */ 900bf215546Sopenharmony_ci for (int candidate_idx = idx + 1; k < max_moves && candidate_idx < (int)idx + window_size; 901bf215546Sopenharmony_ci candidate_idx++) { 902bf215546Sopenharmony_ci assert(candidate_idx == up_cursor.source_idx); 903bf215546Sopenharmony_ci assert(candidate_idx < (int)block->instructions.size()); 904bf215546Sopenharmony_ci aco_ptr<Instruction>& candidate = block->instructions[candidate_idx]; 905bf215546Sopenharmony_ci bool is_vmem = candidate->isVMEM() || candidate->isFlatLike(); 906bf215546Sopenharmony_ci 907bf215546Sopenharmony_ci if (candidate->opcode == aco_opcode::p_logical_end) 908bf215546Sopenharmony_ci break; 909bf215546Sopenharmony_ci 910bf215546Sopenharmony_ci /* check if candidate depends on current */ 911bf215546Sopenharmony_ci bool is_dependency = false; 912bf215546Sopenharmony_ci if (found_dependency) { 913bf215546Sopenharmony_ci HazardResult haz = perform_hazard_query(&indep_hq, candidate.get(), true); 914bf215546Sopenharmony_ci if (haz == hazard_fail_reorder_ds || haz == hazard_fail_spill || 915bf215546Sopenharmony_ci haz == hazard_fail_reorder_vmem_smem || haz == hazard_fail_reorder_sendmsg || 916bf215546Sopenharmony_ci haz == hazard_fail_barrier || haz == hazard_fail_export) 917bf215546Sopenharmony_ci is_dependency = true; 918bf215546Sopenharmony_ci else if (haz != hazard_success) 919bf215546Sopenharmony_ci break; 920bf215546Sopenharmony_ci } 921bf215546Sopenharmony_ci 922bf215546Sopenharmony_ci is_dependency |= !found_dependency && !ctx.mv.upwards_check_deps(up_cursor); 923bf215546Sopenharmony_ci if (is_dependency) { 924bf215546Sopenharmony_ci if (!found_dependency) { 925bf215546Sopenharmony_ci ctx.mv.upwards_update_insert_idx(up_cursor); 926bf215546Sopenharmony_ci init_hazard_query(&indep_hq); 927bf215546Sopenharmony_ci found_dependency = true; 928bf215546Sopenharmony_ci } 929bf215546Sopenharmony_ci } else if (is_vmem) { 930bf215546Sopenharmony_ci /* don't move up dependencies of other VMEM instructions */ 931bf215546Sopenharmony_ci for (const Definition& def : candidate->definitions) { 932bf215546Sopenharmony_ci if (def.isTemp()) 933bf215546Sopenharmony_ci ctx.mv.depends_on[def.tempId()] = true; 934bf215546Sopenharmony_ci } 935bf215546Sopenharmony_ci } 936bf215546Sopenharmony_ci 937bf215546Sopenharmony_ci if (is_dependency || !found_dependency) { 938bf215546Sopenharmony_ci if (found_dependency) 939bf215546Sopenharmony_ci add_to_hazard_query(&indep_hq, candidate.get()); 940bf215546Sopenharmony_ci else 941bf215546Sopenharmony_ci k++; 942bf215546Sopenharmony_ci ctx.mv.upwards_skip(up_cursor); 943bf215546Sopenharmony_ci continue; 944bf215546Sopenharmony_ci } 945bf215546Sopenharmony_ci 946bf215546Sopenharmony_ci MoveResult res = ctx.mv.upwards_move(up_cursor); 947bf215546Sopenharmony_ci if (res == move_fail_ssa || res == move_fail_rar) { 948bf215546Sopenharmony_ci add_to_hazard_query(&indep_hq, candidate.get()); 949bf215546Sopenharmony_ci ctx.mv.upwards_skip(up_cursor); 950bf215546Sopenharmony_ci continue; 951bf215546Sopenharmony_ci } else if (res == move_fail_pressure) { 952bf215546Sopenharmony_ci break; 953bf215546Sopenharmony_ci } 954bf215546Sopenharmony_ci k++; 955bf215546Sopenharmony_ci } 956bf215546Sopenharmony_ci} 957bf215546Sopenharmony_ci 958bf215546Sopenharmony_civoid 959bf215546Sopenharmony_cischedule_position_export(sched_ctx& ctx, Block* block, std::vector<RegisterDemand>& register_demand, 960bf215546Sopenharmony_ci Instruction* current, int idx) 961bf215546Sopenharmony_ci{ 962bf215546Sopenharmony_ci assert(idx != 0); 963bf215546Sopenharmony_ci int window_size = POS_EXP_WINDOW_SIZE / ctx.schedule_pos_export_div; 964bf215546Sopenharmony_ci int max_moves = POS_EXP_MAX_MOVES / ctx.schedule_pos_export_div; 965bf215546Sopenharmony_ci int16_t k = 0; 966bf215546Sopenharmony_ci 967bf215546Sopenharmony_ci DownwardsCursor cursor = ctx.mv.downwards_init(idx, true, false); 968bf215546Sopenharmony_ci 969bf215546Sopenharmony_ci hazard_query hq; 970bf215546Sopenharmony_ci init_hazard_query(&hq); 971bf215546Sopenharmony_ci add_to_hazard_query(&hq, current); 972bf215546Sopenharmony_ci 973bf215546Sopenharmony_ci for (int candidate_idx = idx - 1; k < max_moves && candidate_idx > (int)idx - window_size; 974bf215546Sopenharmony_ci candidate_idx--) { 975bf215546Sopenharmony_ci assert(candidate_idx >= 0); 976bf215546Sopenharmony_ci aco_ptr<Instruction>& candidate = block->instructions[candidate_idx]; 977bf215546Sopenharmony_ci 978bf215546Sopenharmony_ci if (candidate->opcode == aco_opcode::p_logical_start) 979bf215546Sopenharmony_ci break; 980bf215546Sopenharmony_ci if (candidate->isVMEM() || candidate->isSMEM() || candidate->isFlatLike()) 981bf215546Sopenharmony_ci break; 982bf215546Sopenharmony_ci 983bf215546Sopenharmony_ci HazardResult haz = perform_hazard_query(&hq, candidate.get(), false); 984bf215546Sopenharmony_ci if (haz == hazard_fail_exec || haz == hazard_fail_unreorderable) 985bf215546Sopenharmony_ci break; 986bf215546Sopenharmony_ci 987bf215546Sopenharmony_ci if (haz != hazard_success) { 988bf215546Sopenharmony_ci add_to_hazard_query(&hq, candidate.get()); 989bf215546Sopenharmony_ci ctx.mv.downwards_skip(cursor); 990bf215546Sopenharmony_ci continue; 991bf215546Sopenharmony_ci } 992bf215546Sopenharmony_ci 993bf215546Sopenharmony_ci MoveResult res = ctx.mv.downwards_move(cursor, false); 994bf215546Sopenharmony_ci if (res == move_fail_ssa || res == move_fail_rar) { 995bf215546Sopenharmony_ci add_to_hazard_query(&hq, candidate.get()); 996bf215546Sopenharmony_ci ctx.mv.downwards_skip(cursor); 997bf215546Sopenharmony_ci continue; 998bf215546Sopenharmony_ci } else if (res == move_fail_pressure) { 999bf215546Sopenharmony_ci break; 1000bf215546Sopenharmony_ci } 1001bf215546Sopenharmony_ci k++; 1002bf215546Sopenharmony_ci } 1003bf215546Sopenharmony_ci} 1004bf215546Sopenharmony_ci 1005bf215546Sopenharmony_civoid 1006bf215546Sopenharmony_cischedule_block(sched_ctx& ctx, Program* program, Block* block, live& live_vars) 1007bf215546Sopenharmony_ci{ 1008bf215546Sopenharmony_ci ctx.last_SMEM_dep_idx = 0; 1009bf215546Sopenharmony_ci ctx.last_SMEM_stall = INT16_MIN; 1010bf215546Sopenharmony_ci ctx.mv.block = block; 1011bf215546Sopenharmony_ci ctx.mv.register_demand = live_vars.register_demand[block->index].data(); 1012bf215546Sopenharmony_ci 1013bf215546Sopenharmony_ci /* go through all instructions and find memory loads */ 1014bf215546Sopenharmony_ci for (unsigned idx = 0; idx < block->instructions.size(); idx++) { 1015bf215546Sopenharmony_ci Instruction* current = block->instructions[idx].get(); 1016bf215546Sopenharmony_ci 1017bf215546Sopenharmony_ci if (block->kind & block_kind_export_end && current->isEXP() && ctx.schedule_pos_exports) { 1018bf215546Sopenharmony_ci unsigned target = current->exp().dest; 1019bf215546Sopenharmony_ci if (target >= V_008DFC_SQ_EXP_POS && target < V_008DFC_SQ_EXP_PRIM) { 1020bf215546Sopenharmony_ci ctx.mv.current = current; 1021bf215546Sopenharmony_ci schedule_position_export(ctx, block, live_vars.register_demand[block->index], current, 1022bf215546Sopenharmony_ci idx); 1023bf215546Sopenharmony_ci } 1024bf215546Sopenharmony_ci } 1025bf215546Sopenharmony_ci 1026bf215546Sopenharmony_ci if (current->definitions.empty()) 1027bf215546Sopenharmony_ci continue; 1028bf215546Sopenharmony_ci 1029bf215546Sopenharmony_ci if (current->isVMEM() || current->isFlatLike()) { 1030bf215546Sopenharmony_ci ctx.mv.current = current; 1031bf215546Sopenharmony_ci schedule_VMEM(ctx, block, live_vars.register_demand[block->index], current, idx); 1032bf215546Sopenharmony_ci } 1033bf215546Sopenharmony_ci 1034bf215546Sopenharmony_ci if (current->isSMEM()) { 1035bf215546Sopenharmony_ci ctx.mv.current = current; 1036bf215546Sopenharmony_ci schedule_SMEM(ctx, block, live_vars.register_demand[block->index], current, idx); 1037bf215546Sopenharmony_ci } 1038bf215546Sopenharmony_ci } 1039bf215546Sopenharmony_ci 1040bf215546Sopenharmony_ci /* resummarize the block's register demand */ 1041bf215546Sopenharmony_ci block->register_demand = RegisterDemand(); 1042bf215546Sopenharmony_ci for (unsigned idx = 0; idx < block->instructions.size(); idx++) { 1043bf215546Sopenharmony_ci block->register_demand.update(live_vars.register_demand[block->index][idx]); 1044bf215546Sopenharmony_ci } 1045bf215546Sopenharmony_ci} 1046bf215546Sopenharmony_ci 1047bf215546Sopenharmony_civoid 1048bf215546Sopenharmony_cischedule_program(Program* program, live& live_vars) 1049bf215546Sopenharmony_ci{ 1050bf215546Sopenharmony_ci /* don't use program->max_reg_demand because that is affected by max_waves_per_simd */ 1051bf215546Sopenharmony_ci RegisterDemand demand; 1052bf215546Sopenharmony_ci for (Block& block : program->blocks) 1053bf215546Sopenharmony_ci demand.update(block.register_demand); 1054bf215546Sopenharmony_ci demand.vgpr += program->config->num_shared_vgprs / 2; 1055bf215546Sopenharmony_ci 1056bf215546Sopenharmony_ci sched_ctx ctx; 1057bf215546Sopenharmony_ci ctx.mv.depends_on.resize(program->peekAllocationId()); 1058bf215546Sopenharmony_ci ctx.mv.RAR_dependencies.resize(program->peekAllocationId()); 1059bf215546Sopenharmony_ci ctx.mv.RAR_dependencies_clause.resize(program->peekAllocationId()); 1060bf215546Sopenharmony_ci /* Allowing the scheduler to reduce the number of waves to as low as 5 1061bf215546Sopenharmony_ci * improves performance of Thrones of Britannia significantly and doesn't 1062bf215546Sopenharmony_ci * seem to hurt anything else. */ 1063bf215546Sopenharmony_ci // TODO: account for possible uneven num_waves on GFX10+ 1064bf215546Sopenharmony_ci unsigned wave_fac = program->dev.physical_vgprs / 256; 1065bf215546Sopenharmony_ci if (program->num_waves <= 5 * wave_fac) 1066bf215546Sopenharmony_ci ctx.num_waves = program->num_waves; 1067bf215546Sopenharmony_ci else if (demand.vgpr >= 29) 1068bf215546Sopenharmony_ci ctx.num_waves = 5 * wave_fac; 1069bf215546Sopenharmony_ci else if (demand.vgpr >= 25) 1070bf215546Sopenharmony_ci ctx.num_waves = 6 * wave_fac; 1071bf215546Sopenharmony_ci else 1072bf215546Sopenharmony_ci ctx.num_waves = 7 * wave_fac; 1073bf215546Sopenharmony_ci ctx.num_waves = std::max<uint16_t>(ctx.num_waves, program->min_waves); 1074bf215546Sopenharmony_ci ctx.num_waves = std::min<uint16_t>(ctx.num_waves, program->num_waves); 1075bf215546Sopenharmony_ci ctx.num_waves = max_suitable_waves(program, ctx.num_waves); 1076bf215546Sopenharmony_ci 1077bf215546Sopenharmony_ci /* VMEM_MAX_MOVES and such assume pre-GFX10 wave count */ 1078bf215546Sopenharmony_ci ctx.num_waves = std::max<uint16_t>(ctx.num_waves / wave_fac, 1); 1079bf215546Sopenharmony_ci 1080bf215546Sopenharmony_ci assert(ctx.num_waves > 0); 1081bf215546Sopenharmony_ci ctx.mv.max_registers = {int16_t(get_addr_vgpr_from_waves(program, ctx.num_waves * wave_fac) - 2), 1082bf215546Sopenharmony_ci int16_t(get_addr_sgpr_from_waves(program, ctx.num_waves * wave_fac))}; 1083bf215546Sopenharmony_ci 1084bf215546Sopenharmony_ci /* NGG culling shaders are very sensitive to position export scheduling. 1085bf215546Sopenharmony_ci * Schedule less aggressively when early primitive export is used, and 1086bf215546Sopenharmony_ci * keep the position export at the very bottom when late primitive export is used. 1087bf215546Sopenharmony_ci */ 1088bf215546Sopenharmony_ci if (program->info.has_ngg_culling && program->stage.num_sw_stages() == 1) { 1089bf215546Sopenharmony_ci if (!program->info.has_ngg_early_prim_export) 1090bf215546Sopenharmony_ci ctx.schedule_pos_exports = false; 1091bf215546Sopenharmony_ci else 1092bf215546Sopenharmony_ci ctx.schedule_pos_export_div = 4; 1093bf215546Sopenharmony_ci } 1094bf215546Sopenharmony_ci 1095bf215546Sopenharmony_ci for (Block& block : program->blocks) 1096bf215546Sopenharmony_ci schedule_block(ctx, program, &block, live_vars); 1097bf215546Sopenharmony_ci 1098bf215546Sopenharmony_ci /* update max_reg_demand and num_waves */ 1099bf215546Sopenharmony_ci RegisterDemand new_demand; 1100bf215546Sopenharmony_ci for (Block& block : program->blocks) { 1101bf215546Sopenharmony_ci new_demand.update(block.register_demand); 1102bf215546Sopenharmony_ci } 1103bf215546Sopenharmony_ci update_vgpr_sgpr_demand(program, new_demand); 1104bf215546Sopenharmony_ci 1105bf215546Sopenharmony_ci/* if enabled, this code asserts that register_demand is updated correctly */ 1106bf215546Sopenharmony_ci#if 0 1107bf215546Sopenharmony_ci int prev_num_waves = program->num_waves; 1108bf215546Sopenharmony_ci const RegisterDemand prev_max_demand = program->max_reg_demand; 1109bf215546Sopenharmony_ci 1110bf215546Sopenharmony_ci std::vector<RegisterDemand> demands(program->blocks.size()); 1111bf215546Sopenharmony_ci for (unsigned j = 0; j < program->blocks.size(); j++) { 1112bf215546Sopenharmony_ci demands[j] = program->blocks[j].register_demand; 1113bf215546Sopenharmony_ci } 1114bf215546Sopenharmony_ci 1115bf215546Sopenharmony_ci live live_vars2 = aco::live_var_analysis(program); 1116bf215546Sopenharmony_ci 1117bf215546Sopenharmony_ci for (unsigned j = 0; j < program->blocks.size(); j++) { 1118bf215546Sopenharmony_ci Block &b = program->blocks[j]; 1119bf215546Sopenharmony_ci for (unsigned i = 0; i < b.instructions.size(); i++) 1120bf215546Sopenharmony_ci assert(live_vars.register_demand[b.index][i] == live_vars2.register_demand[b.index][i]); 1121bf215546Sopenharmony_ci assert(b.register_demand == demands[j]); 1122bf215546Sopenharmony_ci } 1123bf215546Sopenharmony_ci 1124bf215546Sopenharmony_ci assert(program->max_reg_demand == prev_max_demand); 1125bf215546Sopenharmony_ci assert(program->num_waves == prev_num_waves); 1126bf215546Sopenharmony_ci#endif 1127bf215546Sopenharmony_ci} 1128bf215546Sopenharmony_ci 1129bf215546Sopenharmony_ci} // namespace aco 1130