1bf215546Sopenharmony_ci/* 2bf215546Sopenharmony_ci * Copyright © 2010 Intel Corporation 3bf215546Sopenharmony_ci * 4bf215546Sopenharmony_ci * Permission is hereby granted, free of charge, to any person obtaining a 5bf215546Sopenharmony_ci * copy of this software and associated documentation files (the "Software"), 6bf215546Sopenharmony_ci * to deal in the Software without restriction, including without limitation 7bf215546Sopenharmony_ci * the rights to use, copy, modify, merge, publish, distribute, sublicense, 8bf215546Sopenharmony_ci * and/or sell copies of the Software, and to permit persons to whom the 9bf215546Sopenharmony_ci * Software is furnished to do so, subject to the following conditions: 10bf215546Sopenharmony_ci * 11bf215546Sopenharmony_ci * The above copyright notice and this permission notice (including the next 12bf215546Sopenharmony_ci * paragraph) shall be included in all copies or substantial portions of the 13bf215546Sopenharmony_ci * Software. 14bf215546Sopenharmony_ci * 15bf215546Sopenharmony_ci * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16bf215546Sopenharmony_ci * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17bf215546Sopenharmony_ci * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 18bf215546Sopenharmony_ci * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19bf215546Sopenharmony_ci * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 20bf215546Sopenharmony_ci * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS 21bf215546Sopenharmony_ci * IN THE SOFTWARE. 22bf215546Sopenharmony_ci * 23bf215546Sopenharmony_ci * Authors: 24bf215546Sopenharmony_ci * Eric Anholt <eric@anholt.net> 25bf215546Sopenharmony_ci * 26bf215546Sopenharmony_ci */ 27bf215546Sopenharmony_ci 28bf215546Sopenharmony_ci#include "brw_eu.h" 29bf215546Sopenharmony_ci#include "brw_fs.h" 30bf215546Sopenharmony_ci#include "brw_fs_live_variables.h" 31bf215546Sopenharmony_ci#include "brw_vec4.h" 32bf215546Sopenharmony_ci#include "brw_cfg.h" 33bf215546Sopenharmony_ci#include "brw_shader.h" 34bf215546Sopenharmony_ci 35bf215546Sopenharmony_ciusing namespace brw; 36bf215546Sopenharmony_ci 37bf215546Sopenharmony_ci/** @file brw_fs_schedule_instructions.cpp 38bf215546Sopenharmony_ci * 39bf215546Sopenharmony_ci * List scheduling of FS instructions. 40bf215546Sopenharmony_ci * 41bf215546Sopenharmony_ci * The basic model of the list scheduler is to take a basic block, 42bf215546Sopenharmony_ci * compute a DAG of the dependencies (RAW ordering with latency, WAW 43bf215546Sopenharmony_ci * ordering with latency, WAR ordering), and make a list of the DAG heads. 44bf215546Sopenharmony_ci * Heuristically pick a DAG head, then put all the children that are 45bf215546Sopenharmony_ci * now DAG heads into the list of things to schedule. 46bf215546Sopenharmony_ci * 47bf215546Sopenharmony_ci * The heuristic is the important part. We're trying to be cheap, 48bf215546Sopenharmony_ci * since actually computing the optimal scheduling is NP complete. 49bf215546Sopenharmony_ci * What we do is track a "current clock". When we schedule a node, we 50bf215546Sopenharmony_ci * update the earliest-unblocked clock time of its children, and 51bf215546Sopenharmony_ci * increment the clock. Then, when trying to schedule, we just pick 52bf215546Sopenharmony_ci * the earliest-unblocked instruction to schedule. 53bf215546Sopenharmony_ci * 54bf215546Sopenharmony_ci * Note that often there will be many things which could execute 55bf215546Sopenharmony_ci * immediately, and there are a range of heuristic options to choose 56bf215546Sopenharmony_ci * from in picking among those. 57bf215546Sopenharmony_ci */ 58bf215546Sopenharmony_ci 59bf215546Sopenharmony_cistatic bool debug = false; 60bf215546Sopenharmony_ci 61bf215546Sopenharmony_ciclass instruction_scheduler; 62bf215546Sopenharmony_ci 63bf215546Sopenharmony_ciclass schedule_node : public exec_node 64bf215546Sopenharmony_ci{ 65bf215546Sopenharmony_cipublic: 66bf215546Sopenharmony_ci schedule_node(backend_instruction *inst, instruction_scheduler *sched); 67bf215546Sopenharmony_ci void set_latency_gfx4(); 68bf215546Sopenharmony_ci void set_latency_gfx7(bool is_haswell); 69bf215546Sopenharmony_ci 70bf215546Sopenharmony_ci const struct brw_isa_info *isa; 71bf215546Sopenharmony_ci backend_instruction *inst; 72bf215546Sopenharmony_ci schedule_node **children; 73bf215546Sopenharmony_ci int *child_latency; 74bf215546Sopenharmony_ci int child_count; 75bf215546Sopenharmony_ci int parent_count; 76bf215546Sopenharmony_ci int child_array_size; 77bf215546Sopenharmony_ci int unblocked_time; 78bf215546Sopenharmony_ci int latency; 79bf215546Sopenharmony_ci 80bf215546Sopenharmony_ci /** 81bf215546Sopenharmony_ci * Which iteration of pushing groups of children onto the candidates list 82bf215546Sopenharmony_ci * this node was a part of. 83bf215546Sopenharmony_ci */ 84bf215546Sopenharmony_ci unsigned cand_generation; 85bf215546Sopenharmony_ci 86bf215546Sopenharmony_ci /** 87bf215546Sopenharmony_ci * This is the sum of the instruction's latency plus the maximum delay of 88bf215546Sopenharmony_ci * its children, or just the issue_time if it's a leaf node. 89bf215546Sopenharmony_ci */ 90bf215546Sopenharmony_ci int delay; 91bf215546Sopenharmony_ci 92bf215546Sopenharmony_ci /** 93bf215546Sopenharmony_ci * Preferred exit node among the (direct or indirect) successors of this 94bf215546Sopenharmony_ci * node. Among the scheduler nodes blocked by this node, this will be the 95bf215546Sopenharmony_ci * one that may cause earliest program termination, or NULL if none of the 96bf215546Sopenharmony_ci * successors is an exit node. 97bf215546Sopenharmony_ci */ 98bf215546Sopenharmony_ci schedule_node *exit; 99bf215546Sopenharmony_ci}; 100bf215546Sopenharmony_ci 101bf215546Sopenharmony_ci/** 102bf215546Sopenharmony_ci * Lower bound of the scheduling time after which one of the instructions 103bf215546Sopenharmony_ci * blocked by this node may lead to program termination. 104bf215546Sopenharmony_ci * 105bf215546Sopenharmony_ci * exit_unblocked_time() determines a strict partial ordering relation '«' on 106bf215546Sopenharmony_ci * the set of scheduler nodes as follows: 107bf215546Sopenharmony_ci * 108bf215546Sopenharmony_ci * n « m <-> exit_unblocked_time(n) < exit_unblocked_time(m) 109bf215546Sopenharmony_ci * 110bf215546Sopenharmony_ci * which can be used to heuristically order nodes according to how early they 111bf215546Sopenharmony_ci * can unblock an exit node and lead to program termination. 112bf215546Sopenharmony_ci */ 113bf215546Sopenharmony_cistatic inline int 114bf215546Sopenharmony_ciexit_unblocked_time(const schedule_node *n) 115bf215546Sopenharmony_ci{ 116bf215546Sopenharmony_ci return n->exit ? n->exit->unblocked_time : INT_MAX; 117bf215546Sopenharmony_ci} 118bf215546Sopenharmony_ci 119bf215546Sopenharmony_civoid 120bf215546Sopenharmony_cischedule_node::set_latency_gfx4() 121bf215546Sopenharmony_ci{ 122bf215546Sopenharmony_ci int chans = 8; 123bf215546Sopenharmony_ci int math_latency = 22; 124bf215546Sopenharmony_ci 125bf215546Sopenharmony_ci switch (inst->opcode) { 126bf215546Sopenharmony_ci case SHADER_OPCODE_RCP: 127bf215546Sopenharmony_ci this->latency = 1 * chans * math_latency; 128bf215546Sopenharmony_ci break; 129bf215546Sopenharmony_ci case SHADER_OPCODE_RSQ: 130bf215546Sopenharmony_ci this->latency = 2 * chans * math_latency; 131bf215546Sopenharmony_ci break; 132bf215546Sopenharmony_ci case SHADER_OPCODE_INT_QUOTIENT: 133bf215546Sopenharmony_ci case SHADER_OPCODE_SQRT: 134bf215546Sopenharmony_ci case SHADER_OPCODE_LOG2: 135bf215546Sopenharmony_ci /* full precision log. partial is 2. */ 136bf215546Sopenharmony_ci this->latency = 3 * chans * math_latency; 137bf215546Sopenharmony_ci break; 138bf215546Sopenharmony_ci case SHADER_OPCODE_INT_REMAINDER: 139bf215546Sopenharmony_ci case SHADER_OPCODE_EXP2: 140bf215546Sopenharmony_ci /* full precision. partial is 3, same throughput. */ 141bf215546Sopenharmony_ci this->latency = 4 * chans * math_latency; 142bf215546Sopenharmony_ci break; 143bf215546Sopenharmony_ci case SHADER_OPCODE_POW: 144bf215546Sopenharmony_ci this->latency = 8 * chans * math_latency; 145bf215546Sopenharmony_ci break; 146bf215546Sopenharmony_ci case SHADER_OPCODE_SIN: 147bf215546Sopenharmony_ci case SHADER_OPCODE_COS: 148bf215546Sopenharmony_ci /* minimum latency, max is 12 rounds. */ 149bf215546Sopenharmony_ci this->latency = 5 * chans * math_latency; 150bf215546Sopenharmony_ci break; 151bf215546Sopenharmony_ci default: 152bf215546Sopenharmony_ci this->latency = 2; 153bf215546Sopenharmony_ci break; 154bf215546Sopenharmony_ci } 155bf215546Sopenharmony_ci} 156bf215546Sopenharmony_ci 157bf215546Sopenharmony_civoid 158bf215546Sopenharmony_cischedule_node::set_latency_gfx7(bool is_haswell) 159bf215546Sopenharmony_ci{ 160bf215546Sopenharmony_ci switch (inst->opcode) { 161bf215546Sopenharmony_ci case BRW_OPCODE_MAD: 162bf215546Sopenharmony_ci /* 2 cycles 163bf215546Sopenharmony_ci * (since the last two src operands are in different register banks): 164bf215546Sopenharmony_ci * mad(8) g4<1>F g2.2<4,4,1>F.x g2<4,4,1>F.x g3.1<4,4,1>F.x { align16 WE_normal 1Q }; 165bf215546Sopenharmony_ci * 166bf215546Sopenharmony_ci * 3 cycles on IVB, 4 on HSW 167bf215546Sopenharmony_ci * (since the last two src operands are in the same register bank): 168bf215546Sopenharmony_ci * mad(8) g4<1>F g2.2<4,4,1>F.x g2<4,4,1>F.x g2.1<4,4,1>F.x { align16 WE_normal 1Q }; 169bf215546Sopenharmony_ci * 170bf215546Sopenharmony_ci * 18 cycles on IVB, 16 on HSW 171bf215546Sopenharmony_ci * (since the last two src operands are in different register banks): 172bf215546Sopenharmony_ci * mad(8) g4<1>F g2.2<4,4,1>F.x g2<4,4,1>F.x g3.1<4,4,1>F.x { align16 WE_normal 1Q }; 173bf215546Sopenharmony_ci * mov(8) null g4<4,5,1>F { align16 WE_normal 1Q }; 174bf215546Sopenharmony_ci * 175bf215546Sopenharmony_ci * 20 cycles on IVB, 18 on HSW 176bf215546Sopenharmony_ci * (since the last two src operands are in the same register bank): 177bf215546Sopenharmony_ci * mad(8) g4<1>F g2.2<4,4,1>F.x g2<4,4,1>F.x g2.1<4,4,1>F.x { align16 WE_normal 1Q }; 178bf215546Sopenharmony_ci * mov(8) null g4<4,4,1>F { align16 WE_normal 1Q }; 179bf215546Sopenharmony_ci */ 180bf215546Sopenharmony_ci 181bf215546Sopenharmony_ci /* Our register allocator doesn't know about register banks, so use the 182bf215546Sopenharmony_ci * higher latency. 183bf215546Sopenharmony_ci */ 184bf215546Sopenharmony_ci latency = is_haswell ? 16 : 18; 185bf215546Sopenharmony_ci break; 186bf215546Sopenharmony_ci 187bf215546Sopenharmony_ci case BRW_OPCODE_LRP: 188bf215546Sopenharmony_ci /* 2 cycles 189bf215546Sopenharmony_ci * (since the last two src operands are in different register banks): 190bf215546Sopenharmony_ci * lrp(8) g4<1>F g2.2<4,4,1>F.x g2<4,4,1>F.x g3.1<4,4,1>F.x { align16 WE_normal 1Q }; 191bf215546Sopenharmony_ci * 192bf215546Sopenharmony_ci * 3 cycles on IVB, 4 on HSW 193bf215546Sopenharmony_ci * (since the last two src operands are in the same register bank): 194bf215546Sopenharmony_ci * lrp(8) g4<1>F g2.2<4,4,1>F.x g2<4,4,1>F.x g2.1<4,4,1>F.x { align16 WE_normal 1Q }; 195bf215546Sopenharmony_ci * 196bf215546Sopenharmony_ci * 16 cycles on IVB, 14 on HSW 197bf215546Sopenharmony_ci * (since the last two src operands are in different register banks): 198bf215546Sopenharmony_ci * lrp(8) g4<1>F g2.2<4,4,1>F.x g2<4,4,1>F.x g3.1<4,4,1>F.x { align16 WE_normal 1Q }; 199bf215546Sopenharmony_ci * mov(8) null g4<4,4,1>F { align16 WE_normal 1Q }; 200bf215546Sopenharmony_ci * 201bf215546Sopenharmony_ci * 16 cycles 202bf215546Sopenharmony_ci * (since the last two src operands are in the same register bank): 203bf215546Sopenharmony_ci * lrp(8) g4<1>F g2.2<4,4,1>F.x g2<4,4,1>F.x g2.1<4,4,1>F.x { align16 WE_normal 1Q }; 204bf215546Sopenharmony_ci * mov(8) null g4<4,4,1>F { align16 WE_normal 1Q }; 205bf215546Sopenharmony_ci */ 206bf215546Sopenharmony_ci 207bf215546Sopenharmony_ci /* Our register allocator doesn't know about register banks, so use the 208bf215546Sopenharmony_ci * higher latency. 209bf215546Sopenharmony_ci */ 210bf215546Sopenharmony_ci latency = 14; 211bf215546Sopenharmony_ci break; 212bf215546Sopenharmony_ci 213bf215546Sopenharmony_ci case SHADER_OPCODE_RCP: 214bf215546Sopenharmony_ci case SHADER_OPCODE_RSQ: 215bf215546Sopenharmony_ci case SHADER_OPCODE_SQRT: 216bf215546Sopenharmony_ci case SHADER_OPCODE_LOG2: 217bf215546Sopenharmony_ci case SHADER_OPCODE_EXP2: 218bf215546Sopenharmony_ci case SHADER_OPCODE_SIN: 219bf215546Sopenharmony_ci case SHADER_OPCODE_COS: 220bf215546Sopenharmony_ci /* 2 cycles: 221bf215546Sopenharmony_ci * math inv(8) g4<1>F g2<0,1,0>F null { align1 WE_normal 1Q }; 222bf215546Sopenharmony_ci * 223bf215546Sopenharmony_ci * 18 cycles: 224bf215546Sopenharmony_ci * math inv(8) g4<1>F g2<0,1,0>F null { align1 WE_normal 1Q }; 225bf215546Sopenharmony_ci * mov(8) null g4<8,8,1>F { align1 WE_normal 1Q }; 226bf215546Sopenharmony_ci * 227bf215546Sopenharmony_ci * Same for exp2, log2, rsq, sqrt, sin, cos. 228bf215546Sopenharmony_ci */ 229bf215546Sopenharmony_ci latency = is_haswell ? 14 : 16; 230bf215546Sopenharmony_ci break; 231bf215546Sopenharmony_ci 232bf215546Sopenharmony_ci case SHADER_OPCODE_POW: 233bf215546Sopenharmony_ci /* 2 cycles: 234bf215546Sopenharmony_ci * math pow(8) g4<1>F g2<0,1,0>F g2.1<0,1,0>F { align1 WE_normal 1Q }; 235bf215546Sopenharmony_ci * 236bf215546Sopenharmony_ci * 26 cycles: 237bf215546Sopenharmony_ci * math pow(8) g4<1>F g2<0,1,0>F g2.1<0,1,0>F { align1 WE_normal 1Q }; 238bf215546Sopenharmony_ci * mov(8) null g4<8,8,1>F { align1 WE_normal 1Q }; 239bf215546Sopenharmony_ci */ 240bf215546Sopenharmony_ci latency = is_haswell ? 22 : 24; 241bf215546Sopenharmony_ci break; 242bf215546Sopenharmony_ci 243bf215546Sopenharmony_ci case SHADER_OPCODE_TEX: 244bf215546Sopenharmony_ci case SHADER_OPCODE_TXD: 245bf215546Sopenharmony_ci case SHADER_OPCODE_TXF: 246bf215546Sopenharmony_ci case SHADER_OPCODE_TXF_LZ: 247bf215546Sopenharmony_ci case SHADER_OPCODE_TXL: 248bf215546Sopenharmony_ci case SHADER_OPCODE_TXL_LZ: 249bf215546Sopenharmony_ci /* 18 cycles: 250bf215546Sopenharmony_ci * mov(8) g115<1>F 0F { align1 WE_normal 1Q }; 251bf215546Sopenharmony_ci * mov(8) g114<1>F 0F { align1 WE_normal 1Q }; 252bf215546Sopenharmony_ci * send(8) g4<1>UW g114<8,8,1>F 253bf215546Sopenharmony_ci * sampler (10, 0, 0, 1) mlen 2 rlen 4 { align1 WE_normal 1Q }; 254bf215546Sopenharmony_ci * 255bf215546Sopenharmony_ci * 697 +/-49 cycles (min 610, n=26): 256bf215546Sopenharmony_ci * mov(8) g115<1>F 0F { align1 WE_normal 1Q }; 257bf215546Sopenharmony_ci * mov(8) g114<1>F 0F { align1 WE_normal 1Q }; 258bf215546Sopenharmony_ci * send(8) g4<1>UW g114<8,8,1>F 259bf215546Sopenharmony_ci * sampler (10, 0, 0, 1) mlen 2 rlen 4 { align1 WE_normal 1Q }; 260bf215546Sopenharmony_ci * mov(8) null g4<8,8,1>F { align1 WE_normal 1Q }; 261bf215546Sopenharmony_ci * 262bf215546Sopenharmony_ci * So the latency on our first texture load of the batchbuffer takes 263bf215546Sopenharmony_ci * ~700 cycles, since the caches are cold at that point. 264bf215546Sopenharmony_ci * 265bf215546Sopenharmony_ci * 840 +/- 92 cycles (min 720, n=25): 266bf215546Sopenharmony_ci * mov(8) g115<1>F 0F { align1 WE_normal 1Q }; 267bf215546Sopenharmony_ci * mov(8) g114<1>F 0F { align1 WE_normal 1Q }; 268bf215546Sopenharmony_ci * send(8) g4<1>UW g114<8,8,1>F 269bf215546Sopenharmony_ci * sampler (10, 0, 0, 1) mlen 2 rlen 4 { align1 WE_normal 1Q }; 270bf215546Sopenharmony_ci * mov(8) null g4<8,8,1>F { align1 WE_normal 1Q }; 271bf215546Sopenharmony_ci * send(8) g4<1>UW g114<8,8,1>F 272bf215546Sopenharmony_ci * sampler (10, 0, 0, 1) mlen 2 rlen 4 { align1 WE_normal 1Q }; 273bf215546Sopenharmony_ci * mov(8) null g4<8,8,1>F { align1 WE_normal 1Q }; 274bf215546Sopenharmony_ci * 275bf215546Sopenharmony_ci * On the second load, it takes just an extra ~140 cycles, and after 276bf215546Sopenharmony_ci * accounting for the 14 cycles of the MOV's latency, that makes ~130. 277bf215546Sopenharmony_ci * 278bf215546Sopenharmony_ci * 683 +/- 49 cycles (min = 602, n=47): 279bf215546Sopenharmony_ci * mov(8) g115<1>F 0F { align1 WE_normal 1Q }; 280bf215546Sopenharmony_ci * mov(8) g114<1>F 0F { align1 WE_normal 1Q }; 281bf215546Sopenharmony_ci * send(8) g4<1>UW g114<8,8,1>F 282bf215546Sopenharmony_ci * sampler (10, 0, 0, 1) mlen 2 rlen 4 { align1 WE_normal 1Q }; 283bf215546Sopenharmony_ci * send(8) g50<1>UW g114<8,8,1>F 284bf215546Sopenharmony_ci * sampler (10, 0, 0, 1) mlen 2 rlen 4 { align1 WE_normal 1Q }; 285bf215546Sopenharmony_ci * mov(8) null g4<8,8,1>F { align1 WE_normal 1Q }; 286bf215546Sopenharmony_ci * 287bf215546Sopenharmony_ci * The unit appears to be pipelined, since this matches up with the 288bf215546Sopenharmony_ci * cache-cold case, despite there being two loads here. If you replace 289bf215546Sopenharmony_ci * the g4 in the MOV to null with g50, it's still 693 +/- 52 (n=39). 290bf215546Sopenharmony_ci * 291bf215546Sopenharmony_ci * So, take some number between the cache-hot 140 cycles and the 292bf215546Sopenharmony_ci * cache-cold 700 cycles. No particular tuning was done on this. 293bf215546Sopenharmony_ci * 294bf215546Sopenharmony_ci * I haven't done significant testing of the non-TEX opcodes. TXL at 295bf215546Sopenharmony_ci * least looked about the same as TEX. 296bf215546Sopenharmony_ci */ 297bf215546Sopenharmony_ci latency = 200; 298bf215546Sopenharmony_ci break; 299bf215546Sopenharmony_ci 300bf215546Sopenharmony_ci case SHADER_OPCODE_TXS: 301bf215546Sopenharmony_ci /* Testing textureSize(sampler2D, 0), one load was 420 +/- 41 302bf215546Sopenharmony_ci * cycles (n=15): 303bf215546Sopenharmony_ci * mov(8) g114<1>UD 0D { align1 WE_normal 1Q }; 304bf215546Sopenharmony_ci * send(8) g6<1>UW g114<8,8,1>F 305bf215546Sopenharmony_ci * sampler (10, 0, 10, 1) mlen 1 rlen 4 { align1 WE_normal 1Q }; 306bf215546Sopenharmony_ci * mov(16) g6<1>F g6<8,8,1>D { align1 WE_normal 1Q }; 307bf215546Sopenharmony_ci * 308bf215546Sopenharmony_ci * 309bf215546Sopenharmony_ci * Two loads was 535 +/- 30 cycles (n=19): 310bf215546Sopenharmony_ci * mov(16) g114<1>UD 0D { align1 WE_normal 1H }; 311bf215546Sopenharmony_ci * send(16) g6<1>UW g114<8,8,1>F 312bf215546Sopenharmony_ci * sampler (10, 0, 10, 2) mlen 2 rlen 8 { align1 WE_normal 1H }; 313bf215546Sopenharmony_ci * mov(16) g114<1>UD 0D { align1 WE_normal 1H }; 314bf215546Sopenharmony_ci * mov(16) g6<1>F g6<8,8,1>D { align1 WE_normal 1H }; 315bf215546Sopenharmony_ci * send(16) g8<1>UW g114<8,8,1>F 316bf215546Sopenharmony_ci * sampler (10, 0, 10, 2) mlen 2 rlen 8 { align1 WE_normal 1H }; 317bf215546Sopenharmony_ci * mov(16) g8<1>F g8<8,8,1>D { align1 WE_normal 1H }; 318bf215546Sopenharmony_ci * add(16) g6<1>F g6<8,8,1>F g8<8,8,1>F { align1 WE_normal 1H }; 319bf215546Sopenharmony_ci * 320bf215546Sopenharmony_ci * Since the only caches that should matter are just the 321bf215546Sopenharmony_ci * instruction/state cache containing the surface state, assume that we 322bf215546Sopenharmony_ci * always have hot caches. 323bf215546Sopenharmony_ci */ 324bf215546Sopenharmony_ci latency = 100; 325bf215546Sopenharmony_ci break; 326bf215546Sopenharmony_ci 327bf215546Sopenharmony_ci case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GFX4: 328bf215546Sopenharmony_ci case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD: 329bf215546Sopenharmony_ci case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD_GFX7: 330bf215546Sopenharmony_ci case VS_OPCODE_PULL_CONSTANT_LOAD: 331bf215546Sopenharmony_ci /* testing using varying-index pull constants: 332bf215546Sopenharmony_ci * 333bf215546Sopenharmony_ci * 16 cycles: 334bf215546Sopenharmony_ci * mov(8) g4<1>D g2.1<0,1,0>F { align1 WE_normal 1Q }; 335bf215546Sopenharmony_ci * send(8) g4<1>F g4<8,8,1>D 336bf215546Sopenharmony_ci * data (9, 2, 3) mlen 1 rlen 1 { align1 WE_normal 1Q }; 337bf215546Sopenharmony_ci * 338bf215546Sopenharmony_ci * ~480 cycles: 339bf215546Sopenharmony_ci * mov(8) g4<1>D g2.1<0,1,0>F { align1 WE_normal 1Q }; 340bf215546Sopenharmony_ci * send(8) g4<1>F g4<8,8,1>D 341bf215546Sopenharmony_ci * data (9, 2, 3) mlen 1 rlen 1 { align1 WE_normal 1Q }; 342bf215546Sopenharmony_ci * mov(8) null g4<8,8,1>F { align1 WE_normal 1Q }; 343bf215546Sopenharmony_ci * 344bf215546Sopenharmony_ci * ~620 cycles: 345bf215546Sopenharmony_ci * mov(8) g4<1>D g2.1<0,1,0>F { align1 WE_normal 1Q }; 346bf215546Sopenharmony_ci * send(8) g4<1>F g4<8,8,1>D 347bf215546Sopenharmony_ci * data (9, 2, 3) mlen 1 rlen 1 { align1 WE_normal 1Q }; 348bf215546Sopenharmony_ci * mov(8) null g4<8,8,1>F { align1 WE_normal 1Q }; 349bf215546Sopenharmony_ci * send(8) g4<1>F g4<8,8,1>D 350bf215546Sopenharmony_ci * data (9, 2, 3) mlen 1 rlen 1 { align1 WE_normal 1Q }; 351bf215546Sopenharmony_ci * mov(8) null g4<8,8,1>F { align1 WE_normal 1Q }; 352bf215546Sopenharmony_ci * 353bf215546Sopenharmony_ci * So, if it's cache-hot, it's about 140. If it's cache cold, it's 354bf215546Sopenharmony_ci * about 460. We expect to mostly be cache hot, so pick something more 355bf215546Sopenharmony_ci * in that direction. 356bf215546Sopenharmony_ci */ 357bf215546Sopenharmony_ci latency = 200; 358bf215546Sopenharmony_ci break; 359bf215546Sopenharmony_ci 360bf215546Sopenharmony_ci case SHADER_OPCODE_GFX7_SCRATCH_READ: 361bf215546Sopenharmony_ci /* Testing a load from offset 0, that had been previously written: 362bf215546Sopenharmony_ci * 363bf215546Sopenharmony_ci * send(8) g114<1>UW g0<8,8,1>F data (0, 0, 0) mlen 1 rlen 1 { align1 WE_normal 1Q }; 364bf215546Sopenharmony_ci * mov(8) null g114<8,8,1>F { align1 WE_normal 1Q }; 365bf215546Sopenharmony_ci * 366bf215546Sopenharmony_ci * The cycles spent seemed to be grouped around 40-50 (as low as 38), 367bf215546Sopenharmony_ci * then around 140. Presumably this is cache hit vs miss. 368bf215546Sopenharmony_ci */ 369bf215546Sopenharmony_ci latency = 50; 370bf215546Sopenharmony_ci break; 371bf215546Sopenharmony_ci 372bf215546Sopenharmony_ci case VEC4_OPCODE_UNTYPED_ATOMIC: 373bf215546Sopenharmony_ci /* See GFX7_DATAPORT_DC_UNTYPED_ATOMIC_OP */ 374bf215546Sopenharmony_ci latency = 14000; 375bf215546Sopenharmony_ci break; 376bf215546Sopenharmony_ci 377bf215546Sopenharmony_ci case VEC4_OPCODE_UNTYPED_SURFACE_READ: 378bf215546Sopenharmony_ci case VEC4_OPCODE_UNTYPED_SURFACE_WRITE: 379bf215546Sopenharmony_ci /* See also GFX7_DATAPORT_DC_UNTYPED_SURFACE_READ */ 380bf215546Sopenharmony_ci latency = is_haswell ? 300 : 600; 381bf215546Sopenharmony_ci break; 382bf215546Sopenharmony_ci 383bf215546Sopenharmony_ci case SHADER_OPCODE_SEND: 384bf215546Sopenharmony_ci switch (inst->sfid) { 385bf215546Sopenharmony_ci case BRW_SFID_SAMPLER: { 386bf215546Sopenharmony_ci unsigned msg_type = (inst->desc >> 12) & 0x1f; 387bf215546Sopenharmony_ci switch (msg_type) { 388bf215546Sopenharmony_ci case GFX5_SAMPLER_MESSAGE_SAMPLE_RESINFO: 389bf215546Sopenharmony_ci case GFX6_SAMPLER_MESSAGE_SAMPLE_SAMPLEINFO: 390bf215546Sopenharmony_ci /* See also SHADER_OPCODE_TXS */ 391bf215546Sopenharmony_ci latency = 100; 392bf215546Sopenharmony_ci break; 393bf215546Sopenharmony_ci 394bf215546Sopenharmony_ci default: 395bf215546Sopenharmony_ci /* See also SHADER_OPCODE_TEX */ 396bf215546Sopenharmony_ci latency = 200; 397bf215546Sopenharmony_ci break; 398bf215546Sopenharmony_ci } 399bf215546Sopenharmony_ci break; 400bf215546Sopenharmony_ci } 401bf215546Sopenharmony_ci 402bf215546Sopenharmony_ci case GFX6_SFID_DATAPORT_RENDER_CACHE: 403bf215546Sopenharmony_ci switch (brw_fb_desc_msg_type(isa->devinfo, inst->desc)) { 404bf215546Sopenharmony_ci case GFX7_DATAPORT_RC_TYPED_SURFACE_WRITE: 405bf215546Sopenharmony_ci case GFX7_DATAPORT_RC_TYPED_SURFACE_READ: 406bf215546Sopenharmony_ci /* See also SHADER_OPCODE_TYPED_SURFACE_READ */ 407bf215546Sopenharmony_ci assert(!is_haswell); 408bf215546Sopenharmony_ci latency = 600; 409bf215546Sopenharmony_ci break; 410bf215546Sopenharmony_ci 411bf215546Sopenharmony_ci case GFX7_DATAPORT_RC_TYPED_ATOMIC_OP: 412bf215546Sopenharmony_ci /* See also SHADER_OPCODE_TYPED_ATOMIC */ 413bf215546Sopenharmony_ci assert(!is_haswell); 414bf215546Sopenharmony_ci latency = 14000; 415bf215546Sopenharmony_ci break; 416bf215546Sopenharmony_ci 417bf215546Sopenharmony_ci case GFX6_DATAPORT_WRITE_MESSAGE_RENDER_TARGET_WRITE: 418bf215546Sopenharmony_ci /* completely fabricated number */ 419bf215546Sopenharmony_ci latency = 600; 420bf215546Sopenharmony_ci break; 421bf215546Sopenharmony_ci 422bf215546Sopenharmony_ci default: 423bf215546Sopenharmony_ci unreachable("Unknown render cache message"); 424bf215546Sopenharmony_ci } 425bf215546Sopenharmony_ci break; 426bf215546Sopenharmony_ci 427bf215546Sopenharmony_ci case GFX7_SFID_DATAPORT_DATA_CACHE: 428bf215546Sopenharmony_ci switch ((inst->desc >> 14) & 0x1f) { 429bf215546Sopenharmony_ci case BRW_DATAPORT_READ_MESSAGE_OWORD_BLOCK_READ: 430bf215546Sopenharmony_ci case GFX7_DATAPORT_DC_UNALIGNED_OWORD_BLOCK_READ: 431bf215546Sopenharmony_ci case GFX6_DATAPORT_WRITE_MESSAGE_OWORD_BLOCK_WRITE: 432bf215546Sopenharmony_ci /* We have no data for this but assume it's a little faster than 433bf215546Sopenharmony_ci * untyped surface read/write. 434bf215546Sopenharmony_ci */ 435bf215546Sopenharmony_ci latency = 200; 436bf215546Sopenharmony_ci break; 437bf215546Sopenharmony_ci 438bf215546Sopenharmony_ci case GFX7_DATAPORT_DC_DWORD_SCATTERED_READ: 439bf215546Sopenharmony_ci case GFX6_DATAPORT_WRITE_MESSAGE_DWORD_SCATTERED_WRITE: 440bf215546Sopenharmony_ci case HSW_DATAPORT_DC_PORT0_BYTE_SCATTERED_READ: 441bf215546Sopenharmony_ci case HSW_DATAPORT_DC_PORT0_BYTE_SCATTERED_WRITE: 442bf215546Sopenharmony_ci /* We have no data for this but assume it's roughly the same as 443bf215546Sopenharmony_ci * untyped surface read/write. 444bf215546Sopenharmony_ci */ 445bf215546Sopenharmony_ci latency = 300; 446bf215546Sopenharmony_ci break; 447bf215546Sopenharmony_ci 448bf215546Sopenharmony_ci case GFX7_DATAPORT_DC_UNTYPED_SURFACE_READ: 449bf215546Sopenharmony_ci case GFX7_DATAPORT_DC_UNTYPED_SURFACE_WRITE: 450bf215546Sopenharmony_ci /* Test code: 451bf215546Sopenharmony_ci * mov(8) g112<1>UD 0x00000000UD { align1 WE_all 1Q }; 452bf215546Sopenharmony_ci * mov(1) g112.7<1>UD g1.7<0,1,0>UD { align1 WE_all }; 453bf215546Sopenharmony_ci * mov(8) g113<1>UD 0x00000000UD { align1 WE_normal 1Q }; 454bf215546Sopenharmony_ci * send(8) g4<1>UD g112<8,8,1>UD 455bf215546Sopenharmony_ci * data (38, 6, 5) mlen 2 rlen 1 { align1 WE_normal 1Q }; 456bf215546Sopenharmony_ci * . 457bf215546Sopenharmony_ci * . [repeats 8 times] 458bf215546Sopenharmony_ci * . 459bf215546Sopenharmony_ci * mov(8) g112<1>UD 0x00000000UD { align1 WE_all 1Q }; 460bf215546Sopenharmony_ci * mov(1) g112.7<1>UD g1.7<0,1,0>UD { align1 WE_all }; 461bf215546Sopenharmony_ci * mov(8) g113<1>UD 0x00000000UD { align1 WE_normal 1Q }; 462bf215546Sopenharmony_ci * send(8) g4<1>UD g112<8,8,1>UD 463bf215546Sopenharmony_ci * data (38, 6, 5) mlen 2 rlen 1 { align1 WE_normal 1Q }; 464bf215546Sopenharmony_ci * 465bf215546Sopenharmony_ci * Running it 100 times as fragment shader on a 128x128 quad 466bf215546Sopenharmony_ci * gives an average latency of 583 cycles per surface read, 467bf215546Sopenharmony_ci * standard deviation 0.9%. 468bf215546Sopenharmony_ci */ 469bf215546Sopenharmony_ci assert(!is_haswell); 470bf215546Sopenharmony_ci latency = 600; 471bf215546Sopenharmony_ci break; 472bf215546Sopenharmony_ci 473bf215546Sopenharmony_ci case GFX7_DATAPORT_DC_UNTYPED_ATOMIC_OP: 474bf215546Sopenharmony_ci /* Test code: 475bf215546Sopenharmony_ci * mov(8) g112<1>ud 0x00000000ud { align1 WE_all 1Q }; 476bf215546Sopenharmony_ci * mov(1) g112.7<1>ud g1.7<0,1,0>ud { align1 WE_all }; 477bf215546Sopenharmony_ci * mov(8) g113<1>ud 0x00000000ud { align1 WE_normal 1Q }; 478bf215546Sopenharmony_ci * send(8) g4<1>ud g112<8,8,1>ud 479bf215546Sopenharmony_ci * data (38, 5, 6) mlen 2 rlen 1 { align1 WE_normal 1Q }; 480bf215546Sopenharmony_ci * 481bf215546Sopenharmony_ci * Running it 100 times as fragment shader on a 128x128 quad 482bf215546Sopenharmony_ci * gives an average latency of 13867 cycles per atomic op, 483bf215546Sopenharmony_ci * standard deviation 3%. Note that this is a rather 484bf215546Sopenharmony_ci * pessimistic estimate, the actual latency in cases with few 485bf215546Sopenharmony_ci * collisions between threads and favorable pipelining has been 486bf215546Sopenharmony_ci * seen to be reduced by a factor of 100. 487bf215546Sopenharmony_ci */ 488bf215546Sopenharmony_ci assert(!is_haswell); 489bf215546Sopenharmony_ci latency = 14000; 490bf215546Sopenharmony_ci break; 491bf215546Sopenharmony_ci 492bf215546Sopenharmony_ci default: 493bf215546Sopenharmony_ci unreachable("Unknown data cache message"); 494bf215546Sopenharmony_ci } 495bf215546Sopenharmony_ci break; 496bf215546Sopenharmony_ci 497bf215546Sopenharmony_ci case HSW_SFID_DATAPORT_DATA_CACHE_1: 498bf215546Sopenharmony_ci switch ((inst->desc >> 14) & 0x1f) { 499bf215546Sopenharmony_ci case HSW_DATAPORT_DC_PORT1_UNTYPED_SURFACE_READ: 500bf215546Sopenharmony_ci case HSW_DATAPORT_DC_PORT1_UNTYPED_SURFACE_WRITE: 501bf215546Sopenharmony_ci case HSW_DATAPORT_DC_PORT1_TYPED_SURFACE_READ: 502bf215546Sopenharmony_ci case HSW_DATAPORT_DC_PORT1_TYPED_SURFACE_WRITE: 503bf215546Sopenharmony_ci case GFX8_DATAPORT_DC_PORT1_A64_UNTYPED_SURFACE_WRITE: 504bf215546Sopenharmony_ci case GFX8_DATAPORT_DC_PORT1_A64_UNTYPED_SURFACE_READ: 505bf215546Sopenharmony_ci case GFX8_DATAPORT_DC_PORT1_A64_SCATTERED_WRITE: 506bf215546Sopenharmony_ci case GFX9_DATAPORT_DC_PORT1_A64_SCATTERED_READ: 507bf215546Sopenharmony_ci case GFX9_DATAPORT_DC_PORT1_A64_OWORD_BLOCK_READ: 508bf215546Sopenharmony_ci case GFX9_DATAPORT_DC_PORT1_A64_OWORD_BLOCK_WRITE: 509bf215546Sopenharmony_ci /* See also GFX7_DATAPORT_DC_UNTYPED_SURFACE_READ */ 510bf215546Sopenharmony_ci latency = 300; 511bf215546Sopenharmony_ci break; 512bf215546Sopenharmony_ci 513bf215546Sopenharmony_ci case HSW_DATAPORT_DC_PORT1_UNTYPED_ATOMIC_OP: 514bf215546Sopenharmony_ci case HSW_DATAPORT_DC_PORT1_UNTYPED_ATOMIC_OP_SIMD4X2: 515bf215546Sopenharmony_ci case HSW_DATAPORT_DC_PORT1_TYPED_ATOMIC_OP_SIMD4X2: 516bf215546Sopenharmony_ci case HSW_DATAPORT_DC_PORT1_TYPED_ATOMIC_OP: 517bf215546Sopenharmony_ci case GFX9_DATAPORT_DC_PORT1_UNTYPED_ATOMIC_FLOAT_OP: 518bf215546Sopenharmony_ci case GFX8_DATAPORT_DC_PORT1_A64_UNTYPED_ATOMIC_OP: 519bf215546Sopenharmony_ci case GFX9_DATAPORT_DC_PORT1_A64_UNTYPED_ATOMIC_FLOAT_OP: 520bf215546Sopenharmony_ci case GFX12_DATAPORT_DC_PORT1_A64_UNTYPED_ATOMIC_HALF_INT_OP: 521bf215546Sopenharmony_ci case GFX12_DATAPORT_DC_PORT1_A64_UNTYPED_ATOMIC_HALF_FLOAT_OP: 522bf215546Sopenharmony_ci /* See also GFX7_DATAPORT_DC_UNTYPED_ATOMIC_OP */ 523bf215546Sopenharmony_ci latency = 14000; 524bf215546Sopenharmony_ci break; 525bf215546Sopenharmony_ci 526bf215546Sopenharmony_ci default: 527bf215546Sopenharmony_ci unreachable("Unknown data cache message"); 528bf215546Sopenharmony_ci } 529bf215546Sopenharmony_ci break; 530bf215546Sopenharmony_ci 531bf215546Sopenharmony_ci case GFX12_SFID_UGM: 532bf215546Sopenharmony_ci case GFX12_SFID_TGM: 533bf215546Sopenharmony_ci case GFX12_SFID_SLM: 534bf215546Sopenharmony_ci switch (lsc_msg_desc_opcode(isa->devinfo, inst->desc)) { 535bf215546Sopenharmony_ci case LSC_OP_LOAD: 536bf215546Sopenharmony_ci case LSC_OP_STORE: 537bf215546Sopenharmony_ci case LSC_OP_LOAD_CMASK: 538bf215546Sopenharmony_ci case LSC_OP_STORE_CMASK: 539bf215546Sopenharmony_ci latency = 300; 540bf215546Sopenharmony_ci break; 541bf215546Sopenharmony_ci case LSC_OP_FENCE: 542bf215546Sopenharmony_ci case LSC_OP_ATOMIC_INC: 543bf215546Sopenharmony_ci case LSC_OP_ATOMIC_DEC: 544bf215546Sopenharmony_ci case LSC_OP_ATOMIC_LOAD: 545bf215546Sopenharmony_ci case LSC_OP_ATOMIC_STORE: 546bf215546Sopenharmony_ci case LSC_OP_ATOMIC_ADD: 547bf215546Sopenharmony_ci case LSC_OP_ATOMIC_SUB: 548bf215546Sopenharmony_ci case LSC_OP_ATOMIC_MIN: 549bf215546Sopenharmony_ci case LSC_OP_ATOMIC_MAX: 550bf215546Sopenharmony_ci case LSC_OP_ATOMIC_UMIN: 551bf215546Sopenharmony_ci case LSC_OP_ATOMIC_UMAX: 552bf215546Sopenharmony_ci case LSC_OP_ATOMIC_CMPXCHG: 553bf215546Sopenharmony_ci case LSC_OP_ATOMIC_FADD: 554bf215546Sopenharmony_ci case LSC_OP_ATOMIC_FSUB: 555bf215546Sopenharmony_ci case LSC_OP_ATOMIC_FMIN: 556bf215546Sopenharmony_ci case LSC_OP_ATOMIC_FMAX: 557bf215546Sopenharmony_ci case LSC_OP_ATOMIC_FCMPXCHG: 558bf215546Sopenharmony_ci case LSC_OP_ATOMIC_AND: 559bf215546Sopenharmony_ci case LSC_OP_ATOMIC_OR: 560bf215546Sopenharmony_ci case LSC_OP_ATOMIC_XOR: 561bf215546Sopenharmony_ci latency = 1400; 562bf215546Sopenharmony_ci break; 563bf215546Sopenharmony_ci default: 564bf215546Sopenharmony_ci unreachable("unsupported new data port message instruction"); 565bf215546Sopenharmony_ci } 566bf215546Sopenharmony_ci break; 567bf215546Sopenharmony_ci 568bf215546Sopenharmony_ci case GEN_RT_SFID_BINDLESS_THREAD_DISPATCH: 569bf215546Sopenharmony_ci case GEN_RT_SFID_RAY_TRACE_ACCELERATOR: 570bf215546Sopenharmony_ci /* TODO. 571bf215546Sopenharmony_ci * 572bf215546Sopenharmony_ci * We'll assume for the moment that this is pretty quick as it 573bf215546Sopenharmony_ci * doesn't actually return any data. 574bf215546Sopenharmony_ci */ 575bf215546Sopenharmony_ci latency = 200; 576bf215546Sopenharmony_ci break; 577bf215546Sopenharmony_ci 578bf215546Sopenharmony_ci case BRW_SFID_URB: 579bf215546Sopenharmony_ci latency = 200; 580bf215546Sopenharmony_ci break; 581bf215546Sopenharmony_ci 582bf215546Sopenharmony_ci default: 583bf215546Sopenharmony_ci unreachable("Unknown SFID"); 584bf215546Sopenharmony_ci } 585bf215546Sopenharmony_ci break; 586bf215546Sopenharmony_ci 587bf215546Sopenharmony_ci default: 588bf215546Sopenharmony_ci /* 2 cycles: 589bf215546Sopenharmony_ci * mul(8) g4<1>F g2<0,1,0>F 0.5F { align1 WE_normal 1Q }; 590bf215546Sopenharmony_ci * 591bf215546Sopenharmony_ci * 16 cycles: 592bf215546Sopenharmony_ci * mul(8) g4<1>F g2<0,1,0>F 0.5F { align1 WE_normal 1Q }; 593bf215546Sopenharmony_ci * mov(8) null g4<8,8,1>F { align1 WE_normal 1Q }; 594bf215546Sopenharmony_ci */ 595bf215546Sopenharmony_ci latency = 14; 596bf215546Sopenharmony_ci break; 597bf215546Sopenharmony_ci } 598bf215546Sopenharmony_ci} 599bf215546Sopenharmony_ci 600bf215546Sopenharmony_ciclass instruction_scheduler { 601bf215546Sopenharmony_cipublic: 602bf215546Sopenharmony_ci instruction_scheduler(const backend_shader *s, int grf_count, 603bf215546Sopenharmony_ci unsigned hw_reg_count, int block_count, 604bf215546Sopenharmony_ci instruction_scheduler_mode mode): 605bf215546Sopenharmony_ci bs(s) 606bf215546Sopenharmony_ci { 607bf215546Sopenharmony_ci this->mem_ctx = ralloc_context(NULL); 608bf215546Sopenharmony_ci this->grf_count = grf_count; 609bf215546Sopenharmony_ci this->hw_reg_count = hw_reg_count; 610bf215546Sopenharmony_ci this->instructions.make_empty(); 611bf215546Sopenharmony_ci this->post_reg_alloc = (mode == SCHEDULE_POST); 612bf215546Sopenharmony_ci this->mode = mode; 613bf215546Sopenharmony_ci this->reg_pressure = 0; 614bf215546Sopenharmony_ci this->block_idx = 0; 615bf215546Sopenharmony_ci if (!post_reg_alloc) { 616bf215546Sopenharmony_ci this->reg_pressure_in = rzalloc_array(mem_ctx, int, block_count); 617bf215546Sopenharmony_ci 618bf215546Sopenharmony_ci this->livein = ralloc_array(mem_ctx, BITSET_WORD *, block_count); 619bf215546Sopenharmony_ci for (int i = 0; i < block_count; i++) 620bf215546Sopenharmony_ci this->livein[i] = rzalloc_array(mem_ctx, BITSET_WORD, 621bf215546Sopenharmony_ci BITSET_WORDS(grf_count)); 622bf215546Sopenharmony_ci 623bf215546Sopenharmony_ci this->liveout = ralloc_array(mem_ctx, BITSET_WORD *, block_count); 624bf215546Sopenharmony_ci for (int i = 0; i < block_count; i++) 625bf215546Sopenharmony_ci this->liveout[i] = rzalloc_array(mem_ctx, BITSET_WORD, 626bf215546Sopenharmony_ci BITSET_WORDS(grf_count)); 627bf215546Sopenharmony_ci 628bf215546Sopenharmony_ci this->hw_liveout = ralloc_array(mem_ctx, BITSET_WORD *, block_count); 629bf215546Sopenharmony_ci for (int i = 0; i < block_count; i++) 630bf215546Sopenharmony_ci this->hw_liveout[i] = rzalloc_array(mem_ctx, BITSET_WORD, 631bf215546Sopenharmony_ci BITSET_WORDS(hw_reg_count)); 632bf215546Sopenharmony_ci 633bf215546Sopenharmony_ci this->written = rzalloc_array(mem_ctx, bool, grf_count); 634bf215546Sopenharmony_ci 635bf215546Sopenharmony_ci this->reads_remaining = rzalloc_array(mem_ctx, int, grf_count); 636bf215546Sopenharmony_ci 637bf215546Sopenharmony_ci this->hw_reads_remaining = rzalloc_array(mem_ctx, int, hw_reg_count); 638bf215546Sopenharmony_ci } else { 639bf215546Sopenharmony_ci this->reg_pressure_in = NULL; 640bf215546Sopenharmony_ci this->livein = NULL; 641bf215546Sopenharmony_ci this->liveout = NULL; 642bf215546Sopenharmony_ci this->hw_liveout = NULL; 643bf215546Sopenharmony_ci this->written = NULL; 644bf215546Sopenharmony_ci this->reads_remaining = NULL; 645bf215546Sopenharmony_ci this->hw_reads_remaining = NULL; 646bf215546Sopenharmony_ci } 647bf215546Sopenharmony_ci } 648bf215546Sopenharmony_ci 649bf215546Sopenharmony_ci ~instruction_scheduler() 650bf215546Sopenharmony_ci { 651bf215546Sopenharmony_ci ralloc_free(this->mem_ctx); 652bf215546Sopenharmony_ci } 653bf215546Sopenharmony_ci void add_barrier_deps(schedule_node *n); 654bf215546Sopenharmony_ci void add_dep(schedule_node *before, schedule_node *after, int latency); 655bf215546Sopenharmony_ci void add_dep(schedule_node *before, schedule_node *after); 656bf215546Sopenharmony_ci 657bf215546Sopenharmony_ci void run(cfg_t *cfg); 658bf215546Sopenharmony_ci void add_insts_from_block(bblock_t *block); 659bf215546Sopenharmony_ci void compute_delays(); 660bf215546Sopenharmony_ci void compute_exits(); 661bf215546Sopenharmony_ci virtual void calculate_deps() = 0; 662bf215546Sopenharmony_ci virtual schedule_node *choose_instruction_to_schedule() = 0; 663bf215546Sopenharmony_ci 664bf215546Sopenharmony_ci /** 665bf215546Sopenharmony_ci * Returns how many cycles it takes the instruction to issue. 666bf215546Sopenharmony_ci * 667bf215546Sopenharmony_ci * Instructions in gen hardware are handled one simd4 vector at a time, 668bf215546Sopenharmony_ci * with 1 cycle per vector dispatched. Thus SIMD8 pixel shaders take 2 669bf215546Sopenharmony_ci * cycles to dispatch and SIMD16 (compressed) instructions take 4. 670bf215546Sopenharmony_ci */ 671bf215546Sopenharmony_ci virtual int issue_time(backend_instruction *inst) = 0; 672bf215546Sopenharmony_ci 673bf215546Sopenharmony_ci virtual void count_reads_remaining(backend_instruction *inst) = 0; 674bf215546Sopenharmony_ci virtual void setup_liveness(cfg_t *cfg) = 0; 675bf215546Sopenharmony_ci virtual void update_register_pressure(backend_instruction *inst) = 0; 676bf215546Sopenharmony_ci virtual int get_register_pressure_benefit(backend_instruction *inst) = 0; 677bf215546Sopenharmony_ci 678bf215546Sopenharmony_ci void schedule_instructions(bblock_t *block); 679bf215546Sopenharmony_ci 680bf215546Sopenharmony_ci void *mem_ctx; 681bf215546Sopenharmony_ci 682bf215546Sopenharmony_ci bool post_reg_alloc; 683bf215546Sopenharmony_ci int grf_count; 684bf215546Sopenharmony_ci unsigned hw_reg_count; 685bf215546Sopenharmony_ci int reg_pressure; 686bf215546Sopenharmony_ci int block_idx; 687bf215546Sopenharmony_ci exec_list instructions; 688bf215546Sopenharmony_ci const backend_shader *bs; 689bf215546Sopenharmony_ci 690bf215546Sopenharmony_ci instruction_scheduler_mode mode; 691bf215546Sopenharmony_ci 692bf215546Sopenharmony_ci /* 693bf215546Sopenharmony_ci * The register pressure at the beginning of each basic block. 694bf215546Sopenharmony_ci */ 695bf215546Sopenharmony_ci 696bf215546Sopenharmony_ci int *reg_pressure_in; 697bf215546Sopenharmony_ci 698bf215546Sopenharmony_ci /* 699bf215546Sopenharmony_ci * The virtual GRF's whose range overlaps the beginning of each basic block. 700bf215546Sopenharmony_ci */ 701bf215546Sopenharmony_ci 702bf215546Sopenharmony_ci BITSET_WORD **livein; 703bf215546Sopenharmony_ci 704bf215546Sopenharmony_ci /* 705bf215546Sopenharmony_ci * The virtual GRF's whose range overlaps the end of each basic block. 706bf215546Sopenharmony_ci */ 707bf215546Sopenharmony_ci 708bf215546Sopenharmony_ci BITSET_WORD **liveout; 709bf215546Sopenharmony_ci 710bf215546Sopenharmony_ci /* 711bf215546Sopenharmony_ci * The hardware GRF's whose range overlaps the end of each basic block. 712bf215546Sopenharmony_ci */ 713bf215546Sopenharmony_ci 714bf215546Sopenharmony_ci BITSET_WORD **hw_liveout; 715bf215546Sopenharmony_ci 716bf215546Sopenharmony_ci /* 717bf215546Sopenharmony_ci * Whether we've scheduled a write for this virtual GRF yet. 718bf215546Sopenharmony_ci */ 719bf215546Sopenharmony_ci 720bf215546Sopenharmony_ci bool *written; 721bf215546Sopenharmony_ci 722bf215546Sopenharmony_ci /* 723bf215546Sopenharmony_ci * How many reads we haven't scheduled for this virtual GRF yet. 724bf215546Sopenharmony_ci */ 725bf215546Sopenharmony_ci 726bf215546Sopenharmony_ci int *reads_remaining; 727bf215546Sopenharmony_ci 728bf215546Sopenharmony_ci /* 729bf215546Sopenharmony_ci * How many reads we haven't scheduled for this hardware GRF yet. 730bf215546Sopenharmony_ci */ 731bf215546Sopenharmony_ci 732bf215546Sopenharmony_ci int *hw_reads_remaining; 733bf215546Sopenharmony_ci}; 734bf215546Sopenharmony_ci 735bf215546Sopenharmony_ciclass fs_instruction_scheduler : public instruction_scheduler 736bf215546Sopenharmony_ci{ 737bf215546Sopenharmony_cipublic: 738bf215546Sopenharmony_ci fs_instruction_scheduler(const fs_visitor *v, int grf_count, int hw_reg_count, 739bf215546Sopenharmony_ci int block_count, 740bf215546Sopenharmony_ci instruction_scheduler_mode mode); 741bf215546Sopenharmony_ci void calculate_deps(); 742bf215546Sopenharmony_ci bool is_compressed(const fs_inst *inst); 743bf215546Sopenharmony_ci schedule_node *choose_instruction_to_schedule(); 744bf215546Sopenharmony_ci int issue_time(backend_instruction *inst); 745bf215546Sopenharmony_ci const fs_visitor *v; 746bf215546Sopenharmony_ci 747bf215546Sopenharmony_ci void count_reads_remaining(backend_instruction *inst); 748bf215546Sopenharmony_ci void setup_liveness(cfg_t *cfg); 749bf215546Sopenharmony_ci void update_register_pressure(backend_instruction *inst); 750bf215546Sopenharmony_ci int get_register_pressure_benefit(backend_instruction *inst); 751bf215546Sopenharmony_ci}; 752bf215546Sopenharmony_ci 753bf215546Sopenharmony_cifs_instruction_scheduler::fs_instruction_scheduler(const fs_visitor *v, 754bf215546Sopenharmony_ci int grf_count, int hw_reg_count, 755bf215546Sopenharmony_ci int block_count, 756bf215546Sopenharmony_ci instruction_scheduler_mode mode) 757bf215546Sopenharmony_ci : instruction_scheduler(v, grf_count, hw_reg_count, block_count, mode), 758bf215546Sopenharmony_ci v(v) 759bf215546Sopenharmony_ci{ 760bf215546Sopenharmony_ci} 761bf215546Sopenharmony_ci 762bf215546Sopenharmony_cistatic bool 763bf215546Sopenharmony_ciis_src_duplicate(fs_inst *inst, int src) 764bf215546Sopenharmony_ci{ 765bf215546Sopenharmony_ci for (int i = 0; i < src; i++) 766bf215546Sopenharmony_ci if (inst->src[i].equals(inst->src[src])) 767bf215546Sopenharmony_ci return true; 768bf215546Sopenharmony_ci 769bf215546Sopenharmony_ci return false; 770bf215546Sopenharmony_ci} 771bf215546Sopenharmony_ci 772bf215546Sopenharmony_civoid 773bf215546Sopenharmony_cifs_instruction_scheduler::count_reads_remaining(backend_instruction *be) 774bf215546Sopenharmony_ci{ 775bf215546Sopenharmony_ci fs_inst *inst = (fs_inst *)be; 776bf215546Sopenharmony_ci 777bf215546Sopenharmony_ci if (!reads_remaining) 778bf215546Sopenharmony_ci return; 779bf215546Sopenharmony_ci 780bf215546Sopenharmony_ci for (int i = 0; i < inst->sources; i++) { 781bf215546Sopenharmony_ci if (is_src_duplicate(inst, i)) 782bf215546Sopenharmony_ci continue; 783bf215546Sopenharmony_ci 784bf215546Sopenharmony_ci if (inst->src[i].file == VGRF) { 785bf215546Sopenharmony_ci reads_remaining[inst->src[i].nr]++; 786bf215546Sopenharmony_ci } else if (inst->src[i].file == FIXED_GRF) { 787bf215546Sopenharmony_ci if (inst->src[i].nr >= hw_reg_count) 788bf215546Sopenharmony_ci continue; 789bf215546Sopenharmony_ci 790bf215546Sopenharmony_ci for (unsigned j = 0; j < regs_read(inst, i); j++) 791bf215546Sopenharmony_ci hw_reads_remaining[inst->src[i].nr + j]++; 792bf215546Sopenharmony_ci } 793bf215546Sopenharmony_ci } 794bf215546Sopenharmony_ci} 795bf215546Sopenharmony_ci 796bf215546Sopenharmony_civoid 797bf215546Sopenharmony_cifs_instruction_scheduler::setup_liveness(cfg_t *cfg) 798bf215546Sopenharmony_ci{ 799bf215546Sopenharmony_ci const fs_live_variables &live = v->live_analysis.require(); 800bf215546Sopenharmony_ci 801bf215546Sopenharmony_ci /* First, compute liveness on a per-GRF level using the in/out sets from 802bf215546Sopenharmony_ci * liveness calculation. 803bf215546Sopenharmony_ci */ 804bf215546Sopenharmony_ci for (int block = 0; block < cfg->num_blocks; block++) { 805bf215546Sopenharmony_ci for (int i = 0; i < live.num_vars; i++) { 806bf215546Sopenharmony_ci if (BITSET_TEST(live.block_data[block].livein, i)) { 807bf215546Sopenharmony_ci int vgrf = live.vgrf_from_var[i]; 808bf215546Sopenharmony_ci if (!BITSET_TEST(livein[block], vgrf)) { 809bf215546Sopenharmony_ci reg_pressure_in[block] += v->alloc.sizes[vgrf]; 810bf215546Sopenharmony_ci BITSET_SET(livein[block], vgrf); 811bf215546Sopenharmony_ci } 812bf215546Sopenharmony_ci } 813bf215546Sopenharmony_ci 814bf215546Sopenharmony_ci if (BITSET_TEST(live.block_data[block].liveout, i)) 815bf215546Sopenharmony_ci BITSET_SET(liveout[block], live.vgrf_from_var[i]); 816bf215546Sopenharmony_ci } 817bf215546Sopenharmony_ci } 818bf215546Sopenharmony_ci 819bf215546Sopenharmony_ci /* Now, extend the live in/live out sets for when a range crosses a block 820bf215546Sopenharmony_ci * boundary, which matches what our register allocator/interference code 821bf215546Sopenharmony_ci * does to account for force_writemask_all and incompatible exec_mask's. 822bf215546Sopenharmony_ci */ 823bf215546Sopenharmony_ci for (int block = 0; block < cfg->num_blocks - 1; block++) { 824bf215546Sopenharmony_ci for (int i = 0; i < grf_count; i++) { 825bf215546Sopenharmony_ci if (live.vgrf_start[i] <= cfg->blocks[block]->end_ip && 826bf215546Sopenharmony_ci live.vgrf_end[i] >= cfg->blocks[block + 1]->start_ip) { 827bf215546Sopenharmony_ci if (!BITSET_TEST(livein[block + 1], i)) { 828bf215546Sopenharmony_ci reg_pressure_in[block + 1] += v->alloc.sizes[i]; 829bf215546Sopenharmony_ci BITSET_SET(livein[block + 1], i); 830bf215546Sopenharmony_ci } 831bf215546Sopenharmony_ci 832bf215546Sopenharmony_ci BITSET_SET(liveout[block], i); 833bf215546Sopenharmony_ci } 834bf215546Sopenharmony_ci } 835bf215546Sopenharmony_ci } 836bf215546Sopenharmony_ci 837bf215546Sopenharmony_ci int payload_last_use_ip[hw_reg_count]; 838bf215546Sopenharmony_ci v->calculate_payload_ranges(hw_reg_count, payload_last_use_ip); 839bf215546Sopenharmony_ci 840bf215546Sopenharmony_ci for (unsigned i = 0; i < hw_reg_count; i++) { 841bf215546Sopenharmony_ci if (payload_last_use_ip[i] == -1) 842bf215546Sopenharmony_ci continue; 843bf215546Sopenharmony_ci 844bf215546Sopenharmony_ci for (int block = 0; block < cfg->num_blocks; block++) { 845bf215546Sopenharmony_ci if (cfg->blocks[block]->start_ip <= payload_last_use_ip[i]) 846bf215546Sopenharmony_ci reg_pressure_in[block]++; 847bf215546Sopenharmony_ci 848bf215546Sopenharmony_ci if (cfg->blocks[block]->end_ip <= payload_last_use_ip[i]) 849bf215546Sopenharmony_ci BITSET_SET(hw_liveout[block], i); 850bf215546Sopenharmony_ci } 851bf215546Sopenharmony_ci } 852bf215546Sopenharmony_ci} 853bf215546Sopenharmony_ci 854bf215546Sopenharmony_civoid 855bf215546Sopenharmony_cifs_instruction_scheduler::update_register_pressure(backend_instruction *be) 856bf215546Sopenharmony_ci{ 857bf215546Sopenharmony_ci fs_inst *inst = (fs_inst *)be; 858bf215546Sopenharmony_ci 859bf215546Sopenharmony_ci if (!reads_remaining) 860bf215546Sopenharmony_ci return; 861bf215546Sopenharmony_ci 862bf215546Sopenharmony_ci if (inst->dst.file == VGRF) { 863bf215546Sopenharmony_ci written[inst->dst.nr] = true; 864bf215546Sopenharmony_ci } 865bf215546Sopenharmony_ci 866bf215546Sopenharmony_ci for (int i = 0; i < inst->sources; i++) { 867bf215546Sopenharmony_ci if (is_src_duplicate(inst, i)) 868bf215546Sopenharmony_ci continue; 869bf215546Sopenharmony_ci 870bf215546Sopenharmony_ci if (inst->src[i].file == VGRF) { 871bf215546Sopenharmony_ci reads_remaining[inst->src[i].nr]--; 872bf215546Sopenharmony_ci } else if (inst->src[i].file == FIXED_GRF && 873bf215546Sopenharmony_ci inst->src[i].nr < hw_reg_count) { 874bf215546Sopenharmony_ci for (unsigned off = 0; off < regs_read(inst, i); off++) 875bf215546Sopenharmony_ci hw_reads_remaining[inst->src[i].nr + off]--; 876bf215546Sopenharmony_ci } 877bf215546Sopenharmony_ci } 878bf215546Sopenharmony_ci} 879bf215546Sopenharmony_ci 880bf215546Sopenharmony_ciint 881bf215546Sopenharmony_cifs_instruction_scheduler::get_register_pressure_benefit(backend_instruction *be) 882bf215546Sopenharmony_ci{ 883bf215546Sopenharmony_ci fs_inst *inst = (fs_inst *)be; 884bf215546Sopenharmony_ci int benefit = 0; 885bf215546Sopenharmony_ci 886bf215546Sopenharmony_ci if (inst->dst.file == VGRF) { 887bf215546Sopenharmony_ci if (!BITSET_TEST(livein[block_idx], inst->dst.nr) && 888bf215546Sopenharmony_ci !written[inst->dst.nr]) 889bf215546Sopenharmony_ci benefit -= v->alloc.sizes[inst->dst.nr]; 890bf215546Sopenharmony_ci } 891bf215546Sopenharmony_ci 892bf215546Sopenharmony_ci for (int i = 0; i < inst->sources; i++) { 893bf215546Sopenharmony_ci if (is_src_duplicate(inst, i)) 894bf215546Sopenharmony_ci continue; 895bf215546Sopenharmony_ci 896bf215546Sopenharmony_ci if (inst->src[i].file == VGRF && 897bf215546Sopenharmony_ci !BITSET_TEST(liveout[block_idx], inst->src[i].nr) && 898bf215546Sopenharmony_ci reads_remaining[inst->src[i].nr] == 1) 899bf215546Sopenharmony_ci benefit += v->alloc.sizes[inst->src[i].nr]; 900bf215546Sopenharmony_ci 901bf215546Sopenharmony_ci if (inst->src[i].file == FIXED_GRF && 902bf215546Sopenharmony_ci inst->src[i].nr < hw_reg_count) { 903bf215546Sopenharmony_ci for (unsigned off = 0; off < regs_read(inst, i); off++) { 904bf215546Sopenharmony_ci int reg = inst->src[i].nr + off; 905bf215546Sopenharmony_ci if (!BITSET_TEST(hw_liveout[block_idx], reg) && 906bf215546Sopenharmony_ci hw_reads_remaining[reg] == 1) { 907bf215546Sopenharmony_ci benefit++; 908bf215546Sopenharmony_ci } 909bf215546Sopenharmony_ci } 910bf215546Sopenharmony_ci } 911bf215546Sopenharmony_ci } 912bf215546Sopenharmony_ci 913bf215546Sopenharmony_ci return benefit; 914bf215546Sopenharmony_ci} 915bf215546Sopenharmony_ci 916bf215546Sopenharmony_ciclass vec4_instruction_scheduler : public instruction_scheduler 917bf215546Sopenharmony_ci{ 918bf215546Sopenharmony_cipublic: 919bf215546Sopenharmony_ci vec4_instruction_scheduler(const vec4_visitor *v, int grf_count); 920bf215546Sopenharmony_ci void calculate_deps(); 921bf215546Sopenharmony_ci schedule_node *choose_instruction_to_schedule(); 922bf215546Sopenharmony_ci int issue_time(backend_instruction *inst); 923bf215546Sopenharmony_ci const vec4_visitor *v; 924bf215546Sopenharmony_ci 925bf215546Sopenharmony_ci void count_reads_remaining(backend_instruction *inst); 926bf215546Sopenharmony_ci void setup_liveness(cfg_t *cfg); 927bf215546Sopenharmony_ci void update_register_pressure(backend_instruction *inst); 928bf215546Sopenharmony_ci int get_register_pressure_benefit(backend_instruction *inst); 929bf215546Sopenharmony_ci}; 930bf215546Sopenharmony_ci 931bf215546Sopenharmony_civec4_instruction_scheduler::vec4_instruction_scheduler(const vec4_visitor *v, 932bf215546Sopenharmony_ci int grf_count) 933bf215546Sopenharmony_ci : instruction_scheduler(v, grf_count, 0, 0, SCHEDULE_POST), 934bf215546Sopenharmony_ci v(v) 935bf215546Sopenharmony_ci{ 936bf215546Sopenharmony_ci} 937bf215546Sopenharmony_ci 938bf215546Sopenharmony_civoid 939bf215546Sopenharmony_civec4_instruction_scheduler::count_reads_remaining(backend_instruction *) 940bf215546Sopenharmony_ci{ 941bf215546Sopenharmony_ci} 942bf215546Sopenharmony_ci 943bf215546Sopenharmony_civoid 944bf215546Sopenharmony_civec4_instruction_scheduler::setup_liveness(cfg_t *) 945bf215546Sopenharmony_ci{ 946bf215546Sopenharmony_ci} 947bf215546Sopenharmony_ci 948bf215546Sopenharmony_civoid 949bf215546Sopenharmony_civec4_instruction_scheduler::update_register_pressure(backend_instruction *) 950bf215546Sopenharmony_ci{ 951bf215546Sopenharmony_ci} 952bf215546Sopenharmony_ci 953bf215546Sopenharmony_ciint 954bf215546Sopenharmony_civec4_instruction_scheduler::get_register_pressure_benefit(backend_instruction *) 955bf215546Sopenharmony_ci{ 956bf215546Sopenharmony_ci return 0; 957bf215546Sopenharmony_ci} 958bf215546Sopenharmony_ci 959bf215546Sopenharmony_cischedule_node::schedule_node(backend_instruction *inst, 960bf215546Sopenharmony_ci instruction_scheduler *sched) 961bf215546Sopenharmony_ci{ 962bf215546Sopenharmony_ci const struct intel_device_info *devinfo = sched->bs->devinfo; 963bf215546Sopenharmony_ci 964bf215546Sopenharmony_ci this->isa = &sched->bs->compiler->isa; 965bf215546Sopenharmony_ci this->inst = inst; 966bf215546Sopenharmony_ci this->child_array_size = 0; 967bf215546Sopenharmony_ci this->children = NULL; 968bf215546Sopenharmony_ci this->child_latency = NULL; 969bf215546Sopenharmony_ci this->child_count = 0; 970bf215546Sopenharmony_ci this->parent_count = 0; 971bf215546Sopenharmony_ci this->unblocked_time = 0; 972bf215546Sopenharmony_ci this->cand_generation = 0; 973bf215546Sopenharmony_ci this->delay = 0; 974bf215546Sopenharmony_ci this->exit = NULL; 975bf215546Sopenharmony_ci 976bf215546Sopenharmony_ci /* We can't measure Gfx6 timings directly but expect them to be much 977bf215546Sopenharmony_ci * closer to Gfx7 than Gfx4. 978bf215546Sopenharmony_ci */ 979bf215546Sopenharmony_ci if (!sched->post_reg_alloc) 980bf215546Sopenharmony_ci this->latency = 1; 981bf215546Sopenharmony_ci else if (devinfo->ver >= 6) 982bf215546Sopenharmony_ci set_latency_gfx7(devinfo->verx10 == 75); 983bf215546Sopenharmony_ci else 984bf215546Sopenharmony_ci set_latency_gfx4(); 985bf215546Sopenharmony_ci} 986bf215546Sopenharmony_ci 987bf215546Sopenharmony_civoid 988bf215546Sopenharmony_ciinstruction_scheduler::add_insts_from_block(bblock_t *block) 989bf215546Sopenharmony_ci{ 990bf215546Sopenharmony_ci foreach_inst_in_block(backend_instruction, inst, block) { 991bf215546Sopenharmony_ci schedule_node *n = new(mem_ctx) schedule_node(inst, this); 992bf215546Sopenharmony_ci 993bf215546Sopenharmony_ci instructions.push_tail(n); 994bf215546Sopenharmony_ci } 995bf215546Sopenharmony_ci} 996bf215546Sopenharmony_ci 997bf215546Sopenharmony_ci/** Computation of the delay member of each node. */ 998bf215546Sopenharmony_civoid 999bf215546Sopenharmony_ciinstruction_scheduler::compute_delays() 1000bf215546Sopenharmony_ci{ 1001bf215546Sopenharmony_ci foreach_in_list_reverse(schedule_node, n, &instructions) { 1002bf215546Sopenharmony_ci if (!n->child_count) { 1003bf215546Sopenharmony_ci n->delay = issue_time(n->inst); 1004bf215546Sopenharmony_ci } else { 1005bf215546Sopenharmony_ci for (int i = 0; i < n->child_count; i++) { 1006bf215546Sopenharmony_ci assert(n->children[i]->delay); 1007bf215546Sopenharmony_ci n->delay = MAX2(n->delay, n->latency + n->children[i]->delay); 1008bf215546Sopenharmony_ci } 1009bf215546Sopenharmony_ci } 1010bf215546Sopenharmony_ci } 1011bf215546Sopenharmony_ci} 1012bf215546Sopenharmony_ci 1013bf215546Sopenharmony_civoid 1014bf215546Sopenharmony_ciinstruction_scheduler::compute_exits() 1015bf215546Sopenharmony_ci{ 1016bf215546Sopenharmony_ci /* Calculate a lower bound of the scheduling time of each node in the 1017bf215546Sopenharmony_ci * graph. This is analogous to the node's critical path but calculated 1018bf215546Sopenharmony_ci * from the top instead of from the bottom of the block. 1019bf215546Sopenharmony_ci */ 1020bf215546Sopenharmony_ci foreach_in_list(schedule_node, n, &instructions) { 1021bf215546Sopenharmony_ci for (int i = 0; i < n->child_count; i++) { 1022bf215546Sopenharmony_ci n->children[i]->unblocked_time = 1023bf215546Sopenharmony_ci MAX2(n->children[i]->unblocked_time, 1024bf215546Sopenharmony_ci n->unblocked_time + issue_time(n->inst) + n->child_latency[i]); 1025bf215546Sopenharmony_ci } 1026bf215546Sopenharmony_ci } 1027bf215546Sopenharmony_ci 1028bf215546Sopenharmony_ci /* Calculate the exit of each node by induction based on the exit nodes of 1029bf215546Sopenharmony_ci * its children. The preferred exit of a node is the one among the exit 1030bf215546Sopenharmony_ci * nodes of its children which can be unblocked first according to the 1031bf215546Sopenharmony_ci * optimistic unblocked time estimate calculated above. 1032bf215546Sopenharmony_ci */ 1033bf215546Sopenharmony_ci foreach_in_list_reverse(schedule_node, n, &instructions) { 1034bf215546Sopenharmony_ci n->exit = (n->inst->opcode == BRW_OPCODE_HALT ? n : NULL); 1035bf215546Sopenharmony_ci 1036bf215546Sopenharmony_ci for (int i = 0; i < n->child_count; i++) { 1037bf215546Sopenharmony_ci if (exit_unblocked_time(n->children[i]) < exit_unblocked_time(n)) 1038bf215546Sopenharmony_ci n->exit = n->children[i]->exit; 1039bf215546Sopenharmony_ci } 1040bf215546Sopenharmony_ci } 1041bf215546Sopenharmony_ci} 1042bf215546Sopenharmony_ci 1043bf215546Sopenharmony_ci/** 1044bf215546Sopenharmony_ci * Add a dependency between two instruction nodes. 1045bf215546Sopenharmony_ci * 1046bf215546Sopenharmony_ci * The @after node will be scheduled after @before. We will try to 1047bf215546Sopenharmony_ci * schedule it @latency cycles after @before, but no guarantees there. 1048bf215546Sopenharmony_ci */ 1049bf215546Sopenharmony_civoid 1050bf215546Sopenharmony_ciinstruction_scheduler::add_dep(schedule_node *before, schedule_node *after, 1051bf215546Sopenharmony_ci int latency) 1052bf215546Sopenharmony_ci{ 1053bf215546Sopenharmony_ci if (!before || !after) 1054bf215546Sopenharmony_ci return; 1055bf215546Sopenharmony_ci 1056bf215546Sopenharmony_ci assert(before != after); 1057bf215546Sopenharmony_ci 1058bf215546Sopenharmony_ci for (int i = 0; i < before->child_count; i++) { 1059bf215546Sopenharmony_ci if (before->children[i] == after) { 1060bf215546Sopenharmony_ci before->child_latency[i] = MAX2(before->child_latency[i], latency); 1061bf215546Sopenharmony_ci return; 1062bf215546Sopenharmony_ci } 1063bf215546Sopenharmony_ci } 1064bf215546Sopenharmony_ci 1065bf215546Sopenharmony_ci if (before->child_array_size <= before->child_count) { 1066bf215546Sopenharmony_ci if (before->child_array_size < 16) 1067bf215546Sopenharmony_ci before->child_array_size = 16; 1068bf215546Sopenharmony_ci else 1069bf215546Sopenharmony_ci before->child_array_size *= 2; 1070bf215546Sopenharmony_ci 1071bf215546Sopenharmony_ci before->children = reralloc(mem_ctx, before->children, 1072bf215546Sopenharmony_ci schedule_node *, 1073bf215546Sopenharmony_ci before->child_array_size); 1074bf215546Sopenharmony_ci before->child_latency = reralloc(mem_ctx, before->child_latency, 1075bf215546Sopenharmony_ci int, before->child_array_size); 1076bf215546Sopenharmony_ci } 1077bf215546Sopenharmony_ci 1078bf215546Sopenharmony_ci before->children[before->child_count] = after; 1079bf215546Sopenharmony_ci before->child_latency[before->child_count] = latency; 1080bf215546Sopenharmony_ci before->child_count++; 1081bf215546Sopenharmony_ci after->parent_count++; 1082bf215546Sopenharmony_ci} 1083bf215546Sopenharmony_ci 1084bf215546Sopenharmony_civoid 1085bf215546Sopenharmony_ciinstruction_scheduler::add_dep(schedule_node *before, schedule_node *after) 1086bf215546Sopenharmony_ci{ 1087bf215546Sopenharmony_ci if (!before) 1088bf215546Sopenharmony_ci return; 1089bf215546Sopenharmony_ci 1090bf215546Sopenharmony_ci add_dep(before, after, before->latency); 1091bf215546Sopenharmony_ci} 1092bf215546Sopenharmony_ci 1093bf215546Sopenharmony_cistatic bool 1094bf215546Sopenharmony_ciis_scheduling_barrier(const backend_instruction *inst) 1095bf215546Sopenharmony_ci{ 1096bf215546Sopenharmony_ci return inst->opcode == SHADER_OPCODE_HALT_TARGET || 1097bf215546Sopenharmony_ci inst->is_control_flow() || 1098bf215546Sopenharmony_ci inst->has_side_effects(); 1099bf215546Sopenharmony_ci} 1100bf215546Sopenharmony_ci 1101bf215546Sopenharmony_ci/** 1102bf215546Sopenharmony_ci * Sometimes we really want this node to execute after everything that 1103bf215546Sopenharmony_ci * was before it and before everything that followed it. This adds 1104bf215546Sopenharmony_ci * the deps to do so. 1105bf215546Sopenharmony_ci */ 1106bf215546Sopenharmony_civoid 1107bf215546Sopenharmony_ciinstruction_scheduler::add_barrier_deps(schedule_node *n) 1108bf215546Sopenharmony_ci{ 1109bf215546Sopenharmony_ci schedule_node *prev = (schedule_node *)n->prev; 1110bf215546Sopenharmony_ci schedule_node *next = (schedule_node *)n->next; 1111bf215546Sopenharmony_ci 1112bf215546Sopenharmony_ci if (prev) { 1113bf215546Sopenharmony_ci while (!prev->is_head_sentinel()) { 1114bf215546Sopenharmony_ci add_dep(prev, n, 0); 1115bf215546Sopenharmony_ci if (is_scheduling_barrier(prev->inst)) 1116bf215546Sopenharmony_ci break; 1117bf215546Sopenharmony_ci prev = (schedule_node *)prev->prev; 1118bf215546Sopenharmony_ci } 1119bf215546Sopenharmony_ci } 1120bf215546Sopenharmony_ci 1121bf215546Sopenharmony_ci if (next) { 1122bf215546Sopenharmony_ci while (!next->is_tail_sentinel()) { 1123bf215546Sopenharmony_ci add_dep(n, next, 0); 1124bf215546Sopenharmony_ci if (is_scheduling_barrier(next->inst)) 1125bf215546Sopenharmony_ci break; 1126bf215546Sopenharmony_ci next = (schedule_node *)next->next; 1127bf215546Sopenharmony_ci } 1128bf215546Sopenharmony_ci } 1129bf215546Sopenharmony_ci} 1130bf215546Sopenharmony_ci 1131bf215546Sopenharmony_ci/* instruction scheduling needs to be aware of when an MRF write 1132bf215546Sopenharmony_ci * actually writes 2 MRFs. 1133bf215546Sopenharmony_ci */ 1134bf215546Sopenharmony_cibool 1135bf215546Sopenharmony_cifs_instruction_scheduler::is_compressed(const fs_inst *inst) 1136bf215546Sopenharmony_ci{ 1137bf215546Sopenharmony_ci return inst->exec_size == 16; 1138bf215546Sopenharmony_ci} 1139bf215546Sopenharmony_ci 1140bf215546Sopenharmony_civoid 1141bf215546Sopenharmony_cifs_instruction_scheduler::calculate_deps() 1142bf215546Sopenharmony_ci{ 1143bf215546Sopenharmony_ci /* Pre-register-allocation, this tracks the last write per VGRF offset. 1144bf215546Sopenharmony_ci * After register allocation, reg_offsets are gone and we track individual 1145bf215546Sopenharmony_ci * GRF registers. 1146bf215546Sopenharmony_ci */ 1147bf215546Sopenharmony_ci schedule_node **last_grf_write; 1148bf215546Sopenharmony_ci schedule_node *last_mrf_write[BRW_MAX_MRF(v->devinfo->ver)]; 1149bf215546Sopenharmony_ci schedule_node *last_conditional_mod[8] = {}; 1150bf215546Sopenharmony_ci schedule_node *last_accumulator_write = NULL; 1151bf215546Sopenharmony_ci /* Fixed HW registers are assumed to be separate from the virtual 1152bf215546Sopenharmony_ci * GRFs, so they can be tracked separately. We don't really write 1153bf215546Sopenharmony_ci * to fixed GRFs much, so don't bother tracking them on a more 1154bf215546Sopenharmony_ci * granular level. 1155bf215546Sopenharmony_ci */ 1156bf215546Sopenharmony_ci schedule_node *last_fixed_grf_write = NULL; 1157bf215546Sopenharmony_ci 1158bf215546Sopenharmony_ci last_grf_write = (schedule_node **)calloc(sizeof(schedule_node *), grf_count * 16); 1159bf215546Sopenharmony_ci memset(last_mrf_write, 0, sizeof(last_mrf_write)); 1160bf215546Sopenharmony_ci 1161bf215546Sopenharmony_ci /* top-to-bottom dependencies: RAW and WAW. */ 1162bf215546Sopenharmony_ci foreach_in_list(schedule_node, n, &instructions) { 1163bf215546Sopenharmony_ci fs_inst *inst = (fs_inst *)n->inst; 1164bf215546Sopenharmony_ci 1165bf215546Sopenharmony_ci if (is_scheduling_barrier(inst)) 1166bf215546Sopenharmony_ci add_barrier_deps(n); 1167bf215546Sopenharmony_ci 1168bf215546Sopenharmony_ci /* read-after-write deps. */ 1169bf215546Sopenharmony_ci for (int i = 0; i < inst->sources; i++) { 1170bf215546Sopenharmony_ci if (inst->src[i].file == VGRF) { 1171bf215546Sopenharmony_ci if (post_reg_alloc) { 1172bf215546Sopenharmony_ci for (unsigned r = 0; r < regs_read(inst, i); r++) 1173bf215546Sopenharmony_ci add_dep(last_grf_write[inst->src[i].nr + r], n); 1174bf215546Sopenharmony_ci } else { 1175bf215546Sopenharmony_ci for (unsigned r = 0; r < regs_read(inst, i); r++) { 1176bf215546Sopenharmony_ci add_dep(last_grf_write[inst->src[i].nr * 16 + 1177bf215546Sopenharmony_ci inst->src[i].offset / REG_SIZE + r], n); 1178bf215546Sopenharmony_ci } 1179bf215546Sopenharmony_ci } 1180bf215546Sopenharmony_ci } else if (inst->src[i].file == FIXED_GRF) { 1181bf215546Sopenharmony_ci if (post_reg_alloc) { 1182bf215546Sopenharmony_ci for (unsigned r = 0; r < regs_read(inst, i); r++) 1183bf215546Sopenharmony_ci add_dep(last_grf_write[inst->src[i].nr + r], n); 1184bf215546Sopenharmony_ci } else { 1185bf215546Sopenharmony_ci add_dep(last_fixed_grf_write, n); 1186bf215546Sopenharmony_ci } 1187bf215546Sopenharmony_ci } else if (inst->src[i].is_accumulator()) { 1188bf215546Sopenharmony_ci add_dep(last_accumulator_write, n); 1189bf215546Sopenharmony_ci } else if (inst->src[i].file == ARF && !inst->src[i].is_null()) { 1190bf215546Sopenharmony_ci add_barrier_deps(n); 1191bf215546Sopenharmony_ci } 1192bf215546Sopenharmony_ci } 1193bf215546Sopenharmony_ci 1194bf215546Sopenharmony_ci if (inst->base_mrf != -1) { 1195bf215546Sopenharmony_ci for (int i = 0; i < inst->mlen; i++) { 1196bf215546Sopenharmony_ci /* It looks like the MRF regs are released in the send 1197bf215546Sopenharmony_ci * instruction once it's sent, not when the result comes 1198bf215546Sopenharmony_ci * back. 1199bf215546Sopenharmony_ci */ 1200bf215546Sopenharmony_ci add_dep(last_mrf_write[inst->base_mrf + i], n); 1201bf215546Sopenharmony_ci } 1202bf215546Sopenharmony_ci } 1203bf215546Sopenharmony_ci 1204bf215546Sopenharmony_ci if (const unsigned mask = inst->flags_read(v->devinfo)) { 1205bf215546Sopenharmony_ci assert(mask < (1 << ARRAY_SIZE(last_conditional_mod))); 1206bf215546Sopenharmony_ci 1207bf215546Sopenharmony_ci for (unsigned i = 0; i < ARRAY_SIZE(last_conditional_mod); i++) { 1208bf215546Sopenharmony_ci if (mask & (1 << i)) 1209bf215546Sopenharmony_ci add_dep(last_conditional_mod[i], n); 1210bf215546Sopenharmony_ci } 1211bf215546Sopenharmony_ci } 1212bf215546Sopenharmony_ci 1213bf215546Sopenharmony_ci if (inst->reads_accumulator_implicitly()) { 1214bf215546Sopenharmony_ci add_dep(last_accumulator_write, n); 1215bf215546Sopenharmony_ci } 1216bf215546Sopenharmony_ci 1217bf215546Sopenharmony_ci /* write-after-write deps. */ 1218bf215546Sopenharmony_ci if (inst->dst.file == VGRF) { 1219bf215546Sopenharmony_ci if (post_reg_alloc) { 1220bf215546Sopenharmony_ci for (unsigned r = 0; r < regs_written(inst); r++) { 1221bf215546Sopenharmony_ci add_dep(last_grf_write[inst->dst.nr + r], n); 1222bf215546Sopenharmony_ci last_grf_write[inst->dst.nr + r] = n; 1223bf215546Sopenharmony_ci } 1224bf215546Sopenharmony_ci } else { 1225bf215546Sopenharmony_ci for (unsigned r = 0; r < regs_written(inst); r++) { 1226bf215546Sopenharmony_ci add_dep(last_grf_write[inst->dst.nr * 16 + 1227bf215546Sopenharmony_ci inst->dst.offset / REG_SIZE + r], n); 1228bf215546Sopenharmony_ci last_grf_write[inst->dst.nr * 16 + 1229bf215546Sopenharmony_ci inst->dst.offset / REG_SIZE + r] = n; 1230bf215546Sopenharmony_ci } 1231bf215546Sopenharmony_ci } 1232bf215546Sopenharmony_ci } else if (inst->dst.file == MRF) { 1233bf215546Sopenharmony_ci int reg = inst->dst.nr & ~BRW_MRF_COMPR4; 1234bf215546Sopenharmony_ci 1235bf215546Sopenharmony_ci add_dep(last_mrf_write[reg], n); 1236bf215546Sopenharmony_ci last_mrf_write[reg] = n; 1237bf215546Sopenharmony_ci if (is_compressed(inst)) { 1238bf215546Sopenharmony_ci if (inst->dst.nr & BRW_MRF_COMPR4) 1239bf215546Sopenharmony_ci reg += 4; 1240bf215546Sopenharmony_ci else 1241bf215546Sopenharmony_ci reg++; 1242bf215546Sopenharmony_ci add_dep(last_mrf_write[reg], n); 1243bf215546Sopenharmony_ci last_mrf_write[reg] = n; 1244bf215546Sopenharmony_ci } 1245bf215546Sopenharmony_ci } else if (inst->dst.file == FIXED_GRF) { 1246bf215546Sopenharmony_ci if (post_reg_alloc) { 1247bf215546Sopenharmony_ci for (unsigned r = 0; r < regs_written(inst); r++) { 1248bf215546Sopenharmony_ci add_dep(last_grf_write[inst->dst.nr + r], n); 1249bf215546Sopenharmony_ci last_grf_write[inst->dst.nr + r] = n; 1250bf215546Sopenharmony_ci } 1251bf215546Sopenharmony_ci } else { 1252bf215546Sopenharmony_ci add_dep(last_fixed_grf_write, n); 1253bf215546Sopenharmony_ci last_fixed_grf_write = n; 1254bf215546Sopenharmony_ci } 1255bf215546Sopenharmony_ci } else if (inst->dst.is_accumulator()) { 1256bf215546Sopenharmony_ci add_dep(last_accumulator_write, n); 1257bf215546Sopenharmony_ci last_accumulator_write = n; 1258bf215546Sopenharmony_ci } else if (inst->dst.file == ARF && !inst->dst.is_null()) { 1259bf215546Sopenharmony_ci add_barrier_deps(n); 1260bf215546Sopenharmony_ci } 1261bf215546Sopenharmony_ci 1262bf215546Sopenharmony_ci if (inst->mlen > 0 && inst->base_mrf != -1) { 1263bf215546Sopenharmony_ci for (unsigned i = 0; i < inst->implied_mrf_writes(); i++) { 1264bf215546Sopenharmony_ci add_dep(last_mrf_write[inst->base_mrf + i], n); 1265bf215546Sopenharmony_ci last_mrf_write[inst->base_mrf + i] = n; 1266bf215546Sopenharmony_ci } 1267bf215546Sopenharmony_ci } 1268bf215546Sopenharmony_ci 1269bf215546Sopenharmony_ci if (const unsigned mask = inst->flags_written(v->devinfo)) { 1270bf215546Sopenharmony_ci assert(mask < (1 << ARRAY_SIZE(last_conditional_mod))); 1271bf215546Sopenharmony_ci 1272bf215546Sopenharmony_ci for (unsigned i = 0; i < ARRAY_SIZE(last_conditional_mod); i++) { 1273bf215546Sopenharmony_ci if (mask & (1 << i)) { 1274bf215546Sopenharmony_ci add_dep(last_conditional_mod[i], n, 0); 1275bf215546Sopenharmony_ci last_conditional_mod[i] = n; 1276bf215546Sopenharmony_ci } 1277bf215546Sopenharmony_ci } 1278bf215546Sopenharmony_ci } 1279bf215546Sopenharmony_ci 1280bf215546Sopenharmony_ci if (inst->writes_accumulator_implicitly(v->devinfo) && 1281bf215546Sopenharmony_ci !inst->dst.is_accumulator()) { 1282bf215546Sopenharmony_ci add_dep(last_accumulator_write, n); 1283bf215546Sopenharmony_ci last_accumulator_write = n; 1284bf215546Sopenharmony_ci } 1285bf215546Sopenharmony_ci } 1286bf215546Sopenharmony_ci 1287bf215546Sopenharmony_ci /* bottom-to-top dependencies: WAR */ 1288bf215546Sopenharmony_ci memset(last_grf_write, 0, sizeof(schedule_node *) * grf_count * 16); 1289bf215546Sopenharmony_ci memset(last_mrf_write, 0, sizeof(last_mrf_write)); 1290bf215546Sopenharmony_ci memset(last_conditional_mod, 0, sizeof(last_conditional_mod)); 1291bf215546Sopenharmony_ci last_accumulator_write = NULL; 1292bf215546Sopenharmony_ci last_fixed_grf_write = NULL; 1293bf215546Sopenharmony_ci 1294bf215546Sopenharmony_ci foreach_in_list_reverse_safe(schedule_node, n, &instructions) { 1295bf215546Sopenharmony_ci fs_inst *inst = (fs_inst *)n->inst; 1296bf215546Sopenharmony_ci 1297bf215546Sopenharmony_ci /* write-after-read deps. */ 1298bf215546Sopenharmony_ci for (int i = 0; i < inst->sources; i++) { 1299bf215546Sopenharmony_ci if (inst->src[i].file == VGRF) { 1300bf215546Sopenharmony_ci if (post_reg_alloc) { 1301bf215546Sopenharmony_ci for (unsigned r = 0; r < regs_read(inst, i); r++) 1302bf215546Sopenharmony_ci add_dep(n, last_grf_write[inst->src[i].nr + r], 0); 1303bf215546Sopenharmony_ci } else { 1304bf215546Sopenharmony_ci for (unsigned r = 0; r < regs_read(inst, i); r++) { 1305bf215546Sopenharmony_ci add_dep(n, last_grf_write[inst->src[i].nr * 16 + 1306bf215546Sopenharmony_ci inst->src[i].offset / REG_SIZE + r], 0); 1307bf215546Sopenharmony_ci } 1308bf215546Sopenharmony_ci } 1309bf215546Sopenharmony_ci } else if (inst->src[i].file == FIXED_GRF) { 1310bf215546Sopenharmony_ci if (post_reg_alloc) { 1311bf215546Sopenharmony_ci for (unsigned r = 0; r < regs_read(inst, i); r++) 1312bf215546Sopenharmony_ci add_dep(n, last_grf_write[inst->src[i].nr + r], 0); 1313bf215546Sopenharmony_ci } else { 1314bf215546Sopenharmony_ci add_dep(n, last_fixed_grf_write, 0); 1315bf215546Sopenharmony_ci } 1316bf215546Sopenharmony_ci } else if (inst->src[i].is_accumulator()) { 1317bf215546Sopenharmony_ci add_dep(n, last_accumulator_write, 0); 1318bf215546Sopenharmony_ci } else if (inst->src[i].file == ARF && !inst->src[i].is_null()) { 1319bf215546Sopenharmony_ci add_barrier_deps(n); 1320bf215546Sopenharmony_ci } 1321bf215546Sopenharmony_ci } 1322bf215546Sopenharmony_ci 1323bf215546Sopenharmony_ci if (inst->base_mrf != -1) { 1324bf215546Sopenharmony_ci for (int i = 0; i < inst->mlen; i++) { 1325bf215546Sopenharmony_ci /* It looks like the MRF regs are released in the send 1326bf215546Sopenharmony_ci * instruction once it's sent, not when the result comes 1327bf215546Sopenharmony_ci * back. 1328bf215546Sopenharmony_ci */ 1329bf215546Sopenharmony_ci add_dep(n, last_mrf_write[inst->base_mrf + i], 2); 1330bf215546Sopenharmony_ci } 1331bf215546Sopenharmony_ci } 1332bf215546Sopenharmony_ci 1333bf215546Sopenharmony_ci if (const unsigned mask = inst->flags_read(v->devinfo)) { 1334bf215546Sopenharmony_ci assert(mask < (1 << ARRAY_SIZE(last_conditional_mod))); 1335bf215546Sopenharmony_ci 1336bf215546Sopenharmony_ci for (unsigned i = 0; i < ARRAY_SIZE(last_conditional_mod); i++) { 1337bf215546Sopenharmony_ci if (mask & (1 << i)) 1338bf215546Sopenharmony_ci add_dep(n, last_conditional_mod[i]); 1339bf215546Sopenharmony_ci } 1340bf215546Sopenharmony_ci } 1341bf215546Sopenharmony_ci 1342bf215546Sopenharmony_ci if (inst->reads_accumulator_implicitly()) { 1343bf215546Sopenharmony_ci add_dep(n, last_accumulator_write); 1344bf215546Sopenharmony_ci } 1345bf215546Sopenharmony_ci 1346bf215546Sopenharmony_ci /* Update the things this instruction wrote, so earlier reads 1347bf215546Sopenharmony_ci * can mark this as WAR dependency. 1348bf215546Sopenharmony_ci */ 1349bf215546Sopenharmony_ci if (inst->dst.file == VGRF) { 1350bf215546Sopenharmony_ci if (post_reg_alloc) { 1351bf215546Sopenharmony_ci for (unsigned r = 0; r < regs_written(inst); r++) 1352bf215546Sopenharmony_ci last_grf_write[inst->dst.nr + r] = n; 1353bf215546Sopenharmony_ci } else { 1354bf215546Sopenharmony_ci for (unsigned r = 0; r < regs_written(inst); r++) { 1355bf215546Sopenharmony_ci last_grf_write[inst->dst.nr * 16 + 1356bf215546Sopenharmony_ci inst->dst.offset / REG_SIZE + r] = n; 1357bf215546Sopenharmony_ci } 1358bf215546Sopenharmony_ci } 1359bf215546Sopenharmony_ci } else if (inst->dst.file == MRF) { 1360bf215546Sopenharmony_ci int reg = inst->dst.nr & ~BRW_MRF_COMPR4; 1361bf215546Sopenharmony_ci 1362bf215546Sopenharmony_ci last_mrf_write[reg] = n; 1363bf215546Sopenharmony_ci 1364bf215546Sopenharmony_ci if (is_compressed(inst)) { 1365bf215546Sopenharmony_ci if (inst->dst.nr & BRW_MRF_COMPR4) 1366bf215546Sopenharmony_ci reg += 4; 1367bf215546Sopenharmony_ci else 1368bf215546Sopenharmony_ci reg++; 1369bf215546Sopenharmony_ci 1370bf215546Sopenharmony_ci last_mrf_write[reg] = n; 1371bf215546Sopenharmony_ci } 1372bf215546Sopenharmony_ci } else if (inst->dst.file == FIXED_GRF) { 1373bf215546Sopenharmony_ci if (post_reg_alloc) { 1374bf215546Sopenharmony_ci for (unsigned r = 0; r < regs_written(inst); r++) 1375bf215546Sopenharmony_ci last_grf_write[inst->dst.nr + r] = n; 1376bf215546Sopenharmony_ci } else { 1377bf215546Sopenharmony_ci last_fixed_grf_write = n; 1378bf215546Sopenharmony_ci } 1379bf215546Sopenharmony_ci } else if (inst->dst.is_accumulator()) { 1380bf215546Sopenharmony_ci last_accumulator_write = n; 1381bf215546Sopenharmony_ci } else if (inst->dst.file == ARF && !inst->dst.is_null()) { 1382bf215546Sopenharmony_ci add_barrier_deps(n); 1383bf215546Sopenharmony_ci } 1384bf215546Sopenharmony_ci 1385bf215546Sopenharmony_ci if (inst->mlen > 0 && inst->base_mrf != -1) { 1386bf215546Sopenharmony_ci for (unsigned i = 0; i < inst->implied_mrf_writes(); i++) { 1387bf215546Sopenharmony_ci last_mrf_write[inst->base_mrf + i] = n; 1388bf215546Sopenharmony_ci } 1389bf215546Sopenharmony_ci } 1390bf215546Sopenharmony_ci 1391bf215546Sopenharmony_ci if (const unsigned mask = inst->flags_written(v->devinfo)) { 1392bf215546Sopenharmony_ci assert(mask < (1 << ARRAY_SIZE(last_conditional_mod))); 1393bf215546Sopenharmony_ci 1394bf215546Sopenharmony_ci for (unsigned i = 0; i < ARRAY_SIZE(last_conditional_mod); i++) { 1395bf215546Sopenharmony_ci if (mask & (1 << i)) 1396bf215546Sopenharmony_ci last_conditional_mod[i] = n; 1397bf215546Sopenharmony_ci } 1398bf215546Sopenharmony_ci } 1399bf215546Sopenharmony_ci 1400bf215546Sopenharmony_ci if (inst->writes_accumulator_implicitly(v->devinfo)) { 1401bf215546Sopenharmony_ci last_accumulator_write = n; 1402bf215546Sopenharmony_ci } 1403bf215546Sopenharmony_ci } 1404bf215546Sopenharmony_ci 1405bf215546Sopenharmony_ci free(last_grf_write); 1406bf215546Sopenharmony_ci} 1407bf215546Sopenharmony_ci 1408bf215546Sopenharmony_civoid 1409bf215546Sopenharmony_civec4_instruction_scheduler::calculate_deps() 1410bf215546Sopenharmony_ci{ 1411bf215546Sopenharmony_ci schedule_node *last_grf_write[grf_count]; 1412bf215546Sopenharmony_ci schedule_node *last_mrf_write[BRW_MAX_MRF(v->devinfo->ver)]; 1413bf215546Sopenharmony_ci schedule_node *last_conditional_mod = NULL; 1414bf215546Sopenharmony_ci schedule_node *last_accumulator_write = NULL; 1415bf215546Sopenharmony_ci /* Fixed HW registers are assumed to be separate from the virtual 1416bf215546Sopenharmony_ci * GRFs, so they can be tracked separately. We don't really write 1417bf215546Sopenharmony_ci * to fixed GRFs much, so don't bother tracking them on a more 1418bf215546Sopenharmony_ci * granular level. 1419bf215546Sopenharmony_ci */ 1420bf215546Sopenharmony_ci schedule_node *last_fixed_grf_write = NULL; 1421bf215546Sopenharmony_ci 1422bf215546Sopenharmony_ci memset(last_grf_write, 0, sizeof(last_grf_write)); 1423bf215546Sopenharmony_ci memset(last_mrf_write, 0, sizeof(last_mrf_write)); 1424bf215546Sopenharmony_ci 1425bf215546Sopenharmony_ci /* top-to-bottom dependencies: RAW and WAW. */ 1426bf215546Sopenharmony_ci foreach_in_list(schedule_node, n, &instructions) { 1427bf215546Sopenharmony_ci vec4_instruction *inst = (vec4_instruction *)n->inst; 1428bf215546Sopenharmony_ci 1429bf215546Sopenharmony_ci if (is_scheduling_barrier(inst)) 1430bf215546Sopenharmony_ci add_barrier_deps(n); 1431bf215546Sopenharmony_ci 1432bf215546Sopenharmony_ci /* read-after-write deps. */ 1433bf215546Sopenharmony_ci for (int i = 0; i < 3; i++) { 1434bf215546Sopenharmony_ci if (inst->src[i].file == VGRF) { 1435bf215546Sopenharmony_ci for (unsigned j = 0; j < regs_read(inst, i); ++j) 1436bf215546Sopenharmony_ci add_dep(last_grf_write[inst->src[i].nr + j], n); 1437bf215546Sopenharmony_ci } else if (inst->src[i].file == FIXED_GRF) { 1438bf215546Sopenharmony_ci add_dep(last_fixed_grf_write, n); 1439bf215546Sopenharmony_ci } else if (inst->src[i].is_accumulator()) { 1440bf215546Sopenharmony_ci assert(last_accumulator_write); 1441bf215546Sopenharmony_ci add_dep(last_accumulator_write, n); 1442bf215546Sopenharmony_ci } else if (inst->src[i].file == ARF && !inst->src[i].is_null()) { 1443bf215546Sopenharmony_ci add_barrier_deps(n); 1444bf215546Sopenharmony_ci } 1445bf215546Sopenharmony_ci } 1446bf215546Sopenharmony_ci 1447bf215546Sopenharmony_ci if (inst->reads_g0_implicitly()) 1448bf215546Sopenharmony_ci add_dep(last_fixed_grf_write, n); 1449bf215546Sopenharmony_ci 1450bf215546Sopenharmony_ci if (!inst->is_send_from_grf()) { 1451bf215546Sopenharmony_ci for (int i = 0; i < inst->mlen; i++) { 1452bf215546Sopenharmony_ci /* It looks like the MRF regs are released in the send 1453bf215546Sopenharmony_ci * instruction once it's sent, not when the result comes 1454bf215546Sopenharmony_ci * back. 1455bf215546Sopenharmony_ci */ 1456bf215546Sopenharmony_ci add_dep(last_mrf_write[inst->base_mrf + i], n); 1457bf215546Sopenharmony_ci } 1458bf215546Sopenharmony_ci } 1459bf215546Sopenharmony_ci 1460bf215546Sopenharmony_ci if (inst->reads_flag()) { 1461bf215546Sopenharmony_ci assert(last_conditional_mod); 1462bf215546Sopenharmony_ci add_dep(last_conditional_mod, n); 1463bf215546Sopenharmony_ci } 1464bf215546Sopenharmony_ci 1465bf215546Sopenharmony_ci if (inst->reads_accumulator_implicitly()) { 1466bf215546Sopenharmony_ci assert(last_accumulator_write); 1467bf215546Sopenharmony_ci add_dep(last_accumulator_write, n); 1468bf215546Sopenharmony_ci } 1469bf215546Sopenharmony_ci 1470bf215546Sopenharmony_ci /* write-after-write deps. */ 1471bf215546Sopenharmony_ci if (inst->dst.file == VGRF) { 1472bf215546Sopenharmony_ci for (unsigned j = 0; j < regs_written(inst); ++j) { 1473bf215546Sopenharmony_ci add_dep(last_grf_write[inst->dst.nr + j], n); 1474bf215546Sopenharmony_ci last_grf_write[inst->dst.nr + j] = n; 1475bf215546Sopenharmony_ci } 1476bf215546Sopenharmony_ci } else if (inst->dst.file == MRF) { 1477bf215546Sopenharmony_ci add_dep(last_mrf_write[inst->dst.nr], n); 1478bf215546Sopenharmony_ci last_mrf_write[inst->dst.nr] = n; 1479bf215546Sopenharmony_ci } else if (inst->dst.file == FIXED_GRF) { 1480bf215546Sopenharmony_ci add_dep(last_fixed_grf_write, n); 1481bf215546Sopenharmony_ci last_fixed_grf_write = n; 1482bf215546Sopenharmony_ci } else if (inst->dst.is_accumulator()) { 1483bf215546Sopenharmony_ci add_dep(last_accumulator_write, n); 1484bf215546Sopenharmony_ci last_accumulator_write = n; 1485bf215546Sopenharmony_ci } else if (inst->dst.file == ARF && !inst->dst.is_null()) { 1486bf215546Sopenharmony_ci add_barrier_deps(n); 1487bf215546Sopenharmony_ci } 1488bf215546Sopenharmony_ci 1489bf215546Sopenharmony_ci if (inst->mlen > 0 && !inst->is_send_from_grf()) { 1490bf215546Sopenharmony_ci for (unsigned i = 0; i < inst->implied_mrf_writes(); i++) { 1491bf215546Sopenharmony_ci add_dep(last_mrf_write[inst->base_mrf + i], n); 1492bf215546Sopenharmony_ci last_mrf_write[inst->base_mrf + i] = n; 1493bf215546Sopenharmony_ci } 1494bf215546Sopenharmony_ci } 1495bf215546Sopenharmony_ci 1496bf215546Sopenharmony_ci if (inst->writes_flag(v->devinfo)) { 1497bf215546Sopenharmony_ci add_dep(last_conditional_mod, n, 0); 1498bf215546Sopenharmony_ci last_conditional_mod = n; 1499bf215546Sopenharmony_ci } 1500bf215546Sopenharmony_ci 1501bf215546Sopenharmony_ci if (inst->writes_accumulator_implicitly(v->devinfo) && 1502bf215546Sopenharmony_ci !inst->dst.is_accumulator()) { 1503bf215546Sopenharmony_ci add_dep(last_accumulator_write, n); 1504bf215546Sopenharmony_ci last_accumulator_write = n; 1505bf215546Sopenharmony_ci } 1506bf215546Sopenharmony_ci } 1507bf215546Sopenharmony_ci 1508bf215546Sopenharmony_ci /* bottom-to-top dependencies: WAR */ 1509bf215546Sopenharmony_ci memset(last_grf_write, 0, sizeof(last_grf_write)); 1510bf215546Sopenharmony_ci memset(last_mrf_write, 0, sizeof(last_mrf_write)); 1511bf215546Sopenharmony_ci last_conditional_mod = NULL; 1512bf215546Sopenharmony_ci last_accumulator_write = NULL; 1513bf215546Sopenharmony_ci last_fixed_grf_write = NULL; 1514bf215546Sopenharmony_ci 1515bf215546Sopenharmony_ci foreach_in_list_reverse_safe(schedule_node, n, &instructions) { 1516bf215546Sopenharmony_ci vec4_instruction *inst = (vec4_instruction *)n->inst; 1517bf215546Sopenharmony_ci 1518bf215546Sopenharmony_ci /* write-after-read deps. */ 1519bf215546Sopenharmony_ci for (int i = 0; i < 3; i++) { 1520bf215546Sopenharmony_ci if (inst->src[i].file == VGRF) { 1521bf215546Sopenharmony_ci for (unsigned j = 0; j < regs_read(inst, i); ++j) 1522bf215546Sopenharmony_ci add_dep(n, last_grf_write[inst->src[i].nr + j]); 1523bf215546Sopenharmony_ci } else if (inst->src[i].file == FIXED_GRF) { 1524bf215546Sopenharmony_ci add_dep(n, last_fixed_grf_write); 1525bf215546Sopenharmony_ci } else if (inst->src[i].is_accumulator()) { 1526bf215546Sopenharmony_ci add_dep(n, last_accumulator_write); 1527bf215546Sopenharmony_ci } else if (inst->src[i].file == ARF && !inst->src[i].is_null()) { 1528bf215546Sopenharmony_ci add_barrier_deps(n); 1529bf215546Sopenharmony_ci } 1530bf215546Sopenharmony_ci } 1531bf215546Sopenharmony_ci 1532bf215546Sopenharmony_ci if (!inst->is_send_from_grf()) { 1533bf215546Sopenharmony_ci for (int i = 0; i < inst->mlen; i++) { 1534bf215546Sopenharmony_ci /* It looks like the MRF regs are released in the send 1535bf215546Sopenharmony_ci * instruction once it's sent, not when the result comes 1536bf215546Sopenharmony_ci * back. 1537bf215546Sopenharmony_ci */ 1538bf215546Sopenharmony_ci add_dep(n, last_mrf_write[inst->base_mrf + i], 2); 1539bf215546Sopenharmony_ci } 1540bf215546Sopenharmony_ci } 1541bf215546Sopenharmony_ci 1542bf215546Sopenharmony_ci if (inst->reads_flag()) { 1543bf215546Sopenharmony_ci add_dep(n, last_conditional_mod); 1544bf215546Sopenharmony_ci } 1545bf215546Sopenharmony_ci 1546bf215546Sopenharmony_ci if (inst->reads_accumulator_implicitly()) { 1547bf215546Sopenharmony_ci add_dep(n, last_accumulator_write); 1548bf215546Sopenharmony_ci } 1549bf215546Sopenharmony_ci 1550bf215546Sopenharmony_ci /* Update the things this instruction wrote, so earlier reads 1551bf215546Sopenharmony_ci * can mark this as WAR dependency. 1552bf215546Sopenharmony_ci */ 1553bf215546Sopenharmony_ci if (inst->dst.file == VGRF) { 1554bf215546Sopenharmony_ci for (unsigned j = 0; j < regs_written(inst); ++j) 1555bf215546Sopenharmony_ci last_grf_write[inst->dst.nr + j] = n; 1556bf215546Sopenharmony_ci } else if (inst->dst.file == MRF) { 1557bf215546Sopenharmony_ci last_mrf_write[inst->dst.nr] = n; 1558bf215546Sopenharmony_ci } else if (inst->dst.file == FIXED_GRF) { 1559bf215546Sopenharmony_ci last_fixed_grf_write = n; 1560bf215546Sopenharmony_ci } else if (inst->dst.is_accumulator()) { 1561bf215546Sopenharmony_ci last_accumulator_write = n; 1562bf215546Sopenharmony_ci } else if (inst->dst.file == ARF && !inst->dst.is_null()) { 1563bf215546Sopenharmony_ci add_barrier_deps(n); 1564bf215546Sopenharmony_ci } 1565bf215546Sopenharmony_ci 1566bf215546Sopenharmony_ci if (inst->mlen > 0 && !inst->is_send_from_grf()) { 1567bf215546Sopenharmony_ci for (unsigned i = 0; i < inst->implied_mrf_writes(); i++) { 1568bf215546Sopenharmony_ci last_mrf_write[inst->base_mrf + i] = n; 1569bf215546Sopenharmony_ci } 1570bf215546Sopenharmony_ci } 1571bf215546Sopenharmony_ci 1572bf215546Sopenharmony_ci if (inst->writes_flag(v->devinfo)) { 1573bf215546Sopenharmony_ci last_conditional_mod = n; 1574bf215546Sopenharmony_ci } 1575bf215546Sopenharmony_ci 1576bf215546Sopenharmony_ci if (inst->writes_accumulator_implicitly(v->devinfo)) { 1577bf215546Sopenharmony_ci last_accumulator_write = n; 1578bf215546Sopenharmony_ci } 1579bf215546Sopenharmony_ci } 1580bf215546Sopenharmony_ci} 1581bf215546Sopenharmony_ci 1582bf215546Sopenharmony_cischedule_node * 1583bf215546Sopenharmony_cifs_instruction_scheduler::choose_instruction_to_schedule() 1584bf215546Sopenharmony_ci{ 1585bf215546Sopenharmony_ci schedule_node *chosen = NULL; 1586bf215546Sopenharmony_ci 1587bf215546Sopenharmony_ci if (mode == SCHEDULE_PRE || mode == SCHEDULE_POST) { 1588bf215546Sopenharmony_ci int chosen_time = 0; 1589bf215546Sopenharmony_ci 1590bf215546Sopenharmony_ci /* Of the instructions ready to execute or the closest to being ready, 1591bf215546Sopenharmony_ci * choose the one most likely to unblock an early program exit, or 1592bf215546Sopenharmony_ci * otherwise the oldest one. 1593bf215546Sopenharmony_ci */ 1594bf215546Sopenharmony_ci foreach_in_list(schedule_node, n, &instructions) { 1595bf215546Sopenharmony_ci if (!chosen || 1596bf215546Sopenharmony_ci exit_unblocked_time(n) < exit_unblocked_time(chosen) || 1597bf215546Sopenharmony_ci (exit_unblocked_time(n) == exit_unblocked_time(chosen) && 1598bf215546Sopenharmony_ci n->unblocked_time < chosen_time)) { 1599bf215546Sopenharmony_ci chosen = n; 1600bf215546Sopenharmony_ci chosen_time = n->unblocked_time; 1601bf215546Sopenharmony_ci } 1602bf215546Sopenharmony_ci } 1603bf215546Sopenharmony_ci } else { 1604bf215546Sopenharmony_ci int chosen_register_pressure_benefit = 0; 1605bf215546Sopenharmony_ci 1606bf215546Sopenharmony_ci /* Before register allocation, we don't care about the latencies of 1607bf215546Sopenharmony_ci * instructions. All we care about is reducing live intervals of 1608bf215546Sopenharmony_ci * variables so that we can avoid register spilling, or get SIMD16 1609bf215546Sopenharmony_ci * shaders which naturally do a better job of hiding instruction 1610bf215546Sopenharmony_ci * latency. 1611bf215546Sopenharmony_ci */ 1612bf215546Sopenharmony_ci foreach_in_list(schedule_node, n, &instructions) { 1613bf215546Sopenharmony_ci fs_inst *inst = (fs_inst *)n->inst; 1614bf215546Sopenharmony_ci 1615bf215546Sopenharmony_ci if (!chosen) { 1616bf215546Sopenharmony_ci chosen = n; 1617bf215546Sopenharmony_ci chosen_register_pressure_benefit = 1618bf215546Sopenharmony_ci get_register_pressure_benefit(chosen->inst); 1619bf215546Sopenharmony_ci continue; 1620bf215546Sopenharmony_ci } 1621bf215546Sopenharmony_ci 1622bf215546Sopenharmony_ci /* Most important: If we can definitely reduce register pressure, do 1623bf215546Sopenharmony_ci * so immediately. 1624bf215546Sopenharmony_ci */ 1625bf215546Sopenharmony_ci int register_pressure_benefit = get_register_pressure_benefit(n->inst); 1626bf215546Sopenharmony_ci 1627bf215546Sopenharmony_ci if (register_pressure_benefit > 0 && 1628bf215546Sopenharmony_ci register_pressure_benefit > chosen_register_pressure_benefit) { 1629bf215546Sopenharmony_ci chosen = n; 1630bf215546Sopenharmony_ci chosen_register_pressure_benefit = register_pressure_benefit; 1631bf215546Sopenharmony_ci continue; 1632bf215546Sopenharmony_ci } else if (chosen_register_pressure_benefit > 0 && 1633bf215546Sopenharmony_ci (register_pressure_benefit < 1634bf215546Sopenharmony_ci chosen_register_pressure_benefit)) { 1635bf215546Sopenharmony_ci continue; 1636bf215546Sopenharmony_ci } 1637bf215546Sopenharmony_ci 1638bf215546Sopenharmony_ci if (mode == SCHEDULE_PRE_LIFO) { 1639bf215546Sopenharmony_ci /* Prefer instructions that recently became available for 1640bf215546Sopenharmony_ci * scheduling. These are the things that are most likely to 1641bf215546Sopenharmony_ci * (eventually) make a variable dead and reduce register pressure. 1642bf215546Sopenharmony_ci * Typical register pressure estimates don't work for us because 1643bf215546Sopenharmony_ci * most of our pressure comes from texturing, where no single 1644bf215546Sopenharmony_ci * instruction to schedule will make a vec4 value dead. 1645bf215546Sopenharmony_ci */ 1646bf215546Sopenharmony_ci if (n->cand_generation > chosen->cand_generation) { 1647bf215546Sopenharmony_ci chosen = n; 1648bf215546Sopenharmony_ci chosen_register_pressure_benefit = register_pressure_benefit; 1649bf215546Sopenharmony_ci continue; 1650bf215546Sopenharmony_ci } else if (n->cand_generation < chosen->cand_generation) { 1651bf215546Sopenharmony_ci continue; 1652bf215546Sopenharmony_ci } 1653bf215546Sopenharmony_ci 1654bf215546Sopenharmony_ci /* On MRF-using chips, prefer non-SEND instructions. If we don't 1655bf215546Sopenharmony_ci * do this, then because we prefer instructions that just became 1656bf215546Sopenharmony_ci * candidates, we'll end up in a pattern of scheduling a SEND, 1657bf215546Sopenharmony_ci * then the MRFs for the next SEND, then the next SEND, then the 1658bf215546Sopenharmony_ci * MRFs, etc., without ever consuming the results of a send. 1659bf215546Sopenharmony_ci */ 1660bf215546Sopenharmony_ci if (v->devinfo->ver < 7) { 1661bf215546Sopenharmony_ci fs_inst *chosen_inst = (fs_inst *)chosen->inst; 1662bf215546Sopenharmony_ci 1663bf215546Sopenharmony_ci /* We use size_written > 4 * exec_size as our test for the kind 1664bf215546Sopenharmony_ci * of send instruction to avoid -- only sends generate many 1665bf215546Sopenharmony_ci * regs, and a single-result send is probably actually reducing 1666bf215546Sopenharmony_ci * register pressure. 1667bf215546Sopenharmony_ci */ 1668bf215546Sopenharmony_ci if (inst->size_written <= 4 * inst->exec_size && 1669bf215546Sopenharmony_ci chosen_inst->size_written > 4 * chosen_inst->exec_size) { 1670bf215546Sopenharmony_ci chosen = n; 1671bf215546Sopenharmony_ci chosen_register_pressure_benefit = register_pressure_benefit; 1672bf215546Sopenharmony_ci continue; 1673bf215546Sopenharmony_ci } else if (inst->size_written > chosen_inst->size_written) { 1674bf215546Sopenharmony_ci continue; 1675bf215546Sopenharmony_ci } 1676bf215546Sopenharmony_ci } 1677bf215546Sopenharmony_ci } 1678bf215546Sopenharmony_ci 1679bf215546Sopenharmony_ci /* For instructions pushed on the cands list at the same time, prefer 1680bf215546Sopenharmony_ci * the one with the highest delay to the end of the program. This is 1681bf215546Sopenharmony_ci * most likely to have its values able to be consumed first (such as 1682bf215546Sopenharmony_ci * for a large tree of lowered ubo loads, which appear reversed in 1683bf215546Sopenharmony_ci * the instruction stream with respect to when they can be consumed). 1684bf215546Sopenharmony_ci */ 1685bf215546Sopenharmony_ci if (n->delay > chosen->delay) { 1686bf215546Sopenharmony_ci chosen = n; 1687bf215546Sopenharmony_ci chosen_register_pressure_benefit = register_pressure_benefit; 1688bf215546Sopenharmony_ci continue; 1689bf215546Sopenharmony_ci } else if (n->delay < chosen->delay) { 1690bf215546Sopenharmony_ci continue; 1691bf215546Sopenharmony_ci } 1692bf215546Sopenharmony_ci 1693bf215546Sopenharmony_ci /* Prefer the node most likely to unblock an early program exit. 1694bf215546Sopenharmony_ci */ 1695bf215546Sopenharmony_ci if (exit_unblocked_time(n) < exit_unblocked_time(chosen)) { 1696bf215546Sopenharmony_ci chosen = n; 1697bf215546Sopenharmony_ci chosen_register_pressure_benefit = register_pressure_benefit; 1698bf215546Sopenharmony_ci continue; 1699bf215546Sopenharmony_ci } else if (exit_unblocked_time(n) > exit_unblocked_time(chosen)) { 1700bf215546Sopenharmony_ci continue; 1701bf215546Sopenharmony_ci } 1702bf215546Sopenharmony_ci 1703bf215546Sopenharmony_ci /* If all other metrics are equal, we prefer the first instruction in 1704bf215546Sopenharmony_ci * the list (program execution). 1705bf215546Sopenharmony_ci */ 1706bf215546Sopenharmony_ci } 1707bf215546Sopenharmony_ci } 1708bf215546Sopenharmony_ci 1709bf215546Sopenharmony_ci return chosen; 1710bf215546Sopenharmony_ci} 1711bf215546Sopenharmony_ci 1712bf215546Sopenharmony_cischedule_node * 1713bf215546Sopenharmony_civec4_instruction_scheduler::choose_instruction_to_schedule() 1714bf215546Sopenharmony_ci{ 1715bf215546Sopenharmony_ci schedule_node *chosen = NULL; 1716bf215546Sopenharmony_ci int chosen_time = 0; 1717bf215546Sopenharmony_ci 1718bf215546Sopenharmony_ci /* Of the instructions ready to execute or the closest to being ready, 1719bf215546Sopenharmony_ci * choose the oldest one. 1720bf215546Sopenharmony_ci */ 1721bf215546Sopenharmony_ci foreach_in_list(schedule_node, n, &instructions) { 1722bf215546Sopenharmony_ci if (!chosen || n->unblocked_time < chosen_time) { 1723bf215546Sopenharmony_ci chosen = n; 1724bf215546Sopenharmony_ci chosen_time = n->unblocked_time; 1725bf215546Sopenharmony_ci } 1726bf215546Sopenharmony_ci } 1727bf215546Sopenharmony_ci 1728bf215546Sopenharmony_ci return chosen; 1729bf215546Sopenharmony_ci} 1730bf215546Sopenharmony_ci 1731bf215546Sopenharmony_ciint 1732bf215546Sopenharmony_cifs_instruction_scheduler::issue_time(backend_instruction *inst0) 1733bf215546Sopenharmony_ci{ 1734bf215546Sopenharmony_ci const struct brw_isa_info *isa = &v->compiler->isa; 1735bf215546Sopenharmony_ci const fs_inst *inst = static_cast<fs_inst *>(inst0); 1736bf215546Sopenharmony_ci const unsigned overhead = v->grf_used && has_bank_conflict(isa, inst) ? 1737bf215546Sopenharmony_ci DIV_ROUND_UP(inst->dst.component_size(inst->exec_size), REG_SIZE) : 0; 1738bf215546Sopenharmony_ci if (is_compressed(inst)) 1739bf215546Sopenharmony_ci return 4 + overhead; 1740bf215546Sopenharmony_ci else 1741bf215546Sopenharmony_ci return 2 + overhead; 1742bf215546Sopenharmony_ci} 1743bf215546Sopenharmony_ci 1744bf215546Sopenharmony_ciint 1745bf215546Sopenharmony_civec4_instruction_scheduler::issue_time(backend_instruction *) 1746bf215546Sopenharmony_ci{ 1747bf215546Sopenharmony_ci /* We always execute as two vec4s in parallel. */ 1748bf215546Sopenharmony_ci return 2; 1749bf215546Sopenharmony_ci} 1750bf215546Sopenharmony_ci 1751bf215546Sopenharmony_civoid 1752bf215546Sopenharmony_ciinstruction_scheduler::schedule_instructions(bblock_t *block) 1753bf215546Sopenharmony_ci{ 1754bf215546Sopenharmony_ci const struct intel_device_info *devinfo = bs->devinfo; 1755bf215546Sopenharmony_ci int time = 0; 1756bf215546Sopenharmony_ci int instructions_to_schedule = block->end_ip - block->start_ip + 1; 1757bf215546Sopenharmony_ci 1758bf215546Sopenharmony_ci if (!post_reg_alloc) 1759bf215546Sopenharmony_ci reg_pressure = reg_pressure_in[block->num]; 1760bf215546Sopenharmony_ci block_idx = block->num; 1761bf215546Sopenharmony_ci 1762bf215546Sopenharmony_ci /* Remove non-DAG heads from the list. */ 1763bf215546Sopenharmony_ci foreach_in_list_safe(schedule_node, n, &instructions) { 1764bf215546Sopenharmony_ci if (n->parent_count != 0) 1765bf215546Sopenharmony_ci n->remove(); 1766bf215546Sopenharmony_ci } 1767bf215546Sopenharmony_ci 1768bf215546Sopenharmony_ci unsigned cand_generation = 1; 1769bf215546Sopenharmony_ci while (!instructions.is_empty()) { 1770bf215546Sopenharmony_ci schedule_node *chosen = choose_instruction_to_schedule(); 1771bf215546Sopenharmony_ci 1772bf215546Sopenharmony_ci /* Schedule this instruction. */ 1773bf215546Sopenharmony_ci assert(chosen); 1774bf215546Sopenharmony_ci chosen->remove(); 1775bf215546Sopenharmony_ci chosen->inst->exec_node::remove(); 1776bf215546Sopenharmony_ci block->instructions.push_tail(chosen->inst); 1777bf215546Sopenharmony_ci instructions_to_schedule--; 1778bf215546Sopenharmony_ci 1779bf215546Sopenharmony_ci if (!post_reg_alloc) { 1780bf215546Sopenharmony_ci reg_pressure -= get_register_pressure_benefit(chosen->inst); 1781bf215546Sopenharmony_ci update_register_pressure(chosen->inst); 1782bf215546Sopenharmony_ci } 1783bf215546Sopenharmony_ci 1784bf215546Sopenharmony_ci /* If we expected a delay for scheduling, then bump the clock to reflect 1785bf215546Sopenharmony_ci * that. In reality, the hardware will switch to another hyperthread 1786bf215546Sopenharmony_ci * and may not return to dispatching our thread for a while even after 1787bf215546Sopenharmony_ci * we're unblocked. After this, we have the time when the chosen 1788bf215546Sopenharmony_ci * instruction will start executing. 1789bf215546Sopenharmony_ci */ 1790bf215546Sopenharmony_ci time = MAX2(time, chosen->unblocked_time); 1791bf215546Sopenharmony_ci 1792bf215546Sopenharmony_ci /* Update the clock for how soon an instruction could start after the 1793bf215546Sopenharmony_ci * chosen one. 1794bf215546Sopenharmony_ci */ 1795bf215546Sopenharmony_ci time += issue_time(chosen->inst); 1796bf215546Sopenharmony_ci 1797bf215546Sopenharmony_ci if (debug) { 1798bf215546Sopenharmony_ci fprintf(stderr, "clock %4d, scheduled: ", time); 1799bf215546Sopenharmony_ci bs->dump_instruction(chosen->inst); 1800bf215546Sopenharmony_ci if (!post_reg_alloc) 1801bf215546Sopenharmony_ci fprintf(stderr, "(register pressure %d)\n", reg_pressure); 1802bf215546Sopenharmony_ci } 1803bf215546Sopenharmony_ci 1804bf215546Sopenharmony_ci /* Now that we've scheduled a new instruction, some of its 1805bf215546Sopenharmony_ci * children can be promoted to the list of instructions ready to 1806bf215546Sopenharmony_ci * be scheduled. Update the children's unblocked time for this 1807bf215546Sopenharmony_ci * DAG edge as we do so. 1808bf215546Sopenharmony_ci */ 1809bf215546Sopenharmony_ci for (int i = chosen->child_count - 1; i >= 0; i--) { 1810bf215546Sopenharmony_ci schedule_node *child = chosen->children[i]; 1811bf215546Sopenharmony_ci 1812bf215546Sopenharmony_ci child->unblocked_time = MAX2(child->unblocked_time, 1813bf215546Sopenharmony_ci time + chosen->child_latency[i]); 1814bf215546Sopenharmony_ci 1815bf215546Sopenharmony_ci if (debug) { 1816bf215546Sopenharmony_ci fprintf(stderr, "\tchild %d, %d parents: ", i, child->parent_count); 1817bf215546Sopenharmony_ci bs->dump_instruction(child->inst); 1818bf215546Sopenharmony_ci } 1819bf215546Sopenharmony_ci 1820bf215546Sopenharmony_ci child->cand_generation = cand_generation; 1821bf215546Sopenharmony_ci child->parent_count--; 1822bf215546Sopenharmony_ci if (child->parent_count == 0) { 1823bf215546Sopenharmony_ci if (debug) { 1824bf215546Sopenharmony_ci fprintf(stderr, "\t\tnow available\n"); 1825bf215546Sopenharmony_ci } 1826bf215546Sopenharmony_ci instructions.push_head(child); 1827bf215546Sopenharmony_ci } 1828bf215546Sopenharmony_ci } 1829bf215546Sopenharmony_ci cand_generation++; 1830bf215546Sopenharmony_ci 1831bf215546Sopenharmony_ci /* Shared resource: the mathbox. There's one mathbox per EU on Gfx6+ 1832bf215546Sopenharmony_ci * but it's more limited pre-gfx6, so if we send something off to it then 1833bf215546Sopenharmony_ci * the next math instruction isn't going to make progress until the first 1834bf215546Sopenharmony_ci * is done. 1835bf215546Sopenharmony_ci */ 1836bf215546Sopenharmony_ci if (devinfo->ver < 6 && chosen->inst->is_math()) { 1837bf215546Sopenharmony_ci foreach_in_list(schedule_node, n, &instructions) { 1838bf215546Sopenharmony_ci if (n->inst->is_math()) 1839bf215546Sopenharmony_ci n->unblocked_time = MAX2(n->unblocked_time, 1840bf215546Sopenharmony_ci time + chosen->latency); 1841bf215546Sopenharmony_ci } 1842bf215546Sopenharmony_ci } 1843bf215546Sopenharmony_ci } 1844bf215546Sopenharmony_ci 1845bf215546Sopenharmony_ci assert(instructions_to_schedule == 0); 1846bf215546Sopenharmony_ci} 1847bf215546Sopenharmony_ci 1848bf215546Sopenharmony_civoid 1849bf215546Sopenharmony_ciinstruction_scheduler::run(cfg_t *cfg) 1850bf215546Sopenharmony_ci{ 1851bf215546Sopenharmony_ci if (debug && !post_reg_alloc) { 1852bf215546Sopenharmony_ci fprintf(stderr, "\nInstructions before scheduling (reg_alloc %d)\n", 1853bf215546Sopenharmony_ci post_reg_alloc); 1854bf215546Sopenharmony_ci bs->dump_instructions(); 1855bf215546Sopenharmony_ci } 1856bf215546Sopenharmony_ci 1857bf215546Sopenharmony_ci if (!post_reg_alloc) 1858bf215546Sopenharmony_ci setup_liveness(cfg); 1859bf215546Sopenharmony_ci 1860bf215546Sopenharmony_ci foreach_block(block, cfg) { 1861bf215546Sopenharmony_ci if (reads_remaining) { 1862bf215546Sopenharmony_ci memset(reads_remaining, 0, 1863bf215546Sopenharmony_ci grf_count * sizeof(*reads_remaining)); 1864bf215546Sopenharmony_ci memset(hw_reads_remaining, 0, 1865bf215546Sopenharmony_ci hw_reg_count * sizeof(*hw_reads_remaining)); 1866bf215546Sopenharmony_ci memset(written, 0, grf_count * sizeof(*written)); 1867bf215546Sopenharmony_ci 1868bf215546Sopenharmony_ci foreach_inst_in_block(fs_inst, inst, block) 1869bf215546Sopenharmony_ci count_reads_remaining(inst); 1870bf215546Sopenharmony_ci } 1871bf215546Sopenharmony_ci 1872bf215546Sopenharmony_ci add_insts_from_block(block); 1873bf215546Sopenharmony_ci 1874bf215546Sopenharmony_ci calculate_deps(); 1875bf215546Sopenharmony_ci 1876bf215546Sopenharmony_ci compute_delays(); 1877bf215546Sopenharmony_ci compute_exits(); 1878bf215546Sopenharmony_ci 1879bf215546Sopenharmony_ci schedule_instructions(block); 1880bf215546Sopenharmony_ci } 1881bf215546Sopenharmony_ci 1882bf215546Sopenharmony_ci if (debug && !post_reg_alloc) { 1883bf215546Sopenharmony_ci fprintf(stderr, "\nInstructions after scheduling (reg_alloc %d)\n", 1884bf215546Sopenharmony_ci post_reg_alloc); 1885bf215546Sopenharmony_ci bs->dump_instructions(); 1886bf215546Sopenharmony_ci } 1887bf215546Sopenharmony_ci} 1888bf215546Sopenharmony_ci 1889bf215546Sopenharmony_civoid 1890bf215546Sopenharmony_cifs_visitor::schedule_instructions(instruction_scheduler_mode mode) 1891bf215546Sopenharmony_ci{ 1892bf215546Sopenharmony_ci int grf_count; 1893bf215546Sopenharmony_ci if (mode == SCHEDULE_POST) 1894bf215546Sopenharmony_ci grf_count = grf_used; 1895bf215546Sopenharmony_ci else 1896bf215546Sopenharmony_ci grf_count = alloc.count; 1897bf215546Sopenharmony_ci 1898bf215546Sopenharmony_ci fs_instruction_scheduler sched(this, grf_count, first_non_payload_grf, 1899bf215546Sopenharmony_ci cfg->num_blocks, mode); 1900bf215546Sopenharmony_ci sched.run(cfg); 1901bf215546Sopenharmony_ci 1902bf215546Sopenharmony_ci invalidate_analysis(DEPENDENCY_INSTRUCTIONS); 1903bf215546Sopenharmony_ci} 1904bf215546Sopenharmony_ci 1905bf215546Sopenharmony_civoid 1906bf215546Sopenharmony_civec4_visitor::opt_schedule_instructions() 1907bf215546Sopenharmony_ci{ 1908bf215546Sopenharmony_ci vec4_instruction_scheduler sched(this, prog_data->total_grf); 1909bf215546Sopenharmony_ci sched.run(cfg); 1910bf215546Sopenharmony_ci 1911bf215546Sopenharmony_ci invalidate_analysis(DEPENDENCY_INSTRUCTIONS); 1912bf215546Sopenharmony_ci} 1913