/*
 * Copyright © 2010 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 *
 * Authors:
 *    Eric Anholt <eric@anholt.net>
 *
 */

#include "brw_eu.h"
#include "brw_fs.h"
#include "brw_fs_live_variables.h"
#include "brw_vec4.h"
#include "brw_cfg.h"
#include "brw_shader.h"

using namespace brw;

/** @file brw_fs_schedule_instructions.cpp
 *
 * List scheduling of FS instructions.
 *
 * The basic model of the list scheduler is to take a basic block,
 * compute a DAG of the dependencies (RAW ordering with latency, WAW
 * ordering with latency, WAR ordering), and make a list of the DAG heads.
 * Heuristically pick a DAG head, then put all the children that are
 * now DAG heads into the list of things to schedule.
 *
 * The heuristic is the important part. We're trying to be cheap,
 * since actually computing the optimal scheduling is NP complete.
 * What we do is track a "current clock". When we schedule a node, we
 * update the earliest-unblocked clock time of its children, and
 * increment the clock. Then, when trying to schedule, we just pick
 * the earliest-unblocked instruction to schedule.
 *
 * Note that often there will be many things which could execute
 * immediately, and there are a range of heuristic options to choose
 * from in picking among those.
 */

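/* As a sketch of the clock model implemented below: when a node is scheduled
 * at time t, the clock advances to t + issue_time(inst), and each child's
 * unblocked_time is raised to at least that new clock value plus the latency
 * recorded on the DAG edge (see schedule_instructions() and compute_exits()).
 * choose_instruction_to_schedule() then compares candidates using these times.
 */
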
static bool debug = false;

class instruction_scheduler;

class schedule_node : public exec_node
{
public:
   schedule_node(backend_instruction *inst, instruction_scheduler *sched);
   void set_latency_gfx4();
   void set_latency_gfx7(bool is_haswell);

   const struct brw_isa_info *isa;
   backend_instruction *inst;
   schedule_node **children;
   int *child_latency;
   int child_count;
   int parent_count;
   int child_array_size;
   int unblocked_time;
   int latency;

   /**
    * Which iteration of pushing groups of children onto the candidates list
    * this node was a part of.
    */
   unsigned cand_generation;

   /**
    * This is the sum of the instruction's latency plus the maximum delay of
    * its children, or just the issue_time if it's a leaf node.
    */
   int delay;

   /**
    * Preferred exit node among the (direct or indirect) successors of this
    * node. Among the scheduler nodes blocked by this node, this will be the
    * one that may cause earliest program termination, or NULL if none of the
    * successors is an exit node.
    */
   schedule_node *exit;
};

/**
 * Lower bound of the scheduling time after which one of the instructions
 * blocked by this node may lead to program termination.
 *
 * exit_unblocked_time() determines a strict partial ordering relation '«' on
 * the set of scheduler nodes as follows:
 *
 *   n « m <-> exit_unblocked_time(n) < exit_unblocked_time(m)
 *
 * which can be used to heuristically order nodes according to how early they
 * can unblock an exit node and lead to program termination.
 */
static inline int
exit_unblocked_time(const schedule_node *n)
{
   return n->exit ? n->exit->unblocked_time : INT_MAX;
}

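/* In the SCHEDULE_PRE/SCHEDULE_POST heuristic of
 * fs_instruction_scheduler::choose_instruction_to_schedule(), candidates are
 * ordered by exit_unblocked_time() first, with the plain unblocked time as a
 * tie-breaker; the pressure-oriented pre-RA modes only consult this ordering
 * after the register-pressure criteria.
 */
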
void
schedule_node::set_latency_gfx4()
{
   int chans = 8;
   int math_latency = 22;

   switch (inst->opcode) {
   case SHADER_OPCODE_RCP:
      this->latency = 1 * chans * math_latency;
      break;
   case SHADER_OPCODE_RSQ:
      this->latency = 2 * chans * math_latency;
      break;
   case SHADER_OPCODE_INT_QUOTIENT:
   case SHADER_OPCODE_SQRT:
   case SHADER_OPCODE_LOG2:
      /* full precision log. partial is 2. */
      this->latency = 3 * chans * math_latency;
      break;
   case SHADER_OPCODE_INT_REMAINDER:
   case SHADER_OPCODE_EXP2:
      /* full precision. partial is 3, same throughput. */
      this->latency = 4 * chans * math_latency;
      break;
   case SHADER_OPCODE_POW:
      this->latency = 8 * chans * math_latency;
      break;
   case SHADER_OPCODE_SIN:
   case SHADER_OPCODE_COS:
      /* minimum latency, max is 12 rounds. */
      this->latency = 5 * chans * math_latency;
      break;
   default:
      this->latency = 2;
      break;
   }
}

void
schedule_node::set_latency_gfx7(bool is_haswell)
{
   switch (inst->opcode) {
   case BRW_OPCODE_MAD:
      /* 2 cycles
       * (since the last two src operands are in different register banks):
       * mad(8) g4<1>F g2.2<4,4,1>F.x g2<4,4,1>F.x g3.1<4,4,1>F.x { align16 WE_normal 1Q };
       *
       * 3 cycles on IVB, 4 on HSW
       * (since the last two src operands are in the same register bank):
       * mad(8) g4<1>F g2.2<4,4,1>F.x g2<4,4,1>F.x g2.1<4,4,1>F.x { align16 WE_normal 1Q };
       *
       * 18 cycles on IVB, 16 on HSW
       * (since the last two src operands are in different register banks):
       * mad(8) g4<1>F g2.2<4,4,1>F.x g2<4,4,1>F.x g3.1<4,4,1>F.x { align16 WE_normal 1Q };
       * mov(8) null g4<4,5,1>F { align16 WE_normal 1Q };
       *
       * 20 cycles on IVB, 18 on HSW
       * (since the last two src operands are in the same register bank):
       * mad(8) g4<1>F g2.2<4,4,1>F.x g2<4,4,1>F.x g2.1<4,4,1>F.x { align16 WE_normal 1Q };
       * mov(8) null g4<4,4,1>F { align16 WE_normal 1Q };
       */

      /* Our register allocator doesn't know about register banks, so use the
       * higher latency.
       */
      latency = is_haswell ? 16 : 18;
      break;

   case BRW_OPCODE_LRP:
      /* 2 cycles
       * (since the last two src operands are in different register banks):
       * lrp(8) g4<1>F g2.2<4,4,1>F.x g2<4,4,1>F.x g3.1<4,4,1>F.x { align16 WE_normal 1Q };
       *
       * 3 cycles on IVB, 4 on HSW
       * (since the last two src operands are in the same register bank):
       * lrp(8) g4<1>F g2.2<4,4,1>F.x g2<4,4,1>F.x g2.1<4,4,1>F.x { align16 WE_normal 1Q };
       *
       * 16 cycles on IVB, 14 on HSW
       * (since the last two src operands are in different register banks):
       * lrp(8) g4<1>F g2.2<4,4,1>F.x g2<4,4,1>F.x g3.1<4,4,1>F.x { align16 WE_normal 1Q };
       * mov(8) null g4<4,4,1>F { align16 WE_normal 1Q };
       *
       * 16 cycles
       * (since the last two src operands are in the same register bank):
       * lrp(8) g4<1>F g2.2<4,4,1>F.x g2<4,4,1>F.x g2.1<4,4,1>F.x { align16 WE_normal 1Q };
       * mov(8) null g4<4,4,1>F { align16 WE_normal 1Q };
       */

      /* Our register allocator doesn't know about register banks, so use the
       * higher latency.
       */
      latency = 14;
      break;

   case SHADER_OPCODE_RCP:
   case SHADER_OPCODE_RSQ:
   case SHADER_OPCODE_SQRT:
   case SHADER_OPCODE_LOG2:
   case SHADER_OPCODE_EXP2:
   case SHADER_OPCODE_SIN:
   case SHADER_OPCODE_COS:
      /* 2 cycles:
       * math inv(8) g4<1>F g2<0,1,0>F null { align1 WE_normal 1Q };
       *
       * 18 cycles:
       * math inv(8) g4<1>F g2<0,1,0>F null { align1 WE_normal 1Q };
       * mov(8) null g4<8,8,1>F { align1 WE_normal 1Q };
       *
       * Same for exp2, log2, rsq, sqrt, sin, cos.
       */
      latency = is_haswell ? 14 : 16;
      break;

   case SHADER_OPCODE_POW:
      /* 2 cycles:
       * math pow(8) g4<1>F g2<0,1,0>F g2.1<0,1,0>F { align1 WE_normal 1Q };
       *
       * 26 cycles:
       * math pow(8) g4<1>F g2<0,1,0>F g2.1<0,1,0>F { align1 WE_normal 1Q };
       * mov(8) null g4<8,8,1>F { align1 WE_normal 1Q };
       */
      latency = is_haswell ? 22 : 24;
      break;

   case SHADER_OPCODE_TEX:
   case SHADER_OPCODE_TXD:
   case SHADER_OPCODE_TXF:
   case SHADER_OPCODE_TXF_LZ:
   case SHADER_OPCODE_TXL:
   case SHADER_OPCODE_TXL_LZ:
      /* 18 cycles:
       * mov(8) g115<1>F 0F { align1 WE_normal 1Q };
       * mov(8) g114<1>F 0F { align1 WE_normal 1Q };
       * send(8) g4<1>UW g114<8,8,1>F
       *   sampler (10, 0, 0, 1) mlen 2 rlen 4 { align1 WE_normal 1Q };
       *
       * 697 +/-49 cycles (min 610, n=26):
       * mov(8) g115<1>F 0F { align1 WE_normal 1Q };
       * mov(8) g114<1>F 0F { align1 WE_normal 1Q };
       * send(8) g4<1>UW g114<8,8,1>F
       *   sampler (10, 0, 0, 1) mlen 2 rlen 4 { align1 WE_normal 1Q };
       * mov(8) null g4<8,8,1>F { align1 WE_normal 1Q };
       *
       * So the latency on our first texture load of the batchbuffer takes
       * ~700 cycles, since the caches are cold at that point.
       *
       * 840 +/- 92 cycles (min 720, n=25):
       * mov(8) g115<1>F 0F { align1 WE_normal 1Q };
       * mov(8) g114<1>F 0F { align1 WE_normal 1Q };
       * send(8) g4<1>UW g114<8,8,1>F
       *   sampler (10, 0, 0, 1) mlen 2 rlen 4 { align1 WE_normal 1Q };
       * mov(8) null g4<8,8,1>F { align1 WE_normal 1Q };
       * send(8) g4<1>UW g114<8,8,1>F
       *   sampler (10, 0, 0, 1) mlen 2 rlen 4 { align1 WE_normal 1Q };
       * mov(8) null g4<8,8,1>F { align1 WE_normal 1Q };
       *
       * On the second load, it takes just an extra ~140 cycles, and after
       * accounting for the 14 cycles of the MOV's latency, that makes ~130.
       *
       * 683 +/- 49 cycles (min = 602, n=47):
       * mov(8) g115<1>F 0F { align1 WE_normal 1Q };
       * mov(8) g114<1>F 0F { align1 WE_normal 1Q };
       * send(8) g4<1>UW g114<8,8,1>F
       *   sampler (10, 0, 0, 1) mlen 2 rlen 4 { align1 WE_normal 1Q };
       * send(8) g50<1>UW g114<8,8,1>F
       *   sampler (10, 0, 0, 1) mlen 2 rlen 4 { align1 WE_normal 1Q };
       * mov(8) null g4<8,8,1>F { align1 WE_normal 1Q };
       *
       * The unit appears to be pipelined, since this matches up with the
       * cache-cold case, despite there being two loads here. If you replace
       * the g4 in the MOV to null with g50, it's still 693 +/- 52 (n=39).
       *
       * So, take some number between the cache-hot 140 cycles and the
       * cache-cold 700 cycles. No particular tuning was done on this.
       *
       * I haven't done significant testing of the non-TEX opcodes. TXL at
       * least looked about the same as TEX.
       */
      latency = 200;
      break;

   case SHADER_OPCODE_TXS:
      /* Testing textureSize(sampler2D, 0), one load was 420 +/- 41
       * cycles (n=15):
       * mov(8) g114<1>UD 0D { align1 WE_normal 1Q };
       * send(8) g6<1>UW g114<8,8,1>F
       *   sampler (10, 0, 10, 1) mlen 1 rlen 4 { align1 WE_normal 1Q };
       * mov(16) g6<1>F g6<8,8,1>D { align1 WE_normal 1Q };
       *
       *
       * Two loads was 535 +/- 30 cycles (n=19):
       * mov(16) g114<1>UD 0D { align1 WE_normal 1H };
       * send(16) g6<1>UW g114<8,8,1>F
       *   sampler (10, 0, 10, 2) mlen 2 rlen 8 { align1 WE_normal 1H };
       * mov(16) g114<1>UD 0D { align1 WE_normal 1H };
       * mov(16) g6<1>F g6<8,8,1>D { align1 WE_normal 1H };
       * send(16) g8<1>UW g114<8,8,1>F
       *   sampler (10, 0, 10, 2) mlen 2 rlen 8 { align1 WE_normal 1H };
       * mov(16) g8<1>F g8<8,8,1>D { align1 WE_normal 1H };
       * add(16) g6<1>F g6<8,8,1>F g8<8,8,1>F { align1 WE_normal 1H };
       *
       * Since the only caches that should matter are just the
       * instruction/state cache containing the surface state, assume that we
       * always have hot caches.
       */
      latency = 100;
      break;

   case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GFX4:
   case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
   case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD_GFX7:
   case VS_OPCODE_PULL_CONSTANT_LOAD:
      /* testing using varying-index pull constants:
       *
       * 16 cycles:
       * mov(8) g4<1>D g2.1<0,1,0>F { align1 WE_normal 1Q };
       * send(8) g4<1>F g4<8,8,1>D
       *   data (9, 2, 3) mlen 1 rlen 1 { align1 WE_normal 1Q };
       *
       * ~480 cycles:
       * mov(8) g4<1>D g2.1<0,1,0>F { align1 WE_normal 1Q };
       * send(8) g4<1>F g4<8,8,1>D
       *   data (9, 2, 3) mlen 1 rlen 1 { align1 WE_normal 1Q };
       * mov(8) null g4<8,8,1>F { align1 WE_normal 1Q };
       *
       * ~620 cycles:
       * mov(8) g4<1>D g2.1<0,1,0>F { align1 WE_normal 1Q };
       * send(8) g4<1>F g4<8,8,1>D
       *   data (9, 2, 3) mlen 1 rlen 1 { align1 WE_normal 1Q };
       * mov(8) null g4<8,8,1>F { align1 WE_normal 1Q };
       * send(8) g4<1>F g4<8,8,1>D
       *   data (9, 2, 3) mlen 1 rlen 1 { align1 WE_normal 1Q };
       * mov(8) null g4<8,8,1>F { align1 WE_normal 1Q };
       *
       * So, if it's cache-hot, it's about 140. If it's cache cold, it's
       * about 460. We expect to mostly be cache hot, so pick something more
       * in that direction.
       */
      latency = 200;
      break;

   case SHADER_OPCODE_GFX7_SCRATCH_READ:
      /* Testing a load from offset 0, that had been previously written:
       *
       * send(8) g114<1>UW g0<8,8,1>F data (0, 0, 0) mlen 1 rlen 1 { align1 WE_normal 1Q };
       * mov(8) null g114<8,8,1>F { align1 WE_normal 1Q };
       *
       * The cycles spent seemed to be grouped around 40-50 (as low as 38),
       * then around 140. Presumably this is cache hit vs miss.
       */
      latency = 50;
      break;

   case VEC4_OPCODE_UNTYPED_ATOMIC:
      /* See GFX7_DATAPORT_DC_UNTYPED_ATOMIC_OP */
      latency = 14000;
      break;

   case VEC4_OPCODE_UNTYPED_SURFACE_READ:
   case VEC4_OPCODE_UNTYPED_SURFACE_WRITE:
      /* See also GFX7_DATAPORT_DC_UNTYPED_SURFACE_READ */
      latency = is_haswell ? 300 : 600;
      break;

   case SHADER_OPCODE_SEND:
      switch (inst->sfid) {
      case BRW_SFID_SAMPLER: {
         unsigned msg_type = (inst->desc >> 12) & 0x1f;
         switch (msg_type) {
         case GFX5_SAMPLER_MESSAGE_SAMPLE_RESINFO:
         case GFX6_SAMPLER_MESSAGE_SAMPLE_SAMPLEINFO:
            /* See also SHADER_OPCODE_TXS */
            latency = 100;
            break;

         default:
            /* See also SHADER_OPCODE_TEX */
            latency = 200;
            break;
         }
         break;
      }

      case GFX6_SFID_DATAPORT_RENDER_CACHE:
         switch (brw_fb_desc_msg_type(isa->devinfo, inst->desc)) {
         case GFX7_DATAPORT_RC_TYPED_SURFACE_WRITE:
         case GFX7_DATAPORT_RC_TYPED_SURFACE_READ:
            /* See also SHADER_OPCODE_TYPED_SURFACE_READ */
            assert(!is_haswell);
            latency = 600;
            break;

         case GFX7_DATAPORT_RC_TYPED_ATOMIC_OP:
            /* See also SHADER_OPCODE_TYPED_ATOMIC */
            assert(!is_haswell);
            latency = 14000;
            break;

         case GFX6_DATAPORT_WRITE_MESSAGE_RENDER_TARGET_WRITE:
            /* completely fabricated number */
            latency = 600;
            break;

         default:
            unreachable("Unknown render cache message");
         }
         break;

      case GFX7_SFID_DATAPORT_DATA_CACHE:
         switch ((inst->desc >> 14) & 0x1f) {
         case BRW_DATAPORT_READ_MESSAGE_OWORD_BLOCK_READ:
         case GFX7_DATAPORT_DC_UNALIGNED_OWORD_BLOCK_READ:
         case GFX6_DATAPORT_WRITE_MESSAGE_OWORD_BLOCK_WRITE:
            /* We have no data for this but assume it's a little faster than
             * untyped surface read/write.
             */
            latency = 200;
            break;

         case GFX7_DATAPORT_DC_DWORD_SCATTERED_READ:
         case GFX6_DATAPORT_WRITE_MESSAGE_DWORD_SCATTERED_WRITE:
         case HSW_DATAPORT_DC_PORT0_BYTE_SCATTERED_READ:
         case HSW_DATAPORT_DC_PORT0_BYTE_SCATTERED_WRITE:
            /* We have no data for this but assume it's roughly the same as
             * untyped surface read/write.
             */
            latency = 300;
            break;

         case GFX7_DATAPORT_DC_UNTYPED_SURFACE_READ:
         case GFX7_DATAPORT_DC_UNTYPED_SURFACE_WRITE:
            /* Test code:
             * mov(8)  g112<1>UD  0x00000000UD  { align1 WE_all 1Q };
             * mov(1)  g112.7<1>UD g1.7<0,1,0>UD { align1 WE_all };
             * mov(8)  g113<1>UD  0x00000000UD  { align1 WE_normal 1Q };
             * send(8) g4<1>UD g112<8,8,1>UD
             *   data (38, 6, 5) mlen 2 rlen 1 { align1 WE_normal 1Q };
             * .
             * . [repeats 8 times]
             * .
             * mov(8)  g112<1>UD  0x00000000UD  { align1 WE_all 1Q };
             * mov(1)  g112.7<1>UD g1.7<0,1,0>UD { align1 WE_all };
             * mov(8)  g113<1>UD  0x00000000UD  { align1 WE_normal 1Q };
             * send(8) g4<1>UD g112<8,8,1>UD
             *   data (38, 6, 5) mlen 2 rlen 1 { align1 WE_normal 1Q };
             *
             * Running it 100 times as fragment shader on a 128x128 quad
             * gives an average latency of 583 cycles per surface read,
             * standard deviation 0.9%.
             */
            assert(!is_haswell);
            latency = 600;
            break;

         case GFX7_DATAPORT_DC_UNTYPED_ATOMIC_OP:
            /* Test code:
             * mov(8)  g112<1>ud  0x00000000ud  { align1 WE_all 1Q };
             * mov(1)  g112.7<1>ud g1.7<0,1,0>ud { align1 WE_all };
             * mov(8)  g113<1>ud  0x00000000ud  { align1 WE_normal 1Q };
             * send(8) g4<1>ud g112<8,8,1>ud
             *   data (38, 5, 6) mlen 2 rlen 1 { align1 WE_normal 1Q };
             *
             * Running it 100 times as fragment shader on a 128x128 quad
             * gives an average latency of 13867 cycles per atomic op,
             * standard deviation 3%. Note that this is a rather
             * pessimistic estimate; the actual latency in cases with few
             * collisions between threads and favorable pipelining has been
             * seen to be reduced by a factor of 100.
             */
            assert(!is_haswell);
            latency = 14000;
            break;

         default:
            unreachable("Unknown data cache message");
         }
         break;

      case HSW_SFID_DATAPORT_DATA_CACHE_1:
         switch ((inst->desc >> 14) & 0x1f) {
         case HSW_DATAPORT_DC_PORT1_UNTYPED_SURFACE_READ:
         case HSW_DATAPORT_DC_PORT1_UNTYPED_SURFACE_WRITE:
         case HSW_DATAPORT_DC_PORT1_TYPED_SURFACE_READ:
         case HSW_DATAPORT_DC_PORT1_TYPED_SURFACE_WRITE:
         case GFX8_DATAPORT_DC_PORT1_A64_UNTYPED_SURFACE_WRITE:
         case GFX8_DATAPORT_DC_PORT1_A64_UNTYPED_SURFACE_READ:
         case GFX8_DATAPORT_DC_PORT1_A64_SCATTERED_WRITE:
         case GFX9_DATAPORT_DC_PORT1_A64_SCATTERED_READ:
         case GFX9_DATAPORT_DC_PORT1_A64_OWORD_BLOCK_READ:
         case GFX9_DATAPORT_DC_PORT1_A64_OWORD_BLOCK_WRITE:
            /* See also GFX7_DATAPORT_DC_UNTYPED_SURFACE_READ */
            latency = 300;
            break;

         case HSW_DATAPORT_DC_PORT1_UNTYPED_ATOMIC_OP:
         case HSW_DATAPORT_DC_PORT1_UNTYPED_ATOMIC_OP_SIMD4X2:
         case HSW_DATAPORT_DC_PORT1_TYPED_ATOMIC_OP_SIMD4X2:
         case HSW_DATAPORT_DC_PORT1_TYPED_ATOMIC_OP:
         case GFX9_DATAPORT_DC_PORT1_UNTYPED_ATOMIC_FLOAT_OP:
         case GFX8_DATAPORT_DC_PORT1_A64_UNTYPED_ATOMIC_OP:
         case GFX9_DATAPORT_DC_PORT1_A64_UNTYPED_ATOMIC_FLOAT_OP:
         case GFX12_DATAPORT_DC_PORT1_A64_UNTYPED_ATOMIC_HALF_INT_OP:
         case GFX12_DATAPORT_DC_PORT1_A64_UNTYPED_ATOMIC_HALF_FLOAT_OP:
            /* See also GFX7_DATAPORT_DC_UNTYPED_ATOMIC_OP */
            latency = 14000;
            break;

         default:
            unreachable("Unknown data cache message");
         }
         break;

      case GFX12_SFID_UGM:
      case GFX12_SFID_TGM:
      case GFX12_SFID_SLM:
         switch (lsc_msg_desc_opcode(isa->devinfo, inst->desc)) {
         case LSC_OP_LOAD:
         case LSC_OP_STORE:
         case LSC_OP_LOAD_CMASK:
         case LSC_OP_STORE_CMASK:
            latency = 300;
            break;
         case LSC_OP_FENCE:
         case LSC_OP_ATOMIC_INC:
         case LSC_OP_ATOMIC_DEC:
         case LSC_OP_ATOMIC_LOAD:
         case LSC_OP_ATOMIC_STORE:
         case LSC_OP_ATOMIC_ADD:
         case LSC_OP_ATOMIC_SUB:
         case LSC_OP_ATOMIC_MIN:
         case LSC_OP_ATOMIC_MAX:
         case LSC_OP_ATOMIC_UMIN:
         case LSC_OP_ATOMIC_UMAX:
         case LSC_OP_ATOMIC_CMPXCHG:
         case LSC_OP_ATOMIC_FADD:
         case LSC_OP_ATOMIC_FSUB:
         case LSC_OP_ATOMIC_FMIN:
         case LSC_OP_ATOMIC_FMAX:
         case LSC_OP_ATOMIC_FCMPXCHG:
         case LSC_OP_ATOMIC_AND:
         case LSC_OP_ATOMIC_OR:
         case LSC_OP_ATOMIC_XOR:
            latency = 1400;
            break;
         default:
            unreachable("unsupported new data port message instruction");
         }
         break;

      case GEN_RT_SFID_BINDLESS_THREAD_DISPATCH:
      case GEN_RT_SFID_RAY_TRACE_ACCELERATOR:
         /* TODO.
          *
          * We'll assume for the moment that this is pretty quick as it
          * doesn't actually return any data.
          */
         latency = 200;
         break;

      case BRW_SFID_URB:
         latency = 200;
         break;

      default:
         unreachable("Unknown SFID");
      }
      break;

   default:
      /* 2 cycles:
       * mul(8) g4<1>F g2<0,1,0>F 0.5F { align1 WE_normal 1Q };
       *
       * 16 cycles:
       * mul(8) g4<1>F g2<0,1,0>F 0.5F { align1 WE_normal 1Q };
       * mov(8) null g4<8,8,1>F { align1 WE_normal 1Q };
       */
      latency = 14;
      break;
   }
}

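/* Note that the latencies above are only consulted for post-register-
 * allocation scheduling: the schedule_node constructor below forces every
 * latency to 1 before register allocation, so the pressure-oriented
 * heuristics dominate in that case instead.
 */
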
class instruction_scheduler {
public:
   instruction_scheduler(const backend_shader *s, int grf_count,
                         unsigned hw_reg_count, int block_count,
                         instruction_scheduler_mode mode):
      bs(s)
   {
      this->mem_ctx = ralloc_context(NULL);
      this->grf_count = grf_count;
      this->hw_reg_count = hw_reg_count;
      this->instructions.make_empty();
      this->post_reg_alloc = (mode == SCHEDULE_POST);
      this->mode = mode;
      this->reg_pressure = 0;
      this->block_idx = 0;
      if (!post_reg_alloc) {
         this->reg_pressure_in = rzalloc_array(mem_ctx, int, block_count);

         this->livein = ralloc_array(mem_ctx, BITSET_WORD *, block_count);
         for (int i = 0; i < block_count; i++)
            this->livein[i] = rzalloc_array(mem_ctx, BITSET_WORD,
                                            BITSET_WORDS(grf_count));

         this->liveout = ralloc_array(mem_ctx, BITSET_WORD *, block_count);
         for (int i = 0; i < block_count; i++)
            this->liveout[i] = rzalloc_array(mem_ctx, BITSET_WORD,
                                             BITSET_WORDS(grf_count));

         this->hw_liveout = ralloc_array(mem_ctx, BITSET_WORD *, block_count);
         for (int i = 0; i < block_count; i++)
            this->hw_liveout[i] = rzalloc_array(mem_ctx, BITSET_WORD,
                                                BITSET_WORDS(hw_reg_count));

         this->written = rzalloc_array(mem_ctx, bool, grf_count);

         this->reads_remaining = rzalloc_array(mem_ctx, int, grf_count);

         this->hw_reads_remaining = rzalloc_array(mem_ctx, int, hw_reg_count);
      } else {
         this->reg_pressure_in = NULL;
         this->livein = NULL;
         this->liveout = NULL;
         this->hw_liveout = NULL;
         this->written = NULL;
         this->reads_remaining = NULL;
         this->hw_reads_remaining = NULL;
      }
   }

   ~instruction_scheduler()
   {
      ralloc_free(this->mem_ctx);
   }
   void add_barrier_deps(schedule_node *n);
   void add_dep(schedule_node *before, schedule_node *after, int latency);
   void add_dep(schedule_node *before, schedule_node *after);

   void run(cfg_t *cfg);
   void add_insts_from_block(bblock_t *block);
   void compute_delays();
   void compute_exits();
   virtual void calculate_deps() = 0;
   virtual schedule_node *choose_instruction_to_schedule() = 0;

   /**
    * Returns how many cycles it takes the instruction to issue.
    *
    * Instructions in gen hardware are handled one simd4 vector at a time,
    * with 1 cycle per vector dispatched. Thus SIMD8 pixel shaders take 2
    * cycles to dispatch and SIMD16 (compressed) instructions take 4.
    */
   virtual int issue_time(backend_instruction *inst) = 0;

   virtual void count_reads_remaining(backend_instruction *inst) = 0;
   virtual void setup_liveness(cfg_t *cfg) = 0;
   virtual void update_register_pressure(backend_instruction *inst) = 0;
   virtual int get_register_pressure_benefit(backend_instruction *inst) = 0;

   void schedule_instructions(bblock_t *block);

   void *mem_ctx;

   bool post_reg_alloc;
   int grf_count;
   unsigned hw_reg_count;
   int reg_pressure;
   int block_idx;
   exec_list instructions;
   const backend_shader *bs;

   instruction_scheduler_mode mode;

   /*
    * The register pressure at the beginning of each basic block.
    */

   int *reg_pressure_in;

   /*
    * The virtual GRFs whose range overlaps the beginning of each basic block.
    */

   BITSET_WORD **livein;

   /*
    * The virtual GRFs whose range overlaps the end of each basic block.
    */

   BITSET_WORD **liveout;

   /*
    * The hardware GRFs whose range overlaps the end of each basic block.
    */

   BITSET_WORD **hw_liveout;

   /*
    * Whether we've scheduled a write for this virtual GRF yet.
    */

   bool *written;

   /*
    * How many reads we haven't scheduled for this virtual GRF yet.
    */

   int *reads_remaining;

   /*
    * How many reads we haven't scheduled for this hardware GRF yet.
    */

   int *hw_reads_remaining;
};

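/* Typical driver usage mirrors fs_visitor::schedule_instructions() at the
 * bottom of this file: construct the subclass for the backend at hand and
 * hand it the CFG, e.g.
 *
 *    fs_instruction_scheduler sched(this, grf_count, first_non_payload_grf,
 *                                   cfg->num_blocks, mode);
 *    sched.run(cfg);
 */
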
class fs_instruction_scheduler : public instruction_scheduler
{
public:
   fs_instruction_scheduler(const fs_visitor *v, int grf_count, int hw_reg_count,
                            int block_count,
                            instruction_scheduler_mode mode);
   void calculate_deps();
   bool is_compressed(const fs_inst *inst);
   schedule_node *choose_instruction_to_schedule();
   int issue_time(backend_instruction *inst);
   const fs_visitor *v;

   void count_reads_remaining(backend_instruction *inst);
   void setup_liveness(cfg_t *cfg);
   void update_register_pressure(backend_instruction *inst);
   int get_register_pressure_benefit(backend_instruction *inst);
};

fs_instruction_scheduler::fs_instruction_scheduler(const fs_visitor *v,
                                                   int grf_count, int hw_reg_count,
                                                   int block_count,
                                                   instruction_scheduler_mode mode)
   : instruction_scheduler(v, grf_count, hw_reg_count, block_count, mode),
     v(v)
{
}

static bool
is_src_duplicate(fs_inst *inst, int src)
{
   for (int i = 0; i < src; i++)
      if (inst->src[i].equals(inst->src[src]))
         return true;

   return false;
}

void
fs_instruction_scheduler::count_reads_remaining(backend_instruction *be)
{
   fs_inst *inst = (fs_inst *)be;

   if (!reads_remaining)
      return;

   for (int i = 0; i < inst->sources; i++) {
      if (is_src_duplicate(inst, i))
         continue;

      if (inst->src[i].file == VGRF) {
         reads_remaining[inst->src[i].nr]++;
      } else if (inst->src[i].file == FIXED_GRF) {
         if (inst->src[i].nr >= hw_reg_count)
            continue;

         for (unsigned j = 0; j < regs_read(inst, i); j++)
            hw_reads_remaining[inst->src[i].nr + j]++;
      }
   }
}

void
fs_instruction_scheduler::setup_liveness(cfg_t *cfg)
{
   const fs_live_variables &live = v->live_analysis.require();

   /* First, compute liveness on a per-GRF level using the in/out sets from
    * liveness calculation.
    */
   for (int block = 0; block < cfg->num_blocks; block++) {
      for (int i = 0; i < live.num_vars; i++) {
         if (BITSET_TEST(live.block_data[block].livein, i)) {
            int vgrf = live.vgrf_from_var[i];
            if (!BITSET_TEST(livein[block], vgrf)) {
               reg_pressure_in[block] += v->alloc.sizes[vgrf];
               BITSET_SET(livein[block], vgrf);
            }
         }

         if (BITSET_TEST(live.block_data[block].liveout, i))
            BITSET_SET(liveout[block], live.vgrf_from_var[i]);
      }
   }

   /* Now, extend the live in/live out sets for when a range crosses a block
    * boundary, which matches what our register allocator/interference code
    * does to account for force_writemask_all and incompatible exec_masks.
    */
   for (int block = 0; block < cfg->num_blocks - 1; block++) {
      for (int i = 0; i < grf_count; i++) {
         if (live.vgrf_start[i] <= cfg->blocks[block]->end_ip &&
             live.vgrf_end[i] >= cfg->blocks[block + 1]->start_ip) {
            if (!BITSET_TEST(livein[block + 1], i)) {
               reg_pressure_in[block + 1] += v->alloc.sizes[i];
               BITSET_SET(livein[block + 1], i);
            }

            BITSET_SET(liveout[block], i);
         }
      }
   }

   int payload_last_use_ip[hw_reg_count];
   v->calculate_payload_ranges(hw_reg_count, payload_last_use_ip);

   for (unsigned i = 0; i < hw_reg_count; i++) {
      if (payload_last_use_ip[i] == -1)
         continue;

      for (int block = 0; block < cfg->num_blocks; block++) {
         if (cfg->blocks[block]->start_ip <= payload_last_use_ip[i])
            reg_pressure_in[block]++;

         if (cfg->blocks[block]->end_ip <= payload_last_use_ip[i])
            BITSET_SET(hw_liveout[block], i);
      }
   }
}

void
fs_instruction_scheduler::update_register_pressure(backend_instruction *be)
{
   fs_inst *inst = (fs_inst *)be;

   if (!reads_remaining)
      return;

   if (inst->dst.file == VGRF) {
      written[inst->dst.nr] = true;
   }

   for (int i = 0; i < inst->sources; i++) {
      if (is_src_duplicate(inst, i))
         continue;

      if (inst->src[i].file == VGRF) {
         reads_remaining[inst->src[i].nr]--;
      } else if (inst->src[i].file == FIXED_GRF &&
                 inst->src[i].nr < hw_reg_count) {
         for (unsigned off = 0; off < regs_read(inst, i); off++)
            hw_reads_remaining[inst->src[i].nr + off]--;
      }
   }
}

int
fs_instruction_scheduler::get_register_pressure_benefit(backend_instruction *be)
{
   fs_inst *inst = (fs_inst *)be;
   int benefit = 0;

   if (inst->dst.file == VGRF) {
      if (!BITSET_TEST(livein[block_idx], inst->dst.nr) &&
          !written[inst->dst.nr])
         benefit -= v->alloc.sizes[inst->dst.nr];
   }

   for (int i = 0; i < inst->sources; i++) {
      if (is_src_duplicate(inst, i))
         continue;

      if (inst->src[i].file == VGRF &&
          !BITSET_TEST(liveout[block_idx], inst->src[i].nr) &&
          reads_remaining[inst->src[i].nr] == 1)
         benefit += v->alloc.sizes[inst->src[i].nr];

      if (inst->src[i].file == FIXED_GRF &&
          inst->src[i].nr < hw_reg_count) {
         for (unsigned off = 0; off < regs_read(inst, i); off++) {
            int reg = inst->src[i].nr + off;
            if (!BITSET_TEST(hw_liveout[block_idx], reg) &&
                hw_reads_remaining[reg] == 1) {
               benefit++;
            }
         }
      }
   }

   return benefit;
}

class vec4_instruction_scheduler : public instruction_scheduler
{
public:
   vec4_instruction_scheduler(const vec4_visitor *v, int grf_count);
   void calculate_deps();
   schedule_node *choose_instruction_to_schedule();
   int issue_time(backend_instruction *inst);
   const vec4_visitor *v;

   void count_reads_remaining(backend_instruction *inst);
   void setup_liveness(cfg_t *cfg);
   void update_register_pressure(backend_instruction *inst);
   int get_register_pressure_benefit(backend_instruction *inst);
};

vec4_instruction_scheduler::vec4_instruction_scheduler(const vec4_visitor *v,
                                                       int grf_count)
   : instruction_scheduler(v, grf_count, 0, 0, SCHEDULE_POST),
     v(v)
{
}

void
vec4_instruction_scheduler::count_reads_remaining(backend_instruction *)
{
}

void
vec4_instruction_scheduler::setup_liveness(cfg_t *)
{
}

void
vec4_instruction_scheduler::update_register_pressure(backend_instruction *)
{
}

int
vec4_instruction_scheduler::get_register_pressure_benefit(backend_instruction *)
{
   return 0;
}

schedule_node::schedule_node(backend_instruction *inst,
                             instruction_scheduler *sched)
{
   const struct intel_device_info *devinfo = sched->bs->devinfo;

   this->isa = &sched->bs->compiler->isa;
   this->inst = inst;
   this->child_array_size = 0;
   this->children = NULL;
   this->child_latency = NULL;
   this->child_count = 0;
   this->parent_count = 0;
   this->unblocked_time = 0;
   this->cand_generation = 0;
   this->delay = 0;
   this->exit = NULL;

   /* We can't measure Gfx6 timings directly but expect them to be much
    * closer to Gfx7 than Gfx4.
    */
   if (!sched->post_reg_alloc)
      this->latency = 1;
   else if (devinfo->ver >= 6)
      set_latency_gfx7(devinfo->verx10 == 75);
   else
      set_latency_gfx4();
}

void
instruction_scheduler::add_insts_from_block(bblock_t *block)
{
   foreach_inst_in_block(backend_instruction, inst, block) {
      schedule_node *n = new(mem_ctx) schedule_node(inst, this);

      instructions.push_tail(n);
   }
}

/** Computation of the delay member of each node. */
void
instruction_scheduler::compute_delays()
{
   foreach_in_list_reverse(schedule_node, n, &instructions) {
      if (!n->child_count) {
         n->delay = issue_time(n->inst);
      } else {
         for (int i = 0; i < n->child_count; i++) {
            assert(n->children[i]->delay);
            n->delay = MAX2(n->delay, n->latency + n->children[i]->delay);
         }
      }
   }
}

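/** Computation of the exit member of each node (see schedule_node::exit),
 * based on an optimistic lower bound of each node's unblocked time.
 */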
void
instruction_scheduler::compute_exits()
{
   /* Calculate a lower bound of the scheduling time of each node in the
    * graph. This is analogous to the node's critical path but calculated
    * from the top instead of from the bottom of the block.
    */
   foreach_in_list(schedule_node, n, &instructions) {
      for (int i = 0; i < n->child_count; i++) {
         n->children[i]->unblocked_time =
            MAX2(n->children[i]->unblocked_time,
                 n->unblocked_time + issue_time(n->inst) + n->child_latency[i]);
      }
   }

   /* Calculate the exit of each node by induction based on the exit nodes of
    * its children. The preferred exit of a node is the one among the exit
    * nodes of its children which can be unblocked first according to the
    * optimistic unblocked time estimate calculated above.
    */
   foreach_in_list_reverse(schedule_node, n, &instructions) {
      n->exit = (n->inst->opcode == BRW_OPCODE_HALT ? n : NULL);

      for (int i = 0; i < n->child_count; i++) {
         if (exit_unblocked_time(n->children[i]) < exit_unblocked_time(n))
            n->exit = n->children[i]->exit;
      }
   }
}

/**
 * Add a dependency between two instruction nodes.
 *
 * The @after node will be scheduled after @before. We will try to
 * schedule it @latency cycles after @before, but no guarantees there.
 */
void
instruction_scheduler::add_dep(schedule_node *before, schedule_node *after,
                               int latency)
{
   if (!before || !after)
      return;

   assert(before != after);

   for (int i = 0; i < before->child_count; i++) {
      if (before->children[i] == after) {
         before->child_latency[i] = MAX2(before->child_latency[i], latency);
         return;
      }
   }

   if (before->child_array_size <= before->child_count) {
      if (before->child_array_size < 16)
         before->child_array_size = 16;
      else
         before->child_array_size *= 2;

      before->children = reralloc(mem_ctx, before->children,
                                  schedule_node *,
                                  before->child_array_size);
      before->child_latency = reralloc(mem_ctx, before->child_latency,
                                       int, before->child_array_size);
   }

   before->children[before->child_count] = after;
   before->child_latency[before->child_count] = latency;
   before->child_count++;
   after->parent_count++;
}

void
instruction_scheduler::add_dep(schedule_node *before, schedule_node *after)
{
   if (!before)
      return;

   add_dep(before, after, before->latency);
}

static bool
is_scheduling_barrier(const backend_instruction *inst)
{
   return inst->opcode == SHADER_OPCODE_HALT_TARGET ||
          inst->is_control_flow() ||
          inst->has_side_effects();
}

/**
 * Sometimes we really want this node to execute after everything that
 * was before it and before everything that followed it. This adds
 * the deps to do so.
 */
void
instruction_scheduler::add_barrier_deps(schedule_node *n)
{
   schedule_node *prev = (schedule_node *)n->prev;
   schedule_node *next = (schedule_node *)n->next;

   if (prev) {
      while (!prev->is_head_sentinel()) {
         add_dep(prev, n, 0);
         if (is_scheduling_barrier(prev->inst))
            break;
         prev = (schedule_node *)prev->prev;
      }
   }

   if (next) {
      while (!next->is_tail_sentinel()) {
         add_dep(n, next, 0);
         if (is_scheduling_barrier(next->inst))
            break;
         next = (schedule_node *)next->next;
      }
   }
}

/* instruction scheduling needs to be aware of when an MRF write
 * actually writes 2 MRFs.
 */
bool
fs_instruction_scheduler::is_compressed(const fs_inst *inst)
{
   return inst->exec_size == 16;
}

void
fs_instruction_scheduler::calculate_deps()
{
   /* Pre-register-allocation, this tracks the last write per VGRF offset.
    * After register allocation, reg_offsets are gone and we track individual
    * GRF registers.
    */
   schedule_node **last_grf_write;
   schedule_node *last_mrf_write[BRW_MAX_MRF(v->devinfo->ver)];
   schedule_node *last_conditional_mod[8] = {};
   schedule_node *last_accumulator_write = NULL;
   /* Fixed HW registers are assumed to be separate from the virtual
    * GRFs, so they can be tracked separately. We don't really write
    * to fixed GRFs much, so don't bother tracking them on a more
    * granular level.
    */
   schedule_node *last_fixed_grf_write = NULL;

   last_grf_write = (schedule_node **)calloc(sizeof(schedule_node *), grf_count * 16);
   memset(last_mrf_write, 0, sizeof(last_mrf_write));

   /* top-to-bottom dependencies: RAW and WAW. */
   foreach_in_list(schedule_node, n, &instructions) {
      fs_inst *inst = (fs_inst *)n->inst;

      if (is_scheduling_barrier(inst))
         add_barrier_deps(n);

      /* read-after-write deps. */
      for (int i = 0; i < inst->sources; i++) {
         if (inst->src[i].file == VGRF) {
            if (post_reg_alloc) {
               for (unsigned r = 0; r < regs_read(inst, i); r++)
                  add_dep(last_grf_write[inst->src[i].nr + r], n);
            } else {
               for (unsigned r = 0; r < regs_read(inst, i); r++) {
                  add_dep(last_grf_write[inst->src[i].nr * 16 +
                                         inst->src[i].offset / REG_SIZE + r], n);
               }
            }
         } else if (inst->src[i].file == FIXED_GRF) {
            if (post_reg_alloc) {
               for (unsigned r = 0; r < regs_read(inst, i); r++)
                  add_dep(last_grf_write[inst->src[i].nr + r], n);
            } else {
               add_dep(last_fixed_grf_write, n);
            }
         } else if (inst->src[i].is_accumulator()) {
            add_dep(last_accumulator_write, n);
         } else if (inst->src[i].file == ARF && !inst->src[i].is_null()) {
            add_barrier_deps(n);
         }
      }

      if (inst->base_mrf != -1) {
         for (int i = 0; i < inst->mlen; i++) {
            /* It looks like the MRF regs are released in the send
             * instruction once it's sent, not when the result comes
             * back.
             */
            add_dep(last_mrf_write[inst->base_mrf + i], n);
         }
      }

      if (const unsigned mask = inst->flags_read(v->devinfo)) {
         assert(mask < (1 << ARRAY_SIZE(last_conditional_mod)));

         for (unsigned i = 0; i < ARRAY_SIZE(last_conditional_mod); i++) {
            if (mask & (1 << i))
               add_dep(last_conditional_mod[i], n);
         }
      }

      if (inst->reads_accumulator_implicitly()) {
         add_dep(last_accumulator_write, n);
      }

      /* write-after-write deps. */
      if (inst->dst.file == VGRF) {
         if (post_reg_alloc) {
            for (unsigned r = 0; r < regs_written(inst); r++) {
               add_dep(last_grf_write[inst->dst.nr + r], n);
               last_grf_write[inst->dst.nr + r] = n;
            }
         } else {
            for (unsigned r = 0; r < regs_written(inst); r++) {
               add_dep(last_grf_write[inst->dst.nr * 16 +
                                      inst->dst.offset / REG_SIZE + r], n);
               last_grf_write[inst->dst.nr * 16 +
                              inst->dst.offset / REG_SIZE + r] = n;
            }
         }
      } else if (inst->dst.file == MRF) {
         int reg = inst->dst.nr & ~BRW_MRF_COMPR4;

         add_dep(last_mrf_write[reg], n);
         last_mrf_write[reg] = n;
         if (is_compressed(inst)) {
            if (inst->dst.nr & BRW_MRF_COMPR4)
               reg += 4;
            else
               reg++;
            add_dep(last_mrf_write[reg], n);
            last_mrf_write[reg] = n;
         }
      } else if (inst->dst.file == FIXED_GRF) {
         if (post_reg_alloc) {
            for (unsigned r = 0; r < regs_written(inst); r++) {
               add_dep(last_grf_write[inst->dst.nr + r], n);
               last_grf_write[inst->dst.nr + r] = n;
            }
         } else {
            add_dep(last_fixed_grf_write, n);
            last_fixed_grf_write = n;
         }
      } else if (inst->dst.is_accumulator()) {
         add_dep(last_accumulator_write, n);
         last_accumulator_write = n;
      } else if (inst->dst.file == ARF && !inst->dst.is_null()) {
         add_barrier_deps(n);
      }

      if (inst->mlen > 0 && inst->base_mrf != -1) {
         for (unsigned i = 0; i < inst->implied_mrf_writes(); i++) {
            add_dep(last_mrf_write[inst->base_mrf + i], n);
            last_mrf_write[inst->base_mrf + i] = n;
         }
      }

      if (const unsigned mask = inst->flags_written(v->devinfo)) {
         assert(mask < (1 << ARRAY_SIZE(last_conditional_mod)));

         for (unsigned i = 0; i < ARRAY_SIZE(last_conditional_mod); i++) {
            if (mask & (1 << i)) {
               add_dep(last_conditional_mod[i], n, 0);
               last_conditional_mod[i] = n;
            }
         }
      }

      if (inst->writes_accumulator_implicitly(v->devinfo) &&
          !inst->dst.is_accumulator()) {
         add_dep(last_accumulator_write, n);
         last_accumulator_write = n;
      }
   }

   /* bottom-to-top dependencies: WAR */
   memset(last_grf_write, 0, sizeof(schedule_node *) * grf_count * 16);
   memset(last_mrf_write, 0, sizeof(last_mrf_write));
   memset(last_conditional_mod, 0, sizeof(last_conditional_mod));
   last_accumulator_write = NULL;
   last_fixed_grf_write = NULL;

   foreach_in_list_reverse_safe(schedule_node, n, &instructions) {
      fs_inst *inst = (fs_inst *)n->inst;

      /* write-after-read deps. */
      for (int i = 0; i < inst->sources; i++) {
         if (inst->src[i].file == VGRF) {
            if (post_reg_alloc) {
               for (unsigned r = 0; r < regs_read(inst, i); r++)
                  add_dep(n, last_grf_write[inst->src[i].nr + r], 0);
            } else {
               for (unsigned r = 0; r < regs_read(inst, i); r++) {
                  add_dep(n, last_grf_write[inst->src[i].nr * 16 +
                                            inst->src[i].offset / REG_SIZE + r], 0);
               }
            }
         } else if (inst->src[i].file == FIXED_GRF) {
            if (post_reg_alloc) {
               for (unsigned r = 0; r < regs_read(inst, i); r++)
                  add_dep(n, last_grf_write[inst->src[i].nr + r], 0);
            } else {
               add_dep(n, last_fixed_grf_write, 0);
            }
         } else if (inst->src[i].is_accumulator()) {
            add_dep(n, last_accumulator_write, 0);
         } else if (inst->src[i].file == ARF && !inst->src[i].is_null()) {
            add_barrier_deps(n);
         }
      }

      if (inst->base_mrf != -1) {
         for (int i = 0; i < inst->mlen; i++) {
            /* It looks like the MRF regs are released in the send
             * instruction once it's sent, not when the result comes
             * back.
             */
            add_dep(n, last_mrf_write[inst->base_mrf + i], 2);
         }
      }

      if (const unsigned mask = inst->flags_read(v->devinfo)) {
         assert(mask < (1 << ARRAY_SIZE(last_conditional_mod)));

         for (unsigned i = 0; i < ARRAY_SIZE(last_conditional_mod); i++) {
            if (mask & (1 << i))
               add_dep(n, last_conditional_mod[i]);
         }
      }

      if (inst->reads_accumulator_implicitly()) {
         add_dep(n, last_accumulator_write);
      }

      /* Update the things this instruction wrote, so earlier reads
       * can mark this as WAR dependency.
       */
      if (inst->dst.file == VGRF) {
         if (post_reg_alloc) {
            for (unsigned r = 0; r < regs_written(inst); r++)
               last_grf_write[inst->dst.nr + r] = n;
         } else {
            for (unsigned r = 0; r < regs_written(inst); r++) {
               last_grf_write[inst->dst.nr * 16 +
                              inst->dst.offset / REG_SIZE + r] = n;
            }
         }
      } else if (inst->dst.file == MRF) {
         int reg = inst->dst.nr & ~BRW_MRF_COMPR4;

         last_mrf_write[reg] = n;

         if (is_compressed(inst)) {
            if (inst->dst.nr & BRW_MRF_COMPR4)
               reg += 4;
            else
               reg++;

            last_mrf_write[reg] = n;
         }
      } else if (inst->dst.file == FIXED_GRF) {
         if (post_reg_alloc) {
            for (unsigned r = 0; r < regs_written(inst); r++)
               last_grf_write[inst->dst.nr + r] = n;
         } else {
            last_fixed_grf_write = n;
         }
      } else if (inst->dst.is_accumulator()) {
         last_accumulator_write = n;
      } else if (inst->dst.file == ARF && !inst->dst.is_null()) {
         add_barrier_deps(n);
      }

      if (inst->mlen > 0 && inst->base_mrf != -1) {
         for (unsigned i = 0; i < inst->implied_mrf_writes(); i++) {
            last_mrf_write[inst->base_mrf + i] = n;
         }
      }

      if (const unsigned mask = inst->flags_written(v->devinfo)) {
         assert(mask < (1 << ARRAY_SIZE(last_conditional_mod)));

         for (unsigned i = 0; i < ARRAY_SIZE(last_conditional_mod); i++) {
            if (mask & (1 << i))
               last_conditional_mod[i] = n;
         }
      }

      if (inst->writes_accumulator_implicitly(v->devinfo)) {
         last_accumulator_write = n;
      }
   }

   free(last_grf_write);
}

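/* The vec4 variant below is simpler than the FS one above: the vec4
 * scheduler is only instantiated in SCHEDULE_POST mode (see its constructor),
 * so writes are tracked per whole register rather than per REG_SIZE offset
 * and no register-pressure bookkeeping is needed.
 */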
void
vec4_instruction_scheduler::calculate_deps()
{
   schedule_node *last_grf_write[grf_count];
   schedule_node *last_mrf_write[BRW_MAX_MRF(v->devinfo->ver)];
   schedule_node *last_conditional_mod = NULL;
   schedule_node *last_accumulator_write = NULL;
   /* Fixed HW registers are assumed to be separate from the virtual
    * GRFs, so they can be tracked separately. We don't really write
    * to fixed GRFs much, so don't bother tracking them on a more
    * granular level.
    */
   schedule_node *last_fixed_grf_write = NULL;

   memset(last_grf_write, 0, sizeof(last_grf_write));
   memset(last_mrf_write, 0, sizeof(last_mrf_write));

   /* top-to-bottom dependencies: RAW and WAW. */
   foreach_in_list(schedule_node, n, &instructions) {
      vec4_instruction *inst = (vec4_instruction *)n->inst;

      if (is_scheduling_barrier(inst))
         add_barrier_deps(n);

      /* read-after-write deps. */
      for (int i = 0; i < 3; i++) {
         if (inst->src[i].file == VGRF) {
            for (unsigned j = 0; j < regs_read(inst, i); ++j)
               add_dep(last_grf_write[inst->src[i].nr + j], n);
         } else if (inst->src[i].file == FIXED_GRF) {
            add_dep(last_fixed_grf_write, n);
         } else if (inst->src[i].is_accumulator()) {
            assert(last_accumulator_write);
            add_dep(last_accumulator_write, n);
         } else if (inst->src[i].file == ARF && !inst->src[i].is_null()) {
            add_barrier_deps(n);
         }
      }

      if (inst->reads_g0_implicitly())
         add_dep(last_fixed_grf_write, n);

      if (!inst->is_send_from_grf()) {
         for (int i = 0; i < inst->mlen; i++) {
            /* It looks like the MRF regs are released in the send
             * instruction once it's sent, not when the result comes
             * back.
             */
            add_dep(last_mrf_write[inst->base_mrf + i], n);
         }
      }

      if (inst->reads_flag()) {
         assert(last_conditional_mod);
         add_dep(last_conditional_mod, n);
      }

      if (inst->reads_accumulator_implicitly()) {
         assert(last_accumulator_write);
         add_dep(last_accumulator_write, n);
      }

      /* write-after-write deps. */
      if (inst->dst.file == VGRF) {
         for (unsigned j = 0; j < regs_written(inst); ++j) {
            add_dep(last_grf_write[inst->dst.nr + j], n);
            last_grf_write[inst->dst.nr + j] = n;
         }
      } else if (inst->dst.file == MRF) {
         add_dep(last_mrf_write[inst->dst.nr], n);
         last_mrf_write[inst->dst.nr] = n;
      } else if (inst->dst.file == FIXED_GRF) {
         add_dep(last_fixed_grf_write, n);
         last_fixed_grf_write = n;
      } else if (inst->dst.is_accumulator()) {
         add_dep(last_accumulator_write, n);
         last_accumulator_write = n;
      } else if (inst->dst.file == ARF && !inst->dst.is_null()) {
         add_barrier_deps(n);
      }

      if (inst->mlen > 0 && !inst->is_send_from_grf()) {
         for (unsigned i = 0; i < inst->implied_mrf_writes(); i++) {
            add_dep(last_mrf_write[inst->base_mrf + i], n);
            last_mrf_write[inst->base_mrf + i] = n;
         }
      }

      if (inst->writes_flag(v->devinfo)) {
         add_dep(last_conditional_mod, n, 0);
         last_conditional_mod = n;
      }

      if (inst->writes_accumulator_implicitly(v->devinfo) &&
          !inst->dst.is_accumulator()) {
         add_dep(last_accumulator_write, n);
         last_accumulator_write = n;
      }
   }

   /* bottom-to-top dependencies: WAR */
   memset(last_grf_write, 0, sizeof(last_grf_write));
   memset(last_mrf_write, 0, sizeof(last_mrf_write));
   last_conditional_mod = NULL;
   last_accumulator_write = NULL;
   last_fixed_grf_write = NULL;

   foreach_in_list_reverse_safe(schedule_node, n, &instructions) {
      vec4_instruction *inst = (vec4_instruction *)n->inst;

      /* write-after-read deps. */
      for (int i = 0; i < 3; i++) {
         if (inst->src[i].file == VGRF) {
            for (unsigned j = 0; j < regs_read(inst, i); ++j)
               add_dep(n, last_grf_write[inst->src[i].nr + j]);
         } else if (inst->src[i].file == FIXED_GRF) {
            add_dep(n, last_fixed_grf_write);
         } else if (inst->src[i].is_accumulator()) {
            add_dep(n, last_accumulator_write);
         } else if (inst->src[i].file == ARF && !inst->src[i].is_null()) {
            add_barrier_deps(n);
         }
      }

      if (!inst->is_send_from_grf()) {
         for (int i = 0; i < inst->mlen; i++) {
            /* It looks like the MRF regs are released in the send
             * instruction once it's sent, not when the result comes
             * back.
             */
            add_dep(n, last_mrf_write[inst->base_mrf + i], 2);
         }
      }

      if (inst->reads_flag()) {
         add_dep(n, last_conditional_mod);
      }

      if (inst->reads_accumulator_implicitly()) {
         add_dep(n, last_accumulator_write);
      }

      /* Update the things this instruction wrote, so earlier reads
       * can mark this as WAR dependency.
       */
      if (inst->dst.file == VGRF) {
         for (unsigned j = 0; j < regs_written(inst); ++j)
            last_grf_write[inst->dst.nr + j] = n;
      } else if (inst->dst.file == MRF) {
         last_mrf_write[inst->dst.nr] = n;
      } else if (inst->dst.file == FIXED_GRF) {
         last_fixed_grf_write = n;
      } else if (inst->dst.is_accumulator()) {
         last_accumulator_write = n;
      } else if (inst->dst.file == ARF && !inst->dst.is_null()) {
         add_barrier_deps(n);
      }

      if (inst->mlen > 0 && !inst->is_send_from_grf()) {
         for (unsigned i = 0; i < inst->implied_mrf_writes(); i++) {
            last_mrf_write[inst->base_mrf + i] = n;
         }
      }

      if (inst->writes_flag(v->devinfo)) {
         last_conditional_mod = n;
      }

      if (inst->writes_accumulator_implicitly(v->devinfo)) {
         last_accumulator_write = n;
      }
   }
}

schedule_node *
fs_instruction_scheduler::choose_instruction_to_schedule()
{
   schedule_node *chosen = NULL;

   if (mode == SCHEDULE_PRE || mode == SCHEDULE_POST) {
      int chosen_time = 0;

      /* Of the instructions ready to execute or the closest to being ready,
       * choose the one most likely to unblock an early program exit, or
       * otherwise the oldest one.
       */
      foreach_in_list(schedule_node, n, &instructions) {
         if (!chosen ||
             exit_unblocked_time(n) < exit_unblocked_time(chosen) ||
             (exit_unblocked_time(n) == exit_unblocked_time(chosen) &&
              n->unblocked_time < chosen_time)) {
            chosen = n;
            chosen_time = n->unblocked_time;
         }
      }
   } else {
      int chosen_register_pressure_benefit = 0;

      /* Before register allocation, we don't care about the latencies of
       * instructions. All we care about is reducing live intervals of
       * variables so that we can avoid register spilling, or get SIMD16
       * shaders which naturally do a better job of hiding instruction
       * latency.
       */
      foreach_in_list(schedule_node, n, &instructions) {
         fs_inst *inst = (fs_inst *)n->inst;

         if (!chosen) {
            chosen = n;
            chosen_register_pressure_benefit =
               get_register_pressure_benefit(chosen->inst);
            continue;
         }

         /* Most important: If we can definitely reduce register pressure, do
          * so immediately.
          */
         int register_pressure_benefit = get_register_pressure_benefit(n->inst);

         if (register_pressure_benefit > 0 &&
             register_pressure_benefit > chosen_register_pressure_benefit) {
            chosen = n;
            chosen_register_pressure_benefit = register_pressure_benefit;
            continue;
         } else if (chosen_register_pressure_benefit > 0 &&
                    (register_pressure_benefit <
                     chosen_register_pressure_benefit)) {
            continue;
         }

         if (mode == SCHEDULE_PRE_LIFO) {
            /* Prefer instructions that recently became available for
             * scheduling. These are the things that are most likely to
             * (eventually) make a variable dead and reduce register pressure.
             * Typical register pressure estimates don't work for us because
             * most of our pressure comes from texturing, where no single
             * instruction to schedule will make a vec4 value dead.
             */
            if (n->cand_generation > chosen->cand_generation) {
               chosen = n;
               chosen_register_pressure_benefit = register_pressure_benefit;
               continue;
            } else if (n->cand_generation < chosen->cand_generation) {
               continue;
            }

            /* On MRF-using chips, prefer non-SEND instructions. If we don't
             * do this, then because we prefer instructions that just became
             * candidates, we'll end up in a pattern of scheduling a SEND,
             * then the MRFs for the next SEND, then the next SEND, then the
             * MRFs, etc., without ever consuming the results of a send.
             */
            if (v->devinfo->ver < 7) {
               fs_inst *chosen_inst = (fs_inst *)chosen->inst;

               /* We use size_written > 4 * exec_size as our test for the kind
                * of send instruction to avoid -- only sends generate many
                * regs, and a single-result send is probably actually reducing
                * register pressure.
                */
               if (inst->size_written <= 4 * inst->exec_size &&
                   chosen_inst->size_written > 4 * chosen_inst->exec_size) {
                  chosen = n;
                  chosen_register_pressure_benefit = register_pressure_benefit;
                  continue;
               } else if (inst->size_written > chosen_inst->size_written) {
                  continue;
               }
            }
         }

         /* For instructions pushed on the cands list at the same time, prefer
          * the one with the highest delay to the end of the program. This is
          * most likely to have its values able to be consumed first (such as
          * for a large tree of lowered ubo loads, which appear reversed in
          * the instruction stream with respect to when they can be consumed).
          */
         if (n->delay > chosen->delay) {
            chosen = n;
            chosen_register_pressure_benefit = register_pressure_benefit;
            continue;
         } else if (n->delay < chosen->delay) {
            continue;
         }

         /* Prefer the node most likely to unblock an early program exit.
          */
         if (exit_unblocked_time(n) < exit_unblocked_time(chosen)) {
            chosen = n;
            chosen_register_pressure_benefit = register_pressure_benefit;
            continue;
         } else if (exit_unblocked_time(n) > exit_unblocked_time(chosen)) {
            continue;
         }

         /* If all other metrics are equal, we prefer the first instruction in
          * the list (program execution).
          */
      }
   }

   return chosen;
}

schedule_node *
vec4_instruction_scheduler::choose_instruction_to_schedule()
{
   schedule_node *chosen = NULL;
   int chosen_time = 0;

   /* Of the instructions ready to execute or the closest to being ready,
    * choose the oldest one.
    */
   foreach_in_list(schedule_node, n, &instructions) {
      if (!chosen || n->unblocked_time < chosen_time) {
         chosen = n;
         chosen_time = n->unblocked_time;
      }
   }

   return chosen;
}

int
fs_instruction_scheduler::issue_time(backend_instruction *inst0)
{
   const struct brw_isa_info *isa = &v->compiler->isa;
   const fs_inst *inst = static_cast<fs_inst *>(inst0);
   const unsigned overhead = v->grf_used && has_bank_conflict(isa, inst) ?
      DIV_ROUND_UP(inst->dst.component_size(inst->exec_size), REG_SIZE) : 0;
   if (is_compressed(inst))
      return 4 + overhead;
   else
      return 2 + overhead;
}

int
vec4_instruction_scheduler::issue_time(backend_instruction *)
{
   /* We always execute as two vec4s in parallel. */
   return 2;
}

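/* Greedy list-scheduling loop for a single basic block: repeatedly pick a
 * candidate, append it to the block, advance the clock by its issue time,
 * and decrement the parent counts of its children, pushing any child whose
 * parents have all been scheduled onto the candidate list.
 */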
void
instruction_scheduler::schedule_instructions(bblock_t *block)
{
   const struct intel_device_info *devinfo = bs->devinfo;
   int time = 0;
   int instructions_to_schedule = block->end_ip - block->start_ip + 1;

   if (!post_reg_alloc)
      reg_pressure = reg_pressure_in[block->num];
   block_idx = block->num;

   /* Remove non-DAG heads from the list. */
   foreach_in_list_safe(schedule_node, n, &instructions) {
      if (n->parent_count != 0)
         n->remove();
   }

   unsigned cand_generation = 1;
   while (!instructions.is_empty()) {
      schedule_node *chosen = choose_instruction_to_schedule();

      /* Schedule this instruction. */
      assert(chosen);
      chosen->remove();
      chosen->inst->exec_node::remove();
      block->instructions.push_tail(chosen->inst);
      instructions_to_schedule--;

      if (!post_reg_alloc) {
         reg_pressure -= get_register_pressure_benefit(chosen->inst);
         update_register_pressure(chosen->inst);
      }

      /* If we expected a delay for scheduling, then bump the clock to reflect
       * that. In reality, the hardware will switch to another hyperthread
       * and may not return to dispatching our thread for a while even after
       * we're unblocked. After this, we have the time when the chosen
       * instruction will start executing.
       */
      time = MAX2(time, chosen->unblocked_time);

      /* Update the clock for how soon an instruction could start after the
       * chosen one.
       */
      time += issue_time(chosen->inst);

      if (debug) {
         fprintf(stderr, "clock %4d, scheduled: ", time);
         bs->dump_instruction(chosen->inst);
         if (!post_reg_alloc)
            fprintf(stderr, "(register pressure %d)\n", reg_pressure);
      }

      /* Now that we've scheduled a new instruction, some of its
       * children can be promoted to the list of instructions ready to
       * be scheduled. Update the children's unblocked time for this
       * DAG edge as we do so.
       */
      for (int i = chosen->child_count - 1; i >= 0; i--) {
         schedule_node *child = chosen->children[i];

         child->unblocked_time = MAX2(child->unblocked_time,
                                      time + chosen->child_latency[i]);

         if (debug) {
            fprintf(stderr, "\tchild %d, %d parents: ", i, child->parent_count);
            bs->dump_instruction(child->inst);
         }

         child->cand_generation = cand_generation;
         child->parent_count--;
         if (child->parent_count == 0) {
            if (debug) {
               fprintf(stderr, "\t\tnow available\n");
            }
            instructions.push_head(child);
         }
      }
      cand_generation++;

      /* Shared resource: the mathbox. There's one mathbox per EU on Gfx6+
       * but it's more limited pre-gfx6, so if we send something off to it then
       * the next math instruction isn't going to make progress until the first
       * is done.
       */
      if (devinfo->ver < 6 && chosen->inst->is_math()) {
         foreach_in_list(schedule_node, n, &instructions) {
            if (n->inst->is_math())
               n->unblocked_time = MAX2(n->unblocked_time,
                                        time + chosen->latency);
         }
      }
   }

   assert(instructions_to_schedule == 0);
}

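/* Per-block driver: build the DAG (add_insts_from_block() + calculate_deps()),
 * compute the delay and exit metadata, and then emit the new instruction
 * order via schedule_instructions().
 */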
void
instruction_scheduler::run(cfg_t *cfg)
{
   if (debug && !post_reg_alloc) {
      fprintf(stderr, "\nInstructions before scheduling (reg_alloc %d)\n",
              post_reg_alloc);
      bs->dump_instructions();
   }

   if (!post_reg_alloc)
      setup_liveness(cfg);

   foreach_block(block, cfg) {
      if (reads_remaining) {
         memset(reads_remaining, 0,
                grf_count * sizeof(*reads_remaining));
         memset(hw_reads_remaining, 0,
                hw_reg_count * sizeof(*hw_reads_remaining));
         memset(written, 0, grf_count * sizeof(*written));

         foreach_inst_in_block(fs_inst, inst, block)
            count_reads_remaining(inst);
      }

      add_insts_from_block(block);

      calculate_deps();

      compute_delays();
      compute_exits();

      schedule_instructions(block);
   }

   if (debug && !post_reg_alloc) {
      fprintf(stderr, "\nInstructions after scheduling (reg_alloc %d)\n",
              post_reg_alloc);
      bs->dump_instructions();
   }
}

void
fs_visitor::schedule_instructions(instruction_scheduler_mode mode)
{
   int grf_count;
   if (mode == SCHEDULE_POST)
      grf_count = grf_used;
   else
      grf_count = alloc.count;

   fs_instruction_scheduler sched(this, grf_count, first_non_payload_grf,
                                  cfg->num_blocks, mode);
   sched.run(cfg);

   invalidate_analysis(DEPENDENCY_INSTRUCTIONS);
}

void
vec4_visitor::opt_schedule_instructions()
{
   vec4_instruction_scheduler sched(this, prog_data->total_grf);
   sched.run(cfg);

   invalidate_analysis(DEPENDENCY_INSTRUCTIONS);
}