/*
 * Copyright (C) 2018 Jonathan Marek <jonathan@marek.ca>
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 *
 * Authors:
 *    Jonathan Marek <jonathan@marek.ca>
 */

#include "ir2_private.h"

static bool
scalar_possible(struct ir2_instr *instr)
{
   if (instr->alu.scalar_opc == SCALAR_NONE)
      return false;

   return src_ncomp(instr) == 1;
}

static bool
is_alu_compatible(struct ir2_instr *a, struct ir2_instr *b)
{
   if (!a)
      return true;

   /* don't use the same instruction twice */
   if (a == b)
      return false;

   /* PRED_SET must be alone */
   if (b->alu.scalar_opc >= PRED_SETEs &&
       b->alu.scalar_opc <= PRED_SET_RESTOREs)
      return false;

   /* must write to the same export (issues otherwise?) */
   return a->alu.export == b->alu.export;
}

/* priority of a vector instruction for scheduling (lower = higher priority) */
static unsigned
alu_vector_prio(struct ir2_instr *instr)
{
   if (instr->alu.vector_opc == VECTOR_NONE)
      return ~0u;

   if (is_export(instr))
      return 4;

   /* TODO check src type and ncomps */
   if (instr->src_count == 3)
      return 0;

   if (!scalar_possible(instr))
      return 1;

   return instr->src_count == 2 ? 2 : 3;
}

/* priority of a scalar instruction for scheduling (lower = higher priority) */
static unsigned
alu_scalar_prio(struct ir2_instr *instr)
{
   if (!scalar_possible(instr))
      return ~0u;

   /* this case is dealt with later */
   if (instr->src_count > 1)
      return ~0u;

   if (is_export(instr))
      return 4;

   /* PRED goes to the end of the block */
   if (instr->alu.scalar_opc >= PRED_SETEs &&
       instr->alu.scalar_opc <= PRED_SET_RESTOREs)
      return 5;

   /* scalar-only instructions get the highest priority */
   return instr->alu.vector_opc == VECTOR_NONE ? 0 : 3;
}

/* this is a bit messy:
 * we want to find a slot where we can insert a scalar MOV alongside
 * a vector instruction that was already scheduled
 */
static struct ir2_sched_instr *
insert(struct ir2_context *ctx, unsigned block_idx, unsigned reg_idx,
       struct ir2_src src1, unsigned *comp)
{
   struct ir2_sched_instr *sched = NULL, *s;
   unsigned i, mask = 0xf;

   /* walk backwards to the earliest point where the mov can be inserted */
   for (i = ctx->instr_sched_count - 1; i > 0; i--) {
      s = &ctx->instr_sched[i - 1];

      if (s->instr && s->instr->block_idx != block_idx)
         break;
      if (s->instr_s && s->instr_s->block_idx != block_idx)
         break;

      if (src1.type == IR2_SRC_SSA) {
         if ((s->instr && s->instr->idx == src1.num) ||
             (s->instr_s && s->instr_s->idx == src1.num))
            break;
      }

      unsigned mr = ~(s->reg_state[reg_idx / 8] >> reg_idx % 8 * 4 & 0xf);
      if ((mask & mr) == 0)
         break;

      mask &= mr;
      if (s->instr_s || s->instr->src_count == 3)
         continue;

      if (s->instr->type != IR2_ALU || s->instr->alu.export >= 0)
         continue;

      sched = s;
   }
   *comp = ffs(mask) - 1;

   if (sched) {
      for (s = sched; s != &ctx->instr_sched[ctx->instr_sched_count]; s++)
         s->reg_state[reg_idx / 8] |= 1 << (*comp + reg_idx % 8 * 4);
   }

   return sched;
}

/* case1:
 * in this case, insert a mov to place the 2nd src into the same reg
 * (scalar sources come from the same register)
 *
 * this is a common case which works when one of the srcs is input/const,
 * but for instrs which have 2 ssa/reg srcs it's not ideal
 */
static bool
scalarize_case1(struct ir2_context *ctx, struct ir2_instr *instr, bool order)
{
   struct ir2_src src0 = instr->src[order];
   struct ir2_src src1 = instr->src[!order];
   struct ir2_sched_instr *sched;
   struct ir2_instr *ins;
   struct ir2_reg *reg;
   unsigned idx, comp;

   switch (src0.type) {
   case IR2_SRC_CONST:
   case IR2_SRC_INPUT:
      return false;
   default:
      break;
   }

   /* TODO, insert needs logic for this */
   if (src1.type == IR2_SRC_REG)
      return false;

   /* we could do something if they match src1.. */
   if (src0.negate || src0.abs)
      return false;

   reg = get_reg_src(ctx, &src0);

   /* the result is not used anymore, since we will overwrite it */
   for (int i = 0; i < 4; i++)
      if (reg->comp[i].ref_count != !!(instr->alu.write_mask & 1 << i))
         return false;

   /* find a place to insert the mov */
   sched = insert(ctx, instr->block_idx, reg->idx, src1, &comp);
   if (!sched)
      return false;

   ins = &ctx->instr[idx = ctx->instr_count++];
   ins->idx = idx;
   ins->type = IR2_ALU;
   ins->src[0] = src1;
   ins->src_count = 1;
   ins->is_ssa = true;
   ins->ssa.idx = reg->idx;
   ins->ssa.ncomp = 1;
   ins->ssa.comp[0].c = comp;
   ins->alu.scalar_opc = MAXs;
   ins->alu.export = -1;
   ins->alu.write_mask = 1;
   ins->pred = instr->pred;
   ins->block_idx = instr->block_idx;

   instr->src[0] = src0;
   instr->alu.src1_swizzle = comp;

   sched->instr_s = ins;
   return true;
}

/* fill sched with the next fetch or (vector and/or scalar) alu instruction */
static int
sched_next(struct ir2_context *ctx, struct ir2_sched_instr *sched)
{
   struct ir2_instr *avail[0x100], *instr_v = NULL, *instr_s = NULL;
   unsigned avail_count = 0;

   instr_alloc_type_t export = ~0u;
   int block_idx = -1;

   /* XXX merge this loop with the other one somehow? */
   ir2_foreach_instr (instr, ctx) {
      if (!instr->need_emit)
         continue;
      if (is_export(instr))
         export = MIN2(export, export_buf(instr->alu.export));
   }

   ir2_foreach_instr (instr, ctx) {
      if (!instr->need_emit)
         continue;

      /* don't mix exports */
      if (is_export(instr) && export_buf(instr->alu.export) != export)
         continue;

      if (block_idx < 0)
         block_idx = instr->block_idx;
      else if (block_idx != instr->block_idx || /* must be same block */
               instr->type == IR2_CF ||         /* CF/MEM must be alone */
               (is_export(instr) && export == SQ_MEMORY))
         break;
      /* it works because IR2_CF is always at the end of a block,
       * and somewhat the same idea with MEM exports, which might not be alone
       * but will end up in-order at least
       */

      /* check if dependencies are satisfied */
      bool is_ok = true;
      ir2_foreach_src (src, instr) {
         if (src->type == IR2_SRC_REG) {
            /* need to check if all previous instructions in the block
             * which write the reg have been emitted
             * slow..
             * XXX: check components instead of the whole register
             */
            struct ir2_reg *reg = get_reg_src(ctx, src);
            ir2_foreach_instr (p, ctx) {
               if (!p->is_ssa && p->reg == reg && p->idx < instr->idx)
                  is_ok &= !p->need_emit;
            }
         } else if (src->type == IR2_SRC_SSA) {
            /* in this case it's easy, just check need_emit */
            is_ok &= !ctx->instr[src->num].need_emit;
         }
      }
      /* don't reorder a non-ssa write before a read */
      if (!instr->is_ssa) {
         ir2_foreach_instr (p, ctx) {
            if (!p->need_emit || p->idx >= instr->idx)
               continue;

            ir2_foreach_src (src, p) {
               if (get_reg_src(ctx, src) == instr->reg)
                  is_ok = false;
            }
         }
      }
      /* don't reorder across predicates */
      if (avail_count && instr->pred != avail[0]->pred)
         is_ok = false;

      if (!is_ok)
         continue;

      avail[avail_count++] = instr;
   }

   if (!avail_count) {
      assert(block_idx == -1);
      return -1;
   }

   /* priority to FETCH instructions */
   ir2_foreach_avail (instr) {
      if (instr->type == IR2_ALU)
         continue;

      ra_src_free(ctx, instr);
      ra_reg(ctx, get_reg(instr), -1, false, 0);

      instr->need_emit = false;
      sched->instr = instr;
      sched->instr_s = NULL;
      return block_idx;
   }

   /* TODO precompute priorities */

   unsigned prio_v = ~0u, prio_s = ~0u, prio;
   ir2_foreach_avail (instr) {
      prio = alu_vector_prio(instr);
      if (prio < prio_v) {
         instr_v = instr;
         prio_v = prio;
      }
   }

   /* TODO can still insert scalar if src_count == 3, if smart about it */
   if (!instr_v || instr_v->src_count < 3) {
      ir2_foreach_avail (instr) {
         bool compat = is_alu_compatible(instr_v, instr);

         prio = alu_scalar_prio(instr);
         if (prio >= prio_v && !compat)
            continue;

         if (prio < prio_s) {
            instr_s = instr;
            prio_s = prio;
            if (!compat)
               instr_v = NULL;
         }
      }
   }

   assert(instr_v || instr_s);

   /* now, try the more complex insertion of a vector instruction as scalar
    * TODO: if we are smart we can still insert if instr_v->src_count == 3
    */
   if (!instr_s && instr_v->src_count < 3) {
      ir2_foreach_avail (instr) {
         if (!is_alu_compatible(instr_v, instr) || !scalar_possible(instr))
            continue;

         /* at this point, src_count should always be 2 */
         assert(instr->src_count == 2);

         if (scalarize_case1(ctx, instr, 0)) {
            instr_s = instr;
            break;
         }
         if (scalarize_case1(ctx, instr, 1)) {
            instr_s = instr;
            break;
         }
      }
   }

   /* free src registers */
   if (instr_v) {
      instr_v->need_emit = false;
      ra_src_free(ctx, instr_v);
   }

   if (instr_s) {
      instr_s->need_emit = false;
      ra_src_free(ctx, instr_s);
   }

   /* allocate dst registers */
   if (instr_v)
      ra_reg(ctx, get_reg(instr_v), -1, is_export(instr_v),
             instr_v->alu.write_mask);

   if (instr_s)
      ra_reg(ctx, get_reg(instr_s), -1, is_export(instr_s),
             instr_s->alu.write_mask);

   sched->instr = instr_v;
   sched->instr_s = instr_s;
   return block_idx;
}

/* scheduling: determine the order of instructions */
static void
schedule_instrs(struct ir2_context *ctx)
{
   struct ir2_sched_instr *sched;
   int block_idx;

   /* allocate input registers */
   for (unsigned idx = 0; idx < ARRAY_SIZE(ctx->input); idx++)
      if (ctx->input[idx].initialized)
         ra_reg(ctx, &ctx->input[idx], idx, false, 0);

   for (;;) {
      sched = &ctx->instr_sched[ctx->instr_sched_count++];
      block_idx = sched_next(ctx, sched);
      if (block_idx < 0)
         break;
      memcpy(sched->reg_state, ctx->reg_state, sizeof(ctx->reg_state));

      /* catch a texture fetch after scheduling and insert the
       * SET_TEX_LOD right before it if necessary
       * TODO clean this up
       */
      struct ir2_instr *instr = sched->instr, *tex_lod;
      if (instr && instr->type == IR2_FETCH && instr->fetch.opc == TEX_FETCH &&
          instr->src_count == 2) {
         /* generate the SET_LOD instruction */
         tex_lod = &ctx->instr[ctx->instr_count++];
         tex_lod->type = IR2_FETCH;
         tex_lod->block_idx = instr->block_idx;
         tex_lod->pred = instr->pred;
         tex_lod->fetch.opc = TEX_SET_TEX_LOD;
         tex_lod->src[0] = instr->src[1];
         tex_lod->src_count = 1;

         sched[1] = sched[0];
         sched->instr = tex_lod;
         ctx->instr_sched_count++;
      }

      bool free_block = true;
      ir2_foreach_instr (instr, ctx)
         free_block &= instr->block_idx != block_idx;
      if (free_block)
         ra_block_free(ctx, block_idx);
   }
   ctx->instr_sched_count--;
}

void
ir2_compile(struct fd2_shader_stateobj *so, unsigned variant,
            struct fd2_shader_stateobj *fp)
{
   struct ir2_context ctx = {};
   bool binning = !fp && so->type == MESA_SHADER_VERTEX;

   if (fp)
      so->variant[variant].f = fp->variant[0].f;

   ctx.so = so;
   ctx.info = &so->variant[variant].info;
   ctx.f = &so->variant[variant].f;
   ctx.info->max_reg = -1;

   /* convert nir to the internal representation */
   ir2_nir_compile(&ctx, binning);

   /* copy propagate srcs */
   cp_src(&ctx);

   /* get ref_counts and kill unneeded instructions */
   ra_count_refs(&ctx);

   /* remove movs used to write outputs */
   cp_export(&ctx);

   /* instruction ordering.. and vector->scalar conversions */
   schedule_instrs(&ctx);

   /* finally, assemble to bitcode */
   assemble(&ctx, binning);
}