1 2/* 3 * Copyright © 2014 Broadcom 4 * 5 * Permission is hereby granted, free of charge, to any person obtaining a 6 * copy of this software and associated documentation files (the "Software"), 7 * to deal in the Software without restriction, including without limitation 8 * the rights to use, copy, modify, merge, publish, distribute, sublicense, 9 * and/or sell copies of the Software, and to permit persons to whom the 10 * Software is furnished to do so, subject to the following conditions: 11 * 12 * The above copyright notice and this permission notice (including the next 13 * paragraph) shall be included in all copies or substantial portions of the 14 * Software. 15 * 16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 19 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 21 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS 22 * IN THE SOFTWARE. 23 */ 24 25#include <stdlib.h> 26 27#include "vc4_qpu.h" 28 29static void 30fail_instr(uint64_t inst, const char *msg) 31{ 32 fprintf(stderr, "vc4_qpu_validate: %s: ", msg); 33 vc4_qpu_disasm(&inst, 1); 34 fprintf(stderr, "\n"); 35 abort(); 36} 37 38static bool 39writes_reg(uint64_t inst, uint32_t w) 40{ 41 return (QPU_GET_FIELD(inst, QPU_WADDR_ADD) == w || 42 QPU_GET_FIELD(inst, QPU_WADDR_MUL) == w); 43} 44 45static bool 46_reads_reg(uint64_t inst, uint32_t r, bool ignore_a, bool ignore_b) 47{ 48 struct { 49 uint32_t mux, addr; 50 } src_regs[] = { 51 { QPU_GET_FIELD(inst, QPU_ADD_A) }, 52 { QPU_GET_FIELD(inst, QPU_ADD_B) }, 53 { QPU_GET_FIELD(inst, QPU_MUL_A) }, 54 { QPU_GET_FIELD(inst, QPU_MUL_B) }, 55 }; 56 57 /* Branches only reference raddr_a (no mux), and we don't use that 58 * feature of branching. 59 */ 60 if (QPU_GET_FIELD(inst, QPU_SIG) == QPU_SIG_BRANCH) 61 return false; 62 63 /* Load immediates don't read any registers. */ 64 if (QPU_GET_FIELD(inst, QPU_SIG) == QPU_SIG_LOAD_IMM) 65 return false; 66 67 for (int i = 0; i < ARRAY_SIZE(src_regs); i++) { 68 if (!ignore_a && 69 src_regs[i].mux == QPU_MUX_A && 70 (QPU_GET_FIELD(inst, QPU_RADDR_A) == r)) 71 return true; 72 73 if (!ignore_b && 74 QPU_GET_FIELD(inst, QPU_SIG) != QPU_SIG_SMALL_IMM && 75 src_regs[i].mux == QPU_MUX_B && 76 (QPU_GET_FIELD(inst, QPU_RADDR_B) == r)) 77 return true; 78 } 79 80 return false; 81} 82 83static bool 84reads_reg(uint64_t inst, uint32_t r) 85{ 86 return _reads_reg(inst, r, false, false); 87} 88 89static bool 90reads_a_reg(uint64_t inst, uint32_t r) 91{ 92 return _reads_reg(inst, r, false, true); 93} 94 95static bool 96reads_b_reg(uint64_t inst, uint32_t r) 97{ 98 return _reads_reg(inst, r, true, false); 99} 100 101static bool 102writes_sfu(uint64_t inst) 103{ 104 return (writes_reg(inst, QPU_W_SFU_RECIP) || 105 writes_reg(inst, QPU_W_SFU_RECIPSQRT) || 106 writes_reg(inst, QPU_W_SFU_EXP) || 107 writes_reg(inst, QPU_W_SFU_LOG)); 108} 109 110/** 111 * Checks for the instruction restrictions from page 37 ("Summary of 112 * Instruction Restrictions"). 113 */ 114void 115vc4_qpu_validate(uint64_t *insts, uint32_t num_inst) 116{ 117 bool scoreboard_locked = false; 118 bool threaded = false; 119 120 /* We don't want to do validation in release builds, but we want to 121 * keep compiling the validation code to make sure it doesn't get 122 * broken. 123 */ 124#ifndef DEBUG 125 return; 126#endif 127 128 for (int i = 0; i < num_inst; i++) { 129 uint64_t inst = insts[i]; 130 uint32_t sig = QPU_GET_FIELD(inst, QPU_SIG); 131 132 if (sig != QPU_SIG_PROG_END) { 133 if (qpu_inst_is_tlb(inst)) 134 scoreboard_locked = true; 135 136 if (sig == QPU_SIG_THREAD_SWITCH || 137 sig == QPU_SIG_LAST_THREAD_SWITCH) { 138 threaded = true; 139 } 140 141 continue; 142 } 143 144 /* "The Thread End instruction must not write to either physical 145 * regfile A or B." 146 */ 147 if (QPU_GET_FIELD(inst, QPU_WADDR_ADD) < 32 || 148 QPU_GET_FIELD(inst, QPU_WADDR_MUL) < 32) { 149 fail_instr(inst, "write to phys reg in thread end"); 150 } 151 152 /* Can't trigger an implicit wait on scoreboard in the program 153 * end instruction. 154 */ 155 if (qpu_inst_is_tlb(inst) && !scoreboard_locked) 156 fail_instr(inst, "implicit sb wait in program end"); 157 158 /* Two delay slots will be executed. */ 159 assert(i + 2 <= num_inst); 160 161 for (int j = i; j < i + 2; j++) { 162 /* "The last three instructions of any program 163 * (Thread End plus the following two delay-slot 164 * instructions) must not do varyings read, uniforms 165 * read or any kind of VPM, VDR, or VDW read or 166 * write." 167 */ 168 if (writes_reg(insts[j], QPU_W_VPM) || 169 reads_reg(insts[j], QPU_R_VARY) || 170 reads_reg(insts[j], QPU_R_UNIF) || 171 reads_reg(insts[j], QPU_R_VPM)) { 172 fail_instr(insts[j], "last 3 instructions " 173 "using fixed functions"); 174 } 175 176 /* "The Thread End instruction and the following two 177 * delay slot instructions must not write or read 178 * address 14 in either regfile A or B." 179 */ 180 if (writes_reg(insts[j], 14) || 181 reads_reg(insts[j], 14)) { 182 fail_instr(insts[j], "last 3 instructions " 183 "must not use r14"); 184 } 185 } 186 187 /* "The final program instruction (the second delay slot 188 * instruction) must not do a TLB Z write." 189 */ 190 if (writes_reg(insts[i + 2], QPU_W_TLB_Z)) { 191 fail_instr(insts[i + 2], "final instruction doing " 192 "Z write"); 193 } 194 } 195 196 /* "A scoreboard wait must not occur in the first two instructions of 197 * a fragment shader. This is either the explicit Wait for Scoreboard 198 * signal or an implicit wait with the first tile-buffer read or 199 * write instruction." 200 */ 201 for (int i = 0; i < 2; i++) { 202 uint64_t inst = insts[i]; 203 204 if (qpu_inst_is_tlb(inst)) 205 fail_instr(inst, "sb wait in first two insts"); 206 } 207 208 /* "If TMU_NOSWAP is written, the write must be three instructions 209 * before the first TMU write instruction. For example, if 210 * TMU_NOSWAP is written in the first shader instruction, the first 211 * TMU write cannot occur before the 4th shader instruction." 212 */ 213 int last_tmu_noswap = -10; 214 for (int i = 0; i < num_inst; i++) { 215 uint64_t inst = insts[i]; 216 217 if ((i - last_tmu_noswap) <= 3 && 218 (writes_reg(inst, QPU_W_TMU0_S) || 219 writes_reg(inst, QPU_W_TMU1_S))) { 220 fail_instr(inst, "TMU write too soon after TMU_NOSWAP"); 221 } 222 223 if (writes_reg(inst, QPU_W_TMU_NOSWAP)) 224 last_tmu_noswap = i; 225 } 226 227 /* "An instruction must not read from a location in physical regfile A 228 * or B that was written to by the previous instruction." 229 */ 230 for (int i = 0; i < num_inst - 1; i++) { 231 uint64_t inst = insts[i]; 232 uint32_t add_waddr = QPU_GET_FIELD(inst, QPU_WADDR_ADD); 233 uint32_t mul_waddr = QPU_GET_FIELD(inst, QPU_WADDR_MUL); 234 uint32_t waddr_a, waddr_b; 235 236 if (inst & QPU_WS) { 237 waddr_b = add_waddr; 238 waddr_a = mul_waddr; 239 } else { 240 waddr_a = add_waddr; 241 waddr_b = mul_waddr; 242 } 243 244 if ((waddr_a < 32 && reads_a_reg(insts[i + 1], waddr_a)) || 245 (waddr_b < 32 && reads_b_reg(insts[i + 1], waddr_b))) { 246 fail_instr(insts[i + 1], 247 "Reads physical reg too soon after write"); 248 } 249 } 250 251 /* "After an SFU lookup instruction, accumulator r4 must not be read 252 * in the following two instructions. Any other instruction that 253 * results in r4 being written (that is, TMU read, TLB read, SFU 254 * lookup) cannot occur in the two instructions following an SFU 255 * lookup." 256 */ 257 int last_sfu_inst = -10; 258 for (int i = 0; i < num_inst - 1; i++) { 259 uint64_t inst = insts[i]; 260 uint32_t sig = QPU_GET_FIELD(inst, QPU_SIG); 261 262 if (i - last_sfu_inst <= 2 && 263 (writes_sfu(inst) || 264 sig == QPU_SIG_LOAD_TMU0 || 265 sig == QPU_SIG_LOAD_TMU1 || 266 sig == QPU_SIG_COLOR_LOAD)) { 267 fail_instr(inst, "R4 write too soon after SFU write"); 268 } 269 270 if (writes_sfu(inst)) 271 last_sfu_inst = i; 272 } 273 274 for (int i = 0; i < num_inst - 1; i++) { 275 uint64_t inst = insts[i]; 276 277 if (QPU_GET_FIELD(inst, QPU_SIG) == QPU_SIG_SMALL_IMM && 278 QPU_GET_FIELD(inst, QPU_SMALL_IMM) >= 279 QPU_SMALL_IMM_MUL_ROT) { 280 uint32_t mux_a = QPU_GET_FIELD(inst, QPU_MUL_A); 281 uint32_t mux_b = QPU_GET_FIELD(inst, QPU_MUL_B); 282 283 /* "The full horizontal vector rotate is only 284 * available when both of the mul ALU input arguments 285 * are taken from accumulators r0-r3." 286 */ 287 if (mux_a > QPU_MUX_R3 || mux_b > QPU_MUX_R3) { 288 fail_instr(inst, 289 "MUL rotate using non-accumulator " 290 "input"); 291 } 292 293 if (QPU_GET_FIELD(inst, QPU_SMALL_IMM) == 294 QPU_SMALL_IMM_MUL_ROT) { 295 /* "An instruction that does a vector rotate 296 * by r5 must not immediately follow an 297 * instruction that writes to r5." 298 */ 299 if (writes_reg(insts[i - 1], QPU_W_ACC5)) { 300 fail_instr(inst, 301 "vector rotate by r5 " 302 "immediately after r5 write"); 303 } 304 } 305 306 /* "An instruction that does a vector rotate must not 307 * immediately follow an instruction that writes to the 308 * accumulator that is being rotated." 309 */ 310 if (writes_reg(insts[i - 1], QPU_W_ACC0 + mux_a) || 311 writes_reg(insts[i - 1], QPU_W_ACC0 + mux_b)) { 312 fail_instr(inst, 313 "vector rotate of value " 314 "written in previous instruction"); 315 } 316 } 317 } 318 319 /* "An instruction that does a vector rotate must not immediately 320 * follow an instruction that writes to the accumulator that is being 321 * rotated. 322 * 323 * XXX: TODO. 324 */ 325 326 /* "After an instruction that does a TLB Z write, the multisample mask 327 * must not be read as an instruction input argument in the following 328 * two instruction. The TLB Z write instruction can, however, be 329 * followed immediately by a TLB color write." 330 */ 331 for (int i = 0; i < num_inst - 1; i++) { 332 uint64_t inst = insts[i]; 333 if (writes_reg(inst, QPU_W_TLB_Z) && 334 (reads_a_reg(insts[i + 1], QPU_R_MS_REV_FLAGS) || 335 reads_a_reg(insts[i + 2], QPU_R_MS_REV_FLAGS))) { 336 fail_instr(inst, "TLB Z write followed by MS mask read"); 337 } 338 } 339 340 /* 341 * "A single instruction can only perform a maximum of one of the 342 * following closely coupled peripheral accesses in a single 343 * instruction: TMU write, TMU read, TLB write, TLB read, TLB 344 * combined color read and write, SFU write, Mutex read or Semaphore 345 * access." 346 */ 347 for (int i = 0; i < num_inst - 1; i++) { 348 uint64_t inst = insts[i]; 349 350 if (qpu_num_sf_accesses(inst) > 1) 351 fail_instr(inst, "Single instruction writes SFU twice"); 352 } 353 354 /* "The uniform base pointer can be written (from SIMD element 0) by 355 * the processor to reset the stream, there must be at least two 356 * nonuniform-accessing instructions following a pointer change 357 * before uniforms can be accessed once more." 358 */ 359 int last_unif_pointer_update = -3; 360 for (int i = 0; i < num_inst; i++) { 361 uint64_t inst = insts[i]; 362 uint32_t waddr_add = QPU_GET_FIELD(inst, QPU_WADDR_ADD); 363 uint32_t waddr_mul = QPU_GET_FIELD(inst, QPU_WADDR_MUL); 364 365 if (reads_reg(inst, QPU_R_UNIF) && 366 i - last_unif_pointer_update <= 2) { 367 fail_instr(inst, 368 "uniform read too soon after pointer update"); 369 } 370 371 if (waddr_add == QPU_W_UNIFORMS_ADDRESS || 372 waddr_mul == QPU_W_UNIFORMS_ADDRESS) 373 last_unif_pointer_update = i; 374 } 375 376 if (threaded) { 377 bool last_thrsw_found = false; 378 bool scoreboard_locked = false; 379 int tex_samples_outstanding = 0; 380 int last_tex_samples_outstanding = 0; 381 int thrsw_ip = -1; 382 383 for (int i = 0; i < num_inst; i++) { 384 uint64_t inst = insts[i]; 385 uint32_t sig = QPU_GET_FIELD(inst, QPU_SIG); 386 387 if (i == thrsw_ip) { 388 /* In order to get texture results back in the 389 * correct order, before a new thrsw we have 390 * to read all the texture results from before 391 * the previous thrsw. 392 * 393 * FIXME: Is collecting the remaining results 394 * during the delay slots OK, or should we do 395 * this at THRSW signal time? 396 */ 397 if (last_tex_samples_outstanding != 0) { 398 fail_instr(inst, "THRSW with texture " 399 "results from the previous " 400 "THRSW still in the FIFO."); 401 } 402 403 last_tex_samples_outstanding = 404 tex_samples_outstanding; 405 tex_samples_outstanding = 0; 406 } 407 408 if (qpu_inst_is_tlb(inst)) 409 scoreboard_locked = true; 410 411 switch (sig) { 412 case QPU_SIG_THREAD_SWITCH: 413 case QPU_SIG_LAST_THREAD_SWITCH: 414 /* No thread switching with the scoreboard 415 * locked. Doing so means we may deadlock 416 * when the other thread tries to lock 417 * scoreboard. 418 */ 419 if (scoreboard_locked) { 420 fail_instr(inst, "THRSW with the " 421 "scoreboard locked."); 422 } 423 424 /* No thread switching after lthrsw, since 425 * lthrsw means that we get delayed until the 426 * other shader is ready for us to terminate. 427 */ 428 if (last_thrsw_found) { 429 fail_instr(inst, "THRSW after a " 430 "previous LTHRSW"); 431 } 432 433 if (sig == QPU_SIG_LAST_THREAD_SWITCH) 434 last_thrsw_found = true; 435 436 /* No THRSW while we already have a THRSW 437 * queued. 438 */ 439 if (i < thrsw_ip) { 440 fail_instr(inst, 441 "THRSW with a THRSW queued."); 442 } 443 444 thrsw_ip = i + 3; 445 break; 446 447 case QPU_SIG_LOAD_TMU0: 448 case QPU_SIG_LOAD_TMU1: 449 if (last_tex_samples_outstanding == 0) { 450 fail_instr(inst, "TMU load with nothing " 451 "in the results fifo from " 452 "the previous THRSW."); 453 } 454 455 last_tex_samples_outstanding--; 456 break; 457 } 458 459 uint32_t waddr_add = QPU_GET_FIELD(inst, QPU_WADDR_ADD); 460 uint32_t waddr_mul = QPU_GET_FIELD(inst, QPU_WADDR_MUL); 461 if (waddr_add == QPU_W_TMU0_S || 462 waddr_add == QPU_W_TMU1_S || 463 waddr_mul == QPU_W_TMU0_S || 464 waddr_mul == QPU_W_TMU1_S) { 465 tex_samples_outstanding++; 466 } 467 } 468 } 469} 470