1/* 2 * Copyright 2009 Nicolai Hähnle <nhaehnle@gmail.com> 3 * 4 * Permission is hereby granted, free of charge, to any person obtaining a 5 * copy of this software and associated documentation files (the "Software"), 6 * to deal in the Software without restriction, including without limitation 7 * on the rights to use, copy, modify, merge, publish, distribute, sub 8 * license, and/or sell copies of the Software, and to permit persons to whom 9 * the Software is furnished to do so, subject to the following conditions: 10 * 11 * The above copyright notice and this permission notice (including the next 12 * paragraph) shall be included in all copies or substantial portions of the 13 * Software. 14 * 15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL 18 * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM, 19 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR 20 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE 21 * USE OR OTHER DEALINGS IN THE SOFTWARE. */ 22 23#include "radeon_compiler.h" 24 25#include <stdbool.h> 26#include <stdio.h> 27 28#include "r300_reg.h" 29 30#include "radeon_compiler_util.h" 31#include "radeon_dataflow.h" 32#include "radeon_program.h" 33#include "radeon_program_alu.h" 34#include "radeon_swizzle.h" 35#include "radeon_emulate_branches.h" 36#include "radeon_remove_constants.h" 37 38#include "util/compiler.h" 39 40/* 41 * Take an already-setup and valid source then swizzle it appropriately to 42 * obtain a constant ZERO or ONE source. 43 */ 44#define __CONST(x, y) \ 45 (PVS_SRC_OPERAND(t_src_index(vp, &vpi->SrcReg[x]), \ 46 t_swizzle(y), \ 47 t_swizzle(y), \ 48 t_swizzle(y), \ 49 t_swizzle(y), \ 50 t_src_class(vpi->SrcReg[x].File), \ 51 RC_MASK_NONE) | (vpi->SrcReg[x].RelAddr << 4)) 52 53 54static unsigned long t_dst_mask(unsigned int mask) 55{ 56 /* RC_MASK_* is equivalent to VSF_FLAG_* */ 57 return mask & RC_MASK_XYZW; 58} 59 60static unsigned long t_dst_class(rc_register_file file) 61{ 62 switch (file) { 63 default: 64 fprintf(stderr, "%s: Bad register file %i\n", __FUNCTION__, file); 65 FALLTHROUGH; 66 case RC_FILE_TEMPORARY: 67 return PVS_DST_REG_TEMPORARY; 68 case RC_FILE_OUTPUT: 69 return PVS_DST_REG_OUT; 70 case RC_FILE_ADDRESS: 71 return PVS_DST_REG_A0; 72 } 73} 74 75static unsigned long t_dst_index(struct r300_vertex_program_code *vp, 76 struct rc_dst_register *dst) 77{ 78 if (dst->File == RC_FILE_OUTPUT) 79 return vp->outputs[dst->Index]; 80 81 return dst->Index; 82} 83 84static unsigned long t_src_class(rc_register_file file) 85{ 86 switch (file) { 87 default: 88 fprintf(stderr, "%s: Bad register file %i\n", __FUNCTION__, file); 89 FALLTHROUGH; 90 case RC_FILE_NONE: 91 case RC_FILE_TEMPORARY: 92 return PVS_SRC_REG_TEMPORARY; 93 case RC_FILE_INPUT: 94 return PVS_SRC_REG_INPUT; 95 case RC_FILE_CONSTANT: 96 return PVS_SRC_REG_CONSTANT; 97 } 98} 99 100static int t_src_conflict(struct rc_src_register a, struct rc_src_register b) 101{ 102 unsigned long aclass = t_src_class(a.File); 103 unsigned long bclass = t_src_class(b.File); 104 105 if (aclass != bclass) 106 return 0; 107 if (aclass == PVS_SRC_REG_TEMPORARY) 108 return 0; 109 110 if (a.RelAddr || b.RelAddr) 111 return 1; 112 if (a.Index != b.Index) 113 return 1; 114 115 return 0; 116} 117 118static inline unsigned long t_swizzle(unsigned int swizzle) 119{ 120 /* this is in fact a NOP as the Mesa RC_SWIZZLE_* are all identical to VSF_IN_COMPONENT_* */ 121 return swizzle; 122} 123 124static unsigned long t_src_index(struct r300_vertex_program_code *vp, 125 struct rc_src_register *src) 126{ 127 if (src->File == RC_FILE_INPUT) { 128 assert(vp->inputs[src->Index] != -1); 129 return vp->inputs[src->Index]; 130 } else { 131 if (src->Index < 0) { 132 fprintf(stderr, 133 "negative offsets for indirect addressing do not work.\n"); 134 return 0; 135 } 136 return src->Index; 137 } 138} 139 140/* these two functions should probably be merged... */ 141 142static unsigned long t_src(struct r300_vertex_program_code *vp, 143 struct rc_src_register *src) 144{ 145 /* src->Negate uses the RC_MASK_ flags from program_instruction.h, 146 * which equal our VSF_FLAGS_ values, so it's safe to just pass it here. 147 */ 148 return PVS_SRC_OPERAND(t_src_index(vp, src), 149 t_swizzle(GET_SWZ(src->Swizzle, 0)), 150 t_swizzle(GET_SWZ(src->Swizzle, 1)), 151 t_swizzle(GET_SWZ(src->Swizzle, 2)), 152 t_swizzle(GET_SWZ(src->Swizzle, 3)), 153 t_src_class(src->File), 154 src->Negate) | 155 (src->RelAddr << 4) | (src->Abs << 3); 156} 157 158static unsigned long t_src_scalar(struct r300_vertex_program_code *vp, 159 struct rc_src_register *src) 160{ 161 /* src->Negate uses the RC_MASK_ flags from program_instruction.h, 162 * which equal our VSF_FLAGS_ values, so it's safe to just pass it here. 163 */ 164 unsigned int swz = rc_get_scalar_src_swz(src->Swizzle); 165 166 return PVS_SRC_OPERAND(t_src_index(vp, src), 167 t_swizzle(swz), 168 t_swizzle(swz), 169 t_swizzle(swz), 170 t_swizzle(swz), 171 t_src_class(src->File), 172 src->Negate ? RC_MASK_XYZW : RC_MASK_NONE) | 173 (src->RelAddr << 4) | (src->Abs << 3); 174} 175 176static int valid_dst(struct r300_vertex_program_code *vp, 177 struct rc_dst_register *dst) 178{ 179 if (dst->File == RC_FILE_OUTPUT && vp->outputs[dst->Index] == -1) { 180 return 0; 181 } else if (dst->File == RC_FILE_ADDRESS) { 182 assert(dst->Index == 0); 183 } 184 185 return 1; 186} 187 188static void ei_vector1(struct r300_vertex_program_code *vp, 189 unsigned int hw_opcode, 190 struct rc_sub_instruction *vpi, 191 unsigned int * inst) 192{ 193 inst[0] = PVS_OP_DST_OPERAND(hw_opcode, 194 0, 195 0, 196 t_dst_index(vp, &vpi->DstReg), 197 t_dst_mask(vpi->DstReg.WriteMask), 198 t_dst_class(vpi->DstReg.File), 199 vpi->SaturateMode == RC_SATURATE_ZERO_ONE); 200 inst[1] = t_src(vp, &vpi->SrcReg[0]); 201 inst[2] = __CONST(0, RC_SWIZZLE_ZERO); 202 inst[3] = __CONST(0, RC_SWIZZLE_ZERO); 203} 204 205static void ei_vector2(struct r300_vertex_program_code *vp, 206 unsigned int hw_opcode, 207 struct rc_sub_instruction *vpi, 208 unsigned int * inst) 209{ 210 inst[0] = PVS_OP_DST_OPERAND(hw_opcode, 211 0, 212 0, 213 t_dst_index(vp, &vpi->DstReg), 214 t_dst_mask(vpi->DstReg.WriteMask), 215 t_dst_class(vpi->DstReg.File), 216 vpi->SaturateMode == RC_SATURATE_ZERO_ONE); 217 inst[1] = t_src(vp, &vpi->SrcReg[0]); 218 inst[2] = t_src(vp, &vpi->SrcReg[1]); 219 inst[3] = __CONST(1, RC_SWIZZLE_ZERO); 220} 221 222static void ei_math1(struct r300_vertex_program_code *vp, 223 unsigned int hw_opcode, 224 struct rc_sub_instruction *vpi, 225 unsigned int * inst) 226{ 227 inst[0] = PVS_OP_DST_OPERAND(hw_opcode, 228 1, 229 0, 230 t_dst_index(vp, &vpi->DstReg), 231 t_dst_mask(vpi->DstReg.WriteMask), 232 t_dst_class(vpi->DstReg.File), 233 vpi->SaturateMode == RC_SATURATE_ZERO_ONE); 234 inst[1] = t_src_scalar(vp, &vpi->SrcReg[0]); 235 inst[2] = __CONST(0, RC_SWIZZLE_ZERO); 236 inst[3] = __CONST(0, RC_SWIZZLE_ZERO); 237} 238 239static void ei_lit(struct r300_vertex_program_code *vp, 240 struct rc_sub_instruction *vpi, 241 unsigned int * inst) 242{ 243 //LIT TMP 1.Y Z TMP 1{} {X W Z Y} TMP 1{} {Y W Z X} TMP 1{} {Y X Z W} 244 245 inst[0] = PVS_OP_DST_OPERAND(ME_LIGHT_COEFF_DX, 246 1, 247 0, 248 t_dst_index(vp, &vpi->DstReg), 249 t_dst_mask(vpi->DstReg.WriteMask), 250 t_dst_class(vpi->DstReg.File), 251 vpi->SaturateMode == RC_SATURATE_ZERO_ONE); 252 /* NOTE: Users swizzling might not work. */ 253 inst[1] = PVS_SRC_OPERAND(t_src_index(vp, &vpi->SrcReg[0]), t_swizzle(GET_SWZ(vpi->SrcReg[0].Swizzle, 0)), // X 254 t_swizzle(GET_SWZ(vpi->SrcReg[0].Swizzle, 3)), // W 255 PVS_SRC_SELECT_FORCE_0, // Z 256 t_swizzle(GET_SWZ(vpi->SrcReg[0].Swizzle, 1)), // Y 257 t_src_class(vpi->SrcReg[0].File), 258 vpi->SrcReg[0].Negate ? RC_MASK_XYZW : RC_MASK_NONE) | 259 (vpi->SrcReg[0].RelAddr << 4); 260 inst[2] = PVS_SRC_OPERAND(t_src_index(vp, &vpi->SrcReg[0]), t_swizzle(GET_SWZ(vpi->SrcReg[0].Swizzle, 1)), // Y 261 t_swizzle(GET_SWZ(vpi->SrcReg[0].Swizzle, 3)), // W 262 PVS_SRC_SELECT_FORCE_0, // Z 263 t_swizzle(GET_SWZ(vpi->SrcReg[0].Swizzle, 0)), // X 264 t_src_class(vpi->SrcReg[0].File), 265 vpi->SrcReg[0].Negate ? RC_MASK_XYZW : RC_MASK_NONE) | 266 (vpi->SrcReg[0].RelAddr << 4); 267 inst[3] = PVS_SRC_OPERAND(t_src_index(vp, &vpi->SrcReg[0]), t_swizzle(GET_SWZ(vpi->SrcReg[0].Swizzle, 1)), // Y 268 t_swizzle(GET_SWZ(vpi->SrcReg[0].Swizzle, 0)), // X 269 PVS_SRC_SELECT_FORCE_0, // Z 270 t_swizzle(GET_SWZ(vpi->SrcReg[0].Swizzle, 3)), // W 271 t_src_class(vpi->SrcReg[0].File), 272 vpi->SrcReg[0].Negate ? RC_MASK_XYZW : RC_MASK_NONE) | 273 (vpi->SrcReg[0].RelAddr << 4); 274} 275 276static void ei_mad(struct r300_vertex_program_code *vp, 277 struct rc_sub_instruction *vpi, 278 unsigned int * inst) 279{ 280 unsigned int i; 281 /* Remarks about hardware limitations of MAD 282 * (please preserve this comment, as this information is _NOT_ 283 * in the documentation provided by AMD). 284 * 285 * As described in the documentation, MAD with three unique temporary 286 * source registers requires the use of the macro version. 287 * 288 * However (and this is not mentioned in the documentation), apparently 289 * the macro version is _NOT_ a full superset of the normal version. 290 * In particular, the macro version does not always work when relative 291 * addressing is used in the source operands. 292 * 293 * This limitation caused incorrect rendering in Sauerbraten's OpenGL 294 * assembly shader path when using medium quality animations 295 * (i.e. animations with matrix blending instead of quaternion blending). 296 * 297 * Unfortunately, I (nha) have been unable to extract a Piglit regression 298 * test for this issue - for some reason, it is possible to have vertex 299 * programs whose prefix is *exactly* the same as the prefix of the 300 * offending program in Sauerbraten up to the offending instruction 301 * without causing any trouble. 302 * 303 * Bottom line: Only use the macro version only when really necessary; 304 * according to AMD docs, this should improve performance by one clock 305 * as a nice side bonus. 306 */ 307 if (vpi->SrcReg[0].File == RC_FILE_TEMPORARY && 308 vpi->SrcReg[1].File == RC_FILE_TEMPORARY && 309 vpi->SrcReg[2].File == RC_FILE_TEMPORARY && 310 vpi->SrcReg[0].Index != vpi->SrcReg[1].Index && 311 vpi->SrcReg[0].Index != vpi->SrcReg[2].Index && 312 vpi->SrcReg[1].Index != vpi->SrcReg[2].Index) { 313 inst[0] = PVS_OP_DST_OPERAND(PVS_MACRO_OP_2CLK_MADD, 314 0, 315 1, 316 t_dst_index(vp, &vpi->DstReg), 317 t_dst_mask(vpi->DstReg.WriteMask), 318 t_dst_class(vpi->DstReg.File), 319 vpi->SaturateMode == RC_SATURATE_ZERO_ONE); 320 } else { 321 inst[0] = PVS_OP_DST_OPERAND(VE_MULTIPLY_ADD, 322 0, 323 0, 324 t_dst_index(vp, &vpi->DstReg), 325 t_dst_mask(vpi->DstReg.WriteMask), 326 t_dst_class(vpi->DstReg.File), 327 vpi->SaturateMode == RC_SATURATE_ZERO_ONE); 328 329 /* Arguments with constant swizzles still count as a unique 330 * temporary, so we should make sure these arguments share a 331 * register index with one of the other arguments. */ 332 for (i = 0; i < 3; i++) { 333 unsigned int j; 334 if (vpi->SrcReg[i].File != RC_FILE_NONE) 335 continue; 336 337 for (j = 0; j < 3; j++) { 338 if (i != j) { 339 vpi->SrcReg[i].Index = 340 vpi->SrcReg[j].Index; 341 break; 342 } 343 } 344 } 345 } 346 inst[1] = t_src(vp, &vpi->SrcReg[0]); 347 inst[2] = t_src(vp, &vpi->SrcReg[1]); 348 inst[3] = t_src(vp, &vpi->SrcReg[2]); 349} 350 351static void ei_pow(struct r300_vertex_program_code *vp, 352 struct rc_sub_instruction *vpi, 353 unsigned int * inst) 354{ 355 inst[0] = PVS_OP_DST_OPERAND(ME_POWER_FUNC_FF, 356 1, 357 0, 358 t_dst_index(vp, &vpi->DstReg), 359 t_dst_mask(vpi->DstReg.WriteMask), 360 t_dst_class(vpi->DstReg.File), 361 vpi->SaturateMode == RC_SATURATE_ZERO_ONE); 362 inst[1] = t_src_scalar(vp, &vpi->SrcReg[0]); 363 inst[2] = __CONST(0, RC_SWIZZLE_ZERO); 364 inst[3] = t_src_scalar(vp, &vpi->SrcReg[1]); 365} 366 367static void translate_vertex_program(struct radeon_compiler *c, void *user) 368{ 369 struct r300_vertex_program_compiler *compiler = (struct r300_vertex_program_compiler*)c; 370 struct rc_instruction *rci; 371 372 unsigned loops[R500_PVS_MAX_LOOP_DEPTH] = {}; 373 unsigned loop_depth = 0; 374 bool last_input_read_at_loop_end = false; 375 bool last_pos_write_at_loop_end = false; 376 377 compiler->code->pos_end = 0; /* Not supported yet */ 378 compiler->code->length = 0; 379 compiler->code->num_temporaries = 0; 380 compiler->code->last_input_read = 0; 381 compiler->code->last_pos_write = 0; 382 383 compiler->SetHwInputOutput(compiler); 384 385 for(rci = compiler->Base.Program.Instructions.Next; rci != &compiler->Base.Program.Instructions; rci = rci->Next) { 386 struct rc_sub_instruction *vpi = &rci->U.I; 387 unsigned int *inst = compiler->code->body.d + compiler->code->length; 388 const struct rc_opcode_info *info = rc_get_opcode_info(vpi->Opcode); 389 390 /* Skip instructions writing to non-existing destination */ 391 if (!valid_dst(compiler->code, &vpi->DstReg)) 392 continue; 393 394 if (info->HasDstReg) { 395 /* Neither is Saturate. */ 396 if (vpi->SaturateMode != RC_SATURATE_NONE && !c->is_r500) { 397 rc_error(&compiler->Base, "Vertex program does not support the Saturate " 398 "modifier (yet).\n"); 399 } 400 } 401 402 if (compiler->code->length >= c->max_alu_insts * 4) { 403 rc_error(&compiler->Base, "Vertex program has too many instructions\n"); 404 return; 405 } 406 407 assert(compiler->Base.is_r500 || 408 (vpi->Opcode != RC_OPCODE_SEQ && 409 vpi->Opcode != RC_OPCODE_SNE)); 410 411 switch (vpi->Opcode) { 412 case RC_OPCODE_ADD: ei_vector2(compiler->code, VE_ADD, vpi, inst); break; 413 case RC_OPCODE_ARL: ei_vector1(compiler->code, VE_FLT2FIX_DX, vpi, inst); break; 414 case RC_OPCODE_ARR: ei_vector1(compiler->code, VE_FLT2FIX_DX_RND, vpi, inst); break; 415 case RC_OPCODE_COS: ei_math1(compiler->code, ME_COS, vpi, inst); break; 416 case RC_OPCODE_DP4: ei_vector2(compiler->code, VE_DOT_PRODUCT, vpi, inst); break; 417 case RC_OPCODE_DST: ei_vector2(compiler->code, VE_DISTANCE_VECTOR, vpi, inst); break; 418 case RC_OPCODE_EX2: ei_math1(compiler->code, ME_EXP_BASE2_FULL_DX, vpi, inst); break; 419 case RC_OPCODE_EXP: ei_math1(compiler->code, ME_EXP_BASE2_DX, vpi, inst); break; 420 case RC_OPCODE_FRC: ei_vector1(compiler->code, VE_FRACTION, vpi, inst); break; 421 case RC_OPCODE_LG2: ei_math1(compiler->code, ME_LOG_BASE2_FULL_DX, vpi, inst); break; 422 case RC_OPCODE_LIT: ei_lit(compiler->code, vpi, inst); break; 423 case RC_OPCODE_LOG: ei_math1(compiler->code, ME_LOG_BASE2_DX, vpi, inst); break; 424 case RC_OPCODE_MAD: ei_mad(compiler->code, vpi, inst); break; 425 case RC_OPCODE_MAX: ei_vector2(compiler->code, VE_MAXIMUM, vpi, inst); break; 426 case RC_OPCODE_MIN: ei_vector2(compiler->code, VE_MINIMUM, vpi, inst); break; 427 case RC_OPCODE_MOV: ei_vector1(compiler->code, VE_ADD, vpi, inst); break; 428 case RC_OPCODE_MUL: ei_vector2(compiler->code, VE_MULTIPLY, vpi, inst); break; 429 case RC_OPCODE_POW: ei_pow(compiler->code, vpi, inst); break; 430 case RC_OPCODE_RCP: ei_math1(compiler->code, ME_RECIP_DX, vpi, inst); break; 431 case RC_OPCODE_RSQ: ei_math1(compiler->code, ME_RECIP_SQRT_DX, vpi, inst); break; 432 case RC_OPCODE_SEQ: ei_vector2(compiler->code, VE_SET_EQUAL, vpi, inst); break; 433 case RC_OPCODE_SGE: ei_vector2(compiler->code, VE_SET_GREATER_THAN_EQUAL, vpi, inst); break; 434 case RC_OPCODE_SIN: ei_math1(compiler->code, ME_SIN, vpi, inst); break; 435 case RC_OPCODE_SLT: ei_vector2(compiler->code, VE_SET_LESS_THAN, vpi, inst); break; 436 case RC_OPCODE_SNE: ei_vector2(compiler->code, VE_SET_NOT_EQUAL, vpi, inst); break; 437 case RC_OPCODE_BGNLOOP: 438 { 439 if ((!compiler->Base.is_r500 440 && loop_depth >= R300_VS_MAX_LOOP_DEPTH) 441 || loop_depth >= R500_PVS_MAX_LOOP_DEPTH) { 442 rc_error(&compiler->Base, 443 "Loops are nested too deep."); 444 return; 445 } 446 loops[loop_depth++] = ((compiler->code->length)/ 4) + 1; 447 break; 448 } 449 case RC_OPCODE_ENDLOOP: 450 { 451 unsigned int act_addr; 452 unsigned int last_addr; 453 unsigned int ret_addr; 454 455 if (loop_depth == 1 && last_input_read_at_loop_end) { 456 compiler->code->last_input_read = compiler->code->length / 4; 457 last_input_read_at_loop_end = false; 458 } 459 if (loop_depth == 1 && last_pos_write_at_loop_end) { 460 compiler->code->last_pos_write = compiler->code->length / 4; 461 last_pos_write_at_loop_end = false; 462 } 463 464 ret_addr = loops[--loop_depth]; 465 act_addr = ret_addr - 1; 466 last_addr = (compiler->code->length / 4) - 1; 467 468 if (loop_depth >= R300_VS_MAX_FC_OPS) { 469 rc_error(&compiler->Base, 470 "Too many flow control instructions."); 471 return; 472 } 473 if (compiler->Base.is_r500) { 474 compiler->code->fc_op_addrs.r500 475 [compiler->code->num_fc_ops].lw = 476 R500_PVS_FC_ACT_ADRS(act_addr) 477 | R500_PVS_FC_LOOP_CNT_JMP_INST(0x00ff) 478 ; 479 compiler->code->fc_op_addrs.r500 480 [compiler->code->num_fc_ops].uw = 481 R500_PVS_FC_LAST_INST(last_addr) 482 | R500_PVS_FC_RTN_INST(ret_addr) 483 ; 484 } else { 485 compiler->code->fc_op_addrs.r300 486 [compiler->code->num_fc_ops] = 487 R300_PVS_FC_ACT_ADRS(act_addr) 488 | R300_PVS_FC_LOOP_CNT_JMP_INST(0xff) 489 | R300_PVS_FC_LAST_INST(last_addr) 490 | R300_PVS_FC_RTN_INST(ret_addr) 491 ; 492 } 493 compiler->code->fc_loop_index[compiler->code->num_fc_ops] = 494 R300_PVS_FC_LOOP_INIT_VAL(0x0) 495 | R300_PVS_FC_LOOP_STEP_VAL(0x1) 496 ; 497 compiler->code->fc_ops |= R300_VAP_PVS_FC_OPC_LOOP( 498 compiler->code->num_fc_ops); 499 compiler->code->num_fc_ops++; 500 501 break; 502 } 503 504 case RC_ME_PRED_SET_CLR: 505 ei_math1(compiler->code, ME_PRED_SET_CLR, vpi, inst); 506 break; 507 508 case RC_ME_PRED_SET_INV: 509 ei_math1(compiler->code, ME_PRED_SET_INV, vpi, inst); 510 break; 511 512 case RC_ME_PRED_SET_POP: 513 ei_math1(compiler->code, ME_PRED_SET_POP, vpi, inst); 514 break; 515 516 case RC_ME_PRED_SET_RESTORE: 517 ei_math1(compiler->code, ME_PRED_SET_RESTORE, vpi, inst); 518 break; 519 520 case RC_ME_PRED_SEQ: 521 ei_math1(compiler->code, ME_PRED_SET_EQ, vpi, inst); 522 break; 523 524 case RC_ME_PRED_SNEQ: 525 ei_math1(compiler->code, ME_PRED_SET_NEQ, vpi, inst); 526 break; 527 528 case RC_VE_PRED_SNEQ_PUSH: 529 ei_vector2(compiler->code, VE_PRED_SET_NEQ_PUSH, 530 vpi, inst); 531 break; 532 533 default: 534 rc_error(&compiler->Base, "Unknown opcode %s\n", info->Name); 535 return; 536 } 537 538 if (vpi->DstReg.Pred != RC_PRED_DISABLED) { 539 inst[0] |= (PVS_DST_PRED_ENABLE_MASK 540 << PVS_DST_PRED_ENABLE_SHIFT); 541 if (vpi->DstReg.Pred == RC_PRED_SET) { 542 inst[0] |= (PVS_DST_PRED_SENSE_MASK 543 << PVS_DST_PRED_SENSE_SHIFT); 544 } 545 } 546 547 /* Update the number of temporaries. */ 548 if (info->HasDstReg && vpi->DstReg.File == RC_FILE_TEMPORARY && 549 vpi->DstReg.Index >= compiler->code->num_temporaries) 550 compiler->code->num_temporaries = vpi->DstReg.Index + 1; 551 552 /* last instruction that writes position */ 553 if (info->HasDstReg && vpi->DstReg.File == RC_FILE_OUTPUT && 554 t_dst_index(compiler->code, &vpi->DstReg) == 0) { 555 if (loop_depth == 0) 556 compiler->code->last_pos_write = compiler->code->length / 4; 557 else 558 last_pos_write_at_loop_end = true; 559 } 560 561 for (unsigned i = 0; i < info->NumSrcRegs; i++) { 562 if (vpi->SrcReg[i].File == RC_FILE_TEMPORARY && 563 vpi->SrcReg[i].Index >= compiler->code->num_temporaries) 564 compiler->code->num_temporaries = vpi->SrcReg[i].Index + 1; 565 if (vpi->SrcReg[i].File == RC_FILE_INPUT) { 566 if (loop_depth == 0) 567 compiler->code->last_input_read = compiler->code->length / 4; 568 else 569 last_input_read_at_loop_end = true; 570 } 571 572 } 573 574 575 if (compiler->code->num_temporaries > compiler->Base.max_temp_regs) { 576 rc_error(&compiler->Base, "Too many temporaries.\n"); 577 return; 578 } 579 580 compiler->code->length += 4; 581 582 if (compiler->Base.Error) 583 return; 584 } 585} 586 587struct temporary_allocation { 588 unsigned int Allocated:1; 589 unsigned int HwTemp:15; 590 struct rc_instruction * LastRead; 591}; 592 593static int get_reg(struct radeon_compiler *c, struct temporary_allocation *ta, bool *hwtemps, 594 unsigned int orig) 595{ 596 if (!ta[orig].Allocated) { 597 int j; 598 for (j = 0; j < c->max_temp_regs; ++j) 599 { 600 if (!hwtemps[j]) 601 break; 602 } 603 ta[orig].Allocated = 1; 604 ta[orig].HwTemp = j; 605 hwtemps[ta[orig].HwTemp] = true; 606 } 607 608 return ta[orig].HwTemp; 609} 610 611static void allocate_temporary_registers(struct radeon_compiler *c, void *user) 612{ 613 struct r300_vertex_program_compiler *compiler = (struct r300_vertex_program_compiler*)c; 614 struct rc_instruction *inst; 615 struct rc_instruction *end_loop = NULL; 616 unsigned int num_orig_temps = 0; 617 bool hwtemps[RC_REGISTER_MAX_INDEX]; 618 struct temporary_allocation * ta; 619 unsigned int i; 620 621 memset(hwtemps, 0, sizeof(hwtemps)); 622 623 rc_recompute_ips(c); 624 625 /* Pass 1: Count original temporaries. */ 626 for(inst = compiler->Base.Program.Instructions.Next; inst != &compiler->Base.Program.Instructions; inst = inst->Next) { 627 const struct rc_opcode_info * opcode = rc_get_opcode_info(inst->U.I.Opcode); 628 629 for (i = 0; i < opcode->NumSrcRegs; ++i) { 630 if (inst->U.I.SrcReg[i].File == RC_FILE_TEMPORARY) { 631 if (inst->U.I.SrcReg[i].Index >= num_orig_temps) 632 num_orig_temps = inst->U.I.SrcReg[i].Index + 1; 633 } 634 } 635 636 if (opcode->HasDstReg) { 637 if (inst->U.I.DstReg.File == RC_FILE_TEMPORARY) { 638 if (inst->U.I.DstReg.Index >= num_orig_temps) 639 num_orig_temps = inst->U.I.DstReg.Index + 1; 640 } 641 } 642 } 643 644 ta = (struct temporary_allocation*)memory_pool_malloc(&compiler->Base.Pool, 645 sizeof(struct temporary_allocation) * num_orig_temps); 646 memset(ta, 0, sizeof(struct temporary_allocation) * num_orig_temps); 647 648 /* Pass 2: Determine original temporary lifetimes */ 649 for(inst = compiler->Base.Program.Instructions.Next; inst != &compiler->Base.Program.Instructions; inst = inst->Next) { 650 const struct rc_opcode_info * opcode = rc_get_opcode_info(inst->U.I.Opcode); 651 /* Instructions inside of loops need to use the ENDLOOP 652 * instruction as their LastRead. */ 653 if (!end_loop && inst->U.I.Opcode == RC_OPCODE_BGNLOOP) 654 end_loop = rc_match_bgnloop(inst); 655 656 if (inst == end_loop) { 657 end_loop = NULL; 658 continue; 659 } 660 661 for (i = 0; i < opcode->NumSrcRegs; ++i) { 662 if (inst->U.I.SrcReg[i].File == RC_FILE_TEMPORARY) { 663 ta[inst->U.I.SrcReg[i].Index].LastRead = end_loop ? end_loop : inst; 664 } 665 } 666 } 667 668 /* Pass 3: Register allocation */ 669 for(inst = compiler->Base.Program.Instructions.Next; inst != &compiler->Base.Program.Instructions; inst = inst->Next) { 670 const struct rc_opcode_info * opcode = rc_get_opcode_info(inst->U.I.Opcode); 671 672 for (i = 0; i < opcode->NumSrcRegs; ++i) { 673 if (inst->U.I.SrcReg[i].File == RC_FILE_TEMPORARY) { 674 unsigned int orig = inst->U.I.SrcReg[i].Index; 675 inst->U.I.SrcReg[i].Index = get_reg(c, ta, hwtemps, orig); 676 677 if (ta[orig].Allocated && inst == ta[orig].LastRead) 678 hwtemps[ta[orig].HwTemp] = false; 679 } 680 } 681 682 if (opcode->HasDstReg) { 683 if (inst->U.I.DstReg.File == RC_FILE_TEMPORARY) { 684 unsigned int orig = inst->U.I.DstReg.Index; 685 inst->U.I.DstReg.Index = get_reg(c, ta, hwtemps, orig); 686 } 687 } 688 } 689} 690 691/** 692 * R3xx-R4xx vertex engine does not support the Absolute source operand modifier 693 * and the Saturate opcode modifier. Only Absolute is currently transformed. 694 */ 695static int transform_nonnative_modifiers( 696 struct radeon_compiler *c, 697 struct rc_instruction *inst, 698 void* unused) 699{ 700 const struct rc_opcode_info *opcode = rc_get_opcode_info(inst->U.I.Opcode); 701 unsigned i; 702 703 /* Transform ABS(a) to MAX(a, -a). */ 704 for (i = 0; i < opcode->NumSrcRegs; i++) { 705 if (inst->U.I.SrcReg[i].Abs) { 706 struct rc_instruction *new_inst; 707 unsigned temp; 708 709 inst->U.I.SrcReg[i].Abs = 0; 710 711 temp = rc_find_free_temporary(c); 712 713 new_inst = rc_insert_new_instruction(c, inst->Prev); 714 new_inst->U.I.Opcode = RC_OPCODE_MAX; 715 new_inst->U.I.DstReg.File = RC_FILE_TEMPORARY; 716 new_inst->U.I.DstReg.Index = temp; 717 new_inst->U.I.SrcReg[0] = inst->U.I.SrcReg[i]; 718 new_inst->U.I.SrcReg[0].Swizzle = RC_SWIZZLE_XYZW; 719 new_inst->U.I.SrcReg[1] = inst->U.I.SrcReg[i]; 720 new_inst->U.I.SrcReg[1].Swizzle = RC_SWIZZLE_XYZW; 721 new_inst->U.I.SrcReg[1].Negate ^= RC_MASK_XYZW; 722 723 inst->U.I.SrcReg[i].File = RC_FILE_TEMPORARY; 724 inst->U.I.SrcReg[i].Index = temp; 725 inst->U.I.SrcReg[i].RelAddr = 0; 726 } 727 } 728 return 1; 729} 730 731/** 732 * Vertex engine cannot read two inputs or two constants at the same time. 733 * Introduce intermediate MOVs to temporary registers to account for this. 734 */ 735static int transform_source_conflicts( 736 struct radeon_compiler *c, 737 struct rc_instruction* inst, 738 void* unused) 739{ 740 const struct rc_opcode_info * opcode = rc_get_opcode_info(inst->U.I.Opcode); 741 742 if (opcode->NumSrcRegs == 3) { 743 if (t_src_conflict(inst->U.I.SrcReg[1], inst->U.I.SrcReg[2]) 744 || t_src_conflict(inst->U.I.SrcReg[0], inst->U.I.SrcReg[2])) { 745 int tmpreg = rc_find_free_temporary(c); 746 struct rc_instruction * inst_mov = rc_insert_new_instruction(c, inst->Prev); 747 inst_mov->U.I.Opcode = RC_OPCODE_MOV; 748 inst_mov->U.I.DstReg.File = RC_FILE_TEMPORARY; 749 inst_mov->U.I.DstReg.Index = tmpreg; 750 inst_mov->U.I.SrcReg[0] = inst->U.I.SrcReg[2]; 751 inst_mov->U.I.SrcReg[0].Swizzle = RC_SWIZZLE_XYZW; 752 inst_mov->U.I.SrcReg[0].Negate = 0; 753 inst_mov->U.I.SrcReg[0].Abs = 0; 754 755 inst->U.I.SrcReg[2].File = RC_FILE_TEMPORARY; 756 inst->U.I.SrcReg[2].Index = tmpreg; 757 inst->U.I.SrcReg[2].RelAddr = false; 758 } 759 } 760 761 if (opcode->NumSrcRegs >= 2) { 762 if (t_src_conflict(inst->U.I.SrcReg[1], inst->U.I.SrcReg[0])) { 763 int tmpreg = rc_find_free_temporary(c); 764 struct rc_instruction * inst_mov = rc_insert_new_instruction(c, inst->Prev); 765 inst_mov->U.I.Opcode = RC_OPCODE_MOV; 766 inst_mov->U.I.DstReg.File = RC_FILE_TEMPORARY; 767 inst_mov->U.I.DstReg.Index = tmpreg; 768 inst_mov->U.I.SrcReg[0] = inst->U.I.SrcReg[1]; 769 inst_mov->U.I.SrcReg[0].Swizzle = RC_SWIZZLE_XYZW; 770 inst_mov->U.I.SrcReg[0].Negate = 0; 771 inst_mov->U.I.SrcReg[0].Abs = 0; 772 773 inst->U.I.SrcReg[1].File = RC_FILE_TEMPORARY; 774 inst->U.I.SrcReg[1].Index = tmpreg; 775 inst->U.I.SrcReg[1].RelAddr = false; 776 } 777 } 778 779 return 1; 780} 781 782static void rc_vs_add_artificial_outputs(struct radeon_compiler *c, void *user) 783{ 784 struct r300_vertex_program_compiler * compiler = (struct r300_vertex_program_compiler*)c; 785 int i; 786 787 for(i = 0; i < 32; ++i) { 788 if ((compiler->RequiredOutputs & (1U << i)) && 789 !(compiler->Base.Program.OutputsWritten & (1U << i))) { 790 struct rc_instruction * inst = rc_insert_new_instruction(&compiler->Base, compiler->Base.Program.Instructions.Prev); 791 inst->U.I.Opcode = RC_OPCODE_MOV; 792 793 inst->U.I.DstReg.File = RC_FILE_OUTPUT; 794 inst->U.I.DstReg.Index = i; 795 inst->U.I.DstReg.WriteMask = RC_MASK_XYZW; 796 797 inst->U.I.SrcReg[0].File = RC_FILE_CONSTANT; 798 inst->U.I.SrcReg[0].Index = 0; 799 inst->U.I.SrcReg[0].Swizzle = RC_SWIZZLE_XYZW; 800 801 compiler->Base.Program.OutputsWritten |= 1U << i; 802 } 803 } 804} 805 806static int swizzle_is_native(rc_opcode opcode, struct rc_src_register reg) 807{ 808 (void) opcode; 809 (void) reg; 810 811 return 1; 812} 813 814static void transform_negative_addressing(struct r300_vertex_program_compiler *c, 815 struct rc_instruction *arl, 816 struct rc_instruction *end, 817 int min_offset) 818{ 819 struct rc_instruction *inst, *add; 820 unsigned const_swizzle; 821 822 /* Transform ARL/ARR */ 823 add = rc_insert_new_instruction(&c->Base, arl->Prev); 824 add->U.I.Opcode = RC_OPCODE_ADD; 825 add->U.I.DstReg.File = RC_FILE_TEMPORARY; 826 add->U.I.DstReg.Index = rc_find_free_temporary(&c->Base); 827 add->U.I.DstReg.WriteMask = RC_MASK_X; 828 add->U.I.SrcReg[0] = arl->U.I.SrcReg[0]; 829 add->U.I.SrcReg[1].File = RC_FILE_CONSTANT; 830 add->U.I.SrcReg[1].Index = rc_constants_add_immediate_scalar(&c->Base.Program.Constants, 831 min_offset, &const_swizzle); 832 add->U.I.SrcReg[1].Swizzle = const_swizzle; 833 834 arl->U.I.SrcReg[0].File = RC_FILE_TEMPORARY; 835 arl->U.I.SrcReg[0].Index = add->U.I.DstReg.Index; 836 arl->U.I.SrcReg[0].Swizzle = RC_SWIZZLE_XXXX; 837 838 /* Rewrite offsets up to and excluding inst. */ 839 for (inst = arl->Next; inst != end; inst = inst->Next) { 840 const struct rc_opcode_info * opcode = rc_get_opcode_info(inst->U.I.Opcode); 841 842 for (unsigned i = 0; i < opcode->NumSrcRegs; i++) 843 if (inst->U.I.SrcReg[i].RelAddr) 844 inst->U.I.SrcReg[i].Index -= min_offset; 845 } 846} 847 848static void rc_emulate_negative_addressing(struct radeon_compiler *compiler, void *user) 849{ 850 struct r300_vertex_program_compiler * c = (struct r300_vertex_program_compiler*)compiler; 851 struct rc_instruction *inst, *lastARL = NULL; 852 int min_offset = 0; 853 854 for (inst = c->Base.Program.Instructions.Next; inst != &c->Base.Program.Instructions; inst = inst->Next) { 855 const struct rc_opcode_info * opcode = rc_get_opcode_info(inst->U.I.Opcode); 856 857 if (inst->U.I.Opcode == RC_OPCODE_ARL || inst->U.I.Opcode == RC_OPCODE_ARR) { 858 if (lastARL != NULL && min_offset < 0) 859 transform_negative_addressing(c, lastARL, inst, min_offset); 860 861 lastARL = inst; 862 min_offset = 0; 863 continue; 864 } 865 866 for (unsigned i = 0; i < opcode->NumSrcRegs; i++) { 867 if (inst->U.I.SrcReg[i].RelAddr && 868 inst->U.I.SrcReg[i].Index < 0) { 869 /* ARL must precede any indirect addressing. */ 870 if (!lastARL) { 871 rc_error(&c->Base, "Vertex shader: Found relative addressing without ARL/ARR."); 872 return; 873 } 874 875 if (inst->U.I.SrcReg[i].Index < min_offset) 876 min_offset = inst->U.I.SrcReg[i].Index; 877 } 878 } 879 } 880 881 if (lastARL != NULL && min_offset < 0) 882 transform_negative_addressing(c, lastARL, inst, min_offset); 883} 884 885const struct rc_swizzle_caps r300_vertprog_swizzle_caps = { 886 .IsNative = &swizzle_is_native, 887 .Split = NULL /* should never be called */ 888}; 889 890void r3xx_compile_vertex_program(struct r300_vertex_program_compiler *c) 891{ 892 int is_r500 = c->Base.is_r500; 893 int opt = !c->Base.disable_optimizations; 894 895 /* Lists of instruction transformations. */ 896 struct radeon_program_transformation alu_rewrite_r500[] = { 897 { &r300_transform_vertex_alu, NULL }, 898 { &r300_transform_trig_scale_vertex, NULL }, 899 { NULL, NULL } 900 }; 901 902 struct radeon_program_transformation alu_rewrite_r300[] = { 903 { &r300_transform_vertex_alu, NULL }, 904 { &r300_transform_trig_simple, NULL }, 905 { NULL, NULL } 906 }; 907 908 /* Note: These passes have to be done seperately from ALU rewrite, 909 * otherwise non-native ALU instructions with source conflits 910 * or non-native modifiers will not be treated properly. 911 */ 912 struct radeon_program_transformation emulate_modifiers[] = { 913 { &transform_nonnative_modifiers, NULL }, 914 { NULL, NULL } 915 }; 916 917 struct radeon_program_transformation resolve_src_conflicts[] = { 918 { &transform_source_conflicts, NULL }, 919 { NULL, NULL } 920 }; 921 922 /* List of compiler passes. */ 923 struct radeon_compiler_pass vs_list[] = { 924 /* NAME DUMP PREDICATE FUNCTION PARAM */ 925 {"add artificial outputs", 0, 1, rc_vs_add_artificial_outputs, NULL}, 926 {"emulate branches", 1, !is_r500, rc_emulate_branches, NULL}, 927 {"emulate negative addressing", 1, 1, rc_emulate_negative_addressing, NULL}, 928 {"native rewrite", 1, is_r500, rc_local_transform, alu_rewrite_r500}, 929 {"native rewrite", 1, !is_r500, rc_local_transform, alu_rewrite_r300}, 930 {"emulate modifiers", 1, !is_r500, rc_local_transform, emulate_modifiers}, 931 {"deadcode", 1, opt, rc_dataflow_deadcode, NULL}, 932 {"dataflow optimize", 1, opt, rc_optimize, NULL}, 933 /* This pass must be done after optimizations. */ 934 {"source conflict resolve", 1, 1, rc_local_transform, resolve_src_conflicts}, 935 {"register allocation", 1, opt, allocate_temporary_registers, NULL}, 936 {"dead constants", 1, 1, rc_remove_unused_constants, &c->code->constants_remap_table}, 937 {"lower control flow opcodes", 1, is_r500, rc_vert_fc, NULL}, 938 {"final code validation", 0, 1, rc_validate_final_shader, NULL}, 939 {"machine code generation", 0, 1, translate_vertex_program, NULL}, 940 {"dump machine code", 0, c->Base.Debug & RC_DBG_LOG, r300_vertex_program_dump, NULL}, 941 {NULL, 0, 0, NULL, NULL} 942 }; 943 944 c->Base.type = RC_VERTEX_PROGRAM; 945 c->Base.SwizzleCaps = &r300_vertprog_swizzle_caps; 946 947 rc_run_compiler(&c->Base, vs_list); 948 949 c->code->InputsRead = c->Base.Program.InputsRead; 950 c->code->OutputsWritten = c->Base.Program.OutputsWritten; 951 rc_constants_copy(&c->code->constants, &c->Base.Program.Constants); 952} 953