/*
 * Copyright 2011 Christoph Bumiller
 *           2014 Red Hat Inc.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
 * OTHER DEALINGS IN THE SOFTWARE.
 */

#include "nv50_ir.h"
#include "nv50_ir_build_util.h"

#include "nv50_ir_target_nvc0.h"
#include "nv50_ir_lowering_gm107.h"

#include <limits>

namespace nv50_ir {

#define QOP_ADD  0
#define QOP_SUBR 1
#define QOP_SUB  2
#define QOP_MOV2 3

// UL UR LL LR
#define QUADOP(q, r, s, t)            \
   ((QOP_##q << 6) | (QOP_##r << 4) | \
    (QOP_##s << 2) | (QOP_##t << 0))

#define SHFL_BOUND_QUAD 0x1c03

void
GM107LegalizeSSA::handlePFETCH(Instruction *i)
{
   Value *src0;

   if (i->src(0).getFile() == FILE_GPR && !i->srcExists(1))
      return;

   bld.setPosition(i, false);
   src0 = bld.getSSA();

   if (i->srcExists(1))
      bld.mkOp2(OP_ADD, TYPE_U32, src0, i->getSrc(0), i->getSrc(1));
   else
      bld.mkOp1(OP_MOV, TYPE_U32, src0, i->getSrc(0));

   i->setSrc(0, src0);
   i->setSrc(1, NULL);
}

void
GM107LegalizeSSA::handleLOAD(Instruction *i)
{
   if (i->src(0).getFile() != FILE_MEMORY_CONST)
      return;
   if (i->src(0).isIndirect(0))
      return;
   if (typeSizeof(i->dType) != 4)
      return;

   i->op = OP_MOV;
}

void
GM107LegalizeSSA::handleQUADON(Instruction *i)
{
   i->setDef(0, NULL);
}

void
GM107LegalizeSSA::handleQUADPOP(Instruction *i)
{
   i->setSrc(0, NULL);
}

bool
GM107LegalizeSSA::visit(Instruction *i)
{
   switch (i->op) {
   case OP_QUADON:
      handleQUADON(i);
      break;
   case OP_QUADPOP:
      handleQUADPOP(i);
      break;
   case OP_PFETCH:
      handlePFETCH(i);
      break;
   case OP_LOAD:
      handleLOAD(i);
      break;
   default:
      break;
   }
   return true;
}

bool
GM107LoweringPass::handleManualTXD(TexInstruction *i)
{
   // See NVC0LoweringPass::handleManualTXD for rationale. This function
   // implements the same logic, but using SM50-friendly primitives.
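   //
   // For each lane l of the quad: broadcast lane l's coordinates and
   // derivatives to all four lanes with SHFL (bounded to the quad), apply
   // the derivatives with QUADOP, run the texture op, then shuffle lane 0's
   // results back so every lane holds lane l's values; the per-lane results
   // are finally collected with fixed MOVs and an OP_UNION per component.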
   static const uint8_t qOps[2] =
      { QUADOP(MOV2, ADD, MOV2, ADD), QUADOP(MOV2, MOV2, ADD, ADD) };
   Value *def[4][4];
   Value *crd[3], *arr, *shadow;
   Value *tmp;
   Instruction *tex, *add;
   Value *quad = bld.mkImm(SHFL_BOUND_QUAD);
   int l, c;
   const int dim = i->tex.target.getDim() + i->tex.target.isCube();
   const int array = i->tex.target.isArray();
   const int indirect = i->tex.rIndirectSrc >= 0;

   i->op = OP_TEX; // no need to clone dPdx/dPdy later

   for (c = 0; c < dim; ++c)
      crd[c] = bld.getScratch();
   arr = bld.getScratch();
   shadow = bld.getScratch();
   tmp = bld.getScratch();

   for (l = 0; l < 4; ++l) {
      Value *bar = bld.getSSA(4, FILE_BARRIER);
      Value *src[3], *val;
      Value *lane = bld.mkImm(l);
      bld.mkOp(OP_QUADON, TYPE_U32, bar);
      // Make sure lane 0 has the appropriate array/depth compare values
      if (l != 0) {
         if (array)
            bld.mkOp3(OP_SHFL, TYPE_F32, arr, i->getSrc(0), lane, quad);
         if (i->tex.target.isShadow())
            bld.mkOp3(OP_SHFL, TYPE_F32, shadow,
                      i->getSrc(array + dim + indirect), lane, quad);
      }

      // mov coordinates from lane l to all lanes
      for (c = 0; c < dim; ++c) {
         bld.mkOp3(OP_SHFL, TYPE_F32, crd[c], i->getSrc(c + array), lane, quad);
      }

      // add dPdx from lane l to lanes dx
      for (c = 0; c < dim; ++c) {
         bld.mkOp3(OP_SHFL, TYPE_F32, tmp, i->dPdx[c].get(), lane, quad);
         add = bld.mkOp2(OP_QUADOP, TYPE_F32, crd[c], tmp, crd[c]);
         add->subOp = qOps[0];
         add->lanes = 1; /* abused for .ndv */
      }

      // add dPdy from lane l to lanes dy
      for (c = 0; c < dim; ++c) {
         bld.mkOp3(OP_SHFL, TYPE_F32, tmp, i->dPdy[c].get(), lane, quad);
         add = bld.mkOp2(OP_QUADOP, TYPE_F32, crd[c], tmp, crd[c]);
         add->subOp = qOps[1];
         add->lanes = 1; /* abused for .ndv */
      }

      // normalize cube coordinates if necessary
      if (i->tex.target.isCube()) {
         for (c = 0; c < 3; ++c)
            src[c] = bld.mkOp1v(OP_ABS, TYPE_F32, bld.getSSA(), crd[c]);
         val = bld.getScratch();
         bld.mkOp2(OP_MAX, TYPE_F32, val, src[0], src[1]);
         bld.mkOp2(OP_MAX, TYPE_F32, val, src[2], val);
         bld.mkOp1(OP_RCP, TYPE_F32, val, val);
         for (c = 0; c < 3; ++c)
            src[c] = bld.mkOp2v(OP_MUL, TYPE_F32, bld.getSSA(), crd[c], val);
      } else {
         for (c = 0; c < dim; ++c)
            src[c] = crd[c];
      }

      // texture
      bld.insert(tex = cloneForward(func, i));
      if (l != 0) {
         if (array)
            tex->setSrc(0, arr);
         if (i->tex.target.isShadow())
            tex->setSrc(array + dim + indirect, shadow);
      }
      for (c = 0; c < dim; ++c)
         tex->setSrc(c + array, src[c]);
      // broadcast results from lane 0 to all lanes
      if (l != 0)
         for (c = 0; i->defExists(c); ++c)
            bld.mkOp3(OP_SHFL, TYPE_F32, tex->getDef(c), tex->getDef(c),
                      bld.mkImm(0), quad);
      bld.mkOp1(OP_QUADPOP, TYPE_U32, NULL, bar)->fixed = 1;

      // save results
      for (c = 0; i->defExists(c); ++c) {
         Instruction *mov;
         def[c][l] = bld.getSSA();
         mov = bld.mkMov(def[c][l], tex->getDef(c));
         mov->fixed = 1;
         mov->lanes = 1 << l;
      }
   }

   for (c = 0; i->defExists(c); ++c) {
      Instruction *u = bld.mkOp(OP_UNION, TYPE_U32, i->getDef(c));
      for (l = 0; l < 4; ++l)
         u->setSrc(l, def[c][l]);
   }

   i->bb->remove(i);
   return true;
}

bool
GM107LoweringPass::handleDFDX(Instruction *insn)
{
   Instruction *shfl;
   int qop = 0, xid = 0;

   switch (insn->op) {
   case OP_DFDX:
      qop = QUADOP(SUB, SUBR, SUB, SUBR);
      xid = 1;
      break;
   case OP_DFDY:
      qop = QUADOP(SUB, SUB, SUBR, SUBR);
      xid = 2;
      break;
   default:
      assert(!"invalid dfdx opcode");
      break;
   }

   shfl = bld.mkOp3(OP_SHFL, TYPE_F32, bld.getScratch(), insn->getSrc(0),
                    bld.mkImm(xid), bld.mkImm(SHFL_BOUND_QUAD));
   shfl->subOp = NV50_IR_SUBOP_SHFL_BFLY;
   insn->op = OP_QUADOP;
   insn->subOp = qop;
   insn->lanes = 0; /* abused for !.ndv */
   insn->setSrc(1, insn->getSrc(0));
   insn->setSrc(0, shfl->getDef(0));
   return true;
}

bool
GM107LoweringPass::handlePFETCH(Instruction *i)
{
   Value *tmp0 = bld.getScratch();
   Value *tmp1 = bld.getScratch();
   Value *tmp2 = bld.getScratch();
   bld.mkOp1(OP_RDSV, TYPE_U32, tmp0, bld.mkSysVal(SV_INVOCATION_INFO, 0));
   bld.mkOp3(OP_PERMT, TYPE_U32, tmp1, tmp0, bld.mkImm(0x4442), bld.mkImm(0));
   bld.mkOp3(OP_PERMT, TYPE_U32, tmp0, tmp0, bld.mkImm(0x4440), bld.mkImm(0));
   if (i->getSrc(1))
      bld.mkOp2(OP_ADD, TYPE_U32, tmp2, i->getSrc(0), i->getSrc(1));
   else
      bld.mkOp1(OP_MOV, TYPE_U32, tmp2, i->getSrc(0));
   bld.mkOp3(OP_MAD, TYPE_U32, tmp0, tmp0, tmp1, tmp2);
   i->setSrc(0, tmp0);
   i->setSrc(1, NULL);
   return true;
}

bool
GM107LoweringPass::handlePOPCNT(Instruction *i)
{
   Value *tmp = bld.mkOp2v(OP_AND, i->sType, bld.getScratch(),
                           i->getSrc(0), i->getSrc(1));
   i->setSrc(0, tmp);
   i->setSrc(1, NULL);
   return true;
}

bool
GM107LoweringPass::handleSUQ(TexInstruction *suq)
{
   Value *ind = suq->getIndirectR();
   Value *handle;
   const int slot = suq->tex.r;
   const int mask = suq->tex.mask;

   if (suq->tex.bindless)
      handle = ind;
   else
      handle = loadTexHandle(ind, slot + 32);

   suq->tex.r = 0xff;
   suq->tex.s = 0x1f;

   suq->setIndirectR(NULL);
   suq->setSrc(0, handle);
   suq->tex.rIndirectSrc = 0;
   suq->setSrc(1, bld.loadImm(NULL, 0));
   suq->tex.query = TXQ_DIMS;
   suq->op = OP_TXQ;

   // We store CUBE / CUBE_ARRAY as a 2D ARRAY. Make sure that depth gets
   // divided by 6.
   if (mask & 0x4 && suq->tex.target.isCube()) {
      int d = util_bitcount(mask & 0x3);
      bld.setPosition(suq, true);
      bld.mkOp2(OP_DIV, TYPE_U32, suq->getDef(d), suq->getDef(d),
                bld.loadImm(NULL, 6));
   }

   // Samples come from a different query. If we want both samples and dims,
   // create a second suq.
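   // The original suq keeps the dims in defs 0..d-1 (TXQ_DIMS); the clone
   // returns only the sample count, fetched via TXQ_TYPE with mask 0x4.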
   if (mask & 0x8) {
      int d = util_bitcount(mask & 0x7);
      Value *dst = suq->getDef(d);
      TexInstruction *samples = suq;
      assert(dst);

      if (mask != 0x8) {
         suq->setDef(d, NULL);
         suq->tex.mask &= 0x7;
         samples = cloneShallow(func, suq);
         // Drop the dims defs from the clone; it only returns the samples.
         for (int i = 0; i < d; i++)
            samples->setDef(i, NULL);
         samples->setDef(0, dst);
         suq->bb->insertAfter(suq, samples);
      }
      samples->tex.mask = 0x4;
      samples->tex.query = TXQ_TYPE;
   }

   if (suq->tex.target.isMS()) {
      bld.setPosition(suq, true);

      if (mask & 0x1)
         bld.mkOp2(OP_SHR, TYPE_U32, suq->getDef(0), suq->getDef(0),
                   loadMsAdjInfo32(suq->tex.target, 0, slot, ind,
                                   suq->tex.bindless));
      if (mask & 0x2) {
         int d = util_bitcount(mask & 0x1);
         bld.mkOp2(OP_SHR, TYPE_U32, suq->getDef(d), suq->getDef(d),
                   loadMsAdjInfo32(suq->tex.target, 1, slot, ind,
                                   suq->tex.bindless));
      }
   }

   return true;
}

//
// - add quadop dance for texturing
// - put FP outputs in GPRs
// - convert instruction sequences
//
bool
GM107LoweringPass::visit(Instruction *i)
{
   bld.setPosition(i, false);

   if (i->cc != CC_ALWAYS)
      checkPredicate(i);

   switch (i->op) {
   case OP_PFETCH:
      return handlePFETCH(i);
   case OP_DFDX:
   case OP_DFDY:
      return handleDFDX(i);
   case OP_POPCNT:
      return handlePOPCNT(i);
   case OP_SUQ:
      return handleSUQ(i->asTex());
   default:
      return NVC0LoweringPass::visit(i);
   }
}

} // namespace nv50_ir