1bf215546Sopenharmony_ci/* 2bf215546Sopenharmony_ci * Copyright 2011 Christoph Bumiller 3bf215546Sopenharmony_ci * 4bf215546Sopenharmony_ci * Permission is hereby granted, free of charge, to any person obtaining a 5bf215546Sopenharmony_ci * copy of this software and associated documentation files (the "Software"), 6bf215546Sopenharmony_ci * to deal in the Software without restriction, including without limitation 7bf215546Sopenharmony_ci * the rights to use, copy, modify, merge, publish, distribute, sublicense, 8bf215546Sopenharmony_ci * and/or sell copies of the Software, and to permit persons to whom the 9bf215546Sopenharmony_ci * Software is furnished to do so, subject to the following conditions: 10bf215546Sopenharmony_ci * 11bf215546Sopenharmony_ci * The above copyright notice and this permission notice shall be included in 12bf215546Sopenharmony_ci * all copies or substantial portions of the Software. 13bf215546Sopenharmony_ci * 14bf215546Sopenharmony_ci * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 15bf215546Sopenharmony_ci * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 16bf215546Sopenharmony_ci * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 17bf215546Sopenharmony_ci * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR 18bf215546Sopenharmony_ci * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, 19bf215546Sopenharmony_ci * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR 20bf215546Sopenharmony_ci * OTHER DEALINGS IN THE SOFTWARE. 21bf215546Sopenharmony_ci */ 22bf215546Sopenharmony_ci 23bf215546Sopenharmony_ci#include "nv50_ir.h" 24bf215546Sopenharmony_ci#include "nv50_ir_build_util.h" 25bf215546Sopenharmony_ci 26bf215546Sopenharmony_ci#include "nv50_ir_target_nvc0.h" 27bf215546Sopenharmony_ci#include "nv50_ir_lowering_nvc0.h" 28bf215546Sopenharmony_ci 29bf215546Sopenharmony_ci#include <limits> 30bf215546Sopenharmony_ci 31bf215546Sopenharmony_cinamespace nv50_ir { 32bf215546Sopenharmony_ci 33bf215546Sopenharmony_ci#define QOP_ADD 0 34bf215546Sopenharmony_ci#define QOP_SUBR 1 35bf215546Sopenharmony_ci#define QOP_SUB 2 36bf215546Sopenharmony_ci#define QOP_MOV2 3 37bf215546Sopenharmony_ci 38bf215546Sopenharmony_ci// UL UR LL LR 39bf215546Sopenharmony_ci#define QUADOP(q, r, s, t) \ 40bf215546Sopenharmony_ci ((QOP_##q << 6) | (QOP_##r << 4) | \ 41bf215546Sopenharmony_ci (QOP_##s << 2) | (QOP_##t << 0)) 42bf215546Sopenharmony_ci 43bf215546Sopenharmony_civoid 44bf215546Sopenharmony_ciNVC0LegalizeSSA::handleDIV(Instruction *i) 45bf215546Sopenharmony_ci{ 46bf215546Sopenharmony_ci FlowInstruction *call; 47bf215546Sopenharmony_ci int builtin; 48bf215546Sopenharmony_ci 49bf215546Sopenharmony_ci bld.setPosition(i, false); 50bf215546Sopenharmony_ci 51bf215546Sopenharmony_ci // Generate movs to the input regs for the call we want to generate 52bf215546Sopenharmony_ci for (int s = 0; i->srcExists(s); ++s) { 53bf215546Sopenharmony_ci Instruction *ld = i->getSrc(s)->getInsn(); 54bf215546Sopenharmony_ci // check if we are moving an immediate, propagate it in that case 55bf215546Sopenharmony_ci if (!ld || ld->fixed || (ld->op != OP_LOAD && ld->op != OP_MOV) || 56bf215546Sopenharmony_ci !(ld->src(0).getFile() == FILE_IMMEDIATE)) 57bf215546Sopenharmony_ci bld.mkMovToReg(s, i->getSrc(s)); 58bf215546Sopenharmony_ci else { 59bf215546Sopenharmony_ci assert(ld->getSrc(0) != NULL); 60bf215546Sopenharmony_ci bld.mkMovToReg(s, ld->getSrc(0)); 61bf215546Sopenharmony_ci // Clear the src, to make code elimination possible here before we 62bf215546Sopenharmony_ci // delete the instruction i later 63bf215546Sopenharmony_ci i->setSrc(s, NULL); 64bf215546Sopenharmony_ci if (ld->isDead()) 65bf215546Sopenharmony_ci delete_Instruction(prog, ld); 66bf215546Sopenharmony_ci } 67bf215546Sopenharmony_ci } 68bf215546Sopenharmony_ci 69bf215546Sopenharmony_ci switch (i->dType) { 70bf215546Sopenharmony_ci case TYPE_U32: builtin = NVC0_BUILTIN_DIV_U32; break; 71bf215546Sopenharmony_ci case TYPE_S32: builtin = NVC0_BUILTIN_DIV_S32; break; 72bf215546Sopenharmony_ci default: 73bf215546Sopenharmony_ci return; 74bf215546Sopenharmony_ci } 75bf215546Sopenharmony_ci call = bld.mkFlow(OP_CALL, NULL, CC_ALWAYS, NULL); 76bf215546Sopenharmony_ci bld.mkMovFromReg(i->getDef(0), i->op == OP_DIV ? 0 : 1); 77bf215546Sopenharmony_ci bld.mkClobber(FILE_GPR, (i->op == OP_DIV) ? 0xe : 0xd, 2); 78bf215546Sopenharmony_ci bld.mkClobber(FILE_PREDICATE, (i->dType == TYPE_S32) ? 0xf : 0x3, 0); 79bf215546Sopenharmony_ci 80bf215546Sopenharmony_ci call->fixed = 1; 81bf215546Sopenharmony_ci call->absolute = call->builtin = 1; 82bf215546Sopenharmony_ci call->target.builtin = builtin; 83bf215546Sopenharmony_ci delete_Instruction(prog, i); 84bf215546Sopenharmony_ci} 85bf215546Sopenharmony_ci 86bf215546Sopenharmony_civoid 87bf215546Sopenharmony_ciNVC0LegalizeSSA::handleRCPRSQLib(Instruction *i, Value *src[]) 88bf215546Sopenharmony_ci{ 89bf215546Sopenharmony_ci FlowInstruction *call; 90bf215546Sopenharmony_ci Value *def[2]; 91bf215546Sopenharmony_ci int builtin; 92bf215546Sopenharmony_ci 93bf215546Sopenharmony_ci def[0] = bld.mkMovToReg(0, src[0])->getDef(0); 94bf215546Sopenharmony_ci def[1] = bld.mkMovToReg(1, src[1])->getDef(0); 95bf215546Sopenharmony_ci 96bf215546Sopenharmony_ci if (i->op == OP_RCP) 97bf215546Sopenharmony_ci builtin = NVC0_BUILTIN_RCP_F64; 98bf215546Sopenharmony_ci else 99bf215546Sopenharmony_ci builtin = NVC0_BUILTIN_RSQ_F64; 100bf215546Sopenharmony_ci 101bf215546Sopenharmony_ci call = bld.mkFlow(OP_CALL, NULL, CC_ALWAYS, NULL); 102bf215546Sopenharmony_ci def[0] = bld.getSSA(); 103bf215546Sopenharmony_ci def[1] = bld.getSSA(); 104bf215546Sopenharmony_ci bld.mkMovFromReg(def[0], 0); 105bf215546Sopenharmony_ci bld.mkMovFromReg(def[1], 1); 106bf215546Sopenharmony_ci bld.mkClobber(FILE_GPR, 0x3fc, 2); 107bf215546Sopenharmony_ci bld.mkClobber(FILE_PREDICATE, i->op == OP_RSQ ? 0x3 : 0x1, 0); 108bf215546Sopenharmony_ci bld.mkOp2(OP_MERGE, TYPE_U64, i->getDef(0), def[0], def[1]); 109bf215546Sopenharmony_ci 110bf215546Sopenharmony_ci call->fixed = 1; 111bf215546Sopenharmony_ci call->absolute = call->builtin = 1; 112bf215546Sopenharmony_ci call->target.builtin = builtin; 113bf215546Sopenharmony_ci delete_Instruction(prog, i); 114bf215546Sopenharmony_ci 115bf215546Sopenharmony_ci prog->fp64 = true; 116bf215546Sopenharmony_ci} 117bf215546Sopenharmony_ci 118bf215546Sopenharmony_civoid 119bf215546Sopenharmony_ciNVC0LegalizeSSA::handleRCPRSQ(Instruction *i) 120bf215546Sopenharmony_ci{ 121bf215546Sopenharmony_ci assert(i->dType == TYPE_F64); 122bf215546Sopenharmony_ci // There are instructions that will compute the high 32 bits of the 64-bit 123bf215546Sopenharmony_ci // float. We will just stick 0 in the bottom 32 bits. 124bf215546Sopenharmony_ci 125bf215546Sopenharmony_ci bld.setPosition(i, false); 126bf215546Sopenharmony_ci 127bf215546Sopenharmony_ci // 1. Take the source and it up. 128bf215546Sopenharmony_ci Value *src[2], *dst[2], *def = i->getDef(0); 129bf215546Sopenharmony_ci bld.mkSplit(src, 4, i->getSrc(0)); 130bf215546Sopenharmony_ci 131bf215546Sopenharmony_ci int chip = prog->getTarget()->getChipset(); 132bf215546Sopenharmony_ci if (chip >= NVISA_GK104_CHIPSET) { 133bf215546Sopenharmony_ci handleRCPRSQLib(i, src); 134bf215546Sopenharmony_ci return; 135bf215546Sopenharmony_ci } 136bf215546Sopenharmony_ci 137bf215546Sopenharmony_ci // 2. We don't care about the low 32 bits of the destination. Stick a 0 in. 138bf215546Sopenharmony_ci dst[0] = bld.loadImm(NULL, 0); 139bf215546Sopenharmony_ci dst[1] = bld.getSSA(); 140bf215546Sopenharmony_ci 141bf215546Sopenharmony_ci // 3. The new version of the instruction takes the high 32 bits of the 142bf215546Sopenharmony_ci // source and outputs the high 32 bits of the destination. 143bf215546Sopenharmony_ci i->setSrc(0, src[1]); 144bf215546Sopenharmony_ci i->setDef(0, dst[1]); 145bf215546Sopenharmony_ci i->setType(TYPE_F32); 146bf215546Sopenharmony_ci i->subOp = NV50_IR_SUBOP_RCPRSQ_64H; 147bf215546Sopenharmony_ci 148bf215546Sopenharmony_ci // 4. Recombine the two dst pieces back into the original destination. 149bf215546Sopenharmony_ci bld.setPosition(i, true); 150bf215546Sopenharmony_ci bld.mkOp2(OP_MERGE, TYPE_U64, def, dst[0], dst[1]); 151bf215546Sopenharmony_ci} 152bf215546Sopenharmony_ci 153bf215546Sopenharmony_civoid 154bf215546Sopenharmony_ciNVC0LegalizeSSA::handleFTZ(Instruction *i) 155bf215546Sopenharmony_ci{ 156bf215546Sopenharmony_ci // Only want to flush float inputs 157bf215546Sopenharmony_ci assert(i->sType == TYPE_F32); 158bf215546Sopenharmony_ci 159bf215546Sopenharmony_ci // If we're already flushing denorms (and NaN's) to zero, no need for this. 160bf215546Sopenharmony_ci if (i->dnz) 161bf215546Sopenharmony_ci return; 162bf215546Sopenharmony_ci 163bf215546Sopenharmony_ci // Only certain classes of operations can flush 164bf215546Sopenharmony_ci OpClass cls = prog->getTarget()->getOpClass(i->op); 165bf215546Sopenharmony_ci if (cls != OPCLASS_ARITH && cls != OPCLASS_COMPARE && 166bf215546Sopenharmony_ci cls != OPCLASS_CONVERT) 167bf215546Sopenharmony_ci return; 168bf215546Sopenharmony_ci 169bf215546Sopenharmony_ci i->ftz = true; 170bf215546Sopenharmony_ci} 171bf215546Sopenharmony_ci 172bf215546Sopenharmony_civoid 173bf215546Sopenharmony_ciNVC0LegalizeSSA::handleTEXLOD(TexInstruction *i) 174bf215546Sopenharmony_ci{ 175bf215546Sopenharmony_ci if (i->tex.levelZero) 176bf215546Sopenharmony_ci return; 177bf215546Sopenharmony_ci 178bf215546Sopenharmony_ci ImmediateValue lod; 179bf215546Sopenharmony_ci 180bf215546Sopenharmony_ci // The LOD argument comes right after the coordinates (before depth bias, 181bf215546Sopenharmony_ci // offsets, etc). 182bf215546Sopenharmony_ci int arg = i->tex.target.getArgCount(); 183bf215546Sopenharmony_ci 184bf215546Sopenharmony_ci // SM30+ stores the indirect handle as a separate arg, which comes before 185bf215546Sopenharmony_ci // the LOD. 186bf215546Sopenharmony_ci if (prog->getTarget()->getChipset() >= NVISA_GK104_CHIPSET && 187bf215546Sopenharmony_ci i->tex.rIndirectSrc >= 0) 188bf215546Sopenharmony_ci arg++; 189bf215546Sopenharmony_ci // SM20 stores indirect handle combined with array coordinate 190bf215546Sopenharmony_ci if (prog->getTarget()->getChipset() < NVISA_GK104_CHIPSET && 191bf215546Sopenharmony_ci !i->tex.target.isArray() && 192bf215546Sopenharmony_ci i->tex.rIndirectSrc >= 0) 193bf215546Sopenharmony_ci arg++; 194bf215546Sopenharmony_ci 195bf215546Sopenharmony_ci if (!i->src(arg).getImmediate(lod) || !lod.isInteger(0)) 196bf215546Sopenharmony_ci return; 197bf215546Sopenharmony_ci 198bf215546Sopenharmony_ci if (i->op == OP_TXL) 199bf215546Sopenharmony_ci i->op = OP_TEX; 200bf215546Sopenharmony_ci i->tex.levelZero = true; 201bf215546Sopenharmony_ci i->moveSources(arg + 1, -1); 202bf215546Sopenharmony_ci} 203bf215546Sopenharmony_ci 204bf215546Sopenharmony_civoid 205bf215546Sopenharmony_ciNVC0LegalizeSSA::handleShift(Instruction *lo) 206bf215546Sopenharmony_ci{ 207bf215546Sopenharmony_ci Value *shift = lo->getSrc(1); 208bf215546Sopenharmony_ci Value *dst64 = lo->getDef(0); 209bf215546Sopenharmony_ci Value *src[2], *dst[2]; 210bf215546Sopenharmony_ci operation op = lo->op; 211bf215546Sopenharmony_ci 212bf215546Sopenharmony_ci bld.setPosition(lo, false); 213bf215546Sopenharmony_ci 214bf215546Sopenharmony_ci bld.mkSplit(src, 4, lo->getSrc(0)); 215bf215546Sopenharmony_ci 216bf215546Sopenharmony_ci // SM30 and prior don't have the fancy new SHF.L/R ops. So the logic has to 217bf215546Sopenharmony_ci // be completely emulated. For SM35+, we can use the more directed SHF 218bf215546Sopenharmony_ci // operations. 219bf215546Sopenharmony_ci if (prog->getTarget()->getChipset() < NVISA_GK20A_CHIPSET) { 220bf215546Sopenharmony_ci // The strategy here is to handle shifts >= 32 and less than 32 as 221bf215546Sopenharmony_ci // separate parts. 222bf215546Sopenharmony_ci // 223bf215546Sopenharmony_ci // For SHL: 224bf215546Sopenharmony_ci // If the shift is <= 32, then 225bf215546Sopenharmony_ci // (HI,LO) << x = (HI << x | (LO >> (32 - x)), LO << x) 226bf215546Sopenharmony_ci // If the shift is > 32, then 227bf215546Sopenharmony_ci // (HI,LO) << x = (LO << (x - 32), 0) 228bf215546Sopenharmony_ci // 229bf215546Sopenharmony_ci // For SHR: 230bf215546Sopenharmony_ci // If the shift is <= 32, then 231bf215546Sopenharmony_ci // (HI,LO) >> x = (HI >> x, (HI << (32 - x)) | LO >> x) 232bf215546Sopenharmony_ci // If the shift is > 32, then 233bf215546Sopenharmony_ci // (HI,LO) >> x = (0, HI >> (x - 32)) 234bf215546Sopenharmony_ci // 235bf215546Sopenharmony_ci // Note that on NVIDIA hardware, a shift > 32 yields a 0 value, which we 236bf215546Sopenharmony_ci // can use to our advantage. Also note the structural similarities 237bf215546Sopenharmony_ci // between the right/left cases. The main difference is swapping hi/lo 238bf215546Sopenharmony_ci // on input and output. 239bf215546Sopenharmony_ci 240bf215546Sopenharmony_ci Value *x32_minus_shift, *pred, *hi1, *hi2; 241bf215546Sopenharmony_ci DataType type = isSignedIntType(lo->dType) ? TYPE_S32 : TYPE_U32; 242bf215546Sopenharmony_ci operation antiop = op == OP_SHR ? OP_SHL : OP_SHR; 243bf215546Sopenharmony_ci if (op == OP_SHR) 244bf215546Sopenharmony_ci std::swap(src[0], src[1]); 245bf215546Sopenharmony_ci bld.mkOp2(OP_ADD, TYPE_U32, (x32_minus_shift = bld.getSSA()), shift, bld.mkImm(0x20)) 246bf215546Sopenharmony_ci ->src(0).mod = Modifier(NV50_IR_MOD_NEG); 247bf215546Sopenharmony_ci bld.mkCmp(OP_SET, CC_LE, TYPE_U8, (pred = bld.getSSA(1, FILE_PREDICATE)), 248bf215546Sopenharmony_ci TYPE_U32, shift, bld.mkImm(32)); 249bf215546Sopenharmony_ci // Compute HI (shift <= 32) 250bf215546Sopenharmony_ci bld.mkOp2(OP_OR, TYPE_U32, (hi1 = bld.getSSA()), 251bf215546Sopenharmony_ci bld.mkOp2v(op, TYPE_U32, bld.getSSA(), src[1], shift), 252bf215546Sopenharmony_ci bld.mkOp2v(antiop, TYPE_U32, bld.getSSA(), src[0], x32_minus_shift)) 253bf215546Sopenharmony_ci ->setPredicate(CC_P, pred); 254bf215546Sopenharmony_ci // Compute LO (all shift values) 255bf215546Sopenharmony_ci bld.mkOp2(op, type, (dst[0] = bld.getSSA()), src[0], shift); 256bf215546Sopenharmony_ci // Compute HI (shift > 32) 257bf215546Sopenharmony_ci bld.mkOp2(op, type, (hi2 = bld.getSSA()), src[0], 258bf215546Sopenharmony_ci bld.mkOp1v(OP_NEG, TYPE_S32, bld.getSSA(), x32_minus_shift)) 259bf215546Sopenharmony_ci ->setPredicate(CC_NOT_P, pred); 260bf215546Sopenharmony_ci bld.mkOp2(OP_UNION, TYPE_U32, (dst[1] = bld.getSSA()), hi1, hi2); 261bf215546Sopenharmony_ci if (op == OP_SHR) 262bf215546Sopenharmony_ci std::swap(dst[0], dst[1]); 263bf215546Sopenharmony_ci bld.mkOp2(OP_MERGE, TYPE_U64, dst64, dst[0], dst[1]); 264bf215546Sopenharmony_ci delete_Instruction(prog, lo); 265bf215546Sopenharmony_ci return; 266bf215546Sopenharmony_ci } 267bf215546Sopenharmony_ci 268bf215546Sopenharmony_ci Instruction *hi = new_Instruction(func, op, TYPE_U32); 269bf215546Sopenharmony_ci lo->bb->insertAfter(lo, hi); 270bf215546Sopenharmony_ci 271bf215546Sopenharmony_ci hi->sType = lo->sType; 272bf215546Sopenharmony_ci lo->dType = TYPE_U32; 273bf215546Sopenharmony_ci 274bf215546Sopenharmony_ci hi->setDef(0, (dst[1] = bld.getSSA())); 275bf215546Sopenharmony_ci if (lo->op == OP_SHR) 276bf215546Sopenharmony_ci hi->subOp |= NV50_IR_SUBOP_SHIFT_HIGH; 277bf215546Sopenharmony_ci lo->setDef(0, (dst[0] = bld.getSSA())); 278bf215546Sopenharmony_ci 279bf215546Sopenharmony_ci bld.setPosition(hi, true); 280bf215546Sopenharmony_ci 281bf215546Sopenharmony_ci if (lo->op == OP_SHL) 282bf215546Sopenharmony_ci std::swap(hi, lo); 283bf215546Sopenharmony_ci 284bf215546Sopenharmony_ci hi->setSrc(0, new_ImmediateValue(prog, 0u)); 285bf215546Sopenharmony_ci hi->setSrc(1, shift); 286bf215546Sopenharmony_ci hi->setSrc(2, lo->op == OP_SHL ? src[0] : src[1]); 287bf215546Sopenharmony_ci 288bf215546Sopenharmony_ci lo->setSrc(0, src[0]); 289bf215546Sopenharmony_ci lo->setSrc(1, shift); 290bf215546Sopenharmony_ci lo->setSrc(2, src[1]); 291bf215546Sopenharmony_ci 292bf215546Sopenharmony_ci bld.mkOp2(OP_MERGE, TYPE_U64, dst64, dst[0], dst[1]); 293bf215546Sopenharmony_ci} 294bf215546Sopenharmony_ci 295bf215546Sopenharmony_civoid 296bf215546Sopenharmony_ciNVC0LegalizeSSA::handleSET(CmpInstruction *cmp) 297bf215546Sopenharmony_ci{ 298bf215546Sopenharmony_ci DataType hTy = cmp->sType == TYPE_S64 ? TYPE_S32 : TYPE_U32; 299bf215546Sopenharmony_ci Value *carry; 300bf215546Sopenharmony_ci Value *src0[2], *src1[2]; 301bf215546Sopenharmony_ci bld.setPosition(cmp, false); 302bf215546Sopenharmony_ci 303bf215546Sopenharmony_ci bld.mkSplit(src0, 4, cmp->getSrc(0)); 304bf215546Sopenharmony_ci bld.mkSplit(src1, 4, cmp->getSrc(1)); 305bf215546Sopenharmony_ci bld.mkOp2(OP_SUB, hTy, NULL, src0[0], src1[0]) 306bf215546Sopenharmony_ci ->setFlagsDef(0, (carry = bld.getSSA(1, FILE_FLAGS))); 307bf215546Sopenharmony_ci cmp->setFlagsSrc(cmp->srcCount(), carry); 308bf215546Sopenharmony_ci cmp->setSrc(0, src0[1]); 309bf215546Sopenharmony_ci cmp->setSrc(1, src1[1]); 310bf215546Sopenharmony_ci cmp->sType = hTy; 311bf215546Sopenharmony_ci} 312bf215546Sopenharmony_ci 313bf215546Sopenharmony_civoid 314bf215546Sopenharmony_ciNVC0LegalizeSSA::handleBREV(Instruction *i) 315bf215546Sopenharmony_ci{ 316bf215546Sopenharmony_ci i->op = OP_EXTBF; 317bf215546Sopenharmony_ci i->subOp = NV50_IR_SUBOP_EXTBF_REV; 318bf215546Sopenharmony_ci i->setSrc(1, bld.mkImm(0x2000)); 319bf215546Sopenharmony_ci} 320bf215546Sopenharmony_ci 321bf215546Sopenharmony_cibool 322bf215546Sopenharmony_ciNVC0LegalizeSSA::visit(Function *fn) 323bf215546Sopenharmony_ci{ 324bf215546Sopenharmony_ci bld.setProgram(fn->getProgram()); 325bf215546Sopenharmony_ci return true; 326bf215546Sopenharmony_ci} 327bf215546Sopenharmony_ci 328bf215546Sopenharmony_cibool 329bf215546Sopenharmony_ciNVC0LegalizeSSA::visit(BasicBlock *bb) 330bf215546Sopenharmony_ci{ 331bf215546Sopenharmony_ci Instruction *next; 332bf215546Sopenharmony_ci for (Instruction *i = bb->getEntry(); i; i = next) { 333bf215546Sopenharmony_ci next = i->next; 334bf215546Sopenharmony_ci 335bf215546Sopenharmony_ci if (i->sType == TYPE_F32 && prog->getType() != Program::TYPE_COMPUTE) 336bf215546Sopenharmony_ci handleFTZ(i); 337bf215546Sopenharmony_ci 338bf215546Sopenharmony_ci switch (i->op) { 339bf215546Sopenharmony_ci case OP_DIV: 340bf215546Sopenharmony_ci case OP_MOD: 341bf215546Sopenharmony_ci if (i->sType != TYPE_F32) 342bf215546Sopenharmony_ci handleDIV(i); 343bf215546Sopenharmony_ci break; 344bf215546Sopenharmony_ci case OP_RCP: 345bf215546Sopenharmony_ci case OP_RSQ: 346bf215546Sopenharmony_ci if (i->dType == TYPE_F64) 347bf215546Sopenharmony_ci handleRCPRSQ(i); 348bf215546Sopenharmony_ci break; 349bf215546Sopenharmony_ci case OP_TXL: 350bf215546Sopenharmony_ci case OP_TXF: 351bf215546Sopenharmony_ci handleTEXLOD(i->asTex()); 352bf215546Sopenharmony_ci break; 353bf215546Sopenharmony_ci case OP_SHR: 354bf215546Sopenharmony_ci case OP_SHL: 355bf215546Sopenharmony_ci if (typeSizeof(i->sType) == 8) 356bf215546Sopenharmony_ci handleShift(i); 357bf215546Sopenharmony_ci break; 358bf215546Sopenharmony_ci case OP_SET: 359bf215546Sopenharmony_ci case OP_SET_AND: 360bf215546Sopenharmony_ci case OP_SET_OR: 361bf215546Sopenharmony_ci case OP_SET_XOR: 362bf215546Sopenharmony_ci if (typeSizeof(i->sType) == 8 && i->sType != TYPE_F64) 363bf215546Sopenharmony_ci handleSET(i->asCmp()); 364bf215546Sopenharmony_ci break; 365bf215546Sopenharmony_ci case OP_BREV: 366bf215546Sopenharmony_ci handleBREV(i); 367bf215546Sopenharmony_ci break; 368bf215546Sopenharmony_ci default: 369bf215546Sopenharmony_ci break; 370bf215546Sopenharmony_ci } 371bf215546Sopenharmony_ci } 372bf215546Sopenharmony_ci return true; 373bf215546Sopenharmony_ci} 374bf215546Sopenharmony_ci 375bf215546Sopenharmony_ciNVC0LegalizePostRA::NVC0LegalizePostRA(const Program *prog) 376bf215546Sopenharmony_ci : rZero(NULL), 377bf215546Sopenharmony_ci carry(NULL), 378bf215546Sopenharmony_ci pOne(NULL), 379bf215546Sopenharmony_ci needTexBar(prog->getTarget()->getChipset() >= 0xe0 && 380bf215546Sopenharmony_ci prog->getTarget()->getChipset() < 0x110) 381bf215546Sopenharmony_ci{ 382bf215546Sopenharmony_ci} 383bf215546Sopenharmony_ci 384bf215546Sopenharmony_cibool 385bf215546Sopenharmony_ciNVC0LegalizePostRA::insnDominatedBy(const Instruction *later, 386bf215546Sopenharmony_ci const Instruction *early) const 387bf215546Sopenharmony_ci{ 388bf215546Sopenharmony_ci if (early->bb == later->bb) 389bf215546Sopenharmony_ci return early->serial < later->serial; 390bf215546Sopenharmony_ci return later->bb->dominatedBy(early->bb); 391bf215546Sopenharmony_ci} 392bf215546Sopenharmony_ci 393bf215546Sopenharmony_civoid 394bf215546Sopenharmony_ciNVC0LegalizePostRA::addTexUse(std::list<TexUse> &uses, 395bf215546Sopenharmony_ci Instruction *usei, const Instruction *texi) 396bf215546Sopenharmony_ci{ 397bf215546Sopenharmony_ci bool add = true; 398bf215546Sopenharmony_ci bool dominated = insnDominatedBy(usei, texi); 399bf215546Sopenharmony_ci // Uses before the tex have to all be included. Just because an earlier 400bf215546Sopenharmony_ci // instruction dominates another instruction doesn't mean that there's no 401bf215546Sopenharmony_ci // way to get from the tex to the later instruction. For example you could 402bf215546Sopenharmony_ci // have nested loops, with the tex in the inner loop, and uses before it in 403bf215546Sopenharmony_ci // both loops - even though the outer loop's instruction would dominate the 404bf215546Sopenharmony_ci // inner's, we still want a texbar before the inner loop's instruction. 405bf215546Sopenharmony_ci // 406bf215546Sopenharmony_ci // However we can still use the eliding logic between uses dominated by the 407bf215546Sopenharmony_ci // tex instruction, as that is unambiguously correct. 408bf215546Sopenharmony_ci if (dominated) { 409bf215546Sopenharmony_ci for (std::list<TexUse>::iterator it = uses.begin(); it != uses.end();) { 410bf215546Sopenharmony_ci if (it->after) { 411bf215546Sopenharmony_ci if (insnDominatedBy(usei, it->insn)) { 412bf215546Sopenharmony_ci add = false; 413bf215546Sopenharmony_ci break; 414bf215546Sopenharmony_ci } 415bf215546Sopenharmony_ci if (insnDominatedBy(it->insn, usei)) { 416bf215546Sopenharmony_ci it = uses.erase(it); 417bf215546Sopenharmony_ci continue; 418bf215546Sopenharmony_ci } 419bf215546Sopenharmony_ci } 420bf215546Sopenharmony_ci ++it; 421bf215546Sopenharmony_ci } 422bf215546Sopenharmony_ci } 423bf215546Sopenharmony_ci if (add) 424bf215546Sopenharmony_ci uses.push_back(TexUse(usei, texi, dominated)); 425bf215546Sopenharmony_ci} 426bf215546Sopenharmony_ci 427bf215546Sopenharmony_ci// While it might be tempting to use the an algorithm that just looks at tex 428bf215546Sopenharmony_ci// uses, not all texture results are guaranteed to be used on all paths. In 429bf215546Sopenharmony_ci// the case where along some control flow path a texture result is never used, 430bf215546Sopenharmony_ci// we might reuse that register for something else, creating a 431bf215546Sopenharmony_ci// write-after-write hazard. So we have to manually look through all 432bf215546Sopenharmony_ci// instructions looking for ones that reference the registers in question. 433bf215546Sopenharmony_civoid 434bf215546Sopenharmony_ciNVC0LegalizePostRA::findFirstUses( 435bf215546Sopenharmony_ci Instruction *texi, std::list<TexUse> &uses) 436bf215546Sopenharmony_ci{ 437bf215546Sopenharmony_ci int minGPR = texi->def(0).rep()->reg.data.id; 438bf215546Sopenharmony_ci int maxGPR = minGPR + texi->def(0).rep()->reg.size / 4 - 1; 439bf215546Sopenharmony_ci 440bf215546Sopenharmony_ci std::unordered_set<const BasicBlock *> visited; 441bf215546Sopenharmony_ci findFirstUsesBB(minGPR, maxGPR, texi->next, texi, uses, visited); 442bf215546Sopenharmony_ci} 443bf215546Sopenharmony_ci 444bf215546Sopenharmony_civoid 445bf215546Sopenharmony_ciNVC0LegalizePostRA::findFirstUsesBB( 446bf215546Sopenharmony_ci int minGPR, int maxGPR, Instruction *start, 447bf215546Sopenharmony_ci const Instruction *texi, std::list<TexUse> &uses, 448bf215546Sopenharmony_ci std::unordered_set<const BasicBlock *> &visited) 449bf215546Sopenharmony_ci{ 450bf215546Sopenharmony_ci const BasicBlock *bb = start->bb; 451bf215546Sopenharmony_ci 452bf215546Sopenharmony_ci // We don't process the whole bb the first time around. This is correct, 453bf215546Sopenharmony_ci // however we might be in a loop and hit this BB again, and need to process 454bf215546Sopenharmony_ci // the full thing. So only mark a bb as visited if we processed it from the 455bf215546Sopenharmony_ci // beginning. 456bf215546Sopenharmony_ci if (start == bb->getEntry()) { 457bf215546Sopenharmony_ci if (visited.find(bb) != visited.end()) 458bf215546Sopenharmony_ci return; 459bf215546Sopenharmony_ci visited.insert(bb); 460bf215546Sopenharmony_ci } 461bf215546Sopenharmony_ci 462bf215546Sopenharmony_ci for (Instruction *insn = start; insn != bb->getExit(); insn = insn->next) { 463bf215546Sopenharmony_ci if (insn->isNop()) 464bf215546Sopenharmony_ci continue; 465bf215546Sopenharmony_ci 466bf215546Sopenharmony_ci for (int d = 0; insn->defExists(d); ++d) { 467bf215546Sopenharmony_ci const Value *def = insn->def(d).rep(); 468bf215546Sopenharmony_ci if (insn->def(d).getFile() != FILE_GPR || 469bf215546Sopenharmony_ci def->reg.data.id + def->reg.size / 4 - 1 < minGPR || 470bf215546Sopenharmony_ci def->reg.data.id > maxGPR) 471bf215546Sopenharmony_ci continue; 472bf215546Sopenharmony_ci addTexUse(uses, insn, texi); 473bf215546Sopenharmony_ci return; 474bf215546Sopenharmony_ci } 475bf215546Sopenharmony_ci 476bf215546Sopenharmony_ci for (int s = 0; insn->srcExists(s); ++s) { 477bf215546Sopenharmony_ci const Value *src = insn->src(s).rep(); 478bf215546Sopenharmony_ci if (insn->src(s).getFile() != FILE_GPR || 479bf215546Sopenharmony_ci src->reg.data.id + src->reg.size / 4 - 1 < minGPR || 480bf215546Sopenharmony_ci src->reg.data.id > maxGPR) 481bf215546Sopenharmony_ci continue; 482bf215546Sopenharmony_ci addTexUse(uses, insn, texi); 483bf215546Sopenharmony_ci return; 484bf215546Sopenharmony_ci } 485bf215546Sopenharmony_ci } 486bf215546Sopenharmony_ci 487bf215546Sopenharmony_ci for (Graph::EdgeIterator ei = bb->cfg.outgoing(); !ei.end(); ei.next()) { 488bf215546Sopenharmony_ci findFirstUsesBB(minGPR, maxGPR, BasicBlock::get(ei.getNode())->getEntry(), 489bf215546Sopenharmony_ci texi, uses, visited); 490bf215546Sopenharmony_ci } 491bf215546Sopenharmony_ci} 492bf215546Sopenharmony_ci 493bf215546Sopenharmony_ci// Texture barriers: 494bf215546Sopenharmony_ci// This pass is a bit long and ugly and can probably be optimized. 495bf215546Sopenharmony_ci// 496bf215546Sopenharmony_ci// 1. obtain a list of TEXes and their outputs' first use(s) 497bf215546Sopenharmony_ci// 2. calculate the barrier level of each first use (minimal number of TEXes, 498bf215546Sopenharmony_ci// over all paths, between the TEX and the use in question) 499bf215546Sopenharmony_ci// 3. for each barrier, if all paths from the source TEX to that barrier 500bf215546Sopenharmony_ci// contain a barrier of lesser level, it can be culled 501bf215546Sopenharmony_cibool 502bf215546Sopenharmony_ciNVC0LegalizePostRA::insertTextureBarriers(Function *fn) 503bf215546Sopenharmony_ci{ 504bf215546Sopenharmony_ci std::list<TexUse> *uses; 505bf215546Sopenharmony_ci std::vector<Instruction *> texes; 506bf215546Sopenharmony_ci std::vector<int> bbFirstTex; 507bf215546Sopenharmony_ci std::vector<int> bbFirstUse; 508bf215546Sopenharmony_ci std::vector<int> texCounts; 509bf215546Sopenharmony_ci std::vector<TexUse> useVec; 510bf215546Sopenharmony_ci ArrayList insns; 511bf215546Sopenharmony_ci 512bf215546Sopenharmony_ci fn->orderInstructions(insns); 513bf215546Sopenharmony_ci 514bf215546Sopenharmony_ci texCounts.resize(fn->allBBlocks.getSize(), 0); 515bf215546Sopenharmony_ci bbFirstTex.resize(fn->allBBlocks.getSize(), insns.getSize()); 516bf215546Sopenharmony_ci bbFirstUse.resize(fn->allBBlocks.getSize(), insns.getSize()); 517bf215546Sopenharmony_ci 518bf215546Sopenharmony_ci // tag BB CFG nodes by their id for later 519bf215546Sopenharmony_ci for (ArrayList::Iterator i = fn->allBBlocks.iterator(); !i.end(); i.next()) { 520bf215546Sopenharmony_ci BasicBlock *bb = reinterpret_cast<BasicBlock *>(i.get()); 521bf215546Sopenharmony_ci if (bb) 522bf215546Sopenharmony_ci bb->cfg.tag = bb->getId(); 523bf215546Sopenharmony_ci } 524bf215546Sopenharmony_ci 525bf215546Sopenharmony_ci // gather the first uses for each TEX 526bf215546Sopenharmony_ci for (int i = 0; i < insns.getSize(); ++i) { 527bf215546Sopenharmony_ci Instruction *tex = reinterpret_cast<Instruction *>(insns.get(i)); 528bf215546Sopenharmony_ci if (isTextureOp(tex->op)) { 529bf215546Sopenharmony_ci texes.push_back(tex); 530bf215546Sopenharmony_ci if (!texCounts.at(tex->bb->getId())) 531bf215546Sopenharmony_ci bbFirstTex[tex->bb->getId()] = texes.size() - 1; 532bf215546Sopenharmony_ci texCounts[tex->bb->getId()]++; 533bf215546Sopenharmony_ci } 534bf215546Sopenharmony_ci } 535bf215546Sopenharmony_ci insns.clear(); 536bf215546Sopenharmony_ci if (texes.empty()) 537bf215546Sopenharmony_ci return false; 538bf215546Sopenharmony_ci uses = new std::list<TexUse>[texes.size()]; 539bf215546Sopenharmony_ci if (!uses) 540bf215546Sopenharmony_ci return false; 541bf215546Sopenharmony_ci for (size_t i = 0; i < texes.size(); ++i) { 542bf215546Sopenharmony_ci findFirstUses(texes[i], uses[i]); 543bf215546Sopenharmony_ci } 544bf215546Sopenharmony_ci 545bf215546Sopenharmony_ci // determine the barrier level at each use 546bf215546Sopenharmony_ci for (size_t i = 0; i < texes.size(); ++i) { 547bf215546Sopenharmony_ci for (std::list<TexUse>::iterator u = uses[i].begin(); u != uses[i].end(); 548bf215546Sopenharmony_ci ++u) { 549bf215546Sopenharmony_ci BasicBlock *tb = texes[i]->bb; 550bf215546Sopenharmony_ci BasicBlock *ub = u->insn->bb; 551bf215546Sopenharmony_ci if (tb == ub) { 552bf215546Sopenharmony_ci u->level = 0; 553bf215546Sopenharmony_ci for (size_t j = i + 1; j < texes.size() && 554bf215546Sopenharmony_ci texes[j]->bb == tb && texes[j]->serial < u->insn->serial; 555bf215546Sopenharmony_ci ++j) 556bf215546Sopenharmony_ci u->level++; 557bf215546Sopenharmony_ci } else { 558bf215546Sopenharmony_ci u->level = fn->cfg.findLightestPathWeight(&tb->cfg, 559bf215546Sopenharmony_ci &ub->cfg, texCounts); 560bf215546Sopenharmony_ci if (u->level < 0) { 561bf215546Sopenharmony_ci WARN("Failed to find path TEX -> TEXBAR\n"); 562bf215546Sopenharmony_ci u->level = 0; 563bf215546Sopenharmony_ci continue; 564bf215546Sopenharmony_ci } 565bf215546Sopenharmony_ci // this counted all TEXes in the origin block, correct that 566bf215546Sopenharmony_ci u->level -= i - bbFirstTex.at(tb->getId()) + 1 /* this TEX */; 567bf215546Sopenharmony_ci // and did not count the TEXes in the destination block, add those 568bf215546Sopenharmony_ci for (size_t j = bbFirstTex.at(ub->getId()); j < texes.size() && 569bf215546Sopenharmony_ci texes[j]->bb == ub && texes[j]->serial < u->insn->serial; 570bf215546Sopenharmony_ci ++j) 571bf215546Sopenharmony_ci u->level++; 572bf215546Sopenharmony_ci } 573bf215546Sopenharmony_ci assert(u->level >= 0); 574bf215546Sopenharmony_ci useVec.push_back(*u); 575bf215546Sopenharmony_ci } 576bf215546Sopenharmony_ci } 577bf215546Sopenharmony_ci delete[] uses; 578bf215546Sopenharmony_ci 579bf215546Sopenharmony_ci // insert the barriers 580bf215546Sopenharmony_ci for (size_t i = 0; i < useVec.size(); ++i) { 581bf215546Sopenharmony_ci Instruction *prev = useVec[i].insn->prev; 582bf215546Sopenharmony_ci if (useVec[i].level < 0) 583bf215546Sopenharmony_ci continue; 584bf215546Sopenharmony_ci if (prev && prev->op == OP_TEXBAR) { 585bf215546Sopenharmony_ci if (prev->subOp > useVec[i].level) 586bf215546Sopenharmony_ci prev->subOp = useVec[i].level; 587bf215546Sopenharmony_ci prev->setSrc(prev->srcCount(), useVec[i].tex->getDef(0)); 588bf215546Sopenharmony_ci } else { 589bf215546Sopenharmony_ci Instruction *bar = new_Instruction(func, OP_TEXBAR, TYPE_NONE); 590bf215546Sopenharmony_ci bar->fixed = 1; 591bf215546Sopenharmony_ci bar->subOp = useVec[i].level; 592bf215546Sopenharmony_ci // make use explicit to ease latency calculation 593bf215546Sopenharmony_ci bar->setSrc(bar->srcCount(), useVec[i].tex->getDef(0)); 594bf215546Sopenharmony_ci useVec[i].insn->bb->insertBefore(useVec[i].insn, bar); 595bf215546Sopenharmony_ci } 596bf215546Sopenharmony_ci } 597bf215546Sopenharmony_ci 598bf215546Sopenharmony_ci if (fn->getProgram()->optLevel < 3) 599bf215546Sopenharmony_ci return true; 600bf215546Sopenharmony_ci 601bf215546Sopenharmony_ci std::vector<Limits> limitT, limitB, limitS; // entry, exit, single 602bf215546Sopenharmony_ci 603bf215546Sopenharmony_ci limitT.resize(fn->allBBlocks.getSize(), Limits(0, 0)); 604bf215546Sopenharmony_ci limitB.resize(fn->allBBlocks.getSize(), Limits(0, 0)); 605bf215546Sopenharmony_ci limitS.resize(fn->allBBlocks.getSize()); 606bf215546Sopenharmony_ci 607bf215546Sopenharmony_ci // cull unneeded barriers (should do that earlier, but for simplicity) 608bf215546Sopenharmony_ci IteratorRef bi = fn->cfg.iteratorCFG(); 609bf215546Sopenharmony_ci // first calculate min/max outstanding TEXes for each BB 610bf215546Sopenharmony_ci for (bi->reset(); !bi->end(); bi->next()) { 611bf215546Sopenharmony_ci Graph::Node *n = reinterpret_cast<Graph::Node *>(bi->get()); 612bf215546Sopenharmony_ci BasicBlock *bb = BasicBlock::get(n); 613bf215546Sopenharmony_ci int min = 0; 614bf215546Sopenharmony_ci int max = std::numeric_limits<int>::max(); 615bf215546Sopenharmony_ci for (Instruction *i = bb->getFirst(); i; i = i->next) { 616bf215546Sopenharmony_ci if (isTextureOp(i->op)) { 617bf215546Sopenharmony_ci min++; 618bf215546Sopenharmony_ci if (max < std::numeric_limits<int>::max()) 619bf215546Sopenharmony_ci max++; 620bf215546Sopenharmony_ci } else 621bf215546Sopenharmony_ci if (i->op == OP_TEXBAR) { 622bf215546Sopenharmony_ci min = MIN2(min, i->subOp); 623bf215546Sopenharmony_ci max = MIN2(max, i->subOp); 624bf215546Sopenharmony_ci } 625bf215546Sopenharmony_ci } 626bf215546Sopenharmony_ci // limits when looking at an isolated block 627bf215546Sopenharmony_ci limitS[bb->getId()].min = min; 628bf215546Sopenharmony_ci limitS[bb->getId()].max = max; 629bf215546Sopenharmony_ci } 630bf215546Sopenharmony_ci // propagate the min/max values 631bf215546Sopenharmony_ci for (unsigned int l = 0; l <= fn->loopNestingBound; ++l) { 632bf215546Sopenharmony_ci for (bi->reset(); !bi->end(); bi->next()) { 633bf215546Sopenharmony_ci Graph::Node *n = reinterpret_cast<Graph::Node *>(bi->get()); 634bf215546Sopenharmony_ci BasicBlock *bb = BasicBlock::get(n); 635bf215546Sopenharmony_ci const int bbId = bb->getId(); 636bf215546Sopenharmony_ci for (Graph::EdgeIterator ei = n->incident(); !ei.end(); ei.next()) { 637bf215546Sopenharmony_ci BasicBlock *in = BasicBlock::get(ei.getNode()); 638bf215546Sopenharmony_ci const int inId = in->getId(); 639bf215546Sopenharmony_ci limitT[bbId].min = MAX2(limitT[bbId].min, limitB[inId].min); 640bf215546Sopenharmony_ci limitT[bbId].max = MAX2(limitT[bbId].max, limitB[inId].max); 641bf215546Sopenharmony_ci } 642bf215546Sopenharmony_ci // I just hope this is correct ... 643bf215546Sopenharmony_ci if (limitS[bbId].max == std::numeric_limits<int>::max()) { 644bf215546Sopenharmony_ci // no barrier 645bf215546Sopenharmony_ci limitB[bbId].min = limitT[bbId].min + limitS[bbId].min; 646bf215546Sopenharmony_ci limitB[bbId].max = limitT[bbId].max + limitS[bbId].min; 647bf215546Sopenharmony_ci } else { 648bf215546Sopenharmony_ci // block contained a barrier 649bf215546Sopenharmony_ci limitB[bbId].min = MIN2(limitS[bbId].max, 650bf215546Sopenharmony_ci limitT[bbId].min + limitS[bbId].min); 651bf215546Sopenharmony_ci limitB[bbId].max = MIN2(limitS[bbId].max, 652bf215546Sopenharmony_ci limitT[bbId].max + limitS[bbId].min); 653bf215546Sopenharmony_ci } 654bf215546Sopenharmony_ci } 655bf215546Sopenharmony_ci } 656bf215546Sopenharmony_ci // finally delete unnecessary barriers 657bf215546Sopenharmony_ci for (bi->reset(); !bi->end(); bi->next()) { 658bf215546Sopenharmony_ci Graph::Node *n = reinterpret_cast<Graph::Node *>(bi->get()); 659bf215546Sopenharmony_ci BasicBlock *bb = BasicBlock::get(n); 660bf215546Sopenharmony_ci Instruction *prev = NULL; 661bf215546Sopenharmony_ci Instruction *next; 662bf215546Sopenharmony_ci int max = limitT[bb->getId()].max; 663bf215546Sopenharmony_ci for (Instruction *i = bb->getFirst(); i; i = next) { 664bf215546Sopenharmony_ci next = i->next; 665bf215546Sopenharmony_ci if (i->op == OP_TEXBAR) { 666bf215546Sopenharmony_ci if (i->subOp >= max) { 667bf215546Sopenharmony_ci delete_Instruction(prog, i); 668bf215546Sopenharmony_ci i = NULL; 669bf215546Sopenharmony_ci } else { 670bf215546Sopenharmony_ci max = i->subOp; 671bf215546Sopenharmony_ci if (prev && prev->op == OP_TEXBAR && prev->subOp >= max) { 672bf215546Sopenharmony_ci delete_Instruction(prog, prev); 673bf215546Sopenharmony_ci prev = NULL; 674bf215546Sopenharmony_ci } 675bf215546Sopenharmony_ci } 676bf215546Sopenharmony_ci } else 677bf215546Sopenharmony_ci if (isTextureOp(i->op)) { 678bf215546Sopenharmony_ci max++; 679bf215546Sopenharmony_ci } 680bf215546Sopenharmony_ci if (i && !i->isNop()) 681bf215546Sopenharmony_ci prev = i; 682bf215546Sopenharmony_ci } 683bf215546Sopenharmony_ci } 684bf215546Sopenharmony_ci return true; 685bf215546Sopenharmony_ci} 686bf215546Sopenharmony_ci 687bf215546Sopenharmony_cibool 688bf215546Sopenharmony_ciNVC0LegalizePostRA::visit(Function *fn) 689bf215546Sopenharmony_ci{ 690bf215546Sopenharmony_ci if (needTexBar) 691bf215546Sopenharmony_ci insertTextureBarriers(fn); 692bf215546Sopenharmony_ci 693bf215546Sopenharmony_ci rZero = new_LValue(fn, FILE_GPR); 694bf215546Sopenharmony_ci pOne = new_LValue(fn, FILE_PREDICATE); 695bf215546Sopenharmony_ci carry = new_LValue(fn, FILE_FLAGS); 696bf215546Sopenharmony_ci 697bf215546Sopenharmony_ci rZero->reg.data.id = (prog->getTarget()->getChipset() >= NVISA_GK20A_CHIPSET) ? 255 : 63; 698bf215546Sopenharmony_ci carry->reg.data.id = 0; 699bf215546Sopenharmony_ci pOne->reg.data.id = 7; 700bf215546Sopenharmony_ci 701bf215546Sopenharmony_ci return true; 702bf215546Sopenharmony_ci} 703bf215546Sopenharmony_ci 704bf215546Sopenharmony_civoid 705bf215546Sopenharmony_ciNVC0LegalizePostRA::replaceZero(Instruction *i) 706bf215546Sopenharmony_ci{ 707bf215546Sopenharmony_ci for (int s = 0; i->srcExists(s); ++s) { 708bf215546Sopenharmony_ci if (s == 2 && i->op == OP_SUCLAMP) 709bf215546Sopenharmony_ci continue; 710bf215546Sopenharmony_ci if (s == 1 && i->op == OP_SHLADD) 711bf215546Sopenharmony_ci continue; 712bf215546Sopenharmony_ci ImmediateValue *imm = i->getSrc(s)->asImm(); 713bf215546Sopenharmony_ci if (imm) { 714bf215546Sopenharmony_ci if (i->op == OP_SELP && s == 2) { 715bf215546Sopenharmony_ci i->setSrc(s, pOne); 716bf215546Sopenharmony_ci if (imm->reg.data.u64 == 0) 717bf215546Sopenharmony_ci i->src(s).mod = i->src(s).mod ^ Modifier(NV50_IR_MOD_NOT); 718bf215546Sopenharmony_ci } else if (imm->reg.data.u64 == 0) { 719bf215546Sopenharmony_ci i->setSrc(s, rZero); 720bf215546Sopenharmony_ci } 721bf215546Sopenharmony_ci } 722bf215546Sopenharmony_ci } 723bf215546Sopenharmony_ci} 724bf215546Sopenharmony_ci 725bf215546Sopenharmony_ci// replace CONT with BRA for single unconditional continue 726bf215546Sopenharmony_cibool 727bf215546Sopenharmony_ciNVC0LegalizePostRA::tryReplaceContWithBra(BasicBlock *bb) 728bf215546Sopenharmony_ci{ 729bf215546Sopenharmony_ci if (bb->cfg.incidentCount() != 2 || bb->getEntry()->op != OP_PRECONT) 730bf215546Sopenharmony_ci return false; 731bf215546Sopenharmony_ci Graph::EdgeIterator ei = bb->cfg.incident(); 732bf215546Sopenharmony_ci if (ei.getType() != Graph::Edge::BACK) 733bf215546Sopenharmony_ci ei.next(); 734bf215546Sopenharmony_ci if (ei.getType() != Graph::Edge::BACK) 735bf215546Sopenharmony_ci return false; 736bf215546Sopenharmony_ci BasicBlock *contBB = BasicBlock::get(ei.getNode()); 737bf215546Sopenharmony_ci 738bf215546Sopenharmony_ci if (!contBB->getExit() || contBB->getExit()->op != OP_CONT || 739bf215546Sopenharmony_ci contBB->getExit()->getPredicate()) 740bf215546Sopenharmony_ci return false; 741bf215546Sopenharmony_ci contBB->getExit()->op = OP_BRA; 742bf215546Sopenharmony_ci bb->remove(bb->getEntry()); // delete PRECONT 743bf215546Sopenharmony_ci 744bf215546Sopenharmony_ci ei.next(); 745bf215546Sopenharmony_ci assert(ei.end() || ei.getType() != Graph::Edge::BACK); 746bf215546Sopenharmony_ci return true; 747bf215546Sopenharmony_ci} 748bf215546Sopenharmony_ci 749bf215546Sopenharmony_ci// replace branches to join blocks with join ops 750bf215546Sopenharmony_civoid 751bf215546Sopenharmony_ciNVC0LegalizePostRA::propagateJoin(BasicBlock *bb) 752bf215546Sopenharmony_ci{ 753bf215546Sopenharmony_ci if (bb->getEntry()->op != OP_JOIN || bb->getEntry()->asFlow()->limit) 754bf215546Sopenharmony_ci return; 755bf215546Sopenharmony_ci for (Graph::EdgeIterator ei = bb->cfg.incident(); !ei.end(); ei.next()) { 756bf215546Sopenharmony_ci BasicBlock *in = BasicBlock::get(ei.getNode()); 757bf215546Sopenharmony_ci Instruction *exit = in->getExit(); 758bf215546Sopenharmony_ci if (!exit) { 759bf215546Sopenharmony_ci in->insertTail(new FlowInstruction(func, OP_JOIN, bb)); 760bf215546Sopenharmony_ci // there should always be a terminator instruction 761bf215546Sopenharmony_ci WARN("inserted missing terminator in BB:%i\n", in->getId()); 762bf215546Sopenharmony_ci } else 763bf215546Sopenharmony_ci if (exit->op == OP_BRA) { 764bf215546Sopenharmony_ci exit->op = OP_JOIN; 765bf215546Sopenharmony_ci exit->asFlow()->limit = 1; // must-not-propagate marker 766bf215546Sopenharmony_ci } 767bf215546Sopenharmony_ci } 768bf215546Sopenharmony_ci bb->remove(bb->getEntry()); 769bf215546Sopenharmony_ci} 770bf215546Sopenharmony_ci 771bf215546Sopenharmony_ci// replaces instructions which would end up as f2f or i2i with faster 772bf215546Sopenharmony_ci// alternatives: 773bf215546Sopenharmony_ci// - fabs(a) -> fadd(0, abs a) 774bf215546Sopenharmony_ci// - fneg(a) -> fadd(neg 0, neg a) 775bf215546Sopenharmony_ci// - ineg(a) -> iadd(0, neg a) 776bf215546Sopenharmony_ci// - fneg(abs a) -> fadd(neg 0, neg abs a) 777bf215546Sopenharmony_ci// - sat(a) -> sat add(0, a) 778bf215546Sopenharmony_civoid 779bf215546Sopenharmony_ciNVC0LegalizePostRA::replaceCvt(Instruction *cvt) 780bf215546Sopenharmony_ci{ 781bf215546Sopenharmony_ci if (!isFloatType(cvt->sType) && typeSizeof(cvt->sType) != 4) 782bf215546Sopenharmony_ci return; 783bf215546Sopenharmony_ci if (cvt->sType != cvt->dType) 784bf215546Sopenharmony_ci return; 785bf215546Sopenharmony_ci // we could make it work, but in this case we have optimizations disabled 786bf215546Sopenharmony_ci // and we don't really care either way. 787bf215546Sopenharmony_ci if (cvt->src(0).getFile() != FILE_GPR && 788bf215546Sopenharmony_ci cvt->src(0).getFile() != FILE_MEMORY_CONST) 789bf215546Sopenharmony_ci return; 790bf215546Sopenharmony_ci 791bf215546Sopenharmony_ci Modifier mod0, mod1; 792bf215546Sopenharmony_ci 793bf215546Sopenharmony_ci switch (cvt->op) { 794bf215546Sopenharmony_ci case OP_ABS: 795bf215546Sopenharmony_ci if (cvt->src(0).mod) 796bf215546Sopenharmony_ci return; 797bf215546Sopenharmony_ci if (!isFloatType(cvt->sType)) 798bf215546Sopenharmony_ci return; 799bf215546Sopenharmony_ci mod0 = 0; 800bf215546Sopenharmony_ci mod1 = NV50_IR_MOD_ABS; 801bf215546Sopenharmony_ci break; 802bf215546Sopenharmony_ci case OP_NEG: 803bf215546Sopenharmony_ci if (!isFloatType(cvt->sType) && cvt->src(0).mod) 804bf215546Sopenharmony_ci return; 805bf215546Sopenharmony_ci if (isFloatType(cvt->sType) && 806bf215546Sopenharmony_ci (cvt->src(0).mod && cvt->src(0).mod != Modifier(NV50_IR_MOD_ABS))) 807bf215546Sopenharmony_ci return; 808bf215546Sopenharmony_ci 809bf215546Sopenharmony_ci mod0 = isFloatType(cvt->sType) ? NV50_IR_MOD_NEG : 0; 810bf215546Sopenharmony_ci mod1 = cvt->src(0).mod == Modifier(NV50_IR_MOD_ABS) ? 811bf215546Sopenharmony_ci NV50_IR_MOD_NEG_ABS : NV50_IR_MOD_NEG; 812bf215546Sopenharmony_ci break; 813bf215546Sopenharmony_ci case OP_SAT: 814bf215546Sopenharmony_ci if (!isFloatType(cvt->sType) && cvt->src(0).mod.abs()) 815bf215546Sopenharmony_ci return; 816bf215546Sopenharmony_ci mod0 = 0; 817bf215546Sopenharmony_ci mod1 = cvt->src(0).mod; 818bf215546Sopenharmony_ci cvt->saturate = true; 819bf215546Sopenharmony_ci break; 820bf215546Sopenharmony_ci default: 821bf215546Sopenharmony_ci return; 822bf215546Sopenharmony_ci } 823bf215546Sopenharmony_ci 824bf215546Sopenharmony_ci cvt->op = OP_ADD; 825bf215546Sopenharmony_ci cvt->moveSources(0, 1); 826bf215546Sopenharmony_ci cvt->setSrc(0, rZero); 827bf215546Sopenharmony_ci cvt->src(0).mod = mod0; 828bf215546Sopenharmony_ci cvt->src(1).mod = mod1; 829bf215546Sopenharmony_ci} 830bf215546Sopenharmony_ci 831bf215546Sopenharmony_cibool 832bf215546Sopenharmony_ciNVC0LegalizePostRA::visit(BasicBlock *bb) 833bf215546Sopenharmony_ci{ 834bf215546Sopenharmony_ci Instruction *i, *next; 835bf215546Sopenharmony_ci 836bf215546Sopenharmony_ci // remove pseudo operations and non-fixed no-ops, split 64 bit operations 837bf215546Sopenharmony_ci for (i = bb->getFirst(); i; i = next) { 838bf215546Sopenharmony_ci next = i->next; 839bf215546Sopenharmony_ci if (i->op == OP_EMIT || i->op == OP_RESTART) { 840bf215546Sopenharmony_ci if (!i->getDef(0)->refCount()) 841bf215546Sopenharmony_ci i->setDef(0, NULL); 842bf215546Sopenharmony_ci if (i->src(0).getFile() == FILE_IMMEDIATE) 843bf215546Sopenharmony_ci i->setSrc(0, rZero); // initial value must be 0 844bf215546Sopenharmony_ci replaceZero(i); 845bf215546Sopenharmony_ci } else 846bf215546Sopenharmony_ci if (i->isNop()) { 847bf215546Sopenharmony_ci bb->remove(i); 848bf215546Sopenharmony_ci } else 849bf215546Sopenharmony_ci if (i->op == OP_BAR && i->subOp == NV50_IR_SUBOP_BAR_SYNC && 850bf215546Sopenharmony_ci prog->getType() != Program::TYPE_COMPUTE) { 851bf215546Sopenharmony_ci // It seems like barriers are never required for tessellation since 852bf215546Sopenharmony_ci // the warp size is 32, and there are always at most 32 tcs threads. 853bf215546Sopenharmony_ci bb->remove(i); 854bf215546Sopenharmony_ci } else 855bf215546Sopenharmony_ci if (i->op == OP_LOAD && i->subOp == NV50_IR_SUBOP_LDC_IS) { 856bf215546Sopenharmony_ci int offset = i->src(0).get()->reg.data.offset; 857bf215546Sopenharmony_ci if (abs(offset) >= 0x10000) 858bf215546Sopenharmony_ci i->src(0).get()->reg.fileIndex += offset >> 16; 859bf215546Sopenharmony_ci i->src(0).get()->reg.data.offset = (int)(short)offset; 860bf215546Sopenharmony_ci } else { 861bf215546Sopenharmony_ci // TODO: Move this to before register allocation for operations that 862bf215546Sopenharmony_ci // need the $c register ! 863bf215546Sopenharmony_ci if (typeSizeof(i->sType) == 8 || typeSizeof(i->dType) == 8) { 864bf215546Sopenharmony_ci Instruction *hi; 865bf215546Sopenharmony_ci hi = BuildUtil::split64BitOpPostRA(func, i, rZero, carry); 866bf215546Sopenharmony_ci if (hi) 867bf215546Sopenharmony_ci next = hi; 868bf215546Sopenharmony_ci } 869bf215546Sopenharmony_ci 870bf215546Sopenharmony_ci if (i->op != OP_MOV && i->op != OP_PFETCH) 871bf215546Sopenharmony_ci replaceZero(i); 872bf215546Sopenharmony_ci 873bf215546Sopenharmony_ci if (i->op == OP_SAT || i->op == OP_NEG || i->op == OP_ABS) 874bf215546Sopenharmony_ci replaceCvt(i); 875bf215546Sopenharmony_ci } 876bf215546Sopenharmony_ci } 877bf215546Sopenharmony_ci if (!bb->getEntry()) 878bf215546Sopenharmony_ci return true; 879bf215546Sopenharmony_ci 880bf215546Sopenharmony_ci if (!tryReplaceContWithBra(bb)) 881bf215546Sopenharmony_ci propagateJoin(bb); 882bf215546Sopenharmony_ci 883bf215546Sopenharmony_ci return true; 884bf215546Sopenharmony_ci} 885bf215546Sopenharmony_ci 886bf215546Sopenharmony_ciNVC0LoweringPass::NVC0LoweringPass(Program *prog) : targ(prog->getTarget()), 887bf215546Sopenharmony_ci gpEmitAddress(NULL) 888bf215546Sopenharmony_ci{ 889bf215546Sopenharmony_ci bld.setProgram(prog); 890bf215546Sopenharmony_ci} 891bf215546Sopenharmony_ci 892bf215546Sopenharmony_cibool 893bf215546Sopenharmony_ciNVC0LoweringPass::visit(Function *fn) 894bf215546Sopenharmony_ci{ 895bf215546Sopenharmony_ci if (prog->getType() == Program::TYPE_GEOMETRY) { 896bf215546Sopenharmony_ci assert(!strncmp(fn->getName(), "MAIN", 4)); 897bf215546Sopenharmony_ci // TODO: when we generate actual functions pass this value along somehow 898bf215546Sopenharmony_ci bld.setPosition(BasicBlock::get(fn->cfg.getRoot()), false); 899bf215546Sopenharmony_ci gpEmitAddress = bld.loadImm(NULL, 0)->asLValue(); 900bf215546Sopenharmony_ci if (fn->cfgExit) { 901bf215546Sopenharmony_ci bld.setPosition(BasicBlock::get(fn->cfgExit)->getExit(), false); 902bf215546Sopenharmony_ci if (prog->getTarget()->getChipset() >= NVISA_GV100_CHIPSET) 903bf215546Sopenharmony_ci bld.mkOp1(OP_FINAL, TYPE_NONE, NULL, gpEmitAddress)->fixed = 1; 904bf215546Sopenharmony_ci bld.mkMovToReg(0, gpEmitAddress); 905bf215546Sopenharmony_ci } 906bf215546Sopenharmony_ci } 907bf215546Sopenharmony_ci return true; 908bf215546Sopenharmony_ci} 909bf215546Sopenharmony_ci 910bf215546Sopenharmony_cibool 911bf215546Sopenharmony_ciNVC0LoweringPass::visit(BasicBlock *bb) 912bf215546Sopenharmony_ci{ 913bf215546Sopenharmony_ci return true; 914bf215546Sopenharmony_ci} 915bf215546Sopenharmony_ci 916bf215546Sopenharmony_ciinline Value * 917bf215546Sopenharmony_ciNVC0LoweringPass::loadTexHandle(Value *ptr, unsigned int slot) 918bf215546Sopenharmony_ci{ 919bf215546Sopenharmony_ci uint8_t b = prog->driver->io.auxCBSlot; 920bf215546Sopenharmony_ci uint32_t off = prog->driver->io.texBindBase + slot * 4; 921bf215546Sopenharmony_ci 922bf215546Sopenharmony_ci if (ptr) 923bf215546Sopenharmony_ci ptr = bld.mkOp2v(OP_SHL, TYPE_U32, bld.getSSA(), ptr, bld.mkImm(2)); 924bf215546Sopenharmony_ci 925bf215546Sopenharmony_ci return bld. 926bf215546Sopenharmony_ci mkLoadv(TYPE_U32, bld.mkSymbol(FILE_MEMORY_CONST, b, TYPE_U32, off), ptr); 927bf215546Sopenharmony_ci} 928bf215546Sopenharmony_ci 929bf215546Sopenharmony_ci// move array source to first slot, convert to u16, add indirections 930bf215546Sopenharmony_cibool 931bf215546Sopenharmony_ciNVC0LoweringPass::handleTEX(TexInstruction *i) 932bf215546Sopenharmony_ci{ 933bf215546Sopenharmony_ci const int dim = i->tex.target.getDim() + i->tex.target.isCube(); 934bf215546Sopenharmony_ci const int arg = i->tex.target.getArgCount(); 935bf215546Sopenharmony_ci const int lyr = arg - (i->tex.target.isMS() ? 2 : 1); 936bf215546Sopenharmony_ci const int chipset = prog->getTarget()->getChipset(); 937bf215546Sopenharmony_ci 938bf215546Sopenharmony_ci /* Only normalize in the non-explicit derivatives case. For explicit 939bf215546Sopenharmony_ci * derivatives, this is handled in handleManualTXD. 940bf215546Sopenharmony_ci */ 941bf215546Sopenharmony_ci if (i->tex.target.isCube() && i->dPdx[0].get() == NULL) { 942bf215546Sopenharmony_ci Value *src[3], *val; 943bf215546Sopenharmony_ci int c; 944bf215546Sopenharmony_ci for (c = 0; c < 3; ++c) 945bf215546Sopenharmony_ci src[c] = bld.mkOp1v(OP_ABS, TYPE_F32, bld.getSSA(), i->getSrc(c)); 946bf215546Sopenharmony_ci val = bld.getScratch(); 947bf215546Sopenharmony_ci bld.mkOp2(OP_MAX, TYPE_F32, val, src[0], src[1]); 948bf215546Sopenharmony_ci bld.mkOp2(OP_MAX, TYPE_F32, val, src[2], val); 949bf215546Sopenharmony_ci bld.mkOp1(OP_RCP, TYPE_F32, val, val); 950bf215546Sopenharmony_ci for (c = 0; c < 3; ++c) { 951bf215546Sopenharmony_ci i->setSrc(c, bld.mkOp2v(OP_MUL, TYPE_F32, bld.getSSA(), 952bf215546Sopenharmony_ci i->getSrc(c), val)); 953bf215546Sopenharmony_ci } 954bf215546Sopenharmony_ci } 955bf215546Sopenharmony_ci 956bf215546Sopenharmony_ci // Arguments to the TEX instruction are a little insane. Even though the 957bf215546Sopenharmony_ci // encoding is identical between SM20 and SM30, the arguments mean 958bf215546Sopenharmony_ci // different things between Fermi and Kepler+. A lot of arguments are 959bf215546Sopenharmony_ci // optional based on flags passed to the instruction. This summarizes the 960bf215546Sopenharmony_ci // order of things. 961bf215546Sopenharmony_ci // 962bf215546Sopenharmony_ci // Fermi: 963bf215546Sopenharmony_ci // array/indirect 964bf215546Sopenharmony_ci // coords 965bf215546Sopenharmony_ci // sample 966bf215546Sopenharmony_ci // lod bias 967bf215546Sopenharmony_ci // depth compare 968bf215546Sopenharmony_ci // offsets: 969bf215546Sopenharmony_ci // - tg4: 8 bits each, either 2 (1 offset reg) or 8 (2 offset reg) 970bf215546Sopenharmony_ci // - other: 4 bits each, single reg 971bf215546Sopenharmony_ci // 972bf215546Sopenharmony_ci // Kepler+: 973bf215546Sopenharmony_ci // indirect handle 974bf215546Sopenharmony_ci // array (+ offsets for txd in upper 16 bits) 975bf215546Sopenharmony_ci // coords 976bf215546Sopenharmony_ci // sample 977bf215546Sopenharmony_ci // lod bias 978bf215546Sopenharmony_ci // depth compare 979bf215546Sopenharmony_ci // offsets (same as fermi, except txd which takes it with array) 980bf215546Sopenharmony_ci // 981bf215546Sopenharmony_ci // Maxwell (tex): 982bf215546Sopenharmony_ci // array 983bf215546Sopenharmony_ci // coords 984bf215546Sopenharmony_ci // indirect handle 985bf215546Sopenharmony_ci // sample 986bf215546Sopenharmony_ci // lod bias 987bf215546Sopenharmony_ci // depth compare 988bf215546Sopenharmony_ci // offsets 989bf215546Sopenharmony_ci // 990bf215546Sopenharmony_ci // Maxwell (txd): 991bf215546Sopenharmony_ci // indirect handle 992bf215546Sopenharmony_ci // coords 993bf215546Sopenharmony_ci // array + offsets 994bf215546Sopenharmony_ci // derivatives 995bf215546Sopenharmony_ci 996bf215546Sopenharmony_ci if (chipset >= NVISA_GK104_CHIPSET) { 997bf215546Sopenharmony_ci if (i->tex.rIndirectSrc >= 0 || i->tex.sIndirectSrc >= 0) { 998bf215546Sopenharmony_ci // XXX this ignores tsc, and assumes a 1:1 mapping 999bf215546Sopenharmony_ci assert(i->tex.rIndirectSrc >= 0); 1000bf215546Sopenharmony_ci if (!i->tex.bindless) { 1001bf215546Sopenharmony_ci Value *hnd = loadTexHandle(i->getIndirectR(), i->tex.r); 1002bf215546Sopenharmony_ci i->tex.r = 0xff; 1003bf215546Sopenharmony_ci i->tex.s = 0x1f; 1004bf215546Sopenharmony_ci i->setIndirectR(hnd); 1005bf215546Sopenharmony_ci } 1006bf215546Sopenharmony_ci i->setIndirectS(NULL); 1007bf215546Sopenharmony_ci } else if (i->tex.r == i->tex.s || i->op == OP_TXF) { 1008bf215546Sopenharmony_ci if (i->tex.r == 0xffff) 1009bf215546Sopenharmony_ci i->tex.r = prog->driver->io.fbtexBindBase / 4; 1010bf215546Sopenharmony_ci else 1011bf215546Sopenharmony_ci i->tex.r += prog->driver->io.texBindBase / 4; 1012bf215546Sopenharmony_ci i->tex.s = 0; // only a single cX[] value possible here 1013bf215546Sopenharmony_ci } else { 1014bf215546Sopenharmony_ci Value *hnd = bld.getScratch(); 1015bf215546Sopenharmony_ci Value *rHnd = loadTexHandle(NULL, i->tex.r); 1016bf215546Sopenharmony_ci Value *sHnd = loadTexHandle(NULL, i->tex.s); 1017bf215546Sopenharmony_ci 1018bf215546Sopenharmony_ci bld.mkOp3(OP_INSBF, TYPE_U32, hnd, rHnd, bld.mkImm(0x1400), sHnd); 1019bf215546Sopenharmony_ci 1020bf215546Sopenharmony_ci i->tex.r = 0; // not used for indirect tex 1021bf215546Sopenharmony_ci i->tex.s = 0; 1022bf215546Sopenharmony_ci i->setIndirectR(hnd); 1023bf215546Sopenharmony_ci } 1024bf215546Sopenharmony_ci if (i->tex.target.isArray()) { 1025bf215546Sopenharmony_ci LValue *layer = new_LValue(func, FILE_GPR); 1026bf215546Sopenharmony_ci Value *src = i->getSrc(lyr); 1027bf215546Sopenharmony_ci const int sat = (i->op == OP_TXF) ? 1 : 0; 1028bf215546Sopenharmony_ci DataType sTy = (i->op == OP_TXF) ? TYPE_U32 : TYPE_F32; 1029bf215546Sopenharmony_ci bld.mkCvt(OP_CVT, TYPE_U16, layer, sTy, src)->saturate = sat; 1030bf215546Sopenharmony_ci if (i->op != OP_TXD || chipset < NVISA_GM107_CHIPSET) { 1031bf215546Sopenharmony_ci for (int s = dim; s >= 1; --s) 1032bf215546Sopenharmony_ci i->setSrc(s, i->getSrc(s - 1)); 1033bf215546Sopenharmony_ci i->setSrc(0, layer); 1034bf215546Sopenharmony_ci } else { 1035bf215546Sopenharmony_ci i->setSrc(dim, layer); 1036bf215546Sopenharmony_ci } 1037bf215546Sopenharmony_ci } 1038bf215546Sopenharmony_ci // Move the indirect reference to the first place 1039bf215546Sopenharmony_ci if (i->tex.rIndirectSrc >= 0 && ( 1040bf215546Sopenharmony_ci i->op == OP_TXD || chipset < NVISA_GM107_CHIPSET)) { 1041bf215546Sopenharmony_ci Value *hnd = i->getIndirectR(); 1042bf215546Sopenharmony_ci 1043bf215546Sopenharmony_ci i->setIndirectR(NULL); 1044bf215546Sopenharmony_ci i->moveSources(0, 1); 1045bf215546Sopenharmony_ci i->setSrc(0, hnd); 1046bf215546Sopenharmony_ci i->tex.rIndirectSrc = 0; 1047bf215546Sopenharmony_ci i->tex.sIndirectSrc = -1; 1048bf215546Sopenharmony_ci } 1049bf215546Sopenharmony_ci // Move the indirect reference to right after the coords 1050bf215546Sopenharmony_ci else if (i->tex.rIndirectSrc >= 0 && chipset >= NVISA_GM107_CHIPSET) { 1051bf215546Sopenharmony_ci Value *hnd = i->getIndirectR(); 1052bf215546Sopenharmony_ci 1053bf215546Sopenharmony_ci i->setIndirectR(NULL); 1054bf215546Sopenharmony_ci i->moveSources(arg, 1); 1055bf215546Sopenharmony_ci i->setSrc(arg, hnd); 1056bf215546Sopenharmony_ci i->tex.rIndirectSrc = 0; 1057bf215546Sopenharmony_ci i->tex.sIndirectSrc = -1; 1058bf215546Sopenharmony_ci } 1059bf215546Sopenharmony_ci } else 1060bf215546Sopenharmony_ci // (nvc0) generate and move the tsc/tic/array source to the front 1061bf215546Sopenharmony_ci if (i->tex.target.isArray() || i->tex.rIndirectSrc >= 0 || i->tex.sIndirectSrc >= 0) { 1062bf215546Sopenharmony_ci LValue *src = new_LValue(func, FILE_GPR); // 0xttxsaaaa 1063bf215546Sopenharmony_ci 1064bf215546Sopenharmony_ci Value *ticRel = i->getIndirectR(); 1065bf215546Sopenharmony_ci Value *tscRel = i->getIndirectS(); 1066bf215546Sopenharmony_ci 1067bf215546Sopenharmony_ci if (i->tex.r == 0xffff) { 1068bf215546Sopenharmony_ci i->tex.r = 0x20; 1069bf215546Sopenharmony_ci i->tex.s = 0x10; 1070bf215546Sopenharmony_ci } 1071bf215546Sopenharmony_ci 1072bf215546Sopenharmony_ci if (ticRel) { 1073bf215546Sopenharmony_ci i->setSrc(i->tex.rIndirectSrc, NULL); 1074bf215546Sopenharmony_ci if (i->tex.r) 1075bf215546Sopenharmony_ci ticRel = bld.mkOp2v(OP_ADD, TYPE_U32, bld.getScratch(), 1076bf215546Sopenharmony_ci ticRel, bld.mkImm(i->tex.r)); 1077bf215546Sopenharmony_ci } 1078bf215546Sopenharmony_ci if (tscRel) { 1079bf215546Sopenharmony_ci i->setSrc(i->tex.sIndirectSrc, NULL); 1080bf215546Sopenharmony_ci if (i->tex.s) 1081bf215546Sopenharmony_ci tscRel = bld.mkOp2v(OP_ADD, TYPE_U32, bld.getScratch(), 1082bf215546Sopenharmony_ci tscRel, bld.mkImm(i->tex.s)); 1083bf215546Sopenharmony_ci } 1084bf215546Sopenharmony_ci 1085bf215546Sopenharmony_ci Value *arrayIndex = i->tex.target.isArray() ? i->getSrc(lyr) : NULL; 1086bf215546Sopenharmony_ci if (arrayIndex) { 1087bf215546Sopenharmony_ci for (int s = dim; s >= 1; --s) 1088bf215546Sopenharmony_ci i->setSrc(s, i->getSrc(s - 1)); 1089bf215546Sopenharmony_ci i->setSrc(0, arrayIndex); 1090bf215546Sopenharmony_ci } else { 1091bf215546Sopenharmony_ci i->moveSources(0, 1); 1092bf215546Sopenharmony_ci } 1093bf215546Sopenharmony_ci 1094bf215546Sopenharmony_ci if (arrayIndex) { 1095bf215546Sopenharmony_ci int sat = (i->op == OP_TXF) ? 1 : 0; 1096bf215546Sopenharmony_ci DataType sTy = (i->op == OP_TXF) ? TYPE_U32 : TYPE_F32; 1097bf215546Sopenharmony_ci bld.mkCvt(OP_CVT, TYPE_U16, src, sTy, arrayIndex)->saturate = sat; 1098bf215546Sopenharmony_ci } else { 1099bf215546Sopenharmony_ci bld.loadImm(src, 0); 1100bf215546Sopenharmony_ci } 1101bf215546Sopenharmony_ci 1102bf215546Sopenharmony_ci if (ticRel) 1103bf215546Sopenharmony_ci bld.mkOp3(OP_INSBF, TYPE_U32, src, ticRel, bld.mkImm(0x0917), src); 1104bf215546Sopenharmony_ci if (tscRel) 1105bf215546Sopenharmony_ci bld.mkOp3(OP_INSBF, TYPE_U32, src, tscRel, bld.mkImm(0x0710), src); 1106bf215546Sopenharmony_ci 1107bf215546Sopenharmony_ci i->setSrc(0, src); 1108bf215546Sopenharmony_ci } 1109bf215546Sopenharmony_ci 1110bf215546Sopenharmony_ci // For nvc0, the sample id has to be in the second operand, as the offset 1111bf215546Sopenharmony_ci // does. Right now we don't know how to pass both in, and this case can't 1112bf215546Sopenharmony_ci // happen with OpenGL. On nve0, the sample id is part of the texture 1113bf215546Sopenharmony_ci // coordinate argument. 1114bf215546Sopenharmony_ci assert(chipset >= NVISA_GK104_CHIPSET || 1115bf215546Sopenharmony_ci !i->tex.useOffsets || !i->tex.target.isMS()); 1116bf215546Sopenharmony_ci 1117bf215546Sopenharmony_ci // offset is between lod and dc 1118bf215546Sopenharmony_ci if (i->tex.useOffsets) { 1119bf215546Sopenharmony_ci int n, c; 1120bf215546Sopenharmony_ci int s = i->srcCount(0xff, true); 1121bf215546Sopenharmony_ci if (i->op != OP_TXD || chipset < NVISA_GK104_CHIPSET) { 1122bf215546Sopenharmony_ci if (i->tex.target.isShadow()) 1123bf215546Sopenharmony_ci s--; 1124bf215546Sopenharmony_ci if (i->srcExists(s)) // move potential predicate out of the way 1125bf215546Sopenharmony_ci i->moveSources(s, 1); 1126bf215546Sopenharmony_ci if (i->tex.useOffsets == 4 && i->srcExists(s + 1)) 1127bf215546Sopenharmony_ci i->moveSources(s + 1, 1); 1128bf215546Sopenharmony_ci } 1129bf215546Sopenharmony_ci if (i->op == OP_TXG) { 1130bf215546Sopenharmony_ci // Either there is 1 offset, which goes into the 2 low bytes of the 1131bf215546Sopenharmony_ci // first source, or there are 4 offsets, which go into 2 sources (8 1132bf215546Sopenharmony_ci // values, 1 byte each). 1133bf215546Sopenharmony_ci Value *offs[2] = {NULL, NULL}; 1134bf215546Sopenharmony_ci for (n = 0; n < i->tex.useOffsets; n++) { 1135bf215546Sopenharmony_ci for (c = 0; c < 2; ++c) { 1136bf215546Sopenharmony_ci if ((n % 2) == 0 && c == 0) 1137bf215546Sopenharmony_ci bld.mkMov(offs[n / 2] = bld.getScratch(), i->offset[n][c].get()); 1138bf215546Sopenharmony_ci else 1139bf215546Sopenharmony_ci bld.mkOp3(OP_INSBF, TYPE_U32, 1140bf215546Sopenharmony_ci offs[n / 2], 1141bf215546Sopenharmony_ci i->offset[n][c].get(), 1142bf215546Sopenharmony_ci bld.mkImm(0x800 | ((n * 16 + c * 8) % 32)), 1143bf215546Sopenharmony_ci offs[n / 2]); 1144bf215546Sopenharmony_ci } 1145bf215546Sopenharmony_ci } 1146bf215546Sopenharmony_ci i->setSrc(s, offs[0]); 1147bf215546Sopenharmony_ci if (offs[1]) 1148bf215546Sopenharmony_ci i->setSrc(s + 1, offs[1]); 1149bf215546Sopenharmony_ci } else { 1150bf215546Sopenharmony_ci unsigned imm = 0; 1151bf215546Sopenharmony_ci assert(i->tex.useOffsets == 1); 1152bf215546Sopenharmony_ci for (c = 0; c < 3; ++c) { 1153bf215546Sopenharmony_ci ImmediateValue val; 1154bf215546Sopenharmony_ci if (!i->offset[0][c].getImmediate(val)) 1155bf215546Sopenharmony_ci assert(!"non-immediate offset passed to non-TXG"); 1156bf215546Sopenharmony_ci imm |= (val.reg.data.u32 & 0xf) << (c * 4); 1157bf215546Sopenharmony_ci } 1158bf215546Sopenharmony_ci if (i->op == OP_TXD && chipset >= NVISA_GK104_CHIPSET) { 1159bf215546Sopenharmony_ci // The offset goes into the upper 16 bits of the array index. So 1160bf215546Sopenharmony_ci // create it if it's not already there, and INSBF it if it already 1161bf215546Sopenharmony_ci // is. 1162bf215546Sopenharmony_ci s = (i->tex.rIndirectSrc >= 0) ? 1 : 0; 1163bf215546Sopenharmony_ci if (chipset >= NVISA_GM107_CHIPSET) 1164bf215546Sopenharmony_ci s += dim; 1165bf215546Sopenharmony_ci if (i->tex.target.isArray()) { 1166bf215546Sopenharmony_ci Value *offset = bld.getScratch(); 1167bf215546Sopenharmony_ci bld.mkOp3(OP_INSBF, TYPE_U32, offset, 1168bf215546Sopenharmony_ci bld.loadImm(NULL, imm), bld.mkImm(0xc10), 1169bf215546Sopenharmony_ci i->getSrc(s)); 1170bf215546Sopenharmony_ci i->setSrc(s, offset); 1171bf215546Sopenharmony_ci } else { 1172bf215546Sopenharmony_ci i->moveSources(s, 1); 1173bf215546Sopenharmony_ci i->setSrc(s, bld.loadImm(NULL, imm << 16)); 1174bf215546Sopenharmony_ci } 1175bf215546Sopenharmony_ci } else { 1176bf215546Sopenharmony_ci i->setSrc(s, bld.loadImm(NULL, imm)); 1177bf215546Sopenharmony_ci } 1178bf215546Sopenharmony_ci } 1179bf215546Sopenharmony_ci } 1180bf215546Sopenharmony_ci 1181bf215546Sopenharmony_ci return true; 1182bf215546Sopenharmony_ci} 1183bf215546Sopenharmony_ci 1184bf215546Sopenharmony_cibool 1185bf215546Sopenharmony_ciNVC0LoweringPass::handleManualTXD(TexInstruction *i) 1186bf215546Sopenharmony_ci{ 1187bf215546Sopenharmony_ci // Always done from the l0 perspective. This is the way that NVIDIA's 1188bf215546Sopenharmony_ci // driver does it, and doing it from the "current" lane's perspective 1189bf215546Sopenharmony_ci // doesn't seem to always work for reasons that aren't altogether clear, 1190bf215546Sopenharmony_ci // even in frag shaders. 1191bf215546Sopenharmony_ci // 1192bf215546Sopenharmony_ci // Note that we must move not only the coordinates into lane0, but also all 1193bf215546Sopenharmony_ci // ancillary arguments, like array indices and depth compare as they may 1194bf215546Sopenharmony_ci // differ between lanes. Offsets for TXD are supposed to be uniform, so we 1195bf215546Sopenharmony_ci // leave them alone. 1196bf215546Sopenharmony_ci static const uint8_t qOps[2] = 1197bf215546Sopenharmony_ci { QUADOP(MOV2, ADD, MOV2, ADD), QUADOP(MOV2, MOV2, ADD, ADD) }; 1198bf215546Sopenharmony_ci 1199bf215546Sopenharmony_ci Value *def[4][4]; 1200bf215546Sopenharmony_ci Value *crd[3], *arr[2], *shadow; 1201bf215546Sopenharmony_ci Instruction *tex; 1202bf215546Sopenharmony_ci Value *zero = bld.loadImm(bld.getSSA(), 0); 1203bf215546Sopenharmony_ci int l, c; 1204bf215546Sopenharmony_ci const int dim = i->tex.target.getDim() + i->tex.target.isCube(); 1205bf215546Sopenharmony_ci 1206bf215546Sopenharmony_ci // This function is invoked after handleTEX lowering, so we have to expect 1207bf215546Sopenharmony_ci // the arguments in the order that the hw wants them. For Fermi, array and 1208bf215546Sopenharmony_ci // indirect are both in the leading arg, while for Kepler, array and 1209bf215546Sopenharmony_ci // indirect are separate (and both precede the coordinates). Maxwell is 1210bf215546Sopenharmony_ci // handled in a separate function. 1211bf215546Sopenharmony_ci int array; 1212bf215546Sopenharmony_ci if (targ->getChipset() < NVISA_GK104_CHIPSET) 1213bf215546Sopenharmony_ci array = i->tex.target.isArray() || i->tex.rIndirectSrc >= 0; 1214bf215546Sopenharmony_ci else 1215bf215546Sopenharmony_ci array = i->tex.target.isArray() + (i->tex.rIndirectSrc >= 0); 1216bf215546Sopenharmony_ci 1217bf215546Sopenharmony_ci i->op = OP_TEX; // no need to clone dPdx/dPdy later 1218bf215546Sopenharmony_ci 1219bf215546Sopenharmony_ci for (c = 0; c < dim; ++c) 1220bf215546Sopenharmony_ci crd[c] = bld.getScratch(); 1221bf215546Sopenharmony_ci for (c = 0; c < array; ++c) 1222bf215546Sopenharmony_ci arr[c] = bld.getScratch(); 1223bf215546Sopenharmony_ci shadow = bld.getScratch(); 1224bf215546Sopenharmony_ci 1225bf215546Sopenharmony_ci for (l = 0; l < 4; ++l) { 1226bf215546Sopenharmony_ci Value *src[3], *val; 1227bf215546Sopenharmony_ci 1228bf215546Sopenharmony_ci bld.mkOp(OP_QUADON, TYPE_NONE, NULL); 1229bf215546Sopenharmony_ci // we're using the texture result from lane 0 in all cases, so make sure 1230bf215546Sopenharmony_ci // that lane 0 is pointing at the proper array index, indirect value, 1231bf215546Sopenharmony_ci // and depth compare. 1232bf215546Sopenharmony_ci if (l != 0) { 1233bf215546Sopenharmony_ci for (c = 0; c < array; ++c) 1234bf215546Sopenharmony_ci bld.mkQuadop(0x00, arr[c], l, i->getSrc(c), zero); 1235bf215546Sopenharmony_ci if (i->tex.target.isShadow()) { 1236bf215546Sopenharmony_ci // The next argument after coords is the depth compare 1237bf215546Sopenharmony_ci bld.mkQuadop(0x00, shadow, l, i->getSrc(array + dim), zero); 1238bf215546Sopenharmony_ci } 1239bf215546Sopenharmony_ci } 1240bf215546Sopenharmony_ci // mov position coordinates from lane l to all lanes 1241bf215546Sopenharmony_ci for (c = 0; c < dim; ++c) 1242bf215546Sopenharmony_ci bld.mkQuadop(0x00, crd[c], l, i->getSrc(c + array), zero); 1243bf215546Sopenharmony_ci // add dPdx from lane l to lanes dx 1244bf215546Sopenharmony_ci for (c = 0; c < dim; ++c) 1245bf215546Sopenharmony_ci bld.mkQuadop(qOps[0], crd[c], l, i->dPdx[c].get(), crd[c]); 1246bf215546Sopenharmony_ci // add dPdy from lane l to lanes dy 1247bf215546Sopenharmony_ci for (c = 0; c < dim; ++c) 1248bf215546Sopenharmony_ci bld.mkQuadop(qOps[1], crd[c], l, i->dPdy[c].get(), crd[c]); 1249bf215546Sopenharmony_ci // normalize cube coordinates 1250bf215546Sopenharmony_ci if (i->tex.target.isCube()) { 1251bf215546Sopenharmony_ci for (c = 0; c < 3; ++c) 1252bf215546Sopenharmony_ci src[c] = bld.mkOp1v(OP_ABS, TYPE_F32, bld.getSSA(), crd[c]); 1253bf215546Sopenharmony_ci val = bld.getScratch(); 1254bf215546Sopenharmony_ci bld.mkOp2(OP_MAX, TYPE_F32, val, src[0], src[1]); 1255bf215546Sopenharmony_ci bld.mkOp2(OP_MAX, TYPE_F32, val, src[2], val); 1256bf215546Sopenharmony_ci bld.mkOp1(OP_RCP, TYPE_F32, val, val); 1257bf215546Sopenharmony_ci for (c = 0; c < 3; ++c) 1258bf215546Sopenharmony_ci src[c] = bld.mkOp2v(OP_MUL, TYPE_F32, bld.getSSA(), crd[c], val); 1259bf215546Sopenharmony_ci } else { 1260bf215546Sopenharmony_ci for (c = 0; c < dim; ++c) 1261bf215546Sopenharmony_ci src[c] = crd[c]; 1262bf215546Sopenharmony_ci } 1263bf215546Sopenharmony_ci // texture 1264bf215546Sopenharmony_ci bld.insert(tex = cloneForward(func, i)); 1265bf215546Sopenharmony_ci if (l != 0) { 1266bf215546Sopenharmony_ci for (c = 0; c < array; ++c) 1267bf215546Sopenharmony_ci tex->setSrc(c, arr[c]); 1268bf215546Sopenharmony_ci if (i->tex.target.isShadow()) 1269bf215546Sopenharmony_ci tex->setSrc(array + dim, shadow); 1270bf215546Sopenharmony_ci } 1271bf215546Sopenharmony_ci for (c = 0; c < dim; ++c) 1272bf215546Sopenharmony_ci tex->setSrc(c + array, src[c]); 1273bf215546Sopenharmony_ci // broadcast results from lane 0 to all lanes so that the moves *into* 1274bf215546Sopenharmony_ci // the target lane pick up the proper value. 1275bf215546Sopenharmony_ci if (l != 0) 1276bf215546Sopenharmony_ci for (c = 0; i->defExists(c); ++c) 1277bf215546Sopenharmony_ci bld.mkQuadop(0x00, tex->getDef(c), 0, tex->getDef(c), zero); 1278bf215546Sopenharmony_ci bld.mkOp(OP_QUADPOP, TYPE_NONE, NULL); 1279bf215546Sopenharmony_ci 1280bf215546Sopenharmony_ci // save results 1281bf215546Sopenharmony_ci for (c = 0; i->defExists(c); ++c) { 1282bf215546Sopenharmony_ci Instruction *mov; 1283bf215546Sopenharmony_ci def[c][l] = bld.getSSA(); 1284bf215546Sopenharmony_ci mov = bld.mkMov(def[c][l], tex->getDef(c)); 1285bf215546Sopenharmony_ci mov->fixed = 1; 1286bf215546Sopenharmony_ci mov->lanes = 1 << l; 1287bf215546Sopenharmony_ci } 1288bf215546Sopenharmony_ci } 1289bf215546Sopenharmony_ci 1290bf215546Sopenharmony_ci for (c = 0; i->defExists(c); ++c) { 1291bf215546Sopenharmony_ci Instruction *u = bld.mkOp(OP_UNION, TYPE_U32, i->getDef(c)); 1292bf215546Sopenharmony_ci for (l = 0; l < 4; ++l) 1293bf215546Sopenharmony_ci u->setSrc(l, def[c][l]); 1294bf215546Sopenharmony_ci } 1295bf215546Sopenharmony_ci 1296bf215546Sopenharmony_ci i->bb->remove(i); 1297bf215546Sopenharmony_ci return true; 1298bf215546Sopenharmony_ci} 1299bf215546Sopenharmony_ci 1300bf215546Sopenharmony_cibool 1301bf215546Sopenharmony_ciNVC0LoweringPass::handleTXD(TexInstruction *txd) 1302bf215546Sopenharmony_ci{ 1303bf215546Sopenharmony_ci int dim = txd->tex.target.getDim() + txd->tex.target.isCube(); 1304bf215546Sopenharmony_ci unsigned arg = txd->tex.target.getArgCount(); 1305bf215546Sopenharmony_ci unsigned expected_args = arg; 1306bf215546Sopenharmony_ci const int chipset = prog->getTarget()->getChipset(); 1307bf215546Sopenharmony_ci 1308bf215546Sopenharmony_ci if (chipset >= NVISA_GK104_CHIPSET) { 1309bf215546Sopenharmony_ci if (!txd->tex.target.isArray() && txd->tex.useOffsets) 1310bf215546Sopenharmony_ci expected_args++; 1311bf215546Sopenharmony_ci if (txd->tex.rIndirectSrc >= 0 || txd->tex.sIndirectSrc >= 0) 1312bf215546Sopenharmony_ci expected_args++; 1313bf215546Sopenharmony_ci } else { 1314bf215546Sopenharmony_ci if (txd->tex.useOffsets) 1315bf215546Sopenharmony_ci expected_args++; 1316bf215546Sopenharmony_ci if (!txd->tex.target.isArray() && ( 1317bf215546Sopenharmony_ci txd->tex.rIndirectSrc >= 0 || txd->tex.sIndirectSrc >= 0)) 1318bf215546Sopenharmony_ci expected_args++; 1319bf215546Sopenharmony_ci } 1320bf215546Sopenharmony_ci 1321bf215546Sopenharmony_ci if (expected_args > 4 || 1322bf215546Sopenharmony_ci dim > 2 || 1323bf215546Sopenharmony_ci txd->tex.target.isShadow()) 1324bf215546Sopenharmony_ci txd->op = OP_TEX; 1325bf215546Sopenharmony_ci 1326bf215546Sopenharmony_ci handleTEX(txd); 1327bf215546Sopenharmony_ci while (txd->srcExists(arg)) 1328bf215546Sopenharmony_ci ++arg; 1329bf215546Sopenharmony_ci 1330bf215546Sopenharmony_ci txd->tex.derivAll = true; 1331bf215546Sopenharmony_ci if (txd->op == OP_TEX) 1332bf215546Sopenharmony_ci return handleManualTXD(txd); 1333bf215546Sopenharmony_ci 1334bf215546Sopenharmony_ci assert(arg == expected_args); 1335bf215546Sopenharmony_ci for (int c = 0; c < dim; ++c) { 1336bf215546Sopenharmony_ci txd->setSrc(arg + c * 2 + 0, txd->dPdx[c]); 1337bf215546Sopenharmony_ci txd->setSrc(arg + c * 2 + 1, txd->dPdy[c]); 1338bf215546Sopenharmony_ci txd->dPdx[c].set(NULL); 1339bf215546Sopenharmony_ci txd->dPdy[c].set(NULL); 1340bf215546Sopenharmony_ci } 1341bf215546Sopenharmony_ci 1342bf215546Sopenharmony_ci // In this case we have fewer than 4 "real" arguments, which means that 1343bf215546Sopenharmony_ci // handleTEX didn't apply any padding. However we have to make sure that 1344bf215546Sopenharmony_ci // the second "group" of arguments still gets padded up to 4. 1345bf215546Sopenharmony_ci if (chipset >= NVISA_GK104_CHIPSET) { 1346bf215546Sopenharmony_ci int s = arg + 2 * dim; 1347bf215546Sopenharmony_ci if (s >= 4 && s < 7) { 1348bf215546Sopenharmony_ci if (txd->srcExists(s)) // move potential predicate out of the way 1349bf215546Sopenharmony_ci txd->moveSources(s, 7 - s); 1350bf215546Sopenharmony_ci while (s < 7) 1351bf215546Sopenharmony_ci txd->setSrc(s++, bld.loadImm(NULL, 0)); 1352bf215546Sopenharmony_ci } 1353bf215546Sopenharmony_ci } 1354bf215546Sopenharmony_ci 1355bf215546Sopenharmony_ci return true; 1356bf215546Sopenharmony_ci} 1357bf215546Sopenharmony_ci 1358bf215546Sopenharmony_cibool 1359bf215546Sopenharmony_ciNVC0LoweringPass::handleTXQ(TexInstruction *txq) 1360bf215546Sopenharmony_ci{ 1361bf215546Sopenharmony_ci const int chipset = prog->getTarget()->getChipset(); 1362bf215546Sopenharmony_ci if (chipset >= NVISA_GK104_CHIPSET && txq->tex.rIndirectSrc < 0) 1363bf215546Sopenharmony_ci txq->tex.r += prog->driver->io.texBindBase / 4; 1364bf215546Sopenharmony_ci 1365bf215546Sopenharmony_ci if (txq->tex.rIndirectSrc < 0) 1366bf215546Sopenharmony_ci return true; 1367bf215546Sopenharmony_ci 1368bf215546Sopenharmony_ci Value *ticRel = txq->getIndirectR(); 1369bf215546Sopenharmony_ci 1370bf215546Sopenharmony_ci txq->setIndirectS(NULL); 1371bf215546Sopenharmony_ci txq->tex.sIndirectSrc = -1; 1372bf215546Sopenharmony_ci 1373bf215546Sopenharmony_ci assert(ticRel); 1374bf215546Sopenharmony_ci 1375bf215546Sopenharmony_ci if (chipset < NVISA_GK104_CHIPSET) { 1376bf215546Sopenharmony_ci LValue *src = new_LValue(func, FILE_GPR); // 0xttxsaaaa 1377bf215546Sopenharmony_ci 1378bf215546Sopenharmony_ci txq->setSrc(txq->tex.rIndirectSrc, NULL); 1379bf215546Sopenharmony_ci if (txq->tex.r) 1380bf215546Sopenharmony_ci ticRel = bld.mkOp2v(OP_ADD, TYPE_U32, bld.getScratch(), 1381bf215546Sopenharmony_ci ticRel, bld.mkImm(txq->tex.r)); 1382bf215546Sopenharmony_ci 1383bf215546Sopenharmony_ci bld.mkOp2(OP_SHL, TYPE_U32, src, ticRel, bld.mkImm(0x17)); 1384bf215546Sopenharmony_ci 1385bf215546Sopenharmony_ci txq->moveSources(0, 1); 1386bf215546Sopenharmony_ci txq->setSrc(0, src); 1387bf215546Sopenharmony_ci } else { 1388bf215546Sopenharmony_ci Value *hnd = loadTexHandle(txq->getIndirectR(), txq->tex.r); 1389bf215546Sopenharmony_ci txq->tex.r = 0xff; 1390bf215546Sopenharmony_ci txq->tex.s = 0x1f; 1391bf215546Sopenharmony_ci 1392bf215546Sopenharmony_ci txq->setIndirectR(NULL); 1393bf215546Sopenharmony_ci txq->moveSources(0, 1); 1394bf215546Sopenharmony_ci txq->setSrc(0, hnd); 1395bf215546Sopenharmony_ci txq->tex.rIndirectSrc = 0; 1396bf215546Sopenharmony_ci } 1397bf215546Sopenharmony_ci 1398bf215546Sopenharmony_ci return true; 1399bf215546Sopenharmony_ci} 1400bf215546Sopenharmony_ci 1401bf215546Sopenharmony_cibool 1402bf215546Sopenharmony_ciNVC0LoweringPass::handleTXLQ(TexInstruction *i) 1403bf215546Sopenharmony_ci{ 1404bf215546Sopenharmony_ci /* The outputs are inverted compared to what the TGSI instruction 1405bf215546Sopenharmony_ci * expects. Take that into account in the mask. 1406bf215546Sopenharmony_ci */ 1407bf215546Sopenharmony_ci assert((i->tex.mask & ~3) == 0); 1408bf215546Sopenharmony_ci if (i->tex.mask == 1) 1409bf215546Sopenharmony_ci i->tex.mask = 2; 1410bf215546Sopenharmony_ci else if (i->tex.mask == 2) 1411bf215546Sopenharmony_ci i->tex.mask = 1; 1412bf215546Sopenharmony_ci handleTEX(i); 1413bf215546Sopenharmony_ci bld.setPosition(i, true); 1414bf215546Sopenharmony_ci 1415bf215546Sopenharmony_ci /* The returned values are not quite what we want: 1416bf215546Sopenharmony_ci * (a) convert from s16/u16 to f32 1417bf215546Sopenharmony_ci * (b) multiply by 1/256 1418bf215546Sopenharmony_ci */ 1419bf215546Sopenharmony_ci for (int def = 0; def < 2; ++def) { 1420bf215546Sopenharmony_ci if (!i->defExists(def)) 1421bf215546Sopenharmony_ci continue; 1422bf215546Sopenharmony_ci enum DataType type = TYPE_S16; 1423bf215546Sopenharmony_ci if (i->tex.mask == 2 || def > 0) 1424bf215546Sopenharmony_ci type = TYPE_U16; 1425bf215546Sopenharmony_ci bld.mkCvt(OP_CVT, TYPE_F32, i->getDef(def), type, i->getDef(def)); 1426bf215546Sopenharmony_ci bld.mkOp2(OP_MUL, TYPE_F32, i->getDef(def), 1427bf215546Sopenharmony_ci i->getDef(def), bld.loadImm(NULL, 1.0f / 256)); 1428bf215546Sopenharmony_ci } 1429bf215546Sopenharmony_ci if (i->tex.mask == 3) { 1430bf215546Sopenharmony_ci LValue *t = new_LValue(func, FILE_GPR); 1431bf215546Sopenharmony_ci bld.mkMov(t, i->getDef(0)); 1432bf215546Sopenharmony_ci bld.mkMov(i->getDef(0), i->getDef(1)); 1433bf215546Sopenharmony_ci bld.mkMov(i->getDef(1), t); 1434bf215546Sopenharmony_ci } 1435bf215546Sopenharmony_ci return true; 1436bf215546Sopenharmony_ci} 1437bf215546Sopenharmony_ci 1438bf215546Sopenharmony_cibool 1439bf215546Sopenharmony_ciNVC0LoweringPass::handleBUFQ(Instruction *bufq) 1440bf215546Sopenharmony_ci{ 1441bf215546Sopenharmony_ci bufq->op = OP_MOV; 1442bf215546Sopenharmony_ci bufq->setSrc(0, loadBufLength32(bufq->getIndirect(0, 1), 1443bf215546Sopenharmony_ci bufq->getSrc(0)->reg.fileIndex * 16)); 1444bf215546Sopenharmony_ci bufq->setIndirect(0, 0, NULL); 1445bf215546Sopenharmony_ci bufq->setIndirect(0, 1, NULL); 1446bf215546Sopenharmony_ci return true; 1447bf215546Sopenharmony_ci} 1448bf215546Sopenharmony_ci 1449bf215546Sopenharmony_civoid 1450bf215546Sopenharmony_ciNVC0LoweringPass::handleSharedATOMNVE4(Instruction *atom) 1451bf215546Sopenharmony_ci{ 1452bf215546Sopenharmony_ci assert(atom->src(0).getFile() == FILE_MEMORY_SHARED); 1453bf215546Sopenharmony_ci 1454bf215546Sopenharmony_ci BasicBlock *currBB = atom->bb; 1455bf215546Sopenharmony_ci BasicBlock *tryLockBB = atom->bb->splitBefore(atom, false); 1456bf215546Sopenharmony_ci BasicBlock *joinBB = atom->bb->splitAfter(atom); 1457bf215546Sopenharmony_ci BasicBlock *setAndUnlockBB = new BasicBlock(func); 1458bf215546Sopenharmony_ci BasicBlock *failLockBB = new BasicBlock(func); 1459bf215546Sopenharmony_ci 1460bf215546Sopenharmony_ci bld.setPosition(currBB, true); 1461bf215546Sopenharmony_ci assert(!currBB->joinAt); 1462bf215546Sopenharmony_ci currBB->joinAt = bld.mkFlow(OP_JOINAT, joinBB, CC_ALWAYS, NULL); 1463bf215546Sopenharmony_ci 1464bf215546Sopenharmony_ci CmpInstruction *pred = 1465bf215546Sopenharmony_ci bld.mkCmp(OP_SET, CC_EQ, TYPE_U32, bld.getSSA(1, FILE_PREDICATE), 1466bf215546Sopenharmony_ci TYPE_U32, bld.mkImm(0), bld.mkImm(1)); 1467bf215546Sopenharmony_ci 1468bf215546Sopenharmony_ci bld.mkFlow(OP_BRA, tryLockBB, CC_ALWAYS, NULL); 1469bf215546Sopenharmony_ci currBB->cfg.attach(&tryLockBB->cfg, Graph::Edge::TREE); 1470bf215546Sopenharmony_ci 1471bf215546Sopenharmony_ci bld.setPosition(tryLockBB, true); 1472bf215546Sopenharmony_ci 1473bf215546Sopenharmony_ci Instruction *ld = 1474bf215546Sopenharmony_ci bld.mkLoad(TYPE_U32, atom->getDef(0), atom->getSrc(0)->asSym(), 1475bf215546Sopenharmony_ci atom->getIndirect(0, 0)); 1476bf215546Sopenharmony_ci ld->setDef(1, bld.getSSA(1, FILE_PREDICATE)); 1477bf215546Sopenharmony_ci ld->subOp = NV50_IR_SUBOP_LOAD_LOCKED; 1478bf215546Sopenharmony_ci 1479bf215546Sopenharmony_ci bld.mkFlow(OP_BRA, setAndUnlockBB, CC_P, ld->getDef(1)); 1480bf215546Sopenharmony_ci bld.mkFlow(OP_BRA, failLockBB, CC_ALWAYS, NULL); 1481bf215546Sopenharmony_ci tryLockBB->cfg.attach(&failLockBB->cfg, Graph::Edge::CROSS); 1482bf215546Sopenharmony_ci tryLockBB->cfg.attach(&setAndUnlockBB->cfg, Graph::Edge::TREE); 1483bf215546Sopenharmony_ci 1484bf215546Sopenharmony_ci tryLockBB->cfg.detach(&joinBB->cfg); 1485bf215546Sopenharmony_ci bld.remove(atom); 1486bf215546Sopenharmony_ci 1487bf215546Sopenharmony_ci bld.setPosition(setAndUnlockBB, true); 1488bf215546Sopenharmony_ci Value *stVal; 1489bf215546Sopenharmony_ci if (atom->subOp == NV50_IR_SUBOP_ATOM_EXCH) { 1490bf215546Sopenharmony_ci // Read the old value, and write the new one. 1491bf215546Sopenharmony_ci stVal = atom->getSrc(1); 1492bf215546Sopenharmony_ci } else if (atom->subOp == NV50_IR_SUBOP_ATOM_CAS) { 1493bf215546Sopenharmony_ci CmpInstruction *set = 1494bf215546Sopenharmony_ci bld.mkCmp(OP_SET, CC_EQ, TYPE_U32, bld.getSSA(), 1495bf215546Sopenharmony_ci TYPE_U32, ld->getDef(0), atom->getSrc(1)); 1496bf215546Sopenharmony_ci 1497bf215546Sopenharmony_ci bld.mkCmp(OP_SLCT, CC_NE, TYPE_U32, (stVal = bld.getSSA()), 1498bf215546Sopenharmony_ci TYPE_U32, atom->getSrc(2), ld->getDef(0), set->getDef(0)); 1499bf215546Sopenharmony_ci } else { 1500bf215546Sopenharmony_ci operation op; 1501bf215546Sopenharmony_ci 1502bf215546Sopenharmony_ci switch (atom->subOp) { 1503bf215546Sopenharmony_ci case NV50_IR_SUBOP_ATOM_ADD: 1504bf215546Sopenharmony_ci op = OP_ADD; 1505bf215546Sopenharmony_ci break; 1506bf215546Sopenharmony_ci case NV50_IR_SUBOP_ATOM_AND: 1507bf215546Sopenharmony_ci op = OP_AND; 1508bf215546Sopenharmony_ci break; 1509bf215546Sopenharmony_ci case NV50_IR_SUBOP_ATOM_OR: 1510bf215546Sopenharmony_ci op = OP_OR; 1511bf215546Sopenharmony_ci break; 1512bf215546Sopenharmony_ci case NV50_IR_SUBOP_ATOM_XOR: 1513bf215546Sopenharmony_ci op = OP_XOR; 1514bf215546Sopenharmony_ci break; 1515bf215546Sopenharmony_ci case NV50_IR_SUBOP_ATOM_MIN: 1516bf215546Sopenharmony_ci op = OP_MIN; 1517bf215546Sopenharmony_ci break; 1518bf215546Sopenharmony_ci case NV50_IR_SUBOP_ATOM_MAX: 1519bf215546Sopenharmony_ci op = OP_MAX; 1520bf215546Sopenharmony_ci break; 1521bf215546Sopenharmony_ci default: 1522bf215546Sopenharmony_ci assert(0); 1523bf215546Sopenharmony_ci return; 1524bf215546Sopenharmony_ci } 1525bf215546Sopenharmony_ci 1526bf215546Sopenharmony_ci stVal = bld.mkOp2v(op, atom->dType, bld.getSSA(), ld->getDef(0), 1527bf215546Sopenharmony_ci atom->getSrc(1)); 1528bf215546Sopenharmony_ci } 1529bf215546Sopenharmony_ci 1530bf215546Sopenharmony_ci Instruction *st = 1531bf215546Sopenharmony_ci bld.mkStore(OP_STORE, TYPE_U32, atom->getSrc(0)->asSym(), 1532bf215546Sopenharmony_ci atom->getIndirect(0, 0), stVal); 1533bf215546Sopenharmony_ci st->setDef(0, pred->getDef(0)); 1534bf215546Sopenharmony_ci st->subOp = NV50_IR_SUBOP_STORE_UNLOCKED; 1535bf215546Sopenharmony_ci 1536bf215546Sopenharmony_ci bld.mkFlow(OP_BRA, failLockBB, CC_ALWAYS, NULL); 1537bf215546Sopenharmony_ci setAndUnlockBB->cfg.attach(&failLockBB->cfg, Graph::Edge::TREE); 1538bf215546Sopenharmony_ci 1539bf215546Sopenharmony_ci // Lock until the store has not been performed. 1540bf215546Sopenharmony_ci bld.setPosition(failLockBB, true); 1541bf215546Sopenharmony_ci bld.mkFlow(OP_BRA, tryLockBB, CC_NOT_P, pred->getDef(0)); 1542bf215546Sopenharmony_ci bld.mkFlow(OP_BRA, joinBB, CC_ALWAYS, NULL); 1543bf215546Sopenharmony_ci failLockBB->cfg.attach(&tryLockBB->cfg, Graph::Edge::BACK); 1544bf215546Sopenharmony_ci failLockBB->cfg.attach(&joinBB->cfg, Graph::Edge::TREE); 1545bf215546Sopenharmony_ci 1546bf215546Sopenharmony_ci bld.setPosition(joinBB, false); 1547bf215546Sopenharmony_ci bld.mkFlow(OP_JOIN, NULL, CC_ALWAYS, NULL)->fixed = 1; 1548bf215546Sopenharmony_ci} 1549bf215546Sopenharmony_ci 1550bf215546Sopenharmony_civoid 1551bf215546Sopenharmony_ciNVC0LoweringPass::handleSharedATOM(Instruction *atom) 1552bf215546Sopenharmony_ci{ 1553bf215546Sopenharmony_ci assert(atom->src(0).getFile() == FILE_MEMORY_SHARED); 1554bf215546Sopenharmony_ci 1555bf215546Sopenharmony_ci BasicBlock *currBB = atom->bb; 1556bf215546Sopenharmony_ci BasicBlock *tryLockAndSetBB = atom->bb->splitBefore(atom, false); 1557bf215546Sopenharmony_ci BasicBlock *joinBB = atom->bb->splitAfter(atom); 1558bf215546Sopenharmony_ci 1559bf215546Sopenharmony_ci bld.setPosition(currBB, true); 1560bf215546Sopenharmony_ci assert(!currBB->joinAt); 1561bf215546Sopenharmony_ci currBB->joinAt = bld.mkFlow(OP_JOINAT, joinBB, CC_ALWAYS, NULL); 1562bf215546Sopenharmony_ci 1563bf215546Sopenharmony_ci bld.mkFlow(OP_BRA, tryLockAndSetBB, CC_ALWAYS, NULL); 1564bf215546Sopenharmony_ci currBB->cfg.attach(&tryLockAndSetBB->cfg, Graph::Edge::TREE); 1565bf215546Sopenharmony_ci 1566bf215546Sopenharmony_ci bld.setPosition(tryLockAndSetBB, true); 1567bf215546Sopenharmony_ci 1568bf215546Sopenharmony_ci Instruction *ld = 1569bf215546Sopenharmony_ci bld.mkLoad(TYPE_U32, atom->getDef(0), atom->getSrc(0)->asSym(), 1570bf215546Sopenharmony_ci atom->getIndirect(0, 0)); 1571bf215546Sopenharmony_ci ld->setDef(1, bld.getSSA(1, FILE_PREDICATE)); 1572bf215546Sopenharmony_ci ld->subOp = NV50_IR_SUBOP_LOAD_LOCKED; 1573bf215546Sopenharmony_ci 1574bf215546Sopenharmony_ci Value *stVal; 1575bf215546Sopenharmony_ci if (atom->subOp == NV50_IR_SUBOP_ATOM_EXCH) { 1576bf215546Sopenharmony_ci // Read the old value, and write the new one. 1577bf215546Sopenharmony_ci stVal = atom->getSrc(1); 1578bf215546Sopenharmony_ci } else if (atom->subOp == NV50_IR_SUBOP_ATOM_CAS) { 1579bf215546Sopenharmony_ci CmpInstruction *set = 1580bf215546Sopenharmony_ci bld.mkCmp(OP_SET, CC_EQ, TYPE_U32, bld.getSSA(1, FILE_PREDICATE), 1581bf215546Sopenharmony_ci TYPE_U32, ld->getDef(0), atom->getSrc(1)); 1582bf215546Sopenharmony_ci set->setPredicate(CC_P, ld->getDef(1)); 1583bf215546Sopenharmony_ci 1584bf215546Sopenharmony_ci Instruction *selp = 1585bf215546Sopenharmony_ci bld.mkOp3(OP_SELP, TYPE_U32, bld.getSSA(), ld->getDef(0), 1586bf215546Sopenharmony_ci atom->getSrc(2), set->getDef(0)); 1587bf215546Sopenharmony_ci selp->src(2).mod = Modifier(NV50_IR_MOD_NOT); 1588bf215546Sopenharmony_ci selp->setPredicate(CC_P, ld->getDef(1)); 1589bf215546Sopenharmony_ci 1590bf215546Sopenharmony_ci stVal = selp->getDef(0); 1591bf215546Sopenharmony_ci } else { 1592bf215546Sopenharmony_ci operation op; 1593bf215546Sopenharmony_ci 1594bf215546Sopenharmony_ci switch (atom->subOp) { 1595bf215546Sopenharmony_ci case NV50_IR_SUBOP_ATOM_ADD: 1596bf215546Sopenharmony_ci op = OP_ADD; 1597bf215546Sopenharmony_ci break; 1598bf215546Sopenharmony_ci case NV50_IR_SUBOP_ATOM_AND: 1599bf215546Sopenharmony_ci op = OP_AND; 1600bf215546Sopenharmony_ci break; 1601bf215546Sopenharmony_ci case NV50_IR_SUBOP_ATOM_OR: 1602bf215546Sopenharmony_ci op = OP_OR; 1603bf215546Sopenharmony_ci break; 1604bf215546Sopenharmony_ci case NV50_IR_SUBOP_ATOM_XOR: 1605bf215546Sopenharmony_ci op = OP_XOR; 1606bf215546Sopenharmony_ci break; 1607bf215546Sopenharmony_ci case NV50_IR_SUBOP_ATOM_MIN: 1608bf215546Sopenharmony_ci op = OP_MIN; 1609bf215546Sopenharmony_ci break; 1610bf215546Sopenharmony_ci case NV50_IR_SUBOP_ATOM_MAX: 1611bf215546Sopenharmony_ci op = OP_MAX; 1612bf215546Sopenharmony_ci break; 1613bf215546Sopenharmony_ci default: 1614bf215546Sopenharmony_ci assert(0); 1615bf215546Sopenharmony_ci return; 1616bf215546Sopenharmony_ci } 1617bf215546Sopenharmony_ci 1618bf215546Sopenharmony_ci Instruction *i = 1619bf215546Sopenharmony_ci bld.mkOp2(op, atom->dType, bld.getSSA(), ld->getDef(0), 1620bf215546Sopenharmony_ci atom->getSrc(1)); 1621bf215546Sopenharmony_ci i->setPredicate(CC_P, ld->getDef(1)); 1622bf215546Sopenharmony_ci 1623bf215546Sopenharmony_ci stVal = i->getDef(0); 1624bf215546Sopenharmony_ci } 1625bf215546Sopenharmony_ci 1626bf215546Sopenharmony_ci Instruction *st = 1627bf215546Sopenharmony_ci bld.mkStore(OP_STORE, TYPE_U32, atom->getSrc(0)->asSym(), 1628bf215546Sopenharmony_ci atom->getIndirect(0, 0), stVal); 1629bf215546Sopenharmony_ci st->setPredicate(CC_P, ld->getDef(1)); 1630bf215546Sopenharmony_ci st->subOp = NV50_IR_SUBOP_STORE_UNLOCKED; 1631bf215546Sopenharmony_ci 1632bf215546Sopenharmony_ci // Loop until the lock is acquired. 1633bf215546Sopenharmony_ci bld.mkFlow(OP_BRA, tryLockAndSetBB, CC_NOT_P, ld->getDef(1)); 1634bf215546Sopenharmony_ci tryLockAndSetBB->cfg.attach(&tryLockAndSetBB->cfg, Graph::Edge::BACK); 1635bf215546Sopenharmony_ci tryLockAndSetBB->cfg.attach(&joinBB->cfg, Graph::Edge::CROSS); 1636bf215546Sopenharmony_ci bld.mkFlow(OP_BRA, joinBB, CC_ALWAYS, NULL); 1637bf215546Sopenharmony_ci 1638bf215546Sopenharmony_ci bld.remove(atom); 1639bf215546Sopenharmony_ci 1640bf215546Sopenharmony_ci bld.setPosition(joinBB, false); 1641bf215546Sopenharmony_ci bld.mkFlow(OP_JOIN, NULL, CC_ALWAYS, NULL)->fixed = 1; 1642bf215546Sopenharmony_ci} 1643bf215546Sopenharmony_ci 1644bf215546Sopenharmony_cibool 1645bf215546Sopenharmony_ciNVC0LoweringPass::handleATOM(Instruction *atom) 1646bf215546Sopenharmony_ci{ 1647bf215546Sopenharmony_ci SVSemantic sv; 1648bf215546Sopenharmony_ci Value *ptr = atom->getIndirect(0, 0), *ind = atom->getIndirect(0, 1), *base; 1649bf215546Sopenharmony_ci 1650bf215546Sopenharmony_ci switch (atom->src(0).getFile()) { 1651bf215546Sopenharmony_ci case FILE_MEMORY_LOCAL: 1652bf215546Sopenharmony_ci sv = SV_LBASE; 1653bf215546Sopenharmony_ci break; 1654bf215546Sopenharmony_ci case FILE_MEMORY_SHARED: 1655bf215546Sopenharmony_ci // For Fermi/Kepler, we have to use ld lock/st unlock to perform atomic 1656bf215546Sopenharmony_ci // operations on shared memory. For Maxwell, ATOMS is enough. 1657bf215546Sopenharmony_ci if (targ->getChipset() < NVISA_GK104_CHIPSET) 1658bf215546Sopenharmony_ci handleSharedATOM(atom); 1659bf215546Sopenharmony_ci else if (targ->getChipset() < NVISA_GM107_CHIPSET) 1660bf215546Sopenharmony_ci handleSharedATOMNVE4(atom); 1661bf215546Sopenharmony_ci return true; 1662bf215546Sopenharmony_ci case FILE_MEMORY_GLOBAL: 1663bf215546Sopenharmony_ci return true; 1664bf215546Sopenharmony_ci default: 1665bf215546Sopenharmony_ci assert(atom->src(0).getFile() == FILE_MEMORY_BUFFER); 1666bf215546Sopenharmony_ci base = loadBufInfo64(ind, atom->getSrc(0)->reg.fileIndex * 16); 1667bf215546Sopenharmony_ci assert(base->reg.size == 8); 1668bf215546Sopenharmony_ci if (ptr) 1669bf215546Sopenharmony_ci base = bld.mkOp2v(OP_ADD, TYPE_U64, base, base, ptr); 1670bf215546Sopenharmony_ci assert(base->reg.size == 8); 1671bf215546Sopenharmony_ci atom->setIndirect(0, 0, base); 1672bf215546Sopenharmony_ci atom->getSrc(0)->reg.file = FILE_MEMORY_GLOBAL; 1673bf215546Sopenharmony_ci 1674bf215546Sopenharmony_ci // Harden against out-of-bounds accesses 1675bf215546Sopenharmony_ci Value *offset = bld.loadImm(NULL, atom->getSrc(0)->reg.data.offset + typeSizeof(atom->sType)); 1676bf215546Sopenharmony_ci Value *length = loadBufLength32(ind, atom->getSrc(0)->reg.fileIndex * 16); 1677bf215546Sopenharmony_ci Value *pred = new_LValue(func, FILE_PREDICATE); 1678bf215546Sopenharmony_ci if (ptr) 1679bf215546Sopenharmony_ci bld.mkOp2(OP_ADD, TYPE_U32, offset, offset, ptr); 1680bf215546Sopenharmony_ci bld.mkCmp(OP_SET, CC_GT, TYPE_U32, pred, TYPE_U32, offset, length); 1681bf215546Sopenharmony_ci atom->setPredicate(CC_NOT_P, pred); 1682bf215546Sopenharmony_ci if (atom->defExists(0)) { 1683bf215546Sopenharmony_ci Value *zero, *dst = atom->getDef(0); 1684bf215546Sopenharmony_ci atom->setDef(0, bld.getSSA()); 1685bf215546Sopenharmony_ci 1686bf215546Sopenharmony_ci bld.setPosition(atom, true); 1687bf215546Sopenharmony_ci bld.mkMov((zero = bld.getSSA()), bld.mkImm(0)) 1688bf215546Sopenharmony_ci ->setPredicate(CC_P, pred); 1689bf215546Sopenharmony_ci bld.mkOp2(OP_UNION, TYPE_U32, dst, atom->getDef(0), zero); 1690bf215546Sopenharmony_ci } 1691bf215546Sopenharmony_ci 1692bf215546Sopenharmony_ci return true; 1693bf215546Sopenharmony_ci } 1694bf215546Sopenharmony_ci base = 1695bf215546Sopenharmony_ci bld.mkOp1v(OP_RDSV, TYPE_U32, bld.getScratch(), bld.mkSysVal(sv, 0)); 1696bf215546Sopenharmony_ci 1697bf215546Sopenharmony_ci atom->setSrc(0, cloneShallow(func, atom->getSrc(0))); 1698bf215546Sopenharmony_ci atom->getSrc(0)->reg.file = FILE_MEMORY_GLOBAL; 1699bf215546Sopenharmony_ci if (ptr) 1700bf215546Sopenharmony_ci base = bld.mkOp2v(OP_ADD, TYPE_U32, base, base, ptr); 1701bf215546Sopenharmony_ci atom->setIndirect(0, 1, NULL); 1702bf215546Sopenharmony_ci atom->setIndirect(0, 0, base); 1703bf215546Sopenharmony_ci 1704bf215546Sopenharmony_ci return true; 1705bf215546Sopenharmony_ci} 1706bf215546Sopenharmony_ci 1707bf215546Sopenharmony_cibool 1708bf215546Sopenharmony_ciNVC0LoweringPass::handleATOMCctl(Instruction *atom) { 1709bf215546Sopenharmony_ci // Flush L1 cache manually since atomics go directly to L2. This ensures 1710bf215546Sopenharmony_ci // that any later CA reads retrieve the updated data. 1711bf215546Sopenharmony_ci 1712bf215546Sopenharmony_ci if (atom->cache != nv50_ir::CACHE_CA) 1713bf215546Sopenharmony_ci return false; 1714bf215546Sopenharmony_ci 1715bf215546Sopenharmony_ci bld.setPosition(atom, true); 1716bf215546Sopenharmony_ci 1717bf215546Sopenharmony_ci Instruction *cctl = bld.mkOp1(OP_CCTL, TYPE_NONE, NULL, atom->getSrc(0)); 1718bf215546Sopenharmony_ci cctl->setIndirect(0, 0, atom->getIndirect(0, 0)); 1719bf215546Sopenharmony_ci cctl->fixed = 1; 1720bf215546Sopenharmony_ci cctl->subOp = NV50_IR_SUBOP_CCTL_IV; 1721bf215546Sopenharmony_ci if (atom->isPredicated()) 1722bf215546Sopenharmony_ci cctl->setPredicate(atom->cc, atom->getPredicate()); 1723bf215546Sopenharmony_ci 1724bf215546Sopenharmony_ci return true; 1725bf215546Sopenharmony_ci} 1726bf215546Sopenharmony_ci 1727bf215546Sopenharmony_cibool 1728bf215546Sopenharmony_ciNVC0LoweringPass::handleCasExch(Instruction *cas) 1729bf215546Sopenharmony_ci{ 1730bf215546Sopenharmony_ci if (targ->getChipset() < NVISA_GM107_CHIPSET) { 1731bf215546Sopenharmony_ci if (cas->src(0).getFile() == FILE_MEMORY_SHARED) { 1732bf215546Sopenharmony_ci // ATOM_CAS and ATOM_EXCH are handled in handleSharedATOM(). 1733bf215546Sopenharmony_ci return false; 1734bf215546Sopenharmony_ci } 1735bf215546Sopenharmony_ci } 1736bf215546Sopenharmony_ci 1737bf215546Sopenharmony_ci if (cas->subOp != NV50_IR_SUBOP_ATOM_CAS && 1738bf215546Sopenharmony_ci cas->subOp != NV50_IR_SUBOP_ATOM_EXCH) 1739bf215546Sopenharmony_ci return false; 1740bf215546Sopenharmony_ci 1741bf215546Sopenharmony_ci if (cas->subOp == NV50_IR_SUBOP_ATOM_CAS && 1742bf215546Sopenharmony_ci targ->getChipset() < NVISA_GV100_CHIPSET) { 1743bf215546Sopenharmony_ci // CAS is crazy. It's 2nd source is a double reg, and the 3rd source 1744bf215546Sopenharmony_ci // should be set to the high part of the double reg or bad things will 1745bf215546Sopenharmony_ci // happen elsewhere in the universe. 1746bf215546Sopenharmony_ci // Also, it sometimes returns the new value instead of the old one 1747bf215546Sopenharmony_ci // under mysterious circumstances. 1748bf215546Sopenharmony_ci DataType ty = typeOfSize(typeSizeof(cas->dType) * 2); 1749bf215546Sopenharmony_ci Value *dreg = bld.getSSA(typeSizeof(ty)); 1750bf215546Sopenharmony_ci bld.setPosition(cas, false); 1751bf215546Sopenharmony_ci bld.mkOp2(OP_MERGE, ty, dreg, cas->getSrc(1), cas->getSrc(2)); 1752bf215546Sopenharmony_ci cas->setSrc(1, dreg); 1753bf215546Sopenharmony_ci cas->setSrc(2, dreg); 1754bf215546Sopenharmony_ci } 1755bf215546Sopenharmony_ci 1756bf215546Sopenharmony_ci return true; 1757bf215546Sopenharmony_ci} 1758bf215546Sopenharmony_ci 1759bf215546Sopenharmony_ciinline Value * 1760bf215546Sopenharmony_ciNVC0LoweringPass::loadResInfo32(Value *ptr, uint32_t off, uint16_t base) 1761bf215546Sopenharmony_ci{ 1762bf215546Sopenharmony_ci uint8_t b = prog->driver->io.auxCBSlot; 1763bf215546Sopenharmony_ci off += base; 1764bf215546Sopenharmony_ci 1765bf215546Sopenharmony_ci return bld. 1766bf215546Sopenharmony_ci mkLoadv(TYPE_U32, bld.mkSymbol(FILE_MEMORY_CONST, b, TYPE_U32, off), ptr); 1767bf215546Sopenharmony_ci} 1768bf215546Sopenharmony_ci 1769bf215546Sopenharmony_ciinline Value * 1770bf215546Sopenharmony_ciNVC0LoweringPass::loadResInfo64(Value *ptr, uint32_t off, uint16_t base) 1771bf215546Sopenharmony_ci{ 1772bf215546Sopenharmony_ci uint8_t b = prog->driver->io.auxCBSlot; 1773bf215546Sopenharmony_ci off += base; 1774bf215546Sopenharmony_ci 1775bf215546Sopenharmony_ci if (ptr) 1776bf215546Sopenharmony_ci ptr = bld.mkOp2v(OP_SHL, TYPE_U32, bld.getScratch(), ptr, bld.mkImm(4)); 1777bf215546Sopenharmony_ci 1778bf215546Sopenharmony_ci return bld. 1779bf215546Sopenharmony_ci mkLoadv(TYPE_U64, bld.mkSymbol(FILE_MEMORY_CONST, b, TYPE_U64, off), ptr); 1780bf215546Sopenharmony_ci} 1781bf215546Sopenharmony_ci 1782bf215546Sopenharmony_ciinline Value * 1783bf215546Sopenharmony_ciNVC0LoweringPass::loadResLength32(Value *ptr, uint32_t off, uint16_t base) 1784bf215546Sopenharmony_ci{ 1785bf215546Sopenharmony_ci uint8_t b = prog->driver->io.auxCBSlot; 1786bf215546Sopenharmony_ci off += base; 1787bf215546Sopenharmony_ci 1788bf215546Sopenharmony_ci if (ptr) 1789bf215546Sopenharmony_ci ptr = bld.mkOp2v(OP_SHL, TYPE_U32, bld.getScratch(), ptr, bld.mkImm(4)); 1790bf215546Sopenharmony_ci 1791bf215546Sopenharmony_ci return bld. 1792bf215546Sopenharmony_ci mkLoadv(TYPE_U32, bld.mkSymbol(FILE_MEMORY_CONST, b, TYPE_U64, off + 8), ptr); 1793bf215546Sopenharmony_ci} 1794bf215546Sopenharmony_ci 1795bf215546Sopenharmony_ciinline Value * 1796bf215546Sopenharmony_ciNVC0LoweringPass::loadBufInfo64(Value *ptr, uint32_t off) 1797bf215546Sopenharmony_ci{ 1798bf215546Sopenharmony_ci return loadResInfo64(ptr, off, prog->driver->io.bufInfoBase); 1799bf215546Sopenharmony_ci} 1800bf215546Sopenharmony_ci 1801bf215546Sopenharmony_ciinline Value * 1802bf215546Sopenharmony_ciNVC0LoweringPass::loadBufLength32(Value *ptr, uint32_t off) 1803bf215546Sopenharmony_ci{ 1804bf215546Sopenharmony_ci return loadResLength32(ptr, off, prog->driver->io.bufInfoBase); 1805bf215546Sopenharmony_ci} 1806bf215546Sopenharmony_ci 1807bf215546Sopenharmony_ciinline Value * 1808bf215546Sopenharmony_ciNVC0LoweringPass::loadUboInfo64(Value *ptr, uint32_t off) 1809bf215546Sopenharmony_ci{ 1810bf215546Sopenharmony_ci return loadResInfo64(ptr, off, prog->driver->io.uboInfoBase); 1811bf215546Sopenharmony_ci} 1812bf215546Sopenharmony_ci 1813bf215546Sopenharmony_ciinline Value * 1814bf215546Sopenharmony_ciNVC0LoweringPass::loadUboLength32(Value *ptr, uint32_t off) 1815bf215546Sopenharmony_ci{ 1816bf215546Sopenharmony_ci return loadResLength32(ptr, off, prog->driver->io.uboInfoBase); 1817bf215546Sopenharmony_ci} 1818bf215546Sopenharmony_ci 1819bf215546Sopenharmony_ciinline Value * 1820bf215546Sopenharmony_ciNVC0LoweringPass::loadMsInfo32(Value *ptr, uint32_t off) 1821bf215546Sopenharmony_ci{ 1822bf215546Sopenharmony_ci uint8_t b = prog->driver->io.msInfoCBSlot; 1823bf215546Sopenharmony_ci off += prog->driver->io.msInfoBase; 1824bf215546Sopenharmony_ci return bld. 1825bf215546Sopenharmony_ci mkLoadv(TYPE_U32, bld.mkSymbol(FILE_MEMORY_CONST, b, TYPE_U32, off), ptr); 1826bf215546Sopenharmony_ci} 1827bf215546Sopenharmony_ci 1828bf215546Sopenharmony_ciinline Value * 1829bf215546Sopenharmony_ciNVC0LoweringPass::loadSuInfo32(Value *ptr, int slot, uint32_t off, bool bindless) 1830bf215546Sopenharmony_ci{ 1831bf215546Sopenharmony_ci uint32_t base = slot * NVC0_SU_INFO__STRIDE; 1832bf215546Sopenharmony_ci 1833bf215546Sopenharmony_ci // We don't upload surface info for bindless for GM107+ 1834bf215546Sopenharmony_ci assert(!bindless || targ->getChipset() < NVISA_GM107_CHIPSET); 1835bf215546Sopenharmony_ci 1836bf215546Sopenharmony_ci if (ptr) { 1837bf215546Sopenharmony_ci ptr = bld.mkOp2v(OP_ADD, TYPE_U32, bld.getSSA(), ptr, bld.mkImm(slot)); 1838bf215546Sopenharmony_ci if (bindless) 1839bf215546Sopenharmony_ci ptr = bld.mkOp2v(OP_AND, TYPE_U32, bld.getSSA(), ptr, bld.mkImm(511)); 1840bf215546Sopenharmony_ci else 1841bf215546Sopenharmony_ci ptr = bld.mkOp2v(OP_AND, TYPE_U32, bld.getSSA(), ptr, bld.mkImm(7)); 1842bf215546Sopenharmony_ci ptr = bld.mkOp2v(OP_SHL, TYPE_U32, bld.getSSA(), ptr, bld.mkImm(6)); 1843bf215546Sopenharmony_ci base = 0; 1844bf215546Sopenharmony_ci } 1845bf215546Sopenharmony_ci off += base; 1846bf215546Sopenharmony_ci 1847bf215546Sopenharmony_ci return loadResInfo32(ptr, off, bindless ? prog->driver->io.bindlessBase : 1848bf215546Sopenharmony_ci prog->driver->io.suInfoBase); 1849bf215546Sopenharmony_ci} 1850bf215546Sopenharmony_ci 1851bf215546Sopenharmony_ciValue * 1852bf215546Sopenharmony_ciNVC0LoweringPass::loadMsAdjInfo32(TexInstruction::Target target, uint32_t index, int slot, Value *ind, bool bindless) 1853bf215546Sopenharmony_ci{ 1854bf215546Sopenharmony_ci if (!bindless || targ->getChipset() < NVISA_GM107_CHIPSET) 1855bf215546Sopenharmony_ci return loadSuInfo32(ind, slot, NVC0_SU_INFO_MS(index), bindless); 1856bf215546Sopenharmony_ci 1857bf215546Sopenharmony_ci assert(bindless); 1858bf215546Sopenharmony_ci 1859bf215546Sopenharmony_ci Value *samples = bld.getSSA(); 1860bf215546Sopenharmony_ci // this shouldn't be lowered because it's being inserted before the current instruction 1861bf215546Sopenharmony_ci TexInstruction *tex = new_TexInstruction(func, OP_TXQ); 1862bf215546Sopenharmony_ci tex->tex.target = target; 1863bf215546Sopenharmony_ci tex->tex.query = TXQ_TYPE; 1864bf215546Sopenharmony_ci tex->tex.mask = 0x4; 1865bf215546Sopenharmony_ci tex->tex.r = 0xff; 1866bf215546Sopenharmony_ci tex->tex.s = 0x1f; 1867bf215546Sopenharmony_ci tex->tex.rIndirectSrc = 0; 1868bf215546Sopenharmony_ci tex->setDef(0, samples); 1869bf215546Sopenharmony_ci tex->setSrc(0, ind); 1870bf215546Sopenharmony_ci tex->setSrc(1, bld.loadImm(NULL, 0)); 1871bf215546Sopenharmony_ci bld.insert(tex); 1872bf215546Sopenharmony_ci 1873bf215546Sopenharmony_ci // doesn't work with sample counts other than 1/2/4/8 but they aren't supported 1874bf215546Sopenharmony_ci switch (index) { 1875bf215546Sopenharmony_ci case 0: { 1876bf215546Sopenharmony_ci Value *tmp = bld.mkOp2v(OP_ADD, TYPE_U32, bld.getSSA(), samples, bld.mkImm(2)); 1877bf215546Sopenharmony_ci return bld.mkOp2v(OP_SHR, TYPE_U32, bld.getSSA(), tmp, bld.mkImm(2)); 1878bf215546Sopenharmony_ci } 1879bf215546Sopenharmony_ci case 1: { 1880bf215546Sopenharmony_ci Value *tmp = bld.mkCmp(OP_SET, CC_GT, TYPE_U32, bld.getSSA(), TYPE_U32, samples, bld.mkImm(2))->getDef(0); 1881bf215546Sopenharmony_ci return bld.mkOp2v(OP_AND, TYPE_U32, bld.getSSA(), tmp, bld.mkImm(1)); 1882bf215546Sopenharmony_ci } 1883bf215546Sopenharmony_ci default: { 1884bf215546Sopenharmony_ci assert(false); 1885bf215546Sopenharmony_ci return NULL; 1886bf215546Sopenharmony_ci } 1887bf215546Sopenharmony_ci } 1888bf215546Sopenharmony_ci} 1889bf215546Sopenharmony_ci 1890bf215546Sopenharmony_cistatic inline uint16_t getSuClampSubOp(const TexInstruction *su, int c) 1891bf215546Sopenharmony_ci{ 1892bf215546Sopenharmony_ci switch (su->tex.target.getEnum()) { 1893bf215546Sopenharmony_ci case TEX_TARGET_BUFFER: return NV50_IR_SUBOP_SUCLAMP_PL(0, 1); 1894bf215546Sopenharmony_ci case TEX_TARGET_RECT: return NV50_IR_SUBOP_SUCLAMP_SD(0, 2); 1895bf215546Sopenharmony_ci case TEX_TARGET_1D: return NV50_IR_SUBOP_SUCLAMP_SD(0, 2); 1896bf215546Sopenharmony_ci case TEX_TARGET_1D_ARRAY: return (c == 1) ? 1897bf215546Sopenharmony_ci NV50_IR_SUBOP_SUCLAMP_PL(0, 2) : 1898bf215546Sopenharmony_ci NV50_IR_SUBOP_SUCLAMP_SD(0, 2); 1899bf215546Sopenharmony_ci case TEX_TARGET_2D: return NV50_IR_SUBOP_SUCLAMP_BL(0, 2); 1900bf215546Sopenharmony_ci case TEX_TARGET_2D_MS: return NV50_IR_SUBOP_SUCLAMP_BL(0, 2); 1901bf215546Sopenharmony_ci case TEX_TARGET_2D_ARRAY: return NV50_IR_SUBOP_SUCLAMP_SD(0, 2); 1902bf215546Sopenharmony_ci case TEX_TARGET_2D_MS_ARRAY: return NV50_IR_SUBOP_SUCLAMP_SD(0, 2); 1903bf215546Sopenharmony_ci case TEX_TARGET_3D: return NV50_IR_SUBOP_SUCLAMP_SD(0, 2); 1904bf215546Sopenharmony_ci case TEX_TARGET_CUBE: return NV50_IR_SUBOP_SUCLAMP_SD(0, 2); 1905bf215546Sopenharmony_ci case TEX_TARGET_CUBE_ARRAY: return NV50_IR_SUBOP_SUCLAMP_SD(0, 2); 1906bf215546Sopenharmony_ci default: 1907bf215546Sopenharmony_ci assert(0); 1908bf215546Sopenharmony_ci return 0; 1909bf215546Sopenharmony_ci } 1910bf215546Sopenharmony_ci} 1911bf215546Sopenharmony_ci 1912bf215546Sopenharmony_cibool 1913bf215546Sopenharmony_ciNVC0LoweringPass::handleSUQ(TexInstruction *suq) 1914bf215546Sopenharmony_ci{ 1915bf215546Sopenharmony_ci int mask = suq->tex.mask; 1916bf215546Sopenharmony_ci int dim = suq->tex.target.getDim(); 1917bf215546Sopenharmony_ci int arg = dim + (suq->tex.target.isArray() || suq->tex.target.isCube()); 1918bf215546Sopenharmony_ci Value *ind = suq->getIndirectR(); 1919bf215546Sopenharmony_ci int slot = suq->tex.r; 1920bf215546Sopenharmony_ci int c, d; 1921bf215546Sopenharmony_ci 1922bf215546Sopenharmony_ci for (c = 0, d = 0; c < 3; ++c, mask >>= 1) { 1923bf215546Sopenharmony_ci if (c >= arg || !(mask & 1)) 1924bf215546Sopenharmony_ci continue; 1925bf215546Sopenharmony_ci 1926bf215546Sopenharmony_ci int offset; 1927bf215546Sopenharmony_ci 1928bf215546Sopenharmony_ci if (c == 1 && suq->tex.target == TEX_TARGET_1D_ARRAY) { 1929bf215546Sopenharmony_ci offset = NVC0_SU_INFO_SIZE(2); 1930bf215546Sopenharmony_ci } else { 1931bf215546Sopenharmony_ci offset = NVC0_SU_INFO_SIZE(c); 1932bf215546Sopenharmony_ci } 1933bf215546Sopenharmony_ci bld.mkMov(suq->getDef(d++), loadSuInfo32(ind, slot, offset, suq->tex.bindless)); 1934bf215546Sopenharmony_ci if (c == 2 && suq->tex.target.isCube()) 1935bf215546Sopenharmony_ci bld.mkOp2(OP_DIV, TYPE_U32, suq->getDef(d - 1), suq->getDef(d - 1), 1936bf215546Sopenharmony_ci bld.loadImm(NULL, 6)); 1937bf215546Sopenharmony_ci } 1938bf215546Sopenharmony_ci 1939bf215546Sopenharmony_ci if (mask & 1) { 1940bf215546Sopenharmony_ci if (suq->tex.target.isMS()) { 1941bf215546Sopenharmony_ci Value *ms_x = loadSuInfo32(ind, slot, NVC0_SU_INFO_MS(0), suq->tex.bindless); 1942bf215546Sopenharmony_ci Value *ms_y = loadSuInfo32(ind, slot, NVC0_SU_INFO_MS(1), suq->tex.bindless); 1943bf215546Sopenharmony_ci Value *ms = bld.mkOp2v(OP_ADD, TYPE_U32, bld.getScratch(), ms_x, ms_y); 1944bf215546Sopenharmony_ci bld.mkOp2(OP_SHL, TYPE_U32, suq->getDef(d++), bld.loadImm(NULL, 1), ms); 1945bf215546Sopenharmony_ci } else { 1946bf215546Sopenharmony_ci bld.mkMov(suq->getDef(d++), bld.loadImm(NULL, 1)); 1947bf215546Sopenharmony_ci } 1948bf215546Sopenharmony_ci } 1949bf215546Sopenharmony_ci 1950bf215546Sopenharmony_ci bld.remove(suq); 1951bf215546Sopenharmony_ci return true; 1952bf215546Sopenharmony_ci} 1953bf215546Sopenharmony_ci 1954bf215546Sopenharmony_civoid 1955bf215546Sopenharmony_ciNVC0LoweringPass::adjustCoordinatesMS(TexInstruction *tex) 1956bf215546Sopenharmony_ci{ 1957bf215546Sopenharmony_ci const int arg = tex->tex.target.getArgCount(); 1958bf215546Sopenharmony_ci int slot = tex->tex.r; 1959bf215546Sopenharmony_ci 1960bf215546Sopenharmony_ci if (tex->tex.target == TEX_TARGET_2D_MS) 1961bf215546Sopenharmony_ci tex->tex.target = TEX_TARGET_2D; 1962bf215546Sopenharmony_ci else 1963bf215546Sopenharmony_ci if (tex->tex.target == TEX_TARGET_2D_MS_ARRAY) 1964bf215546Sopenharmony_ci tex->tex.target = TEX_TARGET_2D_ARRAY; 1965bf215546Sopenharmony_ci else 1966bf215546Sopenharmony_ci return; 1967bf215546Sopenharmony_ci 1968bf215546Sopenharmony_ci Value *x = tex->getSrc(0); 1969bf215546Sopenharmony_ci Value *y = tex->getSrc(1); 1970bf215546Sopenharmony_ci Value *s = tex->getSrc(arg - 1); 1971bf215546Sopenharmony_ci 1972bf215546Sopenharmony_ci Value *tx = bld.getSSA(), *ty = bld.getSSA(), *ts = bld.getSSA(); 1973bf215546Sopenharmony_ci Value *ind = tex->getIndirectR(); 1974bf215546Sopenharmony_ci 1975bf215546Sopenharmony_ci Value *ms_x = loadMsAdjInfo32(tex->tex.target, 0, slot, ind, tex->tex.bindless); 1976bf215546Sopenharmony_ci Value *ms_y = loadMsAdjInfo32(tex->tex.target, 1, slot, ind, tex->tex.bindless); 1977bf215546Sopenharmony_ci 1978bf215546Sopenharmony_ci bld.mkOp2(OP_SHL, TYPE_U32, tx, x, ms_x); 1979bf215546Sopenharmony_ci bld.mkOp2(OP_SHL, TYPE_U32, ty, y, ms_y); 1980bf215546Sopenharmony_ci 1981bf215546Sopenharmony_ci s = bld.mkOp2v(OP_AND, TYPE_U32, ts, s, bld.loadImm(NULL, 0x7)); 1982bf215546Sopenharmony_ci s = bld.mkOp2v(OP_SHL, TYPE_U32, ts, ts, bld.mkImm(3)); 1983bf215546Sopenharmony_ci 1984bf215546Sopenharmony_ci Value *dx = loadMsInfo32(ts, 0x0); 1985bf215546Sopenharmony_ci Value *dy = loadMsInfo32(ts, 0x4); 1986bf215546Sopenharmony_ci 1987bf215546Sopenharmony_ci bld.mkOp2(OP_ADD, TYPE_U32, tx, tx, dx); 1988bf215546Sopenharmony_ci bld.mkOp2(OP_ADD, TYPE_U32, ty, ty, dy); 1989bf215546Sopenharmony_ci 1990bf215546Sopenharmony_ci tex->setSrc(0, tx); 1991bf215546Sopenharmony_ci tex->setSrc(1, ty); 1992bf215546Sopenharmony_ci tex->moveSources(arg, -1); 1993bf215546Sopenharmony_ci} 1994bf215546Sopenharmony_ci 1995bf215546Sopenharmony_ci// Sets 64-bit "generic address", predicate and format sources for SULD/SUST. 1996bf215546Sopenharmony_ci// They're computed from the coordinates using the surface info in c[] space. 1997bf215546Sopenharmony_civoid 1998bf215546Sopenharmony_ciNVC0LoweringPass::processSurfaceCoordsNVE4(TexInstruction *su) 1999bf215546Sopenharmony_ci{ 2000bf215546Sopenharmony_ci Instruction *insn; 2001bf215546Sopenharmony_ci const bool atom = su->op == OP_SUREDB || su->op == OP_SUREDP; 2002bf215546Sopenharmony_ci const bool raw = 2003bf215546Sopenharmony_ci su->op == OP_SULDB || su->op == OP_SUSTB || su->op == OP_SUREDB; 2004bf215546Sopenharmony_ci const int slot = su->tex.r; 2005bf215546Sopenharmony_ci const int dim = su->tex.target.getDim(); 2006bf215546Sopenharmony_ci const bool array = su->tex.target.isArray() || su->tex.target.isCube(); 2007bf215546Sopenharmony_ci const int arg = dim + array; 2008bf215546Sopenharmony_ci int c; 2009bf215546Sopenharmony_ci Value *zero = bld.mkImm(0); 2010bf215546Sopenharmony_ci Value *p1 = NULL; 2011bf215546Sopenharmony_ci Value *v; 2012bf215546Sopenharmony_ci Value *src[3]; 2013bf215546Sopenharmony_ci Value *bf, *eau, *off; 2014bf215546Sopenharmony_ci Value *addr, *pred; 2015bf215546Sopenharmony_ci Value *ind = su->getIndirectR(); 2016bf215546Sopenharmony_ci Value *y, *z; 2017bf215546Sopenharmony_ci 2018bf215546Sopenharmony_ci off = bld.getScratch(4); 2019bf215546Sopenharmony_ci bf = bld.getScratch(4); 2020bf215546Sopenharmony_ci addr = bld.getSSA(8); 2021bf215546Sopenharmony_ci pred = bld.getScratch(1, FILE_PREDICATE); 2022bf215546Sopenharmony_ci 2023bf215546Sopenharmony_ci bld.setPosition(su, false); 2024bf215546Sopenharmony_ci 2025bf215546Sopenharmony_ci adjustCoordinatesMS(su); 2026bf215546Sopenharmony_ci 2027bf215546Sopenharmony_ci // calculate clamped coordinates 2028bf215546Sopenharmony_ci for (c = 0; c < arg; ++c) { 2029bf215546Sopenharmony_ci int dimc = c; 2030bf215546Sopenharmony_ci 2031bf215546Sopenharmony_ci if (c == 1 && su->tex.target == TEX_TARGET_1D_ARRAY) { 2032bf215546Sopenharmony_ci // The array index is stored in the Z component for 1D arrays. 2033bf215546Sopenharmony_ci dimc = 2; 2034bf215546Sopenharmony_ci } 2035bf215546Sopenharmony_ci 2036bf215546Sopenharmony_ci src[c] = bld.getScratch(); 2037bf215546Sopenharmony_ci if (c == 0 && raw) 2038bf215546Sopenharmony_ci v = loadSuInfo32(ind, slot, NVC0_SU_INFO_RAW_X, su->tex.bindless); 2039bf215546Sopenharmony_ci else 2040bf215546Sopenharmony_ci v = loadSuInfo32(ind, slot, NVC0_SU_INFO_DIM(dimc), su->tex.bindless); 2041bf215546Sopenharmony_ci bld.mkOp3(OP_SUCLAMP, TYPE_S32, src[c], su->getSrc(c), v, zero) 2042bf215546Sopenharmony_ci ->subOp = getSuClampSubOp(su, dimc); 2043bf215546Sopenharmony_ci } 2044bf215546Sopenharmony_ci for (; c < 3; ++c) 2045bf215546Sopenharmony_ci src[c] = zero; 2046bf215546Sopenharmony_ci 2047bf215546Sopenharmony_ci if (dim == 2 && !array) { 2048bf215546Sopenharmony_ci v = loadSuInfo32(ind, slot, NVC0_SU_INFO_UNK1C, su->tex.bindless); 2049bf215546Sopenharmony_ci src[2] = bld.mkOp2v(OP_SHR, TYPE_U32, bld.getSSA(), 2050bf215546Sopenharmony_ci v, bld.loadImm(NULL, 16)); 2051bf215546Sopenharmony_ci 2052bf215546Sopenharmony_ci v = loadSuInfo32(ind, slot, NVC0_SU_INFO_DIM(2), su->tex.bindless); 2053bf215546Sopenharmony_ci bld.mkOp3(OP_SUCLAMP, TYPE_S32, src[2], src[2], v, zero) 2054bf215546Sopenharmony_ci ->subOp = NV50_IR_SUBOP_SUCLAMP_SD(0, 2); 2055bf215546Sopenharmony_ci } 2056bf215546Sopenharmony_ci 2057bf215546Sopenharmony_ci // set predicate output 2058bf215546Sopenharmony_ci if (su->tex.target == TEX_TARGET_BUFFER) { 2059bf215546Sopenharmony_ci src[0]->getInsn()->setFlagsDef(1, pred); 2060bf215546Sopenharmony_ci } else 2061bf215546Sopenharmony_ci if (array) { 2062bf215546Sopenharmony_ci p1 = bld.getSSA(1, FILE_PREDICATE); 2063bf215546Sopenharmony_ci src[dim]->getInsn()->setFlagsDef(1, p1); 2064bf215546Sopenharmony_ci } 2065bf215546Sopenharmony_ci 2066bf215546Sopenharmony_ci // calculate pixel offset 2067bf215546Sopenharmony_ci if (dim == 1) { 2068bf215546Sopenharmony_ci y = z = zero; 2069bf215546Sopenharmony_ci if (su->tex.target != TEX_TARGET_BUFFER) 2070bf215546Sopenharmony_ci bld.mkOp2(OP_AND, TYPE_U32, off, src[0], bld.loadImm(NULL, 0xffff)); 2071bf215546Sopenharmony_ci } else { 2072bf215546Sopenharmony_ci y = src[1]; 2073bf215546Sopenharmony_ci z = src[2]; 2074bf215546Sopenharmony_ci 2075bf215546Sopenharmony_ci v = loadSuInfo32(ind, slot, NVC0_SU_INFO_UNK1C, su->tex.bindless); 2076bf215546Sopenharmony_ci bld.mkOp3(OP_MADSP, TYPE_U32, off, src[2], v, src[1]) 2077bf215546Sopenharmony_ci ->subOp = NV50_IR_SUBOP_MADSP(4,4,8); // u16l u16l u16l 2078bf215546Sopenharmony_ci 2079bf215546Sopenharmony_ci v = loadSuInfo32(ind, slot, NVC0_SU_INFO_PITCH, su->tex.bindless); 2080bf215546Sopenharmony_ci bld.mkOp3(OP_MADSP, TYPE_U32, off, off, v, src[0]) 2081bf215546Sopenharmony_ci ->subOp = array ? 2082bf215546Sopenharmony_ci NV50_IR_SUBOP_MADSP_SD : NV50_IR_SUBOP_MADSP(0,2,8); // u32 u16l u16l 2083bf215546Sopenharmony_ci } 2084bf215546Sopenharmony_ci 2085bf215546Sopenharmony_ci // calculate effective address part 1 2086bf215546Sopenharmony_ci if (su->tex.target == TEX_TARGET_BUFFER) { 2087bf215546Sopenharmony_ci if (raw) { 2088bf215546Sopenharmony_ci bf = src[0]; 2089bf215546Sopenharmony_ci } else { 2090bf215546Sopenharmony_ci v = loadSuInfo32(ind, slot, NVC0_SU_INFO_FMT, su->tex.bindless); 2091bf215546Sopenharmony_ci bld.mkOp3(OP_VSHL, TYPE_U32, bf, src[0], v, zero) 2092bf215546Sopenharmony_ci ->subOp = NV50_IR_SUBOP_V1(7,6,8|2); 2093bf215546Sopenharmony_ci } 2094bf215546Sopenharmony_ci } else { 2095bf215546Sopenharmony_ci uint16_t subOp = 0; 2096bf215546Sopenharmony_ci 2097bf215546Sopenharmony_ci switch (dim) { 2098bf215546Sopenharmony_ci case 1: 2099bf215546Sopenharmony_ci break; 2100bf215546Sopenharmony_ci case 2: 2101bf215546Sopenharmony_ci if (array) { 2102bf215546Sopenharmony_ci z = off; 2103bf215546Sopenharmony_ci } else { 2104bf215546Sopenharmony_ci subOp = NV50_IR_SUBOP_SUBFM_3D; 2105bf215546Sopenharmony_ci } 2106bf215546Sopenharmony_ci break; 2107bf215546Sopenharmony_ci default: 2108bf215546Sopenharmony_ci subOp = NV50_IR_SUBOP_SUBFM_3D; 2109bf215546Sopenharmony_ci assert(dim == 3); 2110bf215546Sopenharmony_ci break; 2111bf215546Sopenharmony_ci } 2112bf215546Sopenharmony_ci insn = bld.mkOp3(OP_SUBFM, TYPE_U32, bf, src[0], y, z); 2113bf215546Sopenharmony_ci insn->subOp = subOp; 2114bf215546Sopenharmony_ci insn->setFlagsDef(1, pred); 2115bf215546Sopenharmony_ci } 2116bf215546Sopenharmony_ci 2117bf215546Sopenharmony_ci // part 2 2118bf215546Sopenharmony_ci v = loadSuInfo32(ind, slot, NVC0_SU_INFO_ADDR, su->tex.bindless); 2119bf215546Sopenharmony_ci 2120bf215546Sopenharmony_ci if (su->tex.target == TEX_TARGET_BUFFER) { 2121bf215546Sopenharmony_ci eau = v; 2122bf215546Sopenharmony_ci } else { 2123bf215546Sopenharmony_ci eau = bld.mkOp3v(OP_SUEAU, TYPE_U32, bld.getScratch(4), off, bf, v); 2124bf215546Sopenharmony_ci } 2125bf215546Sopenharmony_ci // add array layer offset 2126bf215546Sopenharmony_ci if (array) { 2127bf215546Sopenharmony_ci v = loadSuInfo32(ind, slot, NVC0_SU_INFO_ARRAY, su->tex.bindless); 2128bf215546Sopenharmony_ci if (dim == 1) 2129bf215546Sopenharmony_ci bld.mkOp3(OP_MADSP, TYPE_U32, eau, src[1], v, eau) 2130bf215546Sopenharmony_ci ->subOp = NV50_IR_SUBOP_MADSP(4,0,0); // u16 u24 u32 2131bf215546Sopenharmony_ci else 2132bf215546Sopenharmony_ci bld.mkOp3(OP_MADSP, TYPE_U32, eau, v, src[2], eau) 2133bf215546Sopenharmony_ci ->subOp = NV50_IR_SUBOP_MADSP(0,0,0); // u32 u24 u32 2134bf215546Sopenharmony_ci // combine predicates 2135bf215546Sopenharmony_ci assert(p1); 2136bf215546Sopenharmony_ci bld.mkOp2(OP_OR, TYPE_U8, pred, pred, p1); 2137bf215546Sopenharmony_ci } 2138bf215546Sopenharmony_ci 2139bf215546Sopenharmony_ci if (atom) { 2140bf215546Sopenharmony_ci Value *lo = bf; 2141bf215546Sopenharmony_ci if (su->tex.target == TEX_TARGET_BUFFER) { 2142bf215546Sopenharmony_ci lo = zero; 2143bf215546Sopenharmony_ci bld.mkMov(off, bf); 2144bf215546Sopenharmony_ci } 2145bf215546Sopenharmony_ci // bf == g[] address & 0xff 2146bf215546Sopenharmony_ci // eau == g[] address >> 8 2147bf215546Sopenharmony_ci bld.mkOp3(OP_PERMT, TYPE_U32, bf, lo, bld.loadImm(NULL, 0x6540), eau); 2148bf215546Sopenharmony_ci bld.mkOp3(OP_PERMT, TYPE_U32, eau, zero, bld.loadImm(NULL, 0x0007), eau); 2149bf215546Sopenharmony_ci } else 2150bf215546Sopenharmony_ci if (su->op == OP_SULDP && su->tex.target == TEX_TARGET_BUFFER) { 2151bf215546Sopenharmony_ci // Convert from u32 to u8 address format, which is what the library code 2152bf215546Sopenharmony_ci // doing SULDP currently uses. 2153bf215546Sopenharmony_ci // XXX: can SUEAU do this ? 2154bf215546Sopenharmony_ci // XXX: does it matter that we don't mask high bytes in bf ? 2155bf215546Sopenharmony_ci // Grrr. 2156bf215546Sopenharmony_ci bld.mkOp2(OP_SHR, TYPE_U32, off, bf, bld.mkImm(8)); 2157bf215546Sopenharmony_ci bld.mkOp2(OP_ADD, TYPE_U32, eau, eau, off); 2158bf215546Sopenharmony_ci } 2159bf215546Sopenharmony_ci 2160bf215546Sopenharmony_ci bld.mkOp2(OP_MERGE, TYPE_U64, addr, bf, eau); 2161bf215546Sopenharmony_ci 2162bf215546Sopenharmony_ci if (atom && su->tex.target == TEX_TARGET_BUFFER) 2163bf215546Sopenharmony_ci bld.mkOp2(OP_ADD, TYPE_U64, addr, addr, off); 2164bf215546Sopenharmony_ci 2165bf215546Sopenharmony_ci // let's just set it 0 for raw access and hope it works 2166bf215546Sopenharmony_ci v = raw ? 2167bf215546Sopenharmony_ci bld.mkImm(0) : loadSuInfo32(ind, slot, NVC0_SU_INFO_FMT, su->tex.bindless); 2168bf215546Sopenharmony_ci 2169bf215546Sopenharmony_ci // get rid of old coordinate sources, make space for fmt info and predicate 2170bf215546Sopenharmony_ci su->moveSources(arg, 3 - arg); 2171bf215546Sopenharmony_ci // set 64 bit address and 32-bit format sources 2172bf215546Sopenharmony_ci su->setSrc(0, addr); 2173bf215546Sopenharmony_ci su->setSrc(1, v); 2174bf215546Sopenharmony_ci su->setSrc(2, pred); 2175bf215546Sopenharmony_ci su->setIndirectR(NULL); 2176bf215546Sopenharmony_ci 2177bf215546Sopenharmony_ci // prevent read fault when the image is not actually bound 2178bf215546Sopenharmony_ci CmpInstruction *pred1 = 2179bf215546Sopenharmony_ci bld.mkCmp(OP_SET, CC_EQ, TYPE_U32, bld.getSSA(1, FILE_PREDICATE), 2180bf215546Sopenharmony_ci TYPE_U32, bld.mkImm(0), 2181bf215546Sopenharmony_ci loadSuInfo32(ind, slot, NVC0_SU_INFO_ADDR, su->tex.bindless)); 2182bf215546Sopenharmony_ci 2183bf215546Sopenharmony_ci if (su->op != OP_SUSTP && su->tex.format) { 2184bf215546Sopenharmony_ci const TexInstruction::ImgFormatDesc *format = su->tex.format; 2185bf215546Sopenharmony_ci int blockwidth = format->bits[0] + format->bits[1] + 2186bf215546Sopenharmony_ci format->bits[2] + format->bits[3]; 2187bf215546Sopenharmony_ci 2188bf215546Sopenharmony_ci // make sure that the format doesn't mismatch 2189bf215546Sopenharmony_ci assert(format->components != 0); 2190bf215546Sopenharmony_ci bld.mkCmp(OP_SET_OR, CC_NE, TYPE_U32, pred1->getDef(0), 2191bf215546Sopenharmony_ci TYPE_U32, bld.loadImm(NULL, blockwidth / 8), 2192bf215546Sopenharmony_ci loadSuInfo32(ind, slot, NVC0_SU_INFO_BSIZE, su->tex.bindless), 2193bf215546Sopenharmony_ci pred1->getDef(0)); 2194bf215546Sopenharmony_ci } 2195bf215546Sopenharmony_ci su->setPredicate(CC_NOT_P, pred1->getDef(0)); 2196bf215546Sopenharmony_ci 2197bf215546Sopenharmony_ci // TODO: initialize def values to 0 when the surface operation is not 2198bf215546Sopenharmony_ci // performed (not needed for stores). Also, fix the "address bounds test" 2199bf215546Sopenharmony_ci // subtests from arb_shader_image_load_store-invalid for buffers, because it 2200bf215546Sopenharmony_ci // seems like that the predicate is not correctly set by suclamp. 2201bf215546Sopenharmony_ci} 2202bf215546Sopenharmony_ci 2203bf215546Sopenharmony_cistatic DataType 2204bf215546Sopenharmony_cigetSrcType(const TexInstruction::ImgFormatDesc *t, int c) 2205bf215546Sopenharmony_ci{ 2206bf215546Sopenharmony_ci switch (t->type) { 2207bf215546Sopenharmony_ci case FLOAT: return t->bits[c] == 16 ? TYPE_F16 : TYPE_F32; 2208bf215546Sopenharmony_ci case UNORM: return t->bits[c] == 8 ? TYPE_U8 : TYPE_U16; 2209bf215546Sopenharmony_ci case SNORM: return t->bits[c] == 8 ? TYPE_S8 : TYPE_S16; 2210bf215546Sopenharmony_ci case UINT: 2211bf215546Sopenharmony_ci return (t->bits[c] == 8 ? TYPE_U8 : 2212bf215546Sopenharmony_ci (t->bits[c] == 16 ? TYPE_U16 : TYPE_U32)); 2213bf215546Sopenharmony_ci case SINT: 2214bf215546Sopenharmony_ci return (t->bits[c] == 8 ? TYPE_S8 : 2215bf215546Sopenharmony_ci (t->bits[c] == 16 ? TYPE_S16 : TYPE_S32)); 2216bf215546Sopenharmony_ci } 2217bf215546Sopenharmony_ci return TYPE_NONE; 2218bf215546Sopenharmony_ci} 2219bf215546Sopenharmony_ci 2220bf215546Sopenharmony_cistatic DataType 2221bf215546Sopenharmony_cigetDestType(const ImgType type) { 2222bf215546Sopenharmony_ci switch (type) { 2223bf215546Sopenharmony_ci case FLOAT: 2224bf215546Sopenharmony_ci case UNORM: 2225bf215546Sopenharmony_ci case SNORM: 2226bf215546Sopenharmony_ci return TYPE_F32; 2227bf215546Sopenharmony_ci case UINT: 2228bf215546Sopenharmony_ci return TYPE_U32; 2229bf215546Sopenharmony_ci case SINT: 2230bf215546Sopenharmony_ci return TYPE_S32; 2231bf215546Sopenharmony_ci default: 2232bf215546Sopenharmony_ci assert(!"Impossible type"); 2233bf215546Sopenharmony_ci return TYPE_NONE; 2234bf215546Sopenharmony_ci } 2235bf215546Sopenharmony_ci} 2236bf215546Sopenharmony_ci 2237bf215546Sopenharmony_civoid 2238bf215546Sopenharmony_ciNVC0LoweringPass::convertSurfaceFormat(TexInstruction *su, Instruction **loaded) 2239bf215546Sopenharmony_ci{ 2240bf215546Sopenharmony_ci const TexInstruction::ImgFormatDesc *format = su->tex.format; 2241bf215546Sopenharmony_ci int width = format->bits[0] + format->bits[1] + 2242bf215546Sopenharmony_ci format->bits[2] + format->bits[3]; 2243bf215546Sopenharmony_ci Value *untypedDst[4] = {}; 2244bf215546Sopenharmony_ci Value *typedDst[4] = {}; 2245bf215546Sopenharmony_ci 2246bf215546Sopenharmony_ci // We must convert this to a generic load. 2247bf215546Sopenharmony_ci su->op = OP_SULDB; 2248bf215546Sopenharmony_ci 2249bf215546Sopenharmony_ci su->dType = typeOfSize(width / 8); 2250bf215546Sopenharmony_ci su->sType = TYPE_U8; 2251bf215546Sopenharmony_ci 2252bf215546Sopenharmony_ci for (int i = 0; i < width / 32; i++) 2253bf215546Sopenharmony_ci untypedDst[i] = bld.getSSA(); 2254bf215546Sopenharmony_ci if (width < 32) 2255bf215546Sopenharmony_ci untypedDst[0] = bld.getSSA(); 2256bf215546Sopenharmony_ci 2257bf215546Sopenharmony_ci if (loaded && loaded[0]) { 2258bf215546Sopenharmony_ci for (int i = 0; i < 4; i++) { 2259bf215546Sopenharmony_ci if (loaded[i]) 2260bf215546Sopenharmony_ci typedDst[i] = loaded[i]->getDef(0); 2261bf215546Sopenharmony_ci } 2262bf215546Sopenharmony_ci } else { 2263bf215546Sopenharmony_ci for (int i = 0; i < 4; i++) { 2264bf215546Sopenharmony_ci typedDst[i] = su->getDef(i); 2265bf215546Sopenharmony_ci } 2266bf215546Sopenharmony_ci } 2267bf215546Sopenharmony_ci 2268bf215546Sopenharmony_ci // Set the untyped dsts as the su's destinations 2269bf215546Sopenharmony_ci if (loaded && loaded[0]) { 2270bf215546Sopenharmony_ci for (int i = 0; i < 4; i++) 2271bf215546Sopenharmony_ci if (loaded[i]) 2272bf215546Sopenharmony_ci loaded[i]->setDef(0, untypedDst[i]); 2273bf215546Sopenharmony_ci } else { 2274bf215546Sopenharmony_ci for (int i = 0; i < 4; i++) 2275bf215546Sopenharmony_ci su->setDef(i, untypedDst[i]); 2276bf215546Sopenharmony_ci 2277bf215546Sopenharmony_ci bld.setPosition(su, true); 2278bf215546Sopenharmony_ci } 2279bf215546Sopenharmony_ci 2280bf215546Sopenharmony_ci // Unpack each component into the typed dsts 2281bf215546Sopenharmony_ci int bits = 0; 2282bf215546Sopenharmony_ci for (int i = 0; i < 4; bits += format->bits[i], i++) { 2283bf215546Sopenharmony_ci if (!typedDst[i]) 2284bf215546Sopenharmony_ci continue; 2285bf215546Sopenharmony_ci 2286bf215546Sopenharmony_ci if (loaded && loaded[0]) 2287bf215546Sopenharmony_ci bld.setPosition(loaded[i], true); 2288bf215546Sopenharmony_ci 2289bf215546Sopenharmony_ci if (i >= format->components) { 2290bf215546Sopenharmony_ci if (format->type == FLOAT || 2291bf215546Sopenharmony_ci format->type == UNORM || 2292bf215546Sopenharmony_ci format->type == SNORM) 2293bf215546Sopenharmony_ci bld.loadImm(typedDst[i], i == 3 ? 1.0f : 0.0f); 2294bf215546Sopenharmony_ci else 2295bf215546Sopenharmony_ci bld.loadImm(typedDst[i], i == 3 ? 1 : 0); 2296bf215546Sopenharmony_ci continue; 2297bf215546Sopenharmony_ci } 2298bf215546Sopenharmony_ci 2299bf215546Sopenharmony_ci // Get just that component's data into the relevant place 2300bf215546Sopenharmony_ci if (format->bits[i] == 32) 2301bf215546Sopenharmony_ci bld.mkMov(typedDst[i], untypedDst[i]); 2302bf215546Sopenharmony_ci else if (format->bits[i] == 16) 2303bf215546Sopenharmony_ci bld.mkCvt(OP_CVT, getDestType(format->type), typedDst[i], 2304bf215546Sopenharmony_ci getSrcType(format, i), untypedDst[i / 2]) 2305bf215546Sopenharmony_ci ->subOp = (i & 1) << (format->type == FLOAT ? 0 : 1); 2306bf215546Sopenharmony_ci else if (format->bits[i] == 8) 2307bf215546Sopenharmony_ci bld.mkCvt(OP_CVT, getDestType(format->type), typedDst[i], 2308bf215546Sopenharmony_ci getSrcType(format, i), untypedDst[0])->subOp = i; 2309bf215546Sopenharmony_ci else { 2310bf215546Sopenharmony_ci bld.mkOp2(OP_EXTBF, TYPE_U32, typedDst[i], untypedDst[bits / 32], 2311bf215546Sopenharmony_ci bld.mkImm((bits % 32) | (format->bits[i] << 8))); 2312bf215546Sopenharmony_ci if (format->type == UNORM || format->type == SNORM) 2313bf215546Sopenharmony_ci bld.mkCvt(OP_CVT, TYPE_F32, typedDst[i], getSrcType(format, i), typedDst[i]); 2314bf215546Sopenharmony_ci } 2315bf215546Sopenharmony_ci 2316bf215546Sopenharmony_ci // Normalize / convert as necessary 2317bf215546Sopenharmony_ci if (format->type == UNORM) 2318bf215546Sopenharmony_ci bld.mkOp2(OP_MUL, TYPE_F32, typedDst[i], typedDst[i], bld.loadImm(NULL, 1.0f / ((1 << format->bits[i]) - 1))); 2319bf215546Sopenharmony_ci else if (format->type == SNORM) 2320bf215546Sopenharmony_ci bld.mkOp2(OP_MUL, TYPE_F32, typedDst[i], typedDst[i], bld.loadImm(NULL, 1.0f / ((1 << (format->bits[i] - 1)) - 1))); 2321bf215546Sopenharmony_ci else if (format->type == FLOAT && format->bits[i] < 16) { 2322bf215546Sopenharmony_ci bld.mkOp2(OP_SHL, TYPE_U32, typedDst[i], typedDst[i], bld.loadImm(NULL, 15 - format->bits[i])); 2323bf215546Sopenharmony_ci bld.mkCvt(OP_CVT, TYPE_F32, typedDst[i], TYPE_F16, typedDst[i]); 2324bf215546Sopenharmony_ci } 2325bf215546Sopenharmony_ci } 2326bf215546Sopenharmony_ci 2327bf215546Sopenharmony_ci if (format->bgra) { 2328bf215546Sopenharmony_ci std::swap(typedDst[0], typedDst[2]); 2329bf215546Sopenharmony_ci } 2330bf215546Sopenharmony_ci} 2331bf215546Sopenharmony_ci 2332bf215546Sopenharmony_civoid 2333bf215546Sopenharmony_ciNVC0LoweringPass::insertOOBSurfaceOpResult(TexInstruction *su) 2334bf215546Sopenharmony_ci{ 2335bf215546Sopenharmony_ci if (!su->getPredicate()) 2336bf215546Sopenharmony_ci return; 2337bf215546Sopenharmony_ci 2338bf215546Sopenharmony_ci bld.setPosition(su, true); 2339bf215546Sopenharmony_ci 2340bf215546Sopenharmony_ci for (unsigned i = 0; su->defExists(i); ++i) { 2341bf215546Sopenharmony_ci Value *def = su->getDef(i); 2342bf215546Sopenharmony_ci Value *newDef = bld.getSSA(); 2343bf215546Sopenharmony_ci su->setDef(i, newDef); 2344bf215546Sopenharmony_ci 2345bf215546Sopenharmony_ci Instruction *mov = bld.mkMov(bld.getSSA(), bld.loadImm(NULL, 0)); 2346bf215546Sopenharmony_ci assert(su->cc == CC_NOT_P); 2347bf215546Sopenharmony_ci mov->setPredicate(CC_P, su->getPredicate()); 2348bf215546Sopenharmony_ci Instruction *uni = bld.mkOp2(OP_UNION, TYPE_U32, bld.getSSA(), newDef, mov->getDef(0)); 2349bf215546Sopenharmony_ci bld.mkMov(def, uni->getDef(0)); 2350bf215546Sopenharmony_ci } 2351bf215546Sopenharmony_ci} 2352bf215546Sopenharmony_ci 2353bf215546Sopenharmony_civoid 2354bf215546Sopenharmony_ciNVC0LoweringPass::handleSurfaceOpNVE4(TexInstruction *su) 2355bf215546Sopenharmony_ci{ 2356bf215546Sopenharmony_ci processSurfaceCoordsNVE4(su); 2357bf215546Sopenharmony_ci 2358bf215546Sopenharmony_ci if (su->op == OP_SULDP) { 2359bf215546Sopenharmony_ci convertSurfaceFormat(su, NULL); 2360bf215546Sopenharmony_ci insertOOBSurfaceOpResult(su); 2361bf215546Sopenharmony_ci } 2362bf215546Sopenharmony_ci 2363bf215546Sopenharmony_ci if (su->op == OP_SUREDB || su->op == OP_SUREDP) { 2364bf215546Sopenharmony_ci assert(su->getPredicate()); 2365bf215546Sopenharmony_ci Value *pred = 2366bf215546Sopenharmony_ci bld.mkOp2v(OP_OR, TYPE_U8, bld.getScratch(1, FILE_PREDICATE), 2367bf215546Sopenharmony_ci su->getPredicate(), su->getSrc(2)); 2368bf215546Sopenharmony_ci 2369bf215546Sopenharmony_ci Instruction *red = bld.mkOp(OP_ATOM, su->dType, bld.getSSA()); 2370bf215546Sopenharmony_ci red->subOp = su->subOp; 2371bf215546Sopenharmony_ci red->setSrc(0, bld.mkSymbol(FILE_MEMORY_GLOBAL, 0, TYPE_U32, 0)); 2372bf215546Sopenharmony_ci red->setSrc(1, su->getSrc(3)); 2373bf215546Sopenharmony_ci if (su->subOp == NV50_IR_SUBOP_ATOM_CAS) 2374bf215546Sopenharmony_ci red->setSrc(2, su->getSrc(4)); 2375bf215546Sopenharmony_ci red->setIndirect(0, 0, su->getSrc(0)); 2376bf215546Sopenharmony_ci 2377bf215546Sopenharmony_ci // make sure to initialize dst value when the atomic operation is not 2378bf215546Sopenharmony_ci // performed 2379bf215546Sopenharmony_ci Instruction *mov = bld.mkMov(bld.getSSA(), bld.loadImm(NULL, 0)); 2380bf215546Sopenharmony_ci 2381bf215546Sopenharmony_ci assert(su->cc == CC_NOT_P); 2382bf215546Sopenharmony_ci red->setPredicate(su->cc, pred); 2383bf215546Sopenharmony_ci mov->setPredicate(CC_P, pred); 2384bf215546Sopenharmony_ci 2385bf215546Sopenharmony_ci bld.mkOp2(OP_UNION, TYPE_U32, su->getDef(0), 2386bf215546Sopenharmony_ci red->getDef(0), mov->getDef(0)); 2387bf215546Sopenharmony_ci 2388bf215546Sopenharmony_ci delete_Instruction(bld.getProgram(), su); 2389bf215546Sopenharmony_ci 2390bf215546Sopenharmony_ci handleATOMCctl(red); 2391bf215546Sopenharmony_ci handleCasExch(red); 2392bf215546Sopenharmony_ci } 2393bf215546Sopenharmony_ci 2394bf215546Sopenharmony_ci if (su->op == OP_SUSTB || su->op == OP_SUSTP) 2395bf215546Sopenharmony_ci su->sType = (su->tex.target == TEX_TARGET_BUFFER) ? TYPE_U32 : TYPE_U8; 2396bf215546Sopenharmony_ci} 2397bf215546Sopenharmony_ci 2398bf215546Sopenharmony_civoid 2399bf215546Sopenharmony_ciNVC0LoweringPass::processSurfaceCoordsNVC0(TexInstruction *su) 2400bf215546Sopenharmony_ci{ 2401bf215546Sopenharmony_ci const int slot = su->tex.r; 2402bf215546Sopenharmony_ci const int dim = su->tex.target.getDim(); 2403bf215546Sopenharmony_ci const int arg = dim + (su->tex.target.isArray() || su->tex.target.isCube()); 2404bf215546Sopenharmony_ci int c; 2405bf215546Sopenharmony_ci Value *zero = bld.mkImm(0); 2406bf215546Sopenharmony_ci Value *src[3]; 2407bf215546Sopenharmony_ci Value *v; 2408bf215546Sopenharmony_ci Value *ind = su->getIndirectR(); 2409bf215546Sopenharmony_ci 2410bf215546Sopenharmony_ci bld.setPosition(su, false); 2411bf215546Sopenharmony_ci 2412bf215546Sopenharmony_ci adjustCoordinatesMS(su); 2413bf215546Sopenharmony_ci 2414bf215546Sopenharmony_ci if (ind) { 2415bf215546Sopenharmony_ci Value *ptr; 2416bf215546Sopenharmony_ci ptr = bld.mkOp2v(OP_ADD, TYPE_U32, bld.getSSA(), ind, bld.mkImm(su->tex.r)); 2417bf215546Sopenharmony_ci ptr = bld.mkOp2v(OP_AND, TYPE_U32, bld.getSSA(), ptr, bld.mkImm(7)); 2418bf215546Sopenharmony_ci su->setIndirectR(ptr); 2419bf215546Sopenharmony_ci } 2420bf215546Sopenharmony_ci 2421bf215546Sopenharmony_ci // get surface coordinates 2422bf215546Sopenharmony_ci for (c = 0; c < arg; ++c) 2423bf215546Sopenharmony_ci src[c] = su->getSrc(c); 2424bf215546Sopenharmony_ci for (; c < 3; ++c) 2425bf215546Sopenharmony_ci src[c] = zero; 2426bf215546Sopenharmony_ci 2427bf215546Sopenharmony_ci // calculate pixel offset 2428bf215546Sopenharmony_ci if (su->op == OP_SULDP || su->op == OP_SUREDP) { 2429bf215546Sopenharmony_ci v = loadSuInfo32(ind, slot, NVC0_SU_INFO_BSIZE, su->tex.bindless); 2430bf215546Sopenharmony_ci su->setSrc(0, (src[0] = bld.mkOp2v(OP_SHL, TYPE_U32, bld.getSSA(), src[0], v))); 2431bf215546Sopenharmony_ci } 2432bf215546Sopenharmony_ci 2433bf215546Sopenharmony_ci // add array layer offset 2434bf215546Sopenharmony_ci if (su->tex.target.isArray() || su->tex.target.isCube()) { 2435bf215546Sopenharmony_ci v = loadSuInfo32(ind, slot, NVC0_SU_INFO_ARRAY, su->tex.bindless); 2436bf215546Sopenharmony_ci assert(dim > 1); 2437bf215546Sopenharmony_ci su->setSrc(2, (src[2] = bld.mkOp2v(OP_MUL, TYPE_U32, bld.getSSA(), src[2], v))); 2438bf215546Sopenharmony_ci } 2439bf215546Sopenharmony_ci 2440bf215546Sopenharmony_ci // 3d is special-cased. Note that a single "slice" of a 3d image may 2441bf215546Sopenharmony_ci // also be attached as 2d, so we have to do the same 3d processing for 2442bf215546Sopenharmony_ci // 2d as well, just in case. In order to remap a 3d image onto a 2d 2443bf215546Sopenharmony_ci // image, we have to retile it "by hand". 2444bf215546Sopenharmony_ci if (su->tex.target == TEX_TARGET_3D || su->tex.target == TEX_TARGET_2D) { 2445bf215546Sopenharmony_ci Value *z = loadSuInfo32(ind, slot, NVC0_SU_INFO_UNK1C, su->tex.bindless); 2446bf215546Sopenharmony_ci Value *y_size_aligned = 2447bf215546Sopenharmony_ci bld.mkOp2v(OP_AND, TYPE_U32, bld.getSSA(), 2448bf215546Sopenharmony_ci loadSuInfo32(ind, slot, NVC0_SU_INFO_DIM_Y, su->tex.bindless), 2449bf215546Sopenharmony_ci bld.loadImm(NULL, 0x0000ffff)); 2450bf215546Sopenharmony_ci // Add the z coordinate for actual 3d-images 2451bf215546Sopenharmony_ci if (dim > 2) 2452bf215546Sopenharmony_ci src[2] = bld.mkOp2v(OP_ADD, TYPE_U32, bld.getSSA(), z, src[2]); 2453bf215546Sopenharmony_ci else 2454bf215546Sopenharmony_ci src[2] = z; 2455bf215546Sopenharmony_ci 2456bf215546Sopenharmony_ci // Compute the surface parameters from tile shifts 2457bf215546Sopenharmony_ci Value *tile_shift[3]; 2458bf215546Sopenharmony_ci Value *tile_extbf[3]; 2459bf215546Sopenharmony_ci // Fetch the "real" tiling parameters of the underlying surface 2460bf215546Sopenharmony_ci for (int i = 0; i < 3; i++) { 2461bf215546Sopenharmony_ci tile_extbf[i] = 2462bf215546Sopenharmony_ci bld.mkOp2v(OP_SHR, TYPE_U32, bld.getSSA(), 2463bf215546Sopenharmony_ci loadSuInfo32(ind, slot, NVC0_SU_INFO_DIM(i), su->tex.bindless), 2464bf215546Sopenharmony_ci bld.loadImm(NULL, 16)); 2465bf215546Sopenharmony_ci tile_shift[i] = 2466bf215546Sopenharmony_ci bld.mkOp2v(OP_SHR, TYPE_U32, bld.getSSA(), 2467bf215546Sopenharmony_ci loadSuInfo32(ind, slot, NVC0_SU_INFO_DIM(i), su->tex.bindless), 2468bf215546Sopenharmony_ci bld.loadImm(NULL, 24)); 2469bf215546Sopenharmony_ci } 2470bf215546Sopenharmony_ci 2471bf215546Sopenharmony_ci // However for load/atomics, we use byte-indexing. And for byte 2472bf215546Sopenharmony_ci // indexing, the X tile size is always the same. This leads to slightly 2473bf215546Sopenharmony_ci // better code. 2474bf215546Sopenharmony_ci if (su->op == OP_SULDP || su->op == OP_SUREDP) { 2475bf215546Sopenharmony_ci tile_extbf[0] = bld.loadImm(NULL, 0x600); 2476bf215546Sopenharmony_ci tile_shift[0] = bld.loadImm(NULL, 6); 2477bf215546Sopenharmony_ci } 2478bf215546Sopenharmony_ci 2479bf215546Sopenharmony_ci // Compute the location of given coordinate, both inside the tile as 2480bf215546Sopenharmony_ci // well as which (linearly-laid out) tile it's in. 2481bf215546Sopenharmony_ci Value *coord_in_tile[3]; 2482bf215546Sopenharmony_ci Value *tile[3]; 2483bf215546Sopenharmony_ci for (int i = 0; i < 3; i++) { 2484bf215546Sopenharmony_ci coord_in_tile[i] = bld.mkOp2v(OP_EXTBF, TYPE_U32, bld.getSSA(), src[i], tile_extbf[i]); 2485bf215546Sopenharmony_ci tile[i] = bld.mkOp2v(OP_SHR, TYPE_U32, bld.getSSA(), src[i], tile_shift[i]); 2486bf215546Sopenharmony_ci } 2487bf215546Sopenharmony_ci 2488bf215546Sopenharmony_ci // Based on the "real" tiling parameters, compute x/y coordinates in the 2489bf215546Sopenharmony_ci // larger surface with 2d tiling that was supplied to the hardware. This 2490bf215546Sopenharmony_ci // was determined and verified with the help of the tiling pseudocode in 2491bf215546Sopenharmony_ci // the envytools docs. 2492bf215546Sopenharmony_ci // 2493bf215546Sopenharmony_ci // adj_x = x_coord_in_tile + x_tile * x_tile_size * z_tile_size + 2494bf215546Sopenharmony_ci // z_coord_in_tile * x_tile_size 2495bf215546Sopenharmony_ci // adj_y = y_coord_in_tile + y_tile * y_tile_size + 2496bf215546Sopenharmony_ci // z_tile * y_tile_size * y_tiles 2497bf215546Sopenharmony_ci // 2498bf215546Sopenharmony_ci // Note: STRIDE_Y = y_tile_size * y_tiles 2499bf215546Sopenharmony_ci 2500bf215546Sopenharmony_ci su->setSrc(0, bld.mkOp2v( 2501bf215546Sopenharmony_ci OP_ADD, TYPE_U32, bld.getSSA(), 2502bf215546Sopenharmony_ci bld.mkOp2v(OP_ADD, TYPE_U32, bld.getSSA(), 2503bf215546Sopenharmony_ci coord_in_tile[0], 2504bf215546Sopenharmony_ci bld.mkOp2v(OP_SHL, TYPE_U32, bld.getSSA(), 2505bf215546Sopenharmony_ci tile[0], 2506bf215546Sopenharmony_ci bld.mkOp2v(OP_ADD, TYPE_U32, bld.getSSA(), 2507bf215546Sopenharmony_ci tile_shift[2], tile_shift[0]))), 2508bf215546Sopenharmony_ci bld.mkOp2v(OP_SHL, TYPE_U32, bld.getSSA(), 2509bf215546Sopenharmony_ci coord_in_tile[2], tile_shift[0]))); 2510bf215546Sopenharmony_ci 2511bf215546Sopenharmony_ci su->setSrc(1, bld.mkOp2v( 2512bf215546Sopenharmony_ci OP_ADD, TYPE_U32, bld.getSSA(), 2513bf215546Sopenharmony_ci bld.mkOp2v(OP_MUL, TYPE_U32, bld.getSSA(), 2514bf215546Sopenharmony_ci tile[2], y_size_aligned), 2515bf215546Sopenharmony_ci bld.mkOp2v(OP_ADD, TYPE_U32, bld.getSSA(), 2516bf215546Sopenharmony_ci coord_in_tile[1], 2517bf215546Sopenharmony_ci bld.mkOp2v(OP_SHL, TYPE_U32, bld.getSSA(), 2518bf215546Sopenharmony_ci tile[1], tile_shift[1])))); 2519bf215546Sopenharmony_ci 2520bf215546Sopenharmony_ci if (su->tex.target == TEX_TARGET_3D) { 2521bf215546Sopenharmony_ci su->moveSources(3, -1); 2522bf215546Sopenharmony_ci su->tex.target = TEX_TARGET_2D; 2523bf215546Sopenharmony_ci } 2524bf215546Sopenharmony_ci } 2525bf215546Sopenharmony_ci 2526bf215546Sopenharmony_ci // prevent read fault when the image is not actually bound 2527bf215546Sopenharmony_ci CmpInstruction *pred = 2528bf215546Sopenharmony_ci bld.mkCmp(OP_SET, CC_EQ, TYPE_U32, bld.getSSA(1, FILE_PREDICATE), 2529bf215546Sopenharmony_ci TYPE_U32, bld.mkImm(0), 2530bf215546Sopenharmony_ci loadSuInfo32(ind, slot, NVC0_SU_INFO_ADDR, su->tex.bindless)); 2531bf215546Sopenharmony_ci if (su->op != OP_SUSTP && su->tex.format) { 2532bf215546Sopenharmony_ci const TexInstruction::ImgFormatDesc *format = su->tex.format; 2533bf215546Sopenharmony_ci int blockwidth = format->bits[0] + format->bits[1] + 2534bf215546Sopenharmony_ci format->bits[2] + format->bits[3]; 2535bf215546Sopenharmony_ci 2536bf215546Sopenharmony_ci assert(format->components != 0); 2537bf215546Sopenharmony_ci // make sure that the format doesn't mismatch when it's not FMT_NONE 2538bf215546Sopenharmony_ci bld.mkCmp(OP_SET_OR, CC_NE, TYPE_U32, pred->getDef(0), 2539bf215546Sopenharmony_ci TYPE_U32, bld.loadImm(NULL, ffs(blockwidth / 8) - 1), 2540bf215546Sopenharmony_ci loadSuInfo32(ind, slot, NVC0_SU_INFO_BSIZE, su->tex.bindless), 2541bf215546Sopenharmony_ci pred->getDef(0)); 2542bf215546Sopenharmony_ci } 2543bf215546Sopenharmony_ci su->setPredicate(CC_NOT_P, pred->getDef(0)); 2544bf215546Sopenharmony_ci} 2545bf215546Sopenharmony_ci 2546bf215546Sopenharmony_civoid 2547bf215546Sopenharmony_ciNVC0LoweringPass::handleSurfaceOpNVC0(TexInstruction *su) 2548bf215546Sopenharmony_ci{ 2549bf215546Sopenharmony_ci if (su->tex.target == TEX_TARGET_1D_ARRAY) { 2550bf215546Sopenharmony_ci /* As 1d arrays also need 3 coordinates, switching to TEX_TARGET_2D_ARRAY 2551bf215546Sopenharmony_ci * will simplify the lowering pass and the texture constraints. */ 2552bf215546Sopenharmony_ci su->moveSources(1, 1); 2553bf215546Sopenharmony_ci su->setSrc(1, bld.loadImm(NULL, 0)); 2554bf215546Sopenharmony_ci su->tex.target = TEX_TARGET_2D_ARRAY; 2555bf215546Sopenharmony_ci } 2556bf215546Sopenharmony_ci 2557bf215546Sopenharmony_ci processSurfaceCoordsNVC0(su); 2558bf215546Sopenharmony_ci 2559bf215546Sopenharmony_ci if (su->op == OP_SULDP) { 2560bf215546Sopenharmony_ci convertSurfaceFormat(su, NULL); 2561bf215546Sopenharmony_ci insertOOBSurfaceOpResult(su); 2562bf215546Sopenharmony_ci } 2563bf215546Sopenharmony_ci 2564bf215546Sopenharmony_ci if (su->op == OP_SUREDB || su->op == OP_SUREDP) { 2565bf215546Sopenharmony_ci const int dim = su->tex.target.getDim(); 2566bf215546Sopenharmony_ci const int arg = dim + (su->tex.target.isArray() || su->tex.target.isCube()); 2567bf215546Sopenharmony_ci LValue *addr = bld.getSSA(8); 2568bf215546Sopenharmony_ci Value *def = su->getDef(0); 2569bf215546Sopenharmony_ci 2570bf215546Sopenharmony_ci su->op = OP_SULEA; 2571bf215546Sopenharmony_ci 2572bf215546Sopenharmony_ci // Set the destination to the address 2573bf215546Sopenharmony_ci su->dType = TYPE_U64; 2574bf215546Sopenharmony_ci su->setDef(0, addr); 2575bf215546Sopenharmony_ci su->setDef(1, su->getPredicate()); 2576bf215546Sopenharmony_ci 2577bf215546Sopenharmony_ci bld.setPosition(su, true); 2578bf215546Sopenharmony_ci 2579bf215546Sopenharmony_ci // Perform the atomic op 2580bf215546Sopenharmony_ci Instruction *red = bld.mkOp(OP_ATOM, su->sType, bld.getSSA()); 2581bf215546Sopenharmony_ci red->subOp = su->subOp; 2582bf215546Sopenharmony_ci red->setSrc(0, bld.mkSymbol(FILE_MEMORY_GLOBAL, 0, su->sType, 0)); 2583bf215546Sopenharmony_ci red->setSrc(1, su->getSrc(arg)); 2584bf215546Sopenharmony_ci if (red->subOp == NV50_IR_SUBOP_ATOM_CAS) 2585bf215546Sopenharmony_ci red->setSrc(2, su->getSrc(arg + 1)); 2586bf215546Sopenharmony_ci red->setIndirect(0, 0, addr); 2587bf215546Sopenharmony_ci 2588bf215546Sopenharmony_ci // make sure to initialize dst value when the atomic operation is not 2589bf215546Sopenharmony_ci // performed 2590bf215546Sopenharmony_ci Instruction *mov = bld.mkMov(bld.getSSA(), bld.loadImm(NULL, 0)); 2591bf215546Sopenharmony_ci 2592bf215546Sopenharmony_ci assert(su->cc == CC_NOT_P); 2593bf215546Sopenharmony_ci red->setPredicate(su->cc, su->getPredicate()); 2594bf215546Sopenharmony_ci mov->setPredicate(CC_P, su->getPredicate()); 2595bf215546Sopenharmony_ci 2596bf215546Sopenharmony_ci bld.mkOp2(OP_UNION, TYPE_U32, def, red->getDef(0), mov->getDef(0)); 2597bf215546Sopenharmony_ci 2598bf215546Sopenharmony_ci handleCasExch(red); 2599bf215546Sopenharmony_ci } 2600bf215546Sopenharmony_ci} 2601bf215546Sopenharmony_ci 2602bf215546Sopenharmony_ciTexInstruction * 2603bf215546Sopenharmony_ciNVC0LoweringPass::processSurfaceCoordsGM107(TexInstruction *su, Instruction *ret[4]) 2604bf215546Sopenharmony_ci{ 2605bf215546Sopenharmony_ci const int slot = su->tex.r; 2606bf215546Sopenharmony_ci const int dim = su->tex.target.getDim(); 2607bf215546Sopenharmony_ci const bool array = su->tex.target.isArray() || su->tex.target.isCube(); 2608bf215546Sopenharmony_ci const int arg = dim + array; 2609bf215546Sopenharmony_ci Value *ind = su->getIndirectR(); 2610bf215546Sopenharmony_ci Value *handle; 2611bf215546Sopenharmony_ci Instruction *pred = NULL, *pred2d = NULL; 2612bf215546Sopenharmony_ci int pos = 0; 2613bf215546Sopenharmony_ci 2614bf215546Sopenharmony_ci bld.setPosition(su, false); 2615bf215546Sopenharmony_ci 2616bf215546Sopenharmony_ci adjustCoordinatesMS(su); 2617bf215546Sopenharmony_ci 2618bf215546Sopenharmony_ci // add texture handle 2619bf215546Sopenharmony_ci switch (su->op) { 2620bf215546Sopenharmony_ci case OP_SUSTP: 2621bf215546Sopenharmony_ci pos = 4; 2622bf215546Sopenharmony_ci break; 2623bf215546Sopenharmony_ci case OP_SUREDP: 2624bf215546Sopenharmony_ci pos = (su->subOp == NV50_IR_SUBOP_ATOM_CAS) ? 2 : 1; 2625bf215546Sopenharmony_ci break; 2626bf215546Sopenharmony_ci default: 2627bf215546Sopenharmony_ci assert(pos == 0); 2628bf215546Sopenharmony_ci break; 2629bf215546Sopenharmony_ci } 2630bf215546Sopenharmony_ci 2631bf215546Sopenharmony_ci if (dim == 2 && !array) { 2632bf215546Sopenharmony_ci // This might be a 2d slice of a 3d texture, try to load the z 2633bf215546Sopenharmony_ci // coordinate in. 2634bf215546Sopenharmony_ci Value *v; 2635bf215546Sopenharmony_ci if (!su->tex.bindless) 2636bf215546Sopenharmony_ci v = loadSuInfo32(ind, slot, NVC0_SU_INFO_UNK1C, su->tex.bindless); 2637bf215546Sopenharmony_ci else 2638bf215546Sopenharmony_ci v = bld.mkOp2v(OP_SHR, TYPE_U32, bld.getSSA(), ind, bld.mkImm(11)); 2639bf215546Sopenharmony_ci Value *is_3d = bld.mkOp2v(OP_AND, TYPE_U32, bld.getSSA(), v, bld.mkImm(1)); 2640bf215546Sopenharmony_ci pred2d = bld.mkCmp(OP_SET, CC_EQ, TYPE_U32, bld.getSSA(1, FILE_PREDICATE), 2641bf215546Sopenharmony_ci TYPE_U32, bld.mkImm(0), is_3d); 2642bf215546Sopenharmony_ci 2643bf215546Sopenharmony_ci bld.mkOp2(OP_SHR, TYPE_U32, v, v, bld.loadImm(NULL, 16)); 2644bf215546Sopenharmony_ci su->moveSources(dim, 1); 2645bf215546Sopenharmony_ci su->setSrc(dim, v); 2646bf215546Sopenharmony_ci su->tex.target = nv50_ir::TEX_TARGET_3D; 2647bf215546Sopenharmony_ci pos++; 2648bf215546Sopenharmony_ci } 2649bf215546Sopenharmony_ci 2650bf215546Sopenharmony_ci if (su->tex.bindless) 2651bf215546Sopenharmony_ci handle = bld.mkOp2v(OP_AND, TYPE_U32, bld.getSSA(), ind, bld.mkImm(2047)); 2652bf215546Sopenharmony_ci else 2653bf215546Sopenharmony_ci handle = loadTexHandle(ind, slot + 32); 2654bf215546Sopenharmony_ci 2655bf215546Sopenharmony_ci su->setSrc(arg + pos, handle); 2656bf215546Sopenharmony_ci 2657bf215546Sopenharmony_ci // The address check doesn't make sense here. The format check could make 2658bf215546Sopenharmony_ci // sense but it's a bit of a pain. 2659bf215546Sopenharmony_ci if (!su->tex.bindless) { 2660bf215546Sopenharmony_ci // prevent read fault when the image is not actually bound 2661bf215546Sopenharmony_ci pred = 2662bf215546Sopenharmony_ci bld.mkCmp(OP_SET, CC_EQ, TYPE_U32, bld.getSSA(1, FILE_PREDICATE), 2663bf215546Sopenharmony_ci TYPE_U32, bld.mkImm(0), 2664bf215546Sopenharmony_ci loadSuInfo32(ind, slot, NVC0_SU_INFO_ADDR, su->tex.bindless)); 2665bf215546Sopenharmony_ci if (su->op != OP_SUSTP && su->tex.format) { 2666bf215546Sopenharmony_ci const TexInstruction::ImgFormatDesc *format = su->tex.format; 2667bf215546Sopenharmony_ci int blockwidth = format->bits[0] + format->bits[1] + 2668bf215546Sopenharmony_ci format->bits[2] + format->bits[3]; 2669bf215546Sopenharmony_ci 2670bf215546Sopenharmony_ci assert(format->components != 0); 2671bf215546Sopenharmony_ci // make sure that the format doesn't mismatch when it's not FMT_NONE 2672bf215546Sopenharmony_ci bld.mkCmp(OP_SET_OR, CC_NE, TYPE_U32, pred->getDef(0), 2673bf215546Sopenharmony_ci TYPE_U32, bld.loadImm(NULL, blockwidth / 8), 2674bf215546Sopenharmony_ci loadSuInfo32(ind, slot, NVC0_SU_INFO_BSIZE, su->tex.bindless), 2675bf215546Sopenharmony_ci pred->getDef(0)); 2676bf215546Sopenharmony_ci } 2677bf215546Sopenharmony_ci } 2678bf215546Sopenharmony_ci 2679bf215546Sopenharmony_ci // Now we have "pred" which (optionally) contains whether to do the surface 2680bf215546Sopenharmony_ci // op at all, and a "pred2d" which indicates that, in case of doing the 2681bf215546Sopenharmony_ci // surface op, we have to create a 2d and 3d version, conditioned on pred2d. 2682bf215546Sopenharmony_ci TexInstruction *su2d = NULL; 2683bf215546Sopenharmony_ci if (pred2d) { 2684bf215546Sopenharmony_ci su2d = cloneForward(func, su)->asTex(); 2685bf215546Sopenharmony_ci for (unsigned i = 0; su->defExists(i); ++i) 2686bf215546Sopenharmony_ci su2d->setDef(i, bld.getSSA()); 2687bf215546Sopenharmony_ci su2d->moveSources(dim + 1, -1); 2688bf215546Sopenharmony_ci su2d->tex.target = nv50_ir::TEX_TARGET_2D; 2689bf215546Sopenharmony_ci } 2690bf215546Sopenharmony_ci if (pred2d && pred) { 2691bf215546Sopenharmony_ci Instruction *pred3d = bld.mkOp2(OP_AND, TYPE_U8, 2692bf215546Sopenharmony_ci bld.getSSA(1, FILE_PREDICATE), 2693bf215546Sopenharmony_ci pred->getDef(0), pred2d->getDef(0)); 2694bf215546Sopenharmony_ci pred3d->src(0).mod = Modifier(NV50_IR_MOD_NOT); 2695bf215546Sopenharmony_ci pred3d->src(1).mod = Modifier(NV50_IR_MOD_NOT); 2696bf215546Sopenharmony_ci su->setPredicate(CC_P, pred3d->getDef(0)); 2697bf215546Sopenharmony_ci pred2d = bld.mkOp2(OP_AND, TYPE_U8, bld.getSSA(1, FILE_PREDICATE), 2698bf215546Sopenharmony_ci pred->getDef(0), pred2d->getDef(0)); 2699bf215546Sopenharmony_ci pred2d->src(0).mod = Modifier(NV50_IR_MOD_NOT); 2700bf215546Sopenharmony_ci } else if (pred) { 2701bf215546Sopenharmony_ci su->setPredicate(CC_NOT_P, pred->getDef(0)); 2702bf215546Sopenharmony_ci } else if (pred2d) { 2703bf215546Sopenharmony_ci su->setPredicate(CC_NOT_P, pred2d->getDef(0)); 2704bf215546Sopenharmony_ci } 2705bf215546Sopenharmony_ci if (su2d) { 2706bf215546Sopenharmony_ci su2d->setPredicate(CC_P, pred2d->getDef(0)); 2707bf215546Sopenharmony_ci bld.insert(su2d); 2708bf215546Sopenharmony_ci 2709bf215546Sopenharmony_ci // Create a UNION so that RA assigns the same registers 2710bf215546Sopenharmony_ci bld.setPosition(su, true); 2711bf215546Sopenharmony_ci for (unsigned i = 0; su->defExists(i); ++i) { 2712bf215546Sopenharmony_ci assert(i < 4); 2713bf215546Sopenharmony_ci 2714bf215546Sopenharmony_ci Value *def = su->getDef(i); 2715bf215546Sopenharmony_ci Value *newDef = bld.getSSA(); 2716bf215546Sopenharmony_ci ValueDef &def2 = su2d->def(i); 2717bf215546Sopenharmony_ci Instruction *mov = NULL; 2718bf215546Sopenharmony_ci 2719bf215546Sopenharmony_ci su->setDef(i, newDef); 2720bf215546Sopenharmony_ci if (pred) { 2721bf215546Sopenharmony_ci mov = bld.mkMov(bld.getSSA(), bld.loadImm(NULL, 0)); 2722bf215546Sopenharmony_ci mov->setPredicate(CC_P, pred->getDef(0)); 2723bf215546Sopenharmony_ci } 2724bf215546Sopenharmony_ci 2725bf215546Sopenharmony_ci Instruction *uni = ret[i] = bld.mkOp2(OP_UNION, TYPE_U32, 2726bf215546Sopenharmony_ci bld.getSSA(), 2727bf215546Sopenharmony_ci newDef, def2.get()); 2728bf215546Sopenharmony_ci if (mov) 2729bf215546Sopenharmony_ci uni->setSrc(2, mov->getDef(0)); 2730bf215546Sopenharmony_ci bld.mkMov(def, uni->getDef(0)); 2731bf215546Sopenharmony_ci } 2732bf215546Sopenharmony_ci } else if (pred) { 2733bf215546Sopenharmony_ci // Create a UNION so that RA assigns the same registers 2734bf215546Sopenharmony_ci bld.setPosition(su, true); 2735bf215546Sopenharmony_ci for (unsigned i = 0; su->defExists(i); ++i) { 2736bf215546Sopenharmony_ci assert(i < 4); 2737bf215546Sopenharmony_ci 2738bf215546Sopenharmony_ci Value *def = su->getDef(i); 2739bf215546Sopenharmony_ci Value *newDef = bld.getSSA(); 2740bf215546Sopenharmony_ci su->setDef(i, newDef); 2741bf215546Sopenharmony_ci 2742bf215546Sopenharmony_ci Instruction *mov = bld.mkMov(bld.getSSA(), bld.loadImm(NULL, 0)); 2743bf215546Sopenharmony_ci mov->setPredicate(CC_P, pred->getDef(0)); 2744bf215546Sopenharmony_ci 2745bf215546Sopenharmony_ci Instruction *uni = ret[i] = bld.mkOp2(OP_UNION, TYPE_U32, 2746bf215546Sopenharmony_ci bld.getSSA(), 2747bf215546Sopenharmony_ci newDef, mov->getDef(0)); 2748bf215546Sopenharmony_ci bld.mkMov(def, uni->getDef(0)); 2749bf215546Sopenharmony_ci } 2750bf215546Sopenharmony_ci } 2751bf215546Sopenharmony_ci 2752bf215546Sopenharmony_ci return su2d; 2753bf215546Sopenharmony_ci} 2754bf215546Sopenharmony_ci 2755bf215546Sopenharmony_civoid 2756bf215546Sopenharmony_ciNVC0LoweringPass::handleSurfaceOpGM107(TexInstruction *su) 2757bf215546Sopenharmony_ci{ 2758bf215546Sopenharmony_ci // processSurfaceCoords also takes care of fixing up the outputs and 2759bf215546Sopenharmony_ci // union'ing them with 0 as necessary. Additionally it may create a second 2760bf215546Sopenharmony_ci // surface which needs some of the similar fixups. 2761bf215546Sopenharmony_ci 2762bf215546Sopenharmony_ci Instruction *loaded[4] = {}; 2763bf215546Sopenharmony_ci TexInstruction *su2 = processSurfaceCoordsGM107(su, loaded); 2764bf215546Sopenharmony_ci 2765bf215546Sopenharmony_ci if (su->op == OP_SULDP) { 2766bf215546Sopenharmony_ci convertSurfaceFormat(su, loaded); 2767bf215546Sopenharmony_ci } 2768bf215546Sopenharmony_ci 2769bf215546Sopenharmony_ci if (su->op == OP_SUREDP) { 2770bf215546Sopenharmony_ci su->op = OP_SUREDB; 2771bf215546Sopenharmony_ci } 2772bf215546Sopenharmony_ci 2773bf215546Sopenharmony_ci // If we fixed up the type of the regular surface load instruction, we also 2774bf215546Sopenharmony_ci // have to fix up the copy. 2775bf215546Sopenharmony_ci if (su2) { 2776bf215546Sopenharmony_ci su2->op = su->op; 2777bf215546Sopenharmony_ci su2->dType = su->dType; 2778bf215546Sopenharmony_ci su2->sType = su->sType; 2779bf215546Sopenharmony_ci } 2780bf215546Sopenharmony_ci} 2781bf215546Sopenharmony_ci 2782bf215546Sopenharmony_cibool 2783bf215546Sopenharmony_ciNVC0LoweringPass::handleWRSV(Instruction *i) 2784bf215546Sopenharmony_ci{ 2785bf215546Sopenharmony_ci Instruction *st; 2786bf215546Sopenharmony_ci Symbol *sym; 2787bf215546Sopenharmony_ci uint32_t addr; 2788bf215546Sopenharmony_ci 2789bf215546Sopenharmony_ci // must replace, $sreg are not writeable 2790bf215546Sopenharmony_ci addr = targ->getSVAddress(FILE_SHADER_OUTPUT, i->getSrc(0)->asSym()); 2791bf215546Sopenharmony_ci if (addr >= 0x400) 2792bf215546Sopenharmony_ci return false; 2793bf215546Sopenharmony_ci sym = bld.mkSymbol(FILE_SHADER_OUTPUT, 0, i->sType, addr); 2794bf215546Sopenharmony_ci 2795bf215546Sopenharmony_ci st = bld.mkStore(OP_EXPORT, i->dType, sym, i->getIndirect(0, 0), 2796bf215546Sopenharmony_ci i->getSrc(1)); 2797bf215546Sopenharmony_ci st->perPatch = i->perPatch; 2798bf215546Sopenharmony_ci 2799bf215546Sopenharmony_ci bld.getBB()->remove(i); 2800bf215546Sopenharmony_ci return true; 2801bf215546Sopenharmony_ci} 2802bf215546Sopenharmony_ci 2803bf215546Sopenharmony_civoid 2804bf215546Sopenharmony_ciNVC0LoweringPass::handleLDST(Instruction *i) 2805bf215546Sopenharmony_ci{ 2806bf215546Sopenharmony_ci if (i->src(0).getFile() == FILE_SHADER_INPUT) { 2807bf215546Sopenharmony_ci if (prog->getType() == Program::TYPE_COMPUTE) { 2808bf215546Sopenharmony_ci i->getSrc(0)->reg.file = FILE_MEMORY_CONST; 2809bf215546Sopenharmony_ci i->getSrc(0)->reg.fileIndex = 0; 2810bf215546Sopenharmony_ci } else 2811bf215546Sopenharmony_ci if (prog->getType() == Program::TYPE_GEOMETRY && 2812bf215546Sopenharmony_ci i->src(0).isIndirect(0)) { 2813bf215546Sopenharmony_ci // XXX: this assumes vec4 units 2814bf215546Sopenharmony_ci Value *ptr = bld.mkOp2v(OP_SHL, TYPE_U32, bld.getSSA(), 2815bf215546Sopenharmony_ci i->getIndirect(0, 0), bld.mkImm(4)); 2816bf215546Sopenharmony_ci i->setIndirect(0, 0, ptr); 2817bf215546Sopenharmony_ci i->op = OP_VFETCH; 2818bf215546Sopenharmony_ci } else { 2819bf215546Sopenharmony_ci i->op = OP_VFETCH; 2820bf215546Sopenharmony_ci assert(prog->getType() != Program::TYPE_FRAGMENT); // INTERP 2821bf215546Sopenharmony_ci } 2822bf215546Sopenharmony_ci } else if (i->src(0).getFile() == FILE_MEMORY_CONST) { 2823bf215546Sopenharmony_ci int8_t fileIndex = i->getSrc(0)->reg.fileIndex - 1; 2824bf215546Sopenharmony_ci Value *ind = i->getIndirect(0, 1); 2825bf215546Sopenharmony_ci 2826bf215546Sopenharmony_ci if (targ->getChipset() >= NVISA_GK104_CHIPSET && 2827bf215546Sopenharmony_ci prog->getType() == Program::TYPE_COMPUTE && 2828bf215546Sopenharmony_ci (fileIndex >= 6 || ind)) { 2829bf215546Sopenharmony_ci // The launch descriptor only allows to set up 8 CBs, but OpenGL 2830bf215546Sopenharmony_ci // requires at least 12 UBOs. To bypass this limitation, for constant 2831bf215546Sopenharmony_ci // buffers 7+, we store the addrs into the driver constbuf and we 2832bf215546Sopenharmony_ci // directly load from the global memory. 2833bf215546Sopenharmony_ci if (ind) { 2834bf215546Sopenharmony_ci // Clamp the UBO index when an indirect access is used to avoid 2835bf215546Sopenharmony_ci // loading information from the wrong place in the driver cb. 2836bf215546Sopenharmony_ci // TODO - synchronize the max with the driver. 2837bf215546Sopenharmony_ci ind = bld.mkOp2v(OP_MIN, TYPE_U32, bld.getSSA(), 2838bf215546Sopenharmony_ci bld.mkOp2v(OP_ADD, TYPE_U32, bld.getSSA(), 2839bf215546Sopenharmony_ci ind, bld.loadImm(NULL, fileIndex)), 2840bf215546Sopenharmony_ci bld.loadImm(NULL, 13)); 2841bf215546Sopenharmony_ci fileIndex = 0; 2842bf215546Sopenharmony_ci } 2843bf215546Sopenharmony_ci 2844bf215546Sopenharmony_ci Value *offset = bld.loadImm(NULL, i->getSrc(0)->reg.data.offset + typeSizeof(i->sType)); 2845bf215546Sopenharmony_ci Value *ptr = loadUboInfo64(ind, fileIndex * 16); 2846bf215546Sopenharmony_ci Value *length = loadUboLength32(ind, fileIndex * 16); 2847bf215546Sopenharmony_ci Value *pred = new_LValue(func, FILE_PREDICATE); 2848bf215546Sopenharmony_ci if (i->src(0).isIndirect(0)) { 2849bf215546Sopenharmony_ci bld.mkOp2(OP_ADD, TYPE_U64, ptr, ptr, i->getIndirect(0, 0)); 2850bf215546Sopenharmony_ci bld.mkOp2(OP_ADD, TYPE_U32, offset, offset, i->getIndirect(0, 0)); 2851bf215546Sopenharmony_ci } 2852bf215546Sopenharmony_ci i->getSrc(0)->reg.file = FILE_MEMORY_GLOBAL; 2853bf215546Sopenharmony_ci i->setIndirect(0, 1, NULL); 2854bf215546Sopenharmony_ci i->setIndirect(0, 0, ptr); 2855bf215546Sopenharmony_ci bld.mkCmp(OP_SET, CC_GT, TYPE_U32, pred, TYPE_U32, offset, length); 2856bf215546Sopenharmony_ci i->setPredicate(CC_NOT_P, pred); 2857bf215546Sopenharmony_ci Value *zero, *dst = i->getDef(0); 2858bf215546Sopenharmony_ci i->setDef(0, bld.getSSA()); 2859bf215546Sopenharmony_ci 2860bf215546Sopenharmony_ci bld.setPosition(i, true); 2861bf215546Sopenharmony_ci bld.mkMov((zero = bld.getSSA()), bld.mkImm(0)) 2862bf215546Sopenharmony_ci ->setPredicate(CC_P, pred); 2863bf215546Sopenharmony_ci bld.mkOp2(OP_UNION, TYPE_U32, dst, i->getDef(0), zero); 2864bf215546Sopenharmony_ci } else if (i->src(0).isIndirect(1)) { 2865bf215546Sopenharmony_ci Value *ptr; 2866bf215546Sopenharmony_ci if (i->src(0).isIndirect(0)) 2867bf215546Sopenharmony_ci ptr = bld.mkOp3v(OP_INSBF, TYPE_U32, bld.getSSA(), 2868bf215546Sopenharmony_ci i->getIndirect(0, 1), bld.mkImm(0x1010), 2869bf215546Sopenharmony_ci i->getIndirect(0, 0)); 2870bf215546Sopenharmony_ci else 2871bf215546Sopenharmony_ci ptr = bld.mkOp2v(OP_SHL, TYPE_U32, bld.getSSA(), 2872bf215546Sopenharmony_ci i->getIndirect(0, 1), bld.mkImm(16)); 2873bf215546Sopenharmony_ci i->setIndirect(0, 1, NULL); 2874bf215546Sopenharmony_ci i->setIndirect(0, 0, ptr); 2875bf215546Sopenharmony_ci i->subOp = NV50_IR_SUBOP_LDC_IS; 2876bf215546Sopenharmony_ci } 2877bf215546Sopenharmony_ci } else if (i->src(0).getFile() == FILE_SHADER_OUTPUT) { 2878bf215546Sopenharmony_ci assert(prog->getType() == Program::TYPE_TESSELLATION_CONTROL); 2879bf215546Sopenharmony_ci i->op = OP_VFETCH; 2880bf215546Sopenharmony_ci } else if (i->src(0).getFile() == FILE_MEMORY_BUFFER) { 2881bf215546Sopenharmony_ci Value *ind = i->getIndirect(0, 1); 2882bf215546Sopenharmony_ci Value *ptr = loadBufInfo64(ind, i->getSrc(0)->reg.fileIndex * 16); 2883bf215546Sopenharmony_ci // XXX come up with a way not to do this for EVERY little access but 2884bf215546Sopenharmony_ci // rather to batch these up somehow. Unfortunately we've lost the 2885bf215546Sopenharmony_ci // information about the field width by the time we get here. 2886bf215546Sopenharmony_ci Value *offset = bld.loadImm(NULL, i->getSrc(0)->reg.data.offset + typeSizeof(i->sType)); 2887bf215546Sopenharmony_ci Value *length = loadBufLength32(ind, i->getSrc(0)->reg.fileIndex * 16); 2888bf215546Sopenharmony_ci Value *pred = new_LValue(func, FILE_PREDICATE); 2889bf215546Sopenharmony_ci if (i->src(0).isIndirect(0)) { 2890bf215546Sopenharmony_ci bld.mkOp2(OP_ADD, TYPE_U64, ptr, ptr, i->getIndirect(0, 0)); 2891bf215546Sopenharmony_ci bld.mkOp2(OP_ADD, TYPE_U32, offset, offset, i->getIndirect(0, 0)); 2892bf215546Sopenharmony_ci } 2893bf215546Sopenharmony_ci i->setIndirect(0, 1, NULL); 2894bf215546Sopenharmony_ci i->setIndirect(0, 0, ptr); 2895bf215546Sopenharmony_ci i->getSrc(0)->reg.file = FILE_MEMORY_GLOBAL; 2896bf215546Sopenharmony_ci bld.mkCmp(OP_SET, CC_GT, TYPE_U32, pred, TYPE_U32, offset, length); 2897bf215546Sopenharmony_ci i->setPredicate(CC_NOT_P, pred); 2898bf215546Sopenharmony_ci if (i->defExists(0)) { 2899bf215546Sopenharmony_ci Value *zero, *dst = i->getDef(0); 2900bf215546Sopenharmony_ci uint8_t size = dst->reg.size; 2901bf215546Sopenharmony_ci i->setDef(0, bld.getSSA(size)); 2902bf215546Sopenharmony_ci 2903bf215546Sopenharmony_ci bld.setPosition(i, true); 2904bf215546Sopenharmony_ci bld.mkMov((zero = bld.getSSA(size)), bld.mkImm(0), i->dType) 2905bf215546Sopenharmony_ci ->setPredicate(CC_P, pred); 2906bf215546Sopenharmony_ci bld.mkOp2(OP_UNION, i->dType, dst, i->getDef(0), zero); 2907bf215546Sopenharmony_ci } 2908bf215546Sopenharmony_ci } 2909bf215546Sopenharmony_ci} 2910bf215546Sopenharmony_ci 2911bf215546Sopenharmony_civoid 2912bf215546Sopenharmony_ciNVC0LoweringPass::readTessCoord(LValue *dst, int c) 2913bf215546Sopenharmony_ci{ 2914bf215546Sopenharmony_ci Value *laneid = bld.getSSA(); 2915bf215546Sopenharmony_ci Value *x, *y; 2916bf215546Sopenharmony_ci 2917bf215546Sopenharmony_ci bld.mkOp1(OP_RDSV, TYPE_U32, laneid, bld.mkSysVal(SV_LANEID, 0)); 2918bf215546Sopenharmony_ci 2919bf215546Sopenharmony_ci if (c == 0) { 2920bf215546Sopenharmony_ci x = dst; 2921bf215546Sopenharmony_ci y = NULL; 2922bf215546Sopenharmony_ci } else 2923bf215546Sopenharmony_ci if (c == 1) { 2924bf215546Sopenharmony_ci x = NULL; 2925bf215546Sopenharmony_ci y = dst; 2926bf215546Sopenharmony_ci } else { 2927bf215546Sopenharmony_ci assert(c == 2); 2928bf215546Sopenharmony_ci if (prog->driver_out->prop.tp.domain != PIPE_PRIM_TRIANGLES) { 2929bf215546Sopenharmony_ci bld.mkMov(dst, bld.loadImm(NULL, 0)); 2930bf215546Sopenharmony_ci return; 2931bf215546Sopenharmony_ci } 2932bf215546Sopenharmony_ci x = bld.getSSA(); 2933bf215546Sopenharmony_ci y = bld.getSSA(); 2934bf215546Sopenharmony_ci } 2935bf215546Sopenharmony_ci if (x) 2936bf215546Sopenharmony_ci bld.mkFetch(x, TYPE_F32, FILE_SHADER_OUTPUT, 0x2f0, NULL, laneid); 2937bf215546Sopenharmony_ci if (y) 2938bf215546Sopenharmony_ci bld.mkFetch(y, TYPE_F32, FILE_SHADER_OUTPUT, 0x2f4, NULL, laneid); 2939bf215546Sopenharmony_ci 2940bf215546Sopenharmony_ci if (c == 2) { 2941bf215546Sopenharmony_ci bld.mkOp2(OP_ADD, TYPE_F32, dst, x, y); 2942bf215546Sopenharmony_ci bld.mkOp2(OP_SUB, TYPE_F32, dst, bld.loadImm(NULL, 1.0f), dst); 2943bf215546Sopenharmony_ci } 2944bf215546Sopenharmony_ci} 2945bf215546Sopenharmony_ci 2946bf215546Sopenharmony_cibool 2947bf215546Sopenharmony_ciNVC0LoweringPass::handleRDSV(Instruction *i) 2948bf215546Sopenharmony_ci{ 2949bf215546Sopenharmony_ci Symbol *sym = i->getSrc(0)->asSym(); 2950bf215546Sopenharmony_ci const SVSemantic sv = sym->reg.data.sv.sv; 2951bf215546Sopenharmony_ci Value *vtx = NULL; 2952bf215546Sopenharmony_ci Instruction *ld; 2953bf215546Sopenharmony_ci uint32_t addr = targ->getSVAddress(FILE_SHADER_INPUT, sym); 2954bf215546Sopenharmony_ci 2955bf215546Sopenharmony_ci if (addr >= 0x400) { 2956bf215546Sopenharmony_ci // mov $sreg 2957bf215546Sopenharmony_ci if (sym->reg.data.sv.index == 3) { 2958bf215546Sopenharmony_ci // TGSI backend may use 4th component of TID,NTID,CTAID,NCTAID 2959bf215546Sopenharmony_ci i->op = OP_MOV; 2960bf215546Sopenharmony_ci i->setSrc(0, bld.mkImm((sv == SV_NTID || sv == SV_NCTAID) ? 1 : 0)); 2961bf215546Sopenharmony_ci } else 2962bf215546Sopenharmony_ci if (sv == SV_TID) { 2963bf215546Sopenharmony_ci // Help CSE combine TID fetches 2964bf215546Sopenharmony_ci Value *tid = bld.mkOp1v(OP_RDSV, TYPE_U32, bld.getScratch(), 2965bf215546Sopenharmony_ci bld.mkSysVal(SV_COMBINED_TID, 0)); 2966bf215546Sopenharmony_ci i->op = OP_EXTBF; 2967bf215546Sopenharmony_ci i->setSrc(0, tid); 2968bf215546Sopenharmony_ci switch (sym->reg.data.sv.index) { 2969bf215546Sopenharmony_ci case 0: i->setSrc(1, bld.mkImm(0x1000)); break; 2970bf215546Sopenharmony_ci case 1: i->setSrc(1, bld.mkImm(0x0a10)); break; 2971bf215546Sopenharmony_ci case 2: i->setSrc(1, bld.mkImm(0x061a)); break; 2972bf215546Sopenharmony_ci } 2973bf215546Sopenharmony_ci } 2974bf215546Sopenharmony_ci if (sv == SV_VERTEX_COUNT) { 2975bf215546Sopenharmony_ci bld.setPosition(i, true); 2976bf215546Sopenharmony_ci bld.mkOp2(OP_EXTBF, TYPE_U32, i->getDef(0), i->getDef(0), bld.mkImm(0x808)); 2977bf215546Sopenharmony_ci } 2978bf215546Sopenharmony_ci return true; 2979bf215546Sopenharmony_ci } 2980bf215546Sopenharmony_ci 2981bf215546Sopenharmony_ci switch (sv) { 2982bf215546Sopenharmony_ci case SV_POSITION: 2983bf215546Sopenharmony_ci assert(prog->getType() == Program::TYPE_FRAGMENT); 2984bf215546Sopenharmony_ci if (i->srcExists(1)) { 2985bf215546Sopenharmony_ci // Pass offset through to the interpolation logic 2986bf215546Sopenharmony_ci ld = bld.mkInterp(NV50_IR_INTERP_LINEAR | NV50_IR_INTERP_OFFSET, 2987bf215546Sopenharmony_ci i->getDef(0), addr, NULL); 2988bf215546Sopenharmony_ci ld->setSrc(1, i->getSrc(1)); 2989bf215546Sopenharmony_ci } else { 2990bf215546Sopenharmony_ci bld.mkInterp(NV50_IR_INTERP_LINEAR, i->getDef(0), addr, NULL); 2991bf215546Sopenharmony_ci } 2992bf215546Sopenharmony_ci break; 2993bf215546Sopenharmony_ci case SV_FACE: 2994bf215546Sopenharmony_ci { 2995bf215546Sopenharmony_ci Value *face = i->getDef(0); 2996bf215546Sopenharmony_ci bld.mkInterp(NV50_IR_INTERP_FLAT, face, addr, NULL); 2997bf215546Sopenharmony_ci if (i->dType == TYPE_F32) { 2998bf215546Sopenharmony_ci bld.mkOp2(OP_OR, TYPE_U32, face, face, bld.mkImm(0x00000001)); 2999bf215546Sopenharmony_ci bld.mkOp1(OP_NEG, TYPE_S32, face, face); 3000bf215546Sopenharmony_ci bld.mkCvt(OP_CVT, TYPE_F32, face, TYPE_S32, face); 3001bf215546Sopenharmony_ci } 3002bf215546Sopenharmony_ci } 3003bf215546Sopenharmony_ci break; 3004bf215546Sopenharmony_ci case SV_TESS_COORD: 3005bf215546Sopenharmony_ci assert(prog->getType() == Program::TYPE_TESSELLATION_EVAL); 3006bf215546Sopenharmony_ci readTessCoord(i->getDef(0)->asLValue(), i->getSrc(0)->reg.data.sv.index); 3007bf215546Sopenharmony_ci break; 3008bf215546Sopenharmony_ci case SV_NTID: 3009bf215546Sopenharmony_ci case SV_NCTAID: 3010bf215546Sopenharmony_ci case SV_GRIDID: 3011bf215546Sopenharmony_ci assert(targ->getChipset() >= NVISA_GK104_CHIPSET); // mov $sreg otherwise 3012bf215546Sopenharmony_ci if (sym->reg.data.sv.index == 3) { 3013bf215546Sopenharmony_ci i->op = OP_MOV; 3014bf215546Sopenharmony_ci i->setSrc(0, bld.mkImm(sv == SV_GRIDID ? 0 : 1)); 3015bf215546Sopenharmony_ci return true; 3016bf215546Sopenharmony_ci } 3017bf215546Sopenharmony_ci FALLTHROUGH; 3018bf215546Sopenharmony_ci case SV_WORK_DIM: 3019bf215546Sopenharmony_ci addr += prog->driver->prop.cp.gridInfoBase; 3020bf215546Sopenharmony_ci bld.mkLoad(TYPE_U32, i->getDef(0), 3021bf215546Sopenharmony_ci bld.mkSymbol(FILE_MEMORY_CONST, prog->driver->io.auxCBSlot, 3022bf215546Sopenharmony_ci TYPE_U32, addr), NULL); 3023bf215546Sopenharmony_ci break; 3024bf215546Sopenharmony_ci case SV_SAMPLE_INDEX: 3025bf215546Sopenharmony_ci // TODO: Properly pass source as an address in the PIX address space 3026bf215546Sopenharmony_ci // (which can be of the form [r0+offset]). But this is currently 3027bf215546Sopenharmony_ci // unnecessary. 3028bf215546Sopenharmony_ci ld = bld.mkOp1(OP_PIXLD, TYPE_U32, i->getDef(0), bld.mkImm(0)); 3029bf215546Sopenharmony_ci ld->subOp = NV50_IR_SUBOP_PIXLD_SAMPLEID; 3030bf215546Sopenharmony_ci break; 3031bf215546Sopenharmony_ci case SV_SAMPLE_POS: { 3032bf215546Sopenharmony_ci Value *sampleID = bld.getScratch(); 3033bf215546Sopenharmony_ci ld = bld.mkOp1(OP_PIXLD, TYPE_U32, sampleID, bld.mkImm(0)); 3034bf215546Sopenharmony_ci ld->subOp = NV50_IR_SUBOP_PIXLD_SAMPLEID; 3035bf215546Sopenharmony_ci Value *offset = calculateSampleOffset(sampleID); 3036bf215546Sopenharmony_ci 3037bf215546Sopenharmony_ci assert(prog->driver_out->prop.fp.readsSampleLocations); 3038bf215546Sopenharmony_ci 3039bf215546Sopenharmony_ci if (targ->getChipset() >= NVISA_GM200_CHIPSET) { 3040bf215546Sopenharmony_ci bld.mkLoad(TYPE_F32, 3041bf215546Sopenharmony_ci i->getDef(0), 3042bf215546Sopenharmony_ci bld.mkSymbol( 3043bf215546Sopenharmony_ci FILE_MEMORY_CONST, prog->driver->io.auxCBSlot, 3044bf215546Sopenharmony_ci TYPE_U32, prog->driver->io.sampleInfoBase), 3045bf215546Sopenharmony_ci offset); 3046bf215546Sopenharmony_ci bld.mkOp2(OP_EXTBF, TYPE_U32, i->getDef(0), i->getDef(0), 3047bf215546Sopenharmony_ci bld.mkImm(0x040c + sym->reg.data.sv.index * 16)); 3048bf215546Sopenharmony_ci bld.mkCvt(OP_CVT, TYPE_F32, i->getDef(0), TYPE_U32, i->getDef(0)); 3049bf215546Sopenharmony_ci bld.mkOp2(OP_MUL, TYPE_F32, i->getDef(0), i->getDef(0), bld.mkImm(1.0f / 16.0f)); 3050bf215546Sopenharmony_ci } else { 3051bf215546Sopenharmony_ci bld.mkLoad(TYPE_F32, 3052bf215546Sopenharmony_ci i->getDef(0), 3053bf215546Sopenharmony_ci bld.mkSymbol( 3054bf215546Sopenharmony_ci FILE_MEMORY_CONST, prog->driver->io.auxCBSlot, 3055bf215546Sopenharmony_ci TYPE_U32, prog->driver->io.sampleInfoBase + 3056bf215546Sopenharmony_ci 4 * sym->reg.data.sv.index), 3057bf215546Sopenharmony_ci offset); 3058bf215546Sopenharmony_ci } 3059bf215546Sopenharmony_ci break; 3060bf215546Sopenharmony_ci } 3061bf215546Sopenharmony_ci case SV_SAMPLE_MASK: { 3062bf215546Sopenharmony_ci ld = bld.mkOp1(OP_PIXLD, TYPE_U32, i->getDef(0), bld.mkImm(0)); 3063bf215546Sopenharmony_ci ld->subOp = NV50_IR_SUBOP_PIXLD_COVMASK; 3064bf215546Sopenharmony_ci Instruction *sampleid = 3065bf215546Sopenharmony_ci bld.mkOp1(OP_PIXLD, TYPE_U32, bld.getSSA(), bld.mkImm(0)); 3066bf215546Sopenharmony_ci sampleid->subOp = NV50_IR_SUBOP_PIXLD_SAMPLEID; 3067bf215546Sopenharmony_ci Value *masked = 3068bf215546Sopenharmony_ci bld.mkOp2v(OP_AND, TYPE_U32, bld.getSSA(), ld->getDef(0), 3069bf215546Sopenharmony_ci bld.mkOp2v(OP_SHL, TYPE_U32, bld.getSSA(), 3070bf215546Sopenharmony_ci bld.loadImm(NULL, 1), sampleid->getDef(0))); 3071bf215546Sopenharmony_ci if (prog->persampleInvocation) { 3072bf215546Sopenharmony_ci bld.mkMov(i->getDef(0), masked); 3073bf215546Sopenharmony_ci } else { 3074bf215546Sopenharmony_ci bld.mkOp3(OP_SELP, TYPE_U32, i->getDef(0), ld->getDef(0), masked, 3075bf215546Sopenharmony_ci bld.mkImm(0)) 3076bf215546Sopenharmony_ci ->subOp = 1; 3077bf215546Sopenharmony_ci } 3078bf215546Sopenharmony_ci break; 3079bf215546Sopenharmony_ci } 3080bf215546Sopenharmony_ci case SV_BASEVERTEX: 3081bf215546Sopenharmony_ci case SV_BASEINSTANCE: 3082bf215546Sopenharmony_ci case SV_DRAWID: 3083bf215546Sopenharmony_ci ld = bld.mkLoad(TYPE_U32, i->getDef(0), 3084bf215546Sopenharmony_ci bld.mkSymbol(FILE_MEMORY_CONST, 3085bf215546Sopenharmony_ci prog->driver->io.auxCBSlot, 3086bf215546Sopenharmony_ci TYPE_U32, 3087bf215546Sopenharmony_ci prog->driver->io.drawInfoBase + 3088bf215546Sopenharmony_ci 4 * (sv - SV_BASEVERTEX)), 3089bf215546Sopenharmony_ci NULL); 3090bf215546Sopenharmony_ci break; 3091bf215546Sopenharmony_ci default: 3092bf215546Sopenharmony_ci if (prog->getType() == Program::TYPE_TESSELLATION_EVAL && !i->perPatch) 3093bf215546Sopenharmony_ci vtx = bld.mkOp1v(OP_PFETCH, TYPE_U32, bld.getSSA(), bld.mkImm(0)); 3094bf215546Sopenharmony_ci if (prog->getType() == Program::TYPE_FRAGMENT) { 3095bf215546Sopenharmony_ci bld.mkInterp(NV50_IR_INTERP_FLAT, i->getDef(0), addr, NULL); 3096bf215546Sopenharmony_ci } else { 3097bf215546Sopenharmony_ci ld = bld.mkFetch(i->getDef(0), i->dType, 3098bf215546Sopenharmony_ci FILE_SHADER_INPUT, addr, i->getIndirect(0, 0), vtx); 3099bf215546Sopenharmony_ci ld->perPatch = i->perPatch; 3100bf215546Sopenharmony_ci } 3101bf215546Sopenharmony_ci break; 3102bf215546Sopenharmony_ci } 3103bf215546Sopenharmony_ci bld.getBB()->remove(i); 3104bf215546Sopenharmony_ci return true; 3105bf215546Sopenharmony_ci} 3106bf215546Sopenharmony_ci 3107bf215546Sopenharmony_cibool 3108bf215546Sopenharmony_ciNVC0LoweringPass::handleDIV(Instruction *i) 3109bf215546Sopenharmony_ci{ 3110bf215546Sopenharmony_ci if (!isFloatType(i->dType)) 3111bf215546Sopenharmony_ci return true; 3112bf215546Sopenharmony_ci bld.setPosition(i, false); 3113bf215546Sopenharmony_ci Instruction *rcp = bld.mkOp1(OP_RCP, i->dType, bld.getSSA(typeSizeof(i->dType)), i->getSrc(1)); 3114bf215546Sopenharmony_ci i->op = OP_MUL; 3115bf215546Sopenharmony_ci i->setSrc(1, rcp->getDef(0)); 3116bf215546Sopenharmony_ci return true; 3117bf215546Sopenharmony_ci} 3118bf215546Sopenharmony_ci 3119bf215546Sopenharmony_cibool 3120bf215546Sopenharmony_ciNVC0LoweringPass::handleMOD(Instruction *i) 3121bf215546Sopenharmony_ci{ 3122bf215546Sopenharmony_ci if (!isFloatType(i->dType)) 3123bf215546Sopenharmony_ci return true; 3124bf215546Sopenharmony_ci LValue *value = bld.getScratch(typeSizeof(i->dType)); 3125bf215546Sopenharmony_ci bld.mkOp1(OP_RCP, i->dType, value, i->getSrc(1)); 3126bf215546Sopenharmony_ci bld.mkOp2(OP_MUL, i->dType, value, i->getSrc(0), value); 3127bf215546Sopenharmony_ci bld.mkOp1(OP_TRUNC, i->dType, value, value); 3128bf215546Sopenharmony_ci bld.mkOp2(OP_MUL, i->dType, value, i->getSrc(1), value); 3129bf215546Sopenharmony_ci i->op = OP_SUB; 3130bf215546Sopenharmony_ci i->setSrc(1, value); 3131bf215546Sopenharmony_ci return true; 3132bf215546Sopenharmony_ci} 3133bf215546Sopenharmony_ci 3134bf215546Sopenharmony_cibool 3135bf215546Sopenharmony_ciNVC0LoweringPass::handleSQRT(Instruction *i) 3136bf215546Sopenharmony_ci{ 3137bf215546Sopenharmony_ci if (targ->isOpSupported(OP_SQRT, i->dType)) 3138bf215546Sopenharmony_ci return true; 3139bf215546Sopenharmony_ci 3140bf215546Sopenharmony_ci if (i->dType == TYPE_F64) { 3141bf215546Sopenharmony_ci Value *pred = bld.getSSA(1, FILE_PREDICATE); 3142bf215546Sopenharmony_ci Value *zero = bld.loadImm(NULL, 0.0); 3143bf215546Sopenharmony_ci Value *dst = bld.getSSA(8); 3144bf215546Sopenharmony_ci bld.mkOp1(OP_RSQ, i->dType, dst, i->getSrc(0)); 3145bf215546Sopenharmony_ci bld.mkCmp(OP_SET, CC_LE, i->dType, pred, i->dType, i->getSrc(0), zero); 3146bf215546Sopenharmony_ci bld.mkOp3(OP_SELP, TYPE_U64, dst, zero, dst, pred); 3147bf215546Sopenharmony_ci i->op = OP_MUL; 3148bf215546Sopenharmony_ci i->setSrc(1, dst); 3149bf215546Sopenharmony_ci // TODO: Handle this properly with a library function 3150bf215546Sopenharmony_ci } else { 3151bf215546Sopenharmony_ci bld.setPosition(i, true); 3152bf215546Sopenharmony_ci i->op = OP_RSQ; 3153bf215546Sopenharmony_ci bld.mkOp1(OP_RCP, i->dType, i->getDef(0), i->getDef(0)); 3154bf215546Sopenharmony_ci } 3155bf215546Sopenharmony_ci 3156bf215546Sopenharmony_ci return true; 3157bf215546Sopenharmony_ci} 3158bf215546Sopenharmony_ci 3159bf215546Sopenharmony_cibool 3160bf215546Sopenharmony_ciNVC0LoweringPass::handlePOW(Instruction *i) 3161bf215546Sopenharmony_ci{ 3162bf215546Sopenharmony_ci LValue *val = bld.getScratch(); 3163bf215546Sopenharmony_ci 3164bf215546Sopenharmony_ci bld.mkOp1(OP_LG2, TYPE_F32, val, i->getSrc(0)); 3165bf215546Sopenharmony_ci bld.mkOp2(OP_MUL, TYPE_F32, val, i->getSrc(1), val)->dnz = 1; 3166bf215546Sopenharmony_ci bld.mkOp1(OP_PREEX2, TYPE_F32, val, val); 3167bf215546Sopenharmony_ci 3168bf215546Sopenharmony_ci i->op = OP_EX2; 3169bf215546Sopenharmony_ci i->setSrc(0, val); 3170bf215546Sopenharmony_ci i->setSrc(1, NULL); 3171bf215546Sopenharmony_ci 3172bf215546Sopenharmony_ci return true; 3173bf215546Sopenharmony_ci} 3174bf215546Sopenharmony_ci 3175bf215546Sopenharmony_cibool 3176bf215546Sopenharmony_ciNVC0LoweringPass::handleEXPORT(Instruction *i) 3177bf215546Sopenharmony_ci{ 3178bf215546Sopenharmony_ci if (prog->getType() == Program::TYPE_FRAGMENT) { 3179bf215546Sopenharmony_ci int id = i->getSrc(0)->reg.data.offset / 4; 3180bf215546Sopenharmony_ci 3181bf215546Sopenharmony_ci if (i->src(0).isIndirect(0)) // TODO, ugly 3182bf215546Sopenharmony_ci return false; 3183bf215546Sopenharmony_ci i->op = OP_MOV; 3184bf215546Sopenharmony_ci i->subOp = NV50_IR_SUBOP_MOV_FINAL; 3185bf215546Sopenharmony_ci i->src(0).set(i->src(1)); 3186bf215546Sopenharmony_ci i->setSrc(1, NULL); 3187bf215546Sopenharmony_ci i->setDef(0, new_LValue(func, FILE_GPR)); 3188bf215546Sopenharmony_ci i->getDef(0)->reg.data.id = id; 3189bf215546Sopenharmony_ci 3190bf215546Sopenharmony_ci prog->maxGPR = MAX2(prog->maxGPR, id); 3191bf215546Sopenharmony_ci } else 3192bf215546Sopenharmony_ci if (prog->getType() == Program::TYPE_GEOMETRY) { 3193bf215546Sopenharmony_ci i->setIndirect(0, 1, gpEmitAddress); 3194bf215546Sopenharmony_ci } 3195bf215546Sopenharmony_ci return true; 3196bf215546Sopenharmony_ci} 3197bf215546Sopenharmony_ci 3198bf215546Sopenharmony_cibool 3199bf215546Sopenharmony_ciNVC0LoweringPass::handleOUT(Instruction *i) 3200bf215546Sopenharmony_ci{ 3201bf215546Sopenharmony_ci Instruction *prev = i->prev; 3202bf215546Sopenharmony_ci ImmediateValue stream, prevStream; 3203bf215546Sopenharmony_ci 3204bf215546Sopenharmony_ci // Only merge if the stream ids match. Also, note that the previous 3205bf215546Sopenharmony_ci // instruction would have already been lowered, so we take arg1 from it. 3206bf215546Sopenharmony_ci if (i->op == OP_RESTART && prev && prev->op == OP_EMIT && 3207bf215546Sopenharmony_ci i->src(0).getImmediate(stream) && 3208bf215546Sopenharmony_ci prev->src(1).getImmediate(prevStream) && 3209bf215546Sopenharmony_ci stream.reg.data.u32 == prevStream.reg.data.u32) { 3210bf215546Sopenharmony_ci i->prev->subOp = NV50_IR_SUBOP_EMIT_RESTART; 3211bf215546Sopenharmony_ci delete_Instruction(prog, i); 3212bf215546Sopenharmony_ci } else { 3213bf215546Sopenharmony_ci assert(gpEmitAddress); 3214bf215546Sopenharmony_ci i->setDef(0, gpEmitAddress); 3215bf215546Sopenharmony_ci i->setSrc(1, i->getSrc(0)); 3216bf215546Sopenharmony_ci i->setSrc(0, gpEmitAddress); 3217bf215546Sopenharmony_ci } 3218bf215546Sopenharmony_ci return true; 3219bf215546Sopenharmony_ci} 3220bf215546Sopenharmony_ci 3221bf215546Sopenharmony_ciValue * 3222bf215546Sopenharmony_ciNVC0LoweringPass::calculateSampleOffset(Value *sampleID) 3223bf215546Sopenharmony_ci{ 3224bf215546Sopenharmony_ci Value *offset = bld.getScratch(); 3225bf215546Sopenharmony_ci if (targ->getChipset() >= NVISA_GM200_CHIPSET) { 3226bf215546Sopenharmony_ci // Sample location offsets (in bytes) are calculated like so: 3227bf215546Sopenharmony_ci // offset = (SV_POSITION.y % 4 * 2) + (SV_POSITION.x % 2) 3228bf215546Sopenharmony_ci // offset = offset * 32 + sampleID % 8 * 4; 3229bf215546Sopenharmony_ci // which is equivalent to: 3230bf215546Sopenharmony_ci // offset = (SV_POSITION.y & 0x3) << 6 + (SV_POSITION.x & 0x1) << 5; 3231bf215546Sopenharmony_ci // offset += sampleID << 2 3232bf215546Sopenharmony_ci 3233bf215546Sopenharmony_ci // The second operand (src1) of the INSBF instructions are like so: 3234bf215546Sopenharmony_ci // 0xssll where ss is the size and ll is the offset. 3235bf215546Sopenharmony_ci // so: dest = src2 | (src0 & (1 << ss - 1)) << ll 3236bf215546Sopenharmony_ci 3237bf215546Sopenharmony_ci // Add sample ID (offset = (sampleID & 0x7) << 2) 3238bf215546Sopenharmony_ci bld.mkOp3(OP_INSBF, TYPE_U32, offset, sampleID, bld.mkImm(0x0302), bld.mkImm(0x0)); 3239bf215546Sopenharmony_ci 3240bf215546Sopenharmony_ci Symbol *xSym = bld.mkSysVal(SV_POSITION, 0); 3241bf215546Sopenharmony_ci Symbol *ySym = bld.mkSysVal(SV_POSITION, 1); 3242bf215546Sopenharmony_ci Value *coord = bld.getScratch(); 3243bf215546Sopenharmony_ci 3244bf215546Sopenharmony_ci // Add X coordinate (offset |= (SV_POSITION.x & 0x1) << 5) 3245bf215546Sopenharmony_ci bld.mkInterp(NV50_IR_INTERP_LINEAR, coord, 3246bf215546Sopenharmony_ci targ->getSVAddress(FILE_SHADER_INPUT, xSym), NULL); 3247bf215546Sopenharmony_ci bld.mkCvt(OP_CVT, TYPE_U32, coord, TYPE_F32, coord) 3248bf215546Sopenharmony_ci ->rnd = ROUND_ZI; 3249bf215546Sopenharmony_ci bld.mkOp3(OP_INSBF, TYPE_U32, offset, coord, bld.mkImm(0x0105), offset); 3250bf215546Sopenharmony_ci 3251bf215546Sopenharmony_ci // Add Y coordinate (offset |= (SV_POSITION.y & 0x3) << 6) 3252bf215546Sopenharmony_ci bld.mkInterp(NV50_IR_INTERP_LINEAR, coord, 3253bf215546Sopenharmony_ci targ->getSVAddress(FILE_SHADER_INPUT, ySym), NULL); 3254bf215546Sopenharmony_ci bld.mkCvt(OP_CVT, TYPE_U32, coord, TYPE_F32, coord) 3255bf215546Sopenharmony_ci ->rnd = ROUND_ZI; 3256bf215546Sopenharmony_ci bld.mkOp3(OP_INSBF, TYPE_U32, offset, coord, bld.mkImm(0x0206), offset); 3257bf215546Sopenharmony_ci } else { 3258bf215546Sopenharmony_ci bld.mkOp2(OP_SHL, TYPE_U32, offset, sampleID, bld.mkImm(3)); 3259bf215546Sopenharmony_ci } 3260bf215546Sopenharmony_ci return offset; 3261bf215546Sopenharmony_ci} 3262bf215546Sopenharmony_ci 3263bf215546Sopenharmony_ci// Handle programmable sample locations for GM20x+ 3264bf215546Sopenharmony_civoid 3265bf215546Sopenharmony_ciNVC0LoweringPass::handlePIXLD(Instruction *i) 3266bf215546Sopenharmony_ci{ 3267bf215546Sopenharmony_ci if (i->subOp != NV50_IR_SUBOP_PIXLD_OFFSET) 3268bf215546Sopenharmony_ci return; 3269bf215546Sopenharmony_ci if (targ->getChipset() < NVISA_GM200_CHIPSET) 3270bf215546Sopenharmony_ci return; 3271bf215546Sopenharmony_ci 3272bf215546Sopenharmony_ci assert(prog->driver_out->prop.fp.readsSampleLocations); 3273bf215546Sopenharmony_ci 3274bf215546Sopenharmony_ci bld.mkLoad(TYPE_F32, 3275bf215546Sopenharmony_ci i->getDef(0), 3276bf215546Sopenharmony_ci bld.mkSymbol( 3277bf215546Sopenharmony_ci FILE_MEMORY_CONST, prog->driver->io.auxCBSlot, 3278bf215546Sopenharmony_ci TYPE_U32, prog->driver->io.sampleInfoBase), 3279bf215546Sopenharmony_ci calculateSampleOffset(i->getSrc(0))); 3280bf215546Sopenharmony_ci 3281bf215546Sopenharmony_ci bld.getBB()->remove(i); 3282bf215546Sopenharmony_ci} 3283bf215546Sopenharmony_ci 3284bf215546Sopenharmony_ci// Generate a binary predicate if an instruction is predicated by 3285bf215546Sopenharmony_ci// e.g. an f32 value. 3286bf215546Sopenharmony_civoid 3287bf215546Sopenharmony_ciNVC0LoweringPass::checkPredicate(Instruction *insn) 3288bf215546Sopenharmony_ci{ 3289bf215546Sopenharmony_ci Value *pred = insn->getPredicate(); 3290bf215546Sopenharmony_ci Value *pdst; 3291bf215546Sopenharmony_ci 3292bf215546Sopenharmony_ci if (!pred || pred->reg.file == FILE_PREDICATE) 3293bf215546Sopenharmony_ci return; 3294bf215546Sopenharmony_ci pdst = new_LValue(func, FILE_PREDICATE); 3295bf215546Sopenharmony_ci 3296bf215546Sopenharmony_ci // CAUTION: don't use pdst->getInsn, the definition might not be unique, 3297bf215546Sopenharmony_ci // delay turning PSET(FSET(x,y),0) into PSET(x,y) to a later pass 3298bf215546Sopenharmony_ci 3299bf215546Sopenharmony_ci bld.mkCmp(OP_SET, CC_NEU, insn->dType, pdst, insn->dType, bld.mkImm(0), pred); 3300bf215546Sopenharmony_ci 3301bf215546Sopenharmony_ci insn->setPredicate(insn->cc, pdst); 3302bf215546Sopenharmony_ci} 3303bf215546Sopenharmony_ci 3304bf215546Sopenharmony_ci// 3305bf215546Sopenharmony_ci// - add quadop dance for texturing 3306bf215546Sopenharmony_ci// - put FP outputs in GPRs 3307bf215546Sopenharmony_ci// - convert instruction sequences 3308bf215546Sopenharmony_ci// 3309bf215546Sopenharmony_cibool 3310bf215546Sopenharmony_ciNVC0LoweringPass::visit(Instruction *i) 3311bf215546Sopenharmony_ci{ 3312bf215546Sopenharmony_ci bool ret = true; 3313bf215546Sopenharmony_ci bld.setPosition(i, false); 3314bf215546Sopenharmony_ci 3315bf215546Sopenharmony_ci if (i->cc != CC_ALWAYS) 3316bf215546Sopenharmony_ci checkPredicate(i); 3317bf215546Sopenharmony_ci 3318bf215546Sopenharmony_ci switch (i->op) { 3319bf215546Sopenharmony_ci case OP_TEX: 3320bf215546Sopenharmony_ci case OP_TXB: 3321bf215546Sopenharmony_ci case OP_TXL: 3322bf215546Sopenharmony_ci case OP_TXF: 3323bf215546Sopenharmony_ci case OP_TXG: 3324bf215546Sopenharmony_ci return handleTEX(i->asTex()); 3325bf215546Sopenharmony_ci case OP_TXD: 3326bf215546Sopenharmony_ci return handleTXD(i->asTex()); 3327bf215546Sopenharmony_ci case OP_TXLQ: 3328bf215546Sopenharmony_ci return handleTXLQ(i->asTex()); 3329bf215546Sopenharmony_ci case OP_TXQ: 3330bf215546Sopenharmony_ci return handleTXQ(i->asTex()); 3331bf215546Sopenharmony_ci case OP_EX2: 3332bf215546Sopenharmony_ci bld.mkOp1(OP_PREEX2, TYPE_F32, i->getDef(0), i->getSrc(0)); 3333bf215546Sopenharmony_ci i->setSrc(0, i->getDef(0)); 3334bf215546Sopenharmony_ci break; 3335bf215546Sopenharmony_ci case OP_POW: 3336bf215546Sopenharmony_ci return handlePOW(i); 3337bf215546Sopenharmony_ci case OP_DIV: 3338bf215546Sopenharmony_ci return handleDIV(i); 3339bf215546Sopenharmony_ci case OP_MOD: 3340bf215546Sopenharmony_ci return handleMOD(i); 3341bf215546Sopenharmony_ci case OP_SQRT: 3342bf215546Sopenharmony_ci return handleSQRT(i); 3343bf215546Sopenharmony_ci case OP_EXPORT: 3344bf215546Sopenharmony_ci ret = handleEXPORT(i); 3345bf215546Sopenharmony_ci break; 3346bf215546Sopenharmony_ci case OP_EMIT: 3347bf215546Sopenharmony_ci case OP_RESTART: 3348bf215546Sopenharmony_ci return handleOUT(i); 3349bf215546Sopenharmony_ci case OP_RDSV: 3350bf215546Sopenharmony_ci return handleRDSV(i); 3351bf215546Sopenharmony_ci case OP_WRSV: 3352bf215546Sopenharmony_ci return handleWRSV(i); 3353bf215546Sopenharmony_ci case OP_STORE: 3354bf215546Sopenharmony_ci case OP_LOAD: 3355bf215546Sopenharmony_ci handleLDST(i); 3356bf215546Sopenharmony_ci break; 3357bf215546Sopenharmony_ci case OP_ATOM: 3358bf215546Sopenharmony_ci { 3359bf215546Sopenharmony_ci const bool cctl = i->src(0).getFile() == FILE_MEMORY_BUFFER; 3360bf215546Sopenharmony_ci handleATOM(i); 3361bf215546Sopenharmony_ci if (cctl) 3362bf215546Sopenharmony_ci handleATOMCctl(i); 3363bf215546Sopenharmony_ci handleCasExch(i); 3364bf215546Sopenharmony_ci } 3365bf215546Sopenharmony_ci break; 3366bf215546Sopenharmony_ci case OP_SULDB: 3367bf215546Sopenharmony_ci case OP_SULDP: 3368bf215546Sopenharmony_ci case OP_SUSTB: 3369bf215546Sopenharmony_ci case OP_SUSTP: 3370bf215546Sopenharmony_ci case OP_SUREDB: 3371bf215546Sopenharmony_ci case OP_SUREDP: 3372bf215546Sopenharmony_ci if (targ->getChipset() >= NVISA_GM107_CHIPSET) 3373bf215546Sopenharmony_ci handleSurfaceOpGM107(i->asTex()); 3374bf215546Sopenharmony_ci else if (targ->getChipset() >= NVISA_GK104_CHIPSET) 3375bf215546Sopenharmony_ci handleSurfaceOpNVE4(i->asTex()); 3376bf215546Sopenharmony_ci else 3377bf215546Sopenharmony_ci handleSurfaceOpNVC0(i->asTex()); 3378bf215546Sopenharmony_ci break; 3379bf215546Sopenharmony_ci case OP_SUQ: 3380bf215546Sopenharmony_ci handleSUQ(i->asTex()); 3381bf215546Sopenharmony_ci break; 3382bf215546Sopenharmony_ci case OP_BUFQ: 3383bf215546Sopenharmony_ci handleBUFQ(i); 3384bf215546Sopenharmony_ci break; 3385bf215546Sopenharmony_ci case OP_PIXLD: 3386bf215546Sopenharmony_ci handlePIXLD(i); 3387bf215546Sopenharmony_ci break; 3388bf215546Sopenharmony_ci default: 3389bf215546Sopenharmony_ci break; 3390bf215546Sopenharmony_ci } 3391bf215546Sopenharmony_ci 3392bf215546Sopenharmony_ci /* Kepler+ has a special opcode to compute a new base address to be used 3393bf215546Sopenharmony_ci * for indirect loads. 3394bf215546Sopenharmony_ci * 3395bf215546Sopenharmony_ci * Maxwell+ has an additional similar requirement for indirect 3396bf215546Sopenharmony_ci * interpolation ops in frag shaders. 3397bf215546Sopenharmony_ci */ 3398bf215546Sopenharmony_ci bool doAfetch = false; 3399bf215546Sopenharmony_ci if (targ->getChipset() >= NVISA_GK104_CHIPSET && 3400bf215546Sopenharmony_ci !i->perPatch && 3401bf215546Sopenharmony_ci (i->op == OP_VFETCH || i->op == OP_EXPORT) && 3402bf215546Sopenharmony_ci i->src(0).isIndirect(0)) { 3403bf215546Sopenharmony_ci doAfetch = true; 3404bf215546Sopenharmony_ci } 3405bf215546Sopenharmony_ci if (targ->getChipset() >= NVISA_GM107_CHIPSET && 3406bf215546Sopenharmony_ci (i->op == OP_LINTERP || i->op == OP_PINTERP) && 3407bf215546Sopenharmony_ci i->src(0).isIndirect(0)) { 3408bf215546Sopenharmony_ci doAfetch = true; 3409bf215546Sopenharmony_ci } 3410bf215546Sopenharmony_ci 3411bf215546Sopenharmony_ci if (doAfetch) { 3412bf215546Sopenharmony_ci Value *addr = cloneShallow(func, i->getSrc(0)); 3413bf215546Sopenharmony_ci Instruction *afetch = bld.mkOp1(OP_AFETCH, TYPE_U32, bld.getSSA(), 3414bf215546Sopenharmony_ci i->getSrc(0)); 3415bf215546Sopenharmony_ci afetch->setIndirect(0, 0, i->getIndirect(0, 0)); 3416bf215546Sopenharmony_ci addr->reg.data.offset = 0; 3417bf215546Sopenharmony_ci i->setSrc(0, addr); 3418bf215546Sopenharmony_ci i->setIndirect(0, 0, afetch->getDef(0)); 3419bf215546Sopenharmony_ci i->subOp = NV50_IR_SUBOP_VFETCH_PHYS; 3420bf215546Sopenharmony_ci } 3421bf215546Sopenharmony_ci 3422bf215546Sopenharmony_ci return ret; 3423bf215546Sopenharmony_ci} 3424bf215546Sopenharmony_ci 3425bf215546Sopenharmony_cibool 3426bf215546Sopenharmony_ciTargetNVC0::runLegalizePass(Program *prog, CGStage stage) const 3427bf215546Sopenharmony_ci{ 3428bf215546Sopenharmony_ci if (stage == CG_STAGE_PRE_SSA) { 3429bf215546Sopenharmony_ci NVC0LoweringPass pass(prog); 3430bf215546Sopenharmony_ci return pass.run(prog, false, true); 3431bf215546Sopenharmony_ci } else 3432bf215546Sopenharmony_ci if (stage == CG_STAGE_POST_RA) { 3433bf215546Sopenharmony_ci NVC0LegalizePostRA pass(prog); 3434bf215546Sopenharmony_ci return pass.run(prog, false, true); 3435bf215546Sopenharmony_ci } else 3436bf215546Sopenharmony_ci if (stage == CG_STAGE_SSA) { 3437bf215546Sopenharmony_ci NVC0LegalizeSSA pass; 3438bf215546Sopenharmony_ci return pass.run(prog, false, true); 3439bf215546Sopenharmony_ci } 3440bf215546Sopenharmony_ci return false; 3441bf215546Sopenharmony_ci} 3442bf215546Sopenharmony_ci 3443bf215546Sopenharmony_ci} // namespace nv50_ir 3444