1bf215546Sopenharmony_ci/* 2bf215546Sopenharmony_ci * Copyright 2011 Christoph Bumiller 3bf215546Sopenharmony_ci * 4bf215546Sopenharmony_ci * Permission is hereby granted, free of charge, to any person obtaining a 5bf215546Sopenharmony_ci * copy of this software and associated documentation files (the "Software"), 6bf215546Sopenharmony_ci * to deal in the Software without restriction, including without limitation 7bf215546Sopenharmony_ci * the rights to use, copy, modify, merge, publish, distribute, sublicense, 8bf215546Sopenharmony_ci * and/or sell copies of the Software, and to permit persons to whom the 9bf215546Sopenharmony_ci * Software is furnished to do so, subject to the following conditions: 10bf215546Sopenharmony_ci * 11bf215546Sopenharmony_ci * The above copyright notice and this permission notice shall be included in 12bf215546Sopenharmony_ci * all copies or substantial portions of the Software. 13bf215546Sopenharmony_ci * 14bf215546Sopenharmony_ci * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 15bf215546Sopenharmony_ci * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 16bf215546Sopenharmony_ci * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 17bf215546Sopenharmony_ci * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR 18bf215546Sopenharmony_ci * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, 19bf215546Sopenharmony_ci * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR 20bf215546Sopenharmony_ci * OTHER DEALINGS IN THE SOFTWARE. 21bf215546Sopenharmony_ci */ 22bf215546Sopenharmony_ci 23bf215546Sopenharmony_ci#include "nv50_ir.h" 24bf215546Sopenharmony_ci#include "nv50_ir_build_util.h" 25bf215546Sopenharmony_ci 26bf215546Sopenharmony_ci#include "nv50_ir_target_nv50.h" 27bf215546Sopenharmony_ci 28bf215546Sopenharmony_ci#define NV50_SU_INFO_SIZE_X 0x00 29bf215546Sopenharmony_ci#define NV50_SU_INFO_SIZE_Y 0x04 30bf215546Sopenharmony_ci#define NV50_SU_INFO_SIZE_Z 0x08 31bf215546Sopenharmony_ci#define NV50_SU_INFO_BSIZE 0x0c 32bf215546Sopenharmony_ci#define NV50_SU_INFO_STRIDE_Y 0x10 33bf215546Sopenharmony_ci#define NV50_SU_INFO_MS_X 0x18 34bf215546Sopenharmony_ci#define NV50_SU_INFO_MS_Y 0x1c 35bf215546Sopenharmony_ci#define NV50_SU_INFO_TILE_SHIFT_X 0x20 36bf215546Sopenharmony_ci#define NV50_SU_INFO_TILE_SHIFT_Y 0x24 37bf215546Sopenharmony_ci#define NV50_SU_INFO_TILE_SHIFT_Z 0x28 38bf215546Sopenharmony_ci#define NV50_SU_INFO_OFFSET_Z 0x2c 39bf215546Sopenharmony_ci 40bf215546Sopenharmony_ci#define NV50_SU_INFO__STRIDE 0x30 41bf215546Sopenharmony_ci 42bf215546Sopenharmony_ci#define NV50_SU_INFO_SIZE(i) (0x00 + (i) * 4) 43bf215546Sopenharmony_ci#define NV50_SU_INFO_MS(i) (0x18 + (i) * 4) 44bf215546Sopenharmony_ci#define NV50_SU_INFO_TILE_SHIFT(i) (0x20 + (i) * 4) 45bf215546Sopenharmony_ci 46bf215546Sopenharmony_cinamespace nv50_ir { 47bf215546Sopenharmony_ci 48bf215546Sopenharmony_ci// nv50 doesn't support 32 bit integer multiplication 49bf215546Sopenharmony_ci// 50bf215546Sopenharmony_ci// ah al * bh bl = LO32: (al * bh + ah * bl) << 16 + (al * bl) 51bf215546Sopenharmony_ci// ------------------- 52bf215546Sopenharmony_ci// al*bh 00 HI32: (al * bh + ah * bl) >> 16 + (ah * bh) + 53bf215546Sopenharmony_ci// ah*bh 00 00 ( carry1) << 16 + ( carry2) 54bf215546Sopenharmony_ci// al*bl 55bf215546Sopenharmony_ci// ah*bl 00 56bf215546Sopenharmony_ci// 57bf215546Sopenharmony_ci// fffe0001 + fffe0001 58bf215546Sopenharmony_ci// 59bf215546Sopenharmony_ci// Note that this sort of splitting doesn't work for signed values, so we 60bf215546Sopenharmony_ci// compute the sign on those manually and then perform an unsigned multiply. 61bf215546Sopenharmony_cistatic bool 62bf215546Sopenharmony_ciexpandIntegerMUL(BuildUtil *bld, Instruction *mul) 63bf215546Sopenharmony_ci{ 64bf215546Sopenharmony_ci const bool highResult = mul->subOp == NV50_IR_SUBOP_MUL_HIGH; 65bf215546Sopenharmony_ci ImmediateValue src1; 66bf215546Sopenharmony_ci bool src1imm = mul->src(1).getImmediate(src1); 67bf215546Sopenharmony_ci 68bf215546Sopenharmony_ci DataType fTy; // full type 69bf215546Sopenharmony_ci switch (mul->sType) { 70bf215546Sopenharmony_ci case TYPE_S32: fTy = TYPE_U32; break; 71bf215546Sopenharmony_ci case TYPE_S64: fTy = TYPE_U64; break; 72bf215546Sopenharmony_ci default: fTy = mul->sType; break; 73bf215546Sopenharmony_ci } 74bf215546Sopenharmony_ci 75bf215546Sopenharmony_ci DataType hTy; // half type 76bf215546Sopenharmony_ci switch (fTy) { 77bf215546Sopenharmony_ci case TYPE_U32: hTy = TYPE_U16; break; 78bf215546Sopenharmony_ci case TYPE_U64: hTy = TYPE_U32; break; 79bf215546Sopenharmony_ci default: 80bf215546Sopenharmony_ci return false; 81bf215546Sopenharmony_ci } 82bf215546Sopenharmony_ci unsigned int fullSize = typeSizeof(fTy); 83bf215546Sopenharmony_ci unsigned int halfSize = typeSizeof(hTy); 84bf215546Sopenharmony_ci 85bf215546Sopenharmony_ci Instruction *i[9]; 86bf215546Sopenharmony_ci 87bf215546Sopenharmony_ci bld->setPosition(mul, true); 88bf215546Sopenharmony_ci 89bf215546Sopenharmony_ci Value *s[2]; 90bf215546Sopenharmony_ci Value *a[2], *b[2]; 91bf215546Sopenharmony_ci Value *t[4]; 92bf215546Sopenharmony_ci for (int j = 0; j < 4; ++j) 93bf215546Sopenharmony_ci t[j] = bld->getSSA(fullSize); 94bf215546Sopenharmony_ci 95bf215546Sopenharmony_ci if (isSignedType(mul->sType) && highResult) { 96bf215546Sopenharmony_ci s[0] = bld->getSSA(fullSize); 97bf215546Sopenharmony_ci s[1] = bld->getSSA(fullSize); 98bf215546Sopenharmony_ci bld->mkOp1(OP_ABS, mul->sType, s[0], mul->getSrc(0)); 99bf215546Sopenharmony_ci bld->mkOp1(OP_ABS, mul->sType, s[1], mul->getSrc(1)); 100bf215546Sopenharmony_ci src1.reg.data.s32 = abs(src1.reg.data.s32); 101bf215546Sopenharmony_ci } else { 102bf215546Sopenharmony_ci s[0] = mul->getSrc(0); 103bf215546Sopenharmony_ci s[1] = mul->getSrc(1); 104bf215546Sopenharmony_ci } 105bf215546Sopenharmony_ci 106bf215546Sopenharmony_ci // split sources into halves 107bf215546Sopenharmony_ci i[0] = bld->mkSplit(a, halfSize, s[0]); 108bf215546Sopenharmony_ci i[1] = bld->mkSplit(b, halfSize, s[1]); 109bf215546Sopenharmony_ci 110bf215546Sopenharmony_ci if (src1imm && (src1.reg.data.u32 & 0xffff0000) == 0) { 111bf215546Sopenharmony_ci i[2] = i[3] = bld->mkOp2(OP_MUL, fTy, t[1], a[1], 112bf215546Sopenharmony_ci bld->mkImm(src1.reg.data.u32 & 0xffff)); 113bf215546Sopenharmony_ci } else { 114bf215546Sopenharmony_ci i[2] = bld->mkOp2(OP_MUL, fTy, t[0], a[0], 115bf215546Sopenharmony_ci src1imm ? bld->mkImm(src1.reg.data.u32 >> 16) : b[1]); 116bf215546Sopenharmony_ci if (src1imm && (src1.reg.data.u32 & 0x0000ffff) == 0) { 117bf215546Sopenharmony_ci i[3] = i[2]; 118bf215546Sopenharmony_ci t[1] = t[0]; 119bf215546Sopenharmony_ci } else { 120bf215546Sopenharmony_ci i[3] = bld->mkOp3(OP_MAD, fTy, t[1], a[1], b[0], t[0]); 121bf215546Sopenharmony_ci } 122bf215546Sopenharmony_ci } 123bf215546Sopenharmony_ci i[7] = bld->mkOp2(OP_SHL, fTy, t[2], t[1], bld->mkImm(halfSize * 8)); 124bf215546Sopenharmony_ci if (src1imm && (src1.reg.data.u32 & 0x0000ffff) == 0) { 125bf215546Sopenharmony_ci i[4] = i[3]; 126bf215546Sopenharmony_ci t[3] = t[2]; 127bf215546Sopenharmony_ci } else { 128bf215546Sopenharmony_ci i[4] = bld->mkOp3(OP_MAD, fTy, t[3], a[0], b[0], t[2]); 129bf215546Sopenharmony_ci } 130bf215546Sopenharmony_ci 131bf215546Sopenharmony_ci if (highResult) { 132bf215546Sopenharmony_ci Value *c[2]; 133bf215546Sopenharmony_ci Value *r[5]; 134bf215546Sopenharmony_ci Value *imm = bld->loadImm(NULL, 1 << (halfSize * 8)); 135bf215546Sopenharmony_ci c[0] = bld->getSSA(1, FILE_FLAGS); 136bf215546Sopenharmony_ci c[1] = bld->getSSA(1, FILE_FLAGS); 137bf215546Sopenharmony_ci for (int j = 0; j < 5; ++j) 138bf215546Sopenharmony_ci r[j] = bld->getSSA(fullSize); 139bf215546Sopenharmony_ci 140bf215546Sopenharmony_ci i[8] = bld->mkOp2(OP_SHR, fTy, r[0], t[1], bld->mkImm(halfSize * 8)); 141bf215546Sopenharmony_ci i[6] = bld->mkOp2(OP_ADD, fTy, r[1], r[0], imm); 142bf215546Sopenharmony_ci bld->mkMov(r[3], r[0])->setPredicate(CC_NC, c[0]); 143bf215546Sopenharmony_ci bld->mkOp2(OP_UNION, TYPE_U32, r[2], r[1], r[3]); 144bf215546Sopenharmony_ci i[5] = bld->mkOp3(OP_MAD, fTy, r[4], a[1], b[1], r[2]); 145bf215546Sopenharmony_ci 146bf215546Sopenharmony_ci // set carry defs / sources 147bf215546Sopenharmony_ci i[3]->setFlagsDef(1, c[0]); 148bf215546Sopenharmony_ci // actual result required in negative case, but ignored for 149bf215546Sopenharmony_ci // unsigned. for some reason the compiler ends up dropping the whole 150bf215546Sopenharmony_ci // instruction if the destination is unused but the flags are. 151bf215546Sopenharmony_ci if (isSignedType(mul->sType)) 152bf215546Sopenharmony_ci i[4]->setFlagsDef(1, c[1]); 153bf215546Sopenharmony_ci else 154bf215546Sopenharmony_ci i[4]->setFlagsDef(0, c[1]); 155bf215546Sopenharmony_ci i[6]->setPredicate(CC_C, c[0]); 156bf215546Sopenharmony_ci i[5]->setFlagsSrc(3, c[1]); 157bf215546Sopenharmony_ci 158bf215546Sopenharmony_ci if (isSignedType(mul->sType)) { 159bf215546Sopenharmony_ci Value *cc[2]; 160bf215546Sopenharmony_ci Value *rr[7]; 161bf215546Sopenharmony_ci Value *one = bld->getSSA(fullSize); 162bf215546Sopenharmony_ci bld->loadImm(one, 1); 163bf215546Sopenharmony_ci for (int j = 0; j < 7; j++) 164bf215546Sopenharmony_ci rr[j] = bld->getSSA(fullSize); 165bf215546Sopenharmony_ci 166bf215546Sopenharmony_ci // NOTE: this logic uses predicates because splitting basic blocks is 167bf215546Sopenharmony_ci // ~impossible during the SSA phase. The RA relies on a correlation 168bf215546Sopenharmony_ci // between edge order and phi node sources. 169bf215546Sopenharmony_ci 170bf215546Sopenharmony_ci // Set the sign of the result based on the inputs 171bf215546Sopenharmony_ci bld->mkOp2(OP_XOR, fTy, NULL, mul->getSrc(0), mul->getSrc(1)) 172bf215546Sopenharmony_ci ->setFlagsDef(0, (cc[0] = bld->getSSA(1, FILE_FLAGS))); 173bf215546Sopenharmony_ci 174bf215546Sopenharmony_ci // 1s complement of 64-bit value 175bf215546Sopenharmony_ci bld->mkOp1(OP_NOT, fTy, rr[0], r[4]) 176bf215546Sopenharmony_ci ->setPredicate(CC_S, cc[0]); 177bf215546Sopenharmony_ci bld->mkOp1(OP_NOT, fTy, rr[1], t[3]) 178bf215546Sopenharmony_ci ->setPredicate(CC_S, cc[0]); 179bf215546Sopenharmony_ci 180bf215546Sopenharmony_ci // add to low 32-bits, keep track of the carry 181bf215546Sopenharmony_ci Instruction *n = bld->mkOp2(OP_ADD, fTy, NULL, rr[1], one); 182bf215546Sopenharmony_ci n->setPredicate(CC_S, cc[0]); 183bf215546Sopenharmony_ci n->setFlagsDef(0, (cc[1] = bld->getSSA(1, FILE_FLAGS))); 184bf215546Sopenharmony_ci 185bf215546Sopenharmony_ci // If there was a carry, add 1 to the upper 32 bits 186bf215546Sopenharmony_ci // XXX: These get executed even if they shouldn't be 187bf215546Sopenharmony_ci bld->mkOp2(OP_ADD, fTy, rr[2], rr[0], one) 188bf215546Sopenharmony_ci ->setPredicate(CC_C, cc[1]); 189bf215546Sopenharmony_ci bld->mkMov(rr[3], rr[0]) 190bf215546Sopenharmony_ci ->setPredicate(CC_NC, cc[1]); 191bf215546Sopenharmony_ci bld->mkOp2(OP_UNION, fTy, rr[4], rr[2], rr[3]); 192bf215546Sopenharmony_ci 193bf215546Sopenharmony_ci // Merge the results from the negative and non-negative paths 194bf215546Sopenharmony_ci bld->mkMov(rr[5], rr[4]) 195bf215546Sopenharmony_ci ->setPredicate(CC_S, cc[0]); 196bf215546Sopenharmony_ci bld->mkMov(rr[6], r[4]) 197bf215546Sopenharmony_ci ->setPredicate(CC_NS, cc[0]); 198bf215546Sopenharmony_ci bld->mkOp2(OP_UNION, mul->sType, mul->getDef(0), rr[5], rr[6]); 199bf215546Sopenharmony_ci } else { 200bf215546Sopenharmony_ci bld->mkMov(mul->getDef(0), r[4]); 201bf215546Sopenharmony_ci } 202bf215546Sopenharmony_ci } else { 203bf215546Sopenharmony_ci bld->mkMov(mul->getDef(0), t[3]); 204bf215546Sopenharmony_ci } 205bf215546Sopenharmony_ci delete_Instruction(bld->getProgram(), mul); 206bf215546Sopenharmony_ci 207bf215546Sopenharmony_ci for (int j = 2; j <= (highResult ? 5 : 4); ++j) 208bf215546Sopenharmony_ci if (i[j]) 209bf215546Sopenharmony_ci i[j]->sType = hTy; 210bf215546Sopenharmony_ci 211bf215546Sopenharmony_ci return true; 212bf215546Sopenharmony_ci} 213bf215546Sopenharmony_ci 214bf215546Sopenharmony_ci#define QOP_ADD 0 215bf215546Sopenharmony_ci#define QOP_SUBR 1 216bf215546Sopenharmony_ci#define QOP_SUB 2 217bf215546Sopenharmony_ci#define QOP_MOV2 3 218bf215546Sopenharmony_ci 219bf215546Sopenharmony_ci// UL UR LL LR 220bf215546Sopenharmony_ci#define QUADOP(q, r, s, t) \ 221bf215546Sopenharmony_ci ((QOP_##q << 6) | (QOP_##r << 4) | \ 222bf215546Sopenharmony_ci (QOP_##s << 2) | (QOP_##t << 0)) 223bf215546Sopenharmony_ci 224bf215546Sopenharmony_ciclass NV50LegalizePostRA : public Pass 225bf215546Sopenharmony_ci{ 226bf215546Sopenharmony_cipublic: 227bf215546Sopenharmony_ci NV50LegalizePostRA() : r63(NULL) { } 228bf215546Sopenharmony_ci 229bf215546Sopenharmony_ciprivate: 230bf215546Sopenharmony_ci virtual bool visit(Function *); 231bf215546Sopenharmony_ci virtual bool visit(BasicBlock *); 232bf215546Sopenharmony_ci 233bf215546Sopenharmony_ci void handlePRERET(FlowInstruction *); 234bf215546Sopenharmony_ci void replaceZero(Instruction *); 235bf215546Sopenharmony_ci 236bf215546Sopenharmony_ci BuildUtil bld; 237bf215546Sopenharmony_ci 238bf215546Sopenharmony_ci LValue *r63; 239bf215546Sopenharmony_ci}; 240bf215546Sopenharmony_ci 241bf215546Sopenharmony_cibool 242bf215546Sopenharmony_ciNV50LegalizePostRA::visit(Function *fn) 243bf215546Sopenharmony_ci{ 244bf215546Sopenharmony_ci Program *prog = fn->getProgram(); 245bf215546Sopenharmony_ci 246bf215546Sopenharmony_ci r63 = new_LValue(fn, FILE_GPR); 247bf215546Sopenharmony_ci // GPR units on nv50 are in half-regs 248bf215546Sopenharmony_ci if (prog->maxGPR < 126) 249bf215546Sopenharmony_ci r63->reg.data.id = 63; 250bf215546Sopenharmony_ci else 251bf215546Sopenharmony_ci r63->reg.data.id = 127; 252bf215546Sopenharmony_ci 253bf215546Sopenharmony_ci // this is actually per-program, but we can do it all on visiting main() 254bf215546Sopenharmony_ci std::list<Instruction *> *outWrites = 255bf215546Sopenharmony_ci reinterpret_cast<std::list<Instruction *> *>(prog->targetPriv); 256bf215546Sopenharmony_ci 257bf215546Sopenharmony_ci if (outWrites) { 258bf215546Sopenharmony_ci for (std::list<Instruction *>::iterator it = outWrites->begin(); 259bf215546Sopenharmony_ci it != outWrites->end(); ++it) 260bf215546Sopenharmony_ci (*it)->getSrc(1)->defs.front()->getInsn()->setDef(0, (*it)->getSrc(0)); 261bf215546Sopenharmony_ci // instructions will be deleted on exit 262bf215546Sopenharmony_ci outWrites->clear(); 263bf215546Sopenharmony_ci } 264bf215546Sopenharmony_ci 265bf215546Sopenharmony_ci return true; 266bf215546Sopenharmony_ci} 267bf215546Sopenharmony_ci 268bf215546Sopenharmony_civoid 269bf215546Sopenharmony_ciNV50LegalizePostRA::replaceZero(Instruction *i) 270bf215546Sopenharmony_ci{ 271bf215546Sopenharmony_ci for (int s = 0; i->srcExists(s); ++s) { 272bf215546Sopenharmony_ci ImmediateValue *imm = i->getSrc(s)->asImm(); 273bf215546Sopenharmony_ci if (imm && imm->reg.data.u64 == 0) 274bf215546Sopenharmony_ci i->setSrc(s, r63); 275bf215546Sopenharmony_ci } 276bf215546Sopenharmony_ci} 277bf215546Sopenharmony_ci 278bf215546Sopenharmony_ci// Emulate PRERET: jump to the target and call to the origin from there 279bf215546Sopenharmony_ci// 280bf215546Sopenharmony_ci// WARNING: atm only works if BBs are affected by at most a single PRERET 281bf215546Sopenharmony_ci// 282bf215546Sopenharmony_ci// BB:0 283bf215546Sopenharmony_ci// preret BB:3 284bf215546Sopenharmony_ci// (...) 285bf215546Sopenharmony_ci// BB:3 286bf215546Sopenharmony_ci// (...) 287bf215546Sopenharmony_ci// ---> 288bf215546Sopenharmony_ci// BB:0 289bf215546Sopenharmony_ci// bra BB:3 + n0 (directly to the call; move to beginning of BB and fixate) 290bf215546Sopenharmony_ci// (...) 291bf215546Sopenharmony_ci// BB:3 292bf215546Sopenharmony_ci// bra BB:3 + n1 (skip the call) 293bf215546Sopenharmony_ci// call BB:0 + n2 (skip bra at beginning of BB:0) 294bf215546Sopenharmony_ci// (...) 295bf215546Sopenharmony_civoid 296bf215546Sopenharmony_ciNV50LegalizePostRA::handlePRERET(FlowInstruction *pre) 297bf215546Sopenharmony_ci{ 298bf215546Sopenharmony_ci BasicBlock *bbE = pre->bb; 299bf215546Sopenharmony_ci BasicBlock *bbT = pre->target.bb; 300bf215546Sopenharmony_ci 301bf215546Sopenharmony_ci pre->subOp = NV50_IR_SUBOP_EMU_PRERET + 0; 302bf215546Sopenharmony_ci bbE->remove(pre); 303bf215546Sopenharmony_ci bbE->insertHead(pre); 304bf215546Sopenharmony_ci 305bf215546Sopenharmony_ci Instruction *skip = new_FlowInstruction(func, OP_PRERET, bbT); 306bf215546Sopenharmony_ci Instruction *call = new_FlowInstruction(func, OP_PRERET, bbE); 307bf215546Sopenharmony_ci 308bf215546Sopenharmony_ci bbT->insertHead(call); 309bf215546Sopenharmony_ci bbT->insertHead(skip); 310bf215546Sopenharmony_ci 311bf215546Sopenharmony_ci // NOTE: maybe split blocks to prevent the instructions from moving ? 312bf215546Sopenharmony_ci 313bf215546Sopenharmony_ci skip->subOp = NV50_IR_SUBOP_EMU_PRERET + 1; 314bf215546Sopenharmony_ci call->subOp = NV50_IR_SUBOP_EMU_PRERET + 2; 315bf215546Sopenharmony_ci} 316bf215546Sopenharmony_ci 317bf215546Sopenharmony_cibool 318bf215546Sopenharmony_ciNV50LegalizePostRA::visit(BasicBlock *bb) 319bf215546Sopenharmony_ci{ 320bf215546Sopenharmony_ci Instruction *i, *next; 321bf215546Sopenharmony_ci 322bf215546Sopenharmony_ci // remove pseudo operations and non-fixed no-ops, split 64 bit operations 323bf215546Sopenharmony_ci for (i = bb->getFirst(); i; i = next) { 324bf215546Sopenharmony_ci next = i->next; 325bf215546Sopenharmony_ci if (i->isNop()) { 326bf215546Sopenharmony_ci bb->remove(i); 327bf215546Sopenharmony_ci } else 328bf215546Sopenharmony_ci if (i->op == OP_PRERET && prog->getTarget()->getChipset() < 0xa0) { 329bf215546Sopenharmony_ci handlePRERET(i->asFlow()); 330bf215546Sopenharmony_ci } else { 331bf215546Sopenharmony_ci // TODO: We will want to do this before register allocation, 332bf215546Sopenharmony_ci // since have to use a $c register for the carry flag. 333bf215546Sopenharmony_ci if (typeSizeof(i->dType) == 8) { 334bf215546Sopenharmony_ci Instruction *hi = BuildUtil::split64BitOpPostRA(func, i, r63, NULL); 335bf215546Sopenharmony_ci if (hi) 336bf215546Sopenharmony_ci next = hi; 337bf215546Sopenharmony_ci } 338bf215546Sopenharmony_ci 339bf215546Sopenharmony_ci if (i->op != OP_PFETCH && i->op != OP_BAR && 340bf215546Sopenharmony_ci (!i->defExists(0) || i->def(0).getFile() != FILE_ADDRESS)) 341bf215546Sopenharmony_ci replaceZero(i); 342bf215546Sopenharmony_ci } 343bf215546Sopenharmony_ci } 344bf215546Sopenharmony_ci if (!bb->getEntry()) 345bf215546Sopenharmony_ci return true; 346bf215546Sopenharmony_ci 347bf215546Sopenharmony_ci return true; 348bf215546Sopenharmony_ci} 349bf215546Sopenharmony_ci 350bf215546Sopenharmony_ciclass NV50LegalizeSSA : public Pass 351bf215546Sopenharmony_ci{ 352bf215546Sopenharmony_cipublic: 353bf215546Sopenharmony_ci NV50LegalizeSSA(Program *); 354bf215546Sopenharmony_ci 355bf215546Sopenharmony_ci virtual bool visit(BasicBlock *bb); 356bf215546Sopenharmony_ci 357bf215546Sopenharmony_ciprivate: 358bf215546Sopenharmony_ci void propagateWriteToOutput(Instruction *); 359bf215546Sopenharmony_ci void handleDIV(Instruction *); 360bf215546Sopenharmony_ci void handleMOD(Instruction *); 361bf215546Sopenharmony_ci void handleMUL(Instruction *); 362bf215546Sopenharmony_ci void handleAddrDef(Instruction *); 363bf215546Sopenharmony_ci 364bf215546Sopenharmony_ci inline bool isARL(const Instruction *) const; 365bf215546Sopenharmony_ci 366bf215546Sopenharmony_ci BuildUtil bld; 367bf215546Sopenharmony_ci 368bf215546Sopenharmony_ci std::list<Instruction *> *outWrites; 369bf215546Sopenharmony_ci}; 370bf215546Sopenharmony_ci 371bf215546Sopenharmony_ciNV50LegalizeSSA::NV50LegalizeSSA(Program *prog) 372bf215546Sopenharmony_ci{ 373bf215546Sopenharmony_ci bld.setProgram(prog); 374bf215546Sopenharmony_ci 375bf215546Sopenharmony_ci if (prog->optLevel >= 2 && 376bf215546Sopenharmony_ci (prog->getType() == Program::TYPE_GEOMETRY || 377bf215546Sopenharmony_ci prog->getType() == Program::TYPE_VERTEX)) 378bf215546Sopenharmony_ci outWrites = 379bf215546Sopenharmony_ci reinterpret_cast<std::list<Instruction *> *>(prog->targetPriv); 380bf215546Sopenharmony_ci else 381bf215546Sopenharmony_ci outWrites = NULL; 382bf215546Sopenharmony_ci} 383bf215546Sopenharmony_ci 384bf215546Sopenharmony_civoid 385bf215546Sopenharmony_ciNV50LegalizeSSA::propagateWriteToOutput(Instruction *st) 386bf215546Sopenharmony_ci{ 387bf215546Sopenharmony_ci if (st->src(0).isIndirect(0) || st->getSrc(1)->refCount() != 1) 388bf215546Sopenharmony_ci return; 389bf215546Sopenharmony_ci 390bf215546Sopenharmony_ci // check def instruction can store 391bf215546Sopenharmony_ci Instruction *di = st->getSrc(1)->defs.front()->getInsn(); 392bf215546Sopenharmony_ci 393bf215546Sopenharmony_ci // TODO: move exports (if beneficial) in common opt pass 394bf215546Sopenharmony_ci if (di->isPseudo() || isTextureOp(di->op) || di->defCount(0xff, true) > 1) 395bf215546Sopenharmony_ci return; 396bf215546Sopenharmony_ci 397bf215546Sopenharmony_ci for (int s = 0; di->srcExists(s); ++s) 398bf215546Sopenharmony_ci if (di->src(s).getFile() == FILE_IMMEDIATE || 399bf215546Sopenharmony_ci di->src(s).getFile() == FILE_MEMORY_LOCAL) 400bf215546Sopenharmony_ci return; 401bf215546Sopenharmony_ci 402bf215546Sopenharmony_ci if (prog->getType() == Program::TYPE_GEOMETRY) { 403bf215546Sopenharmony_ci // Only propagate output writes in geometry shaders when we can be sure 404bf215546Sopenharmony_ci // that we are propagating to the same output vertex. 405bf215546Sopenharmony_ci if (di->bb != st->bb) 406bf215546Sopenharmony_ci return; 407bf215546Sopenharmony_ci Instruction *i; 408bf215546Sopenharmony_ci for (i = di; i != st; i = i->next) { 409bf215546Sopenharmony_ci if (i->op == OP_EMIT || i->op == OP_RESTART) 410bf215546Sopenharmony_ci return; 411bf215546Sopenharmony_ci } 412bf215546Sopenharmony_ci assert(i); // st after di 413bf215546Sopenharmony_ci } 414bf215546Sopenharmony_ci 415bf215546Sopenharmony_ci // We cannot set defs to non-lvalues before register allocation, so 416bf215546Sopenharmony_ci // save & remove (to save registers) the exports and replace later. 417bf215546Sopenharmony_ci outWrites->push_back(st); 418bf215546Sopenharmony_ci st->bb->remove(st); 419bf215546Sopenharmony_ci} 420bf215546Sopenharmony_ci 421bf215546Sopenharmony_cibool 422bf215546Sopenharmony_ciNV50LegalizeSSA::isARL(const Instruction *i) const 423bf215546Sopenharmony_ci{ 424bf215546Sopenharmony_ci ImmediateValue imm; 425bf215546Sopenharmony_ci 426bf215546Sopenharmony_ci if (i->op != OP_SHL || i->src(0).getFile() != FILE_GPR) 427bf215546Sopenharmony_ci return false; 428bf215546Sopenharmony_ci if (!i->src(1).getImmediate(imm)) 429bf215546Sopenharmony_ci return false; 430bf215546Sopenharmony_ci return imm.isInteger(0); 431bf215546Sopenharmony_ci} 432bf215546Sopenharmony_ci 433bf215546Sopenharmony_civoid 434bf215546Sopenharmony_ciNV50LegalizeSSA::handleAddrDef(Instruction *i) 435bf215546Sopenharmony_ci{ 436bf215546Sopenharmony_ci Instruction *arl; 437bf215546Sopenharmony_ci 438bf215546Sopenharmony_ci i->getDef(0)->reg.size = 2; // $aX are only 16 bit 439bf215546Sopenharmony_ci 440bf215546Sopenharmony_ci // PFETCH can always write to $a 441bf215546Sopenharmony_ci if (i->op == OP_PFETCH) 442bf215546Sopenharmony_ci return; 443bf215546Sopenharmony_ci // only ADDR <- SHL(GPR, IMM) and ADDR <- ADD(ADDR, IMM) are valid 444bf215546Sopenharmony_ci if (i->srcExists(1) && i->src(1).getFile() == FILE_IMMEDIATE) { 445bf215546Sopenharmony_ci if (i->op == OP_SHL && i->src(0).getFile() == FILE_GPR) 446bf215546Sopenharmony_ci return; 447bf215546Sopenharmony_ci if (i->op == OP_ADD && i->src(0).getFile() == FILE_ADDRESS) 448bf215546Sopenharmony_ci return; 449bf215546Sopenharmony_ci } 450bf215546Sopenharmony_ci 451bf215546Sopenharmony_ci // turn $a sources into $r sources (can't operate on $a) 452bf215546Sopenharmony_ci for (int s = 0; i->srcExists(s); ++s) { 453bf215546Sopenharmony_ci Value *a = i->getSrc(s); 454bf215546Sopenharmony_ci Value *r; 455bf215546Sopenharmony_ci if (a->reg.file == FILE_ADDRESS) { 456bf215546Sopenharmony_ci if (a->getInsn() && isARL(a->getInsn())) { 457bf215546Sopenharmony_ci i->setSrc(s, a->getInsn()->getSrc(0)); 458bf215546Sopenharmony_ci } else { 459bf215546Sopenharmony_ci bld.setPosition(i, false); 460bf215546Sopenharmony_ci r = bld.getSSA(); 461bf215546Sopenharmony_ci bld.mkMov(r, a); 462bf215546Sopenharmony_ci i->setSrc(s, r); 463bf215546Sopenharmony_ci } 464bf215546Sopenharmony_ci } 465bf215546Sopenharmony_ci } 466bf215546Sopenharmony_ci if (i->op == OP_SHL && i->src(1).getFile() == FILE_IMMEDIATE) 467bf215546Sopenharmony_ci return; 468bf215546Sopenharmony_ci 469bf215546Sopenharmony_ci // turn result back into $a 470bf215546Sopenharmony_ci bld.setPosition(i, true); 471bf215546Sopenharmony_ci arl = bld.mkOp2(OP_SHL, TYPE_U32, i->getDef(0), bld.getSSA(), bld.mkImm(0)); 472bf215546Sopenharmony_ci i->setDef(0, arl->getSrc(0)); 473bf215546Sopenharmony_ci} 474bf215546Sopenharmony_ci 475bf215546Sopenharmony_civoid 476bf215546Sopenharmony_ciNV50LegalizeSSA::handleMUL(Instruction *mul) 477bf215546Sopenharmony_ci{ 478bf215546Sopenharmony_ci if (isFloatType(mul->sType) || typeSizeof(mul->sType) <= 2) 479bf215546Sopenharmony_ci return; 480bf215546Sopenharmony_ci Value *def = mul->getDef(0); 481bf215546Sopenharmony_ci Value *pred = mul->getPredicate(); 482bf215546Sopenharmony_ci CondCode cc = mul->cc; 483bf215546Sopenharmony_ci if (pred) 484bf215546Sopenharmony_ci mul->setPredicate(CC_ALWAYS, NULL); 485bf215546Sopenharmony_ci 486bf215546Sopenharmony_ci if (mul->op == OP_MAD) { 487bf215546Sopenharmony_ci Instruction *add = mul; 488bf215546Sopenharmony_ci bld.setPosition(add, false); 489bf215546Sopenharmony_ci Value *res = cloneShallow(func, mul->getDef(0)); 490bf215546Sopenharmony_ci mul = bld.mkOp2(OP_MUL, add->sType, res, add->getSrc(0), add->getSrc(1)); 491bf215546Sopenharmony_ci add->op = OP_ADD; 492bf215546Sopenharmony_ci add->setSrc(0, mul->getDef(0)); 493bf215546Sopenharmony_ci add->setSrc(1, add->getSrc(2)); 494bf215546Sopenharmony_ci for (int s = 2; add->srcExists(s); ++s) 495bf215546Sopenharmony_ci add->setSrc(s, NULL); 496bf215546Sopenharmony_ci mul->subOp = add->subOp; 497bf215546Sopenharmony_ci add->subOp = 0; 498bf215546Sopenharmony_ci } 499bf215546Sopenharmony_ci expandIntegerMUL(&bld, mul); 500bf215546Sopenharmony_ci if (pred) 501bf215546Sopenharmony_ci def->getInsn()->setPredicate(cc, pred); 502bf215546Sopenharmony_ci} 503bf215546Sopenharmony_ci 504bf215546Sopenharmony_ci// Use f32 division: first compute an approximate result, use it to reduce 505bf215546Sopenharmony_ci// the dividend, which should then be representable as f32, divide the reduced 506bf215546Sopenharmony_ci// dividend, and add the quotients. 507bf215546Sopenharmony_civoid 508bf215546Sopenharmony_ciNV50LegalizeSSA::handleDIV(Instruction *div) 509bf215546Sopenharmony_ci{ 510bf215546Sopenharmony_ci const DataType ty = div->sType; 511bf215546Sopenharmony_ci 512bf215546Sopenharmony_ci if (ty != TYPE_U32 && ty != TYPE_S32) 513bf215546Sopenharmony_ci return; 514bf215546Sopenharmony_ci 515bf215546Sopenharmony_ci Value *q, *q0, *qf, *aR, *aRf, *qRf, *qR, *t, *s, *m, *cond; 516bf215546Sopenharmony_ci 517bf215546Sopenharmony_ci bld.setPosition(div, false); 518bf215546Sopenharmony_ci 519bf215546Sopenharmony_ci Value *a, *af = bld.getSSA(); 520bf215546Sopenharmony_ci Value *b, *bf = bld.getSSA(); 521bf215546Sopenharmony_ci 522bf215546Sopenharmony_ci bld.mkCvt(OP_CVT, TYPE_F32, af, ty, div->getSrc(0)); 523bf215546Sopenharmony_ci bld.mkCvt(OP_CVT, TYPE_F32, bf, ty, div->getSrc(1)); 524bf215546Sopenharmony_ci 525bf215546Sopenharmony_ci if (isSignedType(ty)) { 526bf215546Sopenharmony_ci af->getInsn()->src(0).mod = Modifier(NV50_IR_MOD_ABS); 527bf215546Sopenharmony_ci bf->getInsn()->src(0).mod = Modifier(NV50_IR_MOD_ABS); 528bf215546Sopenharmony_ci a = bld.getSSA(); 529bf215546Sopenharmony_ci b = bld.getSSA(); 530bf215546Sopenharmony_ci bld.mkOp1(OP_ABS, ty, a, div->getSrc(0)); 531bf215546Sopenharmony_ci bld.mkOp1(OP_ABS, ty, b, div->getSrc(1)); 532bf215546Sopenharmony_ci } else { 533bf215546Sopenharmony_ci a = div->getSrc(0); 534bf215546Sopenharmony_ci b = div->getSrc(1); 535bf215546Sopenharmony_ci } 536bf215546Sopenharmony_ci 537bf215546Sopenharmony_ci bf = bld.mkOp1v(OP_RCP, TYPE_F32, bld.getSSA(), bf); 538bf215546Sopenharmony_ci bf = bld.mkOp2v(OP_ADD, TYPE_U32, bld.getSSA(), bf, bld.mkImm(-2)); 539bf215546Sopenharmony_ci 540bf215546Sopenharmony_ci bld.mkOp2(OP_MUL, TYPE_F32, (qf = bld.getSSA()), af, bf)->rnd = ROUND_Z; 541bf215546Sopenharmony_ci bld.mkCvt(OP_CVT, ty, (q0 = bld.getSSA()), TYPE_F32, qf)->rnd = ROUND_Z; 542bf215546Sopenharmony_ci 543bf215546Sopenharmony_ci // get error of 1st result 544bf215546Sopenharmony_ci expandIntegerMUL(&bld, 545bf215546Sopenharmony_ci bld.mkOp2(OP_MUL, TYPE_U32, (t = bld.getSSA()), q0, b)); 546bf215546Sopenharmony_ci bld.mkOp2(OP_SUB, TYPE_U32, (aRf = bld.getSSA()), a, t); 547bf215546Sopenharmony_ci 548bf215546Sopenharmony_ci bld.mkCvt(OP_CVT, TYPE_F32, (aR = bld.getSSA()), TYPE_U32, aRf); 549bf215546Sopenharmony_ci 550bf215546Sopenharmony_ci bld.mkOp2(OP_MUL, TYPE_F32, (qRf = bld.getSSA()), aR, bf)->rnd = ROUND_Z; 551bf215546Sopenharmony_ci bld.mkCvt(OP_CVT, TYPE_U32, (qR = bld.getSSA()), TYPE_F32, qRf) 552bf215546Sopenharmony_ci ->rnd = ROUND_Z; 553bf215546Sopenharmony_ci bld.mkOp2(OP_ADD, ty, (q = bld.getSSA()), q0, qR); // add quotients 554bf215546Sopenharmony_ci 555bf215546Sopenharmony_ci // correction: if modulus >= divisor, add 1 556bf215546Sopenharmony_ci expandIntegerMUL(&bld, 557bf215546Sopenharmony_ci bld.mkOp2(OP_MUL, TYPE_U32, (t = bld.getSSA()), q, b)); 558bf215546Sopenharmony_ci bld.mkOp2(OP_SUB, TYPE_U32, (m = bld.getSSA()), a, t); 559bf215546Sopenharmony_ci bld.mkCmp(OP_SET, CC_GE, TYPE_U32, (s = bld.getSSA()), TYPE_U32, m, b); 560bf215546Sopenharmony_ci if (!isSignedType(ty)) { 561bf215546Sopenharmony_ci div->op = OP_SUB; 562bf215546Sopenharmony_ci div->setSrc(0, q); 563bf215546Sopenharmony_ci div->setSrc(1, s); 564bf215546Sopenharmony_ci } else { 565bf215546Sopenharmony_ci t = q; 566bf215546Sopenharmony_ci bld.mkOp2(OP_SUB, TYPE_U32, (q = bld.getSSA()), t, s); 567bf215546Sopenharmony_ci s = bld.getSSA(); 568bf215546Sopenharmony_ci t = bld.getSSA(); 569bf215546Sopenharmony_ci // fix the sign 570bf215546Sopenharmony_ci bld.mkOp2(OP_XOR, TYPE_U32, NULL, div->getSrc(0), div->getSrc(1)) 571bf215546Sopenharmony_ci ->setFlagsDef(0, (cond = bld.getSSA(1, FILE_FLAGS))); 572bf215546Sopenharmony_ci bld.mkOp1(OP_NEG, ty, s, q)->setPredicate(CC_S, cond); 573bf215546Sopenharmony_ci bld.mkOp1(OP_MOV, ty, t, q)->setPredicate(CC_NS, cond); 574bf215546Sopenharmony_ci 575bf215546Sopenharmony_ci div->op = OP_UNION; 576bf215546Sopenharmony_ci div->setSrc(0, s); 577bf215546Sopenharmony_ci div->setSrc(1, t); 578bf215546Sopenharmony_ci } 579bf215546Sopenharmony_ci} 580bf215546Sopenharmony_ci 581bf215546Sopenharmony_civoid 582bf215546Sopenharmony_ciNV50LegalizeSSA::handleMOD(Instruction *mod) 583bf215546Sopenharmony_ci{ 584bf215546Sopenharmony_ci if (mod->dType != TYPE_U32 && mod->dType != TYPE_S32) 585bf215546Sopenharmony_ci return; 586bf215546Sopenharmony_ci bld.setPosition(mod, false); 587bf215546Sopenharmony_ci 588bf215546Sopenharmony_ci Value *q = bld.getSSA(); 589bf215546Sopenharmony_ci Value *m = bld.getSSA(); 590bf215546Sopenharmony_ci 591bf215546Sopenharmony_ci bld.mkOp2(OP_DIV, mod->dType, q, mod->getSrc(0), mod->getSrc(1)); 592bf215546Sopenharmony_ci handleDIV(q->getInsn()); 593bf215546Sopenharmony_ci 594bf215546Sopenharmony_ci bld.setPosition(mod, false); 595bf215546Sopenharmony_ci expandIntegerMUL(&bld, bld.mkOp2(OP_MUL, TYPE_U32, m, q, mod->getSrc(1))); 596bf215546Sopenharmony_ci 597bf215546Sopenharmony_ci mod->op = OP_SUB; 598bf215546Sopenharmony_ci mod->setSrc(1, m); 599bf215546Sopenharmony_ci} 600bf215546Sopenharmony_ci 601bf215546Sopenharmony_cibool 602bf215546Sopenharmony_ciNV50LegalizeSSA::visit(BasicBlock *bb) 603bf215546Sopenharmony_ci{ 604bf215546Sopenharmony_ci Instruction *insn, *next; 605bf215546Sopenharmony_ci // skipping PHIs (don't pass them to handleAddrDef) ! 606bf215546Sopenharmony_ci for (insn = bb->getEntry(); insn; insn = next) { 607bf215546Sopenharmony_ci next = insn->next; 608bf215546Sopenharmony_ci 609bf215546Sopenharmony_ci if (insn->defExists(0) && insn->getDef(0)->reg.file == FILE_ADDRESS) 610bf215546Sopenharmony_ci handleAddrDef(insn); 611bf215546Sopenharmony_ci 612bf215546Sopenharmony_ci switch (insn->op) { 613bf215546Sopenharmony_ci case OP_EXPORT: 614bf215546Sopenharmony_ci if (outWrites) 615bf215546Sopenharmony_ci propagateWriteToOutput(insn); 616bf215546Sopenharmony_ci break; 617bf215546Sopenharmony_ci case OP_DIV: 618bf215546Sopenharmony_ci handleDIV(insn); 619bf215546Sopenharmony_ci break; 620bf215546Sopenharmony_ci case OP_MOD: 621bf215546Sopenharmony_ci handleMOD(insn); 622bf215546Sopenharmony_ci break; 623bf215546Sopenharmony_ci case OP_MAD: 624bf215546Sopenharmony_ci case OP_MUL: 625bf215546Sopenharmony_ci handleMUL(insn); 626bf215546Sopenharmony_ci break; 627bf215546Sopenharmony_ci default: 628bf215546Sopenharmony_ci break; 629bf215546Sopenharmony_ci } 630bf215546Sopenharmony_ci } 631bf215546Sopenharmony_ci return true; 632bf215546Sopenharmony_ci} 633bf215546Sopenharmony_ci 634bf215546Sopenharmony_ciclass NV50LoweringPreSSA : public Pass 635bf215546Sopenharmony_ci{ 636bf215546Sopenharmony_cipublic: 637bf215546Sopenharmony_ci NV50LoweringPreSSA(Program *); 638bf215546Sopenharmony_ci 639bf215546Sopenharmony_ciprivate: 640bf215546Sopenharmony_ci virtual bool visit(Instruction *); 641bf215546Sopenharmony_ci virtual bool visit(Function *); 642bf215546Sopenharmony_ci 643bf215546Sopenharmony_ci bool handleRDSV(Instruction *); 644bf215546Sopenharmony_ci bool handleWRSV(Instruction *); 645bf215546Sopenharmony_ci 646bf215546Sopenharmony_ci bool handlePFETCH(Instruction *); 647bf215546Sopenharmony_ci bool handleEXPORT(Instruction *); 648bf215546Sopenharmony_ci bool handleLOAD(Instruction *); 649bf215546Sopenharmony_ci bool handleLDST(Instruction *); 650bf215546Sopenharmony_ci bool handleMEMBAR(Instruction *); 651bf215546Sopenharmony_ci bool handleSharedATOM(Instruction *); 652bf215546Sopenharmony_ci bool handleSULDP(TexInstruction *); 653bf215546Sopenharmony_ci bool handleSUREDP(TexInstruction *); 654bf215546Sopenharmony_ci bool handleSUSTP(TexInstruction *); 655bf215546Sopenharmony_ci Value *processSurfaceCoords(TexInstruction *); 656bf215546Sopenharmony_ci 657bf215546Sopenharmony_ci bool handleDIV(Instruction *); 658bf215546Sopenharmony_ci bool handleSQRT(Instruction *); 659bf215546Sopenharmony_ci bool handlePOW(Instruction *); 660bf215546Sopenharmony_ci 661bf215546Sopenharmony_ci bool handleSET(Instruction *); 662bf215546Sopenharmony_ci bool handleSLCT(CmpInstruction *); 663bf215546Sopenharmony_ci bool handleSELP(Instruction *); 664bf215546Sopenharmony_ci 665bf215546Sopenharmony_ci bool handleTEX(TexInstruction *); 666bf215546Sopenharmony_ci bool handleTXB(TexInstruction *); // I really 667bf215546Sopenharmony_ci bool handleTXL(TexInstruction *); // hate 668bf215546Sopenharmony_ci bool handleTXD(TexInstruction *); // these 3 669bf215546Sopenharmony_ci bool handleTXLQ(TexInstruction *); 670bf215546Sopenharmony_ci bool handleTXQ(TexInstruction *); 671bf215546Sopenharmony_ci bool handleSUQ(TexInstruction *); 672bf215546Sopenharmony_ci bool handleBUFQ(Instruction *); 673bf215546Sopenharmony_ci 674bf215546Sopenharmony_ci bool handleCALL(Instruction *); 675bf215546Sopenharmony_ci bool handlePRECONT(Instruction *); 676bf215546Sopenharmony_ci bool handleCONT(Instruction *); 677bf215546Sopenharmony_ci 678bf215546Sopenharmony_ci void checkPredicate(Instruction *); 679bf215546Sopenharmony_ci void loadTexMsInfo(uint32_t off, Value **ms, Value **ms_x, Value **ms_y); 680bf215546Sopenharmony_ci void loadMsInfo(Value *ms, Value *s, Value **dx, Value **dy); 681bf215546Sopenharmony_ci Value *loadSuInfo(int slot, uint32_t off); 682bf215546Sopenharmony_ci Value *loadSuInfo16(int slot, uint32_t off); 683bf215546Sopenharmony_ci 684bf215546Sopenharmony_ciprivate: 685bf215546Sopenharmony_ci const Target *const targ; 686bf215546Sopenharmony_ci 687bf215546Sopenharmony_ci BuildUtil bld; 688bf215546Sopenharmony_ci 689bf215546Sopenharmony_ci Value *tid; 690bf215546Sopenharmony_ci}; 691bf215546Sopenharmony_ci 692bf215546Sopenharmony_ciNV50LoweringPreSSA::NV50LoweringPreSSA(Program *prog) : 693bf215546Sopenharmony_ci targ(prog->getTarget()), tid(NULL) 694bf215546Sopenharmony_ci{ 695bf215546Sopenharmony_ci bld.setProgram(prog); 696bf215546Sopenharmony_ci} 697bf215546Sopenharmony_ci 698bf215546Sopenharmony_cibool 699bf215546Sopenharmony_ciNV50LoweringPreSSA::visit(Function *f) 700bf215546Sopenharmony_ci{ 701bf215546Sopenharmony_ci BasicBlock *root = BasicBlock::get(func->cfg.getRoot()); 702bf215546Sopenharmony_ci 703bf215546Sopenharmony_ci if (prog->getType() == Program::TYPE_COMPUTE) { 704bf215546Sopenharmony_ci // Add implicit "thread id" argument in $r0 to the function 705bf215546Sopenharmony_ci Value *arg = new_LValue(func, FILE_GPR); 706bf215546Sopenharmony_ci arg->reg.data.id = 0; 707bf215546Sopenharmony_ci f->ins.push_back(arg); 708bf215546Sopenharmony_ci 709bf215546Sopenharmony_ci bld.setPosition(root, false); 710bf215546Sopenharmony_ci tid = bld.mkMov(bld.getScratch(), arg, TYPE_U32)->getDef(0); 711bf215546Sopenharmony_ci } 712bf215546Sopenharmony_ci 713bf215546Sopenharmony_ci return true; 714bf215546Sopenharmony_ci} 715bf215546Sopenharmony_ci 716bf215546Sopenharmony_civoid NV50LoweringPreSSA::loadTexMsInfo(uint32_t off, Value **ms, 717bf215546Sopenharmony_ci Value **ms_x, Value **ms_y) { 718bf215546Sopenharmony_ci // This loads the texture-indexed ms setting from the constant buffer 719bf215546Sopenharmony_ci Value *tmp = new_LValue(func, FILE_GPR); 720bf215546Sopenharmony_ci uint8_t b = prog->driver->io.auxCBSlot; 721bf215546Sopenharmony_ci off += prog->driver->io.suInfoBase; 722bf215546Sopenharmony_ci if (prog->getType() > Program::TYPE_VERTEX) 723bf215546Sopenharmony_ci off += 16 * 2 * 4; 724bf215546Sopenharmony_ci if (prog->getType() > Program::TYPE_GEOMETRY) 725bf215546Sopenharmony_ci off += 16 * 2 * 4; 726bf215546Sopenharmony_ci if (prog->getType() > Program::TYPE_FRAGMENT) 727bf215546Sopenharmony_ci off += 16 * 2 * 4; 728bf215546Sopenharmony_ci *ms_x = bld.mkLoadv(TYPE_U32, bld.mkSymbol( 729bf215546Sopenharmony_ci FILE_MEMORY_CONST, b, TYPE_U32, off + 0), NULL); 730bf215546Sopenharmony_ci *ms_y = bld.mkLoadv(TYPE_U32, bld.mkSymbol( 731bf215546Sopenharmony_ci FILE_MEMORY_CONST, b, TYPE_U32, off + 4), NULL); 732bf215546Sopenharmony_ci *ms = bld.mkOp2v(OP_ADD, TYPE_U32, tmp, *ms_x, *ms_y); 733bf215546Sopenharmony_ci} 734bf215546Sopenharmony_ci 735bf215546Sopenharmony_civoid NV50LoweringPreSSA::loadMsInfo(Value *ms, Value *s, Value **dx, Value **dy) { 736bf215546Sopenharmony_ci // Given a MS level, and a sample id, compute the delta x/y 737bf215546Sopenharmony_ci uint8_t b = prog->driver->io.msInfoCBSlot; 738bf215546Sopenharmony_ci Value *off = new_LValue(func, FILE_ADDRESS), *t = new_LValue(func, FILE_GPR); 739bf215546Sopenharmony_ci 740bf215546Sopenharmony_ci // The required information is at mslevel * 16 * 4 + sample * 8 741bf215546Sopenharmony_ci // = (mslevel * 8 + sample) * 8 742bf215546Sopenharmony_ci bld.mkOp2(OP_SHL, 743bf215546Sopenharmony_ci TYPE_U32, 744bf215546Sopenharmony_ci off, 745bf215546Sopenharmony_ci bld.mkOp2v(OP_ADD, TYPE_U32, t, 746bf215546Sopenharmony_ci bld.mkOp2v(OP_SHL, TYPE_U32, t, ms, bld.mkImm(3)), 747bf215546Sopenharmony_ci s), 748bf215546Sopenharmony_ci bld.mkImm(3)); 749bf215546Sopenharmony_ci *dx = bld.mkLoadv(TYPE_U32, bld.mkSymbol( 750bf215546Sopenharmony_ci FILE_MEMORY_CONST, b, TYPE_U32, 751bf215546Sopenharmony_ci prog->driver->io.msInfoBase), off); 752bf215546Sopenharmony_ci *dy = bld.mkLoadv(TYPE_U32, bld.mkSymbol( 753bf215546Sopenharmony_ci FILE_MEMORY_CONST, b, TYPE_U32, 754bf215546Sopenharmony_ci prog->driver->io.msInfoBase + 4), off); 755bf215546Sopenharmony_ci} 756bf215546Sopenharmony_ci 757bf215546Sopenharmony_ciValue * 758bf215546Sopenharmony_ciNV50LoweringPreSSA::loadSuInfo(int slot, uint32_t off) 759bf215546Sopenharmony_ci{ 760bf215546Sopenharmony_ci uint8_t b = prog->driver->io.auxCBSlot; 761bf215546Sopenharmony_ci off += prog->driver->io.bufInfoBase + slot * NV50_SU_INFO__STRIDE; 762bf215546Sopenharmony_ci return bld.mkLoadv(TYPE_U32, bld.mkSymbol( 763bf215546Sopenharmony_ci FILE_MEMORY_CONST, b, TYPE_U32, off), NULL); 764bf215546Sopenharmony_ci} 765bf215546Sopenharmony_ci 766bf215546Sopenharmony_ciValue * 767bf215546Sopenharmony_ciNV50LoweringPreSSA::loadSuInfo16(int slot, uint32_t off) 768bf215546Sopenharmony_ci{ 769bf215546Sopenharmony_ci uint8_t b = prog->driver->io.auxCBSlot; 770bf215546Sopenharmony_ci off += prog->driver->io.bufInfoBase + slot * NV50_SU_INFO__STRIDE; 771bf215546Sopenharmony_ci return bld.mkLoadv(TYPE_U16, bld.mkSymbol( 772bf215546Sopenharmony_ci FILE_MEMORY_CONST, b, TYPE_U16, off), NULL); 773bf215546Sopenharmony_ci} 774bf215546Sopenharmony_ci 775bf215546Sopenharmony_cibool 776bf215546Sopenharmony_ciNV50LoweringPreSSA::handleTEX(TexInstruction *i) 777bf215546Sopenharmony_ci{ 778bf215546Sopenharmony_ci const int arg = i->tex.target.getArgCount(); 779bf215546Sopenharmony_ci const int dref = arg; 780bf215546Sopenharmony_ci const int lod = i->tex.target.isShadow() ? (arg + 1) : arg; 781bf215546Sopenharmony_ci 782bf215546Sopenharmony_ci /* Only normalize in the non-explicit derivatives case. 783bf215546Sopenharmony_ci */ 784bf215546Sopenharmony_ci if (i->tex.target.isCube() && i->op != OP_TXD) { 785bf215546Sopenharmony_ci Value *src[3], *val; 786bf215546Sopenharmony_ci int c; 787bf215546Sopenharmony_ci for (c = 0; c < 3; ++c) 788bf215546Sopenharmony_ci src[c] = bld.mkOp1v(OP_ABS, TYPE_F32, bld.getSSA(), i->getSrc(c)); 789bf215546Sopenharmony_ci val = bld.getScratch(); 790bf215546Sopenharmony_ci bld.mkOp2(OP_MAX, TYPE_F32, val, src[0], src[1]); 791bf215546Sopenharmony_ci bld.mkOp2(OP_MAX, TYPE_F32, val, src[2], val); 792bf215546Sopenharmony_ci bld.mkOp1(OP_RCP, TYPE_F32, val, val); 793bf215546Sopenharmony_ci for (c = 0; c < 3; ++c) { 794bf215546Sopenharmony_ci i->setSrc(c, bld.mkOp2v(OP_MUL, TYPE_F32, bld.getSSA(), 795bf215546Sopenharmony_ci i->getSrc(c), val)); 796bf215546Sopenharmony_ci } 797bf215546Sopenharmony_ci } 798bf215546Sopenharmony_ci 799bf215546Sopenharmony_ci // handle MS, which means looking up the MS params for this texture, and 800bf215546Sopenharmony_ci // adjusting the input coordinates to point at the right sample. 801bf215546Sopenharmony_ci if (i->tex.target.isMS()) { 802bf215546Sopenharmony_ci Value *x = i->getSrc(0); 803bf215546Sopenharmony_ci Value *y = i->getSrc(1); 804bf215546Sopenharmony_ci Value *s = i->getSrc(arg - 1); 805bf215546Sopenharmony_ci Value *tx = new_LValue(func, FILE_GPR), *ty = new_LValue(func, FILE_GPR), 806bf215546Sopenharmony_ci *ms, *ms_x, *ms_y, *dx, *dy; 807bf215546Sopenharmony_ci 808bf215546Sopenharmony_ci i->tex.target.clearMS(); 809bf215546Sopenharmony_ci 810bf215546Sopenharmony_ci loadTexMsInfo(i->tex.r * 4 * 2, &ms, &ms_x, &ms_y); 811bf215546Sopenharmony_ci loadMsInfo(ms, s, &dx, &dy); 812bf215546Sopenharmony_ci 813bf215546Sopenharmony_ci bld.mkOp2(OP_SHL, TYPE_U32, tx, x, ms_x); 814bf215546Sopenharmony_ci bld.mkOp2(OP_SHL, TYPE_U32, ty, y, ms_y); 815bf215546Sopenharmony_ci bld.mkOp2(OP_ADD, TYPE_U32, tx, tx, dx); 816bf215546Sopenharmony_ci bld.mkOp2(OP_ADD, TYPE_U32, ty, ty, dy); 817bf215546Sopenharmony_ci i->setSrc(0, tx); 818bf215546Sopenharmony_ci i->setSrc(1, ty); 819bf215546Sopenharmony_ci i->setSrc(arg - 1, bld.loadImm(NULL, 0)); 820bf215546Sopenharmony_ci } 821bf215546Sopenharmony_ci 822bf215546Sopenharmony_ci // dref comes before bias/lod 823bf215546Sopenharmony_ci if (i->tex.target.isShadow()) 824bf215546Sopenharmony_ci if (i->op == OP_TXB || i->op == OP_TXL) 825bf215546Sopenharmony_ci i->swapSources(dref, lod); 826bf215546Sopenharmony_ci 827bf215546Sopenharmony_ci if (i->tex.target.isArray()) { 828bf215546Sopenharmony_ci if (i->op != OP_TXF) { 829bf215546Sopenharmony_ci // array index must be converted to u32, but it's already an integer 830bf215546Sopenharmony_ci // for TXF 831bf215546Sopenharmony_ci Value *layer = i->getSrc(arg - 1); 832bf215546Sopenharmony_ci LValue *src = new_LValue(func, FILE_GPR); 833bf215546Sopenharmony_ci bld.mkCvt(OP_CVT, TYPE_U32, src, TYPE_F32, layer); 834bf215546Sopenharmony_ci bld.mkOp2(OP_MIN, TYPE_U32, src, src, bld.loadImm(NULL, 511)); 835bf215546Sopenharmony_ci i->setSrc(arg - 1, src); 836bf215546Sopenharmony_ci } 837bf215546Sopenharmony_ci if (i->tex.target.isCube() && i->srcCount() > 4) { 838bf215546Sopenharmony_ci std::vector<Value *> acube, a2d; 839bf215546Sopenharmony_ci int c; 840bf215546Sopenharmony_ci 841bf215546Sopenharmony_ci acube.resize(4); 842bf215546Sopenharmony_ci for (c = 0; c < 4; ++c) 843bf215546Sopenharmony_ci acube[c] = i->getSrc(c); 844bf215546Sopenharmony_ci a2d.resize(4); 845bf215546Sopenharmony_ci for (c = 0; c < 3; ++c) 846bf215546Sopenharmony_ci a2d[c] = new_LValue(func, FILE_GPR); 847bf215546Sopenharmony_ci a2d[3] = NULL; 848bf215546Sopenharmony_ci 849bf215546Sopenharmony_ci bld.mkTex(OP_TEXPREP, TEX_TARGET_CUBE_ARRAY, i->tex.r, i->tex.s, 850bf215546Sopenharmony_ci a2d, acube)->asTex()->tex.mask = 0x7; 851bf215546Sopenharmony_ci 852bf215546Sopenharmony_ci for (c = 0; c < 3; ++c) 853bf215546Sopenharmony_ci i->setSrc(c, a2d[c]); 854bf215546Sopenharmony_ci for (; i->srcExists(c + 1); ++c) 855bf215546Sopenharmony_ci i->setSrc(c, i->getSrc(c + 1)); 856bf215546Sopenharmony_ci i->setSrc(c, NULL); 857bf215546Sopenharmony_ci assert(c <= 4); 858bf215546Sopenharmony_ci 859bf215546Sopenharmony_ci i->tex.target = i->tex.target.isShadow() ? 860bf215546Sopenharmony_ci TEX_TARGET_2D_ARRAY_SHADOW : TEX_TARGET_2D_ARRAY; 861bf215546Sopenharmony_ci } 862bf215546Sopenharmony_ci } 863bf215546Sopenharmony_ci 864bf215546Sopenharmony_ci // texel offsets are 3 immediate fields in the instruction, 865bf215546Sopenharmony_ci // nv50 cannot do textureGatherOffsets 866bf215546Sopenharmony_ci assert(i->tex.useOffsets <= 1); 867bf215546Sopenharmony_ci if (i->tex.useOffsets) { 868bf215546Sopenharmony_ci for (int c = 0; c < 3; ++c) { 869bf215546Sopenharmony_ci ImmediateValue val; 870bf215546Sopenharmony_ci if (!i->offset[0][c].getImmediate(val)) 871bf215546Sopenharmony_ci assert(!"non-immediate offset"); 872bf215546Sopenharmony_ci i->tex.offset[c] = val.reg.data.u32; 873bf215546Sopenharmony_ci i->offset[0][c].set(NULL); 874bf215546Sopenharmony_ci } 875bf215546Sopenharmony_ci } 876bf215546Sopenharmony_ci 877bf215546Sopenharmony_ci return true; 878bf215546Sopenharmony_ci} 879bf215546Sopenharmony_ci 880bf215546Sopenharmony_ci// Bias must be equal for all threads of a quad or lod calculation will fail. 881bf215546Sopenharmony_ci// 882bf215546Sopenharmony_ci// The lanes of a quad are grouped by the bit in the condition register they 883bf215546Sopenharmony_ci// have set, which is selected by differing bias values. 884bf215546Sopenharmony_ci// Move the input values for TEX into a new register set for each group and 885bf215546Sopenharmony_ci// execute TEX only for a specific group. 886bf215546Sopenharmony_ci// We always need to use 4 new registers for the inputs/outputs because the 887bf215546Sopenharmony_ci// implicitly calculated derivatives must be correct. 888bf215546Sopenharmony_ci// 889bf215546Sopenharmony_ci// TODO: move to SSA phase so we can easily determine whether bias is constant 890bf215546Sopenharmony_cibool 891bf215546Sopenharmony_ciNV50LoweringPreSSA::handleTXB(TexInstruction *i) 892bf215546Sopenharmony_ci{ 893bf215546Sopenharmony_ci const CondCode cc[4] = { CC_EQU, CC_S, CC_C, CC_O }; 894bf215546Sopenharmony_ci int l, d; 895bf215546Sopenharmony_ci 896bf215546Sopenharmony_ci // We can't actually apply bias *and* do a compare for a cube 897bf215546Sopenharmony_ci // texture. Since the compare has to be done before the filtering, just 898bf215546Sopenharmony_ci // drop the bias on the floor. 899bf215546Sopenharmony_ci if (i->tex.target == TEX_TARGET_CUBE_SHADOW) { 900bf215546Sopenharmony_ci i->op = OP_TEX; 901bf215546Sopenharmony_ci i->setSrc(3, i->getSrc(4)); 902bf215546Sopenharmony_ci i->setSrc(4, NULL); 903bf215546Sopenharmony_ci return handleTEX(i); 904bf215546Sopenharmony_ci } 905bf215546Sopenharmony_ci 906bf215546Sopenharmony_ci handleTEX(i); 907bf215546Sopenharmony_ci Value *bias = i->getSrc(i->tex.target.getArgCount()); 908bf215546Sopenharmony_ci if (bias->isUniform()) 909bf215546Sopenharmony_ci return true; 910bf215546Sopenharmony_ci 911bf215546Sopenharmony_ci Instruction *cond = bld.mkOp1(OP_UNION, TYPE_U32, bld.getScratch(), 912bf215546Sopenharmony_ci bld.loadImm(NULL, 1)); 913bf215546Sopenharmony_ci bld.setPosition(cond, false); 914bf215546Sopenharmony_ci 915bf215546Sopenharmony_ci for (l = 1; l < 4; ++l) { 916bf215546Sopenharmony_ci const uint8_t qop = QUADOP(SUBR, SUBR, SUBR, SUBR); 917bf215546Sopenharmony_ci Value *bit = bld.getSSA(); 918bf215546Sopenharmony_ci Value *pred = bld.getScratch(1, FILE_FLAGS); 919bf215546Sopenharmony_ci Value *imm = bld.loadImm(NULL, (1 << l)); 920bf215546Sopenharmony_ci bld.mkQuadop(qop, pred, l, bias, bias)->flagsDef = 0; 921bf215546Sopenharmony_ci bld.mkMov(bit, imm)->setPredicate(CC_EQ, pred); 922bf215546Sopenharmony_ci cond->setSrc(l, bit); 923bf215546Sopenharmony_ci } 924bf215546Sopenharmony_ci Value *flags = bld.getScratch(1, FILE_FLAGS); 925bf215546Sopenharmony_ci bld.setPosition(cond, true); 926bf215546Sopenharmony_ci bld.mkCvt(OP_CVT, TYPE_U8, flags, TYPE_U32, cond->getDef(0))->flagsDef = 0; 927bf215546Sopenharmony_ci 928bf215546Sopenharmony_ci Instruction *tex[4]; 929bf215546Sopenharmony_ci for (l = 0; l < 4; ++l) { 930bf215546Sopenharmony_ci (tex[l] = cloneForward(func, i))->setPredicate(cc[l], flags); 931bf215546Sopenharmony_ci bld.insert(tex[l]); 932bf215546Sopenharmony_ci } 933bf215546Sopenharmony_ci 934bf215546Sopenharmony_ci Value *res[4][4]; 935bf215546Sopenharmony_ci for (d = 0; i->defExists(d); ++d) 936bf215546Sopenharmony_ci res[0][d] = tex[0]->getDef(d); 937bf215546Sopenharmony_ci for (l = 1; l < 4; ++l) { 938bf215546Sopenharmony_ci for (d = 0; tex[l]->defExists(d); ++d) { 939bf215546Sopenharmony_ci res[l][d] = cloneShallow(func, res[0][d]); 940bf215546Sopenharmony_ci bld.mkMov(res[l][d], tex[l]->getDef(d))->setPredicate(cc[l], flags); 941bf215546Sopenharmony_ci } 942bf215546Sopenharmony_ci } 943bf215546Sopenharmony_ci 944bf215546Sopenharmony_ci for (d = 0; i->defExists(d); ++d) { 945bf215546Sopenharmony_ci Instruction *dst = bld.mkOp(OP_UNION, TYPE_U32, i->getDef(d)); 946bf215546Sopenharmony_ci for (l = 0; l < 4; ++l) 947bf215546Sopenharmony_ci dst->setSrc(l, res[l][d]); 948bf215546Sopenharmony_ci } 949bf215546Sopenharmony_ci delete_Instruction(prog, i); 950bf215546Sopenharmony_ci return true; 951bf215546Sopenharmony_ci} 952bf215546Sopenharmony_ci 953bf215546Sopenharmony_ci// LOD must be equal for all threads of a quad. 954bf215546Sopenharmony_ci// Unlike with TXB, here we can just diverge since there's no LOD calculation 955bf215546Sopenharmony_ci// that would require all 4 threads' sources to be set up properly. 956bf215546Sopenharmony_cibool 957bf215546Sopenharmony_ciNV50LoweringPreSSA::handleTXL(TexInstruction *i) 958bf215546Sopenharmony_ci{ 959bf215546Sopenharmony_ci handleTEX(i); 960bf215546Sopenharmony_ci Value *lod = i->getSrc(i->tex.target.getArgCount()); 961bf215546Sopenharmony_ci if (lod->isUniform()) 962bf215546Sopenharmony_ci return true; 963bf215546Sopenharmony_ci 964bf215546Sopenharmony_ci BasicBlock *currBB = i->bb; 965bf215546Sopenharmony_ci BasicBlock *texiBB = i->bb->splitBefore(i, false); 966bf215546Sopenharmony_ci BasicBlock *joinBB = i->bb->splitAfter(i); 967bf215546Sopenharmony_ci 968bf215546Sopenharmony_ci bld.setPosition(currBB, true); 969bf215546Sopenharmony_ci assert(!currBB->joinAt); 970bf215546Sopenharmony_ci currBB->joinAt = bld.mkFlow(OP_JOINAT, joinBB, CC_ALWAYS, NULL); 971bf215546Sopenharmony_ci 972bf215546Sopenharmony_ci for (int l = 0; l <= 3; ++l) { 973bf215546Sopenharmony_ci const uint8_t qop = QUADOP(SUBR, SUBR, SUBR, SUBR); 974bf215546Sopenharmony_ci Value *pred = bld.getScratch(1, FILE_FLAGS); 975bf215546Sopenharmony_ci bld.setPosition(currBB, true); 976bf215546Sopenharmony_ci bld.mkQuadop(qop, pred, l, lod, lod)->flagsDef = 0; 977bf215546Sopenharmony_ci bld.mkFlow(OP_BRA, texiBB, CC_EQ, pred)->fixed = 1; 978bf215546Sopenharmony_ci currBB->cfg.attach(&texiBB->cfg, Graph::Edge::FORWARD); 979bf215546Sopenharmony_ci if (l <= 2) { 980bf215546Sopenharmony_ci BasicBlock *laneBB = new BasicBlock(func); 981bf215546Sopenharmony_ci currBB->cfg.attach(&laneBB->cfg, Graph::Edge::TREE); 982bf215546Sopenharmony_ci currBB = laneBB; 983bf215546Sopenharmony_ci } 984bf215546Sopenharmony_ci } 985bf215546Sopenharmony_ci bld.setPosition(joinBB, false); 986bf215546Sopenharmony_ci bld.mkFlow(OP_JOIN, NULL, CC_ALWAYS, NULL)->fixed = 1; 987bf215546Sopenharmony_ci return true; 988bf215546Sopenharmony_ci} 989bf215546Sopenharmony_ci 990bf215546Sopenharmony_cibool 991bf215546Sopenharmony_ciNV50LoweringPreSSA::handleTXD(TexInstruction *i) 992bf215546Sopenharmony_ci{ 993bf215546Sopenharmony_ci static const uint8_t qOps[4][2] = 994bf215546Sopenharmony_ci { 995bf215546Sopenharmony_ci { QUADOP(MOV2, ADD, MOV2, ADD), QUADOP(MOV2, MOV2, ADD, ADD) }, // l0 996bf215546Sopenharmony_ci { QUADOP(SUBR, MOV2, SUBR, MOV2), QUADOP(MOV2, MOV2, ADD, ADD) }, // l1 997bf215546Sopenharmony_ci { QUADOP(MOV2, ADD, MOV2, ADD), QUADOP(SUBR, SUBR, MOV2, MOV2) }, // l2 998bf215546Sopenharmony_ci { QUADOP(SUBR, MOV2, SUBR, MOV2), QUADOP(SUBR, SUBR, MOV2, MOV2) }, // l3 999bf215546Sopenharmony_ci }; 1000bf215546Sopenharmony_ci Value *def[4][4]; 1001bf215546Sopenharmony_ci Value *crd[3]; 1002bf215546Sopenharmony_ci Instruction *tex; 1003bf215546Sopenharmony_ci Value *zero = bld.loadImm(bld.getSSA(), 0); 1004bf215546Sopenharmony_ci int l, c; 1005bf215546Sopenharmony_ci const int dim = i->tex.target.getDim() + i->tex.target.isCube(); 1006bf215546Sopenharmony_ci 1007bf215546Sopenharmony_ci handleTEX(i); 1008bf215546Sopenharmony_ci i->op = OP_TEX; // no need to clone dPdx/dPdy later 1009bf215546Sopenharmony_ci i->tex.derivAll = true; 1010bf215546Sopenharmony_ci 1011bf215546Sopenharmony_ci for (c = 0; c < dim; ++c) 1012bf215546Sopenharmony_ci crd[c] = bld.getScratch(); 1013bf215546Sopenharmony_ci 1014bf215546Sopenharmony_ci bld.mkOp(OP_QUADON, TYPE_NONE, NULL); 1015bf215546Sopenharmony_ci for (l = 0; l < 4; ++l) { 1016bf215546Sopenharmony_ci Value *src[3], *val; 1017bf215546Sopenharmony_ci // mov coordinates from lane l to all lanes 1018bf215546Sopenharmony_ci for (c = 0; c < dim; ++c) 1019bf215546Sopenharmony_ci bld.mkQuadop(0x00, crd[c], l, i->getSrc(c), zero); 1020bf215546Sopenharmony_ci // add dPdx from lane l to lanes dx 1021bf215546Sopenharmony_ci for (c = 0; c < dim; ++c) 1022bf215546Sopenharmony_ci bld.mkQuadop(qOps[l][0], crd[c], l, i->dPdx[c].get(), crd[c]); 1023bf215546Sopenharmony_ci // add dPdy from lane l to lanes dy 1024bf215546Sopenharmony_ci for (c = 0; c < dim; ++c) 1025bf215546Sopenharmony_ci bld.mkQuadop(qOps[l][1], crd[c], l, i->dPdy[c].get(), crd[c]); 1026bf215546Sopenharmony_ci // normalize cube coordinates if necessary 1027bf215546Sopenharmony_ci if (i->tex.target.isCube()) { 1028bf215546Sopenharmony_ci for (c = 0; c < 3; ++c) 1029bf215546Sopenharmony_ci src[c] = bld.mkOp1v(OP_ABS, TYPE_F32, bld.getSSA(), crd[c]); 1030bf215546Sopenharmony_ci val = bld.getScratch(); 1031bf215546Sopenharmony_ci bld.mkOp2(OP_MAX, TYPE_F32, val, src[0], src[1]); 1032bf215546Sopenharmony_ci bld.mkOp2(OP_MAX, TYPE_F32, val, src[2], val); 1033bf215546Sopenharmony_ci bld.mkOp1(OP_RCP, TYPE_F32, val, val); 1034bf215546Sopenharmony_ci for (c = 0; c < 3; ++c) 1035bf215546Sopenharmony_ci src[c] = bld.mkOp2v(OP_MUL, TYPE_F32, bld.getSSA(), crd[c], val); 1036bf215546Sopenharmony_ci } else { 1037bf215546Sopenharmony_ci for (c = 0; c < dim; ++c) 1038bf215546Sopenharmony_ci src[c] = crd[c]; 1039bf215546Sopenharmony_ci } 1040bf215546Sopenharmony_ci // texture 1041bf215546Sopenharmony_ci bld.insert(tex = cloneForward(func, i)); 1042bf215546Sopenharmony_ci for (c = 0; c < dim; ++c) 1043bf215546Sopenharmony_ci tex->setSrc(c, src[c]); 1044bf215546Sopenharmony_ci // save results 1045bf215546Sopenharmony_ci for (c = 0; i->defExists(c); ++c) { 1046bf215546Sopenharmony_ci Instruction *mov; 1047bf215546Sopenharmony_ci def[c][l] = bld.getSSA(); 1048bf215546Sopenharmony_ci mov = bld.mkMov(def[c][l], tex->getDef(c)); 1049bf215546Sopenharmony_ci mov->fixed = 1; 1050bf215546Sopenharmony_ci mov->lanes = 1 << l; 1051bf215546Sopenharmony_ci } 1052bf215546Sopenharmony_ci } 1053bf215546Sopenharmony_ci bld.mkOp(OP_QUADPOP, TYPE_NONE, NULL); 1054bf215546Sopenharmony_ci 1055bf215546Sopenharmony_ci for (c = 0; i->defExists(c); ++c) { 1056bf215546Sopenharmony_ci Instruction *u = bld.mkOp(OP_UNION, TYPE_U32, i->getDef(c)); 1057bf215546Sopenharmony_ci for (l = 0; l < 4; ++l) 1058bf215546Sopenharmony_ci u->setSrc(l, def[c][l]); 1059bf215546Sopenharmony_ci } 1060bf215546Sopenharmony_ci 1061bf215546Sopenharmony_ci i->bb->remove(i); 1062bf215546Sopenharmony_ci return true; 1063bf215546Sopenharmony_ci} 1064bf215546Sopenharmony_ci 1065bf215546Sopenharmony_cibool 1066bf215546Sopenharmony_ciNV50LoweringPreSSA::handleTXLQ(TexInstruction *i) 1067bf215546Sopenharmony_ci{ 1068bf215546Sopenharmony_ci handleTEX(i); 1069bf215546Sopenharmony_ci bld.setPosition(i, true); 1070bf215546Sopenharmony_ci 1071bf215546Sopenharmony_ci /* The returned values are not quite what we want: 1072bf215546Sopenharmony_ci * (a) convert from s32 to f32 1073bf215546Sopenharmony_ci * (b) multiply by 1/256 1074bf215546Sopenharmony_ci */ 1075bf215546Sopenharmony_ci for (int def = 0; def < 2; ++def) { 1076bf215546Sopenharmony_ci if (!i->defExists(def)) 1077bf215546Sopenharmony_ci continue; 1078bf215546Sopenharmony_ci bld.mkCvt(OP_CVT, TYPE_F32, i->getDef(def), TYPE_S32, i->getDef(def)); 1079bf215546Sopenharmony_ci bld.mkOp2(OP_MUL, TYPE_F32, i->getDef(def), 1080bf215546Sopenharmony_ci i->getDef(def), bld.loadImm(NULL, 1.0f / 256)); 1081bf215546Sopenharmony_ci } 1082bf215546Sopenharmony_ci return true; 1083bf215546Sopenharmony_ci} 1084bf215546Sopenharmony_ci 1085bf215546Sopenharmony_cibool 1086bf215546Sopenharmony_ciNV50LoweringPreSSA::handleTXQ(TexInstruction *i) 1087bf215546Sopenharmony_ci{ 1088bf215546Sopenharmony_ci Value *ms, *ms_x, *ms_y; 1089bf215546Sopenharmony_ci if (i->tex.query == TXQ_DIMS) { 1090bf215546Sopenharmony_ci if (i->tex.target.isMS()) { 1091bf215546Sopenharmony_ci bld.setPosition(i, true); 1092bf215546Sopenharmony_ci loadTexMsInfo(i->tex.r * 4 * 2, &ms, &ms_x, &ms_y); 1093bf215546Sopenharmony_ci int d = 0; 1094bf215546Sopenharmony_ci if (i->tex.mask & 1) { 1095bf215546Sopenharmony_ci bld.mkOp2(OP_SHR, TYPE_U32, i->getDef(d), i->getDef(d), ms_x); 1096bf215546Sopenharmony_ci d++; 1097bf215546Sopenharmony_ci } 1098bf215546Sopenharmony_ci if (i->tex.mask & 2) { 1099bf215546Sopenharmony_ci bld.mkOp2(OP_SHR, TYPE_U32, i->getDef(d), i->getDef(d), ms_y); 1100bf215546Sopenharmony_ci d++; 1101bf215546Sopenharmony_ci } 1102bf215546Sopenharmony_ci } 1103bf215546Sopenharmony_ci return true; 1104bf215546Sopenharmony_ci } 1105bf215546Sopenharmony_ci assert(i->tex.query == TXQ_TYPE); 1106bf215546Sopenharmony_ci assert(i->tex.mask == 4); 1107bf215546Sopenharmony_ci 1108bf215546Sopenharmony_ci loadTexMsInfo(i->tex.r * 4 * 2, &ms, &ms_x, &ms_y); 1109bf215546Sopenharmony_ci bld.mkOp2(OP_SHL, TYPE_U32, i->getDef(0), bld.loadImm(NULL, 1), ms); 1110bf215546Sopenharmony_ci i->bb->remove(i); 1111bf215546Sopenharmony_ci 1112bf215546Sopenharmony_ci return true; 1113bf215546Sopenharmony_ci} 1114bf215546Sopenharmony_ci 1115bf215546Sopenharmony_cibool 1116bf215546Sopenharmony_ciNV50LoweringPreSSA::handleSUQ(TexInstruction *suq) 1117bf215546Sopenharmony_ci{ 1118bf215546Sopenharmony_ci const int dim = suq->tex.target.getDim(); 1119bf215546Sopenharmony_ci const int arg = dim + (suq->tex.target.isArray() || suq->tex.target.isCube()); 1120bf215546Sopenharmony_ci int mask = suq->tex.mask; 1121bf215546Sopenharmony_ci int slot = suq->tex.r; 1122bf215546Sopenharmony_ci int c, d; 1123bf215546Sopenharmony_ci 1124bf215546Sopenharmony_ci for (c = 0, d = 0; c < 3; ++c, mask >>= 1) { 1125bf215546Sopenharmony_ci if (c >= arg || !(mask & 1)) 1126bf215546Sopenharmony_ci continue; 1127bf215546Sopenharmony_ci 1128bf215546Sopenharmony_ci int offset; 1129bf215546Sopenharmony_ci 1130bf215546Sopenharmony_ci if (c == 1 && suq->tex.target == TEX_TARGET_1D_ARRAY) { 1131bf215546Sopenharmony_ci offset = NV50_SU_INFO_SIZE(2); 1132bf215546Sopenharmony_ci } else { 1133bf215546Sopenharmony_ci offset = NV50_SU_INFO_SIZE(c); 1134bf215546Sopenharmony_ci } 1135bf215546Sopenharmony_ci bld.mkMov(suq->getDef(d++), loadSuInfo(slot, offset)); 1136bf215546Sopenharmony_ci if (c == 2 && suq->tex.target.isCube()) 1137bf215546Sopenharmony_ci bld.mkOp2(OP_DIV, TYPE_U32, suq->getDef(d - 1), suq->getDef(d - 1), 1138bf215546Sopenharmony_ci bld.loadImm(NULL, 6)); 1139bf215546Sopenharmony_ci } 1140bf215546Sopenharmony_ci 1141bf215546Sopenharmony_ci if (mask & 1) { 1142bf215546Sopenharmony_ci if (suq->tex.target.isMS()) { 1143bf215546Sopenharmony_ci Value *ms_x = loadSuInfo(slot, NV50_SU_INFO_MS(0)); 1144bf215546Sopenharmony_ci Value *ms_y = loadSuInfo(slot, NV50_SU_INFO_MS(1)); 1145bf215546Sopenharmony_ci Value *ms = bld.mkOp2v(OP_ADD, TYPE_U32, bld.getScratch(), ms_x, ms_y); 1146bf215546Sopenharmony_ci bld.mkOp2(OP_SHL, TYPE_U32, suq->getDef(d++), bld.loadImm(NULL, 1), ms); 1147bf215546Sopenharmony_ci } else { 1148bf215546Sopenharmony_ci bld.mkMov(suq->getDef(d++), bld.loadImm(NULL, 1)); 1149bf215546Sopenharmony_ci } 1150bf215546Sopenharmony_ci } 1151bf215546Sopenharmony_ci 1152bf215546Sopenharmony_ci bld.remove(suq); 1153bf215546Sopenharmony_ci return true; 1154bf215546Sopenharmony_ci} 1155bf215546Sopenharmony_ci 1156bf215546Sopenharmony_cibool 1157bf215546Sopenharmony_ciNV50LoweringPreSSA::handleBUFQ(Instruction *bufq) 1158bf215546Sopenharmony_ci{ 1159bf215546Sopenharmony_ci bufq->op = OP_MOV; 1160bf215546Sopenharmony_ci bufq->setSrc(0, loadSuInfo(bufq->getSrc(0)->reg.fileIndex, NV50_SU_INFO_SIZE_X)); 1161bf215546Sopenharmony_ci bufq->setIndirect(0, 0, NULL); 1162bf215546Sopenharmony_ci bufq->setIndirect(0, 1, NULL); 1163bf215546Sopenharmony_ci return true; 1164bf215546Sopenharmony_ci} 1165bf215546Sopenharmony_ci 1166bf215546Sopenharmony_cibool 1167bf215546Sopenharmony_ciNV50LoweringPreSSA::handleSET(Instruction *i) 1168bf215546Sopenharmony_ci{ 1169bf215546Sopenharmony_ci if (i->dType == TYPE_F32) { 1170bf215546Sopenharmony_ci bld.setPosition(i, true); 1171bf215546Sopenharmony_ci i->dType = TYPE_U32; 1172bf215546Sopenharmony_ci bld.mkOp1(OP_ABS, TYPE_S32, i->getDef(0), i->getDef(0)); 1173bf215546Sopenharmony_ci bld.mkCvt(OP_CVT, TYPE_F32, i->getDef(0), TYPE_S32, i->getDef(0)); 1174bf215546Sopenharmony_ci } 1175bf215546Sopenharmony_ci return true; 1176bf215546Sopenharmony_ci} 1177bf215546Sopenharmony_ci 1178bf215546Sopenharmony_cibool 1179bf215546Sopenharmony_ciNV50LoweringPreSSA::handleSLCT(CmpInstruction *i) 1180bf215546Sopenharmony_ci{ 1181bf215546Sopenharmony_ci Value *src0 = bld.getSSA(); 1182bf215546Sopenharmony_ci Value *src1 = bld.getSSA(); 1183bf215546Sopenharmony_ci Value *pred = bld.getScratch(1, FILE_FLAGS); 1184bf215546Sopenharmony_ci 1185bf215546Sopenharmony_ci Value *v0 = i->getSrc(0); 1186bf215546Sopenharmony_ci Value *v1 = i->getSrc(1); 1187bf215546Sopenharmony_ci // XXX: these probably shouldn't be immediates in the first place ... 1188bf215546Sopenharmony_ci if (v0->asImm()) 1189bf215546Sopenharmony_ci v0 = bld.mkMov(bld.getSSA(), v0)->getDef(0); 1190bf215546Sopenharmony_ci if (v1->asImm()) 1191bf215546Sopenharmony_ci v1 = bld.mkMov(bld.getSSA(), v1)->getDef(0); 1192bf215546Sopenharmony_ci 1193bf215546Sopenharmony_ci bld.setPosition(i, true); 1194bf215546Sopenharmony_ci bld.mkMov(src0, v0)->setPredicate(CC_NE, pred); 1195bf215546Sopenharmony_ci bld.mkMov(src1, v1)->setPredicate(CC_EQ, pred); 1196bf215546Sopenharmony_ci bld.mkOp2(OP_UNION, i->dType, i->getDef(0), src0, src1); 1197bf215546Sopenharmony_ci 1198bf215546Sopenharmony_ci bld.setPosition(i, false); 1199bf215546Sopenharmony_ci i->op = OP_SET; 1200bf215546Sopenharmony_ci i->setFlagsDef(0, pred); 1201bf215546Sopenharmony_ci i->dType = TYPE_U8; 1202bf215546Sopenharmony_ci i->setSrc(0, i->getSrc(2)); 1203bf215546Sopenharmony_ci i->setSrc(2, NULL); 1204bf215546Sopenharmony_ci i->setSrc(1, bld.loadImm(NULL, 0)); 1205bf215546Sopenharmony_ci 1206bf215546Sopenharmony_ci return true; 1207bf215546Sopenharmony_ci} 1208bf215546Sopenharmony_ci 1209bf215546Sopenharmony_cibool 1210bf215546Sopenharmony_ciNV50LoweringPreSSA::handleSELP(Instruction *i) 1211bf215546Sopenharmony_ci{ 1212bf215546Sopenharmony_ci Value *src0 = bld.getSSA(); 1213bf215546Sopenharmony_ci Value *src1 = bld.getSSA(); 1214bf215546Sopenharmony_ci 1215bf215546Sopenharmony_ci Value *v0 = i->getSrc(0); 1216bf215546Sopenharmony_ci Value *v1 = i->getSrc(1); 1217bf215546Sopenharmony_ci if (v0->asImm()) 1218bf215546Sopenharmony_ci v0 = bld.mkMov(bld.getSSA(), v0)->getDef(0); 1219bf215546Sopenharmony_ci if (v1->asImm()) 1220bf215546Sopenharmony_ci v1 = bld.mkMov(bld.getSSA(), v1)->getDef(0); 1221bf215546Sopenharmony_ci 1222bf215546Sopenharmony_ci bld.mkMov(src0, v0)->setPredicate(CC_NE, i->getSrc(2)); 1223bf215546Sopenharmony_ci bld.mkMov(src1, v1)->setPredicate(CC_EQ, i->getSrc(2)); 1224bf215546Sopenharmony_ci bld.mkOp2(OP_UNION, i->dType, i->getDef(0), src0, src1); 1225bf215546Sopenharmony_ci delete_Instruction(prog, i); 1226bf215546Sopenharmony_ci return true; 1227bf215546Sopenharmony_ci} 1228bf215546Sopenharmony_ci 1229bf215546Sopenharmony_cibool 1230bf215546Sopenharmony_ciNV50LoweringPreSSA::handleWRSV(Instruction *i) 1231bf215546Sopenharmony_ci{ 1232bf215546Sopenharmony_ci Symbol *sym = i->getSrc(0)->asSym(); 1233bf215546Sopenharmony_ci 1234bf215546Sopenharmony_ci // these are all shader outputs, $sreg are not writeable 1235bf215546Sopenharmony_ci uint32_t addr = targ->getSVAddress(FILE_SHADER_OUTPUT, sym); 1236bf215546Sopenharmony_ci if (addr >= 0x400) 1237bf215546Sopenharmony_ci return false; 1238bf215546Sopenharmony_ci sym = bld.mkSymbol(FILE_SHADER_OUTPUT, 0, i->sType, addr); 1239bf215546Sopenharmony_ci 1240bf215546Sopenharmony_ci bld.mkStore(OP_EXPORT, i->dType, sym, i->getIndirect(0, 0), i->getSrc(1)); 1241bf215546Sopenharmony_ci 1242bf215546Sopenharmony_ci bld.getBB()->remove(i); 1243bf215546Sopenharmony_ci return true; 1244bf215546Sopenharmony_ci} 1245bf215546Sopenharmony_ci 1246bf215546Sopenharmony_cibool 1247bf215546Sopenharmony_ciNV50LoweringPreSSA::handleCALL(Instruction *i) 1248bf215546Sopenharmony_ci{ 1249bf215546Sopenharmony_ci if (prog->getType() == Program::TYPE_COMPUTE) { 1250bf215546Sopenharmony_ci // Add implicit "thread id" argument in $r0 to the function 1251bf215546Sopenharmony_ci i->setSrc(i->srcCount(), tid); 1252bf215546Sopenharmony_ci } 1253bf215546Sopenharmony_ci return true; 1254bf215546Sopenharmony_ci} 1255bf215546Sopenharmony_ci 1256bf215546Sopenharmony_cibool 1257bf215546Sopenharmony_ciNV50LoweringPreSSA::handlePRECONT(Instruction *i) 1258bf215546Sopenharmony_ci{ 1259bf215546Sopenharmony_ci delete_Instruction(prog, i); 1260bf215546Sopenharmony_ci return true; 1261bf215546Sopenharmony_ci} 1262bf215546Sopenharmony_ci 1263bf215546Sopenharmony_cibool 1264bf215546Sopenharmony_ciNV50LoweringPreSSA::handleCONT(Instruction *i) 1265bf215546Sopenharmony_ci{ 1266bf215546Sopenharmony_ci i->op = OP_BRA; 1267bf215546Sopenharmony_ci return true; 1268bf215546Sopenharmony_ci} 1269bf215546Sopenharmony_ci 1270bf215546Sopenharmony_cibool 1271bf215546Sopenharmony_ciNV50LoweringPreSSA::handleRDSV(Instruction *i) 1272bf215546Sopenharmony_ci{ 1273bf215546Sopenharmony_ci Symbol *sym = i->getSrc(0)->asSym(); 1274bf215546Sopenharmony_ci uint32_t addr = targ->getSVAddress(FILE_SHADER_INPUT, sym); 1275bf215546Sopenharmony_ci Value *def = i->getDef(0); 1276bf215546Sopenharmony_ci SVSemantic sv = sym->reg.data.sv.sv; 1277bf215546Sopenharmony_ci int idx = sym->reg.data.sv.index; 1278bf215546Sopenharmony_ci 1279bf215546Sopenharmony_ci if (addr >= 0x400) // mov $sreg 1280bf215546Sopenharmony_ci return true; 1281bf215546Sopenharmony_ci 1282bf215546Sopenharmony_ci switch (sv) { 1283bf215546Sopenharmony_ci case SV_POSITION: 1284bf215546Sopenharmony_ci assert(prog->getType() == Program::TYPE_FRAGMENT); 1285bf215546Sopenharmony_ci bld.mkInterp(NV50_IR_INTERP_LINEAR, i->getDef(0), addr, NULL); 1286bf215546Sopenharmony_ci break; 1287bf215546Sopenharmony_ci case SV_FACE: 1288bf215546Sopenharmony_ci bld.mkInterp(NV50_IR_INTERP_FLAT, def, addr, NULL); 1289bf215546Sopenharmony_ci if (i->dType == TYPE_F32) { 1290bf215546Sopenharmony_ci bld.mkOp2(OP_OR, TYPE_U32, def, def, bld.mkImm(0x00000001)); 1291bf215546Sopenharmony_ci bld.mkOp1(OP_NEG, TYPE_S32, def, def); 1292bf215546Sopenharmony_ci bld.mkCvt(OP_CVT, TYPE_F32, def, TYPE_S32, def); 1293bf215546Sopenharmony_ci } 1294bf215546Sopenharmony_ci break; 1295bf215546Sopenharmony_ci case SV_NCTAID: 1296bf215546Sopenharmony_ci case SV_CTAID: 1297bf215546Sopenharmony_ci case SV_NTID: { 1298bf215546Sopenharmony_ci Value *x = bld.getSSA(2); 1299bf215546Sopenharmony_ci bld.mkOp1(OP_LOAD, TYPE_U16, x, 1300bf215546Sopenharmony_ci bld.mkSymbol(FILE_MEMORY_SHARED, 0, TYPE_U16, addr)); 1301bf215546Sopenharmony_ci bld.mkCvt(OP_CVT, TYPE_U32, def, TYPE_U16, x); 1302bf215546Sopenharmony_ci break; 1303bf215546Sopenharmony_ci } 1304bf215546Sopenharmony_ci case SV_TID: 1305bf215546Sopenharmony_ci if (idx == 0) { 1306bf215546Sopenharmony_ci bld.mkOp2(OP_AND, TYPE_U32, def, tid, bld.mkImm(0x0000ffff)); 1307bf215546Sopenharmony_ci } else if (idx == 1) { 1308bf215546Sopenharmony_ci bld.mkOp2(OP_AND, TYPE_U32, def, tid, bld.mkImm(0x03ff0000)); 1309bf215546Sopenharmony_ci bld.mkOp2(OP_SHR, TYPE_U32, def, def, bld.mkImm(16)); 1310bf215546Sopenharmony_ci } else if (idx == 2) { 1311bf215546Sopenharmony_ci bld.mkOp2(OP_SHR, TYPE_U32, def, tid, bld.mkImm(26)); 1312bf215546Sopenharmony_ci } else { 1313bf215546Sopenharmony_ci bld.mkMov(def, bld.mkImm(0)); 1314bf215546Sopenharmony_ci } 1315bf215546Sopenharmony_ci break; 1316bf215546Sopenharmony_ci case SV_COMBINED_TID: 1317bf215546Sopenharmony_ci bld.mkMov(def, tid); 1318bf215546Sopenharmony_ci break; 1319bf215546Sopenharmony_ci case SV_SAMPLE_POS: { 1320bf215546Sopenharmony_ci Value *off = new_LValue(func, FILE_ADDRESS); 1321bf215546Sopenharmony_ci bld.mkOp1(OP_RDSV, TYPE_U32, def, bld.mkSysVal(SV_SAMPLE_INDEX, 0)); 1322bf215546Sopenharmony_ci bld.mkOp2(OP_SHL, TYPE_U32, off, def, bld.mkImm(3)); 1323bf215546Sopenharmony_ci bld.mkLoad(TYPE_F32, 1324bf215546Sopenharmony_ci def, 1325bf215546Sopenharmony_ci bld.mkSymbol( 1326bf215546Sopenharmony_ci FILE_MEMORY_CONST, prog->driver->io.auxCBSlot, 1327bf215546Sopenharmony_ci TYPE_U32, prog->driver->io.sampleInfoBase + 4 * idx), 1328bf215546Sopenharmony_ci off); 1329bf215546Sopenharmony_ci break; 1330bf215546Sopenharmony_ci } 1331bf215546Sopenharmony_ci case SV_THREAD_KILL: 1332bf215546Sopenharmony_ci // Not actually supported. But it's implementation-dependent, so we can 1333bf215546Sopenharmony_ci // always just say it's not a helper. 1334bf215546Sopenharmony_ci bld.mkMov(def, bld.loadImm(NULL, 0)); 1335bf215546Sopenharmony_ci break; 1336bf215546Sopenharmony_ci default: 1337bf215546Sopenharmony_ci bld.mkFetch(i->getDef(0), i->dType, 1338bf215546Sopenharmony_ci FILE_SHADER_INPUT, addr, i->getIndirect(0, 0), NULL); 1339bf215546Sopenharmony_ci break; 1340bf215546Sopenharmony_ci } 1341bf215546Sopenharmony_ci bld.getBB()->remove(i); 1342bf215546Sopenharmony_ci return true; 1343bf215546Sopenharmony_ci} 1344bf215546Sopenharmony_ci 1345bf215546Sopenharmony_cibool 1346bf215546Sopenharmony_ciNV50LoweringPreSSA::handleDIV(Instruction *i) 1347bf215546Sopenharmony_ci{ 1348bf215546Sopenharmony_ci if (!isFloatType(i->dType)) 1349bf215546Sopenharmony_ci return true; 1350bf215546Sopenharmony_ci bld.setPosition(i, false); 1351bf215546Sopenharmony_ci Instruction *rcp = bld.mkOp1(OP_RCP, i->dType, bld.getSSA(), i->getSrc(1)); 1352bf215546Sopenharmony_ci i->op = OP_MUL; 1353bf215546Sopenharmony_ci i->setSrc(1, rcp->getDef(0)); 1354bf215546Sopenharmony_ci return true; 1355bf215546Sopenharmony_ci} 1356bf215546Sopenharmony_ci 1357bf215546Sopenharmony_cibool 1358bf215546Sopenharmony_ciNV50LoweringPreSSA::handleSQRT(Instruction *i) 1359bf215546Sopenharmony_ci{ 1360bf215546Sopenharmony_ci bld.setPosition(i, true); 1361bf215546Sopenharmony_ci i->op = OP_RSQ; 1362bf215546Sopenharmony_ci bld.mkOp1(OP_RCP, i->dType, i->getDef(0), i->getDef(0)); 1363bf215546Sopenharmony_ci 1364bf215546Sopenharmony_ci return true; 1365bf215546Sopenharmony_ci} 1366bf215546Sopenharmony_ci 1367bf215546Sopenharmony_cibool 1368bf215546Sopenharmony_ciNV50LoweringPreSSA::handlePOW(Instruction *i) 1369bf215546Sopenharmony_ci{ 1370bf215546Sopenharmony_ci LValue *val = bld.getScratch(); 1371bf215546Sopenharmony_ci 1372bf215546Sopenharmony_ci bld.mkOp1(OP_LG2, TYPE_F32, val, i->getSrc(0)); 1373bf215546Sopenharmony_ci bld.mkOp2(OP_MUL, TYPE_F32, val, i->getSrc(1), val)->dnz = 1; 1374bf215546Sopenharmony_ci bld.mkOp1(OP_PREEX2, TYPE_F32, val, val); 1375bf215546Sopenharmony_ci 1376bf215546Sopenharmony_ci i->op = OP_EX2; 1377bf215546Sopenharmony_ci i->setSrc(0, val); 1378bf215546Sopenharmony_ci i->setSrc(1, NULL); 1379bf215546Sopenharmony_ci 1380bf215546Sopenharmony_ci return true; 1381bf215546Sopenharmony_ci} 1382bf215546Sopenharmony_ci 1383bf215546Sopenharmony_cibool 1384bf215546Sopenharmony_ciNV50LoweringPreSSA::handleEXPORT(Instruction *i) 1385bf215546Sopenharmony_ci{ 1386bf215546Sopenharmony_ci if (prog->getType() == Program::TYPE_FRAGMENT) { 1387bf215546Sopenharmony_ci if (i->getIndirect(0, 0)) { 1388bf215546Sopenharmony_ci // TODO: redirect to l[] here, load to GPRs at exit 1389bf215546Sopenharmony_ci return false; 1390bf215546Sopenharmony_ci } else { 1391bf215546Sopenharmony_ci int id = i->getSrc(0)->reg.data.offset / 4; // in 32 bit reg units 1392bf215546Sopenharmony_ci 1393bf215546Sopenharmony_ci i->op = OP_MOV; 1394bf215546Sopenharmony_ci i->subOp = NV50_IR_SUBOP_MOV_FINAL; 1395bf215546Sopenharmony_ci i->src(0).set(i->src(1)); 1396bf215546Sopenharmony_ci i->setSrc(1, NULL); 1397bf215546Sopenharmony_ci i->setDef(0, new_LValue(func, FILE_GPR)); 1398bf215546Sopenharmony_ci i->getDef(0)->reg.data.id = id; 1399bf215546Sopenharmony_ci 1400bf215546Sopenharmony_ci prog->maxGPR = MAX2(prog->maxGPR, id * 2); 1401bf215546Sopenharmony_ci } 1402bf215546Sopenharmony_ci } 1403bf215546Sopenharmony_ci return true; 1404bf215546Sopenharmony_ci} 1405bf215546Sopenharmony_ci 1406bf215546Sopenharmony_ci// Handle indirect addressing in geometry shaders: 1407bf215546Sopenharmony_ci// 1408bf215546Sopenharmony_ci// ld $r0 a[$a1][$a2+k] -> 1409bf215546Sopenharmony_ci// ld $r0 a[($a1 + $a2 * $vstride) + k], where k *= $vstride is implicit 1410bf215546Sopenharmony_ci// 1411bf215546Sopenharmony_cibool 1412bf215546Sopenharmony_ciNV50LoweringPreSSA::handleLOAD(Instruction *i) 1413bf215546Sopenharmony_ci{ 1414bf215546Sopenharmony_ci ValueRef src = i->src(0); 1415bf215546Sopenharmony_ci Symbol *sym = i->getSrc(0)->asSym(); 1416bf215546Sopenharmony_ci 1417bf215546Sopenharmony_ci if (prog->getType() == Program::TYPE_COMPUTE) { 1418bf215546Sopenharmony_ci if (sym->inFile(FILE_MEMORY_SHARED) || 1419bf215546Sopenharmony_ci sym->inFile(FILE_MEMORY_BUFFER) || 1420bf215546Sopenharmony_ci sym->inFile(FILE_MEMORY_GLOBAL)) { 1421bf215546Sopenharmony_ci return handleLDST(i); 1422bf215546Sopenharmony_ci } 1423bf215546Sopenharmony_ci } 1424bf215546Sopenharmony_ci 1425bf215546Sopenharmony_ci if (src.isIndirect(1)) { 1426bf215546Sopenharmony_ci assert(prog->getType() == Program::TYPE_GEOMETRY); 1427bf215546Sopenharmony_ci Value *addr = i->getIndirect(0, 1); 1428bf215546Sopenharmony_ci 1429bf215546Sopenharmony_ci if (src.isIndirect(0)) { 1430bf215546Sopenharmony_ci // base address is in an address register, so move to a GPR 1431bf215546Sopenharmony_ci Value *base = bld.getScratch(); 1432bf215546Sopenharmony_ci bld.mkMov(base, addr); 1433bf215546Sopenharmony_ci 1434bf215546Sopenharmony_ci Symbol *sv = bld.mkSysVal(SV_VERTEX_STRIDE, 0); 1435bf215546Sopenharmony_ci Value *vstride = bld.mkOp1v(OP_RDSV, TYPE_U32, bld.getSSA(), sv); 1436bf215546Sopenharmony_ci Value *attrib = bld.mkOp2v(OP_SHL, TYPE_U32, bld.getSSA(), 1437bf215546Sopenharmony_ci i->getIndirect(0, 0), bld.mkImm(2)); 1438bf215546Sopenharmony_ci 1439bf215546Sopenharmony_ci // Calculate final address: addr = base + attr*vstride; use 16-bit 1440bf215546Sopenharmony_ci // multiplication since 32-bit would be lowered to multiple 1441bf215546Sopenharmony_ci // instructions, and we only need the low 16 bits of the result 1442bf215546Sopenharmony_ci Value *a[2], *b[2]; 1443bf215546Sopenharmony_ci bld.mkSplit(a, 2, attrib); 1444bf215546Sopenharmony_ci bld.mkSplit(b, 2, vstride); 1445bf215546Sopenharmony_ci Value *sum = bld.mkOp3v(OP_MAD, TYPE_U16, bld.getSSA(), a[0], b[0], 1446bf215546Sopenharmony_ci base); 1447bf215546Sopenharmony_ci 1448bf215546Sopenharmony_ci // move address from GPR into an address register 1449bf215546Sopenharmony_ci addr = bld.getSSA(2, FILE_ADDRESS); 1450bf215546Sopenharmony_ci bld.mkMov(addr, sum); 1451bf215546Sopenharmony_ci } 1452bf215546Sopenharmony_ci 1453bf215546Sopenharmony_ci i->setIndirect(0, 1, NULL); 1454bf215546Sopenharmony_ci i->setIndirect(0, 0, addr); 1455bf215546Sopenharmony_ci } 1456bf215546Sopenharmony_ci 1457bf215546Sopenharmony_ci return true; 1458bf215546Sopenharmony_ci} 1459bf215546Sopenharmony_ci 1460bf215546Sopenharmony_cibool 1461bf215546Sopenharmony_ciNV50LoweringPreSSA::handleSharedATOM(Instruction *atom) 1462bf215546Sopenharmony_ci{ 1463bf215546Sopenharmony_ci assert(atom->src(0).getFile() == FILE_MEMORY_SHARED); 1464bf215546Sopenharmony_ci 1465bf215546Sopenharmony_ci BasicBlock *currBB = atom->bb; 1466bf215546Sopenharmony_ci BasicBlock *tryLockBB = atom->bb->splitBefore(atom, false); 1467bf215546Sopenharmony_ci BasicBlock *joinBB = atom->bb->splitAfter(atom); 1468bf215546Sopenharmony_ci BasicBlock *setAndUnlockBB = new BasicBlock(func); 1469bf215546Sopenharmony_ci BasicBlock *failLockBB = new BasicBlock(func); 1470bf215546Sopenharmony_ci 1471bf215546Sopenharmony_ci bld.setPosition(currBB, true); 1472bf215546Sopenharmony_ci assert(!currBB->joinAt); 1473bf215546Sopenharmony_ci currBB->joinAt = bld.mkFlow(OP_JOINAT, joinBB, CC_ALWAYS, NULL); 1474bf215546Sopenharmony_ci 1475bf215546Sopenharmony_ci bld.mkFlow(OP_BRA, tryLockBB, CC_ALWAYS, NULL); 1476bf215546Sopenharmony_ci currBB->cfg.attach(&tryLockBB->cfg, Graph::Edge::TREE); 1477bf215546Sopenharmony_ci 1478bf215546Sopenharmony_ci bld.setPosition(tryLockBB, true); 1479bf215546Sopenharmony_ci 1480bf215546Sopenharmony_ci Instruction *ld = 1481bf215546Sopenharmony_ci bld.mkLoad(TYPE_U32, atom->getDef(0), atom->getSrc(0)->asSym(), 1482bf215546Sopenharmony_ci atom->getIndirect(0, 0)); 1483bf215546Sopenharmony_ci Value *locked = bld.getSSA(1, FILE_FLAGS); 1484bf215546Sopenharmony_ci if (prog->getTarget()->getChipset() >= 0xa0) { 1485bf215546Sopenharmony_ci ld->setFlagsDef(1, locked); 1486bf215546Sopenharmony_ci ld->subOp = NV50_IR_SUBOP_LOAD_LOCKED; 1487bf215546Sopenharmony_ci } else { 1488bf215546Sopenharmony_ci bld.mkMov(locked, bld.loadImm(NULL, 2)) 1489bf215546Sopenharmony_ci ->flagsDef = 0; 1490bf215546Sopenharmony_ci } 1491bf215546Sopenharmony_ci 1492bf215546Sopenharmony_ci bld.mkFlow(OP_BRA, setAndUnlockBB, CC_LT, locked); 1493bf215546Sopenharmony_ci bld.mkFlow(OP_BRA, failLockBB, CC_ALWAYS, NULL); 1494bf215546Sopenharmony_ci tryLockBB->cfg.attach(&failLockBB->cfg, Graph::Edge::CROSS); 1495bf215546Sopenharmony_ci tryLockBB->cfg.attach(&setAndUnlockBB->cfg, Graph::Edge::TREE); 1496bf215546Sopenharmony_ci 1497bf215546Sopenharmony_ci tryLockBB->cfg.detach(&joinBB->cfg); 1498bf215546Sopenharmony_ci bld.remove(atom); 1499bf215546Sopenharmony_ci 1500bf215546Sopenharmony_ci bld.setPosition(setAndUnlockBB, true); 1501bf215546Sopenharmony_ci Value *stVal; 1502bf215546Sopenharmony_ci if (atom->subOp == NV50_IR_SUBOP_ATOM_EXCH) { 1503bf215546Sopenharmony_ci // Read the old value, and write the new one. 1504bf215546Sopenharmony_ci stVal = atom->getSrc(1); 1505bf215546Sopenharmony_ci } else if (atom->subOp == NV50_IR_SUBOP_ATOM_CAS) { 1506bf215546Sopenharmony_ci CmpInstruction *set = 1507bf215546Sopenharmony_ci bld.mkCmp(OP_SET, CC_EQ, TYPE_U32, bld.getSSA(1, FILE_FLAGS), 1508bf215546Sopenharmony_ci TYPE_U32, ld->getDef(0), atom->getSrc(1)); 1509bf215546Sopenharmony_ci 1510bf215546Sopenharmony_ci Instruction *selp = 1511bf215546Sopenharmony_ci bld.mkOp3(OP_SELP, TYPE_U32, bld.getSSA(), atom->getSrc(2), 1512bf215546Sopenharmony_ci ld->getDef(0), set->getDef(0)); 1513bf215546Sopenharmony_ci stVal = selp->getDef(0); 1514bf215546Sopenharmony_ci 1515bf215546Sopenharmony_ci handleSELP(selp); 1516bf215546Sopenharmony_ci } else { 1517bf215546Sopenharmony_ci operation op; 1518bf215546Sopenharmony_ci 1519bf215546Sopenharmony_ci switch (atom->subOp) { 1520bf215546Sopenharmony_ci case NV50_IR_SUBOP_ATOM_ADD: 1521bf215546Sopenharmony_ci op = OP_ADD; 1522bf215546Sopenharmony_ci break; 1523bf215546Sopenharmony_ci case NV50_IR_SUBOP_ATOM_AND: 1524bf215546Sopenharmony_ci op = OP_AND; 1525bf215546Sopenharmony_ci break; 1526bf215546Sopenharmony_ci case NV50_IR_SUBOP_ATOM_OR: 1527bf215546Sopenharmony_ci op = OP_OR; 1528bf215546Sopenharmony_ci break; 1529bf215546Sopenharmony_ci case NV50_IR_SUBOP_ATOM_XOR: 1530bf215546Sopenharmony_ci op = OP_XOR; 1531bf215546Sopenharmony_ci break; 1532bf215546Sopenharmony_ci case NV50_IR_SUBOP_ATOM_MIN: 1533bf215546Sopenharmony_ci op = OP_MIN; 1534bf215546Sopenharmony_ci break; 1535bf215546Sopenharmony_ci case NV50_IR_SUBOP_ATOM_MAX: 1536bf215546Sopenharmony_ci op = OP_MAX; 1537bf215546Sopenharmony_ci break; 1538bf215546Sopenharmony_ci default: 1539bf215546Sopenharmony_ci assert(0); 1540bf215546Sopenharmony_ci return false; 1541bf215546Sopenharmony_ci } 1542bf215546Sopenharmony_ci 1543bf215546Sopenharmony_ci Instruction *i = 1544bf215546Sopenharmony_ci bld.mkOp2(op, atom->dType, bld.getSSA(), ld->getDef(0), 1545bf215546Sopenharmony_ci atom->getSrc(1)); 1546bf215546Sopenharmony_ci 1547bf215546Sopenharmony_ci stVal = i->getDef(0); 1548bf215546Sopenharmony_ci } 1549bf215546Sopenharmony_ci 1550bf215546Sopenharmony_ci Instruction *store = bld.mkStore(OP_STORE, TYPE_U32, atom->getSrc(0)->asSym(), 1551bf215546Sopenharmony_ci atom->getIndirect(0, 0), stVal); 1552bf215546Sopenharmony_ci if (prog->getTarget()->getChipset() >= 0xa0) { 1553bf215546Sopenharmony_ci store->subOp = NV50_IR_SUBOP_STORE_UNLOCKED; 1554bf215546Sopenharmony_ci } 1555bf215546Sopenharmony_ci 1556bf215546Sopenharmony_ci bld.mkFlow(OP_BRA, failLockBB, CC_ALWAYS, NULL); 1557bf215546Sopenharmony_ci setAndUnlockBB->cfg.attach(&failLockBB->cfg, Graph::Edge::TREE); 1558bf215546Sopenharmony_ci 1559bf215546Sopenharmony_ci // Loop until the lock is acquired. 1560bf215546Sopenharmony_ci bld.setPosition(failLockBB, true); 1561bf215546Sopenharmony_ci bld.mkFlow(OP_BRA, tryLockBB, CC_GEU, locked); 1562bf215546Sopenharmony_ci bld.mkFlow(OP_BRA, joinBB, CC_ALWAYS, NULL); 1563bf215546Sopenharmony_ci failLockBB->cfg.attach(&tryLockBB->cfg, Graph::Edge::BACK); 1564bf215546Sopenharmony_ci failLockBB->cfg.attach(&joinBB->cfg, Graph::Edge::TREE); 1565bf215546Sopenharmony_ci 1566bf215546Sopenharmony_ci bld.setPosition(joinBB, false); 1567bf215546Sopenharmony_ci bld.mkFlow(OP_JOIN, NULL, CC_ALWAYS, NULL)->fixed = 1; 1568bf215546Sopenharmony_ci 1569bf215546Sopenharmony_ci return true; 1570bf215546Sopenharmony_ci} 1571bf215546Sopenharmony_ci 1572bf215546Sopenharmony_cibool 1573bf215546Sopenharmony_ciNV50LoweringPreSSA::handleLDST(Instruction *i) 1574bf215546Sopenharmony_ci{ 1575bf215546Sopenharmony_ci ValueRef src = i->src(0); 1576bf215546Sopenharmony_ci Symbol *sym = i->getSrc(0)->asSym(); 1577bf215546Sopenharmony_ci 1578bf215546Sopenharmony_ci if (prog->getType() != Program::TYPE_COMPUTE) { 1579bf215546Sopenharmony_ci return true; 1580bf215546Sopenharmony_ci } 1581bf215546Sopenharmony_ci 1582bf215546Sopenharmony_ci // Buffers just map directly to the different global memory spaces 1583bf215546Sopenharmony_ci if (sym->inFile(FILE_MEMORY_BUFFER)) { 1584bf215546Sopenharmony_ci sym->reg.file = FILE_MEMORY_GLOBAL; 1585bf215546Sopenharmony_ci } 1586bf215546Sopenharmony_ci 1587bf215546Sopenharmony_ci if (sym->inFile(FILE_MEMORY_SHARED)) { 1588bf215546Sopenharmony_ci 1589bf215546Sopenharmony_ci if (src.isIndirect(0)) { 1590bf215546Sopenharmony_ci Value *addr = i->getIndirect(0, 0); 1591bf215546Sopenharmony_ci 1592bf215546Sopenharmony_ci if (!addr->inFile(FILE_ADDRESS)) { 1593bf215546Sopenharmony_ci // Move address from GPR into an address register 1594bf215546Sopenharmony_ci Value *new_addr = bld.getSSA(2, FILE_ADDRESS); 1595bf215546Sopenharmony_ci bld.mkMov(new_addr, addr); 1596bf215546Sopenharmony_ci 1597bf215546Sopenharmony_ci i->setIndirect(0, 0, new_addr); 1598bf215546Sopenharmony_ci } 1599bf215546Sopenharmony_ci } 1600bf215546Sopenharmony_ci 1601bf215546Sopenharmony_ci if (i->op == OP_ATOM) 1602bf215546Sopenharmony_ci handleSharedATOM(i); 1603bf215546Sopenharmony_ci } else if (sym->inFile(FILE_MEMORY_GLOBAL)) { 1604bf215546Sopenharmony_ci // All global access must be indirect. There are no instruction forms 1605bf215546Sopenharmony_ci // with direct access. 1606bf215546Sopenharmony_ci Value *addr = i->getIndirect(0, 0); 1607bf215546Sopenharmony_ci 1608bf215546Sopenharmony_ci Value *offset = bld.loadImm(bld.getSSA(), sym->reg.data.offset); 1609bf215546Sopenharmony_ci Value *sum; 1610bf215546Sopenharmony_ci if (addr != NULL) 1611bf215546Sopenharmony_ci sum = bld.mkOp2v(OP_ADD, TYPE_U32, bld.getSSA(), addr, 1612bf215546Sopenharmony_ci offset); 1613bf215546Sopenharmony_ci else 1614bf215546Sopenharmony_ci sum = offset; 1615bf215546Sopenharmony_ci 1616bf215546Sopenharmony_ci i->setIndirect(0, 0, sum); 1617bf215546Sopenharmony_ci sym->reg.data.offset = 0; 1618bf215546Sopenharmony_ci } 1619bf215546Sopenharmony_ci 1620bf215546Sopenharmony_ci return true; 1621bf215546Sopenharmony_ci} 1622bf215546Sopenharmony_ci 1623bf215546Sopenharmony_cibool 1624bf215546Sopenharmony_ciNV50LoweringPreSSA::handleMEMBAR(Instruction *i) 1625bf215546Sopenharmony_ci{ 1626bf215546Sopenharmony_ci // For global memory, apparently doing a bunch of reads at different 1627bf215546Sopenharmony_ci // addresses forces things to get sufficiently flushed. 1628bf215546Sopenharmony_ci if (i->subOp & NV50_IR_SUBOP_MEMBAR_GL) { 1629bf215546Sopenharmony_ci uint8_t b = prog->driver->io.auxCBSlot; 1630bf215546Sopenharmony_ci Value *base = 1631bf215546Sopenharmony_ci bld.mkLoadv(TYPE_U32, bld.mkSymbol(FILE_MEMORY_CONST, b, TYPE_U32, 1632bf215546Sopenharmony_ci prog->driver->io.membarOffset), NULL); 1633bf215546Sopenharmony_ci Value *physid = bld.mkOp1v(OP_RDSV, TYPE_U32, bld.getSSA(), bld.mkSysVal(SV_PHYSID, 0)); 1634bf215546Sopenharmony_ci Value *off = bld.mkOp2v(OP_SHL, TYPE_U32, bld.getSSA(), 1635bf215546Sopenharmony_ci bld.mkOp2v(OP_AND, TYPE_U32, bld.getSSA(), 1636bf215546Sopenharmony_ci physid, bld.loadImm(NULL, 0x1f)), 1637bf215546Sopenharmony_ci bld.loadImm(NULL, 2)); 1638bf215546Sopenharmony_ci base = bld.mkOp2v(OP_ADD, TYPE_U32, bld.getSSA(), base, off); 1639bf215546Sopenharmony_ci Symbol *gmemMembar = bld.mkSymbol(FILE_MEMORY_GLOBAL, prog->driver->io.gmemMembar, TYPE_U32, 0); 1640bf215546Sopenharmony_ci for (int i = 0; i < 8; i++) { 1641bf215546Sopenharmony_ci if (i != 0) { 1642bf215546Sopenharmony_ci base = bld.mkOp2v(OP_ADD, TYPE_U32, bld.getSSA(), base, bld.loadImm(NULL, 0x100)); 1643bf215546Sopenharmony_ci } 1644bf215546Sopenharmony_ci bld.mkLoad(TYPE_U32, bld.getSSA(), gmemMembar, base) 1645bf215546Sopenharmony_ci ->fixed = 1; 1646bf215546Sopenharmony_ci } 1647bf215546Sopenharmony_ci } 1648bf215546Sopenharmony_ci 1649bf215546Sopenharmony_ci // Both global and shared memory barriers also need a regular control bar 1650bf215546Sopenharmony_ci // TODO: double-check this is the case 1651bf215546Sopenharmony_ci i->op = OP_BAR; 1652bf215546Sopenharmony_ci i->subOp = NV50_IR_SUBOP_BAR_SYNC; 1653bf215546Sopenharmony_ci i->setSrc(0, bld.mkImm(0u)); 1654bf215546Sopenharmony_ci i->setSrc(1, bld.mkImm(0u)); 1655bf215546Sopenharmony_ci 1656bf215546Sopenharmony_ci return true; 1657bf215546Sopenharmony_ci} 1658bf215546Sopenharmony_ci 1659bf215546Sopenharmony_ci// The type that bests represents how each component can be stored when packed. 1660bf215546Sopenharmony_cistatic DataType 1661bf215546Sopenharmony_cigetPackedType(const TexInstruction::ImgFormatDesc *t, int c) 1662bf215546Sopenharmony_ci{ 1663bf215546Sopenharmony_ci switch (t->type) { 1664bf215546Sopenharmony_ci case FLOAT: return t->bits[c] == 16 ? TYPE_F16 : TYPE_F32; 1665bf215546Sopenharmony_ci case UNORM: return t->bits[c] == 8 ? TYPE_U8 : TYPE_U16; 1666bf215546Sopenharmony_ci case SNORM: return t->bits[c] == 8 ? TYPE_S8 : TYPE_S16; 1667bf215546Sopenharmony_ci case UINT: 1668bf215546Sopenharmony_ci return (t->bits[c] == 8 ? TYPE_U8 : 1669bf215546Sopenharmony_ci (t->bits[c] <= 16 ? TYPE_U16 : TYPE_U32)); 1670bf215546Sopenharmony_ci case SINT: 1671bf215546Sopenharmony_ci return (t->bits[c] == 8 ? TYPE_S8 : 1672bf215546Sopenharmony_ci (t->bits[c] <= 16 ? TYPE_S16 : TYPE_S32)); 1673bf215546Sopenharmony_ci } 1674bf215546Sopenharmony_ci return TYPE_NONE; 1675bf215546Sopenharmony_ci} 1676bf215546Sopenharmony_ci 1677bf215546Sopenharmony_ci// The type that the rest of the shader expects to process this image type in. 1678bf215546Sopenharmony_cistatic DataType 1679bf215546Sopenharmony_cigetShaderType(const ImgType type) { 1680bf215546Sopenharmony_ci switch (type) { 1681bf215546Sopenharmony_ci case FLOAT: 1682bf215546Sopenharmony_ci case UNORM: 1683bf215546Sopenharmony_ci case SNORM: 1684bf215546Sopenharmony_ci return TYPE_F32; 1685bf215546Sopenharmony_ci case UINT: 1686bf215546Sopenharmony_ci return TYPE_U32; 1687bf215546Sopenharmony_ci case SINT: 1688bf215546Sopenharmony_ci return TYPE_S32; 1689bf215546Sopenharmony_ci default: 1690bf215546Sopenharmony_ci assert(!"Impossible type"); 1691bf215546Sopenharmony_ci return TYPE_NONE; 1692bf215546Sopenharmony_ci } 1693bf215546Sopenharmony_ci} 1694bf215546Sopenharmony_ci 1695bf215546Sopenharmony_ci// Reads the raw coordinates out of the input instruction, and returns a 1696bf215546Sopenharmony_ci// single-value coordinate which is what the hardware expects to receive in a 1697bf215546Sopenharmony_ci// ld/st op. 1698bf215546Sopenharmony_ciValue * 1699bf215546Sopenharmony_ciNV50LoweringPreSSA::processSurfaceCoords(TexInstruction *su) 1700bf215546Sopenharmony_ci{ 1701bf215546Sopenharmony_ci const int slot = su->tex.r; 1702bf215546Sopenharmony_ci const int dim = su->tex.target.getDim(); 1703bf215546Sopenharmony_ci const int arg = dim + (su->tex.target.isArray() || su->tex.target.isCube()); 1704bf215546Sopenharmony_ci 1705bf215546Sopenharmony_ci const TexInstruction::ImgFormatDesc *format = su->tex.format; 1706bf215546Sopenharmony_ci const uint16_t bytes = (format->bits[0] + format->bits[1] + 1707bf215546Sopenharmony_ci format->bits[2] + format->bits[3]) / 8; 1708bf215546Sopenharmony_ci uint16_t shift = ffs(bytes) - 1; 1709bf215546Sopenharmony_ci 1710bf215546Sopenharmony_ci // Buffer sizes don't necessarily fit in 16-bit values 1711bf215546Sopenharmony_ci if (su->tex.target == TEX_TARGET_BUFFER) { 1712bf215546Sopenharmony_ci return bld.mkOp2v(OP_SHL, TYPE_U32, bld.getSSA(), 1713bf215546Sopenharmony_ci su->getSrc(0), bld.loadImm(NULL, (uint32_t)shift)); 1714bf215546Sopenharmony_ci } 1715bf215546Sopenharmony_ci 1716bf215546Sopenharmony_ci // For buffers, we just need the byte offset. And for 2d buffers we want 1717bf215546Sopenharmony_ci // the x coordinate in bytes as well. 1718bf215546Sopenharmony_ci Value *coords[3] = {}; 1719bf215546Sopenharmony_ci for (int i = 0; i < arg; i++) { 1720bf215546Sopenharmony_ci Value *src[2]; 1721bf215546Sopenharmony_ci bld.mkSplit(src, 2, su->getSrc(i)); 1722bf215546Sopenharmony_ci coords[i] = src[0]; 1723bf215546Sopenharmony_ci // For 1d-images, we want the y coord to be 0, which it will be here. 1724bf215546Sopenharmony_ci if (i == 0) 1725bf215546Sopenharmony_ci coords[1] = src[1]; 1726bf215546Sopenharmony_ci } 1727bf215546Sopenharmony_ci 1728bf215546Sopenharmony_ci coords[0] = bld.mkOp2v(OP_SHL, TYPE_U16, bld.getSSA(2), 1729bf215546Sopenharmony_ci coords[0], bld.loadImm(NULL, shift)); 1730bf215546Sopenharmony_ci 1731bf215546Sopenharmony_ci if (su->tex.target.isMS()) { 1732bf215546Sopenharmony_ci Value *ms_x = loadSuInfo16(slot, NV50_SU_INFO_MS(0)); 1733bf215546Sopenharmony_ci Value *ms_y = loadSuInfo16(slot, NV50_SU_INFO_MS(1)); 1734bf215546Sopenharmony_ci coords[0] = bld.mkOp2v(OP_SHL, TYPE_U16, bld.getSSA(2), coords[0], ms_x); 1735bf215546Sopenharmony_ci coords[1] = bld.mkOp2v(OP_SHL, TYPE_U16, bld.getSSA(2), coords[1], ms_y); 1736bf215546Sopenharmony_ci } 1737bf215546Sopenharmony_ci 1738bf215546Sopenharmony_ci // If there are more dimensions, we just want the y-offset. But that needs 1739bf215546Sopenharmony_ci // to be adjusted up by the y-stride for array images. 1740bf215546Sopenharmony_ci if (su->tex.target.isArray() || su->tex.target.isCube()) { 1741bf215546Sopenharmony_ci Value *index = coords[dim]; 1742bf215546Sopenharmony_ci Value *height = loadSuInfo16(slot, NV50_SU_INFO_STRIDE_Y); 1743bf215546Sopenharmony_ci Instruction *mul = bld.mkOp2(OP_MUL, TYPE_U32, bld.getSSA(4), index, height); 1744bf215546Sopenharmony_ci mul->sType = TYPE_U16; 1745bf215546Sopenharmony_ci Value *muls[2]; 1746bf215546Sopenharmony_ci bld.mkSplit(muls, 2, mul->getDef(0)); 1747bf215546Sopenharmony_ci if (dim > 1) 1748bf215546Sopenharmony_ci coords[1] = bld.mkOp2v(OP_ADD, TYPE_U16, bld.getSSA(2), coords[1], muls[0]); 1749bf215546Sopenharmony_ci else 1750bf215546Sopenharmony_ci coords[1] = muls[0]; 1751bf215546Sopenharmony_ci } 1752bf215546Sopenharmony_ci 1753bf215546Sopenharmony_ci // 3d is special-cased. Note that a single "slice" of a 3d image may 1754bf215546Sopenharmony_ci // also be attached as 2d, so we have to do the same 3d processing for 1755bf215546Sopenharmony_ci // 2d as well, just in case. In order to remap a 3d image onto a 2d 1756bf215546Sopenharmony_ci // image, we have to retile it "by hand". 1757bf215546Sopenharmony_ci if (su->tex.target == TEX_TARGET_3D || su->tex.target == TEX_TARGET_2D) { 1758bf215546Sopenharmony_ci Value *z = loadSuInfo16(slot, NV50_SU_INFO_OFFSET_Z); 1759bf215546Sopenharmony_ci Value *y_size_aligned = loadSuInfo16(slot, NV50_SU_INFO_STRIDE_Y); 1760bf215546Sopenharmony_ci // Add the z coordinate for actual 3d-images 1761bf215546Sopenharmony_ci if (dim > 2) 1762bf215546Sopenharmony_ci coords[2] = bld.mkOp2v(OP_ADD, TYPE_U16, bld.getSSA(2), z, coords[2]); 1763bf215546Sopenharmony_ci else 1764bf215546Sopenharmony_ci coords[2] = z; 1765bf215546Sopenharmony_ci 1766bf215546Sopenharmony_ci // Compute the surface parameters from tile shifts 1767bf215546Sopenharmony_ci Value *tile_shift[3]; 1768bf215546Sopenharmony_ci Value *tile_size[3]; 1769bf215546Sopenharmony_ci Value *tile_mask[3]; 1770bf215546Sopenharmony_ci // We only ever use one kind of X-tiling. 1771bf215546Sopenharmony_ci tile_shift[0] = bld.loadImm(NULL, (uint16_t)6); 1772bf215546Sopenharmony_ci tile_size[0] = bld.loadImm(NULL, (uint16_t)64); 1773bf215546Sopenharmony_ci tile_mask[0] = bld.loadImm(NULL, (uint16_t)63); 1774bf215546Sopenharmony_ci // Fetch the "real" tiling parameters of the underlying surface 1775bf215546Sopenharmony_ci for (int i = 1; i < 3; i++) { 1776bf215546Sopenharmony_ci tile_shift[i] = loadSuInfo16(slot, NV50_SU_INFO_TILE_SHIFT(i)); 1777bf215546Sopenharmony_ci tile_size[i] = bld.mkOp2v(OP_SHL, TYPE_U16, bld.getSSA(2), bld.loadImm(NULL, (uint16_t)1), tile_shift[i]); 1778bf215546Sopenharmony_ci tile_mask[i] = bld.mkOp2v(OP_ADD, TYPE_U16, bld.getSSA(2), tile_size[i], bld.loadImm(NULL, (uint16_t)-1)); 1779bf215546Sopenharmony_ci } 1780bf215546Sopenharmony_ci 1781bf215546Sopenharmony_ci // Compute the location of given coordinate, both inside the tile as 1782bf215546Sopenharmony_ci // well as which (linearly-laid out) tile it's in. 1783bf215546Sopenharmony_ci Value *coord_in_tile[3]; 1784bf215546Sopenharmony_ci Value *tile[3]; 1785bf215546Sopenharmony_ci for (int i = 0; i < 3; i++) { 1786bf215546Sopenharmony_ci coord_in_tile[i] = bld.mkOp2v(OP_AND, TYPE_U16, bld.getSSA(2), coords[i], tile_mask[i]); 1787bf215546Sopenharmony_ci tile[i] = bld.mkOp2v(OP_SHR, TYPE_U16, bld.getSSA(2), coords[i], tile_shift[i]); 1788bf215546Sopenharmony_ci } 1789bf215546Sopenharmony_ci 1790bf215546Sopenharmony_ci // Based on the "real" tiling parameters, compute x/y coordinates in the 1791bf215546Sopenharmony_ci // larger surface with 2d tiling that was supplied to the hardware. This 1792bf215546Sopenharmony_ci // was determined and verified with the help of the tiling pseudocode in 1793bf215546Sopenharmony_ci // the envytools docs. 1794bf215546Sopenharmony_ci // 1795bf215546Sopenharmony_ci // adj_x = x_coord_in_tile + x_tile * x_tile_size * z_tile_size + 1796bf215546Sopenharmony_ci // z_coord_in_tile * x_tile_size 1797bf215546Sopenharmony_ci // adj_y = y_coord_in_tile + y_tile * y_tile_size + 1798bf215546Sopenharmony_ci // z_tile * y_tile_size * y_tiles 1799bf215546Sopenharmony_ci // 1800bf215546Sopenharmony_ci // Note: STRIDE_Y = y_tile_size * y_tiles 1801bf215546Sopenharmony_ci 1802bf215546Sopenharmony_ci coords[0] = bld.mkOp2v( 1803bf215546Sopenharmony_ci OP_ADD, TYPE_U16, bld.getSSA(2), 1804bf215546Sopenharmony_ci bld.mkOp2v(OP_ADD, TYPE_U16, bld.getSSA(2), 1805bf215546Sopenharmony_ci coord_in_tile[0], 1806bf215546Sopenharmony_ci bld.mkOp2v(OP_SHL, TYPE_U16, bld.getSSA(2), 1807bf215546Sopenharmony_ci tile[0], 1808bf215546Sopenharmony_ci bld.mkOp2v(OP_ADD, TYPE_U16, bld.getSSA(2), 1809bf215546Sopenharmony_ci tile_shift[2], tile_shift[0]))), 1810bf215546Sopenharmony_ci bld.mkOp2v(OP_SHL, TYPE_U16, bld.getSSA(2), 1811bf215546Sopenharmony_ci coord_in_tile[2], tile_shift[0])); 1812bf215546Sopenharmony_ci 1813bf215546Sopenharmony_ci Instruction *mul = bld.mkOp2(OP_MUL, TYPE_U32, bld.getSSA(4), 1814bf215546Sopenharmony_ci tile[2], y_size_aligned); 1815bf215546Sopenharmony_ci mul->sType = TYPE_U16; 1816bf215546Sopenharmony_ci Value *muls[2]; 1817bf215546Sopenharmony_ci bld.mkSplit(muls, 2, mul->getDef(0)); 1818bf215546Sopenharmony_ci 1819bf215546Sopenharmony_ci coords[1] = bld.mkOp2v( 1820bf215546Sopenharmony_ci OP_ADD, TYPE_U16, bld.getSSA(2), 1821bf215546Sopenharmony_ci muls[0], 1822bf215546Sopenharmony_ci bld.mkOp2v(OP_ADD, TYPE_U16, bld.getSSA(2), 1823bf215546Sopenharmony_ci coord_in_tile[1], 1824bf215546Sopenharmony_ci bld.mkOp2v(OP_SHL, TYPE_U16, bld.getSSA(2), 1825bf215546Sopenharmony_ci tile[1], tile_shift[1]))); 1826bf215546Sopenharmony_ci } 1827bf215546Sopenharmony_ci 1828bf215546Sopenharmony_ci return bld.mkOp2v(OP_MERGE, TYPE_U32, bld.getSSA(), coords[0], coords[1]); 1829bf215546Sopenharmony_ci} 1830bf215546Sopenharmony_ci 1831bf215546Sopenharmony_ci// This is largely a copy of NVC0LoweringPass::convertSurfaceFormat, but 1832bf215546Sopenharmony_ci// adjusted to make use of 16-bit math where possible. 1833bf215546Sopenharmony_cibool 1834bf215546Sopenharmony_ciNV50LoweringPreSSA::handleSULDP(TexInstruction *su) 1835bf215546Sopenharmony_ci{ 1836bf215546Sopenharmony_ci const int slot = su->tex.r; 1837bf215546Sopenharmony_ci assert(!su->getIndirectR()); 1838bf215546Sopenharmony_ci 1839bf215546Sopenharmony_ci bld.setPosition(su, false); 1840bf215546Sopenharmony_ci 1841bf215546Sopenharmony_ci const TexInstruction::ImgFormatDesc *format = su->tex.format; 1842bf215546Sopenharmony_ci const int bytes = (su->tex.format->bits[0] + 1843bf215546Sopenharmony_ci su->tex.format->bits[1] + 1844bf215546Sopenharmony_ci su->tex.format->bits[2] + 1845bf215546Sopenharmony_ci su->tex.format->bits[3]) / 8; 1846bf215546Sopenharmony_ci DataType ty = typeOfSize(bytes); 1847bf215546Sopenharmony_ci 1848bf215546Sopenharmony_ci Value *coord = processSurfaceCoords(su); 1849bf215546Sopenharmony_ci 1850bf215546Sopenharmony_ci Value *untypedDst[4] = {}; 1851bf215546Sopenharmony_ci Value *typedDst[4] = {}; 1852bf215546Sopenharmony_ci int i; 1853bf215546Sopenharmony_ci for (i = 0; i < bytes / 4; i++) 1854bf215546Sopenharmony_ci untypedDst[i] = bld.getSSA(); 1855bf215546Sopenharmony_ci if (bytes < 4) 1856bf215546Sopenharmony_ci untypedDst[0] = bld.getSSA(); 1857bf215546Sopenharmony_ci 1858bf215546Sopenharmony_ci for (i = 0; i < 4; i++) 1859bf215546Sopenharmony_ci typedDst[i] = su->getDef(i); 1860bf215546Sopenharmony_ci 1861bf215546Sopenharmony_ci Instruction *load = bld.mkLoad(ty, NULL, bld.mkSymbol(FILE_MEMORY_GLOBAL, slot, ty, 0), coord); 1862bf215546Sopenharmony_ci for (i = 0; i < 4 && untypedDst[i]; i++) 1863bf215546Sopenharmony_ci load->setDef(i, untypedDst[i]); 1864bf215546Sopenharmony_ci 1865bf215546Sopenharmony_ci // Unpack each component into the typed dsts 1866bf215546Sopenharmony_ci int bits = 0; 1867bf215546Sopenharmony_ci for (int i = 0; i < 4; bits += format->bits[i], i++) { 1868bf215546Sopenharmony_ci if (!typedDst[i]) 1869bf215546Sopenharmony_ci continue; 1870bf215546Sopenharmony_ci 1871bf215546Sopenharmony_ci if (i >= format->components) { 1872bf215546Sopenharmony_ci if (format->type == FLOAT || 1873bf215546Sopenharmony_ci format->type == UNORM || 1874bf215546Sopenharmony_ci format->type == SNORM) 1875bf215546Sopenharmony_ci bld.loadImm(typedDst[i], i == 3 ? 1.0f : 0.0f); 1876bf215546Sopenharmony_ci else 1877bf215546Sopenharmony_ci bld.loadImm(typedDst[i], i == 3 ? 1 : 0); 1878bf215546Sopenharmony_ci continue; 1879bf215546Sopenharmony_ci } 1880bf215546Sopenharmony_ci 1881bf215546Sopenharmony_ci // Get just that component's data into the relevant place 1882bf215546Sopenharmony_ci if (format->bits[i] == 32) 1883bf215546Sopenharmony_ci bld.mkMov(typedDst[i], untypedDst[i]); 1884bf215546Sopenharmony_ci else if (format->bits[i] == 16) { 1885bf215546Sopenharmony_ci // We can always convert directly from the appropriate half of the 1886bf215546Sopenharmony_ci // loaded value into the typed result. 1887bf215546Sopenharmony_ci Value *src[2]; 1888bf215546Sopenharmony_ci bld.mkSplit(src, 2, untypedDst[i / 2]); 1889bf215546Sopenharmony_ci bld.mkCvt(OP_CVT, getShaderType(format->type), typedDst[i], 1890bf215546Sopenharmony_ci getPackedType(format, i), src[i & 1]); 1891bf215546Sopenharmony_ci } 1892bf215546Sopenharmony_ci else if (format->bits[i] == 8) { 1893bf215546Sopenharmony_ci // Same approach as for 16 bits, but we have to massage the value a 1894bf215546Sopenharmony_ci // bit more, since we have to get the appropriate 8 bits from the 1895bf215546Sopenharmony_ci // half-register. In all cases, we can CVT from a 8-bit source, so we 1896bf215546Sopenharmony_ci // only have to shift when we want the upper 8 bits. 1897bf215546Sopenharmony_ci Value *src[2], *shifted; 1898bf215546Sopenharmony_ci bld.mkSplit(src, 2, untypedDst[0]); 1899bf215546Sopenharmony_ci DataType packedType = getPackedType(format, i); 1900bf215546Sopenharmony_ci if (i & 1) 1901bf215546Sopenharmony_ci shifted = bld.mkOp2v(OP_SHR, TYPE_U16, bld.getSSA(2), src[!!(i & 2)], bld.loadImm(NULL, (uint16_t)8)); 1902bf215546Sopenharmony_ci else 1903bf215546Sopenharmony_ci shifted = src[!!(i & 2)]; 1904bf215546Sopenharmony_ci 1905bf215546Sopenharmony_ci bld.mkCvt(OP_CVT, getShaderType(format->type), typedDst[i], 1906bf215546Sopenharmony_ci packedType, shifted); 1907bf215546Sopenharmony_ci } 1908bf215546Sopenharmony_ci else { 1909bf215546Sopenharmony_ci // The options are 10, 11, and 2. Get it into a 32-bit reg, then 1910bf215546Sopenharmony_ci // shift/mask. That's where it'll have to end up anyways. For signed, 1911bf215546Sopenharmony_ci // we have to make sure to get sign-extension, so we actually have to 1912bf215546Sopenharmony_ci // shift *up* first, and then shift down. There's no advantage to 1913bf215546Sopenharmony_ci // AND'ing, so we don't. 1914bf215546Sopenharmony_ci DataType ty = TYPE_U32; 1915bf215546Sopenharmony_ci if (format->type == SNORM || format->type == SINT) { 1916bf215546Sopenharmony_ci ty = TYPE_S32; 1917bf215546Sopenharmony_ci } 1918bf215546Sopenharmony_ci 1919bf215546Sopenharmony_ci // Poor man's EXTBF 1920bf215546Sopenharmony_ci bld.mkOp2( 1921bf215546Sopenharmony_ci OP_SHR, ty, typedDst[i], 1922bf215546Sopenharmony_ci bld.mkOp2v(OP_SHL, TYPE_U32, bld.getSSA(), untypedDst[0], bld.loadImm(NULL, 32 - bits - format->bits[i])), 1923bf215546Sopenharmony_ci bld.loadImm(NULL, 32 - format->bits[i])); 1924bf215546Sopenharmony_ci 1925bf215546Sopenharmony_ci // If the stored data is already in the appropriate type, we don't 1926bf215546Sopenharmony_ci // have to do anything. Convert to float for the *NORM formats. 1927bf215546Sopenharmony_ci if (format->type == UNORM || format->type == SNORM) 1928bf215546Sopenharmony_ci bld.mkCvt(OP_CVT, TYPE_F32, typedDst[i], TYPE_U32, typedDst[i]); 1929bf215546Sopenharmony_ci } 1930bf215546Sopenharmony_ci 1931bf215546Sopenharmony_ci // Normalize / convert as necessary 1932bf215546Sopenharmony_ci if (format->type == UNORM) 1933bf215546Sopenharmony_ci bld.mkOp2(OP_MUL, TYPE_F32, typedDst[i], typedDst[i], bld.loadImm(NULL, 1.0f / ((1 << format->bits[i]) - 1))); 1934bf215546Sopenharmony_ci else if (format->type == SNORM) 1935bf215546Sopenharmony_ci bld.mkOp2(OP_MUL, TYPE_F32, typedDst[i], typedDst[i], bld.loadImm(NULL, 1.0f / ((1 << (format->bits[i] - 1)) - 1))); 1936bf215546Sopenharmony_ci else if (format->type == FLOAT && format->bits[i] < 16) { 1937bf215546Sopenharmony_ci // We expect the value to be in the low bits of the register, so we 1938bf215546Sopenharmony_ci // have to shift back up. 1939bf215546Sopenharmony_ci bld.mkOp2(OP_SHL, TYPE_U32, typedDst[i], typedDst[i], bld.loadImm(NULL, 15 - format->bits[i])); 1940bf215546Sopenharmony_ci Value *src[2]; 1941bf215546Sopenharmony_ci bld.mkSplit(src, 2, typedDst[i]); 1942bf215546Sopenharmony_ci bld.mkCvt(OP_CVT, TYPE_F32, typedDst[i], TYPE_F16, src[0]); 1943bf215546Sopenharmony_ci } 1944bf215546Sopenharmony_ci } 1945bf215546Sopenharmony_ci 1946bf215546Sopenharmony_ci if (format->bgra) { 1947bf215546Sopenharmony_ci std::swap(typedDst[0], typedDst[2]); 1948bf215546Sopenharmony_ci } 1949bf215546Sopenharmony_ci 1950bf215546Sopenharmony_ci bld.getBB()->remove(su); 1951bf215546Sopenharmony_ci return true; 1952bf215546Sopenharmony_ci} 1953bf215546Sopenharmony_ci 1954bf215546Sopenharmony_cibool 1955bf215546Sopenharmony_ciNV50LoweringPreSSA::handleSUREDP(TexInstruction *su) 1956bf215546Sopenharmony_ci{ 1957bf215546Sopenharmony_ci const int slot = su->tex.r; 1958bf215546Sopenharmony_ci const int dim = su->tex.target.getDim(); 1959bf215546Sopenharmony_ci const int arg = dim + (su->tex.target.isArray() || su->tex.target.isCube()); 1960bf215546Sopenharmony_ci assert(!su->getIndirectR()); 1961bf215546Sopenharmony_ci 1962bf215546Sopenharmony_ci bld.setPosition(su, false); 1963bf215546Sopenharmony_ci 1964bf215546Sopenharmony_ci Value *coord = processSurfaceCoords(su); 1965bf215546Sopenharmony_ci 1966bf215546Sopenharmony_ci // This is guaranteed to be a 32-bit format. So there's nothing to 1967bf215546Sopenharmony_ci // pack/unpack. 1968bf215546Sopenharmony_ci Instruction *atom = bld.mkOp2( 1969bf215546Sopenharmony_ci OP_ATOM, su->dType, su->getDef(0), 1970bf215546Sopenharmony_ci bld.mkSymbol(FILE_MEMORY_GLOBAL, slot, TYPE_U32, 0), su->getSrc(arg)); 1971bf215546Sopenharmony_ci if (su->subOp == NV50_IR_SUBOP_ATOM_CAS) 1972bf215546Sopenharmony_ci atom->setSrc(2, su->getSrc(arg + 1)); 1973bf215546Sopenharmony_ci atom->setIndirect(0, 0, coord); 1974bf215546Sopenharmony_ci atom->subOp = su->subOp; 1975bf215546Sopenharmony_ci 1976bf215546Sopenharmony_ci bld.getBB()->remove(su); 1977bf215546Sopenharmony_ci return true; 1978bf215546Sopenharmony_ci} 1979bf215546Sopenharmony_ci 1980bf215546Sopenharmony_cibool 1981bf215546Sopenharmony_ciNV50LoweringPreSSA::handleSUSTP(TexInstruction *su) 1982bf215546Sopenharmony_ci{ 1983bf215546Sopenharmony_ci const int slot = su->tex.r; 1984bf215546Sopenharmony_ci const int dim = su->tex.target.getDim(); 1985bf215546Sopenharmony_ci const int arg = dim + (su->tex.target.isArray() || su->tex.target.isCube()); 1986bf215546Sopenharmony_ci assert(!su->getIndirectR()); 1987bf215546Sopenharmony_ci 1988bf215546Sopenharmony_ci bld.setPosition(su, false); 1989bf215546Sopenharmony_ci 1990bf215546Sopenharmony_ci const TexInstruction::ImgFormatDesc *format = su->tex.format; 1991bf215546Sopenharmony_ci const int bytes = (su->tex.format->bits[0] + 1992bf215546Sopenharmony_ci su->tex.format->bits[1] + 1993bf215546Sopenharmony_ci su->tex.format->bits[2] + 1994bf215546Sopenharmony_ci su->tex.format->bits[3]) / 8; 1995bf215546Sopenharmony_ci DataType ty = typeOfSize(bytes); 1996bf215546Sopenharmony_ci 1997bf215546Sopenharmony_ci Value *coord = processSurfaceCoords(su); 1998bf215546Sopenharmony_ci 1999bf215546Sopenharmony_ci // The packed values we will eventually store into memory 2000bf215546Sopenharmony_ci Value *untypedDst[4] = {}; 2001bf215546Sopenharmony_ci // Each component's packed representation, in 16-bit registers (only used 2002bf215546Sopenharmony_ci // where appropriate) 2003bf215546Sopenharmony_ci Value *untypedDst16[4] = {}; 2004bf215546Sopenharmony_ci // The original values that are being packed 2005bf215546Sopenharmony_ci Value *typedDst[4] = {}; 2006bf215546Sopenharmony_ci int i; 2007bf215546Sopenharmony_ci 2008bf215546Sopenharmony_ci for (i = 0; i < bytes / 4; i++) 2009bf215546Sopenharmony_ci untypedDst[i] = bld.getSSA(); 2010bf215546Sopenharmony_ci for (i = 0; i < format->components; i++) 2011bf215546Sopenharmony_ci untypedDst16[i] = bld.getSSA(2); 2012bf215546Sopenharmony_ci // Make sure we get at least one of each value allocated for the 2013bf215546Sopenharmony_ci // super-narrow formats. 2014bf215546Sopenharmony_ci if (bytes < 4) 2015bf215546Sopenharmony_ci untypedDst[0] = bld.getSSA(); 2016bf215546Sopenharmony_ci if (bytes < 2) 2017bf215546Sopenharmony_ci untypedDst16[0] = bld.getSSA(2); 2018bf215546Sopenharmony_ci 2019bf215546Sopenharmony_ci for (i = 0; i < 4; i++) { 2020bf215546Sopenharmony_ci typedDst[i] = bld.getSSA(); 2021bf215546Sopenharmony_ci bld.mkMov(typedDst[i], su->getSrc(arg + i)); 2022bf215546Sopenharmony_ci } 2023bf215546Sopenharmony_ci 2024bf215546Sopenharmony_ci if (format->bgra) { 2025bf215546Sopenharmony_ci std::swap(typedDst[0], typedDst[2]); 2026bf215546Sopenharmony_ci } 2027bf215546Sopenharmony_ci 2028bf215546Sopenharmony_ci // Pack each component into the untyped dsts. 2029bf215546Sopenharmony_ci int bits = 0; 2030bf215546Sopenharmony_ci for (int i = 0; i < format->components; bits += format->bits[i], i++) { 2031bf215546Sopenharmony_ci // Un-normalize / convert as necessary 2032bf215546Sopenharmony_ci if (format->type == UNORM) 2033bf215546Sopenharmony_ci bld.mkOp2(OP_MUL, TYPE_F32, typedDst[i], typedDst[i], bld.loadImm(NULL, 1.0f * ((1 << format->bits[i]) - 1))); 2034bf215546Sopenharmony_ci else if (format->type == SNORM) 2035bf215546Sopenharmony_ci bld.mkOp2(OP_MUL, TYPE_F32, typedDst[i], typedDst[i], bld.loadImm(NULL, 1.0f * ((1 << (format->bits[i] - 1)) - 1))); 2036bf215546Sopenharmony_ci 2037bf215546Sopenharmony_ci // There is nothing to convert/pack for 32-bit values 2038bf215546Sopenharmony_ci if (format->bits[i] == 32) { 2039bf215546Sopenharmony_ci bld.mkMov(untypedDst[i], typedDst[i]); 2040bf215546Sopenharmony_ci continue; 2041bf215546Sopenharmony_ci } 2042bf215546Sopenharmony_ci 2043bf215546Sopenharmony_ci // The remainder of the cases will naturally want to deal in 16-bit 2044bf215546Sopenharmony_ci // registers. We will put these into untypedDst16 and then merge them 2045bf215546Sopenharmony_ci // together later. 2046bf215546Sopenharmony_ci if (format->type == FLOAT && format->bits[i] < 16) { 2047bf215546Sopenharmony_ci bld.mkCvt(OP_CVT, TYPE_F16, untypedDst16[i], TYPE_F32, typedDst[i]); 2048bf215546Sopenharmony_ci bld.mkOp2(OP_SHR, TYPE_U16, untypedDst16[i], untypedDst16[i], bld.loadImm(NULL, (uint16_t)(15 - format->bits[i]))); 2049bf215546Sopenharmony_ci 2050bf215546Sopenharmony_ci // For odd bit sizes, it's easier to pack it into the final 2051bf215546Sopenharmony_ci // destination directly. 2052bf215546Sopenharmony_ci Value *tmp = bld.getSSA(); 2053bf215546Sopenharmony_ci bld.mkCvt(OP_CVT, TYPE_U32, tmp, TYPE_U16, untypedDst16[i]); 2054bf215546Sopenharmony_ci if (i == 0) { 2055bf215546Sopenharmony_ci untypedDst[0] = tmp; 2056bf215546Sopenharmony_ci } else { 2057bf215546Sopenharmony_ci bld.mkOp2(OP_SHL, TYPE_U32, tmp, tmp, bld.loadImm(NULL, bits)); 2058bf215546Sopenharmony_ci bld.mkOp2(OP_OR, TYPE_U32, untypedDst[0], untypedDst[0], tmp); 2059bf215546Sopenharmony_ci } 2060bf215546Sopenharmony_ci } else if (format->bits[i] == 16) { 2061bf215546Sopenharmony_ci // We can always convert the shader value into the packed value 2062bf215546Sopenharmony_ci // directly here 2063bf215546Sopenharmony_ci bld.mkCvt(OP_CVT, getPackedType(format, i), untypedDst16[i], 2064bf215546Sopenharmony_ci getShaderType(format->type), typedDst[i]); 2065bf215546Sopenharmony_ci } else if (format->bits[i] < 16) { 2066bf215546Sopenharmony_ci DataType packedType = getPackedType(format, i); 2067bf215546Sopenharmony_ci DataType shaderType = getShaderType(format->type); 2068bf215546Sopenharmony_ci // We can't convert F32 to U8/S8 directly, so go to U16/S16 first. 2069bf215546Sopenharmony_ci if (shaderType == TYPE_F32 && typeSizeof(packedType) == 1) { 2070bf215546Sopenharmony_ci packedType = format->type == SNORM ? TYPE_S16 : TYPE_U16; 2071bf215546Sopenharmony_ci } 2072bf215546Sopenharmony_ci bld.mkCvt(OP_CVT, packedType, untypedDst16[i], shaderType, typedDst[i]); 2073bf215546Sopenharmony_ci // TODO: clamp for 10- and 2-bit sizes. Also, due to the oddness of 2074bf215546Sopenharmony_ci // the size, it's easier to dump them into a 32-bit value and OR 2075bf215546Sopenharmony_ci // everything later. 2076bf215546Sopenharmony_ci if (format->bits[i] != 8) { 2077bf215546Sopenharmony_ci // Restrict value to the appropriate bits (although maybe supposed 2078bf215546Sopenharmony_ci // to clamp instead?) 2079bf215546Sopenharmony_ci bld.mkOp2(OP_AND, TYPE_U16, untypedDst16[i], untypedDst16[i], bld.loadImm(NULL, (uint16_t)((1 << format->bits[i]) - 1))); 2080bf215546Sopenharmony_ci // And merge into final packed value 2081bf215546Sopenharmony_ci Value *tmp = bld.getSSA(); 2082bf215546Sopenharmony_ci bld.mkCvt(OP_CVT, TYPE_U32, tmp, TYPE_U16, untypedDst16[i]); 2083bf215546Sopenharmony_ci if (i == 0) { 2084bf215546Sopenharmony_ci untypedDst[0] = tmp; 2085bf215546Sopenharmony_ci } else { 2086bf215546Sopenharmony_ci bld.mkOp2(OP_SHL, TYPE_U32, tmp, tmp, bld.loadImm(NULL, bits)); 2087bf215546Sopenharmony_ci bld.mkOp2(OP_OR, TYPE_U32, untypedDst[0], untypedDst[0], tmp); 2088bf215546Sopenharmony_ci } 2089bf215546Sopenharmony_ci } else if (i & 1) { 2090bf215546Sopenharmony_ci // Shift the 8-bit value up (so that it can be OR'd later) 2091bf215546Sopenharmony_ci bld.mkOp2(OP_SHL, TYPE_U16, untypedDst16[i], untypedDst16[i], bld.loadImm(NULL, (uint16_t)(bits % 16))); 2092bf215546Sopenharmony_ci } else if (packedType != TYPE_U8) { 2093bf215546Sopenharmony_ci // S8 (or the *16 if converted from float) will all have high bits 2094bf215546Sopenharmony_ci // set, so AND them out. 2095bf215546Sopenharmony_ci bld.mkOp2(OP_AND, TYPE_U16, untypedDst16[i], untypedDst16[i], bld.loadImm(NULL, (uint16_t)0xff)); 2096bf215546Sopenharmony_ci } 2097bf215546Sopenharmony_ci } 2098bf215546Sopenharmony_ci } 2099bf215546Sopenharmony_ci 2100bf215546Sopenharmony_ci // OR pairs of 8-bit values together (into the even value) 2101bf215546Sopenharmony_ci if (format->bits[0] == 8) { 2102bf215546Sopenharmony_ci for (i = 0; i < 2 && untypedDst16[2 * i] && untypedDst16[2 * i + 1]; i++) 2103bf215546Sopenharmony_ci bld.mkOp2(OP_OR, TYPE_U16, untypedDst16[2 * i], untypedDst16[2 * i], untypedDst16[2 * i + 1]); 2104bf215546Sopenharmony_ci } 2105bf215546Sopenharmony_ci 2106bf215546Sopenharmony_ci // We'll always want to have at least a 32-bit source register for the store 2107bf215546Sopenharmony_ci Instruction *merge = bld.mkOp(OP_MERGE, bytes < 4 ? TYPE_U32 : ty, bld.getSSA(bytes < 4 ? 4 : bytes)); 2108bf215546Sopenharmony_ci if (format->bits[0] == 32) { 2109bf215546Sopenharmony_ci for (i = 0; i < 4 && untypedDst[i]; i++) 2110bf215546Sopenharmony_ci merge->setSrc(i, untypedDst[i]); 2111bf215546Sopenharmony_ci } else if (format->bits[0] == 16) { 2112bf215546Sopenharmony_ci for (i = 0; i < 4 && untypedDst16[i]; i++) 2113bf215546Sopenharmony_ci merge->setSrc(i, untypedDst16[i]); 2114bf215546Sopenharmony_ci if (i == 1) 2115bf215546Sopenharmony_ci merge->setSrc(i, bld.getSSA(2)); 2116bf215546Sopenharmony_ci } else if (format->bits[0] == 8) { 2117bf215546Sopenharmony_ci for (i = 0; i < 2 && untypedDst16[2 * i]; i++) 2118bf215546Sopenharmony_ci merge->setSrc(i, untypedDst16[2 * i]); 2119bf215546Sopenharmony_ci if (i == 1) 2120bf215546Sopenharmony_ci merge->setSrc(i, bld.getSSA(2)); 2121bf215546Sopenharmony_ci } else { 2122bf215546Sopenharmony_ci merge->setSrc(0, untypedDst[0]); 2123bf215546Sopenharmony_ci } 2124bf215546Sopenharmony_ci 2125bf215546Sopenharmony_ci bld.mkStore(OP_STORE, ty, bld.mkSymbol(FILE_MEMORY_GLOBAL, slot, TYPE_U32, 0), coord, merge->getDef(0)); 2126bf215546Sopenharmony_ci 2127bf215546Sopenharmony_ci bld.getBB()->remove(su); 2128bf215546Sopenharmony_ci return true; 2129bf215546Sopenharmony_ci} 2130bf215546Sopenharmony_ci 2131bf215546Sopenharmony_cibool 2132bf215546Sopenharmony_ciNV50LoweringPreSSA::handlePFETCH(Instruction *i) 2133bf215546Sopenharmony_ci{ 2134bf215546Sopenharmony_ci assert(prog->getType() == Program::TYPE_GEOMETRY); 2135bf215546Sopenharmony_ci 2136bf215546Sopenharmony_ci // NOTE: cannot use getImmediate here, not in SSA form yet, move to 2137bf215546Sopenharmony_ci // later phase if that assertion ever triggers: 2138bf215546Sopenharmony_ci 2139bf215546Sopenharmony_ci ImmediateValue *imm = i->getSrc(0)->asImm(); 2140bf215546Sopenharmony_ci assert(imm); 2141bf215546Sopenharmony_ci 2142bf215546Sopenharmony_ci assert(imm->reg.data.u32 <= 127); // TODO: use address reg if that happens 2143bf215546Sopenharmony_ci 2144bf215546Sopenharmony_ci if (i->srcExists(1)) { 2145bf215546Sopenharmony_ci // indirect addressing of vertex in primitive space 2146bf215546Sopenharmony_ci 2147bf215546Sopenharmony_ci LValue *val = bld.getScratch(); 2148bf215546Sopenharmony_ci Value *ptr = bld.getSSA(2, FILE_ADDRESS); 2149bf215546Sopenharmony_ci bld.mkOp2v(OP_SHL, TYPE_U32, ptr, i->getSrc(1), bld.mkImm(2)); 2150bf215546Sopenharmony_ci bld.mkOp2v(OP_PFETCH, TYPE_U32, val, imm, ptr); 2151bf215546Sopenharmony_ci 2152bf215546Sopenharmony_ci // NOTE: PFETCH directly to an $aX only works with direct addressing 2153bf215546Sopenharmony_ci i->op = OP_SHL; 2154bf215546Sopenharmony_ci i->setSrc(0, val); 2155bf215546Sopenharmony_ci i->setSrc(1, bld.mkImm(0)); 2156bf215546Sopenharmony_ci } 2157bf215546Sopenharmony_ci 2158bf215546Sopenharmony_ci return true; 2159bf215546Sopenharmony_ci} 2160bf215546Sopenharmony_ci 2161bf215546Sopenharmony_ci// Set flags according to predicate and make the instruction read $cX. 2162bf215546Sopenharmony_civoid 2163bf215546Sopenharmony_ciNV50LoweringPreSSA::checkPredicate(Instruction *insn) 2164bf215546Sopenharmony_ci{ 2165bf215546Sopenharmony_ci Value *pred = insn->getPredicate(); 2166bf215546Sopenharmony_ci Value *cdst; 2167bf215546Sopenharmony_ci 2168bf215546Sopenharmony_ci // FILE_PREDICATE will simply be changed to FLAGS on conversion to SSA 2169bf215546Sopenharmony_ci if (!pred || 2170bf215546Sopenharmony_ci pred->reg.file == FILE_FLAGS || pred->reg.file == FILE_PREDICATE) 2171bf215546Sopenharmony_ci return; 2172bf215546Sopenharmony_ci 2173bf215546Sopenharmony_ci cdst = bld.getSSA(1, FILE_FLAGS); 2174bf215546Sopenharmony_ci 2175bf215546Sopenharmony_ci bld.mkCmp(OP_SET, CC_NEU, insn->dType, cdst, insn->dType, bld.loadImm(NULL, 0), pred); 2176bf215546Sopenharmony_ci 2177bf215546Sopenharmony_ci insn->setPredicate(insn->cc, cdst); 2178bf215546Sopenharmony_ci} 2179bf215546Sopenharmony_ci 2180bf215546Sopenharmony_ci// 2181bf215546Sopenharmony_ci// - add quadop dance for texturing 2182bf215546Sopenharmony_ci// - put FP outputs in GPRs 2183bf215546Sopenharmony_ci// - convert instruction sequences 2184bf215546Sopenharmony_ci// 2185bf215546Sopenharmony_cibool 2186bf215546Sopenharmony_ciNV50LoweringPreSSA::visit(Instruction *i) 2187bf215546Sopenharmony_ci{ 2188bf215546Sopenharmony_ci bld.setPosition(i, false); 2189bf215546Sopenharmony_ci 2190bf215546Sopenharmony_ci if (i->cc != CC_ALWAYS) 2191bf215546Sopenharmony_ci checkPredicate(i); 2192bf215546Sopenharmony_ci 2193bf215546Sopenharmony_ci switch (i->op) { 2194bf215546Sopenharmony_ci case OP_TEX: 2195bf215546Sopenharmony_ci case OP_TXF: 2196bf215546Sopenharmony_ci case OP_TXG: 2197bf215546Sopenharmony_ci return handleTEX(i->asTex()); 2198bf215546Sopenharmony_ci case OP_TXB: 2199bf215546Sopenharmony_ci return handleTXB(i->asTex()); 2200bf215546Sopenharmony_ci case OP_TXL: 2201bf215546Sopenharmony_ci return handleTXL(i->asTex()); 2202bf215546Sopenharmony_ci case OP_TXD: 2203bf215546Sopenharmony_ci return handleTXD(i->asTex()); 2204bf215546Sopenharmony_ci case OP_TXLQ: 2205bf215546Sopenharmony_ci return handleTXLQ(i->asTex()); 2206bf215546Sopenharmony_ci case OP_TXQ: 2207bf215546Sopenharmony_ci return handleTXQ(i->asTex()); 2208bf215546Sopenharmony_ci case OP_EX2: 2209bf215546Sopenharmony_ci bld.mkOp1(OP_PREEX2, TYPE_F32, i->getDef(0), i->getSrc(0)); 2210bf215546Sopenharmony_ci i->setSrc(0, i->getDef(0)); 2211bf215546Sopenharmony_ci break; 2212bf215546Sopenharmony_ci case OP_SET: 2213bf215546Sopenharmony_ci return handleSET(i); 2214bf215546Sopenharmony_ci case OP_SLCT: 2215bf215546Sopenharmony_ci return handleSLCT(i->asCmp()); 2216bf215546Sopenharmony_ci case OP_SELP: 2217bf215546Sopenharmony_ci return handleSELP(i); 2218bf215546Sopenharmony_ci case OP_POW: 2219bf215546Sopenharmony_ci return handlePOW(i); 2220bf215546Sopenharmony_ci case OP_DIV: 2221bf215546Sopenharmony_ci return handleDIV(i); 2222bf215546Sopenharmony_ci case OP_SQRT: 2223bf215546Sopenharmony_ci return handleSQRT(i); 2224bf215546Sopenharmony_ci case OP_EXPORT: 2225bf215546Sopenharmony_ci return handleEXPORT(i); 2226bf215546Sopenharmony_ci case OP_LOAD: 2227bf215546Sopenharmony_ci return handleLOAD(i); 2228bf215546Sopenharmony_ci case OP_MEMBAR: 2229bf215546Sopenharmony_ci return handleMEMBAR(i); 2230bf215546Sopenharmony_ci case OP_ATOM: 2231bf215546Sopenharmony_ci case OP_STORE: 2232bf215546Sopenharmony_ci return handleLDST(i); 2233bf215546Sopenharmony_ci case OP_SULDP: 2234bf215546Sopenharmony_ci return handleSULDP(i->asTex()); 2235bf215546Sopenharmony_ci case OP_SUSTP: 2236bf215546Sopenharmony_ci return handleSUSTP(i->asTex()); 2237bf215546Sopenharmony_ci case OP_SUREDP: 2238bf215546Sopenharmony_ci return handleSUREDP(i->asTex()); 2239bf215546Sopenharmony_ci case OP_SUQ: 2240bf215546Sopenharmony_ci return handleSUQ(i->asTex()); 2241bf215546Sopenharmony_ci case OP_BUFQ: 2242bf215546Sopenharmony_ci return handleBUFQ(i); 2243bf215546Sopenharmony_ci case OP_RDSV: 2244bf215546Sopenharmony_ci return handleRDSV(i); 2245bf215546Sopenharmony_ci case OP_WRSV: 2246bf215546Sopenharmony_ci return handleWRSV(i); 2247bf215546Sopenharmony_ci case OP_CALL: 2248bf215546Sopenharmony_ci return handleCALL(i); 2249bf215546Sopenharmony_ci case OP_PRECONT: 2250bf215546Sopenharmony_ci return handlePRECONT(i); 2251bf215546Sopenharmony_ci case OP_CONT: 2252bf215546Sopenharmony_ci return handleCONT(i); 2253bf215546Sopenharmony_ci case OP_PFETCH: 2254bf215546Sopenharmony_ci return handlePFETCH(i); 2255bf215546Sopenharmony_ci default: 2256bf215546Sopenharmony_ci break; 2257bf215546Sopenharmony_ci } 2258bf215546Sopenharmony_ci return true; 2259bf215546Sopenharmony_ci} 2260bf215546Sopenharmony_ci 2261bf215546Sopenharmony_cibool 2262bf215546Sopenharmony_ciTargetNV50::runLegalizePass(Program *prog, CGStage stage) const 2263bf215546Sopenharmony_ci{ 2264bf215546Sopenharmony_ci bool ret = false; 2265bf215546Sopenharmony_ci 2266bf215546Sopenharmony_ci if (stage == CG_STAGE_PRE_SSA) { 2267bf215546Sopenharmony_ci NV50LoweringPreSSA pass(prog); 2268bf215546Sopenharmony_ci ret = pass.run(prog, false, true); 2269bf215546Sopenharmony_ci } else 2270bf215546Sopenharmony_ci if (stage == CG_STAGE_SSA) { 2271bf215546Sopenharmony_ci if (!prog->targetPriv) 2272bf215546Sopenharmony_ci prog->targetPriv = new std::list<Instruction *>(); 2273bf215546Sopenharmony_ci NV50LegalizeSSA pass(prog); 2274bf215546Sopenharmony_ci ret = pass.run(prog, false, true); 2275bf215546Sopenharmony_ci } else 2276bf215546Sopenharmony_ci if (stage == CG_STAGE_POST_RA) { 2277bf215546Sopenharmony_ci NV50LegalizePostRA pass; 2278bf215546Sopenharmony_ci ret = pass.run(prog, false, true); 2279bf215546Sopenharmony_ci if (prog->targetPriv) 2280bf215546Sopenharmony_ci delete reinterpret_cast<std::list<Instruction *> *>(prog->targetPriv); 2281bf215546Sopenharmony_ci } 2282bf215546Sopenharmony_ci return ret; 2283bf215546Sopenharmony_ci} 2284bf215546Sopenharmony_ci 2285bf215546Sopenharmony_ci} // namespace nv50_ir 2286