1bf215546Sopenharmony_ci/*
2bf215546Sopenharmony_ci * Copyright 2011 Christoph Bumiller
3bf215546Sopenharmony_ci *
4bf215546Sopenharmony_ci * Permission is hereby granted, free of charge, to any person obtaining a
5bf215546Sopenharmony_ci * copy of this software and associated documentation files (the "Software"),
6bf215546Sopenharmony_ci * to deal in the Software without restriction, including without limitation
7bf215546Sopenharmony_ci * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8bf215546Sopenharmony_ci * and/or sell copies of the Software, and to permit persons to whom the
9bf215546Sopenharmony_ci * Software is furnished to do so, subject to the following conditions:
10bf215546Sopenharmony_ci *
11bf215546Sopenharmony_ci * The above copyright notice and this permission notice shall be included in
12bf215546Sopenharmony_ci * all copies or substantial portions of the Software.
13bf215546Sopenharmony_ci *
14bf215546Sopenharmony_ci * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
15bf215546Sopenharmony_ci * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16bf215546Sopenharmony_ci * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
17bf215546Sopenharmony_ci * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
18bf215546Sopenharmony_ci * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
19bf215546Sopenharmony_ci * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
20bf215546Sopenharmony_ci * OTHER DEALINGS IN THE SOFTWARE.
21bf215546Sopenharmony_ci */
22bf215546Sopenharmony_ci
23bf215546Sopenharmony_ci#include "nv50_ir.h"
24bf215546Sopenharmony_ci#include "nv50_ir_build_util.h"
25bf215546Sopenharmony_ci
26bf215546Sopenharmony_ci#include "nv50_ir_target_nv50.h"
27bf215546Sopenharmony_ci
// Offsets of the individual fields inside one NV50_SU_INFO record.
// NOTE(review): presumably byte offsets into a per-surface info block that
// the driver uploads -- confirm against the code that fills it in.
#define NV50_SU_INFO_SIZE_X       0x00
#define NV50_SU_INFO_SIZE_Y       0x04
#define NV50_SU_INFO_SIZE_Z       0x08
#define NV50_SU_INFO_BSIZE        0x0c
#define NV50_SU_INFO_STRIDE_Y     0x10
#define NV50_SU_INFO_MS_X         0x18
#define NV50_SU_INFO_MS_Y         0x1c
#define NV50_SU_INFO_TILE_SHIFT_X 0x20
#define NV50_SU_INFO_TILE_SHIFT_Y 0x24
#define NV50_SU_INFO_TILE_SHIFT_Z 0x28
#define NV50_SU_INFO_OFFSET_Z     0x2c

// Distance from one record to the next.
#define NV50_SU_INFO__STRIDE 0x30

// Indexed access to the per-dimension fields: i = 0 selects the X entry and
// each following entry is 4 further on.
#define NV50_SU_INFO_SIZE(i)       (NV50_SU_INFO_SIZE_X + (i) * 4)
#define NV50_SU_INFO_MS(i)         (NV50_SU_INFO_MS_X + (i) * 4)
#define NV50_SU_INFO_TILE_SHIFT(i) (NV50_SU_INFO_TILE_SHIFT_X + (i) * 4)
45bf215546Sopenharmony_ci
46bf215546Sopenharmony_cinamespace nv50_ir {
47bf215546Sopenharmony_ci
48bf215546Sopenharmony_ci// nv50 doesn't support 32 bit integer multiplication
49bf215546Sopenharmony_ci//
50bf215546Sopenharmony_ci//       ah al * bh bl = LO32: (al * bh + ah * bl) << 16 + (al * bl)
51bf215546Sopenharmony_ci// -------------------
52bf215546Sopenharmony_ci//    al*bh 00           HI32: (al * bh + ah * bl) >> 16 + (ah * bh) +
53bf215546Sopenharmony_ci// ah*bh 00 00                 (           carry1) << 16 + ( carry2)
54bf215546Sopenharmony_ci//       al*bl
55bf215546Sopenharmony_ci//    ah*bl 00
56bf215546Sopenharmony_ci//
57bf215546Sopenharmony_ci// fffe0001 + fffe0001
58bf215546Sopenharmony_ci//
59bf215546Sopenharmony_ci// Note that this sort of splitting doesn't work for signed values, so we
60bf215546Sopenharmony_ci// compute the sign on those manually and then perform an unsigned multiply.
61bf215546Sopenharmony_cistatic bool
62bf215546Sopenharmony_ciexpandIntegerMUL(BuildUtil *bld, Instruction *mul)
63bf215546Sopenharmony_ci{
64bf215546Sopenharmony_ci   const bool highResult = mul->subOp == NV50_IR_SUBOP_MUL_HIGH;
65bf215546Sopenharmony_ci   ImmediateValue src1;
66bf215546Sopenharmony_ci   bool src1imm = mul->src(1).getImmediate(src1);
67bf215546Sopenharmony_ci
68bf215546Sopenharmony_ci   DataType fTy; // full type
69bf215546Sopenharmony_ci   switch (mul->sType) {
70bf215546Sopenharmony_ci   case TYPE_S32: fTy = TYPE_U32; break;
71bf215546Sopenharmony_ci   case TYPE_S64: fTy = TYPE_U64; break;
72bf215546Sopenharmony_ci   default: fTy = mul->sType; break;
73bf215546Sopenharmony_ci   }
74bf215546Sopenharmony_ci
75bf215546Sopenharmony_ci   DataType hTy; // half type
76bf215546Sopenharmony_ci   switch (fTy) {
77bf215546Sopenharmony_ci   case TYPE_U32: hTy = TYPE_U16; break;
78bf215546Sopenharmony_ci   case TYPE_U64: hTy = TYPE_U32; break;
79bf215546Sopenharmony_ci   default:
80bf215546Sopenharmony_ci      return false;
81bf215546Sopenharmony_ci   }
82bf215546Sopenharmony_ci   unsigned int fullSize = typeSizeof(fTy);
83bf215546Sopenharmony_ci   unsigned int halfSize = typeSizeof(hTy);
84bf215546Sopenharmony_ci
85bf215546Sopenharmony_ci   Instruction *i[9];
86bf215546Sopenharmony_ci
87bf215546Sopenharmony_ci   bld->setPosition(mul, true);
88bf215546Sopenharmony_ci
89bf215546Sopenharmony_ci   Value *s[2];
90bf215546Sopenharmony_ci   Value *a[2], *b[2];
91bf215546Sopenharmony_ci   Value *t[4];
92bf215546Sopenharmony_ci   for (int j = 0; j < 4; ++j)
93bf215546Sopenharmony_ci      t[j] = bld->getSSA(fullSize);
94bf215546Sopenharmony_ci
95bf215546Sopenharmony_ci   if (isSignedType(mul->sType) && highResult) {
96bf215546Sopenharmony_ci      s[0] = bld->getSSA(fullSize);
97bf215546Sopenharmony_ci      s[1] = bld->getSSA(fullSize);
98bf215546Sopenharmony_ci      bld->mkOp1(OP_ABS, mul->sType, s[0], mul->getSrc(0));
99bf215546Sopenharmony_ci      bld->mkOp1(OP_ABS, mul->sType, s[1], mul->getSrc(1));
100bf215546Sopenharmony_ci      src1.reg.data.s32 = abs(src1.reg.data.s32);
101bf215546Sopenharmony_ci   } else {
102bf215546Sopenharmony_ci      s[0] = mul->getSrc(0);
103bf215546Sopenharmony_ci      s[1] = mul->getSrc(1);
104bf215546Sopenharmony_ci   }
105bf215546Sopenharmony_ci
106bf215546Sopenharmony_ci   // split sources into halves
107bf215546Sopenharmony_ci   i[0] = bld->mkSplit(a, halfSize, s[0]);
108bf215546Sopenharmony_ci   i[1] = bld->mkSplit(b, halfSize, s[1]);
109bf215546Sopenharmony_ci
110bf215546Sopenharmony_ci   if (src1imm && (src1.reg.data.u32 & 0xffff0000) == 0) {
111bf215546Sopenharmony_ci      i[2] = i[3] = bld->mkOp2(OP_MUL, fTy, t[1], a[1],
112bf215546Sopenharmony_ci                               bld->mkImm(src1.reg.data.u32 & 0xffff));
113bf215546Sopenharmony_ci   } else {
114bf215546Sopenharmony_ci      i[2] = bld->mkOp2(OP_MUL, fTy, t[0], a[0],
115bf215546Sopenharmony_ci                        src1imm ? bld->mkImm(src1.reg.data.u32 >> 16) : b[1]);
116bf215546Sopenharmony_ci      if (src1imm && (src1.reg.data.u32 & 0x0000ffff) == 0) {
117bf215546Sopenharmony_ci         i[3] = i[2];
118bf215546Sopenharmony_ci         t[1] = t[0];
119bf215546Sopenharmony_ci      } else {
120bf215546Sopenharmony_ci         i[3] = bld->mkOp3(OP_MAD, fTy, t[1], a[1], b[0], t[0]);
121bf215546Sopenharmony_ci      }
122bf215546Sopenharmony_ci   }
123bf215546Sopenharmony_ci   i[7] = bld->mkOp2(OP_SHL, fTy, t[2], t[1], bld->mkImm(halfSize * 8));
124bf215546Sopenharmony_ci   if (src1imm && (src1.reg.data.u32 & 0x0000ffff) == 0) {
125bf215546Sopenharmony_ci      i[4] = i[3];
126bf215546Sopenharmony_ci      t[3] = t[2];
127bf215546Sopenharmony_ci   } else {
128bf215546Sopenharmony_ci      i[4] = bld->mkOp3(OP_MAD, fTy, t[3], a[0], b[0], t[2]);
129bf215546Sopenharmony_ci   }
130bf215546Sopenharmony_ci
131bf215546Sopenharmony_ci   if (highResult) {
132bf215546Sopenharmony_ci      Value *c[2];
133bf215546Sopenharmony_ci      Value *r[5];
134bf215546Sopenharmony_ci      Value *imm = bld->loadImm(NULL, 1 << (halfSize * 8));
135bf215546Sopenharmony_ci      c[0] = bld->getSSA(1, FILE_FLAGS);
136bf215546Sopenharmony_ci      c[1] = bld->getSSA(1, FILE_FLAGS);
137bf215546Sopenharmony_ci      for (int j = 0; j < 5; ++j)
138bf215546Sopenharmony_ci         r[j] = bld->getSSA(fullSize);
139bf215546Sopenharmony_ci
140bf215546Sopenharmony_ci      i[8] = bld->mkOp2(OP_SHR, fTy, r[0], t[1], bld->mkImm(halfSize * 8));
141bf215546Sopenharmony_ci      i[6] = bld->mkOp2(OP_ADD, fTy, r[1], r[0], imm);
142bf215546Sopenharmony_ci      bld->mkMov(r[3], r[0])->setPredicate(CC_NC, c[0]);
143bf215546Sopenharmony_ci      bld->mkOp2(OP_UNION, TYPE_U32, r[2], r[1], r[3]);
144bf215546Sopenharmony_ci      i[5] = bld->mkOp3(OP_MAD, fTy, r[4], a[1], b[1], r[2]);
145bf215546Sopenharmony_ci
146bf215546Sopenharmony_ci      // set carry defs / sources
147bf215546Sopenharmony_ci      i[3]->setFlagsDef(1, c[0]);
148bf215546Sopenharmony_ci      // actual result required in negative case, but ignored for
149bf215546Sopenharmony_ci      // unsigned. for some reason the compiler ends up dropping the whole
150bf215546Sopenharmony_ci      // instruction if the destination is unused but the flags are.
151bf215546Sopenharmony_ci      if (isSignedType(mul->sType))
152bf215546Sopenharmony_ci         i[4]->setFlagsDef(1, c[1]);
153bf215546Sopenharmony_ci      else
154bf215546Sopenharmony_ci         i[4]->setFlagsDef(0, c[1]);
155bf215546Sopenharmony_ci      i[6]->setPredicate(CC_C, c[0]);
156bf215546Sopenharmony_ci      i[5]->setFlagsSrc(3, c[1]);
157bf215546Sopenharmony_ci
158bf215546Sopenharmony_ci      if (isSignedType(mul->sType)) {
159bf215546Sopenharmony_ci         Value *cc[2];
160bf215546Sopenharmony_ci         Value *rr[7];
161bf215546Sopenharmony_ci         Value *one = bld->getSSA(fullSize);
162bf215546Sopenharmony_ci         bld->loadImm(one, 1);
163bf215546Sopenharmony_ci         for (int j = 0; j < 7; j++)
164bf215546Sopenharmony_ci            rr[j] = bld->getSSA(fullSize);
165bf215546Sopenharmony_ci
166bf215546Sopenharmony_ci         // NOTE: this logic uses predicates because splitting basic blocks is
167bf215546Sopenharmony_ci         // ~impossible during the SSA phase. The RA relies on a correlation
168bf215546Sopenharmony_ci         // between edge order and phi node sources.
169bf215546Sopenharmony_ci
170bf215546Sopenharmony_ci         // Set the sign of the result based on the inputs
171bf215546Sopenharmony_ci         bld->mkOp2(OP_XOR, fTy, NULL, mul->getSrc(0), mul->getSrc(1))
172bf215546Sopenharmony_ci            ->setFlagsDef(0, (cc[0] = bld->getSSA(1, FILE_FLAGS)));
173bf215546Sopenharmony_ci
174bf215546Sopenharmony_ci         // 1s complement of 64-bit value
175bf215546Sopenharmony_ci         bld->mkOp1(OP_NOT, fTy, rr[0], r[4])
176bf215546Sopenharmony_ci            ->setPredicate(CC_S, cc[0]);
177bf215546Sopenharmony_ci         bld->mkOp1(OP_NOT, fTy, rr[1], t[3])
178bf215546Sopenharmony_ci            ->setPredicate(CC_S, cc[0]);
179bf215546Sopenharmony_ci
180bf215546Sopenharmony_ci         // add to low 32-bits, keep track of the carry
181bf215546Sopenharmony_ci         Instruction *n = bld->mkOp2(OP_ADD, fTy, NULL, rr[1], one);
182bf215546Sopenharmony_ci         n->setPredicate(CC_S, cc[0]);
183bf215546Sopenharmony_ci         n->setFlagsDef(0, (cc[1] = bld->getSSA(1, FILE_FLAGS)));
184bf215546Sopenharmony_ci
185bf215546Sopenharmony_ci         // If there was a carry, add 1 to the upper 32 bits
186bf215546Sopenharmony_ci         // XXX: These get executed even if they shouldn't be
187bf215546Sopenharmony_ci         bld->mkOp2(OP_ADD, fTy, rr[2], rr[0], one)
188bf215546Sopenharmony_ci            ->setPredicate(CC_C, cc[1]);
189bf215546Sopenharmony_ci         bld->mkMov(rr[3], rr[0])
190bf215546Sopenharmony_ci            ->setPredicate(CC_NC, cc[1]);
191bf215546Sopenharmony_ci         bld->mkOp2(OP_UNION, fTy, rr[4], rr[2], rr[3]);
192bf215546Sopenharmony_ci
193bf215546Sopenharmony_ci         // Merge the results from the negative and non-negative paths
194bf215546Sopenharmony_ci         bld->mkMov(rr[5], rr[4])
195bf215546Sopenharmony_ci            ->setPredicate(CC_S, cc[0]);
196bf215546Sopenharmony_ci         bld->mkMov(rr[6], r[4])
197bf215546Sopenharmony_ci            ->setPredicate(CC_NS, cc[0]);
198bf215546Sopenharmony_ci         bld->mkOp2(OP_UNION, mul->sType, mul->getDef(0), rr[5], rr[6]);
199bf215546Sopenharmony_ci      } else {
200bf215546Sopenharmony_ci         bld->mkMov(mul->getDef(0), r[4]);
201bf215546Sopenharmony_ci      }
202bf215546Sopenharmony_ci   } else {
203bf215546Sopenharmony_ci      bld->mkMov(mul->getDef(0), t[3]);
204bf215546Sopenharmony_ci   }
205bf215546Sopenharmony_ci   delete_Instruction(bld->getProgram(), mul);
206bf215546Sopenharmony_ci
207bf215546Sopenharmony_ci   for (int j = 2; j <= (highResult ? 5 : 4); ++j)
208bf215546Sopenharmony_ci      if (i[j])
209bf215546Sopenharmony_ci         i[j]->sType = hTy;
210bf215546Sopenharmony_ci
211bf215546Sopenharmony_ci   return true;
212bf215546Sopenharmony_ci}
213bf215546Sopenharmony_ci
// Two-bit operation selectors that QUADOP() packs together.
#define QOP_ADD  0
#define QOP_SUBR 1
#define QOP_SUB  2
#define QOP_MOV2 3

// Pack four selectors (given without their QOP_ prefix) into a single byte,
// first argument in the topmost two bits.
// Argument order:  UL UR LL LR
#define QUADOP(q, r, s, t) \
   ((QOP_##q << 6) |       \
    (QOP_##r << 4) |       \
    (QOP_##s << 2) |       \
    (QOP_##t << 0))
223bf215546Sopenharmony_ci
224bf215546Sopenharmony_ciclass NV50LegalizePostRA : public Pass
225bf215546Sopenharmony_ci{
226bf215546Sopenharmony_cipublic:
227bf215546Sopenharmony_ci   NV50LegalizePostRA() : r63(NULL) { }
228bf215546Sopenharmony_ci
229bf215546Sopenharmony_ciprivate:
230bf215546Sopenharmony_ci   virtual bool visit(Function *);
231bf215546Sopenharmony_ci   virtual bool visit(BasicBlock *);
232bf215546Sopenharmony_ci
233bf215546Sopenharmony_ci   void handlePRERET(FlowInstruction *);
234bf215546Sopenharmony_ci   void replaceZero(Instruction *);
235bf215546Sopenharmony_ci
236bf215546Sopenharmony_ci   BuildUtil bld;
237bf215546Sopenharmony_ci
238bf215546Sopenharmony_ci   LValue *r63;
239bf215546Sopenharmony_ci};
240bf215546Sopenharmony_ci
241bf215546Sopenharmony_cibool
242bf215546Sopenharmony_ciNV50LegalizePostRA::visit(Function *fn)
243bf215546Sopenharmony_ci{
244bf215546Sopenharmony_ci   Program *prog = fn->getProgram();
245bf215546Sopenharmony_ci
246bf215546Sopenharmony_ci   r63 = new_LValue(fn, FILE_GPR);
247bf215546Sopenharmony_ci   // GPR units on nv50 are in half-regs
248bf215546Sopenharmony_ci   if (prog->maxGPR < 126)
249bf215546Sopenharmony_ci      r63->reg.data.id = 63;
250bf215546Sopenharmony_ci   else
251bf215546Sopenharmony_ci      r63->reg.data.id = 127;
252bf215546Sopenharmony_ci
253bf215546Sopenharmony_ci   // this is actually per-program, but we can do it all on visiting main()
254bf215546Sopenharmony_ci   std::list<Instruction *> *outWrites =
255bf215546Sopenharmony_ci      reinterpret_cast<std::list<Instruction *> *>(prog->targetPriv);
256bf215546Sopenharmony_ci
257bf215546Sopenharmony_ci   if (outWrites) {
258bf215546Sopenharmony_ci      for (std::list<Instruction *>::iterator it = outWrites->begin();
259bf215546Sopenharmony_ci           it != outWrites->end(); ++it)
260bf215546Sopenharmony_ci         (*it)->getSrc(1)->defs.front()->getInsn()->setDef(0, (*it)->getSrc(0));
261bf215546Sopenharmony_ci      // instructions will be deleted on exit
262bf215546Sopenharmony_ci      outWrites->clear();
263bf215546Sopenharmony_ci   }
264bf215546Sopenharmony_ci
265bf215546Sopenharmony_ci   return true;
266bf215546Sopenharmony_ci}
267bf215546Sopenharmony_ci
268bf215546Sopenharmony_civoid
269bf215546Sopenharmony_ciNV50LegalizePostRA::replaceZero(Instruction *i)
270bf215546Sopenharmony_ci{
271bf215546Sopenharmony_ci   for (int s = 0; i->srcExists(s); ++s) {
272bf215546Sopenharmony_ci      ImmediateValue *imm = i->getSrc(s)->asImm();
273bf215546Sopenharmony_ci      if (imm && imm->reg.data.u64 == 0)
274bf215546Sopenharmony_ci         i->setSrc(s, r63);
275bf215546Sopenharmony_ci   }
276bf215546Sopenharmony_ci}
277bf215546Sopenharmony_ci
278bf215546Sopenharmony_ci// Emulate PRERET: jump to the target and call to the origin from there
279bf215546Sopenharmony_ci//
280bf215546Sopenharmony_ci// WARNING: atm only works if BBs are affected by at most a single PRERET
281bf215546Sopenharmony_ci//
282bf215546Sopenharmony_ci// BB:0
283bf215546Sopenharmony_ci// preret BB:3
284bf215546Sopenharmony_ci// (...)
285bf215546Sopenharmony_ci// BB:3
286bf215546Sopenharmony_ci// (...)
287bf215546Sopenharmony_ci//             --->
288bf215546Sopenharmony_ci// BB:0
289bf215546Sopenharmony_ci// bra BB:3 + n0 (directly to the call; move to beginning of BB and fixate)
290bf215546Sopenharmony_ci// (...)
291bf215546Sopenharmony_ci// BB:3
292bf215546Sopenharmony_ci// bra BB:3 + n1 (skip the call)
293bf215546Sopenharmony_ci// call BB:0 + n2 (skip bra at beginning of BB:0)
294bf215546Sopenharmony_ci// (...)
295bf215546Sopenharmony_civoid
296bf215546Sopenharmony_ciNV50LegalizePostRA::handlePRERET(FlowInstruction *pre)
297bf215546Sopenharmony_ci{
298bf215546Sopenharmony_ci   BasicBlock *bbE = pre->bb;
299bf215546Sopenharmony_ci   BasicBlock *bbT = pre->target.bb;
300bf215546Sopenharmony_ci
301bf215546Sopenharmony_ci   pre->subOp = NV50_IR_SUBOP_EMU_PRERET + 0;
302bf215546Sopenharmony_ci   bbE->remove(pre);
303bf215546Sopenharmony_ci   bbE->insertHead(pre);
304bf215546Sopenharmony_ci
305bf215546Sopenharmony_ci   Instruction *skip = new_FlowInstruction(func, OP_PRERET, bbT);
306bf215546Sopenharmony_ci   Instruction *call = new_FlowInstruction(func, OP_PRERET, bbE);
307bf215546Sopenharmony_ci
308bf215546Sopenharmony_ci   bbT->insertHead(call);
309bf215546Sopenharmony_ci   bbT->insertHead(skip);
310bf215546Sopenharmony_ci
311bf215546Sopenharmony_ci   // NOTE: maybe split blocks to prevent the instructions from moving ?
312bf215546Sopenharmony_ci
313bf215546Sopenharmony_ci   skip->subOp = NV50_IR_SUBOP_EMU_PRERET + 1;
314bf215546Sopenharmony_ci   call->subOp = NV50_IR_SUBOP_EMU_PRERET + 2;
315bf215546Sopenharmony_ci}
316bf215546Sopenharmony_ci
317bf215546Sopenharmony_cibool
318bf215546Sopenharmony_ciNV50LegalizePostRA::visit(BasicBlock *bb)
319bf215546Sopenharmony_ci{
320bf215546Sopenharmony_ci   Instruction *i, *next;
321bf215546Sopenharmony_ci
322bf215546Sopenharmony_ci   // remove pseudo operations and non-fixed no-ops, split 64 bit operations
323bf215546Sopenharmony_ci   for (i = bb->getFirst(); i; i = next) {
324bf215546Sopenharmony_ci      next = i->next;
325bf215546Sopenharmony_ci      if (i->isNop()) {
326bf215546Sopenharmony_ci         bb->remove(i);
327bf215546Sopenharmony_ci      } else
328bf215546Sopenharmony_ci      if (i->op == OP_PRERET && prog->getTarget()->getChipset() < 0xa0) {
329bf215546Sopenharmony_ci         handlePRERET(i->asFlow());
330bf215546Sopenharmony_ci      } else {
331bf215546Sopenharmony_ci         // TODO: We will want to do this before register allocation,
332bf215546Sopenharmony_ci         // since have to use a $c register for the carry flag.
333bf215546Sopenharmony_ci         if (typeSizeof(i->dType) == 8) {
334bf215546Sopenharmony_ci            Instruction *hi = BuildUtil::split64BitOpPostRA(func, i, r63, NULL);
335bf215546Sopenharmony_ci            if (hi)
336bf215546Sopenharmony_ci               next = hi;
337bf215546Sopenharmony_ci         }
338bf215546Sopenharmony_ci
339bf215546Sopenharmony_ci         if (i->op != OP_PFETCH && i->op != OP_BAR &&
340bf215546Sopenharmony_ci             (!i->defExists(0) || i->def(0).getFile() != FILE_ADDRESS))
341bf215546Sopenharmony_ci            replaceZero(i);
342bf215546Sopenharmony_ci      }
343bf215546Sopenharmony_ci   }
344bf215546Sopenharmony_ci   if (!bb->getEntry())
345bf215546Sopenharmony_ci      return true;
346bf215546Sopenharmony_ci
347bf215546Sopenharmony_ci   return true;
348bf215546Sopenharmony_ci}
349bf215546Sopenharmony_ci
350bf215546Sopenharmony_ciclass NV50LegalizeSSA : public Pass
351bf215546Sopenharmony_ci{
352bf215546Sopenharmony_cipublic:
353bf215546Sopenharmony_ci   NV50LegalizeSSA(Program *);
354bf215546Sopenharmony_ci
355bf215546Sopenharmony_ci   virtual bool visit(BasicBlock *bb);
356bf215546Sopenharmony_ci
357bf215546Sopenharmony_ciprivate:
358bf215546Sopenharmony_ci   void propagateWriteToOutput(Instruction *);
359bf215546Sopenharmony_ci   void handleDIV(Instruction *);
360bf215546Sopenharmony_ci   void handleMOD(Instruction *);
361bf215546Sopenharmony_ci   void handleMUL(Instruction *);
362bf215546Sopenharmony_ci   void handleAddrDef(Instruction *);
363bf215546Sopenharmony_ci
364bf215546Sopenharmony_ci   inline bool isARL(const Instruction *) const;
365bf215546Sopenharmony_ci
366bf215546Sopenharmony_ci   BuildUtil bld;
367bf215546Sopenharmony_ci
368bf215546Sopenharmony_ci   std::list<Instruction *> *outWrites;
369bf215546Sopenharmony_ci};
370bf215546Sopenharmony_ci
371bf215546Sopenharmony_ciNV50LegalizeSSA::NV50LegalizeSSA(Program *prog)
372bf215546Sopenharmony_ci{
373bf215546Sopenharmony_ci   bld.setProgram(prog);
374bf215546Sopenharmony_ci
375bf215546Sopenharmony_ci   if (prog->optLevel >= 2 &&
376bf215546Sopenharmony_ci       (prog->getType() == Program::TYPE_GEOMETRY ||
377bf215546Sopenharmony_ci        prog->getType() == Program::TYPE_VERTEX))
378bf215546Sopenharmony_ci      outWrites =
379bf215546Sopenharmony_ci         reinterpret_cast<std::list<Instruction *> *>(prog->targetPriv);
380bf215546Sopenharmony_ci   else
381bf215546Sopenharmony_ci      outWrites = NULL;
382bf215546Sopenharmony_ci}
383bf215546Sopenharmony_ci
384bf215546Sopenharmony_civoid
385bf215546Sopenharmony_ciNV50LegalizeSSA::propagateWriteToOutput(Instruction *st)
386bf215546Sopenharmony_ci{
387bf215546Sopenharmony_ci   if (st->src(0).isIndirect(0) || st->getSrc(1)->refCount() != 1)
388bf215546Sopenharmony_ci      return;
389bf215546Sopenharmony_ci
390bf215546Sopenharmony_ci   // check def instruction can store
391bf215546Sopenharmony_ci   Instruction *di = st->getSrc(1)->defs.front()->getInsn();
392bf215546Sopenharmony_ci
393bf215546Sopenharmony_ci   // TODO: move exports (if beneficial) in common opt pass
394bf215546Sopenharmony_ci   if (di->isPseudo() || isTextureOp(di->op) || di->defCount(0xff, true) > 1)
395bf215546Sopenharmony_ci      return;
396bf215546Sopenharmony_ci
397bf215546Sopenharmony_ci   for (int s = 0; di->srcExists(s); ++s)
398bf215546Sopenharmony_ci      if (di->src(s).getFile() == FILE_IMMEDIATE ||
399bf215546Sopenharmony_ci          di->src(s).getFile() == FILE_MEMORY_LOCAL)
400bf215546Sopenharmony_ci         return;
401bf215546Sopenharmony_ci
402bf215546Sopenharmony_ci   if (prog->getType() == Program::TYPE_GEOMETRY) {
403bf215546Sopenharmony_ci      // Only propagate output writes in geometry shaders when we can be sure
404bf215546Sopenharmony_ci      // that we are propagating to the same output vertex.
405bf215546Sopenharmony_ci      if (di->bb != st->bb)
406bf215546Sopenharmony_ci         return;
407bf215546Sopenharmony_ci      Instruction *i;
408bf215546Sopenharmony_ci      for (i = di; i != st; i = i->next) {
409bf215546Sopenharmony_ci         if (i->op == OP_EMIT || i->op == OP_RESTART)
410bf215546Sopenharmony_ci            return;
411bf215546Sopenharmony_ci      }
412bf215546Sopenharmony_ci      assert(i); // st after di
413bf215546Sopenharmony_ci   }
414bf215546Sopenharmony_ci
415bf215546Sopenharmony_ci   // We cannot set defs to non-lvalues before register allocation, so
416bf215546Sopenharmony_ci   // save & remove (to save registers) the exports and replace later.
417bf215546Sopenharmony_ci   outWrites->push_back(st);
418bf215546Sopenharmony_ci   st->bb->remove(st);
419bf215546Sopenharmony_ci}
420bf215546Sopenharmony_ci
421bf215546Sopenharmony_cibool
422bf215546Sopenharmony_ciNV50LegalizeSSA::isARL(const Instruction *i) const
423bf215546Sopenharmony_ci{
424bf215546Sopenharmony_ci   ImmediateValue imm;
425bf215546Sopenharmony_ci
426bf215546Sopenharmony_ci   if (i->op != OP_SHL || i->src(0).getFile() != FILE_GPR)
427bf215546Sopenharmony_ci      return false;
428bf215546Sopenharmony_ci   if (!i->src(1).getImmediate(imm))
429bf215546Sopenharmony_ci      return false;
430bf215546Sopenharmony_ci   return imm.isInteger(0);
431bf215546Sopenharmony_ci}
432bf215546Sopenharmony_ci
433bf215546Sopenharmony_civoid
434bf215546Sopenharmony_ciNV50LegalizeSSA::handleAddrDef(Instruction *i)
435bf215546Sopenharmony_ci{
436bf215546Sopenharmony_ci   Instruction *arl;
437bf215546Sopenharmony_ci
438bf215546Sopenharmony_ci   i->getDef(0)->reg.size = 2; // $aX are only 16 bit
439bf215546Sopenharmony_ci
440bf215546Sopenharmony_ci   // PFETCH can always write to $a
441bf215546Sopenharmony_ci   if (i->op == OP_PFETCH)
442bf215546Sopenharmony_ci      return;
443bf215546Sopenharmony_ci   // only ADDR <- SHL(GPR, IMM) and ADDR <- ADD(ADDR, IMM) are valid
444bf215546Sopenharmony_ci   if (i->srcExists(1) && i->src(1).getFile() == FILE_IMMEDIATE) {
445bf215546Sopenharmony_ci      if (i->op == OP_SHL && i->src(0).getFile() == FILE_GPR)
446bf215546Sopenharmony_ci         return;
447bf215546Sopenharmony_ci      if (i->op == OP_ADD && i->src(0).getFile() == FILE_ADDRESS)
448bf215546Sopenharmony_ci         return;
449bf215546Sopenharmony_ci   }
450bf215546Sopenharmony_ci
451bf215546Sopenharmony_ci   // turn $a sources into $r sources (can't operate on $a)
452bf215546Sopenharmony_ci   for (int s = 0; i->srcExists(s); ++s) {
453bf215546Sopenharmony_ci      Value *a = i->getSrc(s);
454bf215546Sopenharmony_ci      Value *r;
455bf215546Sopenharmony_ci      if (a->reg.file == FILE_ADDRESS) {
456bf215546Sopenharmony_ci         if (a->getInsn() && isARL(a->getInsn())) {
457bf215546Sopenharmony_ci            i->setSrc(s, a->getInsn()->getSrc(0));
458bf215546Sopenharmony_ci         } else {
459bf215546Sopenharmony_ci            bld.setPosition(i, false);
460bf215546Sopenharmony_ci            r = bld.getSSA();
461bf215546Sopenharmony_ci            bld.mkMov(r, a);
462bf215546Sopenharmony_ci            i->setSrc(s, r);
463bf215546Sopenharmony_ci         }
464bf215546Sopenharmony_ci      }
465bf215546Sopenharmony_ci   }
466bf215546Sopenharmony_ci   if (i->op == OP_SHL && i->src(1).getFile() == FILE_IMMEDIATE)
467bf215546Sopenharmony_ci      return;
468bf215546Sopenharmony_ci
469bf215546Sopenharmony_ci   // turn result back into $a
470bf215546Sopenharmony_ci   bld.setPosition(i, true);
471bf215546Sopenharmony_ci   arl = bld.mkOp2(OP_SHL, TYPE_U32, i->getDef(0), bld.getSSA(), bld.mkImm(0));
472bf215546Sopenharmony_ci   i->setDef(0, arl->getSrc(0));
473bf215546Sopenharmony_ci}
474bf215546Sopenharmony_ci
475bf215546Sopenharmony_civoid
476bf215546Sopenharmony_ciNV50LegalizeSSA::handleMUL(Instruction *mul)
477bf215546Sopenharmony_ci{
478bf215546Sopenharmony_ci   if (isFloatType(mul->sType) || typeSizeof(mul->sType) <= 2)
479bf215546Sopenharmony_ci      return;
480bf215546Sopenharmony_ci   Value *def = mul->getDef(0);
481bf215546Sopenharmony_ci   Value *pred = mul->getPredicate();
482bf215546Sopenharmony_ci   CondCode cc = mul->cc;
483bf215546Sopenharmony_ci   if (pred)
484bf215546Sopenharmony_ci      mul->setPredicate(CC_ALWAYS, NULL);
485bf215546Sopenharmony_ci
486bf215546Sopenharmony_ci   if (mul->op == OP_MAD) {
487bf215546Sopenharmony_ci      Instruction *add = mul;
488bf215546Sopenharmony_ci      bld.setPosition(add, false);
489bf215546Sopenharmony_ci      Value *res = cloneShallow(func, mul->getDef(0));
490bf215546Sopenharmony_ci      mul = bld.mkOp2(OP_MUL, add->sType, res, add->getSrc(0), add->getSrc(1));
491bf215546Sopenharmony_ci      add->op = OP_ADD;
492bf215546Sopenharmony_ci      add->setSrc(0, mul->getDef(0));
493bf215546Sopenharmony_ci      add->setSrc(1, add->getSrc(2));
494bf215546Sopenharmony_ci      for (int s = 2; add->srcExists(s); ++s)
495bf215546Sopenharmony_ci         add->setSrc(s, NULL);
496bf215546Sopenharmony_ci      mul->subOp = add->subOp;
497bf215546Sopenharmony_ci      add->subOp = 0;
498bf215546Sopenharmony_ci   }
499bf215546Sopenharmony_ci   expandIntegerMUL(&bld, mul);
500bf215546Sopenharmony_ci   if (pred)
501bf215546Sopenharmony_ci      def->getInsn()->setPredicate(cc, pred);
502bf215546Sopenharmony_ci}
503bf215546Sopenharmony_ci
504bf215546Sopenharmony_ci// Use f32 division: first compute an approximate result, use it to reduce
505bf215546Sopenharmony_ci// the dividend, which should then be representable as f32, divide the reduced
506bf215546Sopenharmony_ci// dividend, and add the quotients.
507bf215546Sopenharmony_civoid
508bf215546Sopenharmony_ciNV50LegalizeSSA::handleDIV(Instruction *div)
509bf215546Sopenharmony_ci{
510bf215546Sopenharmony_ci   const DataType ty = div->sType;
511bf215546Sopenharmony_ci
512bf215546Sopenharmony_ci   if (ty != TYPE_U32 && ty != TYPE_S32)
513bf215546Sopenharmony_ci      return;
514bf215546Sopenharmony_ci
515bf215546Sopenharmony_ci   Value *q, *q0, *qf, *aR, *aRf, *qRf, *qR, *t, *s, *m, *cond;
516bf215546Sopenharmony_ci
517bf215546Sopenharmony_ci   bld.setPosition(div, false);
518bf215546Sopenharmony_ci
519bf215546Sopenharmony_ci   Value *a, *af = bld.getSSA();
520bf215546Sopenharmony_ci   Value *b, *bf = bld.getSSA();
521bf215546Sopenharmony_ci
522bf215546Sopenharmony_ci   bld.mkCvt(OP_CVT, TYPE_F32, af, ty, div->getSrc(0));
523bf215546Sopenharmony_ci   bld.mkCvt(OP_CVT, TYPE_F32, bf, ty, div->getSrc(1));
524bf215546Sopenharmony_ci
525bf215546Sopenharmony_ci   if (isSignedType(ty)) {
526bf215546Sopenharmony_ci      af->getInsn()->src(0).mod = Modifier(NV50_IR_MOD_ABS);
527bf215546Sopenharmony_ci      bf->getInsn()->src(0).mod = Modifier(NV50_IR_MOD_ABS);
528bf215546Sopenharmony_ci      a = bld.getSSA();
529bf215546Sopenharmony_ci      b = bld.getSSA();
530bf215546Sopenharmony_ci      bld.mkOp1(OP_ABS, ty, a, div->getSrc(0));
531bf215546Sopenharmony_ci      bld.mkOp1(OP_ABS, ty, b, div->getSrc(1));
532bf215546Sopenharmony_ci   } else {
533bf215546Sopenharmony_ci      a = div->getSrc(0);
534bf215546Sopenharmony_ci      b = div->getSrc(1);
535bf215546Sopenharmony_ci   }
536bf215546Sopenharmony_ci
537bf215546Sopenharmony_ci   bf = bld.mkOp1v(OP_RCP, TYPE_F32, bld.getSSA(), bf);
538bf215546Sopenharmony_ci   bf = bld.mkOp2v(OP_ADD, TYPE_U32, bld.getSSA(), bf, bld.mkImm(-2));
539bf215546Sopenharmony_ci
540bf215546Sopenharmony_ci   bld.mkOp2(OP_MUL, TYPE_F32, (qf = bld.getSSA()), af, bf)->rnd = ROUND_Z;
541bf215546Sopenharmony_ci   bld.mkCvt(OP_CVT, ty, (q0 = bld.getSSA()), TYPE_F32, qf)->rnd = ROUND_Z;
542bf215546Sopenharmony_ci
543bf215546Sopenharmony_ci   // get error of 1st result
544bf215546Sopenharmony_ci   expandIntegerMUL(&bld,
545bf215546Sopenharmony_ci      bld.mkOp2(OP_MUL, TYPE_U32, (t = bld.getSSA()), q0, b));
546bf215546Sopenharmony_ci   bld.mkOp2(OP_SUB, TYPE_U32, (aRf = bld.getSSA()), a, t);
547bf215546Sopenharmony_ci
548bf215546Sopenharmony_ci   bld.mkCvt(OP_CVT, TYPE_F32, (aR = bld.getSSA()), TYPE_U32, aRf);
549bf215546Sopenharmony_ci
550bf215546Sopenharmony_ci   bld.mkOp2(OP_MUL, TYPE_F32, (qRf = bld.getSSA()), aR, bf)->rnd = ROUND_Z;
551bf215546Sopenharmony_ci   bld.mkCvt(OP_CVT, TYPE_U32, (qR = bld.getSSA()), TYPE_F32, qRf)
552bf215546Sopenharmony_ci      ->rnd = ROUND_Z;
553bf215546Sopenharmony_ci   bld.mkOp2(OP_ADD, ty, (q = bld.getSSA()), q0, qR); // add quotients
554bf215546Sopenharmony_ci
555bf215546Sopenharmony_ci   // correction: if modulus >= divisor, add 1
556bf215546Sopenharmony_ci   expandIntegerMUL(&bld,
557bf215546Sopenharmony_ci      bld.mkOp2(OP_MUL, TYPE_U32, (t = bld.getSSA()), q, b));
558bf215546Sopenharmony_ci   bld.mkOp2(OP_SUB, TYPE_U32, (m = bld.getSSA()), a, t);
559bf215546Sopenharmony_ci   bld.mkCmp(OP_SET, CC_GE, TYPE_U32, (s = bld.getSSA()), TYPE_U32, m, b);
560bf215546Sopenharmony_ci   if (!isSignedType(ty)) {
561bf215546Sopenharmony_ci      div->op = OP_SUB;
562bf215546Sopenharmony_ci      div->setSrc(0, q);
563bf215546Sopenharmony_ci      div->setSrc(1, s);
564bf215546Sopenharmony_ci   } else {
565bf215546Sopenharmony_ci      t = q;
566bf215546Sopenharmony_ci      bld.mkOp2(OP_SUB, TYPE_U32, (q = bld.getSSA()), t, s);
567bf215546Sopenharmony_ci      s = bld.getSSA();
568bf215546Sopenharmony_ci      t = bld.getSSA();
569bf215546Sopenharmony_ci      // fix the sign
570bf215546Sopenharmony_ci      bld.mkOp2(OP_XOR, TYPE_U32, NULL, div->getSrc(0), div->getSrc(1))
571bf215546Sopenharmony_ci         ->setFlagsDef(0, (cond = bld.getSSA(1, FILE_FLAGS)));
572bf215546Sopenharmony_ci      bld.mkOp1(OP_NEG, ty, s, q)->setPredicate(CC_S, cond);
573bf215546Sopenharmony_ci      bld.mkOp1(OP_MOV, ty, t, q)->setPredicate(CC_NS, cond);
574bf215546Sopenharmony_ci
575bf215546Sopenharmony_ci      div->op = OP_UNION;
576bf215546Sopenharmony_ci      div->setSrc(0, s);
577bf215546Sopenharmony_ci      div->setSrc(1, t);
578bf215546Sopenharmony_ci   }
579bf215546Sopenharmony_ci}
580bf215546Sopenharmony_ci
581bf215546Sopenharmony_civoid
582bf215546Sopenharmony_ciNV50LegalizeSSA::handleMOD(Instruction *mod)
583bf215546Sopenharmony_ci{
584bf215546Sopenharmony_ci   if (mod->dType != TYPE_U32 && mod->dType != TYPE_S32)
585bf215546Sopenharmony_ci      return;
586bf215546Sopenharmony_ci   bld.setPosition(mod, false);
587bf215546Sopenharmony_ci
588bf215546Sopenharmony_ci   Value *q = bld.getSSA();
589bf215546Sopenharmony_ci   Value *m = bld.getSSA();
590bf215546Sopenharmony_ci
591bf215546Sopenharmony_ci   bld.mkOp2(OP_DIV, mod->dType, q, mod->getSrc(0), mod->getSrc(1));
592bf215546Sopenharmony_ci   handleDIV(q->getInsn());
593bf215546Sopenharmony_ci
594bf215546Sopenharmony_ci   bld.setPosition(mod, false);
595bf215546Sopenharmony_ci   expandIntegerMUL(&bld, bld.mkOp2(OP_MUL, TYPE_U32, m, q, mod->getSrc(1)));
596bf215546Sopenharmony_ci
597bf215546Sopenharmony_ci   mod->op = OP_SUB;
598bf215546Sopenharmony_ci   mod->setSrc(1, m);
599bf215546Sopenharmony_ci}
600bf215546Sopenharmony_ci
601bf215546Sopenharmony_cibool
602bf215546Sopenharmony_ciNV50LegalizeSSA::visit(BasicBlock *bb)
603bf215546Sopenharmony_ci{
604bf215546Sopenharmony_ci   Instruction *insn, *next;
605bf215546Sopenharmony_ci   // skipping PHIs (don't pass them to handleAddrDef) !
606bf215546Sopenharmony_ci   for (insn = bb->getEntry(); insn; insn = next) {
607bf215546Sopenharmony_ci      next = insn->next;
608bf215546Sopenharmony_ci
609bf215546Sopenharmony_ci      if (insn->defExists(0) && insn->getDef(0)->reg.file == FILE_ADDRESS)
610bf215546Sopenharmony_ci         handleAddrDef(insn);
611bf215546Sopenharmony_ci
612bf215546Sopenharmony_ci      switch (insn->op) {
613bf215546Sopenharmony_ci      case OP_EXPORT:
614bf215546Sopenharmony_ci         if (outWrites)
615bf215546Sopenharmony_ci            propagateWriteToOutput(insn);
616bf215546Sopenharmony_ci         break;
617bf215546Sopenharmony_ci      case OP_DIV:
618bf215546Sopenharmony_ci         handleDIV(insn);
619bf215546Sopenharmony_ci         break;
620bf215546Sopenharmony_ci      case OP_MOD:
621bf215546Sopenharmony_ci         handleMOD(insn);
622bf215546Sopenharmony_ci         break;
623bf215546Sopenharmony_ci      case OP_MAD:
624bf215546Sopenharmony_ci      case OP_MUL:
625bf215546Sopenharmony_ci         handleMUL(insn);
626bf215546Sopenharmony_ci         break;
627bf215546Sopenharmony_ci      default:
628bf215546Sopenharmony_ci         break;
629bf215546Sopenharmony_ci      }
630bf215546Sopenharmony_ci   }
631bf215546Sopenharmony_ci   return true;
632bf215546Sopenharmony_ci}
633bf215546Sopenharmony_ci
634bf215546Sopenharmony_ciclass NV50LoweringPreSSA : public Pass
635bf215546Sopenharmony_ci{
636bf215546Sopenharmony_cipublic:
637bf215546Sopenharmony_ci   NV50LoweringPreSSA(Program *);
638bf215546Sopenharmony_ci
639bf215546Sopenharmony_ciprivate:
640bf215546Sopenharmony_ci   virtual bool visit(Instruction *);
641bf215546Sopenharmony_ci   virtual bool visit(Function *);
642bf215546Sopenharmony_ci
643bf215546Sopenharmony_ci   bool handleRDSV(Instruction *);
644bf215546Sopenharmony_ci   bool handleWRSV(Instruction *);
645bf215546Sopenharmony_ci
646bf215546Sopenharmony_ci   bool handlePFETCH(Instruction *);
647bf215546Sopenharmony_ci   bool handleEXPORT(Instruction *);
648bf215546Sopenharmony_ci   bool handleLOAD(Instruction *);
649bf215546Sopenharmony_ci   bool handleLDST(Instruction *);
650bf215546Sopenharmony_ci   bool handleMEMBAR(Instruction *);
651bf215546Sopenharmony_ci   bool handleSharedATOM(Instruction *);
652bf215546Sopenharmony_ci   bool handleSULDP(TexInstruction *);
653bf215546Sopenharmony_ci   bool handleSUREDP(TexInstruction *);
654bf215546Sopenharmony_ci   bool handleSUSTP(TexInstruction *);
655bf215546Sopenharmony_ci   Value *processSurfaceCoords(TexInstruction *);
656bf215546Sopenharmony_ci
657bf215546Sopenharmony_ci   bool handleDIV(Instruction *);
658bf215546Sopenharmony_ci   bool handleSQRT(Instruction *);
659bf215546Sopenharmony_ci   bool handlePOW(Instruction *);
660bf215546Sopenharmony_ci
661bf215546Sopenharmony_ci   bool handleSET(Instruction *);
662bf215546Sopenharmony_ci   bool handleSLCT(CmpInstruction *);
663bf215546Sopenharmony_ci   bool handleSELP(Instruction *);
664bf215546Sopenharmony_ci
665bf215546Sopenharmony_ci   bool handleTEX(TexInstruction *);
666bf215546Sopenharmony_ci   bool handleTXB(TexInstruction *); // I really
667bf215546Sopenharmony_ci   bool handleTXL(TexInstruction *); // hate
668bf215546Sopenharmony_ci   bool handleTXD(TexInstruction *); // these 3
669bf215546Sopenharmony_ci   bool handleTXLQ(TexInstruction *);
670bf215546Sopenharmony_ci   bool handleTXQ(TexInstruction *);
671bf215546Sopenharmony_ci   bool handleSUQ(TexInstruction *);
672bf215546Sopenharmony_ci   bool handleBUFQ(Instruction *);
673bf215546Sopenharmony_ci
674bf215546Sopenharmony_ci   bool handleCALL(Instruction *);
675bf215546Sopenharmony_ci   bool handlePRECONT(Instruction *);
676bf215546Sopenharmony_ci   bool handleCONT(Instruction *);
677bf215546Sopenharmony_ci
678bf215546Sopenharmony_ci   void checkPredicate(Instruction *);
679bf215546Sopenharmony_ci   void loadTexMsInfo(uint32_t off, Value **ms, Value **ms_x, Value **ms_y);
680bf215546Sopenharmony_ci   void loadMsInfo(Value *ms, Value *s, Value **dx, Value **dy);
681bf215546Sopenharmony_ci   Value *loadSuInfo(int slot, uint32_t off);
682bf215546Sopenharmony_ci   Value *loadSuInfo16(int slot, uint32_t off);
683bf215546Sopenharmony_ci
684bf215546Sopenharmony_ciprivate:
685bf215546Sopenharmony_ci   const Target *const targ;
686bf215546Sopenharmony_ci
687bf215546Sopenharmony_ci   BuildUtil bld;
688bf215546Sopenharmony_ci
689bf215546Sopenharmony_ci   Value *tid;
690bf215546Sopenharmony_ci};
691bf215546Sopenharmony_ci
692bf215546Sopenharmony_ciNV50LoweringPreSSA::NV50LoweringPreSSA(Program *prog) :
693bf215546Sopenharmony_ci   targ(prog->getTarget()), tid(NULL)
694bf215546Sopenharmony_ci{
695bf215546Sopenharmony_ci   bld.setProgram(prog);
696bf215546Sopenharmony_ci}
697bf215546Sopenharmony_ci
698bf215546Sopenharmony_cibool
699bf215546Sopenharmony_ciNV50LoweringPreSSA::visit(Function *f)
700bf215546Sopenharmony_ci{
701bf215546Sopenharmony_ci   BasicBlock *root = BasicBlock::get(func->cfg.getRoot());
702bf215546Sopenharmony_ci
703bf215546Sopenharmony_ci   if (prog->getType() == Program::TYPE_COMPUTE) {
704bf215546Sopenharmony_ci      // Add implicit "thread id" argument in $r0 to the function
705bf215546Sopenharmony_ci      Value *arg = new_LValue(func, FILE_GPR);
706bf215546Sopenharmony_ci      arg->reg.data.id = 0;
707bf215546Sopenharmony_ci      f->ins.push_back(arg);
708bf215546Sopenharmony_ci
709bf215546Sopenharmony_ci      bld.setPosition(root, false);
710bf215546Sopenharmony_ci      tid = bld.mkMov(bld.getScratch(), arg, TYPE_U32)->getDef(0);
711bf215546Sopenharmony_ci   }
712bf215546Sopenharmony_ci
713bf215546Sopenharmony_ci   return true;
714bf215546Sopenharmony_ci}
715bf215546Sopenharmony_ci
716bf215546Sopenharmony_civoid NV50LoweringPreSSA::loadTexMsInfo(uint32_t off, Value **ms,
717bf215546Sopenharmony_ci                                       Value **ms_x, Value **ms_y) {
718bf215546Sopenharmony_ci   // This loads the texture-indexed ms setting from the constant buffer
719bf215546Sopenharmony_ci   Value *tmp = new_LValue(func, FILE_GPR);
720bf215546Sopenharmony_ci   uint8_t b = prog->driver->io.auxCBSlot;
721bf215546Sopenharmony_ci   off += prog->driver->io.suInfoBase;
722bf215546Sopenharmony_ci   if (prog->getType() > Program::TYPE_VERTEX)
723bf215546Sopenharmony_ci      off += 16 * 2 * 4;
724bf215546Sopenharmony_ci   if (prog->getType() > Program::TYPE_GEOMETRY)
725bf215546Sopenharmony_ci      off += 16 * 2 * 4;
726bf215546Sopenharmony_ci   if (prog->getType() > Program::TYPE_FRAGMENT)
727bf215546Sopenharmony_ci      off += 16 * 2 * 4;
728bf215546Sopenharmony_ci   *ms_x = bld.mkLoadv(TYPE_U32, bld.mkSymbol(
729bf215546Sopenharmony_ci                             FILE_MEMORY_CONST, b, TYPE_U32, off + 0), NULL);
730bf215546Sopenharmony_ci   *ms_y = bld.mkLoadv(TYPE_U32, bld.mkSymbol(
731bf215546Sopenharmony_ci                             FILE_MEMORY_CONST, b, TYPE_U32, off + 4), NULL);
732bf215546Sopenharmony_ci   *ms = bld.mkOp2v(OP_ADD, TYPE_U32, tmp, *ms_x, *ms_y);
733bf215546Sopenharmony_ci}
734bf215546Sopenharmony_ci
735bf215546Sopenharmony_civoid NV50LoweringPreSSA::loadMsInfo(Value *ms, Value *s, Value **dx, Value **dy) {
736bf215546Sopenharmony_ci   // Given a MS level, and a sample id, compute the delta x/y
737bf215546Sopenharmony_ci   uint8_t b = prog->driver->io.msInfoCBSlot;
738bf215546Sopenharmony_ci   Value *off = new_LValue(func, FILE_ADDRESS), *t = new_LValue(func, FILE_GPR);
739bf215546Sopenharmony_ci
740bf215546Sopenharmony_ci   // The required information is at mslevel * 16 * 4 + sample * 8
741bf215546Sopenharmony_ci   // = (mslevel * 8 + sample) * 8
742bf215546Sopenharmony_ci   bld.mkOp2(OP_SHL,
743bf215546Sopenharmony_ci             TYPE_U32,
744bf215546Sopenharmony_ci             off,
745bf215546Sopenharmony_ci             bld.mkOp2v(OP_ADD, TYPE_U32, t,
746bf215546Sopenharmony_ci                        bld.mkOp2v(OP_SHL, TYPE_U32, t, ms, bld.mkImm(3)),
747bf215546Sopenharmony_ci                        s),
748bf215546Sopenharmony_ci             bld.mkImm(3));
749bf215546Sopenharmony_ci   *dx = bld.mkLoadv(TYPE_U32, bld.mkSymbol(
750bf215546Sopenharmony_ci                           FILE_MEMORY_CONST, b, TYPE_U32,
751bf215546Sopenharmony_ci                           prog->driver->io.msInfoBase), off);
752bf215546Sopenharmony_ci   *dy = bld.mkLoadv(TYPE_U32, bld.mkSymbol(
753bf215546Sopenharmony_ci                           FILE_MEMORY_CONST, b, TYPE_U32,
754bf215546Sopenharmony_ci                           prog->driver->io.msInfoBase + 4), off);
755bf215546Sopenharmony_ci}
756bf215546Sopenharmony_ci
757bf215546Sopenharmony_ciValue *
758bf215546Sopenharmony_ciNV50LoweringPreSSA::loadSuInfo(int slot, uint32_t off)
759bf215546Sopenharmony_ci{
760bf215546Sopenharmony_ci   uint8_t b = prog->driver->io.auxCBSlot;
761bf215546Sopenharmony_ci   off += prog->driver->io.bufInfoBase + slot * NV50_SU_INFO__STRIDE;
762bf215546Sopenharmony_ci   return bld.mkLoadv(TYPE_U32, bld.mkSymbol(
763bf215546Sopenharmony_ci                            FILE_MEMORY_CONST, b, TYPE_U32, off), NULL);
764bf215546Sopenharmony_ci}
765bf215546Sopenharmony_ci
766bf215546Sopenharmony_ciValue *
767bf215546Sopenharmony_ciNV50LoweringPreSSA::loadSuInfo16(int slot, uint32_t off)
768bf215546Sopenharmony_ci{
769bf215546Sopenharmony_ci   uint8_t b = prog->driver->io.auxCBSlot;
770bf215546Sopenharmony_ci   off += prog->driver->io.bufInfoBase + slot * NV50_SU_INFO__STRIDE;
771bf215546Sopenharmony_ci   return bld.mkLoadv(TYPE_U16, bld.mkSymbol(
772bf215546Sopenharmony_ci                            FILE_MEMORY_CONST, b, TYPE_U16, off), NULL);
773bf215546Sopenharmony_ci}
774bf215546Sopenharmony_ci
// Shared lowering for texture instructions; handleTXB/TXL/TXD/TXLQ in this
// file all call it.  It rewrites the sources of @i in place and always
// returns true.
//
// Steps, in order:
//  - cube targets (except OP_TXD): multiply the first three sources by the
//    reciprocal of the largest of their absolute values;
//  - MS targets: clear the MS flag, turn x/y into (coord << ms) + delta for
//    the requested sample and replace the sample source by an immediate 0;
//  - shadow targets with OP_TXB/OP_TXL: swap the dref and bias/lod sources;
//  - array targets: convert the layer (except for OP_TXF) from f32 to u32 and
//    clamp it to 511; cube arrays with more than 4 sources additionally go
//    through an OP_TEXPREP and are retargeted to a 2D array;
//  - texel offsets: copy the immediates into i->tex.offset[] and clear the
//    offset sources.
775bf215546Sopenharmony_cibool
776bf215546Sopenharmony_ciNV50LoweringPreSSA::handleTEX(TexInstruction *i)
777bf215546Sopenharmony_ci{
778bf215546Sopenharmony_ci   const int arg = i->tex.target.getArgCount();
   // source indices of the depth reference and of the bias/lod value before
   // the swap further down
779bf215546Sopenharmony_ci   const int dref = arg;
780bf215546Sopenharmony_ci   const int lod = i->tex.target.isShadow() ? (arg + 1) : arg;
781bf215546Sopenharmony_ci
782bf215546Sopenharmony_ci   /* Only normalize in the non-explicit derivatives case.
783bf215546Sopenharmony_ci    */
784bf215546Sopenharmony_ci   if (i->tex.target.isCube() && i->op != OP_TXD) {
785bf215546Sopenharmony_ci      Value *src[3], *val;
786bf215546Sopenharmony_ci      int c;
787bf215546Sopenharmony_ci      for (c = 0; c < 3; ++c)
788bf215546Sopenharmony_ci         src[c] = bld.mkOp1v(OP_ABS, TYPE_F32, bld.getSSA(), i->getSrc(c));
      // val = 1 / max(|x|, |y|, |z|)
789bf215546Sopenharmony_ci      val = bld.getScratch();
790bf215546Sopenharmony_ci      bld.mkOp2(OP_MAX, TYPE_F32, val, src[0], src[1]);
791bf215546Sopenharmony_ci      bld.mkOp2(OP_MAX, TYPE_F32, val, src[2], val);
792bf215546Sopenharmony_ci      bld.mkOp1(OP_RCP, TYPE_F32, val, val);
      // scale the original (signed) coordinates, not their absolute values
793bf215546Sopenharmony_ci      for (c = 0; c < 3; ++c) {
794bf215546Sopenharmony_ci         i->setSrc(c, bld.mkOp2v(OP_MUL, TYPE_F32, bld.getSSA(),
795bf215546Sopenharmony_ci                                 i->getSrc(c), val));
796bf215546Sopenharmony_ci      }
797bf215546Sopenharmony_ci   }
798bf215546Sopenharmony_ci
799bf215546Sopenharmony_ci   // handle MS, which means looking up the MS params for this texture, and
800bf215546Sopenharmony_ci   // adjusting the input coordinates to point at the right sample.
801bf215546Sopenharmony_ci   if (i->tex.target.isMS()) {
802bf215546Sopenharmony_ci      Value *x = i->getSrc(0);
803bf215546Sopenharmony_ci      Value *y = i->getSrc(1);
      // the sample index is the last regular argument
804bf215546Sopenharmony_ci      Value *s = i->getSrc(arg - 1);
805bf215546Sopenharmony_ci      Value *tx = new_LValue(func, FILE_GPR), *ty = new_LValue(func, FILE_GPR),
806bf215546Sopenharmony_ci         *ms, *ms_x, *ms_y, *dx, *dy;
807bf215546Sopenharmony_ci
808bf215546Sopenharmony_ci      i->tex.target.clearMS();
809bf215546Sopenharmony_ci
      // i->tex.r * 4 * 2: two u32 values per texture
810bf215546Sopenharmony_ci      loadTexMsInfo(i->tex.r * 4 * 2, &ms, &ms_x, &ms_y);
811bf215546Sopenharmony_ci      loadMsInfo(ms, s, &dx, &dy);
812bf215546Sopenharmony_ci
      // tx = (x << ms_x) + dx, ty = (y << ms_y) + dy
813bf215546Sopenharmony_ci      bld.mkOp2(OP_SHL, TYPE_U32, tx, x, ms_x);
814bf215546Sopenharmony_ci      bld.mkOp2(OP_SHL, TYPE_U32, ty, y, ms_y);
815bf215546Sopenharmony_ci      bld.mkOp2(OP_ADD, TYPE_U32, tx, tx, dx);
816bf215546Sopenharmony_ci      bld.mkOp2(OP_ADD, TYPE_U32, ty, ty, dy);
817bf215546Sopenharmony_ci      i->setSrc(0, tx);
818bf215546Sopenharmony_ci      i->setSrc(1, ty);
      // the sample has been folded into the coordinates, so pass 0 instead
819bf215546Sopenharmony_ci      i->setSrc(arg - 1, bld.loadImm(NULL, 0));
820bf215546Sopenharmony_ci   }
821bf215546Sopenharmony_ci
822bf215546Sopenharmony_ci   // dref comes before bias/lod
823bf215546Sopenharmony_ci   if (i->tex.target.isShadow())
824bf215546Sopenharmony_ci      if (i->op == OP_TXB || i->op == OP_TXL)
825bf215546Sopenharmony_ci         i->swapSources(dref, lod);
826bf215546Sopenharmony_ci
827bf215546Sopenharmony_ci   if (i->tex.target.isArray()) {
828bf215546Sopenharmony_ci      if (i->op != OP_TXF) {
829bf215546Sopenharmony_ci         // array index must be converted to u32, but it's already an integer
830bf215546Sopenharmony_ci         // for TXF
831bf215546Sopenharmony_ci         Value *layer = i->getSrc(arg - 1);
832bf215546Sopenharmony_ci         LValue *src = new_LValue(func, FILE_GPR);
833bf215546Sopenharmony_ci         bld.mkCvt(OP_CVT, TYPE_U32, src, TYPE_F32, layer);
         // clamp the converted layer index to at most 511
834bf215546Sopenharmony_ci         bld.mkOp2(OP_MIN, TYPE_U32, src, src, bld.loadImm(NULL, 511));
835bf215546Sopenharmony_ci         i->setSrc(arg - 1, src);
836bf215546Sopenharmony_ci      }
837bf215546Sopenharmony_ci      if (i->tex.target.isCube() && i->srcCount() > 4) {
         // acube: the first four sources of i, fed into OP_TEXPREP;
         // a2d: three fresh GPRs receiving its results (fourth def unused)
838bf215546Sopenharmony_ci         std::vector<Value *> acube, a2d;
839bf215546Sopenharmony_ci         int c;
840bf215546Sopenharmony_ci
841bf215546Sopenharmony_ci         acube.resize(4);
842bf215546Sopenharmony_ci         for (c = 0; c < 4; ++c)
843bf215546Sopenharmony_ci            acube[c] = i->getSrc(c);
844bf215546Sopenharmony_ci         a2d.resize(4);
845bf215546Sopenharmony_ci         for (c = 0; c < 3; ++c)
846bf215546Sopenharmony_ci            a2d[c] = new_LValue(func, FILE_GPR);
847bf215546Sopenharmony_ci         a2d[3] = NULL;
848bf215546Sopenharmony_ci
849bf215546Sopenharmony_ci         bld.mkTex(OP_TEXPREP, TEX_TARGET_CUBE_ARRAY, i->tex.r, i->tex.s,
850bf215546Sopenharmony_ci                   a2d, acube)->asTex()->tex.mask = 0x7;
851bf215546Sopenharmony_ci
         // sources 0..2 become the TEXPREP results; every source from index
         // 4 on moves down by one slot and the freed last slot is cleared
852bf215546Sopenharmony_ci         for (c = 0; c < 3; ++c)
853bf215546Sopenharmony_ci            i->setSrc(c, a2d[c]);
854bf215546Sopenharmony_ci         for (; i->srcExists(c + 1); ++c)
855bf215546Sopenharmony_ci            i->setSrc(c, i->getSrc(c + 1));
856bf215546Sopenharmony_ci         i->setSrc(c, NULL);
857bf215546Sopenharmony_ci         assert(c <= 4);
858bf215546Sopenharmony_ci
859bf215546Sopenharmony_ci         i->tex.target = i->tex.target.isShadow() ?
860bf215546Sopenharmony_ci            TEX_TARGET_2D_ARRAY_SHADOW : TEX_TARGET_2D_ARRAY;
861bf215546Sopenharmony_ci      }
862bf215546Sopenharmony_ci   }
863bf215546Sopenharmony_ci
864bf215546Sopenharmony_ci   // texel offsets are 3 immediate fields in the instruction,
865bf215546Sopenharmony_ci   // nv50 cannot do textureGatherOffsets
866bf215546Sopenharmony_ci   assert(i->tex.useOffsets <= 1);
867bf215546Sopenharmony_ci   if (i->tex.useOffsets) {
868bf215546Sopenharmony_ci      for (int c = 0; c < 3; ++c) {
869bf215546Sopenharmony_ci         ImmediateValue val;
         // NOTE(review): with asserts compiled out a non-immediate offset is
         // not rejected and val is used as default-constructed -- confirm
         // that callers guarantee immediates here.
870bf215546Sopenharmony_ci         if (!i->offset[0][c].getImmediate(val))
871bf215546Sopenharmony_ci            assert(!"non-immediate offset");
872bf215546Sopenharmony_ci         i->tex.offset[c] = val.reg.data.u32;
873bf215546Sopenharmony_ci         i->offset[0][c].set(NULL);
874bf215546Sopenharmony_ci      }
875bf215546Sopenharmony_ci   }
876bf215546Sopenharmony_ci
877bf215546Sopenharmony_ci   return true;
878bf215546Sopenharmony_ci}
879bf215546Sopenharmony_ci
880bf215546Sopenharmony_ci// Bias must be equal for all threads of a quad or lod calculation will fail.
881bf215546Sopenharmony_ci//
882bf215546Sopenharmony_ci// The lanes of a quad are grouped by the bit in the condition register they
883bf215546Sopenharmony_ci// have set, which is selected by differing bias values.
884bf215546Sopenharmony_ci// Move the input values for TEX into a new register set for each group and
885bf215546Sopenharmony_ci// execute TEX only for a specific group.
886bf215546Sopenharmony_ci// We always need to use 4 new registers for the inputs/outputs because the
887bf215546Sopenharmony_ci// implicitly calculated derivatives must be correct.
888bf215546Sopenharmony_ci//
889bf215546Sopenharmony_ci// TODO: move to SSA phase so we can easily determine whether bias is constant
//
// Returns true in every path.  When the bias is uniform (or the target is
// TEX_TARGET_CUBE_SHADOW, where the bias is dropped) the instruction is kept
// and only goes through handleTEX(); otherwise it is replaced by four
// predicated clones plus OP_UNIONs and the original is deleted.
890bf215546Sopenharmony_cibool
891bf215546Sopenharmony_ciNV50LoweringPreSSA::handleTXB(TexInstruction *i)
892bf215546Sopenharmony_ci{
   // cc[l]: condition code under which the clone (and result move) for
   // group l is predicated
893bf215546Sopenharmony_ci   const CondCode cc[4] = { CC_EQU, CC_S, CC_C, CC_O };
894bf215546Sopenharmony_ci   int l, d;
895bf215546Sopenharmony_ci
896bf215546Sopenharmony_ci   // We can't actually apply bias *and* do a compare for a cube
897bf215546Sopenharmony_ci   // texture. Since the compare has to be done before the filtering, just
898bf215546Sopenharmony_ci   // drop the bias on the floor.
899bf215546Sopenharmony_ci   if (i->tex.target == TEX_TARGET_CUBE_SHADOW) {
900bf215546Sopenharmony_ci      i->op = OP_TEX;
      // source 4 overwrites source 3 and the last slot is cleared
901bf215546Sopenharmony_ci      i->setSrc(3, i->getSrc(4));
902bf215546Sopenharmony_ci      i->setSrc(4, NULL);
903bf215546Sopenharmony_ci      return handleTEX(i);
904bf215546Sopenharmony_ci   }
905bf215546Sopenharmony_ci
906bf215546Sopenharmony_ci   handleTEX(i);
   // after handleTEX() the bias sits right behind the regular arguments
907bf215546Sopenharmony_ci   Value *bias = i->getSrc(i->tex.target.getArgCount());
908bf215546Sopenharmony_ci   if (bias->isUniform())
909bf215546Sopenharmony_ci      return true;
910bf215546Sopenharmony_ci
   // cond is an OP_UNION whose source 0 is the immediate 1; sources 1..3 are
   // filled in by the loop below with the per-lane bits
911bf215546Sopenharmony_ci   Instruction *cond = bld.mkOp1(OP_UNION, TYPE_U32, bld.getScratch(),
912bf215546Sopenharmony_ci                                 bld.loadImm(NULL, 1));
   // the instructions producing those bits are positioned relative to cond
   // (presumably in front of it -- confirm the meaning of the bool argument)
913bf215546Sopenharmony_ci   bld.setPosition(cond, false);
914bf215546Sopenharmony_ci
915bf215546Sopenharmony_ci   for (l = 1; l < 4; ++l) {
916bf215546Sopenharmony_ci      const uint8_t qop = QUADOP(SUBR, SUBR, SUBR, SUBR);
917bf215546Sopenharmony_ci      Value *bit = bld.getSSA();
918bf215546Sopenharmony_ci      Value *pred = bld.getScratch(1, FILE_FLAGS);
919bf215546Sopenharmony_ci      Value *imm = bld.loadImm(NULL, (1 << l));
      // quadop of the bias against itself for lane l, writing the flags
      // value pred; bit = (1 << l) is only moved where pred tests CC_EQ
920bf215546Sopenharmony_ci      bld.mkQuadop(qop, pred, l, bias, bias)->flagsDef = 0;
921bf215546Sopenharmony_ci      bld.mkMov(bit, imm)->setPredicate(CC_EQ, pred);
922bf215546Sopenharmony_ci      cond->setSrc(l, bit);
923bf215546Sopenharmony_ci   }
   // turn the union result into a flags value the clones can be predicated on
924bf215546Sopenharmony_ci   Value *flags = bld.getScratch(1, FILE_FLAGS);
925bf215546Sopenharmony_ci   bld.setPosition(cond, true);
926bf215546Sopenharmony_ci   bld.mkCvt(OP_CVT, TYPE_U8, flags, TYPE_U32, cond->getDef(0))->flagsDef = 0;
927bf215546Sopenharmony_ci
   // one clone of the texture instruction per group, each under its own cc
928bf215546Sopenharmony_ci   Instruction *tex[4];
929bf215546Sopenharmony_ci   for (l = 0; l < 4; ++l) {
930bf215546Sopenharmony_ci      (tex[l] = cloneForward(func, i))->setPredicate(cc[l], flags);
931bf215546Sopenharmony_ci      bld.insert(tex[l]);
932bf215546Sopenharmony_ci   }
933bf215546Sopenharmony_ci
   // res[l][d]: result d of group l.  Group 0 uses the defs of its clone
   // directly; groups 1..3 get shallow clones of those values, written by
   // moves predicated the same way as the corresponding texture clone.
934bf215546Sopenharmony_ci   Value *res[4][4];
935bf215546Sopenharmony_ci   for (d = 0; i->defExists(d); ++d)
936bf215546Sopenharmony_ci      res[0][d] = tex[0]->getDef(d);
937bf215546Sopenharmony_ci   for (l = 1; l < 4; ++l) {
938bf215546Sopenharmony_ci      for (d = 0; tex[l]->defExists(d); ++d) {
939bf215546Sopenharmony_ci         res[l][d] = cloneShallow(func, res[0][d]);
940bf215546Sopenharmony_ci         bld.mkMov(res[l][d], tex[l]->getDef(d))->setPredicate(cc[l], flags);
941bf215546Sopenharmony_ci      }
942bf215546Sopenharmony_ci   }
943bf215546Sopenharmony_ci
   // merge the four per-group results back into the original destinations
944bf215546Sopenharmony_ci   for (d = 0; i->defExists(d); ++d) {
945bf215546Sopenharmony_ci      Instruction *dst = bld.mkOp(OP_UNION, TYPE_U32, i->getDef(d));
946bf215546Sopenharmony_ci      for (l = 0; l < 4; ++l)
947bf215546Sopenharmony_ci         dst->setSrc(l, res[l][d]);
948bf215546Sopenharmony_ci   }
   // the original instruction has been fully replaced
949bf215546Sopenharmony_ci   delete_Instruction(prog, i);
950bf215546Sopenharmony_ci   return true;
951bf215546Sopenharmony_ci}
952bf215546Sopenharmony_ci
953bf215546Sopenharmony_ci// LOD must be equal for all threads of a quad.
954bf215546Sopenharmony_ci// Unlike with TXB, here we can just diverge since there's no LOD calculation
955bf215546Sopenharmony_ci// that would require all 4 threads' sources to be set up properly.
//
// Returns true in every path.  With a uniform lod only handleTEX() is
// applied.  Otherwise the block is split so that the texture instruction
// gets a block of its own (texiBB), a chain of up to four blocks branches to
// it (one quadop + OP_BRA per lane), and the block after the instruction
// (joinBB) starts with an OP_JOIN.
956bf215546Sopenharmony_cibool
957bf215546Sopenharmony_ciNV50LoweringPreSSA::handleTXL(TexInstruction *i)
958bf215546Sopenharmony_ci{
959bf215546Sopenharmony_ci   handleTEX(i);
   // after handleTEX() the lod sits right behind the regular arguments
960bf215546Sopenharmony_ci   Value *lod = i->getSrc(i->tex.target.getArgCount());
961bf215546Sopenharmony_ci   if (lod->isUniform())
962bf215546Sopenharmony_ci      return true;
963bf215546Sopenharmony_ci
   // currBB: the block i was in when this function was entered;
   // texiBB / joinBB: the blocks returned by the two splits around i
964bf215546Sopenharmony_ci   BasicBlock *currBB = i->bb;
965bf215546Sopenharmony_ci   BasicBlock *texiBB = i->bb->splitBefore(i, false);
966bf215546Sopenharmony_ci   BasicBlock *joinBB = i->bb->splitAfter(i);
967bf215546Sopenharmony_ci
   // record the reconvergence point at the end of the original block
968bf215546Sopenharmony_ci   bld.setPosition(currBB, true);
969bf215546Sopenharmony_ci   assert(!currBB->joinAt);
970bf215546Sopenharmony_ci   currBB->joinAt = bld.mkFlow(OP_JOINAT, joinBB, CC_ALWAYS, NULL);
971bf215546Sopenharmony_ci
972bf215546Sopenharmony_ci   for (int l = 0; l <= 3; ++l) {
973bf215546Sopenharmony_ci      const uint8_t qop = QUADOP(SUBR, SUBR, SUBR, SUBR);
974bf215546Sopenharmony_ci      Value *pred = bld.getScratch(1, FILE_FLAGS);
975bf215546Sopenharmony_ci      bld.setPosition(currBB, true);
      // quadop of the lod against itself for lane l, writing the flags
      // value pred; branch to the texture block where pred tests CC_EQ
976bf215546Sopenharmony_ci      bld.mkQuadop(qop, pred, l, lod, lod)->flagsDef = 0;
977bf215546Sopenharmony_ci      bld.mkFlow(OP_BRA, texiBB, CC_EQ, pred)->fixed = 1;
978bf215546Sopenharmony_ci      currBB->cfg.attach(&texiBB->cfg, Graph::Edge::FORWARD);
      // lanes 0..2 each continue in a fresh block that holds the test for
      // the next lane; no extra block is created after lane 3
979bf215546Sopenharmony_ci      if (l <= 2) {
980bf215546Sopenharmony_ci         BasicBlock *laneBB = new BasicBlock(func);
981bf215546Sopenharmony_ci         currBB->cfg.attach(&laneBB->cfg, Graph::Edge::TREE);
982bf215546Sopenharmony_ci         currBB = laneBB;
983bf215546Sopenharmony_ci      }
984bf215546Sopenharmony_ci   }
   // matching OP_JOIN in the block that follows the texture instruction
985bf215546Sopenharmony_ci   bld.setPosition(joinBB, false);
986bf215546Sopenharmony_ci   bld.mkFlow(OP_JOIN, NULL, CC_ALWAYS, NULL)->fixed = 1;
987bf215546Sopenharmony_ci   return true;
988bf215546Sopenharmony_ci}
989bf215546Sopenharmony_ci
// Lower a texture fetch with explicit derivatives (TXD).
//
// The instruction is turned into an OP_TEX with tex.derivAll set and is then
// issued once per quad lane between OP_QUADON and OP_QUADPOP.  For each lane
// l the coordinates of that lane are broadcast with a quadop, dPdx and dPdy
// are applied with the quadop codes from qOps[l], and the result of the
// cloned texture instruction is kept only for lane l (mov->lanes = 1 << l).
// The per-lane results are merged into the original destinations with
// OP_UNION and the original instruction is removed from its block.
// Always returns true.
990bf215546Sopenharmony_cibool
991bf215546Sopenharmony_ciNV50LoweringPreSSA::handleTXD(TexInstruction *i)
992bf215546Sopenharmony_ci{
   // qOps[l][0] is used for the dPdx pass of lane l, qOps[l][1] for the
   // dPdy pass
993bf215546Sopenharmony_ci   static const uint8_t qOps[4][2] =
994bf215546Sopenharmony_ci   {
995bf215546Sopenharmony_ci      { QUADOP(MOV2, ADD,  MOV2, ADD),  QUADOP(MOV2, MOV2, ADD,  ADD) }, // l0
996bf215546Sopenharmony_ci      { QUADOP(SUBR, MOV2, SUBR, MOV2), QUADOP(MOV2, MOV2, ADD,  ADD) }, // l1
997bf215546Sopenharmony_ci      { QUADOP(MOV2, ADD,  MOV2, ADD),  QUADOP(SUBR, SUBR, MOV2, MOV2) }, // l2
998bf215546Sopenharmony_ci      { QUADOP(SUBR, MOV2, SUBR, MOV2), QUADOP(SUBR, SUBR, MOV2, MOV2) }, // l3
999bf215546Sopenharmony_ci   };
   // def[c][l]: result component c of the texture clone issued for lane l
1000bf215546Sopenharmony_ci   Value *def[4][4];
   // crd[c]: scratch value holding coordinate c while a lane is processed
1001bf215546Sopenharmony_ci   Value *crd[3];
1002bf215546Sopenharmony_ci   Instruction *tex;
1003bf215546Sopenharmony_ci   Value *zero = bld.loadImm(bld.getSSA(), 0);
1004bf215546Sopenharmony_ci   int l, c;
   // number of coordinates: the target dimension, plus one for cube targets
1005bf215546Sopenharmony_ci   const int dim = i->tex.target.getDim() + i->tex.target.isCube();
1006bf215546Sopenharmony_ci
1007bf215546Sopenharmony_ci   handleTEX(i);
1008bf215546Sopenharmony_ci   i->op = OP_TEX; // no need to clone dPdx/dPdy later
1009bf215546Sopenharmony_ci   i->tex.derivAll = true;
1010bf215546Sopenharmony_ci
1011bf215546Sopenharmony_ci   for (c = 0; c < dim; ++c)
1012bf215546Sopenharmony_ci      crd[c] = bld.getScratch();
1013bf215546Sopenharmony_ci
1014bf215546Sopenharmony_ci   bld.mkOp(OP_QUADON, TYPE_NONE, NULL);
1015bf215546Sopenharmony_ci   for (l = 0; l < 4; ++l) {
1016bf215546Sopenharmony_ci      Value *src[3], *val;
1017bf215546Sopenharmony_ci      // mov coordinates from lane l to all lanes
1018bf215546Sopenharmony_ci      for (c = 0; c < dim; ++c)
1019bf215546Sopenharmony_ci         bld.mkQuadop(0x00, crd[c], l, i->getSrc(c), zero);
1020bf215546Sopenharmony_ci      // add dPdx from lane l to lanes dx
1021bf215546Sopenharmony_ci      for (c = 0; c < dim; ++c)
1022bf215546Sopenharmony_ci         bld.mkQuadop(qOps[l][0], crd[c], l, i->dPdx[c].get(), crd[c]);
1023bf215546Sopenharmony_ci      // add dPdy from lane l to lanes dy
1024bf215546Sopenharmony_ci      for (c = 0; c < dim; ++c)
1025bf215546Sopenharmony_ci         bld.mkQuadop(qOps[l][1], crd[c], l, i->dPdy[c].get(), crd[c]);
1026bf215546Sopenharmony_ci      // normalize cube coordinates if necessary
      // (handleTEX() skips its own cube normalization for OP_TXD, so it is
      // done here on the derivative-adjusted coordinates: each coordinate is
      // multiplied by 1 / max(|x|, |y|, |z|))
1027bf215546Sopenharmony_ci      if (i->tex.target.isCube()) {
1028bf215546Sopenharmony_ci         for (c = 0; c < 3; ++c)
1029bf215546Sopenharmony_ci            src[c] = bld.mkOp1v(OP_ABS, TYPE_F32, bld.getSSA(), crd[c]);
1030bf215546Sopenharmony_ci         val = bld.getScratch();
1031bf215546Sopenharmony_ci         bld.mkOp2(OP_MAX, TYPE_F32, val, src[0], src[1]);
1032bf215546Sopenharmony_ci         bld.mkOp2(OP_MAX, TYPE_F32, val, src[2], val);
1033bf215546Sopenharmony_ci         bld.mkOp1(OP_RCP, TYPE_F32, val, val);
1034bf215546Sopenharmony_ci         for (c = 0; c < 3; ++c)
1035bf215546Sopenharmony_ci            src[c] = bld.mkOp2v(OP_MUL, TYPE_F32, bld.getSSA(), crd[c], val);
1036bf215546Sopenharmony_ci      } else {
1037bf215546Sopenharmony_ci         for (c = 0; c < dim; ++c)
1038bf215546Sopenharmony_ci            src[c] = crd[c];
1039bf215546Sopenharmony_ci      }
1040bf215546Sopenharmony_ci      // texture
1041bf215546Sopenharmony_ci      bld.insert(tex = cloneForward(func, i));
1042bf215546Sopenharmony_ci      for (c = 0; c < dim; ++c)
1043bf215546Sopenharmony_ci         tex->setSrc(c, src[c]);
1044bf215546Sopenharmony_ci      // save results
      // (each move is marked fixed and restricted to lane l)
1045bf215546Sopenharmony_ci      for (c = 0; i->defExists(c); ++c) {
1046bf215546Sopenharmony_ci         Instruction *mov;
1047bf215546Sopenharmony_ci         def[c][l] = bld.getSSA();
1048bf215546Sopenharmony_ci         mov = bld.mkMov(def[c][l], tex->getDef(c));
1049bf215546Sopenharmony_ci         mov->fixed = 1;
1050bf215546Sopenharmony_ci         mov->lanes = 1 << l;
1051bf215546Sopenharmony_ci      }
1052bf215546Sopenharmony_ci   }
1053bf215546Sopenharmony_ci   bld.mkOp(OP_QUADPOP, TYPE_NONE, NULL);
1054bf215546Sopenharmony_ci
   // merge the four per-lane values of every component into the original
   // destination
1055bf215546Sopenharmony_ci   for (c = 0; i->defExists(c); ++c) {
1056bf215546Sopenharmony_ci      Instruction *u = bld.mkOp(OP_UNION, TYPE_U32, i->getDef(c));
1057bf215546Sopenharmony_ci      for (l = 0; l < 4; ++l)
1058bf215546Sopenharmony_ci         u->setSrc(l, def[c][l]);
1059bf215546Sopenharmony_ci   }
1060bf215546Sopenharmony_ci
   // the original instruction is only unlinked from its block here
1061bf215546Sopenharmony_ci   i->bb->remove(i);
1062bf215546Sopenharmony_ci   return true;
1063bf215546Sopenharmony_ci}
1064bf215546Sopenharmony_ci
1065bf215546Sopenharmony_cibool
1066bf215546Sopenharmony_ciNV50LoweringPreSSA::handleTXLQ(TexInstruction *i)
1067bf215546Sopenharmony_ci{
1068bf215546Sopenharmony_ci   handleTEX(i);
1069bf215546Sopenharmony_ci   bld.setPosition(i, true);
1070bf215546Sopenharmony_ci
1071bf215546Sopenharmony_ci   /* The returned values are not quite what we want:
1072bf215546Sopenharmony_ci    * (a) convert from s32 to f32
1073bf215546Sopenharmony_ci    * (b) multiply by 1/256
1074bf215546Sopenharmony_ci    */
1075bf215546Sopenharmony_ci   for (int def = 0; def < 2; ++def) {
1076bf215546Sopenharmony_ci      if (!i->defExists(def))
1077bf215546Sopenharmony_ci         continue;
1078bf215546Sopenharmony_ci      bld.mkCvt(OP_CVT, TYPE_F32, i->getDef(def), TYPE_S32, i->getDef(def));
1079bf215546Sopenharmony_ci      bld.mkOp2(OP_MUL, TYPE_F32, i->getDef(def),
1080bf215546Sopenharmony_ci                i->getDef(def), bld.loadImm(NULL, 1.0f / 256));
1081bf215546Sopenharmony_ci   }
1082bf215546Sopenharmony_ci   return true;
1083bf215546Sopenharmony_ci}
1084bf215546Sopenharmony_ci
1085bf215546Sopenharmony_cibool
1086bf215546Sopenharmony_ciNV50LoweringPreSSA::handleTXQ(TexInstruction *i)
1087bf215546Sopenharmony_ci{
1088bf215546Sopenharmony_ci   Value *ms, *ms_x, *ms_y;
1089bf215546Sopenharmony_ci   if (i->tex.query == TXQ_DIMS) {
1090bf215546Sopenharmony_ci      if (i->tex.target.isMS()) {
1091bf215546Sopenharmony_ci         bld.setPosition(i, true);
1092bf215546Sopenharmony_ci         loadTexMsInfo(i->tex.r * 4 * 2, &ms, &ms_x, &ms_y);
1093bf215546Sopenharmony_ci         int d = 0;
1094bf215546Sopenharmony_ci         if (i->tex.mask & 1) {
1095bf215546Sopenharmony_ci            bld.mkOp2(OP_SHR, TYPE_U32, i->getDef(d), i->getDef(d), ms_x);
1096bf215546Sopenharmony_ci            d++;
1097bf215546Sopenharmony_ci         }
1098bf215546Sopenharmony_ci         if (i->tex.mask & 2) {
1099bf215546Sopenharmony_ci            bld.mkOp2(OP_SHR, TYPE_U32, i->getDef(d), i->getDef(d), ms_y);
1100bf215546Sopenharmony_ci            d++;
1101bf215546Sopenharmony_ci         }
1102bf215546Sopenharmony_ci      }
1103bf215546Sopenharmony_ci      return true;
1104bf215546Sopenharmony_ci   }
1105bf215546Sopenharmony_ci   assert(i->tex.query == TXQ_TYPE);
1106bf215546Sopenharmony_ci   assert(i->tex.mask == 4);
1107bf215546Sopenharmony_ci
1108bf215546Sopenharmony_ci   loadTexMsInfo(i->tex.r * 4 * 2, &ms, &ms_x, &ms_y);
1109bf215546Sopenharmony_ci   bld.mkOp2(OP_SHL, TYPE_U32, i->getDef(0), bld.loadImm(NULL, 1), ms);
1110bf215546Sopenharmony_ci   i->bb->remove(i);
1111bf215546Sopenharmony_ci
1112bf215546Sopenharmony_ci   return true;
1113bf215546Sopenharmony_ci}
1114bf215546Sopenharmony_ci
1115bf215546Sopenharmony_cibool
1116bf215546Sopenharmony_ciNV50LoweringPreSSA::handleSUQ(TexInstruction *suq)
1117bf215546Sopenharmony_ci{
1118bf215546Sopenharmony_ci   const int dim = suq->tex.target.getDim();
1119bf215546Sopenharmony_ci   const int arg = dim + (suq->tex.target.isArray() || suq->tex.target.isCube());
1120bf215546Sopenharmony_ci   int mask = suq->tex.mask;
1121bf215546Sopenharmony_ci   int slot = suq->tex.r;
1122bf215546Sopenharmony_ci   int c, d;
1123bf215546Sopenharmony_ci
1124bf215546Sopenharmony_ci   for (c = 0, d = 0; c < 3; ++c, mask >>= 1) {
1125bf215546Sopenharmony_ci      if (c >= arg || !(mask & 1))
1126bf215546Sopenharmony_ci         continue;
1127bf215546Sopenharmony_ci
1128bf215546Sopenharmony_ci      int offset;
1129bf215546Sopenharmony_ci
1130bf215546Sopenharmony_ci      if (c == 1 && suq->tex.target == TEX_TARGET_1D_ARRAY) {
1131bf215546Sopenharmony_ci         offset = NV50_SU_INFO_SIZE(2);
1132bf215546Sopenharmony_ci      } else {
1133bf215546Sopenharmony_ci         offset = NV50_SU_INFO_SIZE(c);
1134bf215546Sopenharmony_ci      }
1135bf215546Sopenharmony_ci      bld.mkMov(suq->getDef(d++), loadSuInfo(slot, offset));
1136bf215546Sopenharmony_ci      if (c == 2 && suq->tex.target.isCube())
1137bf215546Sopenharmony_ci         bld.mkOp2(OP_DIV, TYPE_U32, suq->getDef(d - 1), suq->getDef(d - 1),
1138bf215546Sopenharmony_ci                   bld.loadImm(NULL, 6));
1139bf215546Sopenharmony_ci   }
1140bf215546Sopenharmony_ci
1141bf215546Sopenharmony_ci   if (mask & 1) {
1142bf215546Sopenharmony_ci      if (suq->tex.target.isMS()) {
1143bf215546Sopenharmony_ci         Value *ms_x = loadSuInfo(slot, NV50_SU_INFO_MS(0));
1144bf215546Sopenharmony_ci         Value *ms_y = loadSuInfo(slot, NV50_SU_INFO_MS(1));
1145bf215546Sopenharmony_ci         Value *ms = bld.mkOp2v(OP_ADD, TYPE_U32, bld.getScratch(), ms_x, ms_y);
1146bf215546Sopenharmony_ci         bld.mkOp2(OP_SHL, TYPE_U32, suq->getDef(d++), bld.loadImm(NULL, 1), ms);
1147bf215546Sopenharmony_ci      } else {
1148bf215546Sopenharmony_ci         bld.mkMov(suq->getDef(d++), bld.loadImm(NULL, 1));
1149bf215546Sopenharmony_ci      }
1150bf215546Sopenharmony_ci   }
1151bf215546Sopenharmony_ci
1152bf215546Sopenharmony_ci   bld.remove(suq);
1153bf215546Sopenharmony_ci   return true;
1154bf215546Sopenharmony_ci}
1155bf215546Sopenharmony_ci
1156bf215546Sopenharmony_cibool
1157bf215546Sopenharmony_ciNV50LoweringPreSSA::handleBUFQ(Instruction *bufq)
1158bf215546Sopenharmony_ci{
1159bf215546Sopenharmony_ci   bufq->op = OP_MOV;
1160bf215546Sopenharmony_ci   bufq->setSrc(0, loadSuInfo(bufq->getSrc(0)->reg.fileIndex, NV50_SU_INFO_SIZE_X));
1161bf215546Sopenharmony_ci   bufq->setIndirect(0, 0, NULL);
1162bf215546Sopenharmony_ci   bufq->setIndirect(0, 1, NULL);
1163bf215546Sopenharmony_ci   return true;
1164bf215546Sopenharmony_ci}
1165bf215546Sopenharmony_ci
1166bf215546Sopenharmony_cibool
1167bf215546Sopenharmony_ciNV50LoweringPreSSA::handleSET(Instruction *i)
1168bf215546Sopenharmony_ci{
1169bf215546Sopenharmony_ci   if (i->dType == TYPE_F32) {
1170bf215546Sopenharmony_ci      bld.setPosition(i, true);
1171bf215546Sopenharmony_ci      i->dType = TYPE_U32;
1172bf215546Sopenharmony_ci      bld.mkOp1(OP_ABS, TYPE_S32, i->getDef(0), i->getDef(0));
1173bf215546Sopenharmony_ci      bld.mkCvt(OP_CVT, TYPE_F32, i->getDef(0), TYPE_S32, i->getDef(0));
1174bf215546Sopenharmony_ci   }
1175bf215546Sopenharmony_ci   return true;
1176bf215546Sopenharmony_ci}
1177bf215546Sopenharmony_ci
1178bf215546Sopenharmony_cibool
1179bf215546Sopenharmony_ciNV50LoweringPreSSA::handleSLCT(CmpInstruction *i)
1180bf215546Sopenharmony_ci{
1181bf215546Sopenharmony_ci   Value *src0 = bld.getSSA();
1182bf215546Sopenharmony_ci   Value *src1 = bld.getSSA();
1183bf215546Sopenharmony_ci   Value *pred = bld.getScratch(1, FILE_FLAGS);
1184bf215546Sopenharmony_ci
1185bf215546Sopenharmony_ci   Value *v0 = i->getSrc(0);
1186bf215546Sopenharmony_ci   Value *v1 = i->getSrc(1);
1187bf215546Sopenharmony_ci   // XXX: these probably shouldn't be immediates in the first place ...
1188bf215546Sopenharmony_ci   if (v0->asImm())
1189bf215546Sopenharmony_ci      v0 = bld.mkMov(bld.getSSA(), v0)->getDef(0);
1190bf215546Sopenharmony_ci   if (v1->asImm())
1191bf215546Sopenharmony_ci      v1 = bld.mkMov(bld.getSSA(), v1)->getDef(0);
1192bf215546Sopenharmony_ci
1193bf215546Sopenharmony_ci   bld.setPosition(i, true);
1194bf215546Sopenharmony_ci   bld.mkMov(src0, v0)->setPredicate(CC_NE, pred);
1195bf215546Sopenharmony_ci   bld.mkMov(src1, v1)->setPredicate(CC_EQ, pred);
1196bf215546Sopenharmony_ci   bld.mkOp2(OP_UNION, i->dType, i->getDef(0), src0, src1);
1197bf215546Sopenharmony_ci
1198bf215546Sopenharmony_ci   bld.setPosition(i, false);
1199bf215546Sopenharmony_ci   i->op = OP_SET;
1200bf215546Sopenharmony_ci   i->setFlagsDef(0, pred);
1201bf215546Sopenharmony_ci   i->dType = TYPE_U8;
1202bf215546Sopenharmony_ci   i->setSrc(0, i->getSrc(2));
1203bf215546Sopenharmony_ci   i->setSrc(2, NULL);
1204bf215546Sopenharmony_ci   i->setSrc(1, bld.loadImm(NULL, 0));
1205bf215546Sopenharmony_ci
1206bf215546Sopenharmony_ci   return true;
1207bf215546Sopenharmony_ci}
1208bf215546Sopenharmony_ci
1209bf215546Sopenharmony_cibool
1210bf215546Sopenharmony_ciNV50LoweringPreSSA::handleSELP(Instruction *i)
1211bf215546Sopenharmony_ci{
1212bf215546Sopenharmony_ci   Value *src0 = bld.getSSA();
1213bf215546Sopenharmony_ci   Value *src1 = bld.getSSA();
1214bf215546Sopenharmony_ci
1215bf215546Sopenharmony_ci   Value *v0 = i->getSrc(0);
1216bf215546Sopenharmony_ci   Value *v1 = i->getSrc(1);
1217bf215546Sopenharmony_ci   if (v0->asImm())
1218bf215546Sopenharmony_ci      v0 = bld.mkMov(bld.getSSA(), v0)->getDef(0);
1219bf215546Sopenharmony_ci   if (v1->asImm())
1220bf215546Sopenharmony_ci      v1 = bld.mkMov(bld.getSSA(), v1)->getDef(0);
1221bf215546Sopenharmony_ci
1222bf215546Sopenharmony_ci   bld.mkMov(src0, v0)->setPredicate(CC_NE, i->getSrc(2));
1223bf215546Sopenharmony_ci   bld.mkMov(src1, v1)->setPredicate(CC_EQ, i->getSrc(2));
1224bf215546Sopenharmony_ci   bld.mkOp2(OP_UNION, i->dType, i->getDef(0), src0, src1);
1225bf215546Sopenharmony_ci   delete_Instruction(prog, i);
1226bf215546Sopenharmony_ci   return true;
1227bf215546Sopenharmony_ci}
1228bf215546Sopenharmony_ci
1229bf215546Sopenharmony_cibool
1230bf215546Sopenharmony_ciNV50LoweringPreSSA::handleWRSV(Instruction *i)
1231bf215546Sopenharmony_ci{
1232bf215546Sopenharmony_ci   Symbol *sym = i->getSrc(0)->asSym();
1233bf215546Sopenharmony_ci
1234bf215546Sopenharmony_ci   // these are all shader outputs, $sreg are not writeable
1235bf215546Sopenharmony_ci   uint32_t addr = targ->getSVAddress(FILE_SHADER_OUTPUT, sym);
1236bf215546Sopenharmony_ci   if (addr >= 0x400)
1237bf215546Sopenharmony_ci      return false;
1238bf215546Sopenharmony_ci   sym = bld.mkSymbol(FILE_SHADER_OUTPUT, 0, i->sType, addr);
1239bf215546Sopenharmony_ci
1240bf215546Sopenharmony_ci   bld.mkStore(OP_EXPORT, i->dType, sym, i->getIndirect(0, 0), i->getSrc(1));
1241bf215546Sopenharmony_ci
1242bf215546Sopenharmony_ci   bld.getBB()->remove(i);
1243bf215546Sopenharmony_ci   return true;
1244bf215546Sopenharmony_ci}
1245bf215546Sopenharmony_ci
1246bf215546Sopenharmony_cibool
1247bf215546Sopenharmony_ciNV50LoweringPreSSA::handleCALL(Instruction *i)
1248bf215546Sopenharmony_ci{
1249bf215546Sopenharmony_ci   if (prog->getType() == Program::TYPE_COMPUTE) {
1250bf215546Sopenharmony_ci      // Add implicit "thread id" argument in $r0 to the function
1251bf215546Sopenharmony_ci      i->setSrc(i->srcCount(), tid);
1252bf215546Sopenharmony_ci   }
1253bf215546Sopenharmony_ci   return true;
1254bf215546Sopenharmony_ci}
1255bf215546Sopenharmony_ci
1256bf215546Sopenharmony_cibool
1257bf215546Sopenharmony_ciNV50LoweringPreSSA::handlePRECONT(Instruction *i)
1258bf215546Sopenharmony_ci{
1259bf215546Sopenharmony_ci   delete_Instruction(prog, i);
1260bf215546Sopenharmony_ci   return true;
1261bf215546Sopenharmony_ci}
1262bf215546Sopenharmony_ci
1263bf215546Sopenharmony_cibool
1264bf215546Sopenharmony_ciNV50LoweringPreSSA::handleCONT(Instruction *i)
1265bf215546Sopenharmony_ci{
1266bf215546Sopenharmony_ci   i->op = OP_BRA;
1267bf215546Sopenharmony_ci   return true;
1268bf215546Sopenharmony_ci}
1269bf215546Sopenharmony_ci
1270bf215546Sopenharmony_cibool
1271bf215546Sopenharmony_ciNV50LoweringPreSSA::handleRDSV(Instruction *i)
1272bf215546Sopenharmony_ci{
1273bf215546Sopenharmony_ci   Symbol *sym = i->getSrc(0)->asSym();
1274bf215546Sopenharmony_ci   uint32_t addr = targ->getSVAddress(FILE_SHADER_INPUT, sym);
1275bf215546Sopenharmony_ci   Value *def = i->getDef(0);
1276bf215546Sopenharmony_ci   SVSemantic sv = sym->reg.data.sv.sv;
1277bf215546Sopenharmony_ci   int idx = sym->reg.data.sv.index;
1278bf215546Sopenharmony_ci
1279bf215546Sopenharmony_ci   if (addr >= 0x400) // mov $sreg
1280bf215546Sopenharmony_ci      return true;
1281bf215546Sopenharmony_ci
1282bf215546Sopenharmony_ci   switch (sv) {
1283bf215546Sopenharmony_ci   case SV_POSITION:
1284bf215546Sopenharmony_ci      assert(prog->getType() == Program::TYPE_FRAGMENT);
1285bf215546Sopenharmony_ci      bld.mkInterp(NV50_IR_INTERP_LINEAR, i->getDef(0), addr, NULL);
1286bf215546Sopenharmony_ci      break;
1287bf215546Sopenharmony_ci   case SV_FACE:
1288bf215546Sopenharmony_ci      bld.mkInterp(NV50_IR_INTERP_FLAT, def, addr, NULL);
1289bf215546Sopenharmony_ci      if (i->dType == TYPE_F32) {
1290bf215546Sopenharmony_ci         bld.mkOp2(OP_OR, TYPE_U32, def, def, bld.mkImm(0x00000001));
1291bf215546Sopenharmony_ci         bld.mkOp1(OP_NEG, TYPE_S32, def, def);
1292bf215546Sopenharmony_ci         bld.mkCvt(OP_CVT, TYPE_F32, def, TYPE_S32, def);
1293bf215546Sopenharmony_ci      }
1294bf215546Sopenharmony_ci      break;
1295bf215546Sopenharmony_ci   case SV_NCTAID:
1296bf215546Sopenharmony_ci   case SV_CTAID:
1297bf215546Sopenharmony_ci   case SV_NTID: {
1298bf215546Sopenharmony_ci      Value *x = bld.getSSA(2);
1299bf215546Sopenharmony_ci      bld.mkOp1(OP_LOAD, TYPE_U16, x,
1300bf215546Sopenharmony_ci                bld.mkSymbol(FILE_MEMORY_SHARED, 0, TYPE_U16, addr));
1301bf215546Sopenharmony_ci      bld.mkCvt(OP_CVT, TYPE_U32, def, TYPE_U16, x);
1302bf215546Sopenharmony_ci      break;
1303bf215546Sopenharmony_ci   }
1304bf215546Sopenharmony_ci   case SV_TID:
1305bf215546Sopenharmony_ci      if (idx == 0) {
1306bf215546Sopenharmony_ci         bld.mkOp2(OP_AND, TYPE_U32, def, tid, bld.mkImm(0x0000ffff));
1307bf215546Sopenharmony_ci      } else if (idx == 1) {
1308bf215546Sopenharmony_ci         bld.mkOp2(OP_AND, TYPE_U32, def, tid, bld.mkImm(0x03ff0000));
1309bf215546Sopenharmony_ci         bld.mkOp2(OP_SHR, TYPE_U32, def, def, bld.mkImm(16));
1310bf215546Sopenharmony_ci      } else if (idx == 2) {
1311bf215546Sopenharmony_ci         bld.mkOp2(OP_SHR, TYPE_U32, def, tid, bld.mkImm(26));
1312bf215546Sopenharmony_ci      } else {
1313bf215546Sopenharmony_ci         bld.mkMov(def, bld.mkImm(0));
1314bf215546Sopenharmony_ci      }
1315bf215546Sopenharmony_ci      break;
1316bf215546Sopenharmony_ci   case SV_COMBINED_TID:
1317bf215546Sopenharmony_ci      bld.mkMov(def, tid);
1318bf215546Sopenharmony_ci      break;
1319bf215546Sopenharmony_ci   case SV_SAMPLE_POS: {
1320bf215546Sopenharmony_ci      Value *off = new_LValue(func, FILE_ADDRESS);
1321bf215546Sopenharmony_ci      bld.mkOp1(OP_RDSV, TYPE_U32, def, bld.mkSysVal(SV_SAMPLE_INDEX, 0));
1322bf215546Sopenharmony_ci      bld.mkOp2(OP_SHL, TYPE_U32, off, def, bld.mkImm(3));
1323bf215546Sopenharmony_ci      bld.mkLoad(TYPE_F32,
1324bf215546Sopenharmony_ci                 def,
1325bf215546Sopenharmony_ci                 bld.mkSymbol(
1326bf215546Sopenharmony_ci                       FILE_MEMORY_CONST, prog->driver->io.auxCBSlot,
1327bf215546Sopenharmony_ci                       TYPE_U32, prog->driver->io.sampleInfoBase + 4 * idx),
1328bf215546Sopenharmony_ci                 off);
1329bf215546Sopenharmony_ci      break;
1330bf215546Sopenharmony_ci   }
1331bf215546Sopenharmony_ci   case SV_THREAD_KILL:
1332bf215546Sopenharmony_ci      // Not actually supported. But it's implementation-dependent, so we can
1333bf215546Sopenharmony_ci      // always just say it's not a helper.
1334bf215546Sopenharmony_ci      bld.mkMov(def, bld.loadImm(NULL, 0));
1335bf215546Sopenharmony_ci      break;
1336bf215546Sopenharmony_ci   default:
1337bf215546Sopenharmony_ci      bld.mkFetch(i->getDef(0), i->dType,
1338bf215546Sopenharmony_ci                  FILE_SHADER_INPUT, addr, i->getIndirect(0, 0), NULL);
1339bf215546Sopenharmony_ci      break;
1340bf215546Sopenharmony_ci   }
1341bf215546Sopenharmony_ci   bld.getBB()->remove(i);
1342bf215546Sopenharmony_ci   return true;
1343bf215546Sopenharmony_ci}
1344bf215546Sopenharmony_ci
1345bf215546Sopenharmony_cibool
1346bf215546Sopenharmony_ciNV50LoweringPreSSA::handleDIV(Instruction *i)
1347bf215546Sopenharmony_ci{
1348bf215546Sopenharmony_ci   if (!isFloatType(i->dType))
1349bf215546Sopenharmony_ci      return true;
1350bf215546Sopenharmony_ci   bld.setPosition(i, false);
1351bf215546Sopenharmony_ci   Instruction *rcp = bld.mkOp1(OP_RCP, i->dType, bld.getSSA(), i->getSrc(1));
1352bf215546Sopenharmony_ci   i->op = OP_MUL;
1353bf215546Sopenharmony_ci   i->setSrc(1, rcp->getDef(0));
1354bf215546Sopenharmony_ci   return true;
1355bf215546Sopenharmony_ci}
1356bf215546Sopenharmony_ci
1357bf215546Sopenharmony_cibool
1358bf215546Sopenharmony_ciNV50LoweringPreSSA::handleSQRT(Instruction *i)
1359bf215546Sopenharmony_ci{
1360bf215546Sopenharmony_ci   bld.setPosition(i, true);
1361bf215546Sopenharmony_ci   i->op = OP_RSQ;
1362bf215546Sopenharmony_ci   bld.mkOp1(OP_RCP, i->dType, i->getDef(0), i->getDef(0));
1363bf215546Sopenharmony_ci
1364bf215546Sopenharmony_ci   return true;
1365bf215546Sopenharmony_ci}
1366bf215546Sopenharmony_ci
1367bf215546Sopenharmony_cibool
1368bf215546Sopenharmony_ciNV50LoweringPreSSA::handlePOW(Instruction *i)
1369bf215546Sopenharmony_ci{
1370bf215546Sopenharmony_ci   LValue *val = bld.getScratch();
1371bf215546Sopenharmony_ci
1372bf215546Sopenharmony_ci   bld.mkOp1(OP_LG2, TYPE_F32, val, i->getSrc(0));
1373bf215546Sopenharmony_ci   bld.mkOp2(OP_MUL, TYPE_F32, val, i->getSrc(1), val)->dnz = 1;
1374bf215546Sopenharmony_ci   bld.mkOp1(OP_PREEX2, TYPE_F32, val, val);
1375bf215546Sopenharmony_ci
1376bf215546Sopenharmony_ci   i->op = OP_EX2;
1377bf215546Sopenharmony_ci   i->setSrc(0, val);
1378bf215546Sopenharmony_ci   i->setSrc(1, NULL);
1379bf215546Sopenharmony_ci
1380bf215546Sopenharmony_ci   return true;
1381bf215546Sopenharmony_ci}
1382bf215546Sopenharmony_ci
1383bf215546Sopenharmony_cibool
1384bf215546Sopenharmony_ciNV50LoweringPreSSA::handleEXPORT(Instruction *i)
1385bf215546Sopenharmony_ci{
1386bf215546Sopenharmony_ci   if (prog->getType() == Program::TYPE_FRAGMENT) {
1387bf215546Sopenharmony_ci      if (i->getIndirect(0, 0)) {
1388bf215546Sopenharmony_ci         // TODO: redirect to l[] here, load to GPRs at exit
1389bf215546Sopenharmony_ci         return false;
1390bf215546Sopenharmony_ci      } else {
1391bf215546Sopenharmony_ci         int id = i->getSrc(0)->reg.data.offset / 4; // in 32 bit reg units
1392bf215546Sopenharmony_ci
1393bf215546Sopenharmony_ci         i->op = OP_MOV;
1394bf215546Sopenharmony_ci         i->subOp = NV50_IR_SUBOP_MOV_FINAL;
1395bf215546Sopenharmony_ci         i->src(0).set(i->src(1));
1396bf215546Sopenharmony_ci         i->setSrc(1, NULL);
1397bf215546Sopenharmony_ci         i->setDef(0, new_LValue(func, FILE_GPR));
1398bf215546Sopenharmony_ci         i->getDef(0)->reg.data.id = id;
1399bf215546Sopenharmony_ci
1400bf215546Sopenharmony_ci         prog->maxGPR = MAX2(prog->maxGPR, id * 2);
1401bf215546Sopenharmony_ci      }
1402bf215546Sopenharmony_ci   }
1403bf215546Sopenharmony_ci   return true;
1404bf215546Sopenharmony_ci}
1405bf215546Sopenharmony_ci
1406bf215546Sopenharmony_ci// Handle indirect addressing in geometry shaders:
1407bf215546Sopenharmony_ci//
1408bf215546Sopenharmony_ci// ld $r0 a[$a1][$a2+k] ->
1409bf215546Sopenharmony_ci// ld $r0 a[($a1 + $a2 * $vstride) + k], where k *= $vstride is implicit
1410bf215546Sopenharmony_ci//
1411bf215546Sopenharmony_cibool
1412bf215546Sopenharmony_ciNV50LoweringPreSSA::handleLOAD(Instruction *i)
1413bf215546Sopenharmony_ci{
1414bf215546Sopenharmony_ci   ValueRef src = i->src(0);
1415bf215546Sopenharmony_ci   Symbol *sym = i->getSrc(0)->asSym();
1416bf215546Sopenharmony_ci
1417bf215546Sopenharmony_ci   if (prog->getType() == Program::TYPE_COMPUTE) {
1418bf215546Sopenharmony_ci      if (sym->inFile(FILE_MEMORY_SHARED) ||
1419bf215546Sopenharmony_ci          sym->inFile(FILE_MEMORY_BUFFER) ||
1420bf215546Sopenharmony_ci          sym->inFile(FILE_MEMORY_GLOBAL)) {
1421bf215546Sopenharmony_ci         return handleLDST(i);
1422bf215546Sopenharmony_ci      }
1423bf215546Sopenharmony_ci   }
1424bf215546Sopenharmony_ci
1425bf215546Sopenharmony_ci   if (src.isIndirect(1)) {
1426bf215546Sopenharmony_ci      assert(prog->getType() == Program::TYPE_GEOMETRY);
1427bf215546Sopenharmony_ci      Value *addr = i->getIndirect(0, 1);
1428bf215546Sopenharmony_ci
1429bf215546Sopenharmony_ci      if (src.isIndirect(0)) {
1430bf215546Sopenharmony_ci         // base address is in an address register, so move to a GPR
1431bf215546Sopenharmony_ci         Value *base = bld.getScratch();
1432bf215546Sopenharmony_ci         bld.mkMov(base, addr);
1433bf215546Sopenharmony_ci
1434bf215546Sopenharmony_ci         Symbol *sv = bld.mkSysVal(SV_VERTEX_STRIDE, 0);
1435bf215546Sopenharmony_ci         Value *vstride = bld.mkOp1v(OP_RDSV, TYPE_U32, bld.getSSA(), sv);
1436bf215546Sopenharmony_ci         Value *attrib = bld.mkOp2v(OP_SHL, TYPE_U32, bld.getSSA(),
1437bf215546Sopenharmony_ci                                    i->getIndirect(0, 0), bld.mkImm(2));
1438bf215546Sopenharmony_ci
1439bf215546Sopenharmony_ci         // Calculate final address: addr = base + attr*vstride; use 16-bit
1440bf215546Sopenharmony_ci         // multiplication since 32-bit would be lowered to multiple
1441bf215546Sopenharmony_ci         // instructions, and we only need the low 16 bits of the result
1442bf215546Sopenharmony_ci         Value *a[2], *b[2];
1443bf215546Sopenharmony_ci         bld.mkSplit(a, 2, attrib);
1444bf215546Sopenharmony_ci         bld.mkSplit(b, 2, vstride);
1445bf215546Sopenharmony_ci         Value *sum = bld.mkOp3v(OP_MAD, TYPE_U16, bld.getSSA(), a[0], b[0],
1446bf215546Sopenharmony_ci                                 base);
1447bf215546Sopenharmony_ci
1448bf215546Sopenharmony_ci         // move address from GPR into an address register
1449bf215546Sopenharmony_ci         addr = bld.getSSA(2, FILE_ADDRESS);
1450bf215546Sopenharmony_ci         bld.mkMov(addr, sum);
1451bf215546Sopenharmony_ci      }
1452bf215546Sopenharmony_ci
1453bf215546Sopenharmony_ci      i->setIndirect(0, 1, NULL);
1454bf215546Sopenharmony_ci      i->setIndirect(0, 0, addr);
1455bf215546Sopenharmony_ci   }
1456bf215546Sopenharmony_ci
1457bf215546Sopenharmony_ci   return true;
1458bf215546Sopenharmony_ci}
1459bf215546Sopenharmony_ci
1460bf215546Sopenharmony_cibool
1461bf215546Sopenharmony_ciNV50LoweringPreSSA::handleSharedATOM(Instruction *atom)
1462bf215546Sopenharmony_ci{
1463bf215546Sopenharmony_ci   assert(atom->src(0).getFile() == FILE_MEMORY_SHARED);
1464bf215546Sopenharmony_ci
1465bf215546Sopenharmony_ci   BasicBlock *currBB = atom->bb;
1466bf215546Sopenharmony_ci   BasicBlock *tryLockBB = atom->bb->splitBefore(atom, false);
1467bf215546Sopenharmony_ci   BasicBlock *joinBB = atom->bb->splitAfter(atom);
1468bf215546Sopenharmony_ci   BasicBlock *setAndUnlockBB = new BasicBlock(func);
1469bf215546Sopenharmony_ci   BasicBlock *failLockBB = new BasicBlock(func);
1470bf215546Sopenharmony_ci
1471bf215546Sopenharmony_ci   bld.setPosition(currBB, true);
1472bf215546Sopenharmony_ci   assert(!currBB->joinAt);
1473bf215546Sopenharmony_ci   currBB->joinAt = bld.mkFlow(OP_JOINAT, joinBB, CC_ALWAYS, NULL);
1474bf215546Sopenharmony_ci
1475bf215546Sopenharmony_ci   bld.mkFlow(OP_BRA, tryLockBB, CC_ALWAYS, NULL);
1476bf215546Sopenharmony_ci   currBB->cfg.attach(&tryLockBB->cfg, Graph::Edge::TREE);
1477bf215546Sopenharmony_ci
1478bf215546Sopenharmony_ci   bld.setPosition(tryLockBB, true);
1479bf215546Sopenharmony_ci
1480bf215546Sopenharmony_ci   Instruction *ld =
1481bf215546Sopenharmony_ci      bld.mkLoad(TYPE_U32, atom->getDef(0), atom->getSrc(0)->asSym(),
1482bf215546Sopenharmony_ci                 atom->getIndirect(0, 0));
1483bf215546Sopenharmony_ci   Value *locked = bld.getSSA(1, FILE_FLAGS);
1484bf215546Sopenharmony_ci   if (prog->getTarget()->getChipset() >= 0xa0) {
1485bf215546Sopenharmony_ci      ld->setFlagsDef(1, locked);
1486bf215546Sopenharmony_ci      ld->subOp = NV50_IR_SUBOP_LOAD_LOCKED;
1487bf215546Sopenharmony_ci   } else {
1488bf215546Sopenharmony_ci      bld.mkMov(locked, bld.loadImm(NULL, 2))
1489bf215546Sopenharmony_ci         ->flagsDef = 0;
1490bf215546Sopenharmony_ci   }
1491bf215546Sopenharmony_ci
1492bf215546Sopenharmony_ci   bld.mkFlow(OP_BRA, setAndUnlockBB, CC_LT, locked);
1493bf215546Sopenharmony_ci   bld.mkFlow(OP_BRA, failLockBB, CC_ALWAYS, NULL);
1494bf215546Sopenharmony_ci   tryLockBB->cfg.attach(&failLockBB->cfg, Graph::Edge::CROSS);
1495bf215546Sopenharmony_ci   tryLockBB->cfg.attach(&setAndUnlockBB->cfg, Graph::Edge::TREE);
1496bf215546Sopenharmony_ci
1497bf215546Sopenharmony_ci   tryLockBB->cfg.detach(&joinBB->cfg);
1498bf215546Sopenharmony_ci   bld.remove(atom);
1499bf215546Sopenharmony_ci
1500bf215546Sopenharmony_ci   bld.setPosition(setAndUnlockBB, true);
1501bf215546Sopenharmony_ci   Value *stVal;
1502bf215546Sopenharmony_ci   if (atom->subOp == NV50_IR_SUBOP_ATOM_EXCH) {
1503bf215546Sopenharmony_ci      // Read the old value, and write the new one.
1504bf215546Sopenharmony_ci      stVal = atom->getSrc(1);
1505bf215546Sopenharmony_ci   } else if (atom->subOp == NV50_IR_SUBOP_ATOM_CAS) {
1506bf215546Sopenharmony_ci      CmpInstruction *set =
1507bf215546Sopenharmony_ci         bld.mkCmp(OP_SET, CC_EQ, TYPE_U32, bld.getSSA(1, FILE_FLAGS),
1508bf215546Sopenharmony_ci                   TYPE_U32, ld->getDef(0), atom->getSrc(1));
1509bf215546Sopenharmony_ci
1510bf215546Sopenharmony_ci      Instruction *selp =
1511bf215546Sopenharmony_ci         bld.mkOp3(OP_SELP, TYPE_U32, bld.getSSA(), atom->getSrc(2),
1512bf215546Sopenharmony_ci                   ld->getDef(0), set->getDef(0));
1513bf215546Sopenharmony_ci      stVal = selp->getDef(0);
1514bf215546Sopenharmony_ci
1515bf215546Sopenharmony_ci      handleSELP(selp);
1516bf215546Sopenharmony_ci   } else {
1517bf215546Sopenharmony_ci      operation op;
1518bf215546Sopenharmony_ci
1519bf215546Sopenharmony_ci      switch (atom->subOp) {
1520bf215546Sopenharmony_ci      case NV50_IR_SUBOP_ATOM_ADD:
1521bf215546Sopenharmony_ci         op = OP_ADD;
1522bf215546Sopenharmony_ci         break;
1523bf215546Sopenharmony_ci      case NV50_IR_SUBOP_ATOM_AND:
1524bf215546Sopenharmony_ci         op = OP_AND;
1525bf215546Sopenharmony_ci         break;
1526bf215546Sopenharmony_ci      case NV50_IR_SUBOP_ATOM_OR:
1527bf215546Sopenharmony_ci         op = OP_OR;
1528bf215546Sopenharmony_ci         break;
1529bf215546Sopenharmony_ci      case NV50_IR_SUBOP_ATOM_XOR:
1530bf215546Sopenharmony_ci         op = OP_XOR;
1531bf215546Sopenharmony_ci         break;
1532bf215546Sopenharmony_ci      case NV50_IR_SUBOP_ATOM_MIN:
1533bf215546Sopenharmony_ci         op = OP_MIN;
1534bf215546Sopenharmony_ci         break;
1535bf215546Sopenharmony_ci      case NV50_IR_SUBOP_ATOM_MAX:
1536bf215546Sopenharmony_ci         op = OP_MAX;
1537bf215546Sopenharmony_ci         break;
1538bf215546Sopenharmony_ci      default:
1539bf215546Sopenharmony_ci         assert(0);
1540bf215546Sopenharmony_ci         return false;
1541bf215546Sopenharmony_ci      }
1542bf215546Sopenharmony_ci
1543bf215546Sopenharmony_ci      Instruction *i =
1544bf215546Sopenharmony_ci         bld.mkOp2(op, atom->dType, bld.getSSA(), ld->getDef(0),
1545bf215546Sopenharmony_ci                   atom->getSrc(1));
1546bf215546Sopenharmony_ci
1547bf215546Sopenharmony_ci      stVal = i->getDef(0);
1548bf215546Sopenharmony_ci   }
1549bf215546Sopenharmony_ci
1550bf215546Sopenharmony_ci   Instruction *store = bld.mkStore(OP_STORE, TYPE_U32, atom->getSrc(0)->asSym(),
1551bf215546Sopenharmony_ci               atom->getIndirect(0, 0), stVal);
1552bf215546Sopenharmony_ci   if (prog->getTarget()->getChipset() >= 0xa0) {
1553bf215546Sopenharmony_ci      store->subOp = NV50_IR_SUBOP_STORE_UNLOCKED;
1554bf215546Sopenharmony_ci   }
1555bf215546Sopenharmony_ci
1556bf215546Sopenharmony_ci   bld.mkFlow(OP_BRA, failLockBB, CC_ALWAYS, NULL);
1557bf215546Sopenharmony_ci   setAndUnlockBB->cfg.attach(&failLockBB->cfg, Graph::Edge::TREE);
1558bf215546Sopenharmony_ci
1559bf215546Sopenharmony_ci   // Loop until the lock is acquired.
1560bf215546Sopenharmony_ci   bld.setPosition(failLockBB, true);
1561bf215546Sopenharmony_ci   bld.mkFlow(OP_BRA, tryLockBB, CC_GEU, locked);
1562bf215546Sopenharmony_ci   bld.mkFlow(OP_BRA, joinBB, CC_ALWAYS, NULL);
1563bf215546Sopenharmony_ci   failLockBB->cfg.attach(&tryLockBB->cfg, Graph::Edge::BACK);
1564bf215546Sopenharmony_ci   failLockBB->cfg.attach(&joinBB->cfg, Graph::Edge::TREE);
1565bf215546Sopenharmony_ci
1566bf215546Sopenharmony_ci   bld.setPosition(joinBB, false);
1567bf215546Sopenharmony_ci   bld.mkFlow(OP_JOIN, NULL, CC_ALWAYS, NULL)->fixed = 1;
1568bf215546Sopenharmony_ci
1569bf215546Sopenharmony_ci   return true;
1570bf215546Sopenharmony_ci}
1571bf215546Sopenharmony_ci
1572bf215546Sopenharmony_cibool
1573bf215546Sopenharmony_ciNV50LoweringPreSSA::handleLDST(Instruction *i)
1574bf215546Sopenharmony_ci{
1575bf215546Sopenharmony_ci   ValueRef src = i->src(0);
1576bf215546Sopenharmony_ci   Symbol *sym = i->getSrc(0)->asSym();
1577bf215546Sopenharmony_ci
1578bf215546Sopenharmony_ci   if (prog->getType() != Program::TYPE_COMPUTE) {
1579bf215546Sopenharmony_ci      return true;
1580bf215546Sopenharmony_ci   }
1581bf215546Sopenharmony_ci
1582bf215546Sopenharmony_ci   // Buffers just map directly to the different global memory spaces
1583bf215546Sopenharmony_ci   if (sym->inFile(FILE_MEMORY_BUFFER)) {
1584bf215546Sopenharmony_ci      sym->reg.file = FILE_MEMORY_GLOBAL;
1585bf215546Sopenharmony_ci   }
1586bf215546Sopenharmony_ci
1587bf215546Sopenharmony_ci   if (sym->inFile(FILE_MEMORY_SHARED)) {
1588bf215546Sopenharmony_ci
1589bf215546Sopenharmony_ci      if (src.isIndirect(0)) {
1590bf215546Sopenharmony_ci         Value *addr = i->getIndirect(0, 0);
1591bf215546Sopenharmony_ci
1592bf215546Sopenharmony_ci         if (!addr->inFile(FILE_ADDRESS)) {
1593bf215546Sopenharmony_ci            // Move address from GPR into an address register
1594bf215546Sopenharmony_ci            Value *new_addr = bld.getSSA(2, FILE_ADDRESS);
1595bf215546Sopenharmony_ci            bld.mkMov(new_addr, addr);
1596bf215546Sopenharmony_ci
1597bf215546Sopenharmony_ci            i->setIndirect(0, 0, new_addr);
1598bf215546Sopenharmony_ci         }
1599bf215546Sopenharmony_ci      }
1600bf215546Sopenharmony_ci
1601bf215546Sopenharmony_ci      if (i->op == OP_ATOM)
1602bf215546Sopenharmony_ci         handleSharedATOM(i);
1603bf215546Sopenharmony_ci   } else if (sym->inFile(FILE_MEMORY_GLOBAL)) {
1604bf215546Sopenharmony_ci      // All global access must be indirect. There are no instruction forms
1605bf215546Sopenharmony_ci      // with direct access.
1606bf215546Sopenharmony_ci      Value *addr = i->getIndirect(0, 0);
1607bf215546Sopenharmony_ci
1608bf215546Sopenharmony_ci      Value *offset = bld.loadImm(bld.getSSA(), sym->reg.data.offset);
1609bf215546Sopenharmony_ci      Value *sum;
1610bf215546Sopenharmony_ci      if (addr != NULL)
1611bf215546Sopenharmony_ci         sum = bld.mkOp2v(OP_ADD, TYPE_U32, bld.getSSA(), addr,
1612bf215546Sopenharmony_ci                          offset);
1613bf215546Sopenharmony_ci      else
1614bf215546Sopenharmony_ci         sum = offset;
1615bf215546Sopenharmony_ci
1616bf215546Sopenharmony_ci      i->setIndirect(0, 0, sum);
1617bf215546Sopenharmony_ci      sym->reg.data.offset = 0;
1618bf215546Sopenharmony_ci   }
1619bf215546Sopenharmony_ci
1620bf215546Sopenharmony_ci   return true;
1621bf215546Sopenharmony_ci}
1622bf215546Sopenharmony_ci
1623bf215546Sopenharmony_cibool
1624bf215546Sopenharmony_ciNV50LoweringPreSSA::handleMEMBAR(Instruction *i)
1625bf215546Sopenharmony_ci{
1626bf215546Sopenharmony_ci   // For global memory, apparently doing a bunch of reads at different
1627bf215546Sopenharmony_ci   // addresses forces things to get sufficiently flushed.
1628bf215546Sopenharmony_ci   if (i->subOp & NV50_IR_SUBOP_MEMBAR_GL) {
1629bf215546Sopenharmony_ci      uint8_t b = prog->driver->io.auxCBSlot;
1630bf215546Sopenharmony_ci      Value *base =
1631bf215546Sopenharmony_ci         bld.mkLoadv(TYPE_U32, bld.mkSymbol(FILE_MEMORY_CONST, b, TYPE_U32,
1632bf215546Sopenharmony_ci                                            prog->driver->io.membarOffset), NULL);
1633bf215546Sopenharmony_ci      Value *physid = bld.mkOp1v(OP_RDSV, TYPE_U32, bld.getSSA(), bld.mkSysVal(SV_PHYSID, 0));
1634bf215546Sopenharmony_ci      Value *off = bld.mkOp2v(OP_SHL, TYPE_U32, bld.getSSA(),
1635bf215546Sopenharmony_ci                              bld.mkOp2v(OP_AND, TYPE_U32, bld.getSSA(),
1636bf215546Sopenharmony_ci                                         physid, bld.loadImm(NULL, 0x1f)),
1637bf215546Sopenharmony_ci                              bld.loadImm(NULL, 2));
1638bf215546Sopenharmony_ci      base = bld.mkOp2v(OP_ADD, TYPE_U32, bld.getSSA(), base, off);
1639bf215546Sopenharmony_ci      Symbol *gmemMembar = bld.mkSymbol(FILE_MEMORY_GLOBAL, prog->driver->io.gmemMembar, TYPE_U32, 0);
1640bf215546Sopenharmony_ci      for (int i = 0; i < 8; i++) {
1641bf215546Sopenharmony_ci         if (i != 0) {
1642bf215546Sopenharmony_ci            base = bld.mkOp2v(OP_ADD, TYPE_U32, bld.getSSA(), base, bld.loadImm(NULL, 0x100));
1643bf215546Sopenharmony_ci         }
1644bf215546Sopenharmony_ci         bld.mkLoad(TYPE_U32, bld.getSSA(), gmemMembar, base)
1645bf215546Sopenharmony_ci            ->fixed = 1;
1646bf215546Sopenharmony_ci      }
1647bf215546Sopenharmony_ci   }
1648bf215546Sopenharmony_ci
1649bf215546Sopenharmony_ci   // Both global and shared memory barriers also need a regular control bar
1650bf215546Sopenharmony_ci   // TODO: double-check this is the case
1651bf215546Sopenharmony_ci   i->op = OP_BAR;
1652bf215546Sopenharmony_ci   i->subOp = NV50_IR_SUBOP_BAR_SYNC;
1653bf215546Sopenharmony_ci   i->setSrc(0, bld.mkImm(0u));
1654bf215546Sopenharmony_ci   i->setSrc(1, bld.mkImm(0u));
1655bf215546Sopenharmony_ci
1656bf215546Sopenharmony_ci   return true;
1657bf215546Sopenharmony_ci}
1658bf215546Sopenharmony_ci
1659bf215546Sopenharmony_ci// The type that bests represents how each component can be stored when packed.
1660bf215546Sopenharmony_cistatic DataType
1661bf215546Sopenharmony_cigetPackedType(const TexInstruction::ImgFormatDesc *t, int c)
1662bf215546Sopenharmony_ci{
1663bf215546Sopenharmony_ci   switch (t->type) {
1664bf215546Sopenharmony_ci   case FLOAT: return t->bits[c] == 16 ? TYPE_F16 : TYPE_F32;
1665bf215546Sopenharmony_ci   case UNORM: return t->bits[c] == 8 ? TYPE_U8 : TYPE_U16;
1666bf215546Sopenharmony_ci   case SNORM: return t->bits[c] == 8 ? TYPE_S8 : TYPE_S16;
1667bf215546Sopenharmony_ci   case UINT:
1668bf215546Sopenharmony_ci      return (t->bits[c] == 8 ? TYPE_U8 :
1669bf215546Sopenharmony_ci              (t->bits[c] <= 16 ? TYPE_U16 : TYPE_U32));
1670bf215546Sopenharmony_ci   case SINT:
1671bf215546Sopenharmony_ci      return (t->bits[c] == 8 ? TYPE_S8 :
1672bf215546Sopenharmony_ci              (t->bits[c] <= 16 ? TYPE_S16 : TYPE_S32));
1673bf215546Sopenharmony_ci   }
1674bf215546Sopenharmony_ci   return TYPE_NONE;
1675bf215546Sopenharmony_ci}
1676bf215546Sopenharmony_ci
1677bf215546Sopenharmony_ci// The type that the rest of the shader expects to process this image type in.
1678bf215546Sopenharmony_cistatic DataType
1679bf215546Sopenharmony_cigetShaderType(const ImgType type) {
1680bf215546Sopenharmony_ci   switch (type) {
1681bf215546Sopenharmony_ci   case FLOAT:
1682bf215546Sopenharmony_ci   case UNORM:
1683bf215546Sopenharmony_ci   case SNORM:
1684bf215546Sopenharmony_ci      return TYPE_F32;
1685bf215546Sopenharmony_ci   case UINT:
1686bf215546Sopenharmony_ci      return TYPE_U32;
1687bf215546Sopenharmony_ci   case SINT:
1688bf215546Sopenharmony_ci      return TYPE_S32;
1689bf215546Sopenharmony_ci   default:
1690bf215546Sopenharmony_ci      assert(!"Impossible type");
1691bf215546Sopenharmony_ci      return TYPE_NONE;
1692bf215546Sopenharmony_ci   }
1693bf215546Sopenharmony_ci}
1694bf215546Sopenharmony_ci
1695bf215546Sopenharmony_ci// Reads the raw coordinates out of the input instruction, and returns a
1696bf215546Sopenharmony_ci// single-value coordinate which is what the hardware expects to receive in a
1697bf215546Sopenharmony_ci// ld/st op.
1698bf215546Sopenharmony_ciValue *
1699bf215546Sopenharmony_ciNV50LoweringPreSSA::processSurfaceCoords(TexInstruction *su)
1700bf215546Sopenharmony_ci{
1701bf215546Sopenharmony_ci   const int slot = su->tex.r;
1702bf215546Sopenharmony_ci   const int dim = su->tex.target.getDim();
1703bf215546Sopenharmony_ci   const int arg = dim + (su->tex.target.isArray() || su->tex.target.isCube());
1704bf215546Sopenharmony_ci
1705bf215546Sopenharmony_ci   const TexInstruction::ImgFormatDesc *format = su->tex.format;
1706bf215546Sopenharmony_ci   const uint16_t bytes = (format->bits[0] + format->bits[1] +
1707bf215546Sopenharmony_ci                           format->bits[2] + format->bits[3]) / 8;
1708bf215546Sopenharmony_ci   uint16_t shift = ffs(bytes) - 1;
1709bf215546Sopenharmony_ci
1710bf215546Sopenharmony_ci   // Buffer sizes don't necessarily fit in 16-bit values
1711bf215546Sopenharmony_ci   if (su->tex.target == TEX_TARGET_BUFFER) {
1712bf215546Sopenharmony_ci      return bld.mkOp2v(OP_SHL, TYPE_U32, bld.getSSA(),
1713bf215546Sopenharmony_ci                        su->getSrc(0), bld.loadImm(NULL, (uint32_t)shift));
1714bf215546Sopenharmony_ci   }
1715bf215546Sopenharmony_ci
1716bf215546Sopenharmony_ci   // For buffers, we just need the byte offset. And for 2d buffers we want
1717bf215546Sopenharmony_ci   // the x coordinate in bytes as well.
1718bf215546Sopenharmony_ci   Value *coords[3] = {};
1719bf215546Sopenharmony_ci   for (int i = 0; i < arg; i++) {
1720bf215546Sopenharmony_ci      Value *src[2];
1721bf215546Sopenharmony_ci      bld.mkSplit(src, 2, su->getSrc(i));
1722bf215546Sopenharmony_ci      coords[i] = src[0];
1723bf215546Sopenharmony_ci      // For 1d-images, we want the y coord to be 0, which it will be here.
1724bf215546Sopenharmony_ci      if (i == 0)
1725bf215546Sopenharmony_ci         coords[1] = src[1];
1726bf215546Sopenharmony_ci   }
1727bf215546Sopenharmony_ci
1728bf215546Sopenharmony_ci   coords[0] = bld.mkOp2v(OP_SHL, TYPE_U16, bld.getSSA(2),
1729bf215546Sopenharmony_ci                          coords[0], bld.loadImm(NULL, shift));
1730bf215546Sopenharmony_ci
1731bf215546Sopenharmony_ci   if (su->tex.target.isMS()) {
1732bf215546Sopenharmony_ci      Value *ms_x = loadSuInfo16(slot, NV50_SU_INFO_MS(0));
1733bf215546Sopenharmony_ci      Value *ms_y = loadSuInfo16(slot, NV50_SU_INFO_MS(1));
1734bf215546Sopenharmony_ci      coords[0] = bld.mkOp2v(OP_SHL, TYPE_U16, bld.getSSA(2), coords[0], ms_x);
1735bf215546Sopenharmony_ci      coords[1] = bld.mkOp2v(OP_SHL, TYPE_U16, bld.getSSA(2), coords[1], ms_y);
1736bf215546Sopenharmony_ci   }
1737bf215546Sopenharmony_ci
1738bf215546Sopenharmony_ci   // If there are more dimensions, we just want the y-offset. But that needs
1739bf215546Sopenharmony_ci   // to be adjusted up by the y-stride for array images.
1740bf215546Sopenharmony_ci   if (su->tex.target.isArray() || su->tex.target.isCube()) {
1741bf215546Sopenharmony_ci      Value *index = coords[dim];
1742bf215546Sopenharmony_ci      Value *height = loadSuInfo16(slot, NV50_SU_INFO_STRIDE_Y);
1743bf215546Sopenharmony_ci      Instruction *mul = bld.mkOp2(OP_MUL, TYPE_U32, bld.getSSA(4), index, height);
1744bf215546Sopenharmony_ci      mul->sType = TYPE_U16;
1745bf215546Sopenharmony_ci      Value *muls[2];
1746bf215546Sopenharmony_ci      bld.mkSplit(muls, 2, mul->getDef(0));
1747bf215546Sopenharmony_ci      if (dim > 1)
1748bf215546Sopenharmony_ci         coords[1] = bld.mkOp2v(OP_ADD, TYPE_U16, bld.getSSA(2), coords[1], muls[0]);
1749bf215546Sopenharmony_ci      else
1750bf215546Sopenharmony_ci         coords[1] = muls[0];
1751bf215546Sopenharmony_ci   }
1752bf215546Sopenharmony_ci
1753bf215546Sopenharmony_ci   // 3d is special-cased. Note that a single "slice" of a 3d image may
1754bf215546Sopenharmony_ci   // also be attached as 2d, so we have to do the same 3d processing for
1755bf215546Sopenharmony_ci   // 2d as well, just in case. In order to remap a 3d image onto a 2d
1756bf215546Sopenharmony_ci   // image, we have to retile it "by hand".
1757bf215546Sopenharmony_ci   if (su->tex.target == TEX_TARGET_3D || su->tex.target == TEX_TARGET_2D) {
1758bf215546Sopenharmony_ci      Value *z = loadSuInfo16(slot, NV50_SU_INFO_OFFSET_Z);
1759bf215546Sopenharmony_ci      Value *y_size_aligned = loadSuInfo16(slot, NV50_SU_INFO_STRIDE_Y);
1760bf215546Sopenharmony_ci      // Add the z coordinate for actual 3d-images
1761bf215546Sopenharmony_ci      if (dim > 2)
1762bf215546Sopenharmony_ci         coords[2] = bld.mkOp2v(OP_ADD, TYPE_U16, bld.getSSA(2), z, coords[2]);
1763bf215546Sopenharmony_ci      else
1764bf215546Sopenharmony_ci         coords[2] = z;
1765bf215546Sopenharmony_ci
1766bf215546Sopenharmony_ci      // Compute the surface parameters from tile shifts
1767bf215546Sopenharmony_ci      Value *tile_shift[3];
1768bf215546Sopenharmony_ci      Value *tile_size[3];
1769bf215546Sopenharmony_ci      Value *tile_mask[3];
1770bf215546Sopenharmony_ci      // We only ever use one kind of X-tiling.
1771bf215546Sopenharmony_ci      tile_shift[0] = bld.loadImm(NULL, (uint16_t)6);
1772bf215546Sopenharmony_ci      tile_size[0] = bld.loadImm(NULL, (uint16_t)64);
1773bf215546Sopenharmony_ci      tile_mask[0] = bld.loadImm(NULL, (uint16_t)63);
1774bf215546Sopenharmony_ci      // Fetch the "real" tiling parameters of the underlying surface
1775bf215546Sopenharmony_ci      for (int i = 1; i < 3; i++) {
1776bf215546Sopenharmony_ci         tile_shift[i] = loadSuInfo16(slot, NV50_SU_INFO_TILE_SHIFT(i));
1777bf215546Sopenharmony_ci         tile_size[i] = bld.mkOp2v(OP_SHL, TYPE_U16, bld.getSSA(2), bld.loadImm(NULL, (uint16_t)1), tile_shift[i]);
1778bf215546Sopenharmony_ci         tile_mask[i] = bld.mkOp2v(OP_ADD, TYPE_U16, bld.getSSA(2), tile_size[i], bld.loadImm(NULL, (uint16_t)-1));
1779bf215546Sopenharmony_ci      }
1780bf215546Sopenharmony_ci
1781bf215546Sopenharmony_ci      // Compute the location of given coordinate, both inside the tile as
1782bf215546Sopenharmony_ci      // well as which (linearly-laid out) tile it's in.
1783bf215546Sopenharmony_ci      Value *coord_in_tile[3];
1784bf215546Sopenharmony_ci      Value *tile[3];
1785bf215546Sopenharmony_ci      for (int i = 0; i < 3; i++) {
1786bf215546Sopenharmony_ci         coord_in_tile[i] = bld.mkOp2v(OP_AND, TYPE_U16, bld.getSSA(2), coords[i], tile_mask[i]);
1787bf215546Sopenharmony_ci         tile[i] = bld.mkOp2v(OP_SHR, TYPE_U16, bld.getSSA(2), coords[i], tile_shift[i]);
1788bf215546Sopenharmony_ci      }
1789bf215546Sopenharmony_ci
1790bf215546Sopenharmony_ci      // Based on the "real" tiling parameters, compute x/y coordinates in the
1791bf215546Sopenharmony_ci      // larger surface with 2d tiling that was supplied to the hardware. This
1792bf215546Sopenharmony_ci      // was determined and verified with the help of the tiling pseudocode in
1793bf215546Sopenharmony_ci      // the envytools docs.
1794bf215546Sopenharmony_ci      //
1795bf215546Sopenharmony_ci      // adj_x = x_coord_in_tile + x_tile * x_tile_size * z_tile_size +
1796bf215546Sopenharmony_ci      //         z_coord_in_tile * x_tile_size
1797bf215546Sopenharmony_ci      // adj_y = y_coord_in_tile + y_tile * y_tile_size +
1798bf215546Sopenharmony_ci      //         z_tile * y_tile_size * y_tiles
1799bf215546Sopenharmony_ci      //
1800bf215546Sopenharmony_ci      // Note: STRIDE_Y = y_tile_size * y_tiles
1801bf215546Sopenharmony_ci
1802bf215546Sopenharmony_ci      coords[0] = bld.mkOp2v(
1803bf215546Sopenharmony_ci            OP_ADD, TYPE_U16, bld.getSSA(2),
1804bf215546Sopenharmony_ci            bld.mkOp2v(OP_ADD, TYPE_U16, bld.getSSA(2),
1805bf215546Sopenharmony_ci                       coord_in_tile[0],
1806bf215546Sopenharmony_ci                       bld.mkOp2v(OP_SHL, TYPE_U16, bld.getSSA(2),
1807bf215546Sopenharmony_ci                                  tile[0],
1808bf215546Sopenharmony_ci                                  bld.mkOp2v(OP_ADD, TYPE_U16, bld.getSSA(2),
1809bf215546Sopenharmony_ci                                             tile_shift[2], tile_shift[0]))),
1810bf215546Sopenharmony_ci            bld.mkOp2v(OP_SHL, TYPE_U16, bld.getSSA(2),
1811bf215546Sopenharmony_ci                       coord_in_tile[2], tile_shift[0]));
1812bf215546Sopenharmony_ci
1813bf215546Sopenharmony_ci      Instruction *mul = bld.mkOp2(OP_MUL, TYPE_U32, bld.getSSA(4),
1814bf215546Sopenharmony_ci                                   tile[2], y_size_aligned);
1815bf215546Sopenharmony_ci      mul->sType = TYPE_U16;
1816bf215546Sopenharmony_ci      Value *muls[2];
1817bf215546Sopenharmony_ci      bld.mkSplit(muls, 2, mul->getDef(0));
1818bf215546Sopenharmony_ci
1819bf215546Sopenharmony_ci      coords[1] = bld.mkOp2v(
1820bf215546Sopenharmony_ci            OP_ADD, TYPE_U16, bld.getSSA(2),
1821bf215546Sopenharmony_ci            muls[0],
1822bf215546Sopenharmony_ci            bld.mkOp2v(OP_ADD, TYPE_U16, bld.getSSA(2),
1823bf215546Sopenharmony_ci                       coord_in_tile[1],
1824bf215546Sopenharmony_ci                       bld.mkOp2v(OP_SHL, TYPE_U16, bld.getSSA(2),
1825bf215546Sopenharmony_ci                                  tile[1], tile_shift[1])));
1826bf215546Sopenharmony_ci   }
1827bf215546Sopenharmony_ci
1828bf215546Sopenharmony_ci   return bld.mkOp2v(OP_MERGE, TYPE_U32, bld.getSSA(), coords[0], coords[1]);
1829bf215546Sopenharmony_ci}
1830bf215546Sopenharmony_ci
1831bf215546Sopenharmony_ci// This is largely a copy of NVC0LoweringPass::convertSurfaceFormat, but
1832bf215546Sopenharmony_ci// adjusted to make use of 16-bit math where possible.
1833bf215546Sopenharmony_cibool
1834bf215546Sopenharmony_ciNV50LoweringPreSSA::handleSULDP(TexInstruction *su)
1835bf215546Sopenharmony_ci{
1836bf215546Sopenharmony_ci   const int slot = su->tex.r;
1837bf215546Sopenharmony_ci   assert(!su->getIndirectR());
1838bf215546Sopenharmony_ci
1839bf215546Sopenharmony_ci   bld.setPosition(su, false);
1840bf215546Sopenharmony_ci
1841bf215546Sopenharmony_ci   const TexInstruction::ImgFormatDesc *format = su->tex.format;
1842bf215546Sopenharmony_ci   const int bytes = (su->tex.format->bits[0] +
1843bf215546Sopenharmony_ci                      su->tex.format->bits[1] +
1844bf215546Sopenharmony_ci                      su->tex.format->bits[2] +
1845bf215546Sopenharmony_ci                      su->tex.format->bits[3]) / 8;
1846bf215546Sopenharmony_ci   DataType ty = typeOfSize(bytes);
1847bf215546Sopenharmony_ci
1848bf215546Sopenharmony_ci   Value *coord = processSurfaceCoords(su);
1849bf215546Sopenharmony_ci
1850bf215546Sopenharmony_ci   Value *untypedDst[4] = {};
1851bf215546Sopenharmony_ci   Value *typedDst[4] = {};
1852bf215546Sopenharmony_ci   int i;
1853bf215546Sopenharmony_ci   for (i = 0; i < bytes / 4; i++)
1854bf215546Sopenharmony_ci      untypedDst[i] = bld.getSSA();
1855bf215546Sopenharmony_ci   if (bytes < 4)
1856bf215546Sopenharmony_ci      untypedDst[0] = bld.getSSA();
1857bf215546Sopenharmony_ci
1858bf215546Sopenharmony_ci   for (i = 0; i < 4; i++)
1859bf215546Sopenharmony_ci      typedDst[i] = su->getDef(i);
1860bf215546Sopenharmony_ci
1861bf215546Sopenharmony_ci   Instruction *load = bld.mkLoad(ty, NULL, bld.mkSymbol(FILE_MEMORY_GLOBAL, slot, ty, 0), coord);
1862bf215546Sopenharmony_ci   for (i = 0; i < 4 && untypedDst[i]; i++)
1863bf215546Sopenharmony_ci      load->setDef(i, untypedDst[i]);
1864bf215546Sopenharmony_ci
1865bf215546Sopenharmony_ci   // Unpack each component into the typed dsts
1866bf215546Sopenharmony_ci   int bits = 0;
1867bf215546Sopenharmony_ci   for (int i = 0; i < 4; bits += format->bits[i], i++) {
1868bf215546Sopenharmony_ci      if (!typedDst[i])
1869bf215546Sopenharmony_ci         continue;
1870bf215546Sopenharmony_ci
1871bf215546Sopenharmony_ci      if (i >= format->components) {
1872bf215546Sopenharmony_ci         if (format->type == FLOAT ||
1873bf215546Sopenharmony_ci             format->type == UNORM ||
1874bf215546Sopenharmony_ci             format->type == SNORM)
1875bf215546Sopenharmony_ci            bld.loadImm(typedDst[i], i == 3 ? 1.0f : 0.0f);
1876bf215546Sopenharmony_ci         else
1877bf215546Sopenharmony_ci            bld.loadImm(typedDst[i], i == 3 ? 1 : 0);
1878bf215546Sopenharmony_ci         continue;
1879bf215546Sopenharmony_ci      }
1880bf215546Sopenharmony_ci
1881bf215546Sopenharmony_ci      // Get just that component's data into the relevant place
1882bf215546Sopenharmony_ci      if (format->bits[i] == 32)
1883bf215546Sopenharmony_ci         bld.mkMov(typedDst[i], untypedDst[i]);
1884bf215546Sopenharmony_ci      else if (format->bits[i] == 16) {
1885bf215546Sopenharmony_ci         // We can always convert directly from the appropriate half of the
1886bf215546Sopenharmony_ci         // loaded value into the typed result.
1887bf215546Sopenharmony_ci         Value *src[2];
1888bf215546Sopenharmony_ci         bld.mkSplit(src, 2, untypedDst[i / 2]);
1889bf215546Sopenharmony_ci         bld.mkCvt(OP_CVT, getShaderType(format->type), typedDst[i],
1890bf215546Sopenharmony_ci                   getPackedType(format, i), src[i & 1]);
1891bf215546Sopenharmony_ci      }
1892bf215546Sopenharmony_ci      else if (format->bits[i] == 8) {
1893bf215546Sopenharmony_ci         // Same approach as for 16 bits, but we have to massage the value a
1894bf215546Sopenharmony_ci         // bit more, since we have to get the appropriate 8 bits from the
1895bf215546Sopenharmony_ci         // half-register. In all cases, we can CVT from a 8-bit source, so we
1896bf215546Sopenharmony_ci         // only have to shift when we want the upper 8 bits.
1897bf215546Sopenharmony_ci         Value *src[2], *shifted;
1898bf215546Sopenharmony_ci         bld.mkSplit(src, 2, untypedDst[0]);
1899bf215546Sopenharmony_ci         DataType packedType = getPackedType(format, i);
1900bf215546Sopenharmony_ci         if (i & 1)
1901bf215546Sopenharmony_ci            shifted = bld.mkOp2v(OP_SHR, TYPE_U16, bld.getSSA(2), src[!!(i & 2)], bld.loadImm(NULL, (uint16_t)8));
1902bf215546Sopenharmony_ci         else
1903bf215546Sopenharmony_ci            shifted = src[!!(i & 2)];
1904bf215546Sopenharmony_ci
1905bf215546Sopenharmony_ci         bld.mkCvt(OP_CVT, getShaderType(format->type), typedDst[i],
1906bf215546Sopenharmony_ci                   packedType, shifted);
1907bf215546Sopenharmony_ci      }
1908bf215546Sopenharmony_ci      else {
1909bf215546Sopenharmony_ci         // The options are 10, 11, and 2. Get it into a 32-bit reg, then
1910bf215546Sopenharmony_ci         // shift/mask. That's where it'll have to end up anyways. For signed,
1911bf215546Sopenharmony_ci         // we have to make sure to get sign-extension, so we actually have to
1912bf215546Sopenharmony_ci         // shift *up* first, and then shift down. There's no advantage to
1913bf215546Sopenharmony_ci         // AND'ing, so we don't.
1914bf215546Sopenharmony_ci         DataType ty = TYPE_U32;
1915bf215546Sopenharmony_ci         if (format->type == SNORM || format->type == SINT) {
1916bf215546Sopenharmony_ci            ty = TYPE_S32;
1917bf215546Sopenharmony_ci         }
1918bf215546Sopenharmony_ci
1919bf215546Sopenharmony_ci         // Poor man's EXTBF
1920bf215546Sopenharmony_ci         bld.mkOp2(
1921bf215546Sopenharmony_ci               OP_SHR, ty, typedDst[i],
1922bf215546Sopenharmony_ci               bld.mkOp2v(OP_SHL, TYPE_U32, bld.getSSA(), untypedDst[0], bld.loadImm(NULL, 32 - bits - format->bits[i])),
1923bf215546Sopenharmony_ci               bld.loadImm(NULL, 32 - format->bits[i]));
1924bf215546Sopenharmony_ci
1925bf215546Sopenharmony_ci         // If the stored data is already in the appropriate type, we don't
1926bf215546Sopenharmony_ci         // have to do anything. Convert to float for the *NORM formats.
1927bf215546Sopenharmony_ci         if (format->type == UNORM || format->type == SNORM)
1928bf215546Sopenharmony_ci            bld.mkCvt(OP_CVT, TYPE_F32, typedDst[i], TYPE_U32, typedDst[i]);
1929bf215546Sopenharmony_ci      }
1930bf215546Sopenharmony_ci
1931bf215546Sopenharmony_ci      // Normalize / convert as necessary
1932bf215546Sopenharmony_ci      if (format->type == UNORM)
1933bf215546Sopenharmony_ci         bld.mkOp2(OP_MUL, TYPE_F32, typedDst[i], typedDst[i], bld.loadImm(NULL, 1.0f / ((1 << format->bits[i]) - 1)));
1934bf215546Sopenharmony_ci      else if (format->type == SNORM)
1935bf215546Sopenharmony_ci         bld.mkOp2(OP_MUL, TYPE_F32, typedDst[i], typedDst[i], bld.loadImm(NULL, 1.0f / ((1 << (format->bits[i] - 1)) - 1)));
1936bf215546Sopenharmony_ci      else if (format->type == FLOAT && format->bits[i] < 16) {
1937bf215546Sopenharmony_ci         // We expect the value to be in the low bits of the register, so we
1938bf215546Sopenharmony_ci         // have to shift back up.
1939bf215546Sopenharmony_ci         bld.mkOp2(OP_SHL, TYPE_U32, typedDst[i], typedDst[i], bld.loadImm(NULL, 15 - format->bits[i]));
1940bf215546Sopenharmony_ci         Value *src[2];
1941bf215546Sopenharmony_ci         bld.mkSplit(src, 2, typedDst[i]);
1942bf215546Sopenharmony_ci         bld.mkCvt(OP_CVT, TYPE_F32, typedDst[i], TYPE_F16, src[0]);
1943bf215546Sopenharmony_ci      }
1944bf215546Sopenharmony_ci   }
1945bf215546Sopenharmony_ci
1946bf215546Sopenharmony_ci   if (format->bgra) {
1947bf215546Sopenharmony_ci      std::swap(typedDst[0], typedDst[2]);
1948bf215546Sopenharmony_ci   }
1949bf215546Sopenharmony_ci
1950bf215546Sopenharmony_ci   bld.getBB()->remove(su);
1951bf215546Sopenharmony_ci   return true;
1952bf215546Sopenharmony_ci}
1953bf215546Sopenharmony_ci
1954bf215546Sopenharmony_cibool
1955bf215546Sopenharmony_ciNV50LoweringPreSSA::handleSUREDP(TexInstruction *su)
1956bf215546Sopenharmony_ci{
1957bf215546Sopenharmony_ci   const int slot = su->tex.r;
1958bf215546Sopenharmony_ci   const int dim = su->tex.target.getDim();
1959bf215546Sopenharmony_ci   const int arg = dim + (su->tex.target.isArray() || su->tex.target.isCube());
1960bf215546Sopenharmony_ci   assert(!su->getIndirectR());
1961bf215546Sopenharmony_ci
1962bf215546Sopenharmony_ci   bld.setPosition(su, false);
1963bf215546Sopenharmony_ci
1964bf215546Sopenharmony_ci   Value *coord = processSurfaceCoords(su);
1965bf215546Sopenharmony_ci
1966bf215546Sopenharmony_ci   // This is guaranteed to be a 32-bit format. So there's nothing to
1967bf215546Sopenharmony_ci   // pack/unpack.
1968bf215546Sopenharmony_ci   Instruction *atom = bld.mkOp2(
1969bf215546Sopenharmony_ci         OP_ATOM, su->dType, su->getDef(0),
1970bf215546Sopenharmony_ci         bld.mkSymbol(FILE_MEMORY_GLOBAL, slot, TYPE_U32, 0), su->getSrc(arg));
1971bf215546Sopenharmony_ci   if (su->subOp == NV50_IR_SUBOP_ATOM_CAS)
1972bf215546Sopenharmony_ci      atom->setSrc(2, su->getSrc(arg + 1));
1973bf215546Sopenharmony_ci   atom->setIndirect(0, 0, coord);
1974bf215546Sopenharmony_ci   atom->subOp = su->subOp;
1975bf215546Sopenharmony_ci
1976bf215546Sopenharmony_ci   bld.getBB()->remove(su);
1977bf215546Sopenharmony_ci   return true;
1978bf215546Sopenharmony_ci}
1979bf215546Sopenharmony_ci
1980bf215546Sopenharmony_cibool
1981bf215546Sopenharmony_ciNV50LoweringPreSSA::handleSUSTP(TexInstruction *su)
1982bf215546Sopenharmony_ci{
1983bf215546Sopenharmony_ci   const int slot = su->tex.r;
1984bf215546Sopenharmony_ci   const int dim = su->tex.target.getDim();
1985bf215546Sopenharmony_ci   const int arg = dim + (su->tex.target.isArray() || su->tex.target.isCube());
1986bf215546Sopenharmony_ci   assert(!su->getIndirectR());
1987bf215546Sopenharmony_ci
1988bf215546Sopenharmony_ci   bld.setPosition(su, false);
1989bf215546Sopenharmony_ci
1990bf215546Sopenharmony_ci   const TexInstruction::ImgFormatDesc *format = su->tex.format;
1991bf215546Sopenharmony_ci   const int bytes = (su->tex.format->bits[0] +
1992bf215546Sopenharmony_ci                      su->tex.format->bits[1] +
1993bf215546Sopenharmony_ci                      su->tex.format->bits[2] +
1994bf215546Sopenharmony_ci                      su->tex.format->bits[3]) / 8;
1995bf215546Sopenharmony_ci   DataType ty = typeOfSize(bytes);
1996bf215546Sopenharmony_ci
1997bf215546Sopenharmony_ci   Value *coord = processSurfaceCoords(su);
1998bf215546Sopenharmony_ci
1999bf215546Sopenharmony_ci   // The packed values we will eventually store into memory
2000bf215546Sopenharmony_ci   Value *untypedDst[4] = {};
2001bf215546Sopenharmony_ci   // Each component's packed representation, in 16-bit registers (only used
2002bf215546Sopenharmony_ci   // where appropriate)
2003bf215546Sopenharmony_ci   Value *untypedDst16[4] = {};
2004bf215546Sopenharmony_ci   // The original values that are being packed
2005bf215546Sopenharmony_ci   Value *typedDst[4] = {};
2006bf215546Sopenharmony_ci   int i;
2007bf215546Sopenharmony_ci
2008bf215546Sopenharmony_ci   for (i = 0; i < bytes / 4; i++)
2009bf215546Sopenharmony_ci      untypedDst[i] = bld.getSSA();
2010bf215546Sopenharmony_ci   for (i = 0; i < format->components; i++)
2011bf215546Sopenharmony_ci      untypedDst16[i] = bld.getSSA(2);
2012bf215546Sopenharmony_ci   // Make sure we get at least one of each value allocated for the
2013bf215546Sopenharmony_ci   // super-narrow formats.
2014bf215546Sopenharmony_ci   if (bytes < 4)
2015bf215546Sopenharmony_ci      untypedDst[0] = bld.getSSA();
2016bf215546Sopenharmony_ci   if (bytes < 2)
2017bf215546Sopenharmony_ci      untypedDst16[0] = bld.getSSA(2);
2018bf215546Sopenharmony_ci
2019bf215546Sopenharmony_ci   for (i = 0; i < 4; i++) {
2020bf215546Sopenharmony_ci      typedDst[i] = bld.getSSA();
2021bf215546Sopenharmony_ci      bld.mkMov(typedDst[i], su->getSrc(arg + i));
2022bf215546Sopenharmony_ci   }
2023bf215546Sopenharmony_ci
2024bf215546Sopenharmony_ci   if (format->bgra) {
2025bf215546Sopenharmony_ci      std::swap(typedDst[0], typedDst[2]);
2026bf215546Sopenharmony_ci   }
2027bf215546Sopenharmony_ci
2028bf215546Sopenharmony_ci   // Pack each component into the untyped dsts.
2029bf215546Sopenharmony_ci   int bits = 0;
2030bf215546Sopenharmony_ci   for (int i = 0; i < format->components; bits += format->bits[i], i++) {
2031bf215546Sopenharmony_ci      // Un-normalize / convert as necessary
2032bf215546Sopenharmony_ci      if (format->type == UNORM)
2033bf215546Sopenharmony_ci         bld.mkOp2(OP_MUL, TYPE_F32, typedDst[i], typedDst[i], bld.loadImm(NULL, 1.0f * ((1 << format->bits[i]) - 1)));
2034bf215546Sopenharmony_ci      else if (format->type == SNORM)
2035bf215546Sopenharmony_ci         bld.mkOp2(OP_MUL, TYPE_F32, typedDst[i], typedDst[i], bld.loadImm(NULL, 1.0f * ((1 << (format->bits[i] - 1)) - 1)));
2036bf215546Sopenharmony_ci
2037bf215546Sopenharmony_ci      // There is nothing to convert/pack for 32-bit values
2038bf215546Sopenharmony_ci      if (format->bits[i] == 32) {
2039bf215546Sopenharmony_ci         bld.mkMov(untypedDst[i], typedDst[i]);
2040bf215546Sopenharmony_ci         continue;
2041bf215546Sopenharmony_ci      }
2042bf215546Sopenharmony_ci
2043bf215546Sopenharmony_ci      // The remainder of the cases will naturally want to deal in 16-bit
2044bf215546Sopenharmony_ci      // registers. We will put these into untypedDst16 and then merge them
2045bf215546Sopenharmony_ci      // together later.
2046bf215546Sopenharmony_ci      if (format->type == FLOAT && format->bits[i] < 16) {
2047bf215546Sopenharmony_ci         bld.mkCvt(OP_CVT, TYPE_F16, untypedDst16[i], TYPE_F32, typedDst[i]);
2048bf215546Sopenharmony_ci         bld.mkOp2(OP_SHR, TYPE_U16, untypedDst16[i], untypedDst16[i], bld.loadImm(NULL, (uint16_t)(15 - format->bits[i])));
2049bf215546Sopenharmony_ci
2050bf215546Sopenharmony_ci         // For odd bit sizes, it's easier to pack it into the final
2051bf215546Sopenharmony_ci         // destination directly.
2052bf215546Sopenharmony_ci         Value *tmp = bld.getSSA();
2053bf215546Sopenharmony_ci         bld.mkCvt(OP_CVT, TYPE_U32, tmp, TYPE_U16, untypedDst16[i]);
2054bf215546Sopenharmony_ci         if (i == 0) {
2055bf215546Sopenharmony_ci            untypedDst[0] = tmp;
2056bf215546Sopenharmony_ci         } else {
2057bf215546Sopenharmony_ci            bld.mkOp2(OP_SHL, TYPE_U32, tmp, tmp, bld.loadImm(NULL, bits));
2058bf215546Sopenharmony_ci            bld.mkOp2(OP_OR, TYPE_U32, untypedDst[0], untypedDst[0], tmp);
2059bf215546Sopenharmony_ci         }
2060bf215546Sopenharmony_ci      } else if (format->bits[i] == 16) {
2061bf215546Sopenharmony_ci         // We can always convert the shader value into the packed value
2062bf215546Sopenharmony_ci         // directly here
2063bf215546Sopenharmony_ci         bld.mkCvt(OP_CVT, getPackedType(format, i), untypedDst16[i],
2064bf215546Sopenharmony_ci                   getShaderType(format->type), typedDst[i]);
2065bf215546Sopenharmony_ci      } else if (format->bits[i] < 16) {
2066bf215546Sopenharmony_ci         DataType packedType = getPackedType(format, i);
2067bf215546Sopenharmony_ci         DataType shaderType = getShaderType(format->type);
2068bf215546Sopenharmony_ci         // We can't convert F32 to U8/S8 directly, so go to U16/S16 first.
2069bf215546Sopenharmony_ci         if (shaderType == TYPE_F32 && typeSizeof(packedType) == 1) {
2070bf215546Sopenharmony_ci            packedType = format->type == SNORM ? TYPE_S16 : TYPE_U16;
2071bf215546Sopenharmony_ci         }
2072bf215546Sopenharmony_ci         bld.mkCvt(OP_CVT, packedType, untypedDst16[i], shaderType, typedDst[i]);
2073bf215546Sopenharmony_ci         // TODO: clamp for 10- and 2-bit sizes. Also, due to the oddness of
2074bf215546Sopenharmony_ci         // the size, it's easier to dump them into a 32-bit value and OR
2075bf215546Sopenharmony_ci         // everything later.
2076bf215546Sopenharmony_ci         if (format->bits[i] != 8) {
2077bf215546Sopenharmony_ci            // Restrict value to the appropriate bits (although maybe supposed
2078bf215546Sopenharmony_ci            // to clamp instead?)
2079bf215546Sopenharmony_ci            bld.mkOp2(OP_AND, TYPE_U16, untypedDst16[i], untypedDst16[i], bld.loadImm(NULL, (uint16_t)((1 << format->bits[i]) - 1)));
2080bf215546Sopenharmony_ci            // And merge into final packed value
2081bf215546Sopenharmony_ci            Value *tmp = bld.getSSA();
2082bf215546Sopenharmony_ci            bld.mkCvt(OP_CVT, TYPE_U32, tmp, TYPE_U16, untypedDst16[i]);
2083bf215546Sopenharmony_ci            if (i == 0) {
2084bf215546Sopenharmony_ci               untypedDst[0] = tmp;
2085bf215546Sopenharmony_ci            } else {
2086bf215546Sopenharmony_ci               bld.mkOp2(OP_SHL, TYPE_U32, tmp, tmp, bld.loadImm(NULL, bits));
2087bf215546Sopenharmony_ci               bld.mkOp2(OP_OR, TYPE_U32, untypedDst[0], untypedDst[0], tmp);
2088bf215546Sopenharmony_ci            }
2089bf215546Sopenharmony_ci         } else if (i & 1) {
2090bf215546Sopenharmony_ci            // Shift the 8-bit value up (so that it can be OR'd later)
2091bf215546Sopenharmony_ci            bld.mkOp2(OP_SHL, TYPE_U16, untypedDst16[i], untypedDst16[i], bld.loadImm(NULL, (uint16_t)(bits % 16)));
2092bf215546Sopenharmony_ci         } else if (packedType != TYPE_U8) {
2093bf215546Sopenharmony_ci            // S8 (or the *16 if converted from float) will all have high bits
2094bf215546Sopenharmony_ci            // set, so AND them out.
2095bf215546Sopenharmony_ci            bld.mkOp2(OP_AND, TYPE_U16, untypedDst16[i], untypedDst16[i], bld.loadImm(NULL, (uint16_t)0xff));
2096bf215546Sopenharmony_ci         }
2097bf215546Sopenharmony_ci      }
2098bf215546Sopenharmony_ci   }
2099bf215546Sopenharmony_ci
2100bf215546Sopenharmony_ci   // OR pairs of 8-bit values together (into the even value)
2101bf215546Sopenharmony_ci   if (format->bits[0] == 8) {
2102bf215546Sopenharmony_ci      for (i = 0; i < 2 && untypedDst16[2 * i] && untypedDst16[2 * i + 1]; i++)
2103bf215546Sopenharmony_ci         bld.mkOp2(OP_OR, TYPE_U16, untypedDst16[2 * i], untypedDst16[2 * i], untypedDst16[2 * i + 1]);
2104bf215546Sopenharmony_ci   }
2105bf215546Sopenharmony_ci
2106bf215546Sopenharmony_ci   // We'll always want to have at least a 32-bit source register for the store
2107bf215546Sopenharmony_ci   Instruction *merge = bld.mkOp(OP_MERGE, bytes < 4 ? TYPE_U32 : ty, bld.getSSA(bytes < 4 ? 4 : bytes));
2108bf215546Sopenharmony_ci   if (format->bits[0] == 32) {
2109bf215546Sopenharmony_ci      for (i = 0; i < 4 && untypedDst[i]; i++)
2110bf215546Sopenharmony_ci         merge->setSrc(i, untypedDst[i]);
2111bf215546Sopenharmony_ci   } else if (format->bits[0] == 16) {
2112bf215546Sopenharmony_ci      for (i = 0; i < 4 && untypedDst16[i]; i++)
2113bf215546Sopenharmony_ci         merge->setSrc(i, untypedDst16[i]);
2114bf215546Sopenharmony_ci      if (i == 1)
2115bf215546Sopenharmony_ci         merge->setSrc(i, bld.getSSA(2));
2116bf215546Sopenharmony_ci   } else if (format->bits[0] == 8) {
2117bf215546Sopenharmony_ci      for (i = 0; i < 2 && untypedDst16[2 * i]; i++)
2118bf215546Sopenharmony_ci         merge->setSrc(i, untypedDst16[2 * i]);
2119bf215546Sopenharmony_ci      if (i == 1)
2120bf215546Sopenharmony_ci         merge->setSrc(i, bld.getSSA(2));
2121bf215546Sopenharmony_ci   } else {
2122bf215546Sopenharmony_ci      merge->setSrc(0, untypedDst[0]);
2123bf215546Sopenharmony_ci   }
2124bf215546Sopenharmony_ci
2125bf215546Sopenharmony_ci   bld.mkStore(OP_STORE, ty, bld.mkSymbol(FILE_MEMORY_GLOBAL, slot, TYPE_U32, 0), coord, merge->getDef(0));
2126bf215546Sopenharmony_ci
2127bf215546Sopenharmony_ci   bld.getBB()->remove(su);
2128bf215546Sopenharmony_ci   return true;
2129bf215546Sopenharmony_ci}
2130bf215546Sopenharmony_ci
2131bf215546Sopenharmony_cibool
2132bf215546Sopenharmony_ciNV50LoweringPreSSA::handlePFETCH(Instruction *i)
2133bf215546Sopenharmony_ci{
2134bf215546Sopenharmony_ci   assert(prog->getType() == Program::TYPE_GEOMETRY);
2135bf215546Sopenharmony_ci
2136bf215546Sopenharmony_ci   // NOTE: cannot use getImmediate here, not in SSA form yet, move to
2137bf215546Sopenharmony_ci   // later phase if that assertion ever triggers:
2138bf215546Sopenharmony_ci
2139bf215546Sopenharmony_ci   ImmediateValue *imm = i->getSrc(0)->asImm();
2140bf215546Sopenharmony_ci   assert(imm);
2141bf215546Sopenharmony_ci
2142bf215546Sopenharmony_ci   assert(imm->reg.data.u32 <= 127); // TODO: use address reg if that happens
2143bf215546Sopenharmony_ci
2144bf215546Sopenharmony_ci   if (i->srcExists(1)) {
2145bf215546Sopenharmony_ci      // indirect addressing of vertex in primitive space
2146bf215546Sopenharmony_ci
2147bf215546Sopenharmony_ci      LValue *val = bld.getScratch();
2148bf215546Sopenharmony_ci      Value *ptr = bld.getSSA(2, FILE_ADDRESS);
2149bf215546Sopenharmony_ci      bld.mkOp2v(OP_SHL, TYPE_U32, ptr, i->getSrc(1), bld.mkImm(2));
2150bf215546Sopenharmony_ci      bld.mkOp2v(OP_PFETCH, TYPE_U32, val, imm, ptr);
2151bf215546Sopenharmony_ci
2152bf215546Sopenharmony_ci      // NOTE: PFETCH directly to an $aX only works with direct addressing
2153bf215546Sopenharmony_ci      i->op = OP_SHL;
2154bf215546Sopenharmony_ci      i->setSrc(0, val);
2155bf215546Sopenharmony_ci      i->setSrc(1, bld.mkImm(0));
2156bf215546Sopenharmony_ci   }
2157bf215546Sopenharmony_ci
2158bf215546Sopenharmony_ci   return true;
2159bf215546Sopenharmony_ci}
2160bf215546Sopenharmony_ci
2161bf215546Sopenharmony_ci// Set flags according to predicate and make the instruction read $cX.
2162bf215546Sopenharmony_civoid
2163bf215546Sopenharmony_ciNV50LoweringPreSSA::checkPredicate(Instruction *insn)
2164bf215546Sopenharmony_ci{
2165bf215546Sopenharmony_ci   Value *pred = insn->getPredicate();
2166bf215546Sopenharmony_ci   Value *cdst;
2167bf215546Sopenharmony_ci
2168bf215546Sopenharmony_ci   // FILE_PREDICATE will simply be changed to FLAGS on conversion to SSA
2169bf215546Sopenharmony_ci   if (!pred ||
2170bf215546Sopenharmony_ci       pred->reg.file == FILE_FLAGS || pred->reg.file == FILE_PREDICATE)
2171bf215546Sopenharmony_ci      return;
2172bf215546Sopenharmony_ci
2173bf215546Sopenharmony_ci   cdst = bld.getSSA(1, FILE_FLAGS);
2174bf215546Sopenharmony_ci
2175bf215546Sopenharmony_ci   bld.mkCmp(OP_SET, CC_NEU, insn->dType, cdst, insn->dType, bld.loadImm(NULL, 0), pred);
2176bf215546Sopenharmony_ci
2177bf215546Sopenharmony_ci   insn->setPredicate(insn->cc, cdst);
2178bf215546Sopenharmony_ci}
2179bf215546Sopenharmony_ci
2180bf215546Sopenharmony_ci//
2181bf215546Sopenharmony_ci// - add quadop dance for texturing
2182bf215546Sopenharmony_ci// - put FP outputs in GPRs
2183bf215546Sopenharmony_ci// - convert instruction sequences
2184bf215546Sopenharmony_ci//
2185bf215546Sopenharmony_cibool
2186bf215546Sopenharmony_ciNV50LoweringPreSSA::visit(Instruction *i)
2187bf215546Sopenharmony_ci{
2188bf215546Sopenharmony_ci   bld.setPosition(i, false);
2189bf215546Sopenharmony_ci
2190bf215546Sopenharmony_ci   if (i->cc != CC_ALWAYS)
2191bf215546Sopenharmony_ci      checkPredicate(i);
2192bf215546Sopenharmony_ci
2193bf215546Sopenharmony_ci   switch (i->op) {
2194bf215546Sopenharmony_ci   case OP_TEX:
2195bf215546Sopenharmony_ci   case OP_TXF:
2196bf215546Sopenharmony_ci   case OP_TXG:
2197bf215546Sopenharmony_ci      return handleTEX(i->asTex());
2198bf215546Sopenharmony_ci   case OP_TXB:
2199bf215546Sopenharmony_ci      return handleTXB(i->asTex());
2200bf215546Sopenharmony_ci   case OP_TXL:
2201bf215546Sopenharmony_ci      return handleTXL(i->asTex());
2202bf215546Sopenharmony_ci   case OP_TXD:
2203bf215546Sopenharmony_ci      return handleTXD(i->asTex());
2204bf215546Sopenharmony_ci   case OP_TXLQ:
2205bf215546Sopenharmony_ci      return handleTXLQ(i->asTex());
2206bf215546Sopenharmony_ci   case OP_TXQ:
2207bf215546Sopenharmony_ci      return handleTXQ(i->asTex());
2208bf215546Sopenharmony_ci   case OP_EX2:
2209bf215546Sopenharmony_ci      bld.mkOp1(OP_PREEX2, TYPE_F32, i->getDef(0), i->getSrc(0));
2210bf215546Sopenharmony_ci      i->setSrc(0, i->getDef(0));
2211bf215546Sopenharmony_ci      break;
2212bf215546Sopenharmony_ci   case OP_SET:
2213bf215546Sopenharmony_ci      return handleSET(i);
2214bf215546Sopenharmony_ci   case OP_SLCT:
2215bf215546Sopenharmony_ci      return handleSLCT(i->asCmp());
2216bf215546Sopenharmony_ci   case OP_SELP:
2217bf215546Sopenharmony_ci      return handleSELP(i);
2218bf215546Sopenharmony_ci   case OP_POW:
2219bf215546Sopenharmony_ci      return handlePOW(i);
2220bf215546Sopenharmony_ci   case OP_DIV:
2221bf215546Sopenharmony_ci      return handleDIV(i);
2222bf215546Sopenharmony_ci   case OP_SQRT:
2223bf215546Sopenharmony_ci      return handleSQRT(i);
2224bf215546Sopenharmony_ci   case OP_EXPORT:
2225bf215546Sopenharmony_ci      return handleEXPORT(i);
2226bf215546Sopenharmony_ci   case OP_LOAD:
2227bf215546Sopenharmony_ci      return handleLOAD(i);
2228bf215546Sopenharmony_ci   case OP_MEMBAR:
2229bf215546Sopenharmony_ci      return handleMEMBAR(i);
2230bf215546Sopenharmony_ci   case OP_ATOM:
2231bf215546Sopenharmony_ci   case OP_STORE:
2232bf215546Sopenharmony_ci      return handleLDST(i);
2233bf215546Sopenharmony_ci   case OP_SULDP:
2234bf215546Sopenharmony_ci      return handleSULDP(i->asTex());
2235bf215546Sopenharmony_ci   case OP_SUSTP:
2236bf215546Sopenharmony_ci      return handleSUSTP(i->asTex());
2237bf215546Sopenharmony_ci   case OP_SUREDP:
2238bf215546Sopenharmony_ci      return handleSUREDP(i->asTex());
2239bf215546Sopenharmony_ci   case OP_SUQ:
2240bf215546Sopenharmony_ci      return handleSUQ(i->asTex());
2241bf215546Sopenharmony_ci   case OP_BUFQ:
2242bf215546Sopenharmony_ci      return handleBUFQ(i);
2243bf215546Sopenharmony_ci   case OP_RDSV:
2244bf215546Sopenharmony_ci      return handleRDSV(i);
2245bf215546Sopenharmony_ci   case OP_WRSV:
2246bf215546Sopenharmony_ci      return handleWRSV(i);
2247bf215546Sopenharmony_ci   case OP_CALL:
2248bf215546Sopenharmony_ci      return handleCALL(i);
2249bf215546Sopenharmony_ci   case OP_PRECONT:
2250bf215546Sopenharmony_ci      return handlePRECONT(i);
2251bf215546Sopenharmony_ci   case OP_CONT:
2252bf215546Sopenharmony_ci      return handleCONT(i);
2253bf215546Sopenharmony_ci   case OP_PFETCH:
2254bf215546Sopenharmony_ci      return handlePFETCH(i);
2255bf215546Sopenharmony_ci   default:
2256bf215546Sopenharmony_ci      break;
2257bf215546Sopenharmony_ci   }
2258bf215546Sopenharmony_ci   return true;
2259bf215546Sopenharmony_ci}
2260bf215546Sopenharmony_ci
2261bf215546Sopenharmony_cibool
2262bf215546Sopenharmony_ciTargetNV50::runLegalizePass(Program *prog, CGStage stage) const
2263bf215546Sopenharmony_ci{
2264bf215546Sopenharmony_ci   bool ret = false;
2265bf215546Sopenharmony_ci
2266bf215546Sopenharmony_ci   if (stage == CG_STAGE_PRE_SSA) {
2267bf215546Sopenharmony_ci      NV50LoweringPreSSA pass(prog);
2268bf215546Sopenharmony_ci      ret = pass.run(prog, false, true);
2269bf215546Sopenharmony_ci   } else
2270bf215546Sopenharmony_ci   if (stage == CG_STAGE_SSA) {
2271bf215546Sopenharmony_ci      if (!prog->targetPriv)
2272bf215546Sopenharmony_ci         prog->targetPriv = new std::list<Instruction *>();
2273bf215546Sopenharmony_ci      NV50LegalizeSSA pass(prog);
2274bf215546Sopenharmony_ci      ret = pass.run(prog, false, true);
2275bf215546Sopenharmony_ci   } else
2276bf215546Sopenharmony_ci   if (stage == CG_STAGE_POST_RA) {
2277bf215546Sopenharmony_ci      NV50LegalizePostRA pass;
2278bf215546Sopenharmony_ci      ret = pass.run(prog, false, true);
2279bf215546Sopenharmony_ci      if (prog->targetPriv)
2280bf215546Sopenharmony_ci         delete reinterpret_cast<std::list<Instruction *> *>(prog->targetPriv);
2281bf215546Sopenharmony_ci   }
2282bf215546Sopenharmony_ci   return ret;
2283bf215546Sopenharmony_ci}
2284bf215546Sopenharmony_ci
2285bf215546Sopenharmony_ci} // namespace nv50_ir
2286