/*
 * Copyright 2011 Christoph Bumiller
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
 * OTHER DEALINGS IN THE SOFTWARE.
 */
22bf215546Sopenharmony_ci
23bf215546Sopenharmony_ci#include "nv50_ir.h"
24bf215546Sopenharmony_ci#include "nv50_ir_build_util.h"
25bf215546Sopenharmony_ci
26bf215546Sopenharmony_ci#include "nv50_ir_target_nvc0.h"
27bf215546Sopenharmony_ci#include "nv50_ir_lowering_nvc0.h"
28bf215546Sopenharmony_ci
29bf215546Sopenharmony_ci#include <limits>
30bf215546Sopenharmony_ci
31bf215546Sopenharmony_cinamespace nv50_ir {
32bf215546Sopenharmony_ci
// 2-bit operation selectors that QUADOP() packs, one per quad position.
#define QOP_ADD  0
#define QOP_SUBR 1
#define QOP_SUB  2
#define QOP_MOV2 3

// Packs four QOP_* selectors into a single 8-bit value. The arguments are
// the selector names without the QOP_ prefix, given in the order
// UL, UR, LL, LR (presumably upper-left .. lower-right of the quad):
// UL lands in bits 7:6, UR in bits 5:4, LL in bits 3:2 and LR in bits 1:0.
#define QUADOP(ul, ur, ll, lr)                  \
   ((QOP_##ul << 6) | (QOP_##ur << 4) |         \
    (QOP_##ll << 2) | (QOP_##lr << 0))
42bf215546Sopenharmony_ci
43bf215546Sopenharmony_civoid
44bf215546Sopenharmony_ciNVC0LegalizeSSA::handleDIV(Instruction *i)
45bf215546Sopenharmony_ci{
46bf215546Sopenharmony_ci   FlowInstruction *call;
47bf215546Sopenharmony_ci   int builtin;
48bf215546Sopenharmony_ci
49bf215546Sopenharmony_ci   bld.setPosition(i, false);
50bf215546Sopenharmony_ci
51bf215546Sopenharmony_ci   // Generate movs to the input regs for the call we want to generate
52bf215546Sopenharmony_ci   for (int s = 0; i->srcExists(s); ++s) {
53bf215546Sopenharmony_ci      Instruction *ld = i->getSrc(s)->getInsn();
54bf215546Sopenharmony_ci      // check if we are moving an immediate, propagate it in that case
55bf215546Sopenharmony_ci      if (!ld || ld->fixed || (ld->op != OP_LOAD && ld->op != OP_MOV) ||
56bf215546Sopenharmony_ci            !(ld->src(0).getFile() == FILE_IMMEDIATE))
57bf215546Sopenharmony_ci         bld.mkMovToReg(s, i->getSrc(s));
58bf215546Sopenharmony_ci      else {
59bf215546Sopenharmony_ci         assert(ld->getSrc(0) != NULL);
60bf215546Sopenharmony_ci         bld.mkMovToReg(s, ld->getSrc(0));
61bf215546Sopenharmony_ci         // Clear the src, to make code elimination possible here before we
62bf215546Sopenharmony_ci         // delete the instruction i later
63bf215546Sopenharmony_ci         i->setSrc(s, NULL);
64bf215546Sopenharmony_ci         if (ld->isDead())
65bf215546Sopenharmony_ci            delete_Instruction(prog, ld);
66bf215546Sopenharmony_ci      }
67bf215546Sopenharmony_ci   }
68bf215546Sopenharmony_ci
69bf215546Sopenharmony_ci   switch (i->dType) {
70bf215546Sopenharmony_ci   case TYPE_U32: builtin = NVC0_BUILTIN_DIV_U32; break;
71bf215546Sopenharmony_ci   case TYPE_S32: builtin = NVC0_BUILTIN_DIV_S32; break;
72bf215546Sopenharmony_ci   default:
73bf215546Sopenharmony_ci      return;
74bf215546Sopenharmony_ci   }
75bf215546Sopenharmony_ci   call = bld.mkFlow(OP_CALL, NULL, CC_ALWAYS, NULL);
76bf215546Sopenharmony_ci   bld.mkMovFromReg(i->getDef(0), i->op == OP_DIV ? 0 : 1);
77bf215546Sopenharmony_ci   bld.mkClobber(FILE_GPR, (i->op == OP_DIV) ? 0xe : 0xd, 2);
78bf215546Sopenharmony_ci   bld.mkClobber(FILE_PREDICATE, (i->dType == TYPE_S32) ? 0xf : 0x3, 0);
79bf215546Sopenharmony_ci
80bf215546Sopenharmony_ci   call->fixed = 1;
81bf215546Sopenharmony_ci   call->absolute = call->builtin = 1;
82bf215546Sopenharmony_ci   call->target.builtin = builtin;
83bf215546Sopenharmony_ci   delete_Instruction(prog, i);
84bf215546Sopenharmony_ci}
85bf215546Sopenharmony_ci
86bf215546Sopenharmony_civoid
87bf215546Sopenharmony_ciNVC0LegalizeSSA::handleRCPRSQLib(Instruction *i, Value *src[])
88bf215546Sopenharmony_ci{
89bf215546Sopenharmony_ci   FlowInstruction *call;
90bf215546Sopenharmony_ci   Value *def[2];
91bf215546Sopenharmony_ci   int builtin;
92bf215546Sopenharmony_ci
93bf215546Sopenharmony_ci   def[0] = bld.mkMovToReg(0, src[0])->getDef(0);
94bf215546Sopenharmony_ci   def[1] = bld.mkMovToReg(1, src[1])->getDef(0);
95bf215546Sopenharmony_ci
96bf215546Sopenharmony_ci   if (i->op == OP_RCP)
97bf215546Sopenharmony_ci      builtin = NVC0_BUILTIN_RCP_F64;
98bf215546Sopenharmony_ci   else
99bf215546Sopenharmony_ci      builtin = NVC0_BUILTIN_RSQ_F64;
100bf215546Sopenharmony_ci
101bf215546Sopenharmony_ci   call = bld.mkFlow(OP_CALL, NULL, CC_ALWAYS, NULL);
102bf215546Sopenharmony_ci   def[0] = bld.getSSA();
103bf215546Sopenharmony_ci   def[1] = bld.getSSA();
104bf215546Sopenharmony_ci   bld.mkMovFromReg(def[0], 0);
105bf215546Sopenharmony_ci   bld.mkMovFromReg(def[1], 1);
106bf215546Sopenharmony_ci   bld.mkClobber(FILE_GPR, 0x3fc, 2);
107bf215546Sopenharmony_ci   bld.mkClobber(FILE_PREDICATE, i->op == OP_RSQ ? 0x3 : 0x1, 0);
108bf215546Sopenharmony_ci   bld.mkOp2(OP_MERGE, TYPE_U64, i->getDef(0), def[0], def[1]);
109bf215546Sopenharmony_ci
110bf215546Sopenharmony_ci   call->fixed = 1;
111bf215546Sopenharmony_ci   call->absolute = call->builtin = 1;
112bf215546Sopenharmony_ci   call->target.builtin = builtin;
113bf215546Sopenharmony_ci   delete_Instruction(prog, i);
114bf215546Sopenharmony_ci
115bf215546Sopenharmony_ci   prog->fp64 = true;
116bf215546Sopenharmony_ci}
117bf215546Sopenharmony_ci
118bf215546Sopenharmony_civoid
119bf215546Sopenharmony_ciNVC0LegalizeSSA::handleRCPRSQ(Instruction *i)
120bf215546Sopenharmony_ci{
121bf215546Sopenharmony_ci   assert(i->dType == TYPE_F64);
122bf215546Sopenharmony_ci   // There are instructions that will compute the high 32 bits of the 64-bit
123bf215546Sopenharmony_ci   // float. We will just stick 0 in the bottom 32 bits.
124bf215546Sopenharmony_ci
125bf215546Sopenharmony_ci   bld.setPosition(i, false);
126bf215546Sopenharmony_ci
127bf215546Sopenharmony_ci   // 1. Take the source and it up.
128bf215546Sopenharmony_ci   Value *src[2], *dst[2], *def = i->getDef(0);
129bf215546Sopenharmony_ci   bld.mkSplit(src, 4, i->getSrc(0));
130bf215546Sopenharmony_ci
131bf215546Sopenharmony_ci   int chip = prog->getTarget()->getChipset();
132bf215546Sopenharmony_ci   if (chip >= NVISA_GK104_CHIPSET) {
133bf215546Sopenharmony_ci      handleRCPRSQLib(i, src);
134bf215546Sopenharmony_ci      return;
135bf215546Sopenharmony_ci   }
136bf215546Sopenharmony_ci
137bf215546Sopenharmony_ci   // 2. We don't care about the low 32 bits of the destination. Stick a 0 in.
138bf215546Sopenharmony_ci   dst[0] = bld.loadImm(NULL, 0);
139bf215546Sopenharmony_ci   dst[1] = bld.getSSA();
140bf215546Sopenharmony_ci
141bf215546Sopenharmony_ci   // 3. The new version of the instruction takes the high 32 bits of the
142bf215546Sopenharmony_ci   // source and outputs the high 32 bits of the destination.
143bf215546Sopenharmony_ci   i->setSrc(0, src[1]);
144bf215546Sopenharmony_ci   i->setDef(0, dst[1]);
145bf215546Sopenharmony_ci   i->setType(TYPE_F32);
146bf215546Sopenharmony_ci   i->subOp = NV50_IR_SUBOP_RCPRSQ_64H;
147bf215546Sopenharmony_ci
148bf215546Sopenharmony_ci   // 4. Recombine the two dst pieces back into the original destination.
149bf215546Sopenharmony_ci   bld.setPosition(i, true);
150bf215546Sopenharmony_ci   bld.mkOp2(OP_MERGE, TYPE_U64, def, dst[0], dst[1]);
151bf215546Sopenharmony_ci}
152bf215546Sopenharmony_ci
153bf215546Sopenharmony_civoid
154bf215546Sopenharmony_ciNVC0LegalizeSSA::handleFTZ(Instruction *i)
155bf215546Sopenharmony_ci{
156bf215546Sopenharmony_ci   // Only want to flush float inputs
157bf215546Sopenharmony_ci   assert(i->sType == TYPE_F32);
158bf215546Sopenharmony_ci
159bf215546Sopenharmony_ci   // If we're already flushing denorms (and NaN's) to zero, no need for this.
160bf215546Sopenharmony_ci   if (i->dnz)
161bf215546Sopenharmony_ci      return;
162bf215546Sopenharmony_ci
163bf215546Sopenharmony_ci   // Only certain classes of operations can flush
164bf215546Sopenharmony_ci   OpClass cls = prog->getTarget()->getOpClass(i->op);
165bf215546Sopenharmony_ci   if (cls != OPCLASS_ARITH && cls != OPCLASS_COMPARE &&
166bf215546Sopenharmony_ci       cls != OPCLASS_CONVERT)
167bf215546Sopenharmony_ci      return;
168bf215546Sopenharmony_ci
169bf215546Sopenharmony_ci   i->ftz = true;
170bf215546Sopenharmony_ci}
171bf215546Sopenharmony_ci
172bf215546Sopenharmony_civoid
173bf215546Sopenharmony_ciNVC0LegalizeSSA::handleTEXLOD(TexInstruction *i)
174bf215546Sopenharmony_ci{
175bf215546Sopenharmony_ci   if (i->tex.levelZero)
176bf215546Sopenharmony_ci      return;
177bf215546Sopenharmony_ci
178bf215546Sopenharmony_ci   ImmediateValue lod;
179bf215546Sopenharmony_ci
180bf215546Sopenharmony_ci   // The LOD argument comes right after the coordinates (before depth bias,
181bf215546Sopenharmony_ci   // offsets, etc).
182bf215546Sopenharmony_ci   int arg = i->tex.target.getArgCount();
183bf215546Sopenharmony_ci
184bf215546Sopenharmony_ci   // SM30+ stores the indirect handle as a separate arg, which comes before
185bf215546Sopenharmony_ci   // the LOD.
186bf215546Sopenharmony_ci   if (prog->getTarget()->getChipset() >= NVISA_GK104_CHIPSET &&
187bf215546Sopenharmony_ci       i->tex.rIndirectSrc >= 0)
188bf215546Sopenharmony_ci      arg++;
189bf215546Sopenharmony_ci   // SM20 stores indirect handle combined with array coordinate
190bf215546Sopenharmony_ci   if (prog->getTarget()->getChipset() < NVISA_GK104_CHIPSET &&
191bf215546Sopenharmony_ci       !i->tex.target.isArray() &&
192bf215546Sopenharmony_ci       i->tex.rIndirectSrc >= 0)
193bf215546Sopenharmony_ci      arg++;
194bf215546Sopenharmony_ci
195bf215546Sopenharmony_ci   if (!i->src(arg).getImmediate(lod) || !lod.isInteger(0))
196bf215546Sopenharmony_ci      return;
197bf215546Sopenharmony_ci
198bf215546Sopenharmony_ci   if (i->op == OP_TXL)
199bf215546Sopenharmony_ci      i->op = OP_TEX;
200bf215546Sopenharmony_ci   i->tex.levelZero = true;
201bf215546Sopenharmony_ci   i->moveSources(arg + 1, -1);
202bf215546Sopenharmony_ci}
203bf215546Sopenharmony_ci
204bf215546Sopenharmony_civoid
205bf215546Sopenharmony_ciNVC0LegalizeSSA::handleShift(Instruction *lo)
206bf215546Sopenharmony_ci{
207bf215546Sopenharmony_ci   Value *shift = lo->getSrc(1);
208bf215546Sopenharmony_ci   Value *dst64 = lo->getDef(0);
209bf215546Sopenharmony_ci   Value *src[2], *dst[2];
210bf215546Sopenharmony_ci   operation op = lo->op;
211bf215546Sopenharmony_ci
212bf215546Sopenharmony_ci   bld.setPosition(lo, false);
213bf215546Sopenharmony_ci
214bf215546Sopenharmony_ci   bld.mkSplit(src, 4, lo->getSrc(0));
215bf215546Sopenharmony_ci
216bf215546Sopenharmony_ci   // SM30 and prior don't have the fancy new SHF.L/R ops. So the logic has to
217bf215546Sopenharmony_ci   // be completely emulated. For SM35+, we can use the more directed SHF
218bf215546Sopenharmony_ci   // operations.
219bf215546Sopenharmony_ci   if (prog->getTarget()->getChipset() < NVISA_GK20A_CHIPSET) {
220bf215546Sopenharmony_ci      // The strategy here is to handle shifts >= 32 and less than 32 as
221bf215546Sopenharmony_ci      // separate parts.
222bf215546Sopenharmony_ci      //
223bf215546Sopenharmony_ci      // For SHL:
224bf215546Sopenharmony_ci      // If the shift is <= 32, then
225bf215546Sopenharmony_ci      //   (HI,LO) << x = (HI << x | (LO >> (32 - x)), LO << x)
226bf215546Sopenharmony_ci      // If the shift is > 32, then
227bf215546Sopenharmony_ci      //   (HI,LO) << x = (LO << (x - 32), 0)
228bf215546Sopenharmony_ci      //
229bf215546Sopenharmony_ci      // For SHR:
230bf215546Sopenharmony_ci      // If the shift is <= 32, then
231bf215546Sopenharmony_ci      //   (HI,LO) >> x = (HI >> x, (HI << (32 - x)) | LO >> x)
232bf215546Sopenharmony_ci      // If the shift is > 32, then
233bf215546Sopenharmony_ci      //   (HI,LO) >> x = (0, HI >> (x - 32))
234bf215546Sopenharmony_ci      //
235bf215546Sopenharmony_ci      // Note that on NVIDIA hardware, a shift > 32 yields a 0 value, which we
236bf215546Sopenharmony_ci      // can use to our advantage. Also note the structural similarities
237bf215546Sopenharmony_ci      // between the right/left cases. The main difference is swapping hi/lo
238bf215546Sopenharmony_ci      // on input and output.
239bf215546Sopenharmony_ci
240bf215546Sopenharmony_ci      Value *x32_minus_shift, *pred, *hi1, *hi2;
241bf215546Sopenharmony_ci      DataType type = isSignedIntType(lo->dType) ? TYPE_S32 : TYPE_U32;
242bf215546Sopenharmony_ci      operation antiop = op == OP_SHR ? OP_SHL : OP_SHR;
243bf215546Sopenharmony_ci      if (op == OP_SHR)
244bf215546Sopenharmony_ci         std::swap(src[0], src[1]);
245bf215546Sopenharmony_ci      bld.mkOp2(OP_ADD, TYPE_U32, (x32_minus_shift = bld.getSSA()), shift, bld.mkImm(0x20))
246bf215546Sopenharmony_ci         ->src(0).mod = Modifier(NV50_IR_MOD_NEG);
247bf215546Sopenharmony_ci      bld.mkCmp(OP_SET, CC_LE, TYPE_U8, (pred = bld.getSSA(1, FILE_PREDICATE)),
248bf215546Sopenharmony_ci                TYPE_U32, shift, bld.mkImm(32));
249bf215546Sopenharmony_ci      // Compute HI (shift <= 32)
250bf215546Sopenharmony_ci      bld.mkOp2(OP_OR, TYPE_U32, (hi1 = bld.getSSA()),
251bf215546Sopenharmony_ci                bld.mkOp2v(op, TYPE_U32, bld.getSSA(), src[1], shift),
252bf215546Sopenharmony_ci                bld.mkOp2v(antiop, TYPE_U32, bld.getSSA(), src[0], x32_minus_shift))
253bf215546Sopenharmony_ci         ->setPredicate(CC_P, pred);
254bf215546Sopenharmony_ci      // Compute LO (all shift values)
255bf215546Sopenharmony_ci      bld.mkOp2(op, type, (dst[0] = bld.getSSA()), src[0], shift);
256bf215546Sopenharmony_ci      // Compute HI (shift > 32)
257bf215546Sopenharmony_ci      bld.mkOp2(op, type, (hi2 = bld.getSSA()), src[0],
258bf215546Sopenharmony_ci                bld.mkOp1v(OP_NEG, TYPE_S32, bld.getSSA(), x32_minus_shift))
259bf215546Sopenharmony_ci         ->setPredicate(CC_NOT_P, pred);
260bf215546Sopenharmony_ci      bld.mkOp2(OP_UNION, TYPE_U32, (dst[1] = bld.getSSA()), hi1, hi2);
261bf215546Sopenharmony_ci      if (op == OP_SHR)
262bf215546Sopenharmony_ci         std::swap(dst[0], dst[1]);
263bf215546Sopenharmony_ci      bld.mkOp2(OP_MERGE, TYPE_U64, dst64, dst[0], dst[1]);
264bf215546Sopenharmony_ci      delete_Instruction(prog, lo);
265bf215546Sopenharmony_ci      return;
266bf215546Sopenharmony_ci   }
267bf215546Sopenharmony_ci
268bf215546Sopenharmony_ci   Instruction *hi = new_Instruction(func, op, TYPE_U32);
269bf215546Sopenharmony_ci   lo->bb->insertAfter(lo, hi);
270bf215546Sopenharmony_ci
271bf215546Sopenharmony_ci   hi->sType = lo->sType;
272bf215546Sopenharmony_ci   lo->dType = TYPE_U32;
273bf215546Sopenharmony_ci
274bf215546Sopenharmony_ci   hi->setDef(0, (dst[1] = bld.getSSA()));
275bf215546Sopenharmony_ci   if (lo->op == OP_SHR)
276bf215546Sopenharmony_ci      hi->subOp |= NV50_IR_SUBOP_SHIFT_HIGH;
277bf215546Sopenharmony_ci   lo->setDef(0, (dst[0] = bld.getSSA()));
278bf215546Sopenharmony_ci
279bf215546Sopenharmony_ci   bld.setPosition(hi, true);
280bf215546Sopenharmony_ci
281bf215546Sopenharmony_ci   if (lo->op == OP_SHL)
282bf215546Sopenharmony_ci      std::swap(hi, lo);
283bf215546Sopenharmony_ci
284bf215546Sopenharmony_ci   hi->setSrc(0, new_ImmediateValue(prog, 0u));
285bf215546Sopenharmony_ci   hi->setSrc(1, shift);
286bf215546Sopenharmony_ci   hi->setSrc(2, lo->op == OP_SHL ? src[0] : src[1]);
287bf215546Sopenharmony_ci
288bf215546Sopenharmony_ci   lo->setSrc(0, src[0]);
289bf215546Sopenharmony_ci   lo->setSrc(1, shift);
290bf215546Sopenharmony_ci   lo->setSrc(2, src[1]);
291bf215546Sopenharmony_ci
292bf215546Sopenharmony_ci   bld.mkOp2(OP_MERGE, TYPE_U64, dst64, dst[0], dst[1]);
293bf215546Sopenharmony_ci}
294bf215546Sopenharmony_ci
295bf215546Sopenharmony_civoid
296bf215546Sopenharmony_ciNVC0LegalizeSSA::handleSET(CmpInstruction *cmp)
297bf215546Sopenharmony_ci{
298bf215546Sopenharmony_ci   DataType hTy = cmp->sType == TYPE_S64 ? TYPE_S32 : TYPE_U32;
299bf215546Sopenharmony_ci   Value *carry;
300bf215546Sopenharmony_ci   Value *src0[2], *src1[2];
301bf215546Sopenharmony_ci   bld.setPosition(cmp, false);
302bf215546Sopenharmony_ci
303bf215546Sopenharmony_ci   bld.mkSplit(src0, 4, cmp->getSrc(0));
304bf215546Sopenharmony_ci   bld.mkSplit(src1, 4, cmp->getSrc(1));
305bf215546Sopenharmony_ci   bld.mkOp2(OP_SUB, hTy, NULL, src0[0], src1[0])
306bf215546Sopenharmony_ci      ->setFlagsDef(0, (carry = bld.getSSA(1, FILE_FLAGS)));
307bf215546Sopenharmony_ci   cmp->setFlagsSrc(cmp->srcCount(), carry);
308bf215546Sopenharmony_ci   cmp->setSrc(0, src0[1]);
309bf215546Sopenharmony_ci   cmp->setSrc(1, src1[1]);
310bf215546Sopenharmony_ci   cmp->sType = hTy;
311bf215546Sopenharmony_ci}
312bf215546Sopenharmony_ci
313bf215546Sopenharmony_civoid
314bf215546Sopenharmony_ciNVC0LegalizeSSA::handleBREV(Instruction *i)
315bf215546Sopenharmony_ci{
316bf215546Sopenharmony_ci   i->op = OP_EXTBF;
317bf215546Sopenharmony_ci   i->subOp = NV50_IR_SUBOP_EXTBF_REV;
318bf215546Sopenharmony_ci   i->setSrc(1, bld.mkImm(0x2000));
319bf215546Sopenharmony_ci}
320bf215546Sopenharmony_ci
321bf215546Sopenharmony_cibool
322bf215546Sopenharmony_ciNVC0LegalizeSSA::visit(Function *fn)
323bf215546Sopenharmony_ci{
324bf215546Sopenharmony_ci   bld.setProgram(fn->getProgram());
325bf215546Sopenharmony_ci   return true;
326bf215546Sopenharmony_ci}
327bf215546Sopenharmony_ci
328bf215546Sopenharmony_cibool
329bf215546Sopenharmony_ciNVC0LegalizeSSA::visit(BasicBlock *bb)
330bf215546Sopenharmony_ci{
331bf215546Sopenharmony_ci   Instruction *next;
332bf215546Sopenharmony_ci   for (Instruction *i = bb->getEntry(); i; i = next) {
333bf215546Sopenharmony_ci      next = i->next;
334bf215546Sopenharmony_ci
335bf215546Sopenharmony_ci      if (i->sType == TYPE_F32 && prog->getType() != Program::TYPE_COMPUTE)
336bf215546Sopenharmony_ci         handleFTZ(i);
337bf215546Sopenharmony_ci
338bf215546Sopenharmony_ci      switch (i->op) {
339bf215546Sopenharmony_ci      case OP_DIV:
340bf215546Sopenharmony_ci      case OP_MOD:
341bf215546Sopenharmony_ci         if (i->sType != TYPE_F32)
342bf215546Sopenharmony_ci            handleDIV(i);
343bf215546Sopenharmony_ci         break;
344bf215546Sopenharmony_ci      case OP_RCP:
345bf215546Sopenharmony_ci      case OP_RSQ:
346bf215546Sopenharmony_ci         if (i->dType == TYPE_F64)
347bf215546Sopenharmony_ci            handleRCPRSQ(i);
348bf215546Sopenharmony_ci         break;
349bf215546Sopenharmony_ci      case OP_TXL:
350bf215546Sopenharmony_ci      case OP_TXF:
351bf215546Sopenharmony_ci         handleTEXLOD(i->asTex());
352bf215546Sopenharmony_ci         break;
353bf215546Sopenharmony_ci      case OP_SHR:
354bf215546Sopenharmony_ci      case OP_SHL:
355bf215546Sopenharmony_ci         if (typeSizeof(i->sType) == 8)
356bf215546Sopenharmony_ci            handleShift(i);
357bf215546Sopenharmony_ci         break;
358bf215546Sopenharmony_ci      case OP_SET:
359bf215546Sopenharmony_ci      case OP_SET_AND:
360bf215546Sopenharmony_ci      case OP_SET_OR:
361bf215546Sopenharmony_ci      case OP_SET_XOR:
362bf215546Sopenharmony_ci         if (typeSizeof(i->sType) == 8 && i->sType != TYPE_F64)
363bf215546Sopenharmony_ci            handleSET(i->asCmp());
364bf215546Sopenharmony_ci         break;
365bf215546Sopenharmony_ci      case OP_BREV:
366bf215546Sopenharmony_ci         handleBREV(i);
367bf215546Sopenharmony_ci         break;
368bf215546Sopenharmony_ci      default:
369bf215546Sopenharmony_ci         break;
370bf215546Sopenharmony_ci      }
371bf215546Sopenharmony_ci   }
372bf215546Sopenharmony_ci   return true;
373bf215546Sopenharmony_ci}
374bf215546Sopenharmony_ci
375bf215546Sopenharmony_ciNVC0LegalizePostRA::NVC0LegalizePostRA(const Program *prog)
376bf215546Sopenharmony_ci   : rZero(NULL),
377bf215546Sopenharmony_ci     carry(NULL),
378bf215546Sopenharmony_ci     pOne(NULL),
379bf215546Sopenharmony_ci     needTexBar(prog->getTarget()->getChipset() >= 0xe0 &&
380bf215546Sopenharmony_ci                prog->getTarget()->getChipset() < 0x110)
381bf215546Sopenharmony_ci{
382bf215546Sopenharmony_ci}
383bf215546Sopenharmony_ci
384bf215546Sopenharmony_cibool
385bf215546Sopenharmony_ciNVC0LegalizePostRA::insnDominatedBy(const Instruction *later,
386bf215546Sopenharmony_ci                                    const Instruction *early) const
387bf215546Sopenharmony_ci{
388bf215546Sopenharmony_ci   if (early->bb == later->bb)
389bf215546Sopenharmony_ci      return early->serial < later->serial;
390bf215546Sopenharmony_ci   return later->bb->dominatedBy(early->bb);
391bf215546Sopenharmony_ci}
392bf215546Sopenharmony_ci
393bf215546Sopenharmony_civoid
394bf215546Sopenharmony_ciNVC0LegalizePostRA::addTexUse(std::list<TexUse> &uses,
395bf215546Sopenharmony_ci                              Instruction *usei, const Instruction *texi)
396bf215546Sopenharmony_ci{
397bf215546Sopenharmony_ci   bool add = true;
398bf215546Sopenharmony_ci   bool dominated = insnDominatedBy(usei, texi);
399bf215546Sopenharmony_ci   // Uses before the tex have to all be included. Just because an earlier
400bf215546Sopenharmony_ci   // instruction dominates another instruction doesn't mean that there's no
401bf215546Sopenharmony_ci   // way to get from the tex to the later instruction. For example you could
402bf215546Sopenharmony_ci   // have nested loops, with the tex in the inner loop, and uses before it in
403bf215546Sopenharmony_ci   // both loops - even though the outer loop's instruction would dominate the
404bf215546Sopenharmony_ci   // inner's, we still want a texbar before the inner loop's instruction.
405bf215546Sopenharmony_ci   //
406bf215546Sopenharmony_ci   // However we can still use the eliding logic between uses dominated by the
407bf215546Sopenharmony_ci   // tex instruction, as that is unambiguously correct.
408bf215546Sopenharmony_ci   if (dominated) {
409bf215546Sopenharmony_ci      for (std::list<TexUse>::iterator it = uses.begin(); it != uses.end();) {
410bf215546Sopenharmony_ci         if (it->after) {
411bf215546Sopenharmony_ci            if (insnDominatedBy(usei, it->insn)) {
412bf215546Sopenharmony_ci               add = false;
413bf215546Sopenharmony_ci               break;
414bf215546Sopenharmony_ci            }
415bf215546Sopenharmony_ci            if (insnDominatedBy(it->insn, usei)) {
416bf215546Sopenharmony_ci               it = uses.erase(it);
417bf215546Sopenharmony_ci               continue;
418bf215546Sopenharmony_ci            }
419bf215546Sopenharmony_ci         }
420bf215546Sopenharmony_ci         ++it;
421bf215546Sopenharmony_ci      }
422bf215546Sopenharmony_ci   }
423bf215546Sopenharmony_ci   if (add)
424bf215546Sopenharmony_ci      uses.push_back(TexUse(usei, texi, dominated));
425bf215546Sopenharmony_ci}
426bf215546Sopenharmony_ci
427bf215546Sopenharmony_ci// While it might be tempting to use the an algorithm that just looks at tex
428bf215546Sopenharmony_ci// uses, not all texture results are guaranteed to be used on all paths. In
429bf215546Sopenharmony_ci// the case where along some control flow path a texture result is never used,
430bf215546Sopenharmony_ci// we might reuse that register for something else, creating a
431bf215546Sopenharmony_ci// write-after-write hazard. So we have to manually look through all
432bf215546Sopenharmony_ci// instructions looking for ones that reference the registers in question.
433bf215546Sopenharmony_civoid
434bf215546Sopenharmony_ciNVC0LegalizePostRA::findFirstUses(
435bf215546Sopenharmony_ci   Instruction *texi, std::list<TexUse> &uses)
436bf215546Sopenharmony_ci{
437bf215546Sopenharmony_ci   int minGPR = texi->def(0).rep()->reg.data.id;
438bf215546Sopenharmony_ci   int maxGPR = minGPR + texi->def(0).rep()->reg.size / 4 - 1;
439bf215546Sopenharmony_ci
440bf215546Sopenharmony_ci   std::unordered_set<const BasicBlock *> visited;
441bf215546Sopenharmony_ci   findFirstUsesBB(minGPR, maxGPR, texi->next, texi, uses, visited);
442bf215546Sopenharmony_ci}
443bf215546Sopenharmony_ci
444bf215546Sopenharmony_civoid
445bf215546Sopenharmony_ciNVC0LegalizePostRA::findFirstUsesBB(
446bf215546Sopenharmony_ci   int minGPR, int maxGPR, Instruction *start,
447bf215546Sopenharmony_ci   const Instruction *texi, std::list<TexUse> &uses,
448bf215546Sopenharmony_ci   std::unordered_set<const BasicBlock *> &visited)
449bf215546Sopenharmony_ci{
450bf215546Sopenharmony_ci   const BasicBlock *bb = start->bb;
451bf215546Sopenharmony_ci
452bf215546Sopenharmony_ci   // We don't process the whole bb the first time around. This is correct,
453bf215546Sopenharmony_ci   // however we might be in a loop and hit this BB again, and need to process
454bf215546Sopenharmony_ci   // the full thing. So only mark a bb as visited if we processed it from the
455bf215546Sopenharmony_ci   // beginning.
456bf215546Sopenharmony_ci   if (start == bb->getEntry()) {
457bf215546Sopenharmony_ci      if (visited.find(bb) != visited.end())
458bf215546Sopenharmony_ci         return;
459bf215546Sopenharmony_ci      visited.insert(bb);
460bf215546Sopenharmony_ci   }
461bf215546Sopenharmony_ci
462bf215546Sopenharmony_ci   for (Instruction *insn = start; insn != bb->getExit(); insn = insn->next) {
463bf215546Sopenharmony_ci      if (insn->isNop())
464bf215546Sopenharmony_ci         continue;
465bf215546Sopenharmony_ci
466bf215546Sopenharmony_ci      for (int d = 0; insn->defExists(d); ++d) {
467bf215546Sopenharmony_ci         const Value *def = insn->def(d).rep();
468bf215546Sopenharmony_ci         if (insn->def(d).getFile() != FILE_GPR ||
469bf215546Sopenharmony_ci             def->reg.data.id + def->reg.size / 4 - 1 < minGPR ||
470bf215546Sopenharmony_ci             def->reg.data.id > maxGPR)
471bf215546Sopenharmony_ci            continue;
472bf215546Sopenharmony_ci         addTexUse(uses, insn, texi);
473bf215546Sopenharmony_ci         return;
474bf215546Sopenharmony_ci      }
475bf215546Sopenharmony_ci
476bf215546Sopenharmony_ci      for (int s = 0; insn->srcExists(s); ++s) {
477bf215546Sopenharmony_ci         const Value *src = insn->src(s).rep();
478bf215546Sopenharmony_ci         if (insn->src(s).getFile() != FILE_GPR ||
479bf215546Sopenharmony_ci             src->reg.data.id + src->reg.size / 4 - 1 < minGPR ||
480bf215546Sopenharmony_ci             src->reg.data.id > maxGPR)
481bf215546Sopenharmony_ci            continue;
482bf215546Sopenharmony_ci         addTexUse(uses, insn, texi);
483bf215546Sopenharmony_ci         return;
484bf215546Sopenharmony_ci      }
485bf215546Sopenharmony_ci   }
486bf215546Sopenharmony_ci
487bf215546Sopenharmony_ci   for (Graph::EdgeIterator ei = bb->cfg.outgoing(); !ei.end(); ei.next()) {
488bf215546Sopenharmony_ci      findFirstUsesBB(minGPR, maxGPR, BasicBlock::get(ei.getNode())->getEntry(),
489bf215546Sopenharmony_ci                      texi, uses, visited);
490bf215546Sopenharmony_ci   }
491bf215546Sopenharmony_ci}
492bf215546Sopenharmony_ci
493bf215546Sopenharmony_ci// Texture barriers:
494bf215546Sopenharmony_ci// This pass is a bit long and ugly and can probably be optimized.
495bf215546Sopenharmony_ci//
496bf215546Sopenharmony_ci// 1. obtain a list of TEXes and their outputs' first use(s)
497bf215546Sopenharmony_ci// 2. calculate the barrier level of each first use (minimal number of TEXes,
498bf215546Sopenharmony_ci//    over all paths, between the TEX and the use in question)
499bf215546Sopenharmony_ci// 3. for each barrier, if all paths from the source TEX to that barrier
500bf215546Sopenharmony_ci//    contain a barrier of lesser level, it can be culled
501bf215546Sopenharmony_cibool
502bf215546Sopenharmony_ciNVC0LegalizePostRA::insertTextureBarriers(Function *fn)
503bf215546Sopenharmony_ci{
504bf215546Sopenharmony_ci   std::list<TexUse> *uses;
505bf215546Sopenharmony_ci   std::vector<Instruction *> texes;
506bf215546Sopenharmony_ci   std::vector<int> bbFirstTex;
507bf215546Sopenharmony_ci   std::vector<int> bbFirstUse;
508bf215546Sopenharmony_ci   std::vector<int> texCounts;
509bf215546Sopenharmony_ci   std::vector<TexUse> useVec;
510bf215546Sopenharmony_ci   ArrayList insns;
511bf215546Sopenharmony_ci
512bf215546Sopenharmony_ci   fn->orderInstructions(insns);
513bf215546Sopenharmony_ci
514bf215546Sopenharmony_ci   texCounts.resize(fn->allBBlocks.getSize(), 0);
515bf215546Sopenharmony_ci   bbFirstTex.resize(fn->allBBlocks.getSize(), insns.getSize());
516bf215546Sopenharmony_ci   bbFirstUse.resize(fn->allBBlocks.getSize(), insns.getSize());
517bf215546Sopenharmony_ci
518bf215546Sopenharmony_ci   // tag BB CFG nodes by their id for later
519bf215546Sopenharmony_ci   for (ArrayList::Iterator i = fn->allBBlocks.iterator(); !i.end(); i.next()) {
520bf215546Sopenharmony_ci      BasicBlock *bb = reinterpret_cast<BasicBlock *>(i.get());
521bf215546Sopenharmony_ci      if (bb)
522bf215546Sopenharmony_ci         bb->cfg.tag = bb->getId();
523bf215546Sopenharmony_ci   }
524bf215546Sopenharmony_ci
525bf215546Sopenharmony_ci   // gather the first uses for each TEX
526bf215546Sopenharmony_ci   for (int i = 0; i < insns.getSize(); ++i) {
527bf215546Sopenharmony_ci      Instruction *tex = reinterpret_cast<Instruction *>(insns.get(i));
528bf215546Sopenharmony_ci      if (isTextureOp(tex->op)) {
529bf215546Sopenharmony_ci         texes.push_back(tex);
530bf215546Sopenharmony_ci         if (!texCounts.at(tex->bb->getId()))
531bf215546Sopenharmony_ci            bbFirstTex[tex->bb->getId()] = texes.size() - 1;
532bf215546Sopenharmony_ci         texCounts[tex->bb->getId()]++;
533bf215546Sopenharmony_ci      }
534bf215546Sopenharmony_ci   }
535bf215546Sopenharmony_ci   insns.clear();
536bf215546Sopenharmony_ci   if (texes.empty())
537bf215546Sopenharmony_ci      return false;
538bf215546Sopenharmony_ci   uses = new std::list<TexUse>[texes.size()];
539bf215546Sopenharmony_ci   if (!uses)
540bf215546Sopenharmony_ci      return false;
541bf215546Sopenharmony_ci   for (size_t i = 0; i < texes.size(); ++i) {
542bf215546Sopenharmony_ci      findFirstUses(texes[i], uses[i]);
543bf215546Sopenharmony_ci   }
544bf215546Sopenharmony_ci
545bf215546Sopenharmony_ci   // determine the barrier level at each use
546bf215546Sopenharmony_ci   for (size_t i = 0; i < texes.size(); ++i) {
547bf215546Sopenharmony_ci      for (std::list<TexUse>::iterator u = uses[i].begin(); u != uses[i].end();
548bf215546Sopenharmony_ci           ++u) {
549bf215546Sopenharmony_ci         BasicBlock *tb = texes[i]->bb;
550bf215546Sopenharmony_ci         BasicBlock *ub = u->insn->bb;
551bf215546Sopenharmony_ci         if (tb == ub) {
552bf215546Sopenharmony_ci            u->level = 0;
553bf215546Sopenharmony_ci            for (size_t j = i + 1; j < texes.size() &&
554bf215546Sopenharmony_ci                    texes[j]->bb == tb && texes[j]->serial < u->insn->serial;
555bf215546Sopenharmony_ci                 ++j)
556bf215546Sopenharmony_ci               u->level++;
557bf215546Sopenharmony_ci         } else {
558bf215546Sopenharmony_ci            u->level = fn->cfg.findLightestPathWeight(&tb->cfg,
559bf215546Sopenharmony_ci                                                      &ub->cfg, texCounts);
560bf215546Sopenharmony_ci            if (u->level < 0) {
561bf215546Sopenharmony_ci               WARN("Failed to find path TEX -> TEXBAR\n");
562bf215546Sopenharmony_ci               u->level = 0;
563bf215546Sopenharmony_ci               continue;
564bf215546Sopenharmony_ci            }
565bf215546Sopenharmony_ci            // this counted all TEXes in the origin block, correct that
566bf215546Sopenharmony_ci            u->level -= i - bbFirstTex.at(tb->getId()) + 1 /* this TEX */;
567bf215546Sopenharmony_ci            // and did not count the TEXes in the destination block, add those
568bf215546Sopenharmony_ci            for (size_t j = bbFirstTex.at(ub->getId()); j < texes.size() &&
569bf215546Sopenharmony_ci                    texes[j]->bb == ub && texes[j]->serial < u->insn->serial;
570bf215546Sopenharmony_ci                 ++j)
571bf215546Sopenharmony_ci               u->level++;
572bf215546Sopenharmony_ci         }
573bf215546Sopenharmony_ci         assert(u->level >= 0);
574bf215546Sopenharmony_ci         useVec.push_back(*u);
575bf215546Sopenharmony_ci      }
576bf215546Sopenharmony_ci   }
577bf215546Sopenharmony_ci   delete[] uses;
578bf215546Sopenharmony_ci
579bf215546Sopenharmony_ci   // insert the barriers
580bf215546Sopenharmony_ci   for (size_t i = 0; i < useVec.size(); ++i) {
581bf215546Sopenharmony_ci      Instruction *prev = useVec[i].insn->prev;
582bf215546Sopenharmony_ci      if (useVec[i].level < 0)
583bf215546Sopenharmony_ci         continue;
584bf215546Sopenharmony_ci      if (prev && prev->op == OP_TEXBAR) {
585bf215546Sopenharmony_ci         if (prev->subOp > useVec[i].level)
586bf215546Sopenharmony_ci            prev->subOp = useVec[i].level;
587bf215546Sopenharmony_ci         prev->setSrc(prev->srcCount(), useVec[i].tex->getDef(0));
588bf215546Sopenharmony_ci      } else {
589bf215546Sopenharmony_ci         Instruction *bar = new_Instruction(func, OP_TEXBAR, TYPE_NONE);
590bf215546Sopenharmony_ci         bar->fixed = 1;
591bf215546Sopenharmony_ci         bar->subOp = useVec[i].level;
592bf215546Sopenharmony_ci         // make use explicit to ease latency calculation
593bf215546Sopenharmony_ci         bar->setSrc(bar->srcCount(), useVec[i].tex->getDef(0));
594bf215546Sopenharmony_ci         useVec[i].insn->bb->insertBefore(useVec[i].insn, bar);
595bf215546Sopenharmony_ci      }
596bf215546Sopenharmony_ci   }
597bf215546Sopenharmony_ci
598bf215546Sopenharmony_ci   if (fn->getProgram()->optLevel < 3)
599bf215546Sopenharmony_ci      return true;
600bf215546Sopenharmony_ci
601bf215546Sopenharmony_ci   std::vector<Limits> limitT, limitB, limitS; // entry, exit, single
602bf215546Sopenharmony_ci
603bf215546Sopenharmony_ci   limitT.resize(fn->allBBlocks.getSize(), Limits(0, 0));
604bf215546Sopenharmony_ci   limitB.resize(fn->allBBlocks.getSize(), Limits(0, 0));
605bf215546Sopenharmony_ci   limitS.resize(fn->allBBlocks.getSize());
606bf215546Sopenharmony_ci
607bf215546Sopenharmony_ci   // cull unneeded barriers (should do that earlier, but for simplicity)
608bf215546Sopenharmony_ci   IteratorRef bi = fn->cfg.iteratorCFG();
609bf215546Sopenharmony_ci   // first calculate min/max outstanding TEXes for each BB
610bf215546Sopenharmony_ci   for (bi->reset(); !bi->end(); bi->next()) {
611bf215546Sopenharmony_ci      Graph::Node *n = reinterpret_cast<Graph::Node *>(bi->get());
612bf215546Sopenharmony_ci      BasicBlock *bb = BasicBlock::get(n);
613bf215546Sopenharmony_ci      int min = 0;
614bf215546Sopenharmony_ci      int max = std::numeric_limits<int>::max();
615bf215546Sopenharmony_ci      for (Instruction *i = bb->getFirst(); i; i = i->next) {
616bf215546Sopenharmony_ci         if (isTextureOp(i->op)) {
617bf215546Sopenharmony_ci            min++;
618bf215546Sopenharmony_ci            if (max < std::numeric_limits<int>::max())
619bf215546Sopenharmony_ci               max++;
620bf215546Sopenharmony_ci         } else
621bf215546Sopenharmony_ci         if (i->op == OP_TEXBAR) {
622bf215546Sopenharmony_ci            min = MIN2(min, i->subOp);
623bf215546Sopenharmony_ci            max = MIN2(max, i->subOp);
624bf215546Sopenharmony_ci         }
625bf215546Sopenharmony_ci      }
626bf215546Sopenharmony_ci      // limits when looking at an isolated block
627bf215546Sopenharmony_ci      limitS[bb->getId()].min = min;
628bf215546Sopenharmony_ci      limitS[bb->getId()].max = max;
629bf215546Sopenharmony_ci   }
630bf215546Sopenharmony_ci   // propagate the min/max values
631bf215546Sopenharmony_ci   for (unsigned int l = 0; l <= fn->loopNestingBound; ++l) {
632bf215546Sopenharmony_ci      for (bi->reset(); !bi->end(); bi->next()) {
633bf215546Sopenharmony_ci         Graph::Node *n = reinterpret_cast<Graph::Node *>(bi->get());
634bf215546Sopenharmony_ci         BasicBlock *bb = BasicBlock::get(n);
635bf215546Sopenharmony_ci         const int bbId = bb->getId();
636bf215546Sopenharmony_ci         for (Graph::EdgeIterator ei = n->incident(); !ei.end(); ei.next()) {
637bf215546Sopenharmony_ci            BasicBlock *in = BasicBlock::get(ei.getNode());
638bf215546Sopenharmony_ci            const int inId = in->getId();
639bf215546Sopenharmony_ci            limitT[bbId].min = MAX2(limitT[bbId].min, limitB[inId].min);
640bf215546Sopenharmony_ci            limitT[bbId].max = MAX2(limitT[bbId].max, limitB[inId].max);
641bf215546Sopenharmony_ci         }
642bf215546Sopenharmony_ci         // I just hope this is correct ...
643bf215546Sopenharmony_ci         if (limitS[bbId].max == std::numeric_limits<int>::max()) {
644bf215546Sopenharmony_ci            // no barrier
645bf215546Sopenharmony_ci            limitB[bbId].min = limitT[bbId].min + limitS[bbId].min;
646bf215546Sopenharmony_ci            limitB[bbId].max = limitT[bbId].max + limitS[bbId].min;
647bf215546Sopenharmony_ci         } else {
648bf215546Sopenharmony_ci            // block contained a barrier
649bf215546Sopenharmony_ci            limitB[bbId].min = MIN2(limitS[bbId].max,
650bf215546Sopenharmony_ci                                    limitT[bbId].min + limitS[bbId].min);
651bf215546Sopenharmony_ci            limitB[bbId].max = MIN2(limitS[bbId].max,
652bf215546Sopenharmony_ci                                    limitT[bbId].max + limitS[bbId].min);
653bf215546Sopenharmony_ci         }
654bf215546Sopenharmony_ci      }
655bf215546Sopenharmony_ci   }
656bf215546Sopenharmony_ci   // finally delete unnecessary barriers
657bf215546Sopenharmony_ci   for (bi->reset(); !bi->end(); bi->next()) {
658bf215546Sopenharmony_ci      Graph::Node *n = reinterpret_cast<Graph::Node *>(bi->get());
659bf215546Sopenharmony_ci      BasicBlock *bb = BasicBlock::get(n);
660bf215546Sopenharmony_ci      Instruction *prev = NULL;
661bf215546Sopenharmony_ci      Instruction *next;
662bf215546Sopenharmony_ci      int max = limitT[bb->getId()].max;
663bf215546Sopenharmony_ci      for (Instruction *i = bb->getFirst(); i; i = next) {
664bf215546Sopenharmony_ci         next = i->next;
665bf215546Sopenharmony_ci         if (i->op == OP_TEXBAR) {
666bf215546Sopenharmony_ci            if (i->subOp >= max) {
667bf215546Sopenharmony_ci               delete_Instruction(prog, i);
668bf215546Sopenharmony_ci               i = NULL;
669bf215546Sopenharmony_ci            } else {
670bf215546Sopenharmony_ci               max = i->subOp;
671bf215546Sopenharmony_ci               if (prev && prev->op == OP_TEXBAR && prev->subOp >= max) {
672bf215546Sopenharmony_ci                  delete_Instruction(prog, prev);
673bf215546Sopenharmony_ci                  prev = NULL;
674bf215546Sopenharmony_ci               }
675bf215546Sopenharmony_ci            }
676bf215546Sopenharmony_ci         } else
677bf215546Sopenharmony_ci         if (isTextureOp(i->op)) {
678bf215546Sopenharmony_ci            max++;
679bf215546Sopenharmony_ci         }
680bf215546Sopenharmony_ci         if (i && !i->isNop())
681bf215546Sopenharmony_ci            prev = i;
682bf215546Sopenharmony_ci      }
683bf215546Sopenharmony_ci   }
684bf215546Sopenharmony_ci   return true;
685bf215546Sopenharmony_ci}
686bf215546Sopenharmony_ci
687bf215546Sopenharmony_cibool
688bf215546Sopenharmony_ciNVC0LegalizePostRA::visit(Function *fn)
689bf215546Sopenharmony_ci{
690bf215546Sopenharmony_ci   if (needTexBar)
691bf215546Sopenharmony_ci      insertTextureBarriers(fn);
692bf215546Sopenharmony_ci
693bf215546Sopenharmony_ci   rZero = new_LValue(fn, FILE_GPR);
694bf215546Sopenharmony_ci   pOne = new_LValue(fn, FILE_PREDICATE);
695bf215546Sopenharmony_ci   carry = new_LValue(fn, FILE_FLAGS);
696bf215546Sopenharmony_ci
697bf215546Sopenharmony_ci   rZero->reg.data.id = (prog->getTarget()->getChipset() >= NVISA_GK20A_CHIPSET) ? 255 : 63;
698bf215546Sopenharmony_ci   carry->reg.data.id = 0;
699bf215546Sopenharmony_ci   pOne->reg.data.id = 7;
700bf215546Sopenharmony_ci
701bf215546Sopenharmony_ci   return true;
702bf215546Sopenharmony_ci}
703bf215546Sopenharmony_ci
704bf215546Sopenharmony_civoid
705bf215546Sopenharmony_ciNVC0LegalizePostRA::replaceZero(Instruction *i)
706bf215546Sopenharmony_ci{
707bf215546Sopenharmony_ci   for (int s = 0; i->srcExists(s); ++s) {
708bf215546Sopenharmony_ci      if (s == 2 && i->op == OP_SUCLAMP)
709bf215546Sopenharmony_ci         continue;
710bf215546Sopenharmony_ci      if (s == 1 && i->op == OP_SHLADD)
711bf215546Sopenharmony_ci         continue;
712bf215546Sopenharmony_ci      ImmediateValue *imm = i->getSrc(s)->asImm();
713bf215546Sopenharmony_ci      if (imm) {
714bf215546Sopenharmony_ci         if (i->op == OP_SELP && s == 2) {
715bf215546Sopenharmony_ci            i->setSrc(s, pOne);
716bf215546Sopenharmony_ci            if (imm->reg.data.u64 == 0)
717bf215546Sopenharmony_ci               i->src(s).mod = i->src(s).mod ^ Modifier(NV50_IR_MOD_NOT);
718bf215546Sopenharmony_ci         } else if (imm->reg.data.u64 == 0) {
719bf215546Sopenharmony_ci            i->setSrc(s, rZero);
720bf215546Sopenharmony_ci         }
721bf215546Sopenharmony_ci      }
722bf215546Sopenharmony_ci   }
723bf215546Sopenharmony_ci}
724bf215546Sopenharmony_ci
725bf215546Sopenharmony_ci// replace CONT with BRA for single unconditional continue
726bf215546Sopenharmony_cibool
727bf215546Sopenharmony_ciNVC0LegalizePostRA::tryReplaceContWithBra(BasicBlock *bb)
728bf215546Sopenharmony_ci{
729bf215546Sopenharmony_ci   if (bb->cfg.incidentCount() != 2 || bb->getEntry()->op != OP_PRECONT)
730bf215546Sopenharmony_ci      return false;
731bf215546Sopenharmony_ci   Graph::EdgeIterator ei = bb->cfg.incident();
732bf215546Sopenharmony_ci   if (ei.getType() != Graph::Edge::BACK)
733bf215546Sopenharmony_ci      ei.next();
734bf215546Sopenharmony_ci   if (ei.getType() != Graph::Edge::BACK)
735bf215546Sopenharmony_ci      return false;
736bf215546Sopenharmony_ci   BasicBlock *contBB = BasicBlock::get(ei.getNode());
737bf215546Sopenharmony_ci
738bf215546Sopenharmony_ci   if (!contBB->getExit() || contBB->getExit()->op != OP_CONT ||
739bf215546Sopenharmony_ci       contBB->getExit()->getPredicate())
740bf215546Sopenharmony_ci      return false;
741bf215546Sopenharmony_ci   contBB->getExit()->op = OP_BRA;
742bf215546Sopenharmony_ci   bb->remove(bb->getEntry()); // delete PRECONT
743bf215546Sopenharmony_ci
744bf215546Sopenharmony_ci   ei.next();
745bf215546Sopenharmony_ci   assert(ei.end() || ei.getType() != Graph::Edge::BACK);
746bf215546Sopenharmony_ci   return true;
747bf215546Sopenharmony_ci}
748bf215546Sopenharmony_ci
749bf215546Sopenharmony_ci// replace branches to join blocks with join ops
750bf215546Sopenharmony_civoid
751bf215546Sopenharmony_ciNVC0LegalizePostRA::propagateJoin(BasicBlock *bb)
752bf215546Sopenharmony_ci{
753bf215546Sopenharmony_ci   if (bb->getEntry()->op != OP_JOIN || bb->getEntry()->asFlow()->limit)
754bf215546Sopenharmony_ci      return;
755bf215546Sopenharmony_ci   for (Graph::EdgeIterator ei = bb->cfg.incident(); !ei.end(); ei.next()) {
756bf215546Sopenharmony_ci      BasicBlock *in = BasicBlock::get(ei.getNode());
757bf215546Sopenharmony_ci      Instruction *exit = in->getExit();
758bf215546Sopenharmony_ci      if (!exit) {
759bf215546Sopenharmony_ci         in->insertTail(new FlowInstruction(func, OP_JOIN, bb));
760bf215546Sopenharmony_ci         // there should always be a terminator instruction
761bf215546Sopenharmony_ci         WARN("inserted missing terminator in BB:%i\n", in->getId());
762bf215546Sopenharmony_ci      } else
763bf215546Sopenharmony_ci      if (exit->op == OP_BRA) {
764bf215546Sopenharmony_ci         exit->op = OP_JOIN;
765bf215546Sopenharmony_ci         exit->asFlow()->limit = 1; // must-not-propagate marker
766bf215546Sopenharmony_ci      }
767bf215546Sopenharmony_ci   }
768bf215546Sopenharmony_ci   bb->remove(bb->getEntry());
769bf215546Sopenharmony_ci}
770bf215546Sopenharmony_ci
771bf215546Sopenharmony_ci// replaces instructions which would end up as f2f or i2i with faster
772bf215546Sopenharmony_ci// alternatives:
773bf215546Sopenharmony_ci//  - fabs(a)     -> fadd(0, abs a)
774bf215546Sopenharmony_ci//  - fneg(a)     -> fadd(neg 0, neg a)
775bf215546Sopenharmony_ci//  - ineg(a)     -> iadd(0, neg a)
776bf215546Sopenharmony_ci//  - fneg(abs a) -> fadd(neg 0, neg abs a)
777bf215546Sopenharmony_ci//  - sat(a)      -> sat add(0, a)
778bf215546Sopenharmony_civoid
779bf215546Sopenharmony_ciNVC0LegalizePostRA::replaceCvt(Instruction *cvt)
780bf215546Sopenharmony_ci{
781bf215546Sopenharmony_ci   if (!isFloatType(cvt->sType) && typeSizeof(cvt->sType) != 4)
782bf215546Sopenharmony_ci      return;
783bf215546Sopenharmony_ci   if (cvt->sType != cvt->dType)
784bf215546Sopenharmony_ci      return;
785bf215546Sopenharmony_ci   // we could make it work, but in this case we have optimizations disabled
786bf215546Sopenharmony_ci   // and we don't really care either way.
787bf215546Sopenharmony_ci   if (cvt->src(0).getFile() != FILE_GPR &&
788bf215546Sopenharmony_ci       cvt->src(0).getFile() != FILE_MEMORY_CONST)
789bf215546Sopenharmony_ci      return;
790bf215546Sopenharmony_ci
791bf215546Sopenharmony_ci   Modifier mod0, mod1;
792bf215546Sopenharmony_ci
793bf215546Sopenharmony_ci   switch (cvt->op) {
794bf215546Sopenharmony_ci   case OP_ABS:
795bf215546Sopenharmony_ci      if (cvt->src(0).mod)
796bf215546Sopenharmony_ci         return;
797bf215546Sopenharmony_ci      if (!isFloatType(cvt->sType))
798bf215546Sopenharmony_ci         return;
799bf215546Sopenharmony_ci      mod0 = 0;
800bf215546Sopenharmony_ci      mod1 = NV50_IR_MOD_ABS;
801bf215546Sopenharmony_ci      break;
802bf215546Sopenharmony_ci   case OP_NEG:
803bf215546Sopenharmony_ci      if (!isFloatType(cvt->sType) && cvt->src(0).mod)
804bf215546Sopenharmony_ci         return;
805bf215546Sopenharmony_ci      if (isFloatType(cvt->sType) &&
806bf215546Sopenharmony_ci          (cvt->src(0).mod && cvt->src(0).mod != Modifier(NV50_IR_MOD_ABS)))
807bf215546Sopenharmony_ci         return;
808bf215546Sopenharmony_ci
809bf215546Sopenharmony_ci      mod0 = isFloatType(cvt->sType) ? NV50_IR_MOD_NEG : 0;
810bf215546Sopenharmony_ci      mod1 = cvt->src(0).mod == Modifier(NV50_IR_MOD_ABS) ?
811bf215546Sopenharmony_ci         NV50_IR_MOD_NEG_ABS : NV50_IR_MOD_NEG;
812bf215546Sopenharmony_ci      break;
813bf215546Sopenharmony_ci   case OP_SAT:
814bf215546Sopenharmony_ci      if (!isFloatType(cvt->sType) && cvt->src(0).mod.abs())
815bf215546Sopenharmony_ci         return;
816bf215546Sopenharmony_ci      mod0 = 0;
817bf215546Sopenharmony_ci      mod1 = cvt->src(0).mod;
818bf215546Sopenharmony_ci      cvt->saturate = true;
819bf215546Sopenharmony_ci      break;
820bf215546Sopenharmony_ci   default:
821bf215546Sopenharmony_ci      return;
822bf215546Sopenharmony_ci   }
823bf215546Sopenharmony_ci
824bf215546Sopenharmony_ci   cvt->op = OP_ADD;
825bf215546Sopenharmony_ci   cvt->moveSources(0, 1);
826bf215546Sopenharmony_ci   cvt->setSrc(0, rZero);
827bf215546Sopenharmony_ci   cvt->src(0).mod = mod0;
828bf215546Sopenharmony_ci   cvt->src(1).mod = mod1;
829bf215546Sopenharmony_ci}
830bf215546Sopenharmony_ci
831bf215546Sopenharmony_cibool
832bf215546Sopenharmony_ciNVC0LegalizePostRA::visit(BasicBlock *bb)
833bf215546Sopenharmony_ci{
834bf215546Sopenharmony_ci   Instruction *i, *next;
835bf215546Sopenharmony_ci
836bf215546Sopenharmony_ci   // remove pseudo operations and non-fixed no-ops, split 64 bit operations
837bf215546Sopenharmony_ci   for (i = bb->getFirst(); i; i = next) {
838bf215546Sopenharmony_ci      next = i->next;
839bf215546Sopenharmony_ci      if (i->op == OP_EMIT || i->op == OP_RESTART) {
840bf215546Sopenharmony_ci         if (!i->getDef(0)->refCount())
841bf215546Sopenharmony_ci            i->setDef(0, NULL);
842bf215546Sopenharmony_ci         if (i->src(0).getFile() == FILE_IMMEDIATE)
843bf215546Sopenharmony_ci            i->setSrc(0, rZero); // initial value must be 0
844bf215546Sopenharmony_ci         replaceZero(i);
845bf215546Sopenharmony_ci      } else
846bf215546Sopenharmony_ci      if (i->isNop()) {
847bf215546Sopenharmony_ci         bb->remove(i);
848bf215546Sopenharmony_ci      } else
849bf215546Sopenharmony_ci      if (i->op == OP_BAR && i->subOp == NV50_IR_SUBOP_BAR_SYNC &&
850bf215546Sopenharmony_ci          prog->getType() != Program::TYPE_COMPUTE) {
851bf215546Sopenharmony_ci         // It seems like barriers are never required for tessellation since
852bf215546Sopenharmony_ci         // the warp size is 32, and there are always at most 32 tcs threads.
853bf215546Sopenharmony_ci         bb->remove(i);
854bf215546Sopenharmony_ci      } else
855bf215546Sopenharmony_ci      if (i->op == OP_LOAD && i->subOp == NV50_IR_SUBOP_LDC_IS) {
856bf215546Sopenharmony_ci         int offset = i->src(0).get()->reg.data.offset;
857bf215546Sopenharmony_ci         if (abs(offset) >= 0x10000)
858bf215546Sopenharmony_ci            i->src(0).get()->reg.fileIndex += offset >> 16;
859bf215546Sopenharmony_ci         i->src(0).get()->reg.data.offset = (int)(short)offset;
860bf215546Sopenharmony_ci      } else {
861bf215546Sopenharmony_ci         // TODO: Move this to before register allocation for operations that
862bf215546Sopenharmony_ci         // need the $c register !
863bf215546Sopenharmony_ci         if (typeSizeof(i->sType) == 8 || typeSizeof(i->dType) == 8) {
864bf215546Sopenharmony_ci            Instruction *hi;
865bf215546Sopenharmony_ci            hi = BuildUtil::split64BitOpPostRA(func, i, rZero, carry);
866bf215546Sopenharmony_ci            if (hi)
867bf215546Sopenharmony_ci               next = hi;
868bf215546Sopenharmony_ci         }
869bf215546Sopenharmony_ci
870bf215546Sopenharmony_ci         if (i->op != OP_MOV && i->op != OP_PFETCH)
871bf215546Sopenharmony_ci            replaceZero(i);
872bf215546Sopenharmony_ci
873bf215546Sopenharmony_ci         if (i->op == OP_SAT || i->op == OP_NEG || i->op == OP_ABS)
874bf215546Sopenharmony_ci            replaceCvt(i);
875bf215546Sopenharmony_ci      }
876bf215546Sopenharmony_ci   }
877bf215546Sopenharmony_ci   if (!bb->getEntry())
878bf215546Sopenharmony_ci      return true;
879bf215546Sopenharmony_ci
880bf215546Sopenharmony_ci   if (!tryReplaceContWithBra(bb))
881bf215546Sopenharmony_ci      propagateJoin(bb);
882bf215546Sopenharmony_ci
883bf215546Sopenharmony_ci   return true;
884bf215546Sopenharmony_ci}
885bf215546Sopenharmony_ci
886bf215546Sopenharmony_ciNVC0LoweringPass::NVC0LoweringPass(Program *prog) : targ(prog->getTarget()),
887bf215546Sopenharmony_ci   gpEmitAddress(NULL)
888bf215546Sopenharmony_ci{
889bf215546Sopenharmony_ci   bld.setProgram(prog);
890bf215546Sopenharmony_ci}
891bf215546Sopenharmony_ci
892bf215546Sopenharmony_cibool
893bf215546Sopenharmony_ciNVC0LoweringPass::visit(Function *fn)
894bf215546Sopenharmony_ci{
895bf215546Sopenharmony_ci   if (prog->getType() == Program::TYPE_GEOMETRY) {
896bf215546Sopenharmony_ci      assert(!strncmp(fn->getName(), "MAIN", 4));
897bf215546Sopenharmony_ci      // TODO: when we generate actual functions pass this value along somehow
898bf215546Sopenharmony_ci      bld.setPosition(BasicBlock::get(fn->cfg.getRoot()), false);
899bf215546Sopenharmony_ci      gpEmitAddress = bld.loadImm(NULL, 0)->asLValue();
900bf215546Sopenharmony_ci      if (fn->cfgExit) {
901bf215546Sopenharmony_ci         bld.setPosition(BasicBlock::get(fn->cfgExit)->getExit(), false);
902bf215546Sopenharmony_ci         if (prog->getTarget()->getChipset() >= NVISA_GV100_CHIPSET)
903bf215546Sopenharmony_ci            bld.mkOp1(OP_FINAL, TYPE_NONE, NULL, gpEmitAddress)->fixed = 1;
904bf215546Sopenharmony_ci         bld.mkMovToReg(0, gpEmitAddress);
905bf215546Sopenharmony_ci      }
906bf215546Sopenharmony_ci   }
907bf215546Sopenharmony_ci   return true;
908bf215546Sopenharmony_ci}
909bf215546Sopenharmony_ci
910bf215546Sopenharmony_cibool
911bf215546Sopenharmony_ciNVC0LoweringPass::visit(BasicBlock *bb)
912bf215546Sopenharmony_ci{
913bf215546Sopenharmony_ci   return true;
914bf215546Sopenharmony_ci}
915bf215546Sopenharmony_ci
916bf215546Sopenharmony_ciinline Value *
917bf215546Sopenharmony_ciNVC0LoweringPass::loadTexHandle(Value *ptr, unsigned int slot)
918bf215546Sopenharmony_ci{
919bf215546Sopenharmony_ci   uint8_t b = prog->driver->io.auxCBSlot;
920bf215546Sopenharmony_ci   uint32_t off = prog->driver->io.texBindBase + slot * 4;
921bf215546Sopenharmony_ci
922bf215546Sopenharmony_ci   if (ptr)
923bf215546Sopenharmony_ci      ptr = bld.mkOp2v(OP_SHL, TYPE_U32, bld.getSSA(), ptr, bld.mkImm(2));
924bf215546Sopenharmony_ci
925bf215546Sopenharmony_ci   return bld.
926bf215546Sopenharmony_ci      mkLoadv(TYPE_U32, bld.mkSymbol(FILE_MEMORY_CONST, b, TYPE_U32, off), ptr);
927bf215546Sopenharmony_ci}
928bf215546Sopenharmony_ci
929bf215546Sopenharmony_ci// move array source to first slot, convert to u16, add indirections
930bf215546Sopenharmony_cibool
931bf215546Sopenharmony_ciNVC0LoweringPass::handleTEX(TexInstruction *i)
932bf215546Sopenharmony_ci{
933bf215546Sopenharmony_ci   const int dim = i->tex.target.getDim() + i->tex.target.isCube();
934bf215546Sopenharmony_ci   const int arg = i->tex.target.getArgCount();
935bf215546Sopenharmony_ci   const int lyr = arg - (i->tex.target.isMS() ? 2 : 1);
936bf215546Sopenharmony_ci   const int chipset = prog->getTarget()->getChipset();
937bf215546Sopenharmony_ci
938bf215546Sopenharmony_ci   /* Only normalize in the non-explicit derivatives case. For explicit
939bf215546Sopenharmony_ci    * derivatives, this is handled in handleManualTXD.
940bf215546Sopenharmony_ci    */
941bf215546Sopenharmony_ci   if (i->tex.target.isCube() && i->dPdx[0].get() == NULL) {
942bf215546Sopenharmony_ci      Value *src[3], *val;
943bf215546Sopenharmony_ci      int c;
944bf215546Sopenharmony_ci      for (c = 0; c < 3; ++c)
945bf215546Sopenharmony_ci         src[c] = bld.mkOp1v(OP_ABS, TYPE_F32, bld.getSSA(), i->getSrc(c));
946bf215546Sopenharmony_ci      val = bld.getScratch();
947bf215546Sopenharmony_ci      bld.mkOp2(OP_MAX, TYPE_F32, val, src[0], src[1]);
948bf215546Sopenharmony_ci      bld.mkOp2(OP_MAX, TYPE_F32, val, src[2], val);
949bf215546Sopenharmony_ci      bld.mkOp1(OP_RCP, TYPE_F32, val, val);
950bf215546Sopenharmony_ci      for (c = 0; c < 3; ++c) {
951bf215546Sopenharmony_ci         i->setSrc(c, bld.mkOp2v(OP_MUL, TYPE_F32, bld.getSSA(),
952bf215546Sopenharmony_ci                                 i->getSrc(c), val));
953bf215546Sopenharmony_ci      }
954bf215546Sopenharmony_ci   }
955bf215546Sopenharmony_ci
956bf215546Sopenharmony_ci   // Arguments to the TEX instruction are a little insane. Even though the
957bf215546Sopenharmony_ci   // encoding is identical between SM20 and SM30, the arguments mean
958bf215546Sopenharmony_ci   // different things between Fermi and Kepler+. A lot of arguments are
959bf215546Sopenharmony_ci   // optional based on flags passed to the instruction. This summarizes the
960bf215546Sopenharmony_ci   // order of things.
961bf215546Sopenharmony_ci   //
962bf215546Sopenharmony_ci   // Fermi:
963bf215546Sopenharmony_ci   //  array/indirect
964bf215546Sopenharmony_ci   //  coords
965bf215546Sopenharmony_ci   //  sample
966bf215546Sopenharmony_ci   //  lod bias
967bf215546Sopenharmony_ci   //  depth compare
968bf215546Sopenharmony_ci   //  offsets:
969bf215546Sopenharmony_ci   //    - tg4: 8 bits each, either 2 (1 offset reg) or 8 (2 offset reg)
970bf215546Sopenharmony_ci   //    - other: 4 bits each, single reg
971bf215546Sopenharmony_ci   //
972bf215546Sopenharmony_ci   // Kepler+:
973bf215546Sopenharmony_ci   //  indirect handle
974bf215546Sopenharmony_ci   //  array (+ offsets for txd in upper 16 bits)
975bf215546Sopenharmony_ci   //  coords
976bf215546Sopenharmony_ci   //  sample
977bf215546Sopenharmony_ci   //  lod bias
978bf215546Sopenharmony_ci   //  depth compare
979bf215546Sopenharmony_ci   //  offsets (same as fermi, except txd which takes it with array)
980bf215546Sopenharmony_ci   //
981bf215546Sopenharmony_ci   // Maxwell (tex):
982bf215546Sopenharmony_ci   //  array
983bf215546Sopenharmony_ci   //  coords
984bf215546Sopenharmony_ci   //  indirect handle
985bf215546Sopenharmony_ci   //  sample
986bf215546Sopenharmony_ci   //  lod bias
987bf215546Sopenharmony_ci   //  depth compare
988bf215546Sopenharmony_ci   //  offsets
989bf215546Sopenharmony_ci   //
990bf215546Sopenharmony_ci   // Maxwell (txd):
991bf215546Sopenharmony_ci   //  indirect handle
992bf215546Sopenharmony_ci   //  coords
993bf215546Sopenharmony_ci   //  array + offsets
994bf215546Sopenharmony_ci   //  derivatives
995bf215546Sopenharmony_ci
996bf215546Sopenharmony_ci   if (chipset >= NVISA_GK104_CHIPSET) {
997bf215546Sopenharmony_ci      if (i->tex.rIndirectSrc >= 0 || i->tex.sIndirectSrc >= 0) {
998bf215546Sopenharmony_ci         // XXX this ignores tsc, and assumes a 1:1 mapping
999bf215546Sopenharmony_ci         assert(i->tex.rIndirectSrc >= 0);
1000bf215546Sopenharmony_ci         if (!i->tex.bindless) {
1001bf215546Sopenharmony_ci            Value *hnd = loadTexHandle(i->getIndirectR(), i->tex.r);
1002bf215546Sopenharmony_ci            i->tex.r = 0xff;
1003bf215546Sopenharmony_ci            i->tex.s = 0x1f;
1004bf215546Sopenharmony_ci            i->setIndirectR(hnd);
1005bf215546Sopenharmony_ci         }
1006bf215546Sopenharmony_ci         i->setIndirectS(NULL);
1007bf215546Sopenharmony_ci      } else if (i->tex.r == i->tex.s || i->op == OP_TXF) {
1008bf215546Sopenharmony_ci         if (i->tex.r == 0xffff)
1009bf215546Sopenharmony_ci            i->tex.r = prog->driver->io.fbtexBindBase / 4;
1010bf215546Sopenharmony_ci         else
1011bf215546Sopenharmony_ci            i->tex.r += prog->driver->io.texBindBase / 4;
1012bf215546Sopenharmony_ci         i->tex.s  = 0; // only a single cX[] value possible here
1013bf215546Sopenharmony_ci      } else {
1014bf215546Sopenharmony_ci         Value *hnd = bld.getScratch();
1015bf215546Sopenharmony_ci         Value *rHnd = loadTexHandle(NULL, i->tex.r);
1016bf215546Sopenharmony_ci         Value *sHnd = loadTexHandle(NULL, i->tex.s);
1017bf215546Sopenharmony_ci
1018bf215546Sopenharmony_ci         bld.mkOp3(OP_INSBF, TYPE_U32, hnd, rHnd, bld.mkImm(0x1400), sHnd);
1019bf215546Sopenharmony_ci
1020bf215546Sopenharmony_ci         i->tex.r = 0; // not used for indirect tex
1021bf215546Sopenharmony_ci         i->tex.s = 0;
1022bf215546Sopenharmony_ci         i->setIndirectR(hnd);
1023bf215546Sopenharmony_ci      }
1024bf215546Sopenharmony_ci      if (i->tex.target.isArray()) {
1025bf215546Sopenharmony_ci         LValue *layer = new_LValue(func, FILE_GPR);
1026bf215546Sopenharmony_ci         Value *src = i->getSrc(lyr);
1027bf215546Sopenharmony_ci         const int sat = (i->op == OP_TXF) ? 1 : 0;
1028bf215546Sopenharmony_ci         DataType sTy = (i->op == OP_TXF) ? TYPE_U32 : TYPE_F32;
1029bf215546Sopenharmony_ci         bld.mkCvt(OP_CVT, TYPE_U16, layer, sTy, src)->saturate = sat;
1030bf215546Sopenharmony_ci         if (i->op != OP_TXD || chipset < NVISA_GM107_CHIPSET) {
1031bf215546Sopenharmony_ci            for (int s = dim; s >= 1; --s)
1032bf215546Sopenharmony_ci               i->setSrc(s, i->getSrc(s - 1));
1033bf215546Sopenharmony_ci            i->setSrc(0, layer);
1034bf215546Sopenharmony_ci         } else {
1035bf215546Sopenharmony_ci            i->setSrc(dim, layer);
1036bf215546Sopenharmony_ci         }
1037bf215546Sopenharmony_ci      }
1038bf215546Sopenharmony_ci      // Move the indirect reference to the first place
1039bf215546Sopenharmony_ci      if (i->tex.rIndirectSrc >= 0 && (
1040bf215546Sopenharmony_ci                i->op == OP_TXD || chipset < NVISA_GM107_CHIPSET)) {
1041bf215546Sopenharmony_ci         Value *hnd = i->getIndirectR();
1042bf215546Sopenharmony_ci
1043bf215546Sopenharmony_ci         i->setIndirectR(NULL);
1044bf215546Sopenharmony_ci         i->moveSources(0, 1);
1045bf215546Sopenharmony_ci         i->setSrc(0, hnd);
1046bf215546Sopenharmony_ci         i->tex.rIndirectSrc = 0;
1047bf215546Sopenharmony_ci         i->tex.sIndirectSrc = -1;
1048bf215546Sopenharmony_ci      }
1049bf215546Sopenharmony_ci      // Move the indirect reference to right after the coords
1050bf215546Sopenharmony_ci      else if (i->tex.rIndirectSrc >= 0 && chipset >= NVISA_GM107_CHIPSET) {
1051bf215546Sopenharmony_ci         Value *hnd = i->getIndirectR();
1052bf215546Sopenharmony_ci
1053bf215546Sopenharmony_ci         i->setIndirectR(NULL);
1054bf215546Sopenharmony_ci         i->moveSources(arg, 1);
1055bf215546Sopenharmony_ci         i->setSrc(arg, hnd);
1056bf215546Sopenharmony_ci         i->tex.rIndirectSrc = 0;
1057bf215546Sopenharmony_ci         i->tex.sIndirectSrc = -1;
1058bf215546Sopenharmony_ci      }
1059bf215546Sopenharmony_ci   } else
1060bf215546Sopenharmony_ci   // (nvc0) generate and move the tsc/tic/array source to the front
1061bf215546Sopenharmony_ci   if (i->tex.target.isArray() || i->tex.rIndirectSrc >= 0 || i->tex.sIndirectSrc >= 0) {
1062bf215546Sopenharmony_ci      LValue *src = new_LValue(func, FILE_GPR); // 0xttxsaaaa
1063bf215546Sopenharmony_ci
1064bf215546Sopenharmony_ci      Value *ticRel = i->getIndirectR();
1065bf215546Sopenharmony_ci      Value *tscRel = i->getIndirectS();
1066bf215546Sopenharmony_ci
1067bf215546Sopenharmony_ci      if (i->tex.r == 0xffff) {
1068bf215546Sopenharmony_ci         i->tex.r = 0x20;
1069bf215546Sopenharmony_ci         i->tex.s = 0x10;
1070bf215546Sopenharmony_ci      }
1071bf215546Sopenharmony_ci
1072bf215546Sopenharmony_ci      if (ticRel) {
1073bf215546Sopenharmony_ci         i->setSrc(i->tex.rIndirectSrc, NULL);
1074bf215546Sopenharmony_ci         if (i->tex.r)
1075bf215546Sopenharmony_ci            ticRel = bld.mkOp2v(OP_ADD, TYPE_U32, bld.getScratch(),
1076bf215546Sopenharmony_ci                                ticRel, bld.mkImm(i->tex.r));
1077bf215546Sopenharmony_ci      }
1078bf215546Sopenharmony_ci      if (tscRel) {
1079bf215546Sopenharmony_ci         i->setSrc(i->tex.sIndirectSrc, NULL);
1080bf215546Sopenharmony_ci         if (i->tex.s)
1081bf215546Sopenharmony_ci            tscRel = bld.mkOp2v(OP_ADD, TYPE_U32, bld.getScratch(),
1082bf215546Sopenharmony_ci                                tscRel, bld.mkImm(i->tex.s));
1083bf215546Sopenharmony_ci      }
1084bf215546Sopenharmony_ci
1085bf215546Sopenharmony_ci      Value *arrayIndex = i->tex.target.isArray() ? i->getSrc(lyr) : NULL;
1086bf215546Sopenharmony_ci      if (arrayIndex) {
1087bf215546Sopenharmony_ci         for (int s = dim; s >= 1; --s)
1088bf215546Sopenharmony_ci            i->setSrc(s, i->getSrc(s - 1));
1089bf215546Sopenharmony_ci         i->setSrc(0, arrayIndex);
1090bf215546Sopenharmony_ci      } else {
1091bf215546Sopenharmony_ci         i->moveSources(0, 1);
1092bf215546Sopenharmony_ci      }
1093bf215546Sopenharmony_ci
1094bf215546Sopenharmony_ci      if (arrayIndex) {
1095bf215546Sopenharmony_ci         int sat = (i->op == OP_TXF) ? 1 : 0;
1096bf215546Sopenharmony_ci         DataType sTy = (i->op == OP_TXF) ? TYPE_U32 : TYPE_F32;
1097bf215546Sopenharmony_ci         bld.mkCvt(OP_CVT, TYPE_U16, src, sTy, arrayIndex)->saturate = sat;
1098bf215546Sopenharmony_ci      } else {
1099bf215546Sopenharmony_ci         bld.loadImm(src, 0);
1100bf215546Sopenharmony_ci      }
1101bf215546Sopenharmony_ci
1102bf215546Sopenharmony_ci      if (ticRel)
1103bf215546Sopenharmony_ci         bld.mkOp3(OP_INSBF, TYPE_U32, src, ticRel, bld.mkImm(0x0917), src);
1104bf215546Sopenharmony_ci      if (tscRel)
1105bf215546Sopenharmony_ci         bld.mkOp3(OP_INSBF, TYPE_U32, src, tscRel, bld.mkImm(0x0710), src);
1106bf215546Sopenharmony_ci
1107bf215546Sopenharmony_ci      i->setSrc(0, src);
1108bf215546Sopenharmony_ci   }
1109bf215546Sopenharmony_ci
1110bf215546Sopenharmony_ci   // For nvc0, the sample id has to be in the second operand, as the offset
1111bf215546Sopenharmony_ci   // does. Right now we don't know how to pass both in, and this case can't
1112bf215546Sopenharmony_ci   // happen with OpenGL. On nve0, the sample id is part of the texture
1113bf215546Sopenharmony_ci   // coordinate argument.
1114bf215546Sopenharmony_ci   assert(chipset >= NVISA_GK104_CHIPSET ||
1115bf215546Sopenharmony_ci          !i->tex.useOffsets || !i->tex.target.isMS());
1116bf215546Sopenharmony_ci
1117bf215546Sopenharmony_ci   // offset is between lod and dc
1118bf215546Sopenharmony_ci   if (i->tex.useOffsets) {
1119bf215546Sopenharmony_ci      int n, c;
1120bf215546Sopenharmony_ci      int s = i->srcCount(0xff, true);
1121bf215546Sopenharmony_ci      if (i->op != OP_TXD || chipset < NVISA_GK104_CHIPSET) {
1122bf215546Sopenharmony_ci         if (i->tex.target.isShadow())
1123bf215546Sopenharmony_ci            s--;
1124bf215546Sopenharmony_ci         if (i->srcExists(s)) // move potential predicate out of the way
1125bf215546Sopenharmony_ci            i->moveSources(s, 1);
1126bf215546Sopenharmony_ci         if (i->tex.useOffsets == 4 && i->srcExists(s + 1))
1127bf215546Sopenharmony_ci            i->moveSources(s + 1, 1);
1128bf215546Sopenharmony_ci      }
1129bf215546Sopenharmony_ci      if (i->op == OP_TXG) {
1130bf215546Sopenharmony_ci         // Either there is 1 offset, which goes into the 2 low bytes of the
1131bf215546Sopenharmony_ci         // first source, or there are 4 offsets, which go into 2 sources (8
1132bf215546Sopenharmony_ci         // values, 1 byte each).
1133bf215546Sopenharmony_ci         Value *offs[2] = {NULL, NULL};
1134bf215546Sopenharmony_ci         for (n = 0; n < i->tex.useOffsets; n++) {
1135bf215546Sopenharmony_ci            for (c = 0; c < 2; ++c) {
1136bf215546Sopenharmony_ci               if ((n % 2) == 0 && c == 0)
1137bf215546Sopenharmony_ci                  bld.mkMov(offs[n / 2] = bld.getScratch(), i->offset[n][c].get());
1138bf215546Sopenharmony_ci               else
1139bf215546Sopenharmony_ci                  bld.mkOp3(OP_INSBF, TYPE_U32,
1140bf215546Sopenharmony_ci                            offs[n / 2],
1141bf215546Sopenharmony_ci                            i->offset[n][c].get(),
1142bf215546Sopenharmony_ci                            bld.mkImm(0x800 | ((n * 16 + c * 8) % 32)),
1143bf215546Sopenharmony_ci                            offs[n / 2]);
1144bf215546Sopenharmony_ci            }
1145bf215546Sopenharmony_ci         }
1146bf215546Sopenharmony_ci         i->setSrc(s, offs[0]);
1147bf215546Sopenharmony_ci         if (offs[1])
1148bf215546Sopenharmony_ci            i->setSrc(s + 1, offs[1]);
1149bf215546Sopenharmony_ci      } else {
1150bf215546Sopenharmony_ci         unsigned imm = 0;
1151bf215546Sopenharmony_ci         assert(i->tex.useOffsets == 1);
1152bf215546Sopenharmony_ci         for (c = 0; c < 3; ++c) {
1153bf215546Sopenharmony_ci            ImmediateValue val;
1154bf215546Sopenharmony_ci            if (!i->offset[0][c].getImmediate(val))
1155bf215546Sopenharmony_ci               assert(!"non-immediate offset passed to non-TXG");
1156bf215546Sopenharmony_ci            imm |= (val.reg.data.u32 & 0xf) << (c * 4);
1157bf215546Sopenharmony_ci         }
1158bf215546Sopenharmony_ci         if (i->op == OP_TXD && chipset >= NVISA_GK104_CHIPSET) {
1159bf215546Sopenharmony_ci            // The offset goes into the upper 16 bits of the array index. So
1160bf215546Sopenharmony_ci            // create it if it's not already there, and INSBF it if it already
1161bf215546Sopenharmony_ci            // is.
1162bf215546Sopenharmony_ci            s = (i->tex.rIndirectSrc >= 0) ? 1 : 0;
1163bf215546Sopenharmony_ci            if (chipset >= NVISA_GM107_CHIPSET)
1164bf215546Sopenharmony_ci               s += dim;
1165bf215546Sopenharmony_ci            if (i->tex.target.isArray()) {
1166bf215546Sopenharmony_ci               Value *offset = bld.getScratch();
1167bf215546Sopenharmony_ci               bld.mkOp3(OP_INSBF, TYPE_U32, offset,
1168bf215546Sopenharmony_ci                         bld.loadImm(NULL, imm), bld.mkImm(0xc10),
1169bf215546Sopenharmony_ci                         i->getSrc(s));
1170bf215546Sopenharmony_ci               i->setSrc(s, offset);
1171bf215546Sopenharmony_ci            } else {
1172bf215546Sopenharmony_ci               i->moveSources(s, 1);
1173bf215546Sopenharmony_ci               i->setSrc(s, bld.loadImm(NULL, imm << 16));
1174bf215546Sopenharmony_ci            }
1175bf215546Sopenharmony_ci         } else {
1176bf215546Sopenharmony_ci            i->setSrc(s, bld.loadImm(NULL, imm));
1177bf215546Sopenharmony_ci         }
1178bf215546Sopenharmony_ci      }
1179bf215546Sopenharmony_ci   }
1180bf215546Sopenharmony_ci
1181bf215546Sopenharmony_ci   return true;
1182bf215546Sopenharmony_ci}
1183bf215546Sopenharmony_ci
1184bf215546Sopenharmony_cibool
1185bf215546Sopenharmony_ciNVC0LoweringPass::handleManualTXD(TexInstruction *i)
1186bf215546Sopenharmony_ci{
1187bf215546Sopenharmony_ci   // Always done from the l0 perspective. This is the way that NVIDIA's
1188bf215546Sopenharmony_ci   // driver does it, and doing it from the "current" lane's perspective
1189bf215546Sopenharmony_ci   // doesn't seem to always work for reasons that aren't altogether clear,
1190bf215546Sopenharmony_ci   // even in frag shaders.
1191bf215546Sopenharmony_ci   //
1192bf215546Sopenharmony_ci   // Note that we must move not only the coordinates into lane0, but also all
1193bf215546Sopenharmony_ci   // ancillary arguments, like array indices and depth compare as they may
1194bf215546Sopenharmony_ci   // differ between lanes. Offsets for TXD are supposed to be uniform, so we
1195bf215546Sopenharmony_ci   // leave them alone.
1196bf215546Sopenharmony_ci   static const uint8_t qOps[2] =
1197bf215546Sopenharmony_ci      { QUADOP(MOV2, ADD,  MOV2, ADD),  QUADOP(MOV2, MOV2, ADD,  ADD) };
1198bf215546Sopenharmony_ci
1199bf215546Sopenharmony_ci   Value *def[4][4];
1200bf215546Sopenharmony_ci   Value *crd[3], *arr[2], *shadow;
1201bf215546Sopenharmony_ci   Instruction *tex;
1202bf215546Sopenharmony_ci   Value *zero = bld.loadImm(bld.getSSA(), 0);
1203bf215546Sopenharmony_ci   int l, c;
1204bf215546Sopenharmony_ci   const int dim = i->tex.target.getDim() + i->tex.target.isCube();
1205bf215546Sopenharmony_ci
1206bf215546Sopenharmony_ci   // This function is invoked after handleTEX lowering, so we have to expect
1207bf215546Sopenharmony_ci   // the arguments in the order that the hw wants them. For Fermi, array and
1208bf215546Sopenharmony_ci   // indirect are both in the leading arg, while for Kepler, array and
1209bf215546Sopenharmony_ci   // indirect are separate (and both precede the coordinates). Maxwell is
1210bf215546Sopenharmony_ci   // handled in a separate function.
1211bf215546Sopenharmony_ci   int array;
1212bf215546Sopenharmony_ci   if (targ->getChipset() < NVISA_GK104_CHIPSET)
1213bf215546Sopenharmony_ci      array = i->tex.target.isArray() || i->tex.rIndirectSrc >= 0;
1214bf215546Sopenharmony_ci   else
1215bf215546Sopenharmony_ci      array = i->tex.target.isArray() + (i->tex.rIndirectSrc >= 0);
1216bf215546Sopenharmony_ci
1217bf215546Sopenharmony_ci   i->op = OP_TEX; // no need to clone dPdx/dPdy later
1218bf215546Sopenharmony_ci
1219bf215546Sopenharmony_ci   for (c = 0; c < dim; ++c)
1220bf215546Sopenharmony_ci      crd[c] = bld.getScratch();
1221bf215546Sopenharmony_ci   for (c = 0; c < array; ++c)
1222bf215546Sopenharmony_ci      arr[c] = bld.getScratch();
1223bf215546Sopenharmony_ci   shadow = bld.getScratch();
1224bf215546Sopenharmony_ci
1225bf215546Sopenharmony_ci   for (l = 0; l < 4; ++l) {
1226bf215546Sopenharmony_ci      Value *src[3], *val;
1227bf215546Sopenharmony_ci
1228bf215546Sopenharmony_ci      bld.mkOp(OP_QUADON, TYPE_NONE, NULL);
1229bf215546Sopenharmony_ci      // we're using the texture result from lane 0 in all cases, so make sure
1230bf215546Sopenharmony_ci      // that lane 0 is pointing at the proper array index, indirect value,
1231bf215546Sopenharmony_ci      // and depth compare.
1232bf215546Sopenharmony_ci      if (l != 0) {
1233bf215546Sopenharmony_ci         for (c = 0; c < array; ++c)
1234bf215546Sopenharmony_ci            bld.mkQuadop(0x00, arr[c], l, i->getSrc(c), zero);
1235bf215546Sopenharmony_ci         if (i->tex.target.isShadow()) {
1236bf215546Sopenharmony_ci            // The next argument after coords is the depth compare
1237bf215546Sopenharmony_ci            bld.mkQuadop(0x00, shadow, l, i->getSrc(array + dim), zero);
1238bf215546Sopenharmony_ci         }
1239bf215546Sopenharmony_ci      }
1240bf215546Sopenharmony_ci      // mov position coordinates from lane l to all lanes
1241bf215546Sopenharmony_ci      for (c = 0; c < dim; ++c)
1242bf215546Sopenharmony_ci         bld.mkQuadop(0x00, crd[c], l, i->getSrc(c + array), zero);
1243bf215546Sopenharmony_ci      // add dPdx from lane l to lanes dx
1244bf215546Sopenharmony_ci      for (c = 0; c < dim; ++c)
1245bf215546Sopenharmony_ci         bld.mkQuadop(qOps[0], crd[c], l, i->dPdx[c].get(), crd[c]);
1246bf215546Sopenharmony_ci      // add dPdy from lane l to lanes dy
1247bf215546Sopenharmony_ci      for (c = 0; c < dim; ++c)
1248bf215546Sopenharmony_ci         bld.mkQuadop(qOps[1], crd[c], l, i->dPdy[c].get(), crd[c]);
1249bf215546Sopenharmony_ci      // normalize cube coordinates
1250bf215546Sopenharmony_ci      if (i->tex.target.isCube()) {
1251bf215546Sopenharmony_ci         for (c = 0; c < 3; ++c)
1252bf215546Sopenharmony_ci            src[c] = bld.mkOp1v(OP_ABS, TYPE_F32, bld.getSSA(), crd[c]);
1253bf215546Sopenharmony_ci         val = bld.getScratch();
1254bf215546Sopenharmony_ci         bld.mkOp2(OP_MAX, TYPE_F32, val, src[0], src[1]);
1255bf215546Sopenharmony_ci         bld.mkOp2(OP_MAX, TYPE_F32, val, src[2], val);
1256bf215546Sopenharmony_ci         bld.mkOp1(OP_RCP, TYPE_F32, val, val);
1257bf215546Sopenharmony_ci         for (c = 0; c < 3; ++c)
1258bf215546Sopenharmony_ci            src[c] = bld.mkOp2v(OP_MUL, TYPE_F32, bld.getSSA(), crd[c], val);
1259bf215546Sopenharmony_ci      } else {
1260bf215546Sopenharmony_ci         for (c = 0; c < dim; ++c)
1261bf215546Sopenharmony_ci            src[c] = crd[c];
1262bf215546Sopenharmony_ci      }
1263bf215546Sopenharmony_ci      // texture
1264bf215546Sopenharmony_ci      bld.insert(tex = cloneForward(func, i));
1265bf215546Sopenharmony_ci      if (l != 0) {
1266bf215546Sopenharmony_ci         for (c = 0; c < array; ++c)
1267bf215546Sopenharmony_ci            tex->setSrc(c, arr[c]);
1268bf215546Sopenharmony_ci         if (i->tex.target.isShadow())
1269bf215546Sopenharmony_ci            tex->setSrc(array + dim, shadow);
1270bf215546Sopenharmony_ci      }
1271bf215546Sopenharmony_ci      for (c = 0; c < dim; ++c)
1272bf215546Sopenharmony_ci         tex->setSrc(c + array, src[c]);
1273bf215546Sopenharmony_ci      // broadcast results from lane 0 to all lanes so that the moves *into*
1274bf215546Sopenharmony_ci      // the target lane pick up the proper value.
1275bf215546Sopenharmony_ci      if (l != 0)
1276bf215546Sopenharmony_ci         for (c = 0; i->defExists(c); ++c)
1277bf215546Sopenharmony_ci            bld.mkQuadop(0x00, tex->getDef(c), 0, tex->getDef(c), zero);
1278bf215546Sopenharmony_ci      bld.mkOp(OP_QUADPOP, TYPE_NONE, NULL);
1279bf215546Sopenharmony_ci
1280bf215546Sopenharmony_ci      // save results
1281bf215546Sopenharmony_ci      for (c = 0; i->defExists(c); ++c) {
1282bf215546Sopenharmony_ci         Instruction *mov;
1283bf215546Sopenharmony_ci         def[c][l] = bld.getSSA();
1284bf215546Sopenharmony_ci         mov = bld.mkMov(def[c][l], tex->getDef(c));
1285bf215546Sopenharmony_ci         mov->fixed = 1;
1286bf215546Sopenharmony_ci         mov->lanes = 1 << l;
1287bf215546Sopenharmony_ci      }
1288bf215546Sopenharmony_ci   }
1289bf215546Sopenharmony_ci
1290bf215546Sopenharmony_ci   for (c = 0; i->defExists(c); ++c) {
1291bf215546Sopenharmony_ci      Instruction *u = bld.mkOp(OP_UNION, TYPE_U32, i->getDef(c));
1292bf215546Sopenharmony_ci      for (l = 0; l < 4; ++l)
1293bf215546Sopenharmony_ci         u->setSrc(l, def[c][l]);
1294bf215546Sopenharmony_ci   }
1295bf215546Sopenharmony_ci
1296bf215546Sopenharmony_ci   i->bb->remove(i);
1297bf215546Sopenharmony_ci   return true;
1298bf215546Sopenharmony_ci}
1299bf215546Sopenharmony_ci
1300bf215546Sopenharmony_cibool
1301bf215546Sopenharmony_ciNVC0LoweringPass::handleTXD(TexInstruction *txd)
1302bf215546Sopenharmony_ci{
1303bf215546Sopenharmony_ci   int dim = txd->tex.target.getDim() + txd->tex.target.isCube();
1304bf215546Sopenharmony_ci   unsigned arg = txd->tex.target.getArgCount();
1305bf215546Sopenharmony_ci   unsigned expected_args = arg;
1306bf215546Sopenharmony_ci   const int chipset = prog->getTarget()->getChipset();
1307bf215546Sopenharmony_ci
1308bf215546Sopenharmony_ci   if (chipset >= NVISA_GK104_CHIPSET) {
1309bf215546Sopenharmony_ci      if (!txd->tex.target.isArray() && txd->tex.useOffsets)
1310bf215546Sopenharmony_ci         expected_args++;
1311bf215546Sopenharmony_ci      if (txd->tex.rIndirectSrc >= 0 || txd->tex.sIndirectSrc >= 0)
1312bf215546Sopenharmony_ci         expected_args++;
1313bf215546Sopenharmony_ci   } else {
1314bf215546Sopenharmony_ci      if (txd->tex.useOffsets)
1315bf215546Sopenharmony_ci         expected_args++;
1316bf215546Sopenharmony_ci      if (!txd->tex.target.isArray() && (
1317bf215546Sopenharmony_ci                txd->tex.rIndirectSrc >= 0 || txd->tex.sIndirectSrc >= 0))
1318bf215546Sopenharmony_ci         expected_args++;
1319bf215546Sopenharmony_ci   }
1320bf215546Sopenharmony_ci
1321bf215546Sopenharmony_ci   if (expected_args > 4 ||
1322bf215546Sopenharmony_ci       dim > 2 ||
1323bf215546Sopenharmony_ci       txd->tex.target.isShadow())
1324bf215546Sopenharmony_ci      txd->op = OP_TEX;
1325bf215546Sopenharmony_ci
1326bf215546Sopenharmony_ci   handleTEX(txd);
1327bf215546Sopenharmony_ci   while (txd->srcExists(arg))
1328bf215546Sopenharmony_ci      ++arg;
1329bf215546Sopenharmony_ci
1330bf215546Sopenharmony_ci   txd->tex.derivAll = true;
1331bf215546Sopenharmony_ci   if (txd->op == OP_TEX)
1332bf215546Sopenharmony_ci      return handleManualTXD(txd);
1333bf215546Sopenharmony_ci
1334bf215546Sopenharmony_ci   assert(arg == expected_args);
1335bf215546Sopenharmony_ci   for (int c = 0; c < dim; ++c) {
1336bf215546Sopenharmony_ci      txd->setSrc(arg + c * 2 + 0, txd->dPdx[c]);
1337bf215546Sopenharmony_ci      txd->setSrc(arg + c * 2 + 1, txd->dPdy[c]);
1338bf215546Sopenharmony_ci      txd->dPdx[c].set(NULL);
1339bf215546Sopenharmony_ci      txd->dPdy[c].set(NULL);
1340bf215546Sopenharmony_ci   }
1341bf215546Sopenharmony_ci
1342bf215546Sopenharmony_ci   // In this case we have fewer than 4 "real" arguments, which means that
1343bf215546Sopenharmony_ci   // handleTEX didn't apply any padding. However we have to make sure that
1344bf215546Sopenharmony_ci   // the second "group" of arguments still gets padded up to 4.
1345bf215546Sopenharmony_ci   if (chipset >= NVISA_GK104_CHIPSET) {
1346bf215546Sopenharmony_ci      int s = arg + 2 * dim;
1347bf215546Sopenharmony_ci      if (s >= 4 && s < 7) {
1348bf215546Sopenharmony_ci         if (txd->srcExists(s)) // move potential predicate out of the way
1349bf215546Sopenharmony_ci            txd->moveSources(s, 7 - s);
1350bf215546Sopenharmony_ci         while (s < 7)
1351bf215546Sopenharmony_ci            txd->setSrc(s++, bld.loadImm(NULL, 0));
1352bf215546Sopenharmony_ci      }
1353bf215546Sopenharmony_ci   }
1354bf215546Sopenharmony_ci
1355bf215546Sopenharmony_ci   return true;
1356bf215546Sopenharmony_ci}
1357bf215546Sopenharmony_ci
1358bf215546Sopenharmony_cibool
1359bf215546Sopenharmony_ciNVC0LoweringPass::handleTXQ(TexInstruction *txq)
1360bf215546Sopenharmony_ci{
1361bf215546Sopenharmony_ci   const int chipset = prog->getTarget()->getChipset();
1362bf215546Sopenharmony_ci   if (chipset >= NVISA_GK104_CHIPSET && txq->tex.rIndirectSrc < 0)
1363bf215546Sopenharmony_ci      txq->tex.r += prog->driver->io.texBindBase / 4;
1364bf215546Sopenharmony_ci
1365bf215546Sopenharmony_ci   if (txq->tex.rIndirectSrc < 0)
1366bf215546Sopenharmony_ci      return true;
1367bf215546Sopenharmony_ci
1368bf215546Sopenharmony_ci   Value *ticRel = txq->getIndirectR();
1369bf215546Sopenharmony_ci
1370bf215546Sopenharmony_ci   txq->setIndirectS(NULL);
1371bf215546Sopenharmony_ci   txq->tex.sIndirectSrc = -1;
1372bf215546Sopenharmony_ci
1373bf215546Sopenharmony_ci   assert(ticRel);
1374bf215546Sopenharmony_ci
1375bf215546Sopenharmony_ci   if (chipset < NVISA_GK104_CHIPSET) {
1376bf215546Sopenharmony_ci      LValue *src = new_LValue(func, FILE_GPR); // 0xttxsaaaa
1377bf215546Sopenharmony_ci
1378bf215546Sopenharmony_ci      txq->setSrc(txq->tex.rIndirectSrc, NULL);
1379bf215546Sopenharmony_ci      if (txq->tex.r)
1380bf215546Sopenharmony_ci         ticRel = bld.mkOp2v(OP_ADD, TYPE_U32, bld.getScratch(),
1381bf215546Sopenharmony_ci                             ticRel, bld.mkImm(txq->tex.r));
1382bf215546Sopenharmony_ci
1383bf215546Sopenharmony_ci      bld.mkOp2(OP_SHL, TYPE_U32, src, ticRel, bld.mkImm(0x17));
1384bf215546Sopenharmony_ci
1385bf215546Sopenharmony_ci      txq->moveSources(0, 1);
1386bf215546Sopenharmony_ci      txq->setSrc(0, src);
1387bf215546Sopenharmony_ci   } else {
1388bf215546Sopenharmony_ci      Value *hnd = loadTexHandle(txq->getIndirectR(), txq->tex.r);
1389bf215546Sopenharmony_ci      txq->tex.r = 0xff;
1390bf215546Sopenharmony_ci      txq->tex.s = 0x1f;
1391bf215546Sopenharmony_ci
1392bf215546Sopenharmony_ci      txq->setIndirectR(NULL);
1393bf215546Sopenharmony_ci      txq->moveSources(0, 1);
1394bf215546Sopenharmony_ci      txq->setSrc(0, hnd);
1395bf215546Sopenharmony_ci      txq->tex.rIndirectSrc = 0;
1396bf215546Sopenharmony_ci   }
1397bf215546Sopenharmony_ci
1398bf215546Sopenharmony_ci   return true;
1399bf215546Sopenharmony_ci}
1400bf215546Sopenharmony_ci
1401bf215546Sopenharmony_cibool
1402bf215546Sopenharmony_ciNVC0LoweringPass::handleTXLQ(TexInstruction *i)
1403bf215546Sopenharmony_ci{
1404bf215546Sopenharmony_ci   /* The outputs are inverted compared to what the TGSI instruction
1405bf215546Sopenharmony_ci    * expects. Take that into account in the mask.
1406bf215546Sopenharmony_ci    */
1407bf215546Sopenharmony_ci   assert((i->tex.mask & ~3) == 0);
1408bf215546Sopenharmony_ci   if (i->tex.mask == 1)
1409bf215546Sopenharmony_ci      i->tex.mask = 2;
1410bf215546Sopenharmony_ci   else if (i->tex.mask == 2)
1411bf215546Sopenharmony_ci      i->tex.mask = 1;
1412bf215546Sopenharmony_ci   handleTEX(i);
1413bf215546Sopenharmony_ci   bld.setPosition(i, true);
1414bf215546Sopenharmony_ci
1415bf215546Sopenharmony_ci   /* The returned values are not quite what we want:
1416bf215546Sopenharmony_ci    * (a) convert from s16/u16 to f32
1417bf215546Sopenharmony_ci    * (b) multiply by 1/256
1418bf215546Sopenharmony_ci    */
1419bf215546Sopenharmony_ci   for (int def = 0; def < 2; ++def) {
1420bf215546Sopenharmony_ci      if (!i->defExists(def))
1421bf215546Sopenharmony_ci         continue;
1422bf215546Sopenharmony_ci      enum DataType type = TYPE_S16;
1423bf215546Sopenharmony_ci      if (i->tex.mask == 2 || def > 0)
1424bf215546Sopenharmony_ci         type = TYPE_U16;
1425bf215546Sopenharmony_ci      bld.mkCvt(OP_CVT, TYPE_F32, i->getDef(def), type, i->getDef(def));
1426bf215546Sopenharmony_ci      bld.mkOp2(OP_MUL, TYPE_F32, i->getDef(def),
1427bf215546Sopenharmony_ci                i->getDef(def), bld.loadImm(NULL, 1.0f / 256));
1428bf215546Sopenharmony_ci   }
1429bf215546Sopenharmony_ci   if (i->tex.mask == 3) {
1430bf215546Sopenharmony_ci      LValue *t = new_LValue(func, FILE_GPR);
1431bf215546Sopenharmony_ci      bld.mkMov(t, i->getDef(0));
1432bf215546Sopenharmony_ci      bld.mkMov(i->getDef(0), i->getDef(1));
1433bf215546Sopenharmony_ci      bld.mkMov(i->getDef(1), t);
1434bf215546Sopenharmony_ci   }
1435bf215546Sopenharmony_ci   return true;
1436bf215546Sopenharmony_ci}
1437bf215546Sopenharmony_ci
1438bf215546Sopenharmony_cibool
1439bf215546Sopenharmony_ciNVC0LoweringPass::handleBUFQ(Instruction *bufq)
1440bf215546Sopenharmony_ci{
1441bf215546Sopenharmony_ci   bufq->op = OP_MOV;
1442bf215546Sopenharmony_ci   bufq->setSrc(0, loadBufLength32(bufq->getIndirect(0, 1),
1443bf215546Sopenharmony_ci                                   bufq->getSrc(0)->reg.fileIndex * 16));
1444bf215546Sopenharmony_ci   bufq->setIndirect(0, 0, NULL);
1445bf215546Sopenharmony_ci   bufq->setIndirect(0, 1, NULL);
1446bf215546Sopenharmony_ci   return true;
1447bf215546Sopenharmony_ci}
1448bf215546Sopenharmony_ci
1449bf215546Sopenharmony_civoid
1450bf215546Sopenharmony_ciNVC0LoweringPass::handleSharedATOMNVE4(Instruction *atom)
1451bf215546Sopenharmony_ci{
1452bf215546Sopenharmony_ci   assert(atom->src(0).getFile() == FILE_MEMORY_SHARED);
1453bf215546Sopenharmony_ci
1454bf215546Sopenharmony_ci   BasicBlock *currBB = atom->bb;
1455bf215546Sopenharmony_ci   BasicBlock *tryLockBB = atom->bb->splitBefore(atom, false);
1456bf215546Sopenharmony_ci   BasicBlock *joinBB = atom->bb->splitAfter(atom);
1457bf215546Sopenharmony_ci   BasicBlock *setAndUnlockBB = new BasicBlock(func);
1458bf215546Sopenharmony_ci   BasicBlock *failLockBB = new BasicBlock(func);
1459bf215546Sopenharmony_ci
1460bf215546Sopenharmony_ci   bld.setPosition(currBB, true);
1461bf215546Sopenharmony_ci   assert(!currBB->joinAt);
1462bf215546Sopenharmony_ci   currBB->joinAt = bld.mkFlow(OP_JOINAT, joinBB, CC_ALWAYS, NULL);
1463bf215546Sopenharmony_ci
1464bf215546Sopenharmony_ci   CmpInstruction *pred =
1465bf215546Sopenharmony_ci      bld.mkCmp(OP_SET, CC_EQ, TYPE_U32, bld.getSSA(1, FILE_PREDICATE),
1466bf215546Sopenharmony_ci                TYPE_U32, bld.mkImm(0), bld.mkImm(1));
1467bf215546Sopenharmony_ci
1468bf215546Sopenharmony_ci   bld.mkFlow(OP_BRA, tryLockBB, CC_ALWAYS, NULL);
1469bf215546Sopenharmony_ci   currBB->cfg.attach(&tryLockBB->cfg, Graph::Edge::TREE);
1470bf215546Sopenharmony_ci
1471bf215546Sopenharmony_ci   bld.setPosition(tryLockBB, true);
1472bf215546Sopenharmony_ci
1473bf215546Sopenharmony_ci   Instruction *ld =
1474bf215546Sopenharmony_ci      bld.mkLoad(TYPE_U32, atom->getDef(0), atom->getSrc(0)->asSym(),
1475bf215546Sopenharmony_ci                 atom->getIndirect(0, 0));
1476bf215546Sopenharmony_ci   ld->setDef(1, bld.getSSA(1, FILE_PREDICATE));
1477bf215546Sopenharmony_ci   ld->subOp = NV50_IR_SUBOP_LOAD_LOCKED;
1478bf215546Sopenharmony_ci
1479bf215546Sopenharmony_ci   bld.mkFlow(OP_BRA, setAndUnlockBB, CC_P, ld->getDef(1));
1480bf215546Sopenharmony_ci   bld.mkFlow(OP_BRA, failLockBB, CC_ALWAYS, NULL);
1481bf215546Sopenharmony_ci   tryLockBB->cfg.attach(&failLockBB->cfg, Graph::Edge::CROSS);
1482bf215546Sopenharmony_ci   tryLockBB->cfg.attach(&setAndUnlockBB->cfg, Graph::Edge::TREE);
1483bf215546Sopenharmony_ci
1484bf215546Sopenharmony_ci   tryLockBB->cfg.detach(&joinBB->cfg);
1485bf215546Sopenharmony_ci   bld.remove(atom);
1486bf215546Sopenharmony_ci
1487bf215546Sopenharmony_ci   bld.setPosition(setAndUnlockBB, true);
1488bf215546Sopenharmony_ci   Value *stVal;
1489bf215546Sopenharmony_ci   if (atom->subOp == NV50_IR_SUBOP_ATOM_EXCH) {
1490bf215546Sopenharmony_ci      // Read the old value, and write the new one.
1491bf215546Sopenharmony_ci      stVal = atom->getSrc(1);
1492bf215546Sopenharmony_ci   } else if (atom->subOp == NV50_IR_SUBOP_ATOM_CAS) {
1493bf215546Sopenharmony_ci      CmpInstruction *set =
1494bf215546Sopenharmony_ci         bld.mkCmp(OP_SET, CC_EQ, TYPE_U32, bld.getSSA(),
1495bf215546Sopenharmony_ci                   TYPE_U32, ld->getDef(0), atom->getSrc(1));
1496bf215546Sopenharmony_ci
1497bf215546Sopenharmony_ci      bld.mkCmp(OP_SLCT, CC_NE, TYPE_U32, (stVal = bld.getSSA()),
1498bf215546Sopenharmony_ci                TYPE_U32, atom->getSrc(2), ld->getDef(0), set->getDef(0));
1499bf215546Sopenharmony_ci   } else {
1500bf215546Sopenharmony_ci      operation op;
1501bf215546Sopenharmony_ci
1502bf215546Sopenharmony_ci      switch (atom->subOp) {
1503bf215546Sopenharmony_ci      case NV50_IR_SUBOP_ATOM_ADD:
1504bf215546Sopenharmony_ci         op = OP_ADD;
1505bf215546Sopenharmony_ci         break;
1506bf215546Sopenharmony_ci      case NV50_IR_SUBOP_ATOM_AND:
1507bf215546Sopenharmony_ci         op = OP_AND;
1508bf215546Sopenharmony_ci         break;
1509bf215546Sopenharmony_ci      case NV50_IR_SUBOP_ATOM_OR:
1510bf215546Sopenharmony_ci         op = OP_OR;
1511bf215546Sopenharmony_ci         break;
1512bf215546Sopenharmony_ci      case NV50_IR_SUBOP_ATOM_XOR:
1513bf215546Sopenharmony_ci         op = OP_XOR;
1514bf215546Sopenharmony_ci         break;
1515bf215546Sopenharmony_ci      case NV50_IR_SUBOP_ATOM_MIN:
1516bf215546Sopenharmony_ci         op = OP_MIN;
1517bf215546Sopenharmony_ci         break;
1518bf215546Sopenharmony_ci      case NV50_IR_SUBOP_ATOM_MAX:
1519bf215546Sopenharmony_ci         op = OP_MAX;
1520bf215546Sopenharmony_ci         break;
1521bf215546Sopenharmony_ci      default:
1522bf215546Sopenharmony_ci         assert(0);
1523bf215546Sopenharmony_ci         return;
1524bf215546Sopenharmony_ci      }
1525bf215546Sopenharmony_ci
1526bf215546Sopenharmony_ci      stVal = bld.mkOp2v(op, atom->dType, bld.getSSA(), ld->getDef(0),
1527bf215546Sopenharmony_ci                         atom->getSrc(1));
1528bf215546Sopenharmony_ci   }
1529bf215546Sopenharmony_ci
1530bf215546Sopenharmony_ci   Instruction *st =
1531bf215546Sopenharmony_ci      bld.mkStore(OP_STORE, TYPE_U32, atom->getSrc(0)->asSym(),
1532bf215546Sopenharmony_ci                  atom->getIndirect(0, 0), stVal);
1533bf215546Sopenharmony_ci   st->setDef(0, pred->getDef(0));
1534bf215546Sopenharmony_ci   st->subOp = NV50_IR_SUBOP_STORE_UNLOCKED;
1535bf215546Sopenharmony_ci
1536bf215546Sopenharmony_ci   bld.mkFlow(OP_BRA, failLockBB, CC_ALWAYS, NULL);
1537bf215546Sopenharmony_ci   setAndUnlockBB->cfg.attach(&failLockBB->cfg, Graph::Edge::TREE);
1538bf215546Sopenharmony_ci
1539bf215546Sopenharmony_ci   // Lock until the store has not been performed.
1540bf215546Sopenharmony_ci   bld.setPosition(failLockBB, true);
1541bf215546Sopenharmony_ci   bld.mkFlow(OP_BRA, tryLockBB, CC_NOT_P, pred->getDef(0));
1542bf215546Sopenharmony_ci   bld.mkFlow(OP_BRA, joinBB, CC_ALWAYS, NULL);
1543bf215546Sopenharmony_ci   failLockBB->cfg.attach(&tryLockBB->cfg, Graph::Edge::BACK);
1544bf215546Sopenharmony_ci   failLockBB->cfg.attach(&joinBB->cfg, Graph::Edge::TREE);
1545bf215546Sopenharmony_ci
1546bf215546Sopenharmony_ci   bld.setPosition(joinBB, false);
1547bf215546Sopenharmony_ci   bld.mkFlow(OP_JOIN, NULL, CC_ALWAYS, NULL)->fixed = 1;
1548bf215546Sopenharmony_ci}
1549bf215546Sopenharmony_ci
1550bf215546Sopenharmony_civoid
1551bf215546Sopenharmony_ciNVC0LoweringPass::handleSharedATOM(Instruction *atom)
1552bf215546Sopenharmony_ci{
1553bf215546Sopenharmony_ci   assert(atom->src(0).getFile() == FILE_MEMORY_SHARED);
1554bf215546Sopenharmony_ci
1555bf215546Sopenharmony_ci   BasicBlock *currBB = atom->bb;
1556bf215546Sopenharmony_ci   BasicBlock *tryLockAndSetBB = atom->bb->splitBefore(atom, false);
1557bf215546Sopenharmony_ci   BasicBlock *joinBB = atom->bb->splitAfter(atom);
1558bf215546Sopenharmony_ci
1559bf215546Sopenharmony_ci   bld.setPosition(currBB, true);
1560bf215546Sopenharmony_ci   assert(!currBB->joinAt);
1561bf215546Sopenharmony_ci   currBB->joinAt = bld.mkFlow(OP_JOINAT, joinBB, CC_ALWAYS, NULL);
1562bf215546Sopenharmony_ci
1563bf215546Sopenharmony_ci   bld.mkFlow(OP_BRA, tryLockAndSetBB, CC_ALWAYS, NULL);
1564bf215546Sopenharmony_ci   currBB->cfg.attach(&tryLockAndSetBB->cfg, Graph::Edge::TREE);
1565bf215546Sopenharmony_ci
1566bf215546Sopenharmony_ci   bld.setPosition(tryLockAndSetBB, true);
1567bf215546Sopenharmony_ci
1568bf215546Sopenharmony_ci   Instruction *ld =
1569bf215546Sopenharmony_ci      bld.mkLoad(TYPE_U32, atom->getDef(0), atom->getSrc(0)->asSym(),
1570bf215546Sopenharmony_ci                 atom->getIndirect(0, 0));
1571bf215546Sopenharmony_ci   ld->setDef(1, bld.getSSA(1, FILE_PREDICATE));
1572bf215546Sopenharmony_ci   ld->subOp = NV50_IR_SUBOP_LOAD_LOCKED;
1573bf215546Sopenharmony_ci
1574bf215546Sopenharmony_ci   Value *stVal;
1575bf215546Sopenharmony_ci   if (atom->subOp == NV50_IR_SUBOP_ATOM_EXCH) {
1576bf215546Sopenharmony_ci      // Read the old value, and write the new one.
1577bf215546Sopenharmony_ci      stVal = atom->getSrc(1);
1578bf215546Sopenharmony_ci   } else if (atom->subOp == NV50_IR_SUBOP_ATOM_CAS) {
1579bf215546Sopenharmony_ci      CmpInstruction *set =
1580bf215546Sopenharmony_ci         bld.mkCmp(OP_SET, CC_EQ, TYPE_U32, bld.getSSA(1, FILE_PREDICATE),
1581bf215546Sopenharmony_ci                   TYPE_U32, ld->getDef(0), atom->getSrc(1));
1582bf215546Sopenharmony_ci      set->setPredicate(CC_P, ld->getDef(1));
1583bf215546Sopenharmony_ci
1584bf215546Sopenharmony_ci      Instruction *selp =
1585bf215546Sopenharmony_ci         bld.mkOp3(OP_SELP, TYPE_U32, bld.getSSA(), ld->getDef(0),
1586bf215546Sopenharmony_ci                   atom->getSrc(2), set->getDef(0));
1587bf215546Sopenharmony_ci      selp->src(2).mod = Modifier(NV50_IR_MOD_NOT);
1588bf215546Sopenharmony_ci      selp->setPredicate(CC_P, ld->getDef(1));
1589bf215546Sopenharmony_ci
1590bf215546Sopenharmony_ci      stVal = selp->getDef(0);
1591bf215546Sopenharmony_ci   } else {
1592bf215546Sopenharmony_ci      operation op;
1593bf215546Sopenharmony_ci
1594bf215546Sopenharmony_ci      switch (atom->subOp) {
1595bf215546Sopenharmony_ci      case NV50_IR_SUBOP_ATOM_ADD:
1596bf215546Sopenharmony_ci         op = OP_ADD;
1597bf215546Sopenharmony_ci         break;
1598bf215546Sopenharmony_ci      case NV50_IR_SUBOP_ATOM_AND:
1599bf215546Sopenharmony_ci         op = OP_AND;
1600bf215546Sopenharmony_ci         break;
1601bf215546Sopenharmony_ci      case NV50_IR_SUBOP_ATOM_OR:
1602bf215546Sopenharmony_ci         op = OP_OR;
1603bf215546Sopenharmony_ci         break;
1604bf215546Sopenharmony_ci      case NV50_IR_SUBOP_ATOM_XOR:
1605bf215546Sopenharmony_ci         op = OP_XOR;
1606bf215546Sopenharmony_ci         break;
1607bf215546Sopenharmony_ci      case NV50_IR_SUBOP_ATOM_MIN:
1608bf215546Sopenharmony_ci         op = OP_MIN;
1609bf215546Sopenharmony_ci         break;
1610bf215546Sopenharmony_ci      case NV50_IR_SUBOP_ATOM_MAX:
1611bf215546Sopenharmony_ci         op = OP_MAX;
1612bf215546Sopenharmony_ci         break;
1613bf215546Sopenharmony_ci      default:
1614bf215546Sopenharmony_ci         assert(0);
1615bf215546Sopenharmony_ci         return;
1616bf215546Sopenharmony_ci      }
1617bf215546Sopenharmony_ci
1618bf215546Sopenharmony_ci      Instruction *i =
1619bf215546Sopenharmony_ci         bld.mkOp2(op, atom->dType, bld.getSSA(), ld->getDef(0),
1620bf215546Sopenharmony_ci                   atom->getSrc(1));
1621bf215546Sopenharmony_ci      i->setPredicate(CC_P, ld->getDef(1));
1622bf215546Sopenharmony_ci
1623bf215546Sopenharmony_ci      stVal = i->getDef(0);
1624bf215546Sopenharmony_ci   }
1625bf215546Sopenharmony_ci
1626bf215546Sopenharmony_ci   Instruction *st =
1627bf215546Sopenharmony_ci      bld.mkStore(OP_STORE, TYPE_U32, atom->getSrc(0)->asSym(),
1628bf215546Sopenharmony_ci                  atom->getIndirect(0, 0), stVal);
1629bf215546Sopenharmony_ci   st->setPredicate(CC_P, ld->getDef(1));
1630bf215546Sopenharmony_ci   st->subOp = NV50_IR_SUBOP_STORE_UNLOCKED;
1631bf215546Sopenharmony_ci
1632bf215546Sopenharmony_ci   // Loop until the lock is acquired.
1633bf215546Sopenharmony_ci   bld.mkFlow(OP_BRA, tryLockAndSetBB, CC_NOT_P, ld->getDef(1));
1634bf215546Sopenharmony_ci   tryLockAndSetBB->cfg.attach(&tryLockAndSetBB->cfg, Graph::Edge::BACK);
1635bf215546Sopenharmony_ci   tryLockAndSetBB->cfg.attach(&joinBB->cfg, Graph::Edge::CROSS);
1636bf215546Sopenharmony_ci   bld.mkFlow(OP_BRA, joinBB, CC_ALWAYS, NULL);
1637bf215546Sopenharmony_ci
1638bf215546Sopenharmony_ci   bld.remove(atom);
1639bf215546Sopenharmony_ci
1640bf215546Sopenharmony_ci   bld.setPosition(joinBB, false);
1641bf215546Sopenharmony_ci   bld.mkFlow(OP_JOIN, NULL, CC_ALWAYS, NULL)->fixed = 1;
1642bf215546Sopenharmony_ci}
1643bf215546Sopenharmony_ci
1644bf215546Sopenharmony_cibool
1645bf215546Sopenharmony_ciNVC0LoweringPass::handleATOM(Instruction *atom)
1646bf215546Sopenharmony_ci{
1647bf215546Sopenharmony_ci   SVSemantic sv;
1648bf215546Sopenharmony_ci   Value *ptr = atom->getIndirect(0, 0), *ind = atom->getIndirect(0, 1), *base;
1649bf215546Sopenharmony_ci
1650bf215546Sopenharmony_ci   switch (atom->src(0).getFile()) {
1651bf215546Sopenharmony_ci   case FILE_MEMORY_LOCAL:
1652bf215546Sopenharmony_ci      sv = SV_LBASE;
1653bf215546Sopenharmony_ci      break;
1654bf215546Sopenharmony_ci   case FILE_MEMORY_SHARED:
1655bf215546Sopenharmony_ci      // For Fermi/Kepler, we have to use ld lock/st unlock to perform atomic
1656bf215546Sopenharmony_ci      // operations on shared memory. For Maxwell, ATOMS is enough.
1657bf215546Sopenharmony_ci      if (targ->getChipset() < NVISA_GK104_CHIPSET)
1658bf215546Sopenharmony_ci         handleSharedATOM(atom);
1659bf215546Sopenharmony_ci      else if (targ->getChipset() < NVISA_GM107_CHIPSET)
1660bf215546Sopenharmony_ci         handleSharedATOMNVE4(atom);
1661bf215546Sopenharmony_ci      return true;
1662bf215546Sopenharmony_ci   case FILE_MEMORY_GLOBAL:
1663bf215546Sopenharmony_ci      return true;
1664bf215546Sopenharmony_ci   default:
1665bf215546Sopenharmony_ci      assert(atom->src(0).getFile() == FILE_MEMORY_BUFFER);
1666bf215546Sopenharmony_ci      base = loadBufInfo64(ind, atom->getSrc(0)->reg.fileIndex * 16);
1667bf215546Sopenharmony_ci      assert(base->reg.size == 8);
1668bf215546Sopenharmony_ci      if (ptr)
1669bf215546Sopenharmony_ci         base = bld.mkOp2v(OP_ADD, TYPE_U64, base, base, ptr);
1670bf215546Sopenharmony_ci      assert(base->reg.size == 8);
1671bf215546Sopenharmony_ci      atom->setIndirect(0, 0, base);
1672bf215546Sopenharmony_ci      atom->getSrc(0)->reg.file = FILE_MEMORY_GLOBAL;
1673bf215546Sopenharmony_ci
1674bf215546Sopenharmony_ci      // Harden against out-of-bounds accesses
1675bf215546Sopenharmony_ci      Value *offset = bld.loadImm(NULL, atom->getSrc(0)->reg.data.offset + typeSizeof(atom->sType));
1676bf215546Sopenharmony_ci      Value *length = loadBufLength32(ind, atom->getSrc(0)->reg.fileIndex * 16);
1677bf215546Sopenharmony_ci      Value *pred = new_LValue(func, FILE_PREDICATE);
1678bf215546Sopenharmony_ci      if (ptr)
1679bf215546Sopenharmony_ci         bld.mkOp2(OP_ADD, TYPE_U32, offset, offset, ptr);
1680bf215546Sopenharmony_ci      bld.mkCmp(OP_SET, CC_GT, TYPE_U32, pred, TYPE_U32, offset, length);
1681bf215546Sopenharmony_ci      atom->setPredicate(CC_NOT_P, pred);
1682bf215546Sopenharmony_ci      if (atom->defExists(0)) {
1683bf215546Sopenharmony_ci         Value *zero, *dst = atom->getDef(0);
1684bf215546Sopenharmony_ci         atom->setDef(0, bld.getSSA());
1685bf215546Sopenharmony_ci
1686bf215546Sopenharmony_ci         bld.setPosition(atom, true);
1687bf215546Sopenharmony_ci         bld.mkMov((zero = bld.getSSA()), bld.mkImm(0))
1688bf215546Sopenharmony_ci            ->setPredicate(CC_P, pred);
1689bf215546Sopenharmony_ci         bld.mkOp2(OP_UNION, TYPE_U32, dst, atom->getDef(0), zero);
1690bf215546Sopenharmony_ci      }
1691bf215546Sopenharmony_ci
1692bf215546Sopenharmony_ci      return true;
1693bf215546Sopenharmony_ci   }
1694bf215546Sopenharmony_ci   base =
1695bf215546Sopenharmony_ci      bld.mkOp1v(OP_RDSV, TYPE_U32, bld.getScratch(), bld.mkSysVal(sv, 0));
1696bf215546Sopenharmony_ci
1697bf215546Sopenharmony_ci   atom->setSrc(0, cloneShallow(func, atom->getSrc(0)));
1698bf215546Sopenharmony_ci   atom->getSrc(0)->reg.file = FILE_MEMORY_GLOBAL;
1699bf215546Sopenharmony_ci   if (ptr)
1700bf215546Sopenharmony_ci      base = bld.mkOp2v(OP_ADD, TYPE_U32, base, base, ptr);
1701bf215546Sopenharmony_ci   atom->setIndirect(0, 1, NULL);
1702bf215546Sopenharmony_ci   atom->setIndirect(0, 0, base);
1703bf215546Sopenharmony_ci
1704bf215546Sopenharmony_ci   return true;
1705bf215546Sopenharmony_ci}
1706bf215546Sopenharmony_ci
1707bf215546Sopenharmony_cibool
1708bf215546Sopenharmony_ciNVC0LoweringPass::handleATOMCctl(Instruction *atom) {
1709bf215546Sopenharmony_ci   // Flush L1 cache manually since atomics go directly to L2. This ensures
1710bf215546Sopenharmony_ci   // that any later CA reads retrieve the updated data.
1711bf215546Sopenharmony_ci
1712bf215546Sopenharmony_ci   if (atom->cache != nv50_ir::CACHE_CA)
1713bf215546Sopenharmony_ci      return false;
1714bf215546Sopenharmony_ci
1715bf215546Sopenharmony_ci   bld.setPosition(atom, true);
1716bf215546Sopenharmony_ci
1717bf215546Sopenharmony_ci   Instruction *cctl = bld.mkOp1(OP_CCTL, TYPE_NONE, NULL, atom->getSrc(0));
1718bf215546Sopenharmony_ci   cctl->setIndirect(0, 0, atom->getIndirect(0, 0));
1719bf215546Sopenharmony_ci   cctl->fixed = 1;
1720bf215546Sopenharmony_ci   cctl->subOp = NV50_IR_SUBOP_CCTL_IV;
1721bf215546Sopenharmony_ci   if (atom->isPredicated())
1722bf215546Sopenharmony_ci      cctl->setPredicate(atom->cc, atom->getPredicate());
1723bf215546Sopenharmony_ci
1724bf215546Sopenharmony_ci   return true;
1725bf215546Sopenharmony_ci}
1726bf215546Sopenharmony_ci
1727bf215546Sopenharmony_cibool
1728bf215546Sopenharmony_ciNVC0LoweringPass::handleCasExch(Instruction *cas)
1729bf215546Sopenharmony_ci{
1730bf215546Sopenharmony_ci   if (targ->getChipset() < NVISA_GM107_CHIPSET) {
1731bf215546Sopenharmony_ci      if (cas->src(0).getFile() == FILE_MEMORY_SHARED) {
1732bf215546Sopenharmony_ci         // ATOM_CAS and ATOM_EXCH are handled in handleSharedATOM().
1733bf215546Sopenharmony_ci         return false;
1734bf215546Sopenharmony_ci      }
1735bf215546Sopenharmony_ci   }
1736bf215546Sopenharmony_ci
1737bf215546Sopenharmony_ci   if (cas->subOp != NV50_IR_SUBOP_ATOM_CAS &&
1738bf215546Sopenharmony_ci       cas->subOp != NV50_IR_SUBOP_ATOM_EXCH)
1739bf215546Sopenharmony_ci      return false;
1740bf215546Sopenharmony_ci
1741bf215546Sopenharmony_ci   if (cas->subOp == NV50_IR_SUBOP_ATOM_CAS &&
1742bf215546Sopenharmony_ci       targ->getChipset() < NVISA_GV100_CHIPSET) {
1743bf215546Sopenharmony_ci      // CAS is crazy. It's 2nd source is a double reg, and the 3rd source
1744bf215546Sopenharmony_ci      // should be set to the high part of the double reg or bad things will
1745bf215546Sopenharmony_ci      // happen elsewhere in the universe.
1746bf215546Sopenharmony_ci      // Also, it sometimes returns the new value instead of the old one
1747bf215546Sopenharmony_ci      // under mysterious circumstances.
1748bf215546Sopenharmony_ci      DataType ty = typeOfSize(typeSizeof(cas->dType) * 2);
1749bf215546Sopenharmony_ci      Value *dreg = bld.getSSA(typeSizeof(ty));
1750bf215546Sopenharmony_ci      bld.setPosition(cas, false);
1751bf215546Sopenharmony_ci      bld.mkOp2(OP_MERGE, ty, dreg, cas->getSrc(1), cas->getSrc(2));
1752bf215546Sopenharmony_ci      cas->setSrc(1, dreg);
1753bf215546Sopenharmony_ci      cas->setSrc(2, dreg);
1754bf215546Sopenharmony_ci   }
1755bf215546Sopenharmony_ci
1756bf215546Sopenharmony_ci   return true;
1757bf215546Sopenharmony_ci}
1758bf215546Sopenharmony_ci
1759bf215546Sopenharmony_ciinline Value *
1760bf215546Sopenharmony_ciNVC0LoweringPass::loadResInfo32(Value *ptr, uint32_t off, uint16_t base)
1761bf215546Sopenharmony_ci{
1762bf215546Sopenharmony_ci   uint8_t b = prog->driver->io.auxCBSlot;
1763bf215546Sopenharmony_ci   off += base;
1764bf215546Sopenharmony_ci
1765bf215546Sopenharmony_ci   return bld.
1766bf215546Sopenharmony_ci      mkLoadv(TYPE_U32, bld.mkSymbol(FILE_MEMORY_CONST, b, TYPE_U32, off), ptr);
1767bf215546Sopenharmony_ci}
1768bf215546Sopenharmony_ci
1769bf215546Sopenharmony_ciinline Value *
1770bf215546Sopenharmony_ciNVC0LoweringPass::loadResInfo64(Value *ptr, uint32_t off, uint16_t base)
1771bf215546Sopenharmony_ci{
1772bf215546Sopenharmony_ci   uint8_t b = prog->driver->io.auxCBSlot;
1773bf215546Sopenharmony_ci   off += base;
1774bf215546Sopenharmony_ci
1775bf215546Sopenharmony_ci   if (ptr)
1776bf215546Sopenharmony_ci      ptr = bld.mkOp2v(OP_SHL, TYPE_U32, bld.getScratch(), ptr, bld.mkImm(4));
1777bf215546Sopenharmony_ci
1778bf215546Sopenharmony_ci   return bld.
1779bf215546Sopenharmony_ci      mkLoadv(TYPE_U64, bld.mkSymbol(FILE_MEMORY_CONST, b, TYPE_U64, off), ptr);
1780bf215546Sopenharmony_ci}
1781bf215546Sopenharmony_ci
1782bf215546Sopenharmony_ciinline Value *
1783bf215546Sopenharmony_ciNVC0LoweringPass::loadResLength32(Value *ptr, uint32_t off, uint16_t base)
1784bf215546Sopenharmony_ci{
1785bf215546Sopenharmony_ci   uint8_t b = prog->driver->io.auxCBSlot;
1786bf215546Sopenharmony_ci   off += base;
1787bf215546Sopenharmony_ci
1788bf215546Sopenharmony_ci   if (ptr)
1789bf215546Sopenharmony_ci      ptr = bld.mkOp2v(OP_SHL, TYPE_U32, bld.getScratch(), ptr, bld.mkImm(4));
1790bf215546Sopenharmony_ci
1791bf215546Sopenharmony_ci   return bld.
1792bf215546Sopenharmony_ci      mkLoadv(TYPE_U32, bld.mkSymbol(FILE_MEMORY_CONST, b, TYPE_U64, off + 8), ptr);
1793bf215546Sopenharmony_ci}
1794bf215546Sopenharmony_ci
1795bf215546Sopenharmony_ciinline Value *
1796bf215546Sopenharmony_ciNVC0LoweringPass::loadBufInfo64(Value *ptr, uint32_t off)
1797bf215546Sopenharmony_ci{
1798bf215546Sopenharmony_ci   return loadResInfo64(ptr, off, prog->driver->io.bufInfoBase);
1799bf215546Sopenharmony_ci}
1800bf215546Sopenharmony_ci
1801bf215546Sopenharmony_ciinline Value *
1802bf215546Sopenharmony_ciNVC0LoweringPass::loadBufLength32(Value *ptr, uint32_t off)
1803bf215546Sopenharmony_ci{
1804bf215546Sopenharmony_ci   return loadResLength32(ptr, off, prog->driver->io.bufInfoBase);
1805bf215546Sopenharmony_ci}
1806bf215546Sopenharmony_ci
1807bf215546Sopenharmony_ciinline Value *
1808bf215546Sopenharmony_ciNVC0LoweringPass::loadUboInfo64(Value *ptr, uint32_t off)
1809bf215546Sopenharmony_ci{
1810bf215546Sopenharmony_ci   return loadResInfo64(ptr, off, prog->driver->io.uboInfoBase);
1811bf215546Sopenharmony_ci}
1812bf215546Sopenharmony_ci
1813bf215546Sopenharmony_ciinline Value *
1814bf215546Sopenharmony_ciNVC0LoweringPass::loadUboLength32(Value *ptr, uint32_t off)
1815bf215546Sopenharmony_ci{
1816bf215546Sopenharmony_ci   return loadResLength32(ptr, off, prog->driver->io.uboInfoBase);
1817bf215546Sopenharmony_ci}
1818bf215546Sopenharmony_ci
1819bf215546Sopenharmony_ciinline Value *
1820bf215546Sopenharmony_ciNVC0LoweringPass::loadMsInfo32(Value *ptr, uint32_t off)
1821bf215546Sopenharmony_ci{
1822bf215546Sopenharmony_ci   uint8_t b = prog->driver->io.msInfoCBSlot;
1823bf215546Sopenharmony_ci   off += prog->driver->io.msInfoBase;
1824bf215546Sopenharmony_ci   return bld.
1825bf215546Sopenharmony_ci      mkLoadv(TYPE_U32, bld.mkSymbol(FILE_MEMORY_CONST, b, TYPE_U32, off), ptr);
1826bf215546Sopenharmony_ci}
1827bf215546Sopenharmony_ci
1828bf215546Sopenharmony_ciinline Value *
1829bf215546Sopenharmony_ciNVC0LoweringPass::loadSuInfo32(Value *ptr, int slot, uint32_t off, bool bindless)
1830bf215546Sopenharmony_ci{
1831bf215546Sopenharmony_ci   uint32_t base = slot * NVC0_SU_INFO__STRIDE;
1832bf215546Sopenharmony_ci
1833bf215546Sopenharmony_ci   // We don't upload surface info for bindless for GM107+
1834bf215546Sopenharmony_ci   assert(!bindless || targ->getChipset() < NVISA_GM107_CHIPSET);
1835bf215546Sopenharmony_ci
1836bf215546Sopenharmony_ci   if (ptr) {
1837bf215546Sopenharmony_ci      ptr = bld.mkOp2v(OP_ADD, TYPE_U32, bld.getSSA(), ptr, bld.mkImm(slot));
1838bf215546Sopenharmony_ci      if (bindless)
1839bf215546Sopenharmony_ci         ptr = bld.mkOp2v(OP_AND, TYPE_U32, bld.getSSA(), ptr, bld.mkImm(511));
1840bf215546Sopenharmony_ci      else
1841bf215546Sopenharmony_ci         ptr = bld.mkOp2v(OP_AND, TYPE_U32, bld.getSSA(), ptr, bld.mkImm(7));
1842bf215546Sopenharmony_ci      ptr = bld.mkOp2v(OP_SHL, TYPE_U32, bld.getSSA(), ptr, bld.mkImm(6));
1843bf215546Sopenharmony_ci      base = 0;
1844bf215546Sopenharmony_ci   }
1845bf215546Sopenharmony_ci   off += base;
1846bf215546Sopenharmony_ci
1847bf215546Sopenharmony_ci   return loadResInfo32(ptr, off, bindless ? prog->driver->io.bindlessBase :
1848bf215546Sopenharmony_ci                        prog->driver->io.suInfoBase);
1849bf215546Sopenharmony_ci}
1850bf215546Sopenharmony_ci
1851bf215546Sopenharmony_ciValue *
1852bf215546Sopenharmony_ciNVC0LoweringPass::loadMsAdjInfo32(TexInstruction::Target target, uint32_t index, int slot, Value *ind, bool bindless)
1853bf215546Sopenharmony_ci{
1854bf215546Sopenharmony_ci   if (!bindless || targ->getChipset() < NVISA_GM107_CHIPSET)
1855bf215546Sopenharmony_ci      return loadSuInfo32(ind, slot, NVC0_SU_INFO_MS(index), bindless);
1856bf215546Sopenharmony_ci
1857bf215546Sopenharmony_ci   assert(bindless);
1858bf215546Sopenharmony_ci
1859bf215546Sopenharmony_ci   Value *samples = bld.getSSA();
1860bf215546Sopenharmony_ci   // this shouldn't be lowered because it's being inserted before the current instruction
1861bf215546Sopenharmony_ci   TexInstruction *tex = new_TexInstruction(func, OP_TXQ);
1862bf215546Sopenharmony_ci   tex->tex.target = target;
1863bf215546Sopenharmony_ci   tex->tex.query = TXQ_TYPE;
1864bf215546Sopenharmony_ci   tex->tex.mask = 0x4;
1865bf215546Sopenharmony_ci   tex->tex.r = 0xff;
1866bf215546Sopenharmony_ci   tex->tex.s = 0x1f;
1867bf215546Sopenharmony_ci   tex->tex.rIndirectSrc = 0;
1868bf215546Sopenharmony_ci   tex->setDef(0, samples);
1869bf215546Sopenharmony_ci   tex->setSrc(0, ind);
1870bf215546Sopenharmony_ci   tex->setSrc(1, bld.loadImm(NULL, 0));
1871bf215546Sopenharmony_ci   bld.insert(tex);
1872bf215546Sopenharmony_ci
1873bf215546Sopenharmony_ci   // doesn't work with sample counts other than 1/2/4/8 but they aren't supported
1874bf215546Sopenharmony_ci   switch (index) {
1875bf215546Sopenharmony_ci   case 0: {
1876bf215546Sopenharmony_ci      Value *tmp = bld.mkOp2v(OP_ADD, TYPE_U32, bld.getSSA(), samples, bld.mkImm(2));
1877bf215546Sopenharmony_ci      return bld.mkOp2v(OP_SHR, TYPE_U32, bld.getSSA(), tmp, bld.mkImm(2));
1878bf215546Sopenharmony_ci   }
1879bf215546Sopenharmony_ci   case 1: {
1880bf215546Sopenharmony_ci      Value *tmp = bld.mkCmp(OP_SET, CC_GT, TYPE_U32, bld.getSSA(), TYPE_U32, samples, bld.mkImm(2))->getDef(0);
1881bf215546Sopenharmony_ci      return bld.mkOp2v(OP_AND, TYPE_U32, bld.getSSA(), tmp, bld.mkImm(1));
1882bf215546Sopenharmony_ci   }
1883bf215546Sopenharmony_ci   default: {
1884bf215546Sopenharmony_ci      assert(false);
1885bf215546Sopenharmony_ci      return NULL;
1886bf215546Sopenharmony_ci   }
1887bf215546Sopenharmony_ci   }
1888bf215546Sopenharmony_ci}
1889bf215546Sopenharmony_ci
1890bf215546Sopenharmony_cistatic inline uint16_t getSuClampSubOp(const TexInstruction *su, int c)
1891bf215546Sopenharmony_ci{
1892bf215546Sopenharmony_ci   switch (su->tex.target.getEnum()) {
1893bf215546Sopenharmony_ci   case TEX_TARGET_BUFFER:      return NV50_IR_SUBOP_SUCLAMP_PL(0, 1);
1894bf215546Sopenharmony_ci   case TEX_TARGET_RECT:        return NV50_IR_SUBOP_SUCLAMP_SD(0, 2);
1895bf215546Sopenharmony_ci   case TEX_TARGET_1D:          return NV50_IR_SUBOP_SUCLAMP_SD(0, 2);
1896bf215546Sopenharmony_ci   case TEX_TARGET_1D_ARRAY:    return (c == 1) ?
1897bf215546Sopenharmony_ci                                   NV50_IR_SUBOP_SUCLAMP_PL(0, 2) :
1898bf215546Sopenharmony_ci                                   NV50_IR_SUBOP_SUCLAMP_SD(0, 2);
1899bf215546Sopenharmony_ci   case TEX_TARGET_2D:          return NV50_IR_SUBOP_SUCLAMP_BL(0, 2);
1900bf215546Sopenharmony_ci   case TEX_TARGET_2D_MS:       return NV50_IR_SUBOP_SUCLAMP_BL(0, 2);
1901bf215546Sopenharmony_ci   case TEX_TARGET_2D_ARRAY:    return NV50_IR_SUBOP_SUCLAMP_SD(0, 2);
1902bf215546Sopenharmony_ci   case TEX_TARGET_2D_MS_ARRAY: return NV50_IR_SUBOP_SUCLAMP_SD(0, 2);
1903bf215546Sopenharmony_ci   case TEX_TARGET_3D:          return NV50_IR_SUBOP_SUCLAMP_SD(0, 2);
1904bf215546Sopenharmony_ci   case TEX_TARGET_CUBE:        return NV50_IR_SUBOP_SUCLAMP_SD(0, 2);
1905bf215546Sopenharmony_ci   case TEX_TARGET_CUBE_ARRAY:  return NV50_IR_SUBOP_SUCLAMP_SD(0, 2);
1906bf215546Sopenharmony_ci   default:
1907bf215546Sopenharmony_ci      assert(0);
1908bf215546Sopenharmony_ci      return 0;
1909bf215546Sopenharmony_ci   }
1910bf215546Sopenharmony_ci}
1911bf215546Sopenharmony_ci
1912bf215546Sopenharmony_cibool
1913bf215546Sopenharmony_ciNVC0LoweringPass::handleSUQ(TexInstruction *suq)
1914bf215546Sopenharmony_ci{
1915bf215546Sopenharmony_ci   int mask = suq->tex.mask;
1916bf215546Sopenharmony_ci   int dim = suq->tex.target.getDim();
1917bf215546Sopenharmony_ci   int arg = dim + (suq->tex.target.isArray() || suq->tex.target.isCube());
1918bf215546Sopenharmony_ci   Value *ind = suq->getIndirectR();
1919bf215546Sopenharmony_ci   int slot = suq->tex.r;
1920bf215546Sopenharmony_ci   int c, d;
1921bf215546Sopenharmony_ci
1922bf215546Sopenharmony_ci   for (c = 0, d = 0; c < 3; ++c, mask >>= 1) {
1923bf215546Sopenharmony_ci      if (c >= arg || !(mask & 1))
1924bf215546Sopenharmony_ci         continue;
1925bf215546Sopenharmony_ci
1926bf215546Sopenharmony_ci      int offset;
1927bf215546Sopenharmony_ci
1928bf215546Sopenharmony_ci      if (c == 1 && suq->tex.target == TEX_TARGET_1D_ARRAY) {
1929bf215546Sopenharmony_ci         offset = NVC0_SU_INFO_SIZE(2);
1930bf215546Sopenharmony_ci      } else {
1931bf215546Sopenharmony_ci         offset = NVC0_SU_INFO_SIZE(c);
1932bf215546Sopenharmony_ci      }
1933bf215546Sopenharmony_ci      bld.mkMov(suq->getDef(d++), loadSuInfo32(ind, slot, offset, suq->tex.bindless));
1934bf215546Sopenharmony_ci      if (c == 2 && suq->tex.target.isCube())
1935bf215546Sopenharmony_ci         bld.mkOp2(OP_DIV, TYPE_U32, suq->getDef(d - 1), suq->getDef(d - 1),
1936bf215546Sopenharmony_ci                   bld.loadImm(NULL, 6));
1937bf215546Sopenharmony_ci   }
1938bf215546Sopenharmony_ci
1939bf215546Sopenharmony_ci   if (mask & 1) {
1940bf215546Sopenharmony_ci      if (suq->tex.target.isMS()) {
1941bf215546Sopenharmony_ci         Value *ms_x = loadSuInfo32(ind, slot, NVC0_SU_INFO_MS(0), suq->tex.bindless);
1942bf215546Sopenharmony_ci         Value *ms_y = loadSuInfo32(ind, slot, NVC0_SU_INFO_MS(1), suq->tex.bindless);
1943bf215546Sopenharmony_ci         Value *ms = bld.mkOp2v(OP_ADD, TYPE_U32, bld.getScratch(), ms_x, ms_y);
1944bf215546Sopenharmony_ci         bld.mkOp2(OP_SHL, TYPE_U32, suq->getDef(d++), bld.loadImm(NULL, 1), ms);
1945bf215546Sopenharmony_ci      } else {
1946bf215546Sopenharmony_ci         bld.mkMov(suq->getDef(d++), bld.loadImm(NULL, 1));
1947bf215546Sopenharmony_ci      }
1948bf215546Sopenharmony_ci   }
1949bf215546Sopenharmony_ci
1950bf215546Sopenharmony_ci   bld.remove(suq);
1951bf215546Sopenharmony_ci   return true;
1952bf215546Sopenharmony_ci}
1953bf215546Sopenharmony_ci
1954bf215546Sopenharmony_civoid
1955bf215546Sopenharmony_ciNVC0LoweringPass::adjustCoordinatesMS(TexInstruction *tex)
1956bf215546Sopenharmony_ci{
1957bf215546Sopenharmony_ci   const int arg = tex->tex.target.getArgCount();
1958bf215546Sopenharmony_ci   int slot = tex->tex.r;
1959bf215546Sopenharmony_ci
1960bf215546Sopenharmony_ci   if (tex->tex.target == TEX_TARGET_2D_MS)
1961bf215546Sopenharmony_ci      tex->tex.target = TEX_TARGET_2D;
1962bf215546Sopenharmony_ci   else
1963bf215546Sopenharmony_ci   if (tex->tex.target == TEX_TARGET_2D_MS_ARRAY)
1964bf215546Sopenharmony_ci      tex->tex.target = TEX_TARGET_2D_ARRAY;
1965bf215546Sopenharmony_ci   else
1966bf215546Sopenharmony_ci      return;
1967bf215546Sopenharmony_ci
1968bf215546Sopenharmony_ci   Value *x = tex->getSrc(0);
1969bf215546Sopenharmony_ci   Value *y = tex->getSrc(1);
1970bf215546Sopenharmony_ci   Value *s = tex->getSrc(arg - 1);
1971bf215546Sopenharmony_ci
1972bf215546Sopenharmony_ci   Value *tx = bld.getSSA(), *ty = bld.getSSA(), *ts = bld.getSSA();
1973bf215546Sopenharmony_ci   Value *ind = tex->getIndirectR();
1974bf215546Sopenharmony_ci
1975bf215546Sopenharmony_ci   Value *ms_x = loadMsAdjInfo32(tex->tex.target, 0, slot, ind, tex->tex.bindless);
1976bf215546Sopenharmony_ci   Value *ms_y = loadMsAdjInfo32(tex->tex.target, 1, slot, ind, tex->tex.bindless);
1977bf215546Sopenharmony_ci
1978bf215546Sopenharmony_ci   bld.mkOp2(OP_SHL, TYPE_U32, tx, x, ms_x);
1979bf215546Sopenharmony_ci   bld.mkOp2(OP_SHL, TYPE_U32, ty, y, ms_y);
1980bf215546Sopenharmony_ci
1981bf215546Sopenharmony_ci   s = bld.mkOp2v(OP_AND, TYPE_U32, ts, s, bld.loadImm(NULL, 0x7));
1982bf215546Sopenharmony_ci   s = bld.mkOp2v(OP_SHL, TYPE_U32, ts, ts, bld.mkImm(3));
1983bf215546Sopenharmony_ci
1984bf215546Sopenharmony_ci   Value *dx = loadMsInfo32(ts, 0x0);
1985bf215546Sopenharmony_ci   Value *dy = loadMsInfo32(ts, 0x4);
1986bf215546Sopenharmony_ci
1987bf215546Sopenharmony_ci   bld.mkOp2(OP_ADD, TYPE_U32, tx, tx, dx);
1988bf215546Sopenharmony_ci   bld.mkOp2(OP_ADD, TYPE_U32, ty, ty, dy);
1989bf215546Sopenharmony_ci
1990bf215546Sopenharmony_ci   tex->setSrc(0, tx);
1991bf215546Sopenharmony_ci   tex->setSrc(1, ty);
1992bf215546Sopenharmony_ci   tex->moveSources(arg, -1);
1993bf215546Sopenharmony_ci}
1994bf215546Sopenharmony_ci
1995bf215546Sopenharmony_ci// Sets 64-bit "generic address", predicate and format sources for SULD/SUST.
1996bf215546Sopenharmony_ci// They're computed from the coordinates using the surface info in c[] space.
1997bf215546Sopenharmony_civoid
1998bf215546Sopenharmony_ciNVC0LoweringPass::processSurfaceCoordsNVE4(TexInstruction *su)
1999bf215546Sopenharmony_ci{
2000bf215546Sopenharmony_ci   Instruction *insn;
2001bf215546Sopenharmony_ci   const bool atom = su->op == OP_SUREDB || su->op == OP_SUREDP;
2002bf215546Sopenharmony_ci   const bool raw =
2003bf215546Sopenharmony_ci      su->op == OP_SULDB || su->op == OP_SUSTB || su->op == OP_SUREDB;
2004bf215546Sopenharmony_ci   const int slot = su->tex.r;
2005bf215546Sopenharmony_ci   const int dim = su->tex.target.getDim();
2006bf215546Sopenharmony_ci   const bool array = su->tex.target.isArray() || su->tex.target.isCube();
2007bf215546Sopenharmony_ci   const int arg = dim + array;
2008bf215546Sopenharmony_ci   int c;
2009bf215546Sopenharmony_ci   Value *zero = bld.mkImm(0);
2010bf215546Sopenharmony_ci   Value *p1 = NULL;
2011bf215546Sopenharmony_ci   Value *v;
2012bf215546Sopenharmony_ci   Value *src[3];
2013bf215546Sopenharmony_ci   Value *bf, *eau, *off;
2014bf215546Sopenharmony_ci   Value *addr, *pred;
2015bf215546Sopenharmony_ci   Value *ind = su->getIndirectR();
2016bf215546Sopenharmony_ci   Value *y, *z;
2017bf215546Sopenharmony_ci
2018bf215546Sopenharmony_ci   off = bld.getScratch(4);
2019bf215546Sopenharmony_ci   bf = bld.getScratch(4);
2020bf215546Sopenharmony_ci   addr = bld.getSSA(8);
2021bf215546Sopenharmony_ci   pred = bld.getScratch(1, FILE_PREDICATE);
2022bf215546Sopenharmony_ci
2023bf215546Sopenharmony_ci   bld.setPosition(su, false);
2024bf215546Sopenharmony_ci
2025bf215546Sopenharmony_ci   adjustCoordinatesMS(su);
2026bf215546Sopenharmony_ci
2027bf215546Sopenharmony_ci   // calculate clamped coordinates
2028bf215546Sopenharmony_ci   for (c = 0; c < arg; ++c) {
2029bf215546Sopenharmony_ci      int dimc = c;
2030bf215546Sopenharmony_ci
2031bf215546Sopenharmony_ci      if (c == 1 && su->tex.target == TEX_TARGET_1D_ARRAY) {
2032bf215546Sopenharmony_ci         // The array index is stored in the Z component for 1D arrays.
2033bf215546Sopenharmony_ci         dimc = 2;
2034bf215546Sopenharmony_ci      }
2035bf215546Sopenharmony_ci
2036bf215546Sopenharmony_ci      src[c] = bld.getScratch();
2037bf215546Sopenharmony_ci      if (c == 0 && raw)
2038bf215546Sopenharmony_ci         v = loadSuInfo32(ind, slot, NVC0_SU_INFO_RAW_X, su->tex.bindless);
2039bf215546Sopenharmony_ci      else
2040bf215546Sopenharmony_ci         v = loadSuInfo32(ind, slot, NVC0_SU_INFO_DIM(dimc), su->tex.bindless);
2041bf215546Sopenharmony_ci      bld.mkOp3(OP_SUCLAMP, TYPE_S32, src[c], su->getSrc(c), v, zero)
2042bf215546Sopenharmony_ci         ->subOp = getSuClampSubOp(su, dimc);
2043bf215546Sopenharmony_ci   }
2044bf215546Sopenharmony_ci   for (; c < 3; ++c)
2045bf215546Sopenharmony_ci      src[c] = zero;
2046bf215546Sopenharmony_ci
2047bf215546Sopenharmony_ci   if (dim == 2 && !array) {
2048bf215546Sopenharmony_ci      v = loadSuInfo32(ind, slot, NVC0_SU_INFO_UNK1C, su->tex.bindless);
2049bf215546Sopenharmony_ci      src[2] = bld.mkOp2v(OP_SHR, TYPE_U32, bld.getSSA(),
2050bf215546Sopenharmony_ci                          v, bld.loadImm(NULL, 16));
2051bf215546Sopenharmony_ci
2052bf215546Sopenharmony_ci      v = loadSuInfo32(ind, slot, NVC0_SU_INFO_DIM(2), su->tex.bindless);
2053bf215546Sopenharmony_ci      bld.mkOp3(OP_SUCLAMP, TYPE_S32, src[2], src[2], v, zero)
2054bf215546Sopenharmony_ci         ->subOp = NV50_IR_SUBOP_SUCLAMP_SD(0, 2);
2055bf215546Sopenharmony_ci   }
2056bf215546Sopenharmony_ci
2057bf215546Sopenharmony_ci   // set predicate output
2058bf215546Sopenharmony_ci   if (su->tex.target == TEX_TARGET_BUFFER) {
2059bf215546Sopenharmony_ci      src[0]->getInsn()->setFlagsDef(1, pred);
2060bf215546Sopenharmony_ci   } else
2061bf215546Sopenharmony_ci   if (array) {
2062bf215546Sopenharmony_ci      p1 = bld.getSSA(1, FILE_PREDICATE);
2063bf215546Sopenharmony_ci      src[dim]->getInsn()->setFlagsDef(1, p1);
2064bf215546Sopenharmony_ci   }
2065bf215546Sopenharmony_ci
2066bf215546Sopenharmony_ci   // calculate pixel offset
2067bf215546Sopenharmony_ci   if (dim == 1) {
2068bf215546Sopenharmony_ci      y = z = zero;
2069bf215546Sopenharmony_ci      if (su->tex.target != TEX_TARGET_BUFFER)
2070bf215546Sopenharmony_ci         bld.mkOp2(OP_AND, TYPE_U32, off, src[0], bld.loadImm(NULL, 0xffff));
2071bf215546Sopenharmony_ci   } else {
2072bf215546Sopenharmony_ci      y = src[1];
2073bf215546Sopenharmony_ci      z = src[2];
2074bf215546Sopenharmony_ci
2075bf215546Sopenharmony_ci      v = loadSuInfo32(ind, slot, NVC0_SU_INFO_UNK1C, su->tex.bindless);
2076bf215546Sopenharmony_ci      bld.mkOp3(OP_MADSP, TYPE_U32, off, src[2], v, src[1])
2077bf215546Sopenharmony_ci         ->subOp = NV50_IR_SUBOP_MADSP(4,4,8); // u16l u16l u16l
2078bf215546Sopenharmony_ci
2079bf215546Sopenharmony_ci      v = loadSuInfo32(ind, slot, NVC0_SU_INFO_PITCH, su->tex.bindless);
2080bf215546Sopenharmony_ci      bld.mkOp3(OP_MADSP, TYPE_U32, off, off, v, src[0])
2081bf215546Sopenharmony_ci         ->subOp = array ?
2082bf215546Sopenharmony_ci         NV50_IR_SUBOP_MADSP_SD : NV50_IR_SUBOP_MADSP(0,2,8); // u32 u16l u16l
2083bf215546Sopenharmony_ci   }
2084bf215546Sopenharmony_ci
2085bf215546Sopenharmony_ci   // calculate effective address part 1
2086bf215546Sopenharmony_ci   if (su->tex.target == TEX_TARGET_BUFFER) {
2087bf215546Sopenharmony_ci      if (raw) {
2088bf215546Sopenharmony_ci         bf = src[0];
2089bf215546Sopenharmony_ci      } else {
2090bf215546Sopenharmony_ci         v = loadSuInfo32(ind, slot, NVC0_SU_INFO_FMT, su->tex.bindless);
2091bf215546Sopenharmony_ci         bld.mkOp3(OP_VSHL, TYPE_U32, bf, src[0], v, zero)
2092bf215546Sopenharmony_ci            ->subOp = NV50_IR_SUBOP_V1(7,6,8|2);
2093bf215546Sopenharmony_ci      }
2094bf215546Sopenharmony_ci   } else {
2095bf215546Sopenharmony_ci      uint16_t subOp = 0;
2096bf215546Sopenharmony_ci
2097bf215546Sopenharmony_ci      switch (dim) {
2098bf215546Sopenharmony_ci      case 1:
2099bf215546Sopenharmony_ci         break;
2100bf215546Sopenharmony_ci      case 2:
2101bf215546Sopenharmony_ci         if (array) {
2102bf215546Sopenharmony_ci            z = off;
2103bf215546Sopenharmony_ci         } else {
2104bf215546Sopenharmony_ci            subOp = NV50_IR_SUBOP_SUBFM_3D;
2105bf215546Sopenharmony_ci         }
2106bf215546Sopenharmony_ci         break;
2107bf215546Sopenharmony_ci      default:
2108bf215546Sopenharmony_ci         subOp = NV50_IR_SUBOP_SUBFM_3D;
2109bf215546Sopenharmony_ci         assert(dim == 3);
2110bf215546Sopenharmony_ci         break;
2111bf215546Sopenharmony_ci      }
2112bf215546Sopenharmony_ci      insn = bld.mkOp3(OP_SUBFM, TYPE_U32, bf, src[0], y, z);
2113bf215546Sopenharmony_ci      insn->subOp = subOp;
2114bf215546Sopenharmony_ci      insn->setFlagsDef(1, pred);
2115bf215546Sopenharmony_ci   }
2116bf215546Sopenharmony_ci
2117bf215546Sopenharmony_ci   // part 2
2118bf215546Sopenharmony_ci   v = loadSuInfo32(ind, slot, NVC0_SU_INFO_ADDR, su->tex.bindless);
2119bf215546Sopenharmony_ci
2120bf215546Sopenharmony_ci   if (su->tex.target == TEX_TARGET_BUFFER) {
2121bf215546Sopenharmony_ci      eau = v;
2122bf215546Sopenharmony_ci   } else {
2123bf215546Sopenharmony_ci      eau = bld.mkOp3v(OP_SUEAU, TYPE_U32, bld.getScratch(4), off, bf, v);
2124bf215546Sopenharmony_ci   }
2125bf215546Sopenharmony_ci   // add array layer offset
2126bf215546Sopenharmony_ci   if (array) {
2127bf215546Sopenharmony_ci      v = loadSuInfo32(ind, slot, NVC0_SU_INFO_ARRAY, su->tex.bindless);
2128bf215546Sopenharmony_ci      if (dim == 1)
2129bf215546Sopenharmony_ci         bld.mkOp3(OP_MADSP, TYPE_U32, eau, src[1], v, eau)
2130bf215546Sopenharmony_ci            ->subOp = NV50_IR_SUBOP_MADSP(4,0,0); // u16 u24 u32
2131bf215546Sopenharmony_ci      else
2132bf215546Sopenharmony_ci         bld.mkOp3(OP_MADSP, TYPE_U32, eau, v, src[2], eau)
2133bf215546Sopenharmony_ci            ->subOp = NV50_IR_SUBOP_MADSP(0,0,0); // u32 u24 u32
2134bf215546Sopenharmony_ci      // combine predicates
2135bf215546Sopenharmony_ci      assert(p1);
2136bf215546Sopenharmony_ci      bld.mkOp2(OP_OR, TYPE_U8, pred, pred, p1);
2137bf215546Sopenharmony_ci   }
2138bf215546Sopenharmony_ci
2139bf215546Sopenharmony_ci   if (atom) {
2140bf215546Sopenharmony_ci      Value *lo = bf;
2141bf215546Sopenharmony_ci      if (su->tex.target == TEX_TARGET_BUFFER) {
2142bf215546Sopenharmony_ci         lo = zero;
2143bf215546Sopenharmony_ci         bld.mkMov(off, bf);
2144bf215546Sopenharmony_ci      }
2145bf215546Sopenharmony_ci      //  bf == g[] address & 0xff
2146bf215546Sopenharmony_ci      // eau == g[] address >> 8
2147bf215546Sopenharmony_ci      bld.mkOp3(OP_PERMT, TYPE_U32,  bf,   lo, bld.loadImm(NULL, 0x6540), eau);
2148bf215546Sopenharmony_ci      bld.mkOp3(OP_PERMT, TYPE_U32, eau, zero, bld.loadImm(NULL, 0x0007), eau);
2149bf215546Sopenharmony_ci   } else
2150bf215546Sopenharmony_ci   if (su->op == OP_SULDP && su->tex.target == TEX_TARGET_BUFFER) {
2151bf215546Sopenharmony_ci      // Convert from u32 to u8 address format, which is what the library code
2152bf215546Sopenharmony_ci      // doing SULDP currently uses.
2153bf215546Sopenharmony_ci      // XXX: can SUEAU do this ?
2154bf215546Sopenharmony_ci      // XXX: does it matter that we don't mask high bytes in bf ?
2155bf215546Sopenharmony_ci      // Grrr.
2156bf215546Sopenharmony_ci      bld.mkOp2(OP_SHR, TYPE_U32, off, bf, bld.mkImm(8));
2157bf215546Sopenharmony_ci      bld.mkOp2(OP_ADD, TYPE_U32, eau, eau, off);
2158bf215546Sopenharmony_ci   }
2159bf215546Sopenharmony_ci
2160bf215546Sopenharmony_ci   bld.mkOp2(OP_MERGE, TYPE_U64, addr, bf, eau);
2161bf215546Sopenharmony_ci
2162bf215546Sopenharmony_ci   if (atom && su->tex.target == TEX_TARGET_BUFFER)
2163bf215546Sopenharmony_ci      bld.mkOp2(OP_ADD, TYPE_U64, addr, addr, off);
2164bf215546Sopenharmony_ci
2165bf215546Sopenharmony_ci   // let's just set it 0 for raw access and hope it works
2166bf215546Sopenharmony_ci   v = raw ?
2167bf215546Sopenharmony_ci      bld.mkImm(0) : loadSuInfo32(ind, slot, NVC0_SU_INFO_FMT, su->tex.bindless);
2168bf215546Sopenharmony_ci
2169bf215546Sopenharmony_ci   // get rid of old coordinate sources, make space for fmt info and predicate
2170bf215546Sopenharmony_ci   su->moveSources(arg, 3 - arg);
2171bf215546Sopenharmony_ci   // set 64 bit address and 32-bit format sources
2172bf215546Sopenharmony_ci   su->setSrc(0, addr);
2173bf215546Sopenharmony_ci   su->setSrc(1, v);
2174bf215546Sopenharmony_ci   su->setSrc(2, pred);
2175bf215546Sopenharmony_ci   su->setIndirectR(NULL);
2176bf215546Sopenharmony_ci
2177bf215546Sopenharmony_ci   // prevent read fault when the image is not actually bound
2178bf215546Sopenharmony_ci   CmpInstruction *pred1 =
2179bf215546Sopenharmony_ci      bld.mkCmp(OP_SET, CC_EQ, TYPE_U32, bld.getSSA(1, FILE_PREDICATE),
2180bf215546Sopenharmony_ci                TYPE_U32, bld.mkImm(0),
2181bf215546Sopenharmony_ci                loadSuInfo32(ind, slot, NVC0_SU_INFO_ADDR, su->tex.bindless));
2182bf215546Sopenharmony_ci
2183bf215546Sopenharmony_ci   if (su->op != OP_SUSTP && su->tex.format) {
2184bf215546Sopenharmony_ci      const TexInstruction::ImgFormatDesc *format = su->tex.format;
2185bf215546Sopenharmony_ci      int blockwidth = format->bits[0] + format->bits[1] +
2186bf215546Sopenharmony_ci                       format->bits[2] + format->bits[3];
2187bf215546Sopenharmony_ci
2188bf215546Sopenharmony_ci      // make sure that the format doesn't mismatch
2189bf215546Sopenharmony_ci      assert(format->components != 0);
2190bf215546Sopenharmony_ci      bld.mkCmp(OP_SET_OR, CC_NE, TYPE_U32, pred1->getDef(0),
2191bf215546Sopenharmony_ci                TYPE_U32, bld.loadImm(NULL, blockwidth / 8),
2192bf215546Sopenharmony_ci                loadSuInfo32(ind, slot, NVC0_SU_INFO_BSIZE, su->tex.bindless),
2193bf215546Sopenharmony_ci                pred1->getDef(0));
2194bf215546Sopenharmony_ci   }
2195bf215546Sopenharmony_ci   su->setPredicate(CC_NOT_P, pred1->getDef(0));
2196bf215546Sopenharmony_ci
2197bf215546Sopenharmony_ci   // TODO: initialize def values to 0 when the surface operation is not
2198bf215546Sopenharmony_ci   // performed (not needed for stores). Also, fix the "address bounds test"
2199bf215546Sopenharmony_ci   // subtests from arb_shader_image_load_store-invalid for buffers, because it
2200bf215546Sopenharmony_ci   // seems like that the predicate is not correctly set by suclamp.
2201bf215546Sopenharmony_ci}
2202bf215546Sopenharmony_ci
2203bf215546Sopenharmony_cistatic DataType
2204bf215546Sopenharmony_cigetSrcType(const TexInstruction::ImgFormatDesc *t, int c)
2205bf215546Sopenharmony_ci{
2206bf215546Sopenharmony_ci   switch (t->type) {
2207bf215546Sopenharmony_ci   case FLOAT: return t->bits[c] == 16 ? TYPE_F16 : TYPE_F32;
2208bf215546Sopenharmony_ci   case UNORM: return t->bits[c] == 8 ? TYPE_U8 : TYPE_U16;
2209bf215546Sopenharmony_ci   case SNORM: return t->bits[c] == 8 ? TYPE_S8 : TYPE_S16;
2210bf215546Sopenharmony_ci   case UINT:
2211bf215546Sopenharmony_ci      return (t->bits[c] == 8 ? TYPE_U8 :
2212bf215546Sopenharmony_ci              (t->bits[c] == 16 ? TYPE_U16 : TYPE_U32));
2213bf215546Sopenharmony_ci   case SINT:
2214bf215546Sopenharmony_ci      return (t->bits[c] == 8 ? TYPE_S8 :
2215bf215546Sopenharmony_ci              (t->bits[c] == 16 ? TYPE_S16 : TYPE_S32));
2216bf215546Sopenharmony_ci   }
2217bf215546Sopenharmony_ci   return TYPE_NONE;
2218bf215546Sopenharmony_ci}
2219bf215546Sopenharmony_ci
2220bf215546Sopenharmony_cistatic DataType
2221bf215546Sopenharmony_cigetDestType(const ImgType type) {
2222bf215546Sopenharmony_ci   switch (type) {
2223bf215546Sopenharmony_ci   case FLOAT:
2224bf215546Sopenharmony_ci   case UNORM:
2225bf215546Sopenharmony_ci   case SNORM:
2226bf215546Sopenharmony_ci      return TYPE_F32;
2227bf215546Sopenharmony_ci   case UINT:
2228bf215546Sopenharmony_ci      return TYPE_U32;
2229bf215546Sopenharmony_ci   case SINT:
2230bf215546Sopenharmony_ci      return TYPE_S32;
2231bf215546Sopenharmony_ci   default:
2232bf215546Sopenharmony_ci      assert(!"Impossible type");
2233bf215546Sopenharmony_ci      return TYPE_NONE;
2234bf215546Sopenharmony_ci   }
2235bf215546Sopenharmony_ci}
2236bf215546Sopenharmony_ci
2237bf215546Sopenharmony_civoid
2238bf215546Sopenharmony_ciNVC0LoweringPass::convertSurfaceFormat(TexInstruction *su, Instruction **loaded)
2239bf215546Sopenharmony_ci{
2240bf215546Sopenharmony_ci   const TexInstruction::ImgFormatDesc *format = su->tex.format;
2241bf215546Sopenharmony_ci   int width = format->bits[0] + format->bits[1] +
2242bf215546Sopenharmony_ci      format->bits[2] + format->bits[3];
2243bf215546Sopenharmony_ci   Value *untypedDst[4] = {};
2244bf215546Sopenharmony_ci   Value *typedDst[4] = {};
2245bf215546Sopenharmony_ci
2246bf215546Sopenharmony_ci   // We must convert this to a generic load.
2247bf215546Sopenharmony_ci   su->op = OP_SULDB;
2248bf215546Sopenharmony_ci
2249bf215546Sopenharmony_ci   su->dType = typeOfSize(width / 8);
2250bf215546Sopenharmony_ci   su->sType = TYPE_U8;
2251bf215546Sopenharmony_ci
2252bf215546Sopenharmony_ci   for (int i = 0; i < width / 32; i++)
2253bf215546Sopenharmony_ci      untypedDst[i] = bld.getSSA();
2254bf215546Sopenharmony_ci   if (width < 32)
2255bf215546Sopenharmony_ci      untypedDst[0] = bld.getSSA();
2256bf215546Sopenharmony_ci
2257bf215546Sopenharmony_ci   if (loaded && loaded[0]) {
2258bf215546Sopenharmony_ci      for (int i = 0; i < 4; i++) {
2259bf215546Sopenharmony_ci         if (loaded[i])
2260bf215546Sopenharmony_ci            typedDst[i] = loaded[i]->getDef(0);
2261bf215546Sopenharmony_ci      }
2262bf215546Sopenharmony_ci   } else {
2263bf215546Sopenharmony_ci      for (int i = 0; i < 4; i++) {
2264bf215546Sopenharmony_ci         typedDst[i] = su->getDef(i);
2265bf215546Sopenharmony_ci      }
2266bf215546Sopenharmony_ci   }
2267bf215546Sopenharmony_ci
2268bf215546Sopenharmony_ci   // Set the untyped dsts as the su's destinations
2269bf215546Sopenharmony_ci   if (loaded && loaded[0]) {
2270bf215546Sopenharmony_ci      for (int i = 0; i < 4; i++)
2271bf215546Sopenharmony_ci         if (loaded[i])
2272bf215546Sopenharmony_ci            loaded[i]->setDef(0, untypedDst[i]);
2273bf215546Sopenharmony_ci   } else {
2274bf215546Sopenharmony_ci      for (int i = 0; i < 4; i++)
2275bf215546Sopenharmony_ci         su->setDef(i, untypedDst[i]);
2276bf215546Sopenharmony_ci
2277bf215546Sopenharmony_ci      bld.setPosition(su, true);
2278bf215546Sopenharmony_ci   }
2279bf215546Sopenharmony_ci
2280bf215546Sopenharmony_ci   // Unpack each component into the typed dsts
2281bf215546Sopenharmony_ci   int bits = 0;
2282bf215546Sopenharmony_ci   for (int i = 0; i < 4; bits += format->bits[i], i++) {
2283bf215546Sopenharmony_ci      if (!typedDst[i])
2284bf215546Sopenharmony_ci         continue;
2285bf215546Sopenharmony_ci
2286bf215546Sopenharmony_ci      if (loaded && loaded[0])
2287bf215546Sopenharmony_ci         bld.setPosition(loaded[i], true);
2288bf215546Sopenharmony_ci
2289bf215546Sopenharmony_ci      if (i >= format->components) {
2290bf215546Sopenharmony_ci         if (format->type == FLOAT ||
2291bf215546Sopenharmony_ci             format->type == UNORM ||
2292bf215546Sopenharmony_ci             format->type == SNORM)
2293bf215546Sopenharmony_ci            bld.loadImm(typedDst[i], i == 3 ? 1.0f : 0.0f);
2294bf215546Sopenharmony_ci         else
2295bf215546Sopenharmony_ci            bld.loadImm(typedDst[i], i == 3 ? 1 : 0);
2296bf215546Sopenharmony_ci         continue;
2297bf215546Sopenharmony_ci      }
2298bf215546Sopenharmony_ci
2299bf215546Sopenharmony_ci      // Get just that component's data into the relevant place
2300bf215546Sopenharmony_ci      if (format->bits[i] == 32)
2301bf215546Sopenharmony_ci         bld.mkMov(typedDst[i], untypedDst[i]);
2302bf215546Sopenharmony_ci      else if (format->bits[i] == 16)
2303bf215546Sopenharmony_ci         bld.mkCvt(OP_CVT, getDestType(format->type), typedDst[i],
2304bf215546Sopenharmony_ci                   getSrcType(format, i), untypedDst[i / 2])
2305bf215546Sopenharmony_ci         ->subOp = (i & 1) << (format->type == FLOAT ? 0 : 1);
2306bf215546Sopenharmony_ci      else if (format->bits[i] == 8)
2307bf215546Sopenharmony_ci         bld.mkCvt(OP_CVT, getDestType(format->type), typedDst[i],
2308bf215546Sopenharmony_ci                   getSrcType(format, i), untypedDst[0])->subOp = i;
2309bf215546Sopenharmony_ci      else {
2310bf215546Sopenharmony_ci         bld.mkOp2(OP_EXTBF, TYPE_U32, typedDst[i], untypedDst[bits / 32],
2311bf215546Sopenharmony_ci                   bld.mkImm((bits % 32) | (format->bits[i] << 8)));
2312bf215546Sopenharmony_ci         if (format->type == UNORM || format->type == SNORM)
2313bf215546Sopenharmony_ci            bld.mkCvt(OP_CVT, TYPE_F32, typedDst[i], getSrcType(format, i), typedDst[i]);
2314bf215546Sopenharmony_ci      }
2315bf215546Sopenharmony_ci
2316bf215546Sopenharmony_ci      // Normalize / convert as necessary
2317bf215546Sopenharmony_ci      if (format->type == UNORM)
2318bf215546Sopenharmony_ci         bld.mkOp2(OP_MUL, TYPE_F32, typedDst[i], typedDst[i], bld.loadImm(NULL, 1.0f / ((1 << format->bits[i]) - 1)));
2319bf215546Sopenharmony_ci      else if (format->type == SNORM)
2320bf215546Sopenharmony_ci         bld.mkOp2(OP_MUL, TYPE_F32, typedDst[i], typedDst[i], bld.loadImm(NULL, 1.0f / ((1 << (format->bits[i] - 1)) - 1)));
2321bf215546Sopenharmony_ci      else if (format->type == FLOAT && format->bits[i] < 16) {
2322bf215546Sopenharmony_ci         bld.mkOp2(OP_SHL, TYPE_U32, typedDst[i], typedDst[i], bld.loadImm(NULL, 15 - format->bits[i]));
2323bf215546Sopenharmony_ci         bld.mkCvt(OP_CVT, TYPE_F32, typedDst[i], TYPE_F16, typedDst[i]);
2324bf215546Sopenharmony_ci      }
2325bf215546Sopenharmony_ci   }
2326bf215546Sopenharmony_ci
2327bf215546Sopenharmony_ci   if (format->bgra) {
2328bf215546Sopenharmony_ci      std::swap(typedDst[0], typedDst[2]);
2329bf215546Sopenharmony_ci   }
2330bf215546Sopenharmony_ci}
2331bf215546Sopenharmony_ci
2332bf215546Sopenharmony_civoid
2333bf215546Sopenharmony_ciNVC0LoweringPass::insertOOBSurfaceOpResult(TexInstruction *su)
2334bf215546Sopenharmony_ci{
2335bf215546Sopenharmony_ci   if (!su->getPredicate())
2336bf215546Sopenharmony_ci      return;
2337bf215546Sopenharmony_ci
2338bf215546Sopenharmony_ci   bld.setPosition(su, true);
2339bf215546Sopenharmony_ci
2340bf215546Sopenharmony_ci   for (unsigned i = 0; su->defExists(i); ++i) {
2341bf215546Sopenharmony_ci      Value *def = su->getDef(i);
2342bf215546Sopenharmony_ci      Value *newDef = bld.getSSA();
2343bf215546Sopenharmony_ci      su->setDef(i, newDef);
2344bf215546Sopenharmony_ci
2345bf215546Sopenharmony_ci      Instruction *mov = bld.mkMov(bld.getSSA(), bld.loadImm(NULL, 0));
2346bf215546Sopenharmony_ci      assert(su->cc == CC_NOT_P);
2347bf215546Sopenharmony_ci      mov->setPredicate(CC_P, su->getPredicate());
2348bf215546Sopenharmony_ci      Instruction *uni = bld.mkOp2(OP_UNION, TYPE_U32, bld.getSSA(), newDef, mov->getDef(0));
2349bf215546Sopenharmony_ci      bld.mkMov(def, uni->getDef(0));
2350bf215546Sopenharmony_ci   }
2351bf215546Sopenharmony_ci}
2352bf215546Sopenharmony_ci
2353bf215546Sopenharmony_civoid
2354bf215546Sopenharmony_ciNVC0LoweringPass::handleSurfaceOpNVE4(TexInstruction *su)
2355bf215546Sopenharmony_ci{
2356bf215546Sopenharmony_ci   processSurfaceCoordsNVE4(su);
2357bf215546Sopenharmony_ci
2358bf215546Sopenharmony_ci   if (su->op == OP_SULDP) {
2359bf215546Sopenharmony_ci      convertSurfaceFormat(su, NULL);
2360bf215546Sopenharmony_ci      insertOOBSurfaceOpResult(su);
2361bf215546Sopenharmony_ci   }
2362bf215546Sopenharmony_ci
2363bf215546Sopenharmony_ci   if (su->op == OP_SUREDB || su->op == OP_SUREDP) {
2364bf215546Sopenharmony_ci      assert(su->getPredicate());
2365bf215546Sopenharmony_ci      Value *pred =
2366bf215546Sopenharmony_ci         bld.mkOp2v(OP_OR, TYPE_U8, bld.getScratch(1, FILE_PREDICATE),
2367bf215546Sopenharmony_ci                    su->getPredicate(), su->getSrc(2));
2368bf215546Sopenharmony_ci
2369bf215546Sopenharmony_ci      Instruction *red = bld.mkOp(OP_ATOM, su->dType, bld.getSSA());
2370bf215546Sopenharmony_ci      red->subOp = su->subOp;
2371bf215546Sopenharmony_ci      red->setSrc(0, bld.mkSymbol(FILE_MEMORY_GLOBAL, 0, TYPE_U32, 0));
2372bf215546Sopenharmony_ci      red->setSrc(1, su->getSrc(3));
2373bf215546Sopenharmony_ci      if (su->subOp == NV50_IR_SUBOP_ATOM_CAS)
2374bf215546Sopenharmony_ci         red->setSrc(2, su->getSrc(4));
2375bf215546Sopenharmony_ci      red->setIndirect(0, 0, su->getSrc(0));
2376bf215546Sopenharmony_ci
2377bf215546Sopenharmony_ci      // make sure to initialize dst value when the atomic operation is not
2378bf215546Sopenharmony_ci      // performed
2379bf215546Sopenharmony_ci      Instruction *mov = bld.mkMov(bld.getSSA(), bld.loadImm(NULL, 0));
2380bf215546Sopenharmony_ci
2381bf215546Sopenharmony_ci      assert(su->cc == CC_NOT_P);
2382bf215546Sopenharmony_ci      red->setPredicate(su->cc, pred);
2383bf215546Sopenharmony_ci      mov->setPredicate(CC_P, pred);
2384bf215546Sopenharmony_ci
2385bf215546Sopenharmony_ci      bld.mkOp2(OP_UNION, TYPE_U32, su->getDef(0),
2386bf215546Sopenharmony_ci                red->getDef(0), mov->getDef(0));
2387bf215546Sopenharmony_ci
2388bf215546Sopenharmony_ci      delete_Instruction(bld.getProgram(), su);
2389bf215546Sopenharmony_ci
2390bf215546Sopenharmony_ci      handleATOMCctl(red);
2391bf215546Sopenharmony_ci      handleCasExch(red);
2392bf215546Sopenharmony_ci   }
2393bf215546Sopenharmony_ci
2394bf215546Sopenharmony_ci   if (su->op == OP_SUSTB || su->op == OP_SUSTP)
2395bf215546Sopenharmony_ci      su->sType = (su->tex.target == TEX_TARGET_BUFFER) ? TYPE_U32 : TYPE_U8;
2396bf215546Sopenharmony_ci}
2397bf215546Sopenharmony_ci
2398bf215546Sopenharmony_civoid
2399bf215546Sopenharmony_ciNVC0LoweringPass::processSurfaceCoordsNVC0(TexInstruction *su)
2400bf215546Sopenharmony_ci{
2401bf215546Sopenharmony_ci   const int slot = su->tex.r;
2402bf215546Sopenharmony_ci   const int dim = su->tex.target.getDim();
2403bf215546Sopenharmony_ci   const int arg = dim + (su->tex.target.isArray() || su->tex.target.isCube());
2404bf215546Sopenharmony_ci   int c;
2405bf215546Sopenharmony_ci   Value *zero = bld.mkImm(0);
2406bf215546Sopenharmony_ci   Value *src[3];
2407bf215546Sopenharmony_ci   Value *v;
2408bf215546Sopenharmony_ci   Value *ind = su->getIndirectR();
2409bf215546Sopenharmony_ci
2410bf215546Sopenharmony_ci   bld.setPosition(su, false);
2411bf215546Sopenharmony_ci
2412bf215546Sopenharmony_ci   adjustCoordinatesMS(su);
2413bf215546Sopenharmony_ci
2414bf215546Sopenharmony_ci   if (ind) {
2415bf215546Sopenharmony_ci      Value *ptr;
2416bf215546Sopenharmony_ci      ptr = bld.mkOp2v(OP_ADD, TYPE_U32, bld.getSSA(), ind, bld.mkImm(su->tex.r));
2417bf215546Sopenharmony_ci      ptr = bld.mkOp2v(OP_AND, TYPE_U32, bld.getSSA(), ptr, bld.mkImm(7));
2418bf215546Sopenharmony_ci      su->setIndirectR(ptr);
2419bf215546Sopenharmony_ci   }
2420bf215546Sopenharmony_ci
2421bf215546Sopenharmony_ci   // get surface coordinates
2422bf215546Sopenharmony_ci   for (c = 0; c < arg; ++c)
2423bf215546Sopenharmony_ci      src[c] = su->getSrc(c);
2424bf215546Sopenharmony_ci   for (; c < 3; ++c)
2425bf215546Sopenharmony_ci      src[c] = zero;
2426bf215546Sopenharmony_ci
2427bf215546Sopenharmony_ci   // calculate pixel offset
2428bf215546Sopenharmony_ci   if (su->op == OP_SULDP || su->op == OP_SUREDP) {
2429bf215546Sopenharmony_ci      v = loadSuInfo32(ind, slot, NVC0_SU_INFO_BSIZE, su->tex.bindless);
2430bf215546Sopenharmony_ci      su->setSrc(0, (src[0] = bld.mkOp2v(OP_SHL, TYPE_U32, bld.getSSA(), src[0], v)));
2431bf215546Sopenharmony_ci   }
2432bf215546Sopenharmony_ci
2433bf215546Sopenharmony_ci   // add array layer offset
2434bf215546Sopenharmony_ci   if (su->tex.target.isArray() || su->tex.target.isCube()) {
2435bf215546Sopenharmony_ci      v = loadSuInfo32(ind, slot, NVC0_SU_INFO_ARRAY, su->tex.bindless);
2436bf215546Sopenharmony_ci      assert(dim > 1);
2437bf215546Sopenharmony_ci      su->setSrc(2, (src[2] = bld.mkOp2v(OP_MUL, TYPE_U32, bld.getSSA(), src[2], v)));
2438bf215546Sopenharmony_ci   }
2439bf215546Sopenharmony_ci
2440bf215546Sopenharmony_ci   // 3d is special-cased. Note that a single "slice" of a 3d image may
2441bf215546Sopenharmony_ci   // also be attached as 2d, so we have to do the same 3d processing for
2442bf215546Sopenharmony_ci   // 2d as well, just in case. In order to remap a 3d image onto a 2d
2443bf215546Sopenharmony_ci   // image, we have to retile it "by hand".
2444bf215546Sopenharmony_ci   if (su->tex.target == TEX_TARGET_3D || su->tex.target == TEX_TARGET_2D) {
2445bf215546Sopenharmony_ci      Value *z = loadSuInfo32(ind, slot, NVC0_SU_INFO_UNK1C, su->tex.bindless);
2446bf215546Sopenharmony_ci      Value *y_size_aligned =
2447bf215546Sopenharmony_ci         bld.mkOp2v(OP_AND, TYPE_U32, bld.getSSA(),
2448bf215546Sopenharmony_ci                    loadSuInfo32(ind, slot, NVC0_SU_INFO_DIM_Y, su->tex.bindless),
2449bf215546Sopenharmony_ci                    bld.loadImm(NULL, 0x0000ffff));
2450bf215546Sopenharmony_ci      // Add the z coordinate for actual 3d-images
2451bf215546Sopenharmony_ci      if (dim > 2)
2452bf215546Sopenharmony_ci         src[2] = bld.mkOp2v(OP_ADD, TYPE_U32, bld.getSSA(), z, src[2]);
2453bf215546Sopenharmony_ci      else
2454bf215546Sopenharmony_ci         src[2] = z;
2455bf215546Sopenharmony_ci
2456bf215546Sopenharmony_ci      // Compute the surface parameters from tile shifts
2457bf215546Sopenharmony_ci      Value *tile_shift[3];
2458bf215546Sopenharmony_ci      Value *tile_extbf[3];
2459bf215546Sopenharmony_ci      // Fetch the "real" tiling parameters of the underlying surface
2460bf215546Sopenharmony_ci      for (int i = 0; i < 3; i++) {
2461bf215546Sopenharmony_ci         tile_extbf[i] =
2462bf215546Sopenharmony_ci            bld.mkOp2v(OP_SHR, TYPE_U32, bld.getSSA(),
2463bf215546Sopenharmony_ci                       loadSuInfo32(ind, slot, NVC0_SU_INFO_DIM(i), su->tex.bindless),
2464bf215546Sopenharmony_ci                       bld.loadImm(NULL, 16));
2465bf215546Sopenharmony_ci         tile_shift[i] =
2466bf215546Sopenharmony_ci            bld.mkOp2v(OP_SHR, TYPE_U32, bld.getSSA(),
2467bf215546Sopenharmony_ci                       loadSuInfo32(ind, slot, NVC0_SU_INFO_DIM(i), su->tex.bindless),
2468bf215546Sopenharmony_ci                       bld.loadImm(NULL, 24));
2469bf215546Sopenharmony_ci      }
2470bf215546Sopenharmony_ci
2471bf215546Sopenharmony_ci      // However for load/atomics, we use byte-indexing. And for byte
2472bf215546Sopenharmony_ci      // indexing, the X tile size is always the same. This leads to slightly
2473bf215546Sopenharmony_ci      // better code.
2474bf215546Sopenharmony_ci      if (su->op == OP_SULDP || su->op == OP_SUREDP) {
2475bf215546Sopenharmony_ci         tile_extbf[0] = bld.loadImm(NULL, 0x600);
2476bf215546Sopenharmony_ci         tile_shift[0] = bld.loadImm(NULL, 6);
2477bf215546Sopenharmony_ci      }
2478bf215546Sopenharmony_ci
2479bf215546Sopenharmony_ci      // Compute the location of given coordinate, both inside the tile as
2480bf215546Sopenharmony_ci      // well as which (linearly-laid out) tile it's in.
2481bf215546Sopenharmony_ci      Value *coord_in_tile[3];
2482bf215546Sopenharmony_ci      Value *tile[3];
2483bf215546Sopenharmony_ci      for (int i = 0; i < 3; i++) {
2484bf215546Sopenharmony_ci         coord_in_tile[i] = bld.mkOp2v(OP_EXTBF, TYPE_U32, bld.getSSA(), src[i], tile_extbf[i]);
2485bf215546Sopenharmony_ci         tile[i] = bld.mkOp2v(OP_SHR, TYPE_U32, bld.getSSA(), src[i], tile_shift[i]);
2486bf215546Sopenharmony_ci      }
2487bf215546Sopenharmony_ci
2488bf215546Sopenharmony_ci      // Based on the "real" tiling parameters, compute x/y coordinates in the
2489bf215546Sopenharmony_ci      // larger surface with 2d tiling that was supplied to the hardware. This
2490bf215546Sopenharmony_ci      // was determined and verified with the help of the tiling pseudocode in
2491bf215546Sopenharmony_ci      // the envytools docs.
2492bf215546Sopenharmony_ci      //
2493bf215546Sopenharmony_ci      // adj_x = x_coord_in_tile + x_tile * x_tile_size * z_tile_size +
2494bf215546Sopenharmony_ci      //         z_coord_in_tile * x_tile_size
2495bf215546Sopenharmony_ci      // adj_y = y_coord_in_tile + y_tile * y_tile_size +
2496bf215546Sopenharmony_ci      //         z_tile * y_tile_size * y_tiles
2497bf215546Sopenharmony_ci      //
2498bf215546Sopenharmony_ci      // Note: STRIDE_Y = y_tile_size * y_tiles
2499bf215546Sopenharmony_ci
2500bf215546Sopenharmony_ci      su->setSrc(0, bld.mkOp2v(
2501bf215546Sopenharmony_ci            OP_ADD, TYPE_U32, bld.getSSA(),
2502bf215546Sopenharmony_ci            bld.mkOp2v(OP_ADD, TYPE_U32, bld.getSSA(),
2503bf215546Sopenharmony_ci                       coord_in_tile[0],
2504bf215546Sopenharmony_ci                       bld.mkOp2v(OP_SHL, TYPE_U32, bld.getSSA(),
2505bf215546Sopenharmony_ci                                  tile[0],
2506bf215546Sopenharmony_ci                                  bld.mkOp2v(OP_ADD, TYPE_U32, bld.getSSA(),
2507bf215546Sopenharmony_ci                                             tile_shift[2], tile_shift[0]))),
2508bf215546Sopenharmony_ci            bld.mkOp2v(OP_SHL, TYPE_U32, bld.getSSA(),
2509bf215546Sopenharmony_ci                       coord_in_tile[2], tile_shift[0])));
2510bf215546Sopenharmony_ci
2511bf215546Sopenharmony_ci      su->setSrc(1, bld.mkOp2v(
2512bf215546Sopenharmony_ci            OP_ADD, TYPE_U32, bld.getSSA(),
2513bf215546Sopenharmony_ci            bld.mkOp2v(OP_MUL, TYPE_U32, bld.getSSA(),
2514bf215546Sopenharmony_ci                       tile[2], y_size_aligned),
2515bf215546Sopenharmony_ci            bld.mkOp2v(OP_ADD, TYPE_U32, bld.getSSA(),
2516bf215546Sopenharmony_ci                       coord_in_tile[1],
2517bf215546Sopenharmony_ci                       bld.mkOp2v(OP_SHL, TYPE_U32, bld.getSSA(),
2518bf215546Sopenharmony_ci                                  tile[1], tile_shift[1]))));
2519bf215546Sopenharmony_ci
2520bf215546Sopenharmony_ci      if (su->tex.target == TEX_TARGET_3D) {
2521bf215546Sopenharmony_ci         su->moveSources(3, -1);
2522bf215546Sopenharmony_ci         su->tex.target = TEX_TARGET_2D;
2523bf215546Sopenharmony_ci      }
2524bf215546Sopenharmony_ci   }
2525bf215546Sopenharmony_ci
2526bf215546Sopenharmony_ci   // prevent read fault when the image is not actually bound
2527bf215546Sopenharmony_ci   CmpInstruction *pred =
2528bf215546Sopenharmony_ci      bld.mkCmp(OP_SET, CC_EQ, TYPE_U32, bld.getSSA(1, FILE_PREDICATE),
2529bf215546Sopenharmony_ci                TYPE_U32, bld.mkImm(0),
2530bf215546Sopenharmony_ci                loadSuInfo32(ind, slot, NVC0_SU_INFO_ADDR, su->tex.bindless));
2531bf215546Sopenharmony_ci   if (su->op != OP_SUSTP && su->tex.format) {
2532bf215546Sopenharmony_ci      const TexInstruction::ImgFormatDesc *format = su->tex.format;
2533bf215546Sopenharmony_ci      int blockwidth = format->bits[0] + format->bits[1] +
2534bf215546Sopenharmony_ci                       format->bits[2] + format->bits[3];
2535bf215546Sopenharmony_ci
2536bf215546Sopenharmony_ci      assert(format->components != 0);
2537bf215546Sopenharmony_ci      // make sure that the format doesn't mismatch when it's not FMT_NONE
2538bf215546Sopenharmony_ci      bld.mkCmp(OP_SET_OR, CC_NE, TYPE_U32, pred->getDef(0),
2539bf215546Sopenharmony_ci                TYPE_U32, bld.loadImm(NULL, ffs(blockwidth / 8) - 1),
2540bf215546Sopenharmony_ci                loadSuInfo32(ind, slot, NVC0_SU_INFO_BSIZE, su->tex.bindless),
2541bf215546Sopenharmony_ci                pred->getDef(0));
2542bf215546Sopenharmony_ci   }
2543bf215546Sopenharmony_ci   su->setPredicate(CC_NOT_P, pred->getDef(0));
2544bf215546Sopenharmony_ci}
2545bf215546Sopenharmony_ci
2546bf215546Sopenharmony_civoid
2547bf215546Sopenharmony_ciNVC0LoweringPass::handleSurfaceOpNVC0(TexInstruction *su)
2548bf215546Sopenharmony_ci{
2549bf215546Sopenharmony_ci   if (su->tex.target == TEX_TARGET_1D_ARRAY) {
2550bf215546Sopenharmony_ci      /* As 1d arrays also need 3 coordinates, switching to TEX_TARGET_2D_ARRAY
2551bf215546Sopenharmony_ci       * will simplify the lowering pass and the texture constraints. */
2552bf215546Sopenharmony_ci      su->moveSources(1, 1);
2553bf215546Sopenharmony_ci      su->setSrc(1, bld.loadImm(NULL, 0));
2554bf215546Sopenharmony_ci      su->tex.target = TEX_TARGET_2D_ARRAY;
2555bf215546Sopenharmony_ci   }
2556bf215546Sopenharmony_ci
2557bf215546Sopenharmony_ci   processSurfaceCoordsNVC0(su);
2558bf215546Sopenharmony_ci
2559bf215546Sopenharmony_ci   if (su->op == OP_SULDP) {
2560bf215546Sopenharmony_ci      convertSurfaceFormat(su, NULL);
2561bf215546Sopenharmony_ci      insertOOBSurfaceOpResult(su);
2562bf215546Sopenharmony_ci   }
2563bf215546Sopenharmony_ci
2564bf215546Sopenharmony_ci   if (su->op == OP_SUREDB || su->op == OP_SUREDP) {
2565bf215546Sopenharmony_ci      const int dim = su->tex.target.getDim();
2566bf215546Sopenharmony_ci      const int arg = dim + (su->tex.target.isArray() || su->tex.target.isCube());
2567bf215546Sopenharmony_ci      LValue *addr = bld.getSSA(8);
2568bf215546Sopenharmony_ci      Value *def = su->getDef(0);
2569bf215546Sopenharmony_ci
2570bf215546Sopenharmony_ci      su->op = OP_SULEA;
2571bf215546Sopenharmony_ci
2572bf215546Sopenharmony_ci      // Set the destination to the address
2573bf215546Sopenharmony_ci      su->dType = TYPE_U64;
2574bf215546Sopenharmony_ci      su->setDef(0, addr);
2575bf215546Sopenharmony_ci      su->setDef(1, su->getPredicate());
2576bf215546Sopenharmony_ci
2577bf215546Sopenharmony_ci      bld.setPosition(su, true);
2578bf215546Sopenharmony_ci
2579bf215546Sopenharmony_ci      // Perform the atomic op
2580bf215546Sopenharmony_ci      Instruction *red = bld.mkOp(OP_ATOM, su->sType, bld.getSSA());
2581bf215546Sopenharmony_ci      red->subOp = su->subOp;
2582bf215546Sopenharmony_ci      red->setSrc(0, bld.mkSymbol(FILE_MEMORY_GLOBAL, 0, su->sType, 0));
2583bf215546Sopenharmony_ci      red->setSrc(1, su->getSrc(arg));
2584bf215546Sopenharmony_ci      if (red->subOp == NV50_IR_SUBOP_ATOM_CAS)
2585bf215546Sopenharmony_ci         red->setSrc(2, su->getSrc(arg + 1));
2586bf215546Sopenharmony_ci      red->setIndirect(0, 0, addr);
2587bf215546Sopenharmony_ci
2588bf215546Sopenharmony_ci      // make sure to initialize dst value when the atomic operation is not
2589bf215546Sopenharmony_ci      // performed
2590bf215546Sopenharmony_ci      Instruction *mov = bld.mkMov(bld.getSSA(), bld.loadImm(NULL, 0));
2591bf215546Sopenharmony_ci
2592bf215546Sopenharmony_ci      assert(su->cc == CC_NOT_P);
2593bf215546Sopenharmony_ci      red->setPredicate(su->cc, su->getPredicate());
2594bf215546Sopenharmony_ci      mov->setPredicate(CC_P, su->getPredicate());
2595bf215546Sopenharmony_ci
2596bf215546Sopenharmony_ci      bld.mkOp2(OP_UNION, TYPE_U32, def, red->getDef(0), mov->getDef(0));
2597bf215546Sopenharmony_ci
2598bf215546Sopenharmony_ci      handleCasExch(red);
2599bf215546Sopenharmony_ci   }
2600bf215546Sopenharmony_ci}
2601bf215546Sopenharmony_ci
2602bf215546Sopenharmony_ciTexInstruction *
2603bf215546Sopenharmony_ciNVC0LoweringPass::processSurfaceCoordsGM107(TexInstruction *su, Instruction *ret[4])
2604bf215546Sopenharmony_ci{
2605bf215546Sopenharmony_ci   const int slot = su->tex.r;
2606bf215546Sopenharmony_ci   const int dim = su->tex.target.getDim();
2607bf215546Sopenharmony_ci   const bool array = su->tex.target.isArray() || su->tex.target.isCube();
2608bf215546Sopenharmony_ci   const int arg = dim + array;
2609bf215546Sopenharmony_ci   Value *ind = su->getIndirectR();
2610bf215546Sopenharmony_ci   Value *handle;
2611bf215546Sopenharmony_ci   Instruction *pred = NULL, *pred2d = NULL;
2612bf215546Sopenharmony_ci   int pos = 0;
2613bf215546Sopenharmony_ci
2614bf215546Sopenharmony_ci   bld.setPosition(su, false);
2615bf215546Sopenharmony_ci
2616bf215546Sopenharmony_ci   adjustCoordinatesMS(su);
2617bf215546Sopenharmony_ci
2618bf215546Sopenharmony_ci   // add texture handle
2619bf215546Sopenharmony_ci   switch (su->op) {
2620bf215546Sopenharmony_ci   case OP_SUSTP:
2621bf215546Sopenharmony_ci      pos = 4;
2622bf215546Sopenharmony_ci      break;
2623bf215546Sopenharmony_ci   case OP_SUREDP:
2624bf215546Sopenharmony_ci      pos = (su->subOp == NV50_IR_SUBOP_ATOM_CAS) ? 2 : 1;
2625bf215546Sopenharmony_ci      break;
2626bf215546Sopenharmony_ci   default:
2627bf215546Sopenharmony_ci      assert(pos == 0);
2628bf215546Sopenharmony_ci      break;
2629bf215546Sopenharmony_ci   }
2630bf215546Sopenharmony_ci
2631bf215546Sopenharmony_ci   if (dim == 2 && !array) {
2632bf215546Sopenharmony_ci      // This might be a 2d slice of a 3d texture, try to load the z
2633bf215546Sopenharmony_ci      // coordinate in.
2634bf215546Sopenharmony_ci      Value *v;
2635bf215546Sopenharmony_ci      if (!su->tex.bindless)
2636bf215546Sopenharmony_ci         v = loadSuInfo32(ind, slot, NVC0_SU_INFO_UNK1C, su->tex.bindless);
2637bf215546Sopenharmony_ci      else
2638bf215546Sopenharmony_ci         v = bld.mkOp2v(OP_SHR, TYPE_U32, bld.getSSA(), ind, bld.mkImm(11));
2639bf215546Sopenharmony_ci      Value *is_3d = bld.mkOp2v(OP_AND, TYPE_U32, bld.getSSA(), v, bld.mkImm(1));
2640bf215546Sopenharmony_ci      pred2d = bld.mkCmp(OP_SET, CC_EQ, TYPE_U32, bld.getSSA(1, FILE_PREDICATE),
2641bf215546Sopenharmony_ci                         TYPE_U32, bld.mkImm(0), is_3d);
2642bf215546Sopenharmony_ci
2643bf215546Sopenharmony_ci      bld.mkOp2(OP_SHR, TYPE_U32, v, v, bld.loadImm(NULL, 16));
2644bf215546Sopenharmony_ci      su->moveSources(dim, 1);
2645bf215546Sopenharmony_ci      su->setSrc(dim, v);
2646bf215546Sopenharmony_ci      su->tex.target = nv50_ir::TEX_TARGET_3D;
2647bf215546Sopenharmony_ci      pos++;
2648bf215546Sopenharmony_ci   }
2649bf215546Sopenharmony_ci
2650bf215546Sopenharmony_ci   if (su->tex.bindless)
2651bf215546Sopenharmony_ci      handle = bld.mkOp2v(OP_AND, TYPE_U32, bld.getSSA(), ind, bld.mkImm(2047));
2652bf215546Sopenharmony_ci   else
2653bf215546Sopenharmony_ci      handle = loadTexHandle(ind, slot + 32);
2654bf215546Sopenharmony_ci
2655bf215546Sopenharmony_ci   su->setSrc(arg + pos, handle);
2656bf215546Sopenharmony_ci
2657bf215546Sopenharmony_ci   // The address check doesn't make sense here. The format check could make
2658bf215546Sopenharmony_ci   // sense but it's a bit of a pain.
2659bf215546Sopenharmony_ci   if (!su->tex.bindless) {
2660bf215546Sopenharmony_ci      // prevent read fault when the image is not actually bound
2661bf215546Sopenharmony_ci      pred =
2662bf215546Sopenharmony_ci         bld.mkCmp(OP_SET, CC_EQ, TYPE_U32, bld.getSSA(1, FILE_PREDICATE),
2663bf215546Sopenharmony_ci                   TYPE_U32, bld.mkImm(0),
2664bf215546Sopenharmony_ci                   loadSuInfo32(ind, slot, NVC0_SU_INFO_ADDR, su->tex.bindless));
2665bf215546Sopenharmony_ci      if (su->op != OP_SUSTP && su->tex.format) {
2666bf215546Sopenharmony_ci         const TexInstruction::ImgFormatDesc *format = su->tex.format;
2667bf215546Sopenharmony_ci         int blockwidth = format->bits[0] + format->bits[1] +
2668bf215546Sopenharmony_ci            format->bits[2] + format->bits[3];
2669bf215546Sopenharmony_ci
2670bf215546Sopenharmony_ci         assert(format->components != 0);
2671bf215546Sopenharmony_ci         // make sure that the format doesn't mismatch when it's not FMT_NONE
2672bf215546Sopenharmony_ci         bld.mkCmp(OP_SET_OR, CC_NE, TYPE_U32, pred->getDef(0),
2673bf215546Sopenharmony_ci                   TYPE_U32, bld.loadImm(NULL, blockwidth / 8),
2674bf215546Sopenharmony_ci                   loadSuInfo32(ind, slot, NVC0_SU_INFO_BSIZE, su->tex.bindless),
2675bf215546Sopenharmony_ci                   pred->getDef(0));
2676bf215546Sopenharmony_ci      }
2677bf215546Sopenharmony_ci   }
2678bf215546Sopenharmony_ci
2679bf215546Sopenharmony_ci   // Now we have "pred" which (optionally) contains whether to do the surface
2680bf215546Sopenharmony_ci   // op at all, and a "pred2d" which indicates that, in case of doing the
2681bf215546Sopenharmony_ci   // surface op, we have to create a 2d and 3d version, conditioned on pred2d.
2682bf215546Sopenharmony_ci   TexInstruction *su2d = NULL;
2683bf215546Sopenharmony_ci   if (pred2d) {
2684bf215546Sopenharmony_ci      su2d = cloneForward(func, su)->asTex();
2685bf215546Sopenharmony_ci      for (unsigned i = 0; su->defExists(i); ++i)
2686bf215546Sopenharmony_ci         su2d->setDef(i, bld.getSSA());
2687bf215546Sopenharmony_ci      su2d->moveSources(dim + 1, -1);
2688bf215546Sopenharmony_ci      su2d->tex.target = nv50_ir::TEX_TARGET_2D;
2689bf215546Sopenharmony_ci   }
2690bf215546Sopenharmony_ci   if (pred2d && pred) {
2691bf215546Sopenharmony_ci      Instruction *pred3d = bld.mkOp2(OP_AND, TYPE_U8,
2692bf215546Sopenharmony_ci                                      bld.getSSA(1, FILE_PREDICATE),
2693bf215546Sopenharmony_ci                                      pred->getDef(0), pred2d->getDef(0));
2694bf215546Sopenharmony_ci      pred3d->src(0).mod = Modifier(NV50_IR_MOD_NOT);
2695bf215546Sopenharmony_ci      pred3d->src(1).mod = Modifier(NV50_IR_MOD_NOT);
2696bf215546Sopenharmony_ci      su->setPredicate(CC_P, pred3d->getDef(0));
2697bf215546Sopenharmony_ci      pred2d = bld.mkOp2(OP_AND, TYPE_U8, bld.getSSA(1, FILE_PREDICATE),
2698bf215546Sopenharmony_ci                         pred->getDef(0), pred2d->getDef(0));
2699bf215546Sopenharmony_ci      pred2d->src(0).mod = Modifier(NV50_IR_MOD_NOT);
2700bf215546Sopenharmony_ci   } else if (pred) {
2701bf215546Sopenharmony_ci      su->setPredicate(CC_NOT_P, pred->getDef(0));
2702bf215546Sopenharmony_ci   } else if (pred2d) {
2703bf215546Sopenharmony_ci      su->setPredicate(CC_NOT_P, pred2d->getDef(0));
2704bf215546Sopenharmony_ci   }
2705bf215546Sopenharmony_ci   if (su2d) {
2706bf215546Sopenharmony_ci      su2d->setPredicate(CC_P, pred2d->getDef(0));
2707bf215546Sopenharmony_ci      bld.insert(su2d);
2708bf215546Sopenharmony_ci
2709bf215546Sopenharmony_ci      // Create a UNION so that RA assigns the same registers
2710bf215546Sopenharmony_ci      bld.setPosition(su, true);
2711bf215546Sopenharmony_ci      for (unsigned i = 0; su->defExists(i); ++i) {
2712bf215546Sopenharmony_ci         assert(i < 4);
2713bf215546Sopenharmony_ci
2714bf215546Sopenharmony_ci         Value *def = su->getDef(i);
2715bf215546Sopenharmony_ci         Value *newDef = bld.getSSA();
2716bf215546Sopenharmony_ci         ValueDef &def2 = su2d->def(i);
2717bf215546Sopenharmony_ci         Instruction *mov = NULL;
2718bf215546Sopenharmony_ci
2719bf215546Sopenharmony_ci         su->setDef(i, newDef);
2720bf215546Sopenharmony_ci         if (pred) {
2721bf215546Sopenharmony_ci            mov = bld.mkMov(bld.getSSA(), bld.loadImm(NULL, 0));
2722bf215546Sopenharmony_ci            mov->setPredicate(CC_P, pred->getDef(0));
2723bf215546Sopenharmony_ci         }
2724bf215546Sopenharmony_ci
2725bf215546Sopenharmony_ci         Instruction *uni = ret[i] = bld.mkOp2(OP_UNION, TYPE_U32,
2726bf215546Sopenharmony_ci                                      bld.getSSA(),
2727bf215546Sopenharmony_ci                                      newDef, def2.get());
2728bf215546Sopenharmony_ci         if (mov)
2729bf215546Sopenharmony_ci            uni->setSrc(2, mov->getDef(0));
2730bf215546Sopenharmony_ci         bld.mkMov(def, uni->getDef(0));
2731bf215546Sopenharmony_ci      }
2732bf215546Sopenharmony_ci   } else if (pred) {
2733bf215546Sopenharmony_ci      // Create a UNION so that RA assigns the same registers
2734bf215546Sopenharmony_ci      bld.setPosition(su, true);
2735bf215546Sopenharmony_ci      for (unsigned i = 0; su->defExists(i); ++i) {
2736bf215546Sopenharmony_ci         assert(i < 4);
2737bf215546Sopenharmony_ci
2738bf215546Sopenharmony_ci         Value *def = su->getDef(i);
2739bf215546Sopenharmony_ci         Value *newDef = bld.getSSA();
2740bf215546Sopenharmony_ci         su->setDef(i, newDef);
2741bf215546Sopenharmony_ci
2742bf215546Sopenharmony_ci         Instruction *mov = bld.mkMov(bld.getSSA(), bld.loadImm(NULL, 0));
2743bf215546Sopenharmony_ci         mov->setPredicate(CC_P, pred->getDef(0));
2744bf215546Sopenharmony_ci
2745bf215546Sopenharmony_ci         Instruction *uni = ret[i] = bld.mkOp2(OP_UNION, TYPE_U32,
2746bf215546Sopenharmony_ci                                      bld.getSSA(),
2747bf215546Sopenharmony_ci                                      newDef, mov->getDef(0));
2748bf215546Sopenharmony_ci         bld.mkMov(def, uni->getDef(0));
2749bf215546Sopenharmony_ci      }
2750bf215546Sopenharmony_ci   }
2751bf215546Sopenharmony_ci
2752bf215546Sopenharmony_ci   return su2d;
2753bf215546Sopenharmony_ci}
2754bf215546Sopenharmony_ci
2755bf215546Sopenharmony_civoid
2756bf215546Sopenharmony_ciNVC0LoweringPass::handleSurfaceOpGM107(TexInstruction *su)
2757bf215546Sopenharmony_ci{
2758bf215546Sopenharmony_ci   // processSurfaceCoords also takes care of fixing up the outputs and
2759bf215546Sopenharmony_ci   // union'ing them with 0 as necessary. Additionally it may create a second
2760bf215546Sopenharmony_ci   // surface which needs some of the similar fixups.
2761bf215546Sopenharmony_ci
2762bf215546Sopenharmony_ci   Instruction *loaded[4] = {};
2763bf215546Sopenharmony_ci   TexInstruction *su2 = processSurfaceCoordsGM107(su, loaded);
2764bf215546Sopenharmony_ci
2765bf215546Sopenharmony_ci   if (su->op == OP_SULDP) {
2766bf215546Sopenharmony_ci      convertSurfaceFormat(su, loaded);
2767bf215546Sopenharmony_ci   }
2768bf215546Sopenharmony_ci
2769bf215546Sopenharmony_ci   if (su->op == OP_SUREDP) {
2770bf215546Sopenharmony_ci      su->op = OP_SUREDB;
2771bf215546Sopenharmony_ci   }
2772bf215546Sopenharmony_ci
2773bf215546Sopenharmony_ci   // If we fixed up the type of the regular surface load instruction, we also
2774bf215546Sopenharmony_ci   // have to fix up the copy.
2775bf215546Sopenharmony_ci   if (su2) {
2776bf215546Sopenharmony_ci      su2->op = su->op;
2777bf215546Sopenharmony_ci      su2->dType = su->dType;
2778bf215546Sopenharmony_ci      su2->sType = su->sType;
2779bf215546Sopenharmony_ci   }
2780bf215546Sopenharmony_ci}
2781bf215546Sopenharmony_ci
2782bf215546Sopenharmony_cibool
2783bf215546Sopenharmony_ciNVC0LoweringPass::handleWRSV(Instruction *i)
2784bf215546Sopenharmony_ci{
2785bf215546Sopenharmony_ci   Instruction *st;
2786bf215546Sopenharmony_ci   Symbol *sym;
2787bf215546Sopenharmony_ci   uint32_t addr;
2788bf215546Sopenharmony_ci
2789bf215546Sopenharmony_ci   // must replace, $sreg are not writeable
2790bf215546Sopenharmony_ci   addr = targ->getSVAddress(FILE_SHADER_OUTPUT, i->getSrc(0)->asSym());
2791bf215546Sopenharmony_ci   if (addr >= 0x400)
2792bf215546Sopenharmony_ci      return false;
2793bf215546Sopenharmony_ci   sym = bld.mkSymbol(FILE_SHADER_OUTPUT, 0, i->sType, addr);
2794bf215546Sopenharmony_ci
2795bf215546Sopenharmony_ci   st = bld.mkStore(OP_EXPORT, i->dType, sym, i->getIndirect(0, 0),
2796bf215546Sopenharmony_ci                    i->getSrc(1));
2797bf215546Sopenharmony_ci   st->perPatch = i->perPatch;
2798bf215546Sopenharmony_ci
2799bf215546Sopenharmony_ci   bld.getBB()->remove(i);
2800bf215546Sopenharmony_ci   return true;
2801bf215546Sopenharmony_ci}
2802bf215546Sopenharmony_ci
2803bf215546Sopenharmony_civoid
2804bf215546Sopenharmony_ciNVC0LoweringPass::handleLDST(Instruction *i)
2805bf215546Sopenharmony_ci{
2806bf215546Sopenharmony_ci   if (i->src(0).getFile() == FILE_SHADER_INPUT) {
2807bf215546Sopenharmony_ci      if (prog->getType() == Program::TYPE_COMPUTE) {
2808bf215546Sopenharmony_ci         i->getSrc(0)->reg.file = FILE_MEMORY_CONST;
2809bf215546Sopenharmony_ci         i->getSrc(0)->reg.fileIndex = 0;
2810bf215546Sopenharmony_ci      } else
2811bf215546Sopenharmony_ci      if (prog->getType() == Program::TYPE_GEOMETRY &&
2812bf215546Sopenharmony_ci          i->src(0).isIndirect(0)) {
2813bf215546Sopenharmony_ci         // XXX: this assumes vec4 units
2814bf215546Sopenharmony_ci         Value *ptr = bld.mkOp2v(OP_SHL, TYPE_U32, bld.getSSA(),
2815bf215546Sopenharmony_ci                                 i->getIndirect(0, 0), bld.mkImm(4));
2816bf215546Sopenharmony_ci         i->setIndirect(0, 0, ptr);
2817bf215546Sopenharmony_ci         i->op = OP_VFETCH;
2818bf215546Sopenharmony_ci      } else {
2819bf215546Sopenharmony_ci         i->op = OP_VFETCH;
2820bf215546Sopenharmony_ci         assert(prog->getType() != Program::TYPE_FRAGMENT); // INTERP
2821bf215546Sopenharmony_ci      }
2822bf215546Sopenharmony_ci   } else if (i->src(0).getFile() == FILE_MEMORY_CONST) {
2823bf215546Sopenharmony_ci      int8_t fileIndex = i->getSrc(0)->reg.fileIndex - 1;
2824bf215546Sopenharmony_ci      Value *ind = i->getIndirect(0, 1);
2825bf215546Sopenharmony_ci
2826bf215546Sopenharmony_ci      if (targ->getChipset() >= NVISA_GK104_CHIPSET &&
2827bf215546Sopenharmony_ci          prog->getType() == Program::TYPE_COMPUTE &&
2828bf215546Sopenharmony_ci          (fileIndex >= 6 || ind)) {
2829bf215546Sopenharmony_ci         // The launch descriptor only allows to set up 8 CBs, but OpenGL
2830bf215546Sopenharmony_ci         // requires at least 12 UBOs. To bypass this limitation, for constant
2831bf215546Sopenharmony_ci         // buffers 7+, we store the addrs into the driver constbuf and we
2832bf215546Sopenharmony_ci         // directly load from the global memory.
2833bf215546Sopenharmony_ci         if (ind) {
2834bf215546Sopenharmony_ci            // Clamp the UBO index when an indirect access is used to avoid
2835bf215546Sopenharmony_ci            // loading information from the wrong place in the driver cb.
2836bf215546Sopenharmony_ci            // TODO - synchronize the max with the driver.
2837bf215546Sopenharmony_ci            ind = bld.mkOp2v(OP_MIN, TYPE_U32, bld.getSSA(),
2838bf215546Sopenharmony_ci                             bld.mkOp2v(OP_ADD, TYPE_U32, bld.getSSA(),
2839bf215546Sopenharmony_ci                                        ind, bld.loadImm(NULL, fileIndex)),
2840bf215546Sopenharmony_ci                             bld.loadImm(NULL, 13));
2841bf215546Sopenharmony_ci            fileIndex = 0;
2842bf215546Sopenharmony_ci         }
2843bf215546Sopenharmony_ci
2844bf215546Sopenharmony_ci         Value *offset = bld.loadImm(NULL, i->getSrc(0)->reg.data.offset + typeSizeof(i->sType));
2845bf215546Sopenharmony_ci         Value *ptr = loadUboInfo64(ind, fileIndex * 16);
2846bf215546Sopenharmony_ci         Value *length = loadUboLength32(ind, fileIndex * 16);
2847bf215546Sopenharmony_ci         Value *pred = new_LValue(func, FILE_PREDICATE);
2848bf215546Sopenharmony_ci         if (i->src(0).isIndirect(0)) {
2849bf215546Sopenharmony_ci            bld.mkOp2(OP_ADD, TYPE_U64, ptr, ptr, i->getIndirect(0, 0));
2850bf215546Sopenharmony_ci            bld.mkOp2(OP_ADD, TYPE_U32, offset, offset, i->getIndirect(0, 0));
2851bf215546Sopenharmony_ci         }
2852bf215546Sopenharmony_ci         i->getSrc(0)->reg.file = FILE_MEMORY_GLOBAL;
2853bf215546Sopenharmony_ci         i->setIndirect(0, 1, NULL);
2854bf215546Sopenharmony_ci         i->setIndirect(0, 0, ptr);
2855bf215546Sopenharmony_ci         bld.mkCmp(OP_SET, CC_GT, TYPE_U32, pred, TYPE_U32, offset, length);
2856bf215546Sopenharmony_ci         i->setPredicate(CC_NOT_P, pred);
2857bf215546Sopenharmony_ci         Value *zero, *dst = i->getDef(0);
2858bf215546Sopenharmony_ci         i->setDef(0, bld.getSSA());
2859bf215546Sopenharmony_ci
2860bf215546Sopenharmony_ci         bld.setPosition(i, true);
2861bf215546Sopenharmony_ci         bld.mkMov((zero = bld.getSSA()), bld.mkImm(0))
2862bf215546Sopenharmony_ci            ->setPredicate(CC_P, pred);
2863bf215546Sopenharmony_ci         bld.mkOp2(OP_UNION, TYPE_U32, dst, i->getDef(0), zero);
2864bf215546Sopenharmony_ci      } else if (i->src(0).isIndirect(1)) {
2865bf215546Sopenharmony_ci         Value *ptr;
2866bf215546Sopenharmony_ci         if (i->src(0).isIndirect(0))
2867bf215546Sopenharmony_ci            ptr = bld.mkOp3v(OP_INSBF, TYPE_U32, bld.getSSA(),
2868bf215546Sopenharmony_ci                             i->getIndirect(0, 1), bld.mkImm(0x1010),
2869bf215546Sopenharmony_ci                             i->getIndirect(0, 0));
2870bf215546Sopenharmony_ci         else
2871bf215546Sopenharmony_ci            ptr = bld.mkOp2v(OP_SHL, TYPE_U32, bld.getSSA(),
2872bf215546Sopenharmony_ci                             i->getIndirect(0, 1), bld.mkImm(16));
2873bf215546Sopenharmony_ci         i->setIndirect(0, 1, NULL);
2874bf215546Sopenharmony_ci         i->setIndirect(0, 0, ptr);
2875bf215546Sopenharmony_ci         i->subOp = NV50_IR_SUBOP_LDC_IS;
2876bf215546Sopenharmony_ci      }
2877bf215546Sopenharmony_ci   } else if (i->src(0).getFile() == FILE_SHADER_OUTPUT) {
2878bf215546Sopenharmony_ci      assert(prog->getType() == Program::TYPE_TESSELLATION_CONTROL);
2879bf215546Sopenharmony_ci      i->op = OP_VFETCH;
2880bf215546Sopenharmony_ci   } else if (i->src(0).getFile() == FILE_MEMORY_BUFFER) {
2881bf215546Sopenharmony_ci      Value *ind = i->getIndirect(0, 1);
2882bf215546Sopenharmony_ci      Value *ptr = loadBufInfo64(ind, i->getSrc(0)->reg.fileIndex * 16);
2883bf215546Sopenharmony_ci      // XXX come up with a way not to do this for EVERY little access but
2884bf215546Sopenharmony_ci      // rather to batch these up somehow. Unfortunately we've lost the
2885bf215546Sopenharmony_ci      // information about the field width by the time we get here.
2886bf215546Sopenharmony_ci      Value *offset = bld.loadImm(NULL, i->getSrc(0)->reg.data.offset + typeSizeof(i->sType));
2887bf215546Sopenharmony_ci      Value *length = loadBufLength32(ind, i->getSrc(0)->reg.fileIndex * 16);
2888bf215546Sopenharmony_ci      Value *pred = new_LValue(func, FILE_PREDICATE);
2889bf215546Sopenharmony_ci      if (i->src(0).isIndirect(0)) {
2890bf215546Sopenharmony_ci         bld.mkOp2(OP_ADD, TYPE_U64, ptr, ptr, i->getIndirect(0, 0));
2891bf215546Sopenharmony_ci         bld.mkOp2(OP_ADD, TYPE_U32, offset, offset, i->getIndirect(0, 0));
2892bf215546Sopenharmony_ci      }
2893bf215546Sopenharmony_ci      i->setIndirect(0, 1, NULL);
2894bf215546Sopenharmony_ci      i->setIndirect(0, 0, ptr);
2895bf215546Sopenharmony_ci      i->getSrc(0)->reg.file = FILE_MEMORY_GLOBAL;
2896bf215546Sopenharmony_ci      bld.mkCmp(OP_SET, CC_GT, TYPE_U32, pred, TYPE_U32, offset, length);
2897bf215546Sopenharmony_ci      i->setPredicate(CC_NOT_P, pred);
2898bf215546Sopenharmony_ci      if (i->defExists(0)) {
2899bf215546Sopenharmony_ci         Value *zero, *dst = i->getDef(0);
2900bf215546Sopenharmony_ci         uint8_t size = dst->reg.size;
2901bf215546Sopenharmony_ci         i->setDef(0, bld.getSSA(size));
2902bf215546Sopenharmony_ci
2903bf215546Sopenharmony_ci         bld.setPosition(i, true);
2904bf215546Sopenharmony_ci         bld.mkMov((zero = bld.getSSA(size)), bld.mkImm(0), i->dType)
2905bf215546Sopenharmony_ci            ->setPredicate(CC_P, pred);
2906bf215546Sopenharmony_ci         bld.mkOp2(OP_UNION, i->dType, dst, i->getDef(0), zero);
2907bf215546Sopenharmony_ci      }
2908bf215546Sopenharmony_ci   }
2909bf215546Sopenharmony_ci}
2910bf215546Sopenharmony_ci
2911bf215546Sopenharmony_civoid
2912bf215546Sopenharmony_ciNVC0LoweringPass::readTessCoord(LValue *dst, int c)
2913bf215546Sopenharmony_ci{
2914bf215546Sopenharmony_ci   Value *laneid = bld.getSSA();
2915bf215546Sopenharmony_ci   Value *x, *y;
2916bf215546Sopenharmony_ci
2917bf215546Sopenharmony_ci   bld.mkOp1(OP_RDSV, TYPE_U32, laneid, bld.mkSysVal(SV_LANEID, 0));
2918bf215546Sopenharmony_ci
2919bf215546Sopenharmony_ci   if (c == 0) {
2920bf215546Sopenharmony_ci      x = dst;
2921bf215546Sopenharmony_ci      y = NULL;
2922bf215546Sopenharmony_ci   } else
2923bf215546Sopenharmony_ci   if (c == 1) {
2924bf215546Sopenharmony_ci      x = NULL;
2925bf215546Sopenharmony_ci      y = dst;
2926bf215546Sopenharmony_ci   } else {
2927bf215546Sopenharmony_ci      assert(c == 2);
2928bf215546Sopenharmony_ci      if (prog->driver_out->prop.tp.domain != PIPE_PRIM_TRIANGLES) {
2929bf215546Sopenharmony_ci         bld.mkMov(dst, bld.loadImm(NULL, 0));
2930bf215546Sopenharmony_ci         return;
2931bf215546Sopenharmony_ci      }
2932bf215546Sopenharmony_ci      x = bld.getSSA();
2933bf215546Sopenharmony_ci      y = bld.getSSA();
2934bf215546Sopenharmony_ci   }
2935bf215546Sopenharmony_ci   if (x)
2936bf215546Sopenharmony_ci      bld.mkFetch(x, TYPE_F32, FILE_SHADER_OUTPUT, 0x2f0, NULL, laneid);
2937bf215546Sopenharmony_ci   if (y)
2938bf215546Sopenharmony_ci      bld.mkFetch(y, TYPE_F32, FILE_SHADER_OUTPUT, 0x2f4, NULL, laneid);
2939bf215546Sopenharmony_ci
2940bf215546Sopenharmony_ci   if (c == 2) {
2941bf215546Sopenharmony_ci      bld.mkOp2(OP_ADD, TYPE_F32, dst, x, y);
2942bf215546Sopenharmony_ci      bld.mkOp2(OP_SUB, TYPE_F32, dst, bld.loadImm(NULL, 1.0f), dst);
2943bf215546Sopenharmony_ci   }
2944bf215546Sopenharmony_ci}
2945bf215546Sopenharmony_ci
2946bf215546Sopenharmony_cibool
2947bf215546Sopenharmony_ciNVC0LoweringPass::handleRDSV(Instruction *i)
2948bf215546Sopenharmony_ci{
2949bf215546Sopenharmony_ci   Symbol *sym = i->getSrc(0)->asSym();
2950bf215546Sopenharmony_ci   const SVSemantic sv = sym->reg.data.sv.sv;
2951bf215546Sopenharmony_ci   Value *vtx = NULL;
2952bf215546Sopenharmony_ci   Instruction *ld;
2953bf215546Sopenharmony_ci   uint32_t addr = targ->getSVAddress(FILE_SHADER_INPUT, sym);
2954bf215546Sopenharmony_ci
2955bf215546Sopenharmony_ci   if (addr >= 0x400) {
2956bf215546Sopenharmony_ci      // mov $sreg
2957bf215546Sopenharmony_ci      if (sym->reg.data.sv.index == 3) {
2958bf215546Sopenharmony_ci         // TGSI backend may use 4th component of TID,NTID,CTAID,NCTAID
2959bf215546Sopenharmony_ci         i->op = OP_MOV;
2960bf215546Sopenharmony_ci         i->setSrc(0, bld.mkImm((sv == SV_NTID || sv == SV_NCTAID) ? 1 : 0));
2961bf215546Sopenharmony_ci      } else
2962bf215546Sopenharmony_ci      if (sv == SV_TID) {
2963bf215546Sopenharmony_ci         // Help CSE combine TID fetches
2964bf215546Sopenharmony_ci         Value *tid = bld.mkOp1v(OP_RDSV, TYPE_U32, bld.getScratch(),
2965bf215546Sopenharmony_ci                                 bld.mkSysVal(SV_COMBINED_TID, 0));
2966bf215546Sopenharmony_ci         i->op = OP_EXTBF;
2967bf215546Sopenharmony_ci         i->setSrc(0, tid);
2968bf215546Sopenharmony_ci         switch (sym->reg.data.sv.index) {
2969bf215546Sopenharmony_ci         case 0: i->setSrc(1, bld.mkImm(0x1000)); break;
2970bf215546Sopenharmony_ci         case 1: i->setSrc(1, bld.mkImm(0x0a10)); break;
2971bf215546Sopenharmony_ci         case 2: i->setSrc(1, bld.mkImm(0x061a)); break;
2972bf215546Sopenharmony_ci         }
2973bf215546Sopenharmony_ci      }
2974bf215546Sopenharmony_ci      if (sv == SV_VERTEX_COUNT) {
2975bf215546Sopenharmony_ci         bld.setPosition(i, true);
2976bf215546Sopenharmony_ci         bld.mkOp2(OP_EXTBF, TYPE_U32, i->getDef(0), i->getDef(0), bld.mkImm(0x808));
2977bf215546Sopenharmony_ci      }
2978bf215546Sopenharmony_ci      return true;
2979bf215546Sopenharmony_ci   }
2980bf215546Sopenharmony_ci
2981bf215546Sopenharmony_ci   switch (sv) {
2982bf215546Sopenharmony_ci   case SV_POSITION:
2983bf215546Sopenharmony_ci      assert(prog->getType() == Program::TYPE_FRAGMENT);
2984bf215546Sopenharmony_ci      if (i->srcExists(1)) {
2985bf215546Sopenharmony_ci         // Pass offset through to the interpolation logic
2986bf215546Sopenharmony_ci         ld = bld.mkInterp(NV50_IR_INTERP_LINEAR | NV50_IR_INTERP_OFFSET,
2987bf215546Sopenharmony_ci                           i->getDef(0), addr, NULL);
2988bf215546Sopenharmony_ci         ld->setSrc(1, i->getSrc(1));
2989bf215546Sopenharmony_ci      } else {
2990bf215546Sopenharmony_ci         bld.mkInterp(NV50_IR_INTERP_LINEAR, i->getDef(0), addr, NULL);
2991bf215546Sopenharmony_ci      }
2992bf215546Sopenharmony_ci      break;
2993bf215546Sopenharmony_ci   case SV_FACE:
2994bf215546Sopenharmony_ci   {
2995bf215546Sopenharmony_ci      Value *face = i->getDef(0);
2996bf215546Sopenharmony_ci      bld.mkInterp(NV50_IR_INTERP_FLAT, face, addr, NULL);
2997bf215546Sopenharmony_ci      if (i->dType == TYPE_F32) {
2998bf215546Sopenharmony_ci         bld.mkOp2(OP_OR, TYPE_U32, face, face, bld.mkImm(0x00000001));
2999bf215546Sopenharmony_ci         bld.mkOp1(OP_NEG, TYPE_S32, face, face);
3000bf215546Sopenharmony_ci         bld.mkCvt(OP_CVT, TYPE_F32, face, TYPE_S32, face);
3001bf215546Sopenharmony_ci      }
3002bf215546Sopenharmony_ci   }
3003bf215546Sopenharmony_ci      break;
3004bf215546Sopenharmony_ci   case SV_TESS_COORD:
3005bf215546Sopenharmony_ci      assert(prog->getType() == Program::TYPE_TESSELLATION_EVAL);
3006bf215546Sopenharmony_ci      readTessCoord(i->getDef(0)->asLValue(), i->getSrc(0)->reg.data.sv.index);
3007bf215546Sopenharmony_ci      break;
3008bf215546Sopenharmony_ci   case SV_NTID:
3009bf215546Sopenharmony_ci   case SV_NCTAID:
3010bf215546Sopenharmony_ci   case SV_GRIDID:
3011bf215546Sopenharmony_ci      assert(targ->getChipset() >= NVISA_GK104_CHIPSET); // mov $sreg otherwise
3012bf215546Sopenharmony_ci      if (sym->reg.data.sv.index == 3) {
3013bf215546Sopenharmony_ci         i->op = OP_MOV;
3014bf215546Sopenharmony_ci         i->setSrc(0, bld.mkImm(sv == SV_GRIDID ? 0 : 1));
3015bf215546Sopenharmony_ci         return true;
3016bf215546Sopenharmony_ci      }
3017bf215546Sopenharmony_ci      FALLTHROUGH;
3018bf215546Sopenharmony_ci   case SV_WORK_DIM:
3019bf215546Sopenharmony_ci      addr += prog->driver->prop.cp.gridInfoBase;
3020bf215546Sopenharmony_ci      bld.mkLoad(TYPE_U32, i->getDef(0),
3021bf215546Sopenharmony_ci                 bld.mkSymbol(FILE_MEMORY_CONST, prog->driver->io.auxCBSlot,
3022bf215546Sopenharmony_ci                              TYPE_U32, addr), NULL);
3023bf215546Sopenharmony_ci      break;
3024bf215546Sopenharmony_ci   case SV_SAMPLE_INDEX:
3025bf215546Sopenharmony_ci      // TODO: Properly pass source as an address in the PIX address space
3026bf215546Sopenharmony_ci      // (which can be of the form [r0+offset]). But this is currently
3027bf215546Sopenharmony_ci      // unnecessary.
3028bf215546Sopenharmony_ci      ld = bld.mkOp1(OP_PIXLD, TYPE_U32, i->getDef(0), bld.mkImm(0));
3029bf215546Sopenharmony_ci      ld->subOp = NV50_IR_SUBOP_PIXLD_SAMPLEID;
3030bf215546Sopenharmony_ci      break;
3031bf215546Sopenharmony_ci   case SV_SAMPLE_POS: {
3032bf215546Sopenharmony_ci      Value *sampleID = bld.getScratch();
3033bf215546Sopenharmony_ci      ld = bld.mkOp1(OP_PIXLD, TYPE_U32, sampleID, bld.mkImm(0));
3034bf215546Sopenharmony_ci      ld->subOp = NV50_IR_SUBOP_PIXLD_SAMPLEID;
3035bf215546Sopenharmony_ci      Value *offset = calculateSampleOffset(sampleID);
3036bf215546Sopenharmony_ci
3037bf215546Sopenharmony_ci      assert(prog->driver_out->prop.fp.readsSampleLocations);
3038bf215546Sopenharmony_ci
3039bf215546Sopenharmony_ci      if (targ->getChipset() >= NVISA_GM200_CHIPSET) {
3040bf215546Sopenharmony_ci         bld.mkLoad(TYPE_F32,
3041bf215546Sopenharmony_ci                    i->getDef(0),
3042bf215546Sopenharmony_ci                    bld.mkSymbol(
3043bf215546Sopenharmony_ci                          FILE_MEMORY_CONST, prog->driver->io.auxCBSlot,
3044bf215546Sopenharmony_ci                          TYPE_U32, prog->driver->io.sampleInfoBase),
3045bf215546Sopenharmony_ci                    offset);
3046bf215546Sopenharmony_ci         bld.mkOp2(OP_EXTBF, TYPE_U32, i->getDef(0), i->getDef(0),
3047bf215546Sopenharmony_ci                   bld.mkImm(0x040c + sym->reg.data.sv.index * 16));
3048bf215546Sopenharmony_ci         bld.mkCvt(OP_CVT, TYPE_F32, i->getDef(0), TYPE_U32, i->getDef(0));
3049bf215546Sopenharmony_ci         bld.mkOp2(OP_MUL, TYPE_F32, i->getDef(0), i->getDef(0), bld.mkImm(1.0f / 16.0f));
3050bf215546Sopenharmony_ci      } else {
3051bf215546Sopenharmony_ci         bld.mkLoad(TYPE_F32,
3052bf215546Sopenharmony_ci                    i->getDef(0),
3053bf215546Sopenharmony_ci                    bld.mkSymbol(
3054bf215546Sopenharmony_ci                          FILE_MEMORY_CONST, prog->driver->io.auxCBSlot,
3055bf215546Sopenharmony_ci                          TYPE_U32, prog->driver->io.sampleInfoBase +
3056bf215546Sopenharmony_ci                          4 * sym->reg.data.sv.index),
3057bf215546Sopenharmony_ci                    offset);
3058bf215546Sopenharmony_ci      }
3059bf215546Sopenharmony_ci      break;
3060bf215546Sopenharmony_ci   }
3061bf215546Sopenharmony_ci   case SV_SAMPLE_MASK: {
3062bf215546Sopenharmony_ci      ld = bld.mkOp1(OP_PIXLD, TYPE_U32, i->getDef(0), bld.mkImm(0));
3063bf215546Sopenharmony_ci      ld->subOp = NV50_IR_SUBOP_PIXLD_COVMASK;
3064bf215546Sopenharmony_ci      Instruction *sampleid =
3065bf215546Sopenharmony_ci         bld.mkOp1(OP_PIXLD, TYPE_U32, bld.getSSA(), bld.mkImm(0));
3066bf215546Sopenharmony_ci      sampleid->subOp = NV50_IR_SUBOP_PIXLD_SAMPLEID;
3067bf215546Sopenharmony_ci      Value *masked =
3068bf215546Sopenharmony_ci         bld.mkOp2v(OP_AND, TYPE_U32, bld.getSSA(), ld->getDef(0),
3069bf215546Sopenharmony_ci                    bld.mkOp2v(OP_SHL, TYPE_U32, bld.getSSA(),
3070bf215546Sopenharmony_ci                               bld.loadImm(NULL, 1), sampleid->getDef(0)));
3071bf215546Sopenharmony_ci      if (prog->persampleInvocation) {
3072bf215546Sopenharmony_ci         bld.mkMov(i->getDef(0), masked);
3073bf215546Sopenharmony_ci      } else {
3074bf215546Sopenharmony_ci         bld.mkOp3(OP_SELP, TYPE_U32, i->getDef(0), ld->getDef(0), masked,
3075bf215546Sopenharmony_ci                   bld.mkImm(0))
3076bf215546Sopenharmony_ci            ->subOp = 1;
3077bf215546Sopenharmony_ci      }
3078bf215546Sopenharmony_ci      break;
3079bf215546Sopenharmony_ci   }
3080bf215546Sopenharmony_ci   case SV_BASEVERTEX:
3081bf215546Sopenharmony_ci   case SV_BASEINSTANCE:
3082bf215546Sopenharmony_ci   case SV_DRAWID:
3083bf215546Sopenharmony_ci      ld = bld.mkLoad(TYPE_U32, i->getDef(0),
3084bf215546Sopenharmony_ci                      bld.mkSymbol(FILE_MEMORY_CONST,
3085bf215546Sopenharmony_ci                                   prog->driver->io.auxCBSlot,
3086bf215546Sopenharmony_ci                                   TYPE_U32,
3087bf215546Sopenharmony_ci                                   prog->driver->io.drawInfoBase +
3088bf215546Sopenharmony_ci                                   4 * (sv - SV_BASEVERTEX)),
3089bf215546Sopenharmony_ci                      NULL);
3090bf215546Sopenharmony_ci      break;
3091bf215546Sopenharmony_ci   default:
3092bf215546Sopenharmony_ci      if (prog->getType() == Program::TYPE_TESSELLATION_EVAL && !i->perPatch)
3093bf215546Sopenharmony_ci         vtx = bld.mkOp1v(OP_PFETCH, TYPE_U32, bld.getSSA(), bld.mkImm(0));
3094bf215546Sopenharmony_ci      if (prog->getType() == Program::TYPE_FRAGMENT) {
3095bf215546Sopenharmony_ci         bld.mkInterp(NV50_IR_INTERP_FLAT, i->getDef(0), addr, NULL);
3096bf215546Sopenharmony_ci      } else {
3097bf215546Sopenharmony_ci         ld = bld.mkFetch(i->getDef(0), i->dType,
3098bf215546Sopenharmony_ci                          FILE_SHADER_INPUT, addr, i->getIndirect(0, 0), vtx);
3099bf215546Sopenharmony_ci         ld->perPatch = i->perPatch;
3100bf215546Sopenharmony_ci      }
3101bf215546Sopenharmony_ci      break;
3102bf215546Sopenharmony_ci   }
3103bf215546Sopenharmony_ci   bld.getBB()->remove(i);
3104bf215546Sopenharmony_ci   return true;
3105bf215546Sopenharmony_ci}
3106bf215546Sopenharmony_ci
3107bf215546Sopenharmony_cibool
3108bf215546Sopenharmony_ciNVC0LoweringPass::handleDIV(Instruction *i)
3109bf215546Sopenharmony_ci{
3110bf215546Sopenharmony_ci   if (!isFloatType(i->dType))
3111bf215546Sopenharmony_ci      return true;
3112bf215546Sopenharmony_ci   bld.setPosition(i, false);
3113bf215546Sopenharmony_ci   Instruction *rcp = bld.mkOp1(OP_RCP, i->dType, bld.getSSA(typeSizeof(i->dType)), i->getSrc(1));
3114bf215546Sopenharmony_ci   i->op = OP_MUL;
3115bf215546Sopenharmony_ci   i->setSrc(1, rcp->getDef(0));
3116bf215546Sopenharmony_ci   return true;
3117bf215546Sopenharmony_ci}
3118bf215546Sopenharmony_ci
3119bf215546Sopenharmony_cibool
3120bf215546Sopenharmony_ciNVC0LoweringPass::handleMOD(Instruction *i)
3121bf215546Sopenharmony_ci{
3122bf215546Sopenharmony_ci   if (!isFloatType(i->dType))
3123bf215546Sopenharmony_ci      return true;
3124bf215546Sopenharmony_ci   LValue *value = bld.getScratch(typeSizeof(i->dType));
3125bf215546Sopenharmony_ci   bld.mkOp1(OP_RCP, i->dType, value, i->getSrc(1));
3126bf215546Sopenharmony_ci   bld.mkOp2(OP_MUL, i->dType, value, i->getSrc(0), value);
3127bf215546Sopenharmony_ci   bld.mkOp1(OP_TRUNC, i->dType, value, value);
3128bf215546Sopenharmony_ci   bld.mkOp2(OP_MUL, i->dType, value, i->getSrc(1), value);
3129bf215546Sopenharmony_ci   i->op = OP_SUB;
3130bf215546Sopenharmony_ci   i->setSrc(1, value);
3131bf215546Sopenharmony_ci   return true;
3132bf215546Sopenharmony_ci}
3133bf215546Sopenharmony_ci
3134bf215546Sopenharmony_cibool
3135bf215546Sopenharmony_ciNVC0LoweringPass::handleSQRT(Instruction *i)
3136bf215546Sopenharmony_ci{
3137bf215546Sopenharmony_ci   if (targ->isOpSupported(OP_SQRT, i->dType))
3138bf215546Sopenharmony_ci      return true;
3139bf215546Sopenharmony_ci
3140bf215546Sopenharmony_ci   if (i->dType == TYPE_F64) {
3141bf215546Sopenharmony_ci      Value *pred = bld.getSSA(1, FILE_PREDICATE);
3142bf215546Sopenharmony_ci      Value *zero = bld.loadImm(NULL, 0.0);
3143bf215546Sopenharmony_ci      Value *dst = bld.getSSA(8);
3144bf215546Sopenharmony_ci      bld.mkOp1(OP_RSQ, i->dType, dst, i->getSrc(0));
3145bf215546Sopenharmony_ci      bld.mkCmp(OP_SET, CC_LE, i->dType, pred, i->dType, i->getSrc(0), zero);
3146bf215546Sopenharmony_ci      bld.mkOp3(OP_SELP, TYPE_U64, dst, zero, dst, pred);
3147bf215546Sopenharmony_ci      i->op = OP_MUL;
3148bf215546Sopenharmony_ci      i->setSrc(1, dst);
3149bf215546Sopenharmony_ci      // TODO: Handle this properly with a library function
3150bf215546Sopenharmony_ci   } else {
3151bf215546Sopenharmony_ci      bld.setPosition(i, true);
3152bf215546Sopenharmony_ci      i->op = OP_RSQ;
3153bf215546Sopenharmony_ci      bld.mkOp1(OP_RCP, i->dType, i->getDef(0), i->getDef(0));
3154bf215546Sopenharmony_ci   }
3155bf215546Sopenharmony_ci
3156bf215546Sopenharmony_ci   return true;
3157bf215546Sopenharmony_ci}
3158bf215546Sopenharmony_ci
3159bf215546Sopenharmony_cibool
3160bf215546Sopenharmony_ciNVC0LoweringPass::handlePOW(Instruction *i)
3161bf215546Sopenharmony_ci{
3162bf215546Sopenharmony_ci   LValue *val = bld.getScratch();
3163bf215546Sopenharmony_ci
3164bf215546Sopenharmony_ci   bld.mkOp1(OP_LG2, TYPE_F32, val, i->getSrc(0));
3165bf215546Sopenharmony_ci   bld.mkOp2(OP_MUL, TYPE_F32, val, i->getSrc(1), val)->dnz = 1;
3166bf215546Sopenharmony_ci   bld.mkOp1(OP_PREEX2, TYPE_F32, val, val);
3167bf215546Sopenharmony_ci
3168bf215546Sopenharmony_ci   i->op = OP_EX2;
3169bf215546Sopenharmony_ci   i->setSrc(0, val);
3170bf215546Sopenharmony_ci   i->setSrc(1, NULL);
3171bf215546Sopenharmony_ci
3172bf215546Sopenharmony_ci   return true;
3173bf215546Sopenharmony_ci}
3174bf215546Sopenharmony_ci
3175bf215546Sopenharmony_cibool
3176bf215546Sopenharmony_ciNVC0LoweringPass::handleEXPORT(Instruction *i)
3177bf215546Sopenharmony_ci{
3178bf215546Sopenharmony_ci   if (prog->getType() == Program::TYPE_FRAGMENT) {
3179bf215546Sopenharmony_ci      int id = i->getSrc(0)->reg.data.offset / 4;
3180bf215546Sopenharmony_ci
3181bf215546Sopenharmony_ci      if (i->src(0).isIndirect(0)) // TODO, ugly
3182bf215546Sopenharmony_ci         return false;
3183bf215546Sopenharmony_ci      i->op = OP_MOV;
3184bf215546Sopenharmony_ci      i->subOp = NV50_IR_SUBOP_MOV_FINAL;
3185bf215546Sopenharmony_ci      i->src(0).set(i->src(1));
3186bf215546Sopenharmony_ci      i->setSrc(1, NULL);
3187bf215546Sopenharmony_ci      i->setDef(0, new_LValue(func, FILE_GPR));
3188bf215546Sopenharmony_ci      i->getDef(0)->reg.data.id = id;
3189bf215546Sopenharmony_ci
3190bf215546Sopenharmony_ci      prog->maxGPR = MAX2(prog->maxGPR, id);
3191bf215546Sopenharmony_ci   } else
3192bf215546Sopenharmony_ci   if (prog->getType() == Program::TYPE_GEOMETRY) {
3193bf215546Sopenharmony_ci      i->setIndirect(0, 1, gpEmitAddress);
3194bf215546Sopenharmony_ci   }
3195bf215546Sopenharmony_ci   return true;
3196bf215546Sopenharmony_ci}
3197bf215546Sopenharmony_ci
3198bf215546Sopenharmony_cibool
3199bf215546Sopenharmony_ciNVC0LoweringPass::handleOUT(Instruction *i)
3200bf215546Sopenharmony_ci{
3201bf215546Sopenharmony_ci   Instruction *prev = i->prev;
3202bf215546Sopenharmony_ci   ImmediateValue stream, prevStream;
3203bf215546Sopenharmony_ci
3204bf215546Sopenharmony_ci   // Only merge if the stream ids match. Also, note that the previous
3205bf215546Sopenharmony_ci   // instruction would have already been lowered, so we take arg1 from it.
3206bf215546Sopenharmony_ci   if (i->op == OP_RESTART && prev && prev->op == OP_EMIT &&
3207bf215546Sopenharmony_ci       i->src(0).getImmediate(stream) &&
3208bf215546Sopenharmony_ci       prev->src(1).getImmediate(prevStream) &&
3209bf215546Sopenharmony_ci       stream.reg.data.u32 == prevStream.reg.data.u32) {
3210bf215546Sopenharmony_ci      i->prev->subOp = NV50_IR_SUBOP_EMIT_RESTART;
3211bf215546Sopenharmony_ci      delete_Instruction(prog, i);
3212bf215546Sopenharmony_ci   } else {
3213bf215546Sopenharmony_ci      assert(gpEmitAddress);
3214bf215546Sopenharmony_ci      i->setDef(0, gpEmitAddress);
3215bf215546Sopenharmony_ci      i->setSrc(1, i->getSrc(0));
3216bf215546Sopenharmony_ci      i->setSrc(0, gpEmitAddress);
3217bf215546Sopenharmony_ci   }
3218bf215546Sopenharmony_ci   return true;
3219bf215546Sopenharmony_ci}
3220bf215546Sopenharmony_ci
3221bf215546Sopenharmony_ciValue *
3222bf215546Sopenharmony_ciNVC0LoweringPass::calculateSampleOffset(Value *sampleID)
3223bf215546Sopenharmony_ci{
3224bf215546Sopenharmony_ci   Value *offset = bld.getScratch();
3225bf215546Sopenharmony_ci   if (targ->getChipset() >= NVISA_GM200_CHIPSET) {
3226bf215546Sopenharmony_ci      // Sample location offsets (in bytes) are calculated like so:
3227bf215546Sopenharmony_ci      // offset = (SV_POSITION.y % 4 * 2) + (SV_POSITION.x % 2)
3228bf215546Sopenharmony_ci      // offset = offset * 32 + sampleID % 8 * 4;
3229bf215546Sopenharmony_ci      // which is equivalent to:
3230bf215546Sopenharmony_ci      // offset = (SV_POSITION.y & 0x3) << 6 + (SV_POSITION.x & 0x1) << 5;
3231bf215546Sopenharmony_ci      // offset += sampleID << 2
3232bf215546Sopenharmony_ci
3233bf215546Sopenharmony_ci      // The second operand (src1) of the INSBF instructions are like so:
3234bf215546Sopenharmony_ci      // 0xssll where ss is the size and ll is the offset.
3235bf215546Sopenharmony_ci      // so: dest = src2 | (src0 & (1 << ss - 1)) << ll
3236bf215546Sopenharmony_ci
3237bf215546Sopenharmony_ci      // Add sample ID (offset = (sampleID & 0x7) << 2)
3238bf215546Sopenharmony_ci      bld.mkOp3(OP_INSBF, TYPE_U32, offset, sampleID, bld.mkImm(0x0302), bld.mkImm(0x0));
3239bf215546Sopenharmony_ci
3240bf215546Sopenharmony_ci      Symbol *xSym = bld.mkSysVal(SV_POSITION, 0);
3241bf215546Sopenharmony_ci      Symbol *ySym = bld.mkSysVal(SV_POSITION, 1);
3242bf215546Sopenharmony_ci      Value *coord = bld.getScratch();
3243bf215546Sopenharmony_ci
3244bf215546Sopenharmony_ci      // Add X coordinate (offset |= (SV_POSITION.x & 0x1) << 5)
3245bf215546Sopenharmony_ci      bld.mkInterp(NV50_IR_INTERP_LINEAR, coord,
3246bf215546Sopenharmony_ci                   targ->getSVAddress(FILE_SHADER_INPUT, xSym), NULL);
3247bf215546Sopenharmony_ci      bld.mkCvt(OP_CVT, TYPE_U32, coord, TYPE_F32, coord)
3248bf215546Sopenharmony_ci         ->rnd = ROUND_ZI;
3249bf215546Sopenharmony_ci      bld.mkOp3(OP_INSBF, TYPE_U32, offset, coord, bld.mkImm(0x0105), offset);
3250bf215546Sopenharmony_ci
3251bf215546Sopenharmony_ci      // Add Y coordinate (offset |= (SV_POSITION.y & 0x3) << 6)
3252bf215546Sopenharmony_ci      bld.mkInterp(NV50_IR_INTERP_LINEAR, coord,
3253bf215546Sopenharmony_ci                   targ->getSVAddress(FILE_SHADER_INPUT, ySym), NULL);
3254bf215546Sopenharmony_ci      bld.mkCvt(OP_CVT, TYPE_U32, coord, TYPE_F32, coord)
3255bf215546Sopenharmony_ci         ->rnd = ROUND_ZI;
3256bf215546Sopenharmony_ci      bld.mkOp3(OP_INSBF, TYPE_U32, offset, coord, bld.mkImm(0x0206), offset);
3257bf215546Sopenharmony_ci   } else {
3258bf215546Sopenharmony_ci      bld.mkOp2(OP_SHL, TYPE_U32, offset, sampleID, bld.mkImm(3));
3259bf215546Sopenharmony_ci   }
3260bf215546Sopenharmony_ci   return offset;
3261bf215546Sopenharmony_ci}
3262bf215546Sopenharmony_ci
3263bf215546Sopenharmony_ci// Handle programmable sample locations for GM20x+
3264bf215546Sopenharmony_civoid
3265bf215546Sopenharmony_ciNVC0LoweringPass::handlePIXLD(Instruction *i)
3266bf215546Sopenharmony_ci{
3267bf215546Sopenharmony_ci   if (i->subOp != NV50_IR_SUBOP_PIXLD_OFFSET)
3268bf215546Sopenharmony_ci      return;
3269bf215546Sopenharmony_ci   if (targ->getChipset() < NVISA_GM200_CHIPSET)
3270bf215546Sopenharmony_ci      return;
3271bf215546Sopenharmony_ci
3272bf215546Sopenharmony_ci   assert(prog->driver_out->prop.fp.readsSampleLocations);
3273bf215546Sopenharmony_ci
3274bf215546Sopenharmony_ci   bld.mkLoad(TYPE_F32,
3275bf215546Sopenharmony_ci              i->getDef(0),
3276bf215546Sopenharmony_ci              bld.mkSymbol(
3277bf215546Sopenharmony_ci                    FILE_MEMORY_CONST, prog->driver->io.auxCBSlot,
3278bf215546Sopenharmony_ci                    TYPE_U32, prog->driver->io.sampleInfoBase),
3279bf215546Sopenharmony_ci              calculateSampleOffset(i->getSrc(0)));
3280bf215546Sopenharmony_ci
3281bf215546Sopenharmony_ci   bld.getBB()->remove(i);
3282bf215546Sopenharmony_ci}
3283bf215546Sopenharmony_ci
3284bf215546Sopenharmony_ci// Generate a binary predicate if an instruction is predicated by
3285bf215546Sopenharmony_ci// e.g. an f32 value.
3286bf215546Sopenharmony_civoid
3287bf215546Sopenharmony_ciNVC0LoweringPass::checkPredicate(Instruction *insn)
3288bf215546Sopenharmony_ci{
3289bf215546Sopenharmony_ci   Value *pred = insn->getPredicate();
3290bf215546Sopenharmony_ci   Value *pdst;
3291bf215546Sopenharmony_ci
3292bf215546Sopenharmony_ci   if (!pred || pred->reg.file == FILE_PREDICATE)
3293bf215546Sopenharmony_ci      return;
3294bf215546Sopenharmony_ci   pdst = new_LValue(func, FILE_PREDICATE);
3295bf215546Sopenharmony_ci
3296bf215546Sopenharmony_ci   // CAUTION: don't use pdst->getInsn, the definition might not be unique,
3297bf215546Sopenharmony_ci   //  delay turning PSET(FSET(x,y),0) into PSET(x,y) to a later pass
3298bf215546Sopenharmony_ci
3299bf215546Sopenharmony_ci   bld.mkCmp(OP_SET, CC_NEU, insn->dType, pdst, insn->dType, bld.mkImm(0), pred);
3300bf215546Sopenharmony_ci
3301bf215546Sopenharmony_ci   insn->setPredicate(insn->cc, pdst);
3302bf215546Sopenharmony_ci}
3303bf215546Sopenharmony_ci
3304bf215546Sopenharmony_ci//
3305bf215546Sopenharmony_ci// - add quadop dance for texturing
3306bf215546Sopenharmony_ci// - put FP outputs in GPRs
3307bf215546Sopenharmony_ci// - convert instruction sequences
3308bf215546Sopenharmony_ci//
3309bf215546Sopenharmony_cibool
3310bf215546Sopenharmony_ciNVC0LoweringPass::visit(Instruction *i)
3311bf215546Sopenharmony_ci{
3312bf215546Sopenharmony_ci   bool ret = true;
3313bf215546Sopenharmony_ci   bld.setPosition(i, false);
3314bf215546Sopenharmony_ci
3315bf215546Sopenharmony_ci   if (i->cc != CC_ALWAYS)
3316bf215546Sopenharmony_ci      checkPredicate(i);
3317bf215546Sopenharmony_ci
3318bf215546Sopenharmony_ci   switch (i->op) {
3319bf215546Sopenharmony_ci   case OP_TEX:
3320bf215546Sopenharmony_ci   case OP_TXB:
3321bf215546Sopenharmony_ci   case OP_TXL:
3322bf215546Sopenharmony_ci   case OP_TXF:
3323bf215546Sopenharmony_ci   case OP_TXG:
3324bf215546Sopenharmony_ci      return handleTEX(i->asTex());
3325bf215546Sopenharmony_ci   case OP_TXD:
3326bf215546Sopenharmony_ci      return handleTXD(i->asTex());
3327bf215546Sopenharmony_ci   case OP_TXLQ:
3328bf215546Sopenharmony_ci      return handleTXLQ(i->asTex());
3329bf215546Sopenharmony_ci   case OP_TXQ:
3330bf215546Sopenharmony_ci     return handleTXQ(i->asTex());
3331bf215546Sopenharmony_ci   case OP_EX2:
3332bf215546Sopenharmony_ci      bld.mkOp1(OP_PREEX2, TYPE_F32, i->getDef(0), i->getSrc(0));
3333bf215546Sopenharmony_ci      i->setSrc(0, i->getDef(0));
3334bf215546Sopenharmony_ci      break;
3335bf215546Sopenharmony_ci   case OP_POW:
3336bf215546Sopenharmony_ci      return handlePOW(i);
3337bf215546Sopenharmony_ci   case OP_DIV:
3338bf215546Sopenharmony_ci      return handleDIV(i);
3339bf215546Sopenharmony_ci   case OP_MOD:
3340bf215546Sopenharmony_ci      return handleMOD(i);
3341bf215546Sopenharmony_ci   case OP_SQRT:
3342bf215546Sopenharmony_ci      return handleSQRT(i);
3343bf215546Sopenharmony_ci   case OP_EXPORT:
3344bf215546Sopenharmony_ci      ret = handleEXPORT(i);
3345bf215546Sopenharmony_ci      break;
3346bf215546Sopenharmony_ci   case OP_EMIT:
3347bf215546Sopenharmony_ci   case OP_RESTART:
3348bf215546Sopenharmony_ci      return handleOUT(i);
3349bf215546Sopenharmony_ci   case OP_RDSV:
3350bf215546Sopenharmony_ci      return handleRDSV(i);
3351bf215546Sopenharmony_ci   case OP_WRSV:
3352bf215546Sopenharmony_ci      return handleWRSV(i);
3353bf215546Sopenharmony_ci   case OP_STORE:
3354bf215546Sopenharmony_ci   case OP_LOAD:
3355bf215546Sopenharmony_ci      handleLDST(i);
3356bf215546Sopenharmony_ci      break;
3357bf215546Sopenharmony_ci   case OP_ATOM:
3358bf215546Sopenharmony_ci   {
3359bf215546Sopenharmony_ci      const bool cctl = i->src(0).getFile() == FILE_MEMORY_BUFFER;
3360bf215546Sopenharmony_ci      handleATOM(i);
3361bf215546Sopenharmony_ci      if (cctl)
3362bf215546Sopenharmony_ci         handleATOMCctl(i);
3363bf215546Sopenharmony_ci      handleCasExch(i);
3364bf215546Sopenharmony_ci   }
3365bf215546Sopenharmony_ci      break;
3366bf215546Sopenharmony_ci   case OP_SULDB:
3367bf215546Sopenharmony_ci   case OP_SULDP:
3368bf215546Sopenharmony_ci   case OP_SUSTB:
3369bf215546Sopenharmony_ci   case OP_SUSTP:
3370bf215546Sopenharmony_ci   case OP_SUREDB:
3371bf215546Sopenharmony_ci   case OP_SUREDP:
3372bf215546Sopenharmony_ci      if (targ->getChipset() >= NVISA_GM107_CHIPSET)
3373bf215546Sopenharmony_ci         handleSurfaceOpGM107(i->asTex());
3374bf215546Sopenharmony_ci      else if (targ->getChipset() >= NVISA_GK104_CHIPSET)
3375bf215546Sopenharmony_ci         handleSurfaceOpNVE4(i->asTex());
3376bf215546Sopenharmony_ci      else
3377bf215546Sopenharmony_ci         handleSurfaceOpNVC0(i->asTex());
3378bf215546Sopenharmony_ci      break;
3379bf215546Sopenharmony_ci   case OP_SUQ:
3380bf215546Sopenharmony_ci      handleSUQ(i->asTex());
3381bf215546Sopenharmony_ci      break;
3382bf215546Sopenharmony_ci   case OP_BUFQ:
3383bf215546Sopenharmony_ci      handleBUFQ(i);
3384bf215546Sopenharmony_ci      break;
3385bf215546Sopenharmony_ci   case OP_PIXLD:
3386bf215546Sopenharmony_ci      handlePIXLD(i);
3387bf215546Sopenharmony_ci      break;
3388bf215546Sopenharmony_ci   default:
3389bf215546Sopenharmony_ci      break;
3390bf215546Sopenharmony_ci   }
3391bf215546Sopenharmony_ci
3392bf215546Sopenharmony_ci   /* Kepler+ has a special opcode to compute a new base address to be used
3393bf215546Sopenharmony_ci    * for indirect loads.
3394bf215546Sopenharmony_ci    *
3395bf215546Sopenharmony_ci    * Maxwell+ has an additional similar requirement for indirect
3396bf215546Sopenharmony_ci    * interpolation ops in frag shaders.
3397bf215546Sopenharmony_ci    */
3398bf215546Sopenharmony_ci   bool doAfetch = false;
3399bf215546Sopenharmony_ci   if (targ->getChipset() >= NVISA_GK104_CHIPSET &&
3400bf215546Sopenharmony_ci       !i->perPatch &&
3401bf215546Sopenharmony_ci       (i->op == OP_VFETCH || i->op == OP_EXPORT) &&
3402bf215546Sopenharmony_ci       i->src(0).isIndirect(0)) {
3403bf215546Sopenharmony_ci      doAfetch = true;
3404bf215546Sopenharmony_ci   }
3405bf215546Sopenharmony_ci   if (targ->getChipset() >= NVISA_GM107_CHIPSET &&
3406bf215546Sopenharmony_ci       (i->op == OP_LINTERP || i->op == OP_PINTERP) &&
3407bf215546Sopenharmony_ci       i->src(0).isIndirect(0)) {
3408bf215546Sopenharmony_ci      doAfetch = true;
3409bf215546Sopenharmony_ci   }
3410bf215546Sopenharmony_ci
3411bf215546Sopenharmony_ci   if (doAfetch) {
3412bf215546Sopenharmony_ci      Value *addr = cloneShallow(func, i->getSrc(0));
3413bf215546Sopenharmony_ci      Instruction *afetch = bld.mkOp1(OP_AFETCH, TYPE_U32, bld.getSSA(),
3414bf215546Sopenharmony_ci                                      i->getSrc(0));
3415bf215546Sopenharmony_ci      afetch->setIndirect(0, 0, i->getIndirect(0, 0));
3416bf215546Sopenharmony_ci      addr->reg.data.offset = 0;
3417bf215546Sopenharmony_ci      i->setSrc(0, addr);
3418bf215546Sopenharmony_ci      i->setIndirect(0, 0, afetch->getDef(0));
3419bf215546Sopenharmony_ci      i->subOp = NV50_IR_SUBOP_VFETCH_PHYS;
3420bf215546Sopenharmony_ci   }
3421bf215546Sopenharmony_ci
3422bf215546Sopenharmony_ci   return ret;
3423bf215546Sopenharmony_ci}
3424bf215546Sopenharmony_ci
3425bf215546Sopenharmony_cibool
3426bf215546Sopenharmony_ciTargetNVC0::runLegalizePass(Program *prog, CGStage stage) const
3427bf215546Sopenharmony_ci{
3428bf215546Sopenharmony_ci   if (stage == CG_STAGE_PRE_SSA) {
3429bf215546Sopenharmony_ci      NVC0LoweringPass pass(prog);
3430bf215546Sopenharmony_ci      return pass.run(prog, false, true);
3431bf215546Sopenharmony_ci   } else
3432bf215546Sopenharmony_ci   if (stage == CG_STAGE_POST_RA) {
3433bf215546Sopenharmony_ci      NVC0LegalizePostRA pass(prog);
3434bf215546Sopenharmony_ci      return pass.run(prog, false, true);
3435bf215546Sopenharmony_ci   } else
3436bf215546Sopenharmony_ci   if (stage == CG_STAGE_SSA) {
3437bf215546Sopenharmony_ci      NVC0LegalizeSSA pass;
3438bf215546Sopenharmony_ci      return pass.run(prog, false, true);
3439bf215546Sopenharmony_ci   }
3440bf215546Sopenharmony_ci   return false;
3441bf215546Sopenharmony_ci}
3442bf215546Sopenharmony_ci
3443bf215546Sopenharmony_ci} // namespace nv50_ir
3444