1/*
2 * Copyright 2011 Christoph Bumiller
3 *           2014 Red Hat Inc.
4 *
5 * Permission is hereby granted, free of charge, to any person obtaining a
6 * copy of this software and associated documentation files (the "Software"),
7 * to deal in the Software without restriction, including without limitation
8 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
9 * and/or sell copies of the Software, and to permit persons to whom the
10 * Software is furnished to do so, subject to the following conditions:
11 *
12 * The above copyright notice and this permission notice shall be included in
13 * all copies or substantial portions of the Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
19 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
20 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
21 * OTHER DEALINGS IN THE SOFTWARE.
22 */
23
24#include "nv50_ir.h"
25#include "nv50_ir_build_util.h"
26
27#include "nv50_ir_target_nvc0.h"
28#include "nv50_ir_lowering_gm107.h"
29
30#include <limits>
31
32namespace nv50_ir {
33
// Per-lane operation codes of a quad op. Each code is 2 bits wide and is
// combined with three others by QUADOP() below.
#define QOP_ADD  0
#define QOP_SUBR 1
#define QOP_SUB  2
#define QOP_MOV2 3

// Packs four QOP_* codes (given without the QOP_ prefix) into one byte that
// is stored in an instruction's subOp. The first argument lands in bits 7:6,
// the last one in bits 1:0. Lane order of the arguments:
//             UL UR LL LR
#define QUADOP(q, r, s, t)                      \
   ((QOP_##q << 6) | (QOP_##r << 4) |           \
    (QOP_##s << 2) | (QOP_##t << 0))

// Immediate used as the last source of every OP_SHFL built in this file.
// NOTE(review): presumably the clamp/segment-mask operand that confines the
// shuffle to a quad of 4 lanes -- confirm against the SHFL encoding.
#define SHFL_BOUND_QUAD 0x1c03
45
46void
47GM107LegalizeSSA::handlePFETCH(Instruction *i)
48{
49   Value *src0;
50
51   if (i->src(0).getFile() == FILE_GPR && !i->srcExists(1))
52      return;
53
54   bld.setPosition(i, false);
55   src0 = bld.getSSA();
56
57   if (i->srcExists(1))
58      bld.mkOp2(OP_ADD , TYPE_U32, src0, i->getSrc(0), i->getSrc(1));
59   else
60      bld.mkOp1(OP_MOV , TYPE_U32, src0, i->getSrc(0));
61
62   i->setSrc(0, src0);
63   i->setSrc(1, NULL);
64}
65
66void
67GM107LegalizeSSA::handleLOAD(Instruction *i)
68{
69   if (i->src(0).getFile() != FILE_MEMORY_CONST)
70      return;
71   if (i->src(0).isIndirect(0))
72      return;
73   if (typeSizeof(i->dType) != 4)
74      return;
75
76   i->op = OP_MOV;
77}
78
79void
80GM107LegalizeSSA::handleQUADON(Instruction *i)
81{
82   i->setDef(0, NULL);
83}
84
85void
86GM107LegalizeSSA::handleQUADPOP(Instruction *i)
87{
88   i->setSrc(0, NULL);
89}
90
91bool
92GM107LegalizeSSA::visit(Instruction *i)
93{
94   switch (i->op) {
95   case OP_QUADON:
96      handleQUADON(i);
97      break;
98   case OP_QUADPOP:
99      handleQUADPOP(i);
100      break;
101   case OP_PFETCH:
102      handlePFETCH(i);
103      break;
104   case OP_LOAD:
105      handleLOAD(i);
106      break;
107   default:
108      break;
109   }
110   return true;
111}
112
// Replaces a texture fetch with explicit derivatives by four plain TEX
// fetches, one per lane of the quad, and merges their results.
//
// For each lane l (0..3) the code emits, bracketed by QUADON/QUADPOP:
//   1. shuffles that copy lane l's coordinates to all lanes,
//   2. quad ops that add lane l's dPdx / dPdy to the coordinates on the
//      lanes selected by qOps[0] / qOps[1],
//   3. an optional cube-coordinate normalization,
//   4. a clone of the texture instruction using those coordinates,
//   5. a MOV of each result component, restricted to lane l (lanes = 1 << l).
// Finally one OP_UNION per result component joins the four per-lane values
// into the original destination, and the original instruction is removed.
//
// Always returns true.
bool
GM107LoweringPass::handleManualTXD(TexInstruction *i)
{
   // See NVC0LoweringPass::handleManualTXD for rationale. This function
   // implements the same logic, but using SM50-friendly primitives.
   // qOps[0] adds on the UR and LR lanes, qOps[1] on the LL and LR lanes;
   // the remaining lanes use MOV2 (lane order per the QUADOP macro).
   static const uint8_t qOps[2] =
      { QUADOP(MOV2, ADD,  MOV2, ADD),  QUADOP(MOV2, MOV2, ADD,  ADD) };
   Value *def[4][4]; // per-lane results, indexed [component][lane]
   Value *crd[3], *arr, *shadow;
   Value *tmp;
   Instruction *tex, *add;
   Value *quad = bld.mkImm(SHFL_BOUND_QUAD);
   int l, c;
   // Number of coordinate sources; a cube target counts one extra.
   const int dim = i->tex.target.getDim() + i->tex.target.isCube();
   // 1 if the target is an array; the coordinates then start at source 1
   // and source 0 is the value shuffled into `arr` below.
   const int array = i->tex.target.isArray();
   // 1 if an indirect resource source is present; shifts the index of the
   // depth-compare source read below.
   const int indirect = i->tex.rIndirectSrc >= 0;

   i->op = OP_TEX; // no need to clone dPdx/dPdy later

   // Scratch values are reused across all four lane iterations.
   for (c = 0; c < dim; ++c)
      crd[c] = bld.getScratch();
   arr = bld.getScratch();
   shadow = bld.getScratch();
   tmp = bld.getScratch();

   for (l = 0; l < 4; ++l) {
      // Barrier value tying this iteration's QUADON to its QUADPOP.
      Value *bar = bld.getSSA(4, FILE_BARRIER);
      Value *src[3], *val;
      Value *lane = bld.mkImm(l);
      bld.mkOp(OP_QUADON, TYPE_U32, bar);
      // Make sure lane 0 has the appropriate array/depth compare values
      if (l != 0) {
         if (array)
            bld.mkOp3(OP_SHFL, TYPE_F32, arr, i->getSrc(0), lane, quad);
         if (i->tex.target.isShadow())
            bld.mkOp3(OP_SHFL, TYPE_F32, shadow, i->getSrc(array + dim + indirect), lane, quad);
      }

      // mov coordinates from lane l to all lanes
      for (c = 0; c < dim; ++c) {
         bld.mkOp3(OP_SHFL, TYPE_F32, crd[c], i->getSrc(c + array), lane, quad);
      }

      // add dPdx from lane l to lanes dx
      for (c = 0; c < dim; ++c) {
         bld.mkOp3(OP_SHFL, TYPE_F32, tmp, i->dPdx[c].get(), lane, quad);
         add = bld.mkOp2(OP_QUADOP, TYPE_F32, crd[c], tmp, crd[c]);
         add->subOp = qOps[0];
         add->lanes = 1; /* abused for .ndv */
      }

      // add dPdy from lane l to lanes dy
      for (c = 0; c < dim; ++c) {
         bld.mkOp3(OP_SHFL, TYPE_F32, tmp, i->dPdy[c].get(), lane, quad);
         add = bld.mkOp2(OP_QUADOP, TYPE_F32, crd[c], tmp, crd[c]);
         add->subOp = qOps[1];
         add->lanes = 1; /* abused for .ndv */
      }

      // normalize cube coordinates if necessary
      if (i->tex.target.isCube()) {
         // Scale all three coordinates by 1 / max(|x|, |y|, |z|).
         for (c = 0; c < 3; ++c)
            src[c] = bld.mkOp1v(OP_ABS, TYPE_F32, bld.getSSA(), crd[c]);
         val = bld.getScratch();
         bld.mkOp2(OP_MAX, TYPE_F32, val, src[0], src[1]);
         bld.mkOp2(OP_MAX, TYPE_F32, val, src[2], val);
         bld.mkOp1(OP_RCP, TYPE_F32, val, val);
         for (c = 0; c < 3; ++c)
            src[c] = bld.mkOp2v(OP_MUL, TYPE_F32, bld.getSSA(), crd[c], val);
      } else {
         for (c = 0; c < dim; ++c)
            src[c] = crd[c];
      }

      // texture
      bld.insert(tex = cloneForward(func, i));
      // For lanes other than 0, use the shuffled array / depth-compare
      // values instead of the original sources.
      if (l != 0) {
         if (array)
            tex->setSrc(0, arr);
         if (i->tex.target.isShadow())
            tex->setSrc(array + dim + indirect, shadow);
      }
      for (c = 0; c < dim; ++c)
         tex->setSrc(c + array, src[c]);
      // broadcast results from lane 0 to all lanes
      if (l != 0)
         for (c = 0; i->defExists(c); ++c)
            bld.mkOp3(OP_SHFL, TYPE_F32, tex->getDef(c), tex->getDef(c), bld.mkImm(0), quad);
      bld.mkOp1(OP_QUADPOP, TYPE_U32, NULL, bar)->fixed = 1;

      // save results
      for (c = 0; i->defExists(c); ++c) {
         Instruction *mov;
         def[c][l] = bld.getSSA();
         mov = bld.mkMov(def[c][l], tex->getDef(c));
         mov->fixed = 1;
         // Only lane l keeps the value computed in this iteration.
         mov->lanes = 1 << l;
      }
   }

   // Join the four per-lane values of each component into the original def.
   for (c = 0; i->defExists(c); ++c) {
      Instruction *u = bld.mkOp(OP_UNION, TYPE_U32, i->getDef(c));
      for (l = 0; l < 4; ++l)
         u->setSrc(l, def[c][l]);
   }

   // The original instruction is fully replaced by the sequence above.
   i->bb->remove(i);
   return true;
}
222
223bool
224GM107LoweringPass::handleDFDX(Instruction *insn)
225{
226   Instruction *shfl;
227   int qop = 0, xid = 0;
228
229   switch (insn->op) {
230   case OP_DFDX:
231      qop = QUADOP(SUB, SUBR, SUB, SUBR);
232      xid = 1;
233      break;
234   case OP_DFDY:
235      qop = QUADOP(SUB, SUB, SUBR, SUBR);
236      xid = 2;
237      break;
238   default:
239      assert(!"invalid dfdx opcode");
240      break;
241   }
242
243   shfl = bld.mkOp3(OP_SHFL, TYPE_F32, bld.getScratch(), insn->getSrc(0),
244                    bld.mkImm(xid), bld.mkImm(SHFL_BOUND_QUAD));
245   shfl->subOp = NV50_IR_SUBOP_SHFL_BFLY;
246   insn->op = OP_QUADOP;
247   insn->subOp = qop;
248   insn->lanes = 0; /* abused for !.ndv */
249   insn->setSrc(1, insn->getSrc(0));
250   insn->setSrc(0, shfl->getDef(0));
251   return true;
252}
253
254bool
255GM107LoweringPass::handlePFETCH(Instruction *i)
256{
257   Value *tmp0 = bld.getScratch();
258   Value *tmp1 = bld.getScratch();
259   Value *tmp2 = bld.getScratch();
260   bld.mkOp1(OP_RDSV, TYPE_U32, tmp0, bld.mkSysVal(SV_INVOCATION_INFO, 0));
261   bld.mkOp3(OP_PERMT, TYPE_U32, tmp1, tmp0, bld.mkImm(0x4442), bld.mkImm(0));
262   bld.mkOp3(OP_PERMT, TYPE_U32, tmp0, tmp0, bld.mkImm(0x4440), bld.mkImm(0));
263   if (i->getSrc(1))
264      bld.mkOp2(OP_ADD , TYPE_U32, tmp2, i->getSrc(0), i->getSrc(1));
265   else
266      bld.mkOp1(OP_MOV , TYPE_U32, tmp2, i->getSrc(0));
267   bld.mkOp3(OP_MAD , TYPE_U32, tmp0, tmp0, tmp1, tmp2);
268   i->setSrc(0, tmp0);
269   i->setSrc(1, NULL);
270   return true;
271}
272
273bool
274GM107LoweringPass::handlePOPCNT(Instruction *i)
275{
276   Value *tmp = bld.mkOp2v(OP_AND, i->sType, bld.getScratch(),
277                           i->getSrc(0), i->getSrc(1));
278   i->setSrc(0, tmp);
279   i->setSrc(1, NULL);
280   return true;
281}
282
283bool
284GM107LoweringPass::handleSUQ(TexInstruction *suq)
285{
286   Value *ind = suq->getIndirectR();
287   Value *handle;
288   const int slot = suq->tex.r;
289   const int mask = suq->tex.mask;
290
291   if (suq->tex.bindless)
292      handle = ind;
293   else
294      handle = loadTexHandle(ind, slot + 32);
295
296   suq->tex.r = 0xff;
297   suq->tex.s = 0x1f;
298
299   suq->setIndirectR(NULL);
300   suq->setSrc(0, handle);
301   suq->tex.rIndirectSrc = 0;
302   suq->setSrc(1, bld.loadImm(NULL, 0));
303   suq->tex.query = TXQ_DIMS;
304   suq->op = OP_TXQ;
305
306   // We store CUBE / CUBE_ARRAY as a 2D ARRAY. Make sure that depth gets
307   // divided by 6.
308   if (mask & 0x4 && suq->tex.target.isCube()) {
309      int d = util_bitcount(mask & 0x3);
310      bld.setPosition(suq, true);
311      bld.mkOp2(OP_DIV, TYPE_U32, suq->getDef(d), suq->getDef(d),
312                bld.loadImm(NULL, 6));
313   }
314
315   // Samples come from a different query. If we want both samples and dims,
316   // create a second suq.
317   if (mask & 0x8) {
318      int d = util_bitcount(mask & 0x7);
319      Value *dst = suq->getDef(d);
320      TexInstruction *samples = suq;
321      assert(dst);
322
323      if (mask != 0x8) {
324         suq->setDef(d, NULL);
325         suq->tex.mask &= 0x7;
326         samples = cloneShallow(func, suq);
327         for (int i = 0; i < d; i++)
328            samples->setDef(d, NULL);
329         samples->setDef(0, dst);
330         suq->bb->insertAfter(suq, samples);
331      }
332      samples->tex.mask = 0x4;
333      samples->tex.query = TXQ_TYPE;
334   }
335
336   if (suq->tex.target.isMS()) {
337      bld.setPosition(suq, true);
338
339      if (mask & 0x1)
340         bld.mkOp2(OP_SHR, TYPE_U32, suq->getDef(0), suq->getDef(0),
341                   loadMsAdjInfo32(suq->tex.target, 0, slot, ind, suq->tex.bindless));
342      if (mask & 0x2) {
343         int d = util_bitcount(mask & 0x1);
344         bld.mkOp2(OP_SHR, TYPE_U32, suq->getDef(d), suq->getDef(d),
345                   loadMsAdjInfo32(suq->tex.target, 1, slot, ind, suq->tex.bindless));
346      }
347   }
348
349   return true;
350}
351
352//
353// - add quadop dance for texturing
354// - put FP outputs in GPRs
355// - convert instruction sequences
356//
357bool
358GM107LoweringPass::visit(Instruction *i)
359{
360   bld.setPosition(i, false);
361
362   if (i->cc != CC_ALWAYS)
363      checkPredicate(i);
364
365   switch (i->op) {
366   case OP_PFETCH:
367      return handlePFETCH(i);
368   case OP_DFDX:
369   case OP_DFDY:
370      return handleDFDX(i);
371   case OP_POPCNT:
372      return handlePOPCNT(i);
373   case OP_SUQ:
374      return handleSUQ(i->asTex());
375   default:
376      return NVC0LoweringPass::visit(i);
377   }
378}
379
380} // namespace nv50_ir
381