1/*
2 * Copyright 2020 Red Hat Inc.
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice shall be included in
12 * all copies or substantial portions of the Software.
13 *
14 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
15 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
17 * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
18 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
19 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
20 * OTHER DEALINGS IN THE SOFTWARE.
21 */
22#include "nv50_ir.h"
23#include "nv50_ir_build_util.h"
24
25#include "nv50_ir_target_nvc0.h"
26#include "nv50_ir_lowering_gv100.h"
27
28#include <limits>
29
30namespace nv50_ir {
31
32bool
33GV100LegalizeSSA::handleCMP(Instruction *i)
34{
35   Value *pred = bld.getSSA(1, FILE_PREDICATE);
36
37   bld.mkCmp(OP_SET, reverseCondCode(i->asCmp()->setCond), TYPE_U8, pred,
38             i->sType, bld.mkImm(0), i->getSrc(2))->ftz = i->ftz;
39   bld.mkOp3(OP_SELP, TYPE_U32, i->getDef(0), i->getSrc(0), i->getSrc(1), pred);
40   return true;
41}
42
43// NIR deals with most of these for us, but codegen generates more in pointer
44// calculations from other lowering passes.
45bool
46GV100LegalizeSSA::handleIADD64(Instruction *i)
47{
48   Value *carry = bld.getSSA(1, FILE_PREDICATE);
49   Value *def[2] = { bld.getSSA(), bld.getSSA() };
50   Value *src[2][2];
51
52   for (int s = 0; s < 2; s++) {
53      if (i->getSrc(s)->reg.size == 8) {
54         bld.mkSplit(src[s], 4, i->getSrc(s));
55      } else {
56         src[s][0] = i->getSrc(s);
57         src[s][1] = bld.mkImm(0);
58      }
59   }
60
61   bld.mkOp2(OP_ADD, TYPE_U32, def[0], src[0][0], src[1][0])->
62      setFlagsDef(1, carry);
63   bld.mkOp2(OP_ADD, TYPE_U32, def[1], src[0][1], src[1][1])->
64      setFlagsSrc(2, carry);
65   bld.mkOp2(OP_MERGE, i->dType, i->getDef(0), def[0], def[1]);
66   return true;
67}
68
69bool
70GV100LegalizeSSA::handleIMAD_HIGH(Instruction *i)
71{
72   Value *def = bld.getSSA(8), *defs[2];
73   Value *src2;
74
75   if (i->srcExists(2) &&
76       (!i->getSrc(2)->asImm() || i->getSrc(2)->asImm()->reg.data.u32)) {
77      Value *src2s[2] = { bld.getSSA(), bld.getSSA() };
78      bld.mkMov(src2s[0], bld.mkImm(0));
79      bld.mkMov(src2s[1], i->getSrc(2));
80      src2 = bld.mkOp2(OP_MERGE, TYPE_U64, bld.getSSA(8), src2s[0], src2s[1])->getDef(0);
81   } else {
82      src2 = bld.mkImm(0);
83   }
84
85   bld.mkOp3(OP_MAD, isSignedType(i->sType) ? TYPE_S64 : TYPE_U64, def,
86             i->getSrc(0), i->getSrc(1), src2);
87
88   bld.mkSplit(defs, 4, def);
89   i->def(0).replace(defs[1], false);
90   return true;
91}
92
93// XXX: We should be able to do this in GV100LoweringPass, but codegen messes
94//      up somehow and swaps the condcode without swapping the sources.
95//      - tests/spec/glsl-1.50/execution/geometry/primitive-id-in.shader_test
96bool
97GV100LegalizeSSA::handleIMNMX(Instruction *i)
98{
99   Value *pred = bld.getSSA(1, FILE_PREDICATE);
100
101   bld.mkCmp(OP_SET, (i->op == OP_MIN) ? CC_LT : CC_GT, i->dType, pred,
102             i->sType, i->getSrc(0), i->getSrc(1));
103   bld.mkOp3(OP_SELP, i->dType, i->getDef(0), i->getSrc(0), i->getSrc(1), pred);
104   return true;
105}
106
107bool
108GV100LegalizeSSA::handleIMUL(Instruction *i)
109{
110   if (i->subOp == NV50_IR_SUBOP_MUL_HIGH)
111      return handleIMAD_HIGH(i);
112
113   bld.mkOp3(OP_MAD, i->dType, i->getDef(0), i->getSrc(0), i->getSrc(1),
114             bld.mkImm(0));
115   return true;
116}
117
118bool
119GV100LegalizeSSA::handleLOP2(Instruction *i)
120{
121   uint8_t src0 = NV50_IR_SUBOP_LOP3_LUT_SRC0;
122   uint8_t src1 = NV50_IR_SUBOP_LOP3_LUT_SRC1;
123   uint8_t subOp;
124
125   if (i->src(0).mod & Modifier(NV50_IR_MOD_NOT))
126      src0 = ~src0;
127   if (i->src(1).mod & Modifier(NV50_IR_MOD_NOT))
128      src1 = ~src1;
129
130   switch (i->op) {
131   case OP_AND: subOp = src0 & src1; break;
132   case OP_OR : subOp = src0 | src1; break;
133   case OP_XOR: subOp = src0 ^ src1; break;
134   default:
135      unreachable("invalid LOP2 opcode");
136   }
137
138   bld.mkOp3(OP_LOP3_LUT, TYPE_U32, i->getDef(0), i->getSrc(0), i->getSrc(1),
139             bld.mkImm(0))->subOp = subOp;
140   return true;
141}
142
143bool
144GV100LegalizeSSA::handleNOT(Instruction *i)
145{
146   bld.mkOp3(OP_LOP3_LUT, TYPE_U32, i->getDef(0), bld.mkImm(0), i->getSrc(0),
147             bld.mkImm(0))->subOp = (uint8_t)~NV50_IR_SUBOP_LOP3_LUT_SRC1;
148   return true;
149}
150
151bool
152GV100LegalizeSSA::handlePREEX2(Instruction *i)
153{
154   i->def(0).replace(i->src(0), false);
155   return true;
156}
157
158bool
159GV100LegalizeSSA::handleQUADON(Instruction *i)
160{
161   bld.mkBMov(i->getDef(0), bld.mkTSVal(TS_MACTIVE));
162   Instruction *b = bld.mkBMov(bld.mkTSVal(TS_PQUAD_MACTIVE), i->getDef(0));
163   b->fixed = 1;
164   return true;
165}
166
167bool
168GV100LegalizeSSA::handleQUADPOP(Instruction *i)
169{
170   Instruction *b = bld.mkBMov(bld.mkTSVal(TS_MACTIVE), i->getSrc(0));
171   b->fixed = 1;
172   return true;
173}
174
175bool
176GV100LegalizeSSA::handleSET(Instruction *i)
177{
178   Value *src2 = i->srcExists(2) ? i->getSrc(2) : NULL;
179   Value *pred = bld.getSSA(1, FILE_PREDICATE), *met;
180   Instruction *xsetp;
181
182   if (isFloatType(i->dType)) {
183      if (i->sType == TYPE_F32)
184         return false; // HW has FSET.BF
185      met = bld.mkImm(0x3f800000);
186   } else {
187      met = bld.mkImm(0xffffffff);
188   }
189
190   xsetp = bld.mkCmp(i->op, i->asCmp()->setCond, TYPE_U8, pred, i->sType,
191                     i->getSrc(0), i->getSrc(1));
192   xsetp->src(0).mod = i->src(0).mod;
193   xsetp->src(1).mod = i->src(1).mod;
194   xsetp->setSrc(2, src2);
195   xsetp->ftz = i->ftz;
196
197   i = bld.mkOp3(OP_SELP, TYPE_U32, i->getDef(0), bld.mkImm(0), met, pred);
198   i->src(2).mod = Modifier(NV50_IR_MOD_NOT);
199   return true;
200}
201
202bool
203GV100LegalizeSSA::handleSHFL(Instruction *i)
204{
205   Instruction *sync = new_Instruction(func, OP_WARPSYNC, TYPE_NONE);
206   sync->fixed = 1;
207   sync->setSrc(0, bld.mkImm(0xffffffff));
208   i->bb->insertBefore(i, sync);
209   return false;
210}
211
212bool
213GV100LegalizeSSA::handleShift(Instruction *i)
214{
215   Value *zero = bld.mkImm(0);
216   Value *src1 = i->getSrc(1);
217   Value *src0, *src2;
218   uint8_t subOp = i->op == OP_SHL ? NV50_IR_SUBOP_SHF_L : NV50_IR_SUBOP_SHF_R;
219
220   if (i->op == OP_SHL && i->src(0).getFile() == FILE_GPR) {
221      src0 = i->getSrc(0);
222      src2 = zero;
223   } else {
224      src0 = zero;
225      src2 = i->getSrc(0);
226      subOp |= NV50_IR_SUBOP_SHF_HI;
227   }
228   if (i->subOp & NV50_IR_SUBOP_SHIFT_WRAP)
229      subOp |= NV50_IR_SUBOP_SHF_W;
230
231   bld.mkOp3(OP_SHF, i->dType, i->getDef(0), src0, src1, src2)->subOp = subOp;
232   return true;
233}
234
235bool
236GV100LegalizeSSA::handleSUB(Instruction *i)
237{
238   Instruction *xadd =
239      bld.mkOp2(OP_ADD, i->dType, i->getDef(0), i->getSrc(0), i->getSrc(1));
240   xadd->src(0).mod = i->src(0).mod;
241   xadd->src(1).mod = i->src(1).mod ^ Modifier(NV50_IR_MOD_NEG);
242   xadd->ftz = i->ftz;
243   return true;
244}
245
246bool
247GV100LegalizeSSA::visit(Instruction *i)
248{
249   bool lowered = false;
250
251   bld.setPosition(i, false);
252   if (i->sType == TYPE_F32 && i->dType != TYPE_F16 &&
253       prog->getType() != Program::TYPE_COMPUTE)
254      handleFTZ(i);
255
256   switch (i->op) {
257   case OP_AND:
258   case OP_OR:
259   case OP_XOR:
260      if (i->def(0).getFile() != FILE_PREDICATE)
261         lowered = handleLOP2(i);
262      break;
263   case OP_NOT:
264      lowered = handleNOT(i);
265      break;
266   case OP_SHL:
267   case OP_SHR:
268      lowered = handleShift(i);
269      break;
270   case OP_SET:
271   case OP_SET_AND:
272   case OP_SET_OR:
273   case OP_SET_XOR:
274      if (i->def(0).getFile() != FILE_PREDICATE)
275         lowered = handleSET(i);
276      break;
277   case OP_SLCT:
278      lowered = handleCMP(i);
279      break;
280   case OP_PREEX2:
281      lowered = handlePREEX2(i);
282      break;
283   case OP_MUL:
284      if (!isFloatType(i->dType))
285         lowered = handleIMUL(i);
286      break;
287   case OP_MAD:
288      if (!isFloatType(i->dType) && i->subOp == NV50_IR_SUBOP_MUL_HIGH)
289         lowered = handleIMAD_HIGH(i);
290      break;
291   case OP_SHFL:
292      lowered = handleSHFL(i);
293      break;
294   case OP_QUADON:
295      lowered = handleQUADON(i);
296      break;
297   case OP_QUADPOP:
298      lowered = handleQUADPOP(i);
299      break;
300   case OP_SUB:
301      lowered = handleSUB(i);
302      break;
303   case OP_MAX:
304   case OP_MIN:
305      if (!isFloatType(i->dType))
306         lowered = handleIMNMX(i);
307      break;
308   case OP_ADD:
309      if (!isFloatType(i->dType) && typeSizeof(i->dType) == 8)
310         lowered = handleIADD64(i);
311      break;
312   case OP_PFETCH:
313      handlePFETCH(i);
314      break;
315   case OP_LOAD:
316      handleLOAD(i);
317      break;
318   default:
319      break;
320   }
321
322   if (lowered)
323      delete_Instruction(prog, i);
324
325   return true;
326}
327
328bool
329GV100LoweringPass::handleDMNMX(Instruction *i)
330{
331   Value *pred = bld.getSSA(1, FILE_PREDICATE);
332   Value *src0[2], *src1[2], *dest[2];
333
334   bld.mkCmp(OP_SET, (i->op == OP_MIN) ? CC_LT : CC_GT, TYPE_U32, pred,
335             i->sType, i->getSrc(0), i->getSrc(1));
336   bld.mkSplit(src0, 4, i->getSrc(0));
337   bld.mkSplit(src1, 4, i->getSrc(1));
338   bld.mkSplit(dest, 4, i->getDef(0));
339   bld.mkOp3(OP_SELP, TYPE_U32, dest[0], src0[0], src1[0], pred);
340   bld.mkOp3(OP_SELP, TYPE_U32, dest[1], src0[1], src1[1], pred);
341   bld.mkOp2(OP_MERGE, TYPE_U64, i->getDef(0), dest[0], dest[1]);
342   return true;
343}
344
345bool
346GV100LoweringPass::handleEXTBF(Instruction *i)
347{
348   Value *bit = bld.getScratch();
349   Value *cnt = bld.getScratch();
350   Value *mask = bld.getScratch();
351   Value *zero = bld.mkImm(0);
352
353   bld.mkOp3(OP_PERMT, TYPE_U32, bit, i->getSrc(1), bld.mkImm(0x4440), zero);
354   bld.mkOp3(OP_PERMT, TYPE_U32, cnt, i->getSrc(1), bld.mkImm(0x4441), zero);
355   bld.mkOp2(OP_BMSK, TYPE_U32, mask, bit, cnt);
356   bld.mkOp2(OP_AND, TYPE_U32, mask, i->getSrc(0), mask);
357   bld.mkOp2(OP_SHR, TYPE_U32, i->getDef(0), mask, bit);
358   if (isSignedType(i->dType))
359      bld.mkOp2(OP_SGXT, TYPE_S32, i->getDef(0), i->getDef(0), cnt);
360
361   return true;
362}
363
364bool
365GV100LoweringPass::handleFLOW(Instruction *i)
366{
367   i->op = OP_BRA;
368   return false;
369}
370
371bool
372GV100LoweringPass::handleI2I(Instruction *i)
373{
374   bld.mkCvt(OP_CVT, TYPE_F32, i->getDef(0), i->sType, i->getSrc(0))->
375      subOp = i->subOp;
376   bld.mkCvt(OP_CVT, i->dType, i->getDef(0), TYPE_F32, i->getDef(0));
377   return true;
378}
379
380bool
381GV100LoweringPass::handleINSBF(Instruction *i)
382{
383   Value *bit = bld.getScratch();
384   Value *cnt = bld.getScratch();
385   Value *mask = bld.getScratch();
386   Value *src0 = bld.getScratch();
387   Value *zero = bld.mkImm(0);
388
389   bld.mkOp3(OP_PERMT, TYPE_U32, bit, i->getSrc(1), bld.mkImm(0x4440), zero);
390   bld.mkOp3(OP_PERMT, TYPE_U32, cnt, i->getSrc(1), bld.mkImm(0x4441), zero);
391   bld.mkOp2(OP_BMSK, TYPE_U32, mask, zero, cnt);
392
393   bld.mkOp2(OP_AND, TYPE_U32, src0, i->getSrc(0), mask);
394   bld.mkOp2(OP_SHL, TYPE_U32, src0, src0, bit);
395
396   bld.mkOp2(OP_SHL, TYPE_U32, mask, mask, bit);
397   bld.mkOp3(OP_LOP3_LUT, TYPE_U32, i->getDef(0), src0, i->getSrc(2), mask)->
398      subOp = NV50_IR_SUBOP_LOP3_LUT(a | (b & ~c));
399
400   return true;
401}
402
403bool
404GV100LoweringPass::handlePINTERP(Instruction *i)
405{
406   Value *src2 = i->srcExists(2) ? i->getSrc(2) : NULL;
407   Instruction *ipa, *mul;
408
409   ipa = bld.mkOp2(OP_LINTERP, TYPE_F32, i->getDef(0), i->getSrc(0), src2);
410   ipa->ipa = i->ipa;
411   mul = bld.mkOp2(OP_MUL, TYPE_F32, i->getDef(0), i->getDef(0), i->getSrc(1));
412
413   if (i->getInterpMode() == NV50_IR_INTERP_SC) {
414      ipa->setDef(1, bld.getSSA(1, FILE_PREDICATE));
415      mul->setPredicate(CC_NOT_P, ipa->getDef(1));
416   }
417
418   return true;
419}
420
421bool
422GV100LoweringPass::handlePREFLOW(Instruction *i)
423{
424   return true;
425}
426
427bool
428GV100LoweringPass::handlePRESIN(Instruction *i)
429{
430   const float f = 1.0 / (2.0 * 3.14159265);
431   bld.mkOp2(OP_MUL, i->dType, i->getDef(0), i->getSrc(0), bld.mkImm(f));
432   return true;
433}
434
435bool
436GV100LoweringPass::visit(Instruction *i)
437{
438   bool lowered = false;
439
440   bld.setPosition(i, false);
441
442   switch (i->op) {
443   case OP_BREAK:
444   case OP_CONT:
445      lowered = handleFLOW(i);
446      break;
447   case OP_PREBREAK:
448   case OP_PRECONT:
449      lowered = handlePREFLOW(i);
450      break;
451   case OP_CVT:
452      if (i->src(0).getFile() != FILE_PREDICATE &&
453          i->def(0).getFile() != FILE_PREDICATE &&
454          !isFloatType(i->dType) && !isFloatType(i->sType))
455         lowered = handleI2I(i);
456      break;
457   case OP_EXTBF:
458      lowered = handleEXTBF(i);
459      break;
460   case OP_INSBF:
461      lowered = handleINSBF(i);
462      break;
463   case OP_MAX:
464   case OP_MIN:
465      if (i->dType == TYPE_F64)
466         lowered = handleDMNMX(i);
467      break;
468   case OP_PINTERP:
469      lowered = handlePINTERP(i);
470      break;
471   case OP_PRESIN:
472      lowered = handlePRESIN(i);
473      break;
474   default:
475      break;
476   }
477
478   if (lowered)
479      delete_Instruction(prog, i);
480
481   return true;
482}
483
484} // namespace nv50_ir
485