/*
 * Copyright (C) 2018 Jonathan Marek <jonathan@marek.ca>
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 *
 * Authors:
 *    Jonathan Marek <jonathan@marek.ca>
 */

#include "ir2_private.h"

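/* an instruction can only go in the scalar slot when it has a scalar opcode
 * and a single-component source
 */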
static bool
scalar_possible(struct ir2_instr *instr)
{
   if (instr->alu.scalar_opc == SCALAR_NONE)
      return false;

   return src_ncomp(instr) == 1;
}

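/* check if ALU instruction "b" can be paired in the same slot as "a"
 * (a is the already selected vector instruction and may be NULL)
 */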
static bool
is_alu_compatible(struct ir2_instr *a, struct ir2_instr *b)
{
   if (!a)
      return true;

   /* don't use the same instruction twice */
   if (a == b)
      return false;

   /* PRED_SET must be alone */
   if (b->alu.scalar_opc >= PRED_SETEs &&
       b->alu.scalar_opc <= PRED_SET_RESTOREs)
      return false;

   /* must write to the same export (issues otherwise?) */
   return a->alu.export == b->alu.export;
}

/* priority of vector instruction for scheduling (lower=higher prio) */
static unsigned
alu_vector_prio(struct ir2_instr *instr)
{
   if (instr->alu.vector_opc == VECTOR_NONE)
      return ~0u;

   if (is_export(instr))
      return 4;

   /* TODO check src type and ncomps */
   if (instr->src_count == 3)
      return 0;

   if (!scalar_possible(instr))
      return 1;

   return instr->src_count == 2 ? 2 : 3;
}

/* priority of scalar instruction for scheduling (lower=higher prio) */
static unsigned
alu_scalar_prio(struct ir2_instr *instr)
{
   if (!scalar_possible(instr))
      return ~0u;

   /* this case is dealt with later */
   if (instr->src_count > 1)
      return ~0u;

   if (is_export(instr))
      return 4;

   /* PRED goes to the end of the block */
   if (instr->alu.scalar_opc >= PRED_SETEs &&
       instr->alu.scalar_opc <= PRED_SET_RESTOREs)
      return 5;

   /* scalar-only instructions get the highest priority */
   return instr->alu.vector_opc == VECTOR_NONE ? 0 : 3;
}

/* this is a bit messy:
 * we want to find a slot where we can insert a scalar MOV alongside
 * a vector instruction that was already scheduled
 */
static struct ir2_sched_instr *
insert(struct ir2_context *ctx, unsigned block_idx, unsigned reg_idx,
       struct ir2_src src1, unsigned *comp)
{
   struct ir2_sched_instr *sched = NULL, *s;
   unsigned i, mask = 0xf;

   /* go to the earliest point where the mov can be inserted */
   for (i = ctx->instr_sched_count - 1; i > 0; i--) {
      s = &ctx->instr_sched[i - 1];

      if (s->instr && s->instr->block_idx != block_idx)
         break;
      if (s->instr_s && s->instr_s->block_idx != block_idx)
         break;

      if (src1.type == IR2_SRC_SSA) {
         if ((s->instr && s->instr->idx == src1.num) ||
             (s->instr_s && s->instr_s->idx == src1.num))
            break;
      }

      unsigned mr = ~(s->reg_state[reg_idx / 8] >> reg_idx % 8 * 4 & 0xf);
      if ((mask & mr) == 0)
         break;

      mask &= mr;
      if (s->instr_s || s->instr->src_count == 3)
         continue;

      if (s->instr->type != IR2_ALU || s->instr->alu.export >= 0)
         continue;

      sched = s;
   }
   *comp = ffs(mask) - 1;

   if (sched) {
      for (s = sched; s != &ctx->instr_sched[ctx->instr_sched_count]; s++)
         s->reg_state[reg_idx / 8] |= 1 << (*comp + reg_idx % 8 * 4);
   }

   return sched;
}

/* case1:
 * insert a mov to place the 2nd src into the same reg
 * (scalar sources come from the same register)
 *
 * this is a common case which works when one of the srcs is input/const,
 * but for instrs which have two ssa/reg srcs it's not ideal
 */
static bool
scalarize_case1(struct ir2_context *ctx, struct ir2_instr *instr, bool order)
{
   struct ir2_src src0 = instr->src[order];
   struct ir2_src src1 = instr->src[!order];
   struct ir2_sched_instr *sched;
   struct ir2_instr *ins;
   struct ir2_reg *reg;
   unsigned idx, comp;

   switch (src0.type) {
   case IR2_SRC_CONST:
   case IR2_SRC_INPUT:
      return false;
   default:
      break;
   }

   /* TODO, insert needs logic for this */
   if (src1.type == IR2_SRC_REG)
      return false;

   /* we could do something if they match src1.. */
   if (src0.negate || src0.abs)
      return false;

   reg = get_reg_src(ctx, &src0);

   /* result is not used anymore since we will overwrite it */
   for (int i = 0; i < 4; i++)
      if (reg->comp[i].ref_count != !!(instr->alu.write_mask & 1 << i))
         return false;

   /* find a place to insert the mov */
   sched = insert(ctx, instr->block_idx, reg->idx, src1, &comp);
   if (!sched)
      return false;

   ins = &ctx->instr[idx = ctx->instr_count++];
   ins->idx = idx;
   ins->type = IR2_ALU;
   ins->src[0] = src1;
   ins->src_count = 1;
   ins->is_ssa = true;
   ins->ssa.idx = reg->idx;
   ins->ssa.ncomp = 1;
   ins->ssa.comp[0].c = comp;
   ins->alu.scalar_opc = MAXs;
   ins->alu.export = -1;
   ins->alu.write_mask = 1;
   ins->pred = instr->pred;
   ins->block_idx = instr->block_idx;

   instr->src[0] = src0;
   instr->alu.src1_swizzle = comp;

   sched->instr_s = ins;
   return true;
}

/* fill sched with the next fetch or (vector and/or scalar) alu instruction */
static int
sched_next(struct ir2_context *ctx, struct ir2_sched_instr *sched)
{
   struct ir2_instr *avail[0x100], *instr_v = NULL, *instr_s = NULL;
   unsigned avail_count = 0;

   instr_alloc_type_t export = ~0u;
   int block_idx = -1;

   /* XXX merge this loop with the other one somehow? */
   ir2_foreach_instr (instr, ctx) {
      if (!instr->need_emit)
         continue;
      if (is_export(instr))
         export = MIN2(export, export_buf(instr->alu.export));
   }

   ir2_foreach_instr (instr, ctx) {
      if (!instr->need_emit)
         continue;

      /* don't mix exports */
      if (is_export(instr) && export_buf(instr->alu.export) != export)
         continue;

      if (block_idx < 0)
         block_idx = instr->block_idx;
      else if (block_idx != instr->block_idx || /* must be same block */
               instr->type == IR2_CF ||         /* CF/MEM must be alone */
               (is_export(instr) && export == SQ_MEMORY))
         break;
      /* this works because IR2_CF is always at the end of a block,
       * and roughly the same idea applies to MEM exports, which might not be
       * alone but will at least end up in order
       */

      /* check if dependencies are satisfied */
      bool is_ok = true;
      ir2_foreach_src (src, instr) {
         if (src->type == IR2_SRC_REG) {
            /* need to check if all previous instructions in the block
             * which write the reg have been emitted
             * slow..
             * XXX: check components instead of the whole register
             */
            struct ir2_reg *reg = get_reg_src(ctx, src);
            ir2_foreach_instr (p, ctx) {
               if (!p->is_ssa && p->reg == reg && p->idx < instr->idx)
                  is_ok &= !p->need_emit;
            }
         } else if (src->type == IR2_SRC_SSA) {
            /* in this case it's easy, just check need_emit */
            is_ok &= !ctx->instr[src->num].need_emit;
         }
      }
      /* don't reorder a non-ssa write before a read */
      if (!instr->is_ssa) {
         ir2_foreach_instr (p, ctx) {
            if (!p->need_emit || p->idx >= instr->idx)
               continue;

            ir2_foreach_src (src, p) {
               if (get_reg_src(ctx, src) == instr->reg)
                  is_ok = false;
            }
         }
      }
      /* don't reorder across predicates */
      if (avail_count && instr->pred != avail[0]->pred)
         is_ok = false;

      if (!is_ok)
         continue;

      avail[avail_count++] = instr;
   }

   if (!avail_count) {
      assert(block_idx == -1);
      return -1;
   }

   /* give priority to FETCH instructions */
   ir2_foreach_avail (instr) {
      if (instr->type == IR2_ALU)
         continue;

      ra_src_free(ctx, instr);
      ra_reg(ctx, get_reg(instr), -1, false, 0);

      instr->need_emit = false;
      sched->instr = instr;
      sched->instr_s = NULL;
      return block_idx;
   }

   /* TODO precompute priorities */

   unsigned prio_v = ~0u, prio_s = ~0u, prio;
   ir2_foreach_avail (instr) {
      prio = alu_vector_prio(instr);
      if (prio < prio_v) {
         instr_v = instr;
         prio_v = prio;
      }
   }

   /* TODO can still insert scalar if src_count=3, if smart about it */
   if (!instr_v || instr_v->src_count < 3) {
      ir2_foreach_avail (instr) {
         bool compat = is_alu_compatible(instr_v, instr);

         prio = alu_scalar_prio(instr);
         if (prio >= prio_v && !compat)
            continue;

         if (prio < prio_s) {
            instr_s = instr;
            prio_s = prio;
            if (!compat)
               instr_v = NULL;
         }
      }
   }

   assert(instr_v || instr_s);

   /* now, try the more complex insertion of a vector instruction as a scalar
    * TODO: if we are smart we can still insert if instr_v->src_count == 3
    */
   if (!instr_s && instr_v->src_count < 3) {
      ir2_foreach_avail (instr) {
         if (!is_alu_compatible(instr_v, instr) || !scalar_possible(instr))
            continue;

         /* at this point, src_count should always be 2 */
         assert(instr->src_count == 2);

         if (scalarize_case1(ctx, instr, 0)) {
            instr_s = instr;
            break;
         }
         if (scalarize_case1(ctx, instr, 1)) {
            instr_s = instr;
            break;
         }
      }
   }

   /* free src registers */
   if (instr_v) {
      instr_v->need_emit = false;
      ra_src_free(ctx, instr_v);
   }

   if (instr_s) {
      instr_s->need_emit = false;
      ra_src_free(ctx, instr_s);
   }

   /* allocate dst registers */
   if (instr_v)
      ra_reg(ctx, get_reg(instr_v), -1, is_export(instr_v),
             instr_v->alu.write_mask);

   if (instr_s)
      ra_reg(ctx, get_reg(instr_s), -1, is_export(instr_s),
             instr_s->alu.write_mask);

   sched->instr = instr_v;
   sched->instr_s = instr_s;
   return block_idx;
}

/* scheduling: determine the order of instructions */
static void
schedule_instrs(struct ir2_context *ctx)
{
   struct ir2_sched_instr *sched;
   int block_idx;

   /* allocate input registers */
   for (unsigned idx = 0; idx < ARRAY_SIZE(ctx->input); idx++)
      if (ctx->input[idx].initialized)
         ra_reg(ctx, &ctx->input[idx], idx, false, 0);

   for (;;) {
      sched = &ctx->instr_sched[ctx->instr_sched_count++];
      block_idx = sched_next(ctx, sched);
      if (block_idx < 0)
         break;
      memcpy(sched->reg_state, ctx->reg_state, sizeof(ctx->reg_state));

      /* catch a texture fetch after scheduling and insert the
       * SET_TEX_LOD right before it if necessary
       * TODO clean this up
       */
      struct ir2_instr *instr = sched->instr, *tex_lod;
      if (instr && instr->type == IR2_FETCH && instr->fetch.opc == TEX_FETCH &&
          instr->src_count == 2) {
         /* generate the SET_LOD instruction */
         tex_lod = &ctx->instr[ctx->instr_count++];
         tex_lod->type = IR2_FETCH;
         tex_lod->block_idx = instr->block_idx;
         tex_lod->pred = instr->pred;
         tex_lod->fetch.opc = TEX_SET_TEX_LOD;
         tex_lod->src[0] = instr->src[1];
         tex_lod->src_count = 1;

         sched[1] = sched[0];
         sched->instr = tex_lod;
         ctx->instr_sched_count++;
      }

      bool free_block = true;
      ir2_foreach_instr (instr, ctx)
         free_block &= instr->block_idx != block_idx;
      if (free_block)
         ra_block_free(ctx, block_idx);
   }
   ctx->instr_sched_count--;
}

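/* compile a shader variant: convert NIR to ir2, run the copy propagation and
 * ref-count passes, schedule the instructions and assemble the result
 */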
void
ir2_compile(struct fd2_shader_stateobj *so, unsigned variant,
            struct fd2_shader_stateobj *fp)
{
   struct ir2_context ctx = {};
   bool binning = !fp && so->type == MESA_SHADER_VERTEX;

   if (fp)
      so->variant[variant].f = fp->variant[0].f;

   ctx.so = so;
   ctx.info = &so->variant[variant].info;
   ctx.f = &so->variant[variant].f;
   ctx.info->max_reg = -1;

   /* convert nir to internal representation */
   ir2_nir_compile(&ctx, binning);

   /* copy propagate srcs */
   cp_src(&ctx);

   /* get ref_counts and kill non-needed instructions */
   ra_count_refs(&ctx);

   /* remove movs used to write outputs */
   cp_export(&ctx);

   /* instruction order.. and vector->scalar conversions */
   schedule_instrs(&ctx);

   /* finally, assemble to bitcode */
   assemble(&ctx, binning);
}