/*
 * Copyright (c) 2012 Rob Clark <robdclark@gmail.com>
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#include "ir3.h"

#include <assert.h>
#include <errno.h>
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#include "util/bitscan.h"
#include "util/half_float.h"
#include "util/ralloc.h"
#include "util/u_math.h"

#include "instr-a3xx.h"
#include "ir3_shader.h"

/* simple allocator to carve allocations out of an up-front allocated heap,
 * so that we can free everything easily in one shot.
 */
void *
ir3_alloc(struct ir3 *shader, int sz)
{
   return rzalloc_size(shader, sz); /* TODO: don't use rzalloc */
}

struct ir3 *
ir3_create(struct ir3_compiler *compiler, struct ir3_shader_variant *v)
{
   struct ir3 *shader = rzalloc(v, struct ir3);

   shader->compiler = compiler;
   shader->type = v->type;

   list_inithead(&shader->block_list);
   list_inithead(&shader->array_list);

   return shader;
}

void
ir3_destroy(struct ir3 *shader)
{
   ralloc_free(shader);
}
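
/* For illustration, the intended lifetime (a sketch, not part of the API
 * contract): every ir3_alloc() allocation is parented to the shader's
 * ralloc heap, so a single ir3_destroy() frees everything at once:
 *
 *    struct ir3 *ir = ir3_create(compiler, v);
 *    void *tmp = ir3_alloc(ir, 128);
 *    ... build the shader, no need to track or free tmp ...
 *    ir3_destroy(ir);   // frees tmp and all other allocations in one shot
 */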

static bool
is_shared_consts(struct ir3_compiler *compiler,
                 struct ir3_const_state *const_state,
                 struct ir3_register *reg)
{
   if (const_state->shared_consts_enable && reg->flags & IR3_REG_CONST) {
      uint32_t min_const_reg = regid(compiler->shared_consts_base_offset, 0);
      uint32_t max_const_reg =
         regid(compiler->shared_consts_base_offset +
               compiler->shared_consts_size, 0);
      return reg->num >= min_const_reg && reg->num < max_const_reg;
   }

   return false;
}
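
/* A worked example of the range check above, with illustrative (not
 * hardware-verified) values: assuming regid(r, c) packs the component into
 * the low two bits (r * 4 + c), shared_consts_base_offset = 504 and
 * shared_consts_size = 8 give
 *
 *    min_const_reg = regid(504, 0) = 2016
 *    max_const_reg = regid(512, 0) = 2048
 *
 * so exactly the const components in [2016, 2048) are treated as shared.
 */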

static void
collect_reg_info(struct ir3_instruction *instr, struct ir3_register *reg,
                 struct ir3_info *info)
{
   struct ir3_shader_variant *v = info->data;
   unsigned repeat = instr->repeat;

   if (reg->flags & IR3_REG_IMMED) {
      /* nothing to do */
      return;
   }

   /* Shared consts don't need to be included in constlen. */
   if (is_shared_consts(v->compiler, ir3_const_state(v), reg))
      return;

   if (!(reg->flags & IR3_REG_R)) {
      repeat = 0;
   }

   unsigned components;
   int16_t max;

   if (reg->flags & IR3_REG_RELATIV) {
      components = reg->size;
      max = (reg->array.base + components - 1);
   } else {
      components = util_last_bit(reg->wrmask);
      max = (reg->num + repeat + components - 1);
   }

   if (reg->flags & IR3_REG_CONST) {
      info->max_const = MAX2(info->max_const, max >> 2);
   } else if (max < regid(48, 0)) {
      if (reg->flags & IR3_REG_HALF) {
         if (v->mergedregs) {
            /* starting w/ a6xx, half regs conflict with full regs: */
            info->max_reg = MAX2(info->max_reg, max >> 3);
         } else {
            info->max_half_reg = MAX2(info->max_half_reg, max >> 2);
         }
      } else {
         info->max_reg = MAX2(info->max_reg, max >> 2);
      }
   }
}
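
/* A worked example of the bookkeeping above (hypothetical register): a
 * non-relative full source r5.y with wrmask 0x3 and no (rpt) gives
 *
 *    components = util_last_bit(0x3)      = 2
 *    max        = regid(5, 1) + 0 + 2 - 1 = 22
 *    max_reg    = MAX2(max_reg, 22 >> 2)  = 5
 *
 * i.e. max counts scalar components while max_reg tracks the highest vec4
 * register touched (assuming regid(r, c) == r * 4 + c).
 */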

bool
ir3_should_double_threadsize(struct ir3_shader_variant *v, unsigned regs_count)
{
   const struct ir3_compiler *compiler = v->compiler;

   /* If the user forced a particular wavesize, respect that. */
   if (v->real_wavesize == IR3_SINGLE_ONLY)
      return false;
   if (v->real_wavesize == IR3_DOUBLE_ONLY)
      return true;

   /* We can't support more than compiler->branchstack_size diverging threads
    * in a wave. Thus, doubling the threadsize is only possible if we don't
    * exceed the branchstack size limit.
    */
   if (MIN2(v->branchstack, compiler->threadsize_base * 2) >
       compiler->branchstack_size) {
      return false;
   }

   switch (v->type) {
   case MESA_SHADER_KERNEL:
   case MESA_SHADER_COMPUTE: {
      unsigned threads_per_wg =
         v->local_size[0] * v->local_size[1] * v->local_size[2];

      /* For a5xx, if the workgroup size is greater than the maximum number
       * of threads per core with 32 threads per wave (512) then we have to
       * use the doubled threadsize because otherwise the workgroup wouldn't
       * fit. For smaller workgroup sizes, we follow the blob and use the
       * smaller threadsize.
       */
      if (compiler->gen < 6) {
         return v->local_size_variable ||
                threads_per_wg >
                   compiler->threadsize_base * compiler->max_waves;
      }

      /* On a6xx, we prefer the larger threadsize unless the workgroup is
       * small enough that it would be useless. Note that because
       * threadsize_base is bumped to 64, we don't have to worry about the
       * workgroup fitting, unlike the a5xx case.
       */
      if (!v->local_size_variable) {
         if (threads_per_wg <= compiler->threadsize_base)
            return false;
      }
   }
      FALLTHROUGH;
   case MESA_SHADER_FRAGMENT: {
      /* Check that doubling the threadsize wouldn't exceed the regfile size */
      return regs_count * 2 <= compiler->reg_size_vec4;
   }

   default:
      /* On a6xx+, it's impossible to use a doubled wavesize in the geometry
       * stages - the bit doesn't exist. The blob never used it for the VS
       * on earlier gens anyway.
       */
      return false;
   }
}
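
/* For example, on a6xx (where, as noted above, threadsize_base is 64): a
 * fixed 8x8x1 workgroup has 64 threads, which is <= threadsize_base, so the
 * single threadsize is kept; a 16x16x1 workgroup (256 threads) falls through
 * to the fragment-shader case and doubles the threadsize whenever
 * regs_count * 2 still fits in reg_size_vec4.
 */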

/* Get the maximum number of waves that could be used even if this shader
 * didn't use any registers.
 */
unsigned
ir3_get_reg_independent_max_waves(struct ir3_shader_variant *v,
                                  bool double_threadsize)
{
   const struct ir3_compiler *compiler = v->compiler;
   unsigned max_waves = compiler->max_waves;

   /* Compute the limit based on branchstack */
   if (v->branchstack > 0) {
      unsigned branchstack_max_waves = compiler->branchstack_size /
                                       v->branchstack *
                                       compiler->wave_granularity;
      max_waves = MIN2(max_waves, branchstack_max_waves);
   }

   /* If this is a compute shader, compute the limit based on shared size */
   if ((v->type == MESA_SHADER_COMPUTE) ||
       (v->type == MESA_SHADER_KERNEL)) {
      unsigned threads_per_wg =
         v->local_size[0] * v->local_size[1] * v->local_size[2];
      unsigned waves_per_wg =
         DIV_ROUND_UP(threads_per_wg, compiler->threadsize_base *
                                         (double_threadsize ? 2 : 1) *
                                         compiler->wave_granularity);

      /* Shared is allocated in chunks of 1k */
      unsigned shared_per_wg = ALIGN_POT(v->shared_size, 1024);
      if (shared_per_wg > 0 && !v->local_size_variable) {
         unsigned wgs_per_core = compiler->local_mem_size / shared_per_wg;

         max_waves = MIN2(max_waves, waves_per_wg * wgs_per_core *
                                        compiler->wave_granularity);
      }

      /* A compute shader with a big workgroup, a barrier, and a branchstack
       * that limits max_waves may end up unable to run all of the
       * workgroup's waves concurrently, which would lead to a hang.
       *
       * TODO: Could we spill branchstack, or is there another way around
       * this? The blob just explodes in such cases.
       */
      if (v->has_barrier && (max_waves < waves_per_wg)) {
         mesa_loge(
            "Compute shader (%s) which has workgroup barrier cannot be used "
            "because it's impossible to have enough concurrent waves.",
            v->name);
         exit(1);
      }
   }

   return max_waves;
}
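
/* A worked example of the branchstack limit above, with illustrative
 * hardware values: branchstack_size = 64 and wave_granularity = 2 cap a
 * shader with v->branchstack = 4 at
 *
 *    branchstack_max_waves = 64 / 4 * 2 = 32
 *
 * which only matters when it is smaller than compiler->max_waves.
 */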

/* Get the maximum number of waves that could be launched limited by reg size.
 */
unsigned
ir3_get_reg_dependent_max_waves(const struct ir3_compiler *compiler,
                                unsigned reg_count, bool double_threadsize)
{
   return reg_count ? (compiler->reg_size_vec4 /
                       (reg_count * (double_threadsize ? 2 : 1)) *
                       compiler->wave_granularity)
                    : compiler->max_waves;
}
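
/* E.g. with illustrative values reg_size_vec4 = 96 and wave_granularity = 2,
 * a shader using 24 vec4 registers at double threadsize is limited to
 *
 *    96 / (24 * 2) * 2 = 4 waves
 *
 * while a shader using no registers is capped only by compiler->max_waves.
 */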

void
ir3_collect_info(struct ir3_shader_variant *v)
{
   struct ir3_info *info = &v->info;
   struct ir3 *shader = v->ir;
   const struct ir3_compiler *compiler = v->compiler;

   memset(info, 0, sizeof(*info));
   info->data = v;
   info->max_reg = -1;
   info->max_half_reg = -1;
   info->max_const = -1;
   info->multi_dword_ldp_stp = false;

   uint32_t instr_count = 0;
   foreach_block (block, &shader->block_list) {
      foreach_instr (instr, &block->instr_list) {
         instr_count++;
      }
   }

   v->instrlen = DIV_ROUND_UP(instr_count, compiler->instr_align);

   /* Pad out with NOPs to instrlen, including at least 4 so that cffdump
    * doesn't try to decode the following data as instructions (such as the
    * next stage's shader in turnip)
    */
   info->size = MAX2(v->instrlen * compiler->instr_align, instr_count + 4) * 8;
   info->sizedwords = info->size / 4;

   bool in_preamble = false;

   foreach_block (block, &shader->block_list) {
      int sfu_delay = 0, mem_delay = 0;

      foreach_instr (instr, &block->instr_list) {

         foreach_src (reg, instr) {
            collect_reg_info(instr, reg, info);
         }

         foreach_dst (reg, instr) {
            if (is_dest_gpr(reg)) {
               collect_reg_info(instr, reg, info);
            }
         }

         if (instr->opc == OPC_STP || instr->opc == OPC_LDP) {
            unsigned components = instr->srcs[2]->uim_val;
            if (components * type_size(instr->cat6.type) > 32) {
               info->multi_dword_ldp_stp = true;
            }

            if (instr->opc == OPC_STP)
               info->stp_count += components;
            else
               info->ldp_count += components;
         }

         if ((instr->opc == OPC_BARY_F || instr->opc == OPC_FLAT_B) &&
             (instr->dsts[0]->flags & IR3_REG_EI))
            info->last_baryf = info->instrs_count;

         if (instr->opc == OPC_SHPS)
            in_preamble = true;

         /* Don't count instructions in the preamble for instruction-count type
          * stats, because their effect should be much smaller.
          * TODO: we should probably have separate stats for preamble
          * instructions, but that would blow up the amount of stats...
          */
         if (!in_preamble) {
            unsigned instrs_count = 1 + instr->repeat + instr->nop;
            unsigned nops_count = instr->nop;

            if (instr->opc == OPC_NOP) {
               nops_count = 1 + instr->repeat;
               info->instrs_per_cat[0] += nops_count;
            } else {
               info->instrs_per_cat[opc_cat(instr->opc)] += 1 + instr->repeat;
               info->instrs_per_cat[0] += nops_count;
            }

            if (instr->opc == OPC_MOV) {
               if (instr->cat1.src_type == instr->cat1.dst_type) {
                  info->mov_count += 1 + instr->repeat;
               } else {
                  info->cov_count += 1 + instr->repeat;
               }
            }

            info->instrs_count += instrs_count;
            info->nops_count += nops_count;

            if (instr->flags & IR3_INSTR_SS) {
               info->ss++;
               info->sstall += sfu_delay;
               sfu_delay = 0;
            }

            if (instr->flags & IR3_INSTR_SY) {
               info->sy++;
               info->systall += mem_delay;
               mem_delay = 0;
            }

            if (is_ss_producer(instr)) {
               sfu_delay = soft_ss_delay(instr);
            } else {
               int n = MIN2(sfu_delay, 1 + instr->repeat + instr->nop);
               sfu_delay -= n;
            }

            if (is_sy_producer(instr)) {
               mem_delay = soft_sy_delay(instr, shader);
            } else {
               int n = MIN2(mem_delay, 1 + instr->repeat + instr->nop);
               mem_delay -= n;
            }
         }

         if (instr->opc == OPC_SHPE)
            in_preamble = false;
      }
   }

   /* TODO: for a5xx and below, is there a separate regfile for
    * half-registers?
    */
   unsigned regs_count =
      info->max_reg + 1 +
      (compiler->gen >= 6 ? ((info->max_half_reg + 2) / 2) : 0);

   info->double_threadsize = ir3_should_double_threadsize(v, regs_count);
   unsigned reg_independent_max_waves =
      ir3_get_reg_independent_max_waves(v, info->double_threadsize);
   unsigned reg_dependent_max_waves = ir3_get_reg_dependent_max_waves(
      compiler, regs_count, info->double_threadsize);
   info->max_waves = MIN2(reg_independent_max_waves, reg_dependent_max_waves);
   assert(info->max_waves <= v->compiler->max_waves);
}
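
/* To make the merged-regfile arithmetic above concrete (hypothetical
 * values): on gen >= 6 with max_reg = 3 and max_half_reg = 5,
 *
 *    regs_count = (3 + 1) + (5 + 2) / 2 = 7
 *
 * vec4 registers, since two half registers share one full register slot.
 */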

static struct ir3_register *
reg_create(struct ir3 *shader, int num, int flags)
{
   struct ir3_register *reg = ir3_alloc(shader, sizeof(struct ir3_register));
   reg->wrmask = 1;
   reg->flags = flags;
   reg->num = num;
   return reg;
}

static void
insert_instr(struct ir3_block *block, struct ir3_instruction *instr)
{
   struct ir3 *shader = block->shader;

   instr->serialno = ++shader->instr_count;

   list_addtail(&instr->node, &block->instr_list);

   if (is_input(instr))
      array_insert(shader, shader->baryfs, instr);
}

struct ir3_block *
ir3_block_create(struct ir3 *shader)
{
   struct ir3_block *block = ir3_alloc(shader, sizeof(*block));
#ifdef DEBUG
   block->serialno = ++shader->block_count;
#endif
   block->shader = shader;
   list_inithead(&block->node);
   list_inithead(&block->instr_list);
   return block;
}

void
ir3_block_add_predecessor(struct ir3_block *block, struct ir3_block *pred)
{
   array_insert(block, block->predecessors, pred);
}

void
ir3_block_add_physical_predecessor(struct ir3_block *block,
                                   struct ir3_block *pred)
{
   array_insert(block, block->physical_predecessors, pred);
}

void
ir3_block_remove_predecessor(struct ir3_block *block, struct ir3_block *pred)
{
   for (unsigned i = 0; i < block->predecessors_count; i++) {
      if (block->predecessors[i] == pred) {
         if (i < block->predecessors_count - 1) {
            block->predecessors[i] =
               block->predecessors[block->predecessors_count - 1];
         }

         block->predecessors_count--;
         return;
      }
   }
}

void
ir3_block_remove_physical_predecessor(struct ir3_block *block, struct ir3_block *pred)
{
   for (unsigned i = 0; i < block->physical_predecessors_count; i++) {
      if (block->physical_predecessors[i] == pred) {
         if (i < block->physical_predecessors_count - 1) {
            block->physical_predecessors[i] =
               block->physical_predecessors[block->physical_predecessors_count - 1];
         }

         block->physical_predecessors_count--;
         return;
      }
   }
}

unsigned
ir3_block_get_pred_index(struct ir3_block *block, struct ir3_block *pred)
{
   for (unsigned i = 0; i < block->predecessors_count; i++) {
      if (block->predecessors[i] == pred) {
         return i;
      }
   }

   unreachable("ir3_block_get_pred_index() invalid predecessor");
}

static struct ir3_instruction *
instr_create(struct ir3_block *block, opc_t opc, int ndst, int nsrc)
{
   /* Add extra sources for array destinations and the address reg */
   if (opc_cat(opc) >= 1)
      nsrc += 2;
   struct ir3_instruction *instr;
   unsigned sz = sizeof(*instr) + (ndst * sizeof(instr->dsts[0])) +
                 (nsrc * sizeof(instr->srcs[0]));
   char *ptr = ir3_alloc(block->shader, sz);

   instr = (struct ir3_instruction *)ptr;
   ptr += sizeof(*instr);
   instr->dsts = (struct ir3_register **)ptr;
   instr->srcs = instr->dsts + ndst;

#ifdef DEBUG
   instr->dsts_max = ndst;
   instr->srcs_max = nsrc;
#endif

   return instr;
}

struct ir3_instruction *
ir3_instr_create(struct ir3_block *block, opc_t opc, int ndst, int nsrc)
{
   struct ir3_instruction *instr = instr_create(block, opc, ndst, nsrc);
   instr->block = block;
   instr->opc = opc;
   insert_instr(block, instr);
   return instr;
}
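
/* A minimal usage sketch (hypothetical opcode and register numbers):
 * destination and source registers are allocated up front by
 * ir3_instr_create() and then filled in one at a time:
 *
 *    struct ir3_instruction *add = ir3_instr_create(block, OPC_ADD_F, 1, 2);
 *    ir3_dst_create(add, regid(2, 0), 0);   // dst: r2.x
 *    ir3_src_create(add, regid(0, 0), 0);   // src0: r0.x
 *    ir3_src_create(add, regid(1, 0), 0);   // src1: r1.x
 */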

struct ir3_instruction *
ir3_instr_clone(struct ir3_instruction *instr)
{
   struct ir3_instruction *new_instr = instr_create(
      instr->block, instr->opc, instr->dsts_count, instr->srcs_count);
   struct ir3_register **dsts, **srcs;

   dsts = new_instr->dsts;
   srcs = new_instr->srcs;
   *new_instr = *instr;
   new_instr->dsts = dsts;
   new_instr->srcs = srcs;

   insert_instr(instr->block, new_instr);

   /* clone registers: */
   new_instr->dsts_count = 0;
   new_instr->srcs_count = 0;
   foreach_dst (reg, instr) {
      struct ir3_register *new_reg =
         ir3_dst_create(new_instr, reg->num, reg->flags);
      *new_reg = *reg;
      if (new_reg->instr)
         new_reg->instr = new_instr;
   }
   foreach_src (reg, instr) {
      struct ir3_register *new_reg =
         ir3_src_create(new_instr, reg->num, reg->flags);
      *new_reg = *reg;
   }

   if (instr->address) {
      assert(instr->srcs_count > 0);
      new_instr->address = new_instr->srcs[instr->srcs_count - 1];
   }

   return new_instr;
}

/* Add a false dependency to an instruction, to ensure the dependency is
 * scheduled before it:
 */
void
ir3_instr_add_dep(struct ir3_instruction *instr, struct ir3_instruction *dep)
{
   for (unsigned i = 0; i < instr->deps_count; i++) {
      if (instr->deps[i] == dep)
         return;
   }

   array_insert(instr, instr->deps, dep);
}

struct ir3_register *
ir3_src_create(struct ir3_instruction *instr, int num, int flags)
{
   struct ir3 *shader = instr->block->shader;
#ifdef DEBUG
   assert(instr->srcs_count < instr->srcs_max);
#endif
   struct ir3_register *reg = reg_create(shader, num, flags);
   instr->srcs[instr->srcs_count++] = reg;
   return reg;
}

struct ir3_register *
ir3_dst_create(struct ir3_instruction *instr, int num, int flags)
{
   struct ir3 *shader = instr->block->shader;
#ifdef DEBUG
   assert(instr->dsts_count < instr->dsts_max);
#endif
   struct ir3_register *reg = reg_create(shader, num, flags);
   instr->dsts[instr->dsts_count++] = reg;
   return reg;
}

struct ir3_register *
ir3_reg_clone(struct ir3 *shader, struct ir3_register *reg)
{
   struct ir3_register *new_reg = reg_create(shader, 0, 0);
   *new_reg = *reg;
   return new_reg;
}

void
ir3_reg_set_last_array(struct ir3_instruction *instr, struct ir3_register *reg,
                       struct ir3_register *last_write)
{
   assert(reg->flags & IR3_REG_ARRAY);
   struct ir3_register *new_reg = ir3_src_create(instr, 0, 0);
   *new_reg = *reg;
   new_reg->def = last_write;
   ir3_reg_tie(reg, new_reg);
}

void
ir3_instr_set_address(struct ir3_instruction *instr,
                      struct ir3_instruction *addr)
{
   if (!instr->address) {
      struct ir3 *ir = instr->block->shader;

      assert(instr->block == addr->block);

      instr->address =
         ir3_src_create(instr, addr->dsts[0]->num, addr->dsts[0]->flags);
      instr->address->def = addr->dsts[0];
      assert(reg_num(addr->dsts[0]) == REG_A0);
      unsigned comp = reg_comp(addr->dsts[0]);
      if (comp == 0) {
         array_insert(ir, ir->a0_users, instr);
      } else {
         assert(comp == 1);
         array_insert(ir, ir->a1_users, instr);
      }
   } else {
      assert(instr->address->def->instr == addr);
   }
}

void
ir3_block_clear_mark(struct ir3_block *block)
{
   foreach_instr (instr, &block->instr_list)
      instr->flags &= ~IR3_INSTR_MARK;
}

void
ir3_clear_mark(struct ir3 *ir)
{
   foreach_block (block, &ir->block_list) {
      ir3_block_clear_mark(block);
   }
}

unsigned
ir3_count_instructions(struct ir3 *ir)
{
   unsigned cnt = 1;
   foreach_block (block, &ir->block_list) {
      block->start_ip = cnt;
      foreach_instr (instr, &block->instr_list) {
         instr->ip = cnt++;
      }
      block->end_ip = cnt;
   }
   return cnt;
}

/* When counting instructions for RA, we insert extra fake instructions at the
 * beginning of each block, where values become live, and at the end where
 * values die. This prevents problems where values live-in at the beginning or
 * live-out at the end of a block from being treated as if they were
 * live-in/live-out at the first/last instruction, which would be incorrect.
 * In ir3_legalize these ips are assumed to be actual ips of the final
 * program, so it would be incorrect to use this everywhere.
 */

unsigned
ir3_count_instructions_ra(struct ir3 *ir)
{
   unsigned cnt = 1;
   foreach_block (block, &ir->block_list) {
      block->start_ip = cnt++;
      foreach_instr (instr, &block->instr_list) {
         instr->ip = cnt++;
      }
      block->end_ip = cnt++;
   }
   return cnt;
}
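
/* For example, a single block containing two instructions is numbered
 *
 *    block->start_ip = 1    (fake slot where live-in values appear)
 *    instruction ips = 2, 3
 *    block->end_ip   = 4    (fake slot where live-out values die)
 *
 * whereas ir3_count_instructions() would assign the instructions ips 1 and 2,
 * with start_ip = 1 and end_ip = 3.
 */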

struct ir3_array *
ir3_lookup_array(struct ir3 *ir, unsigned id)
{
   foreach_array (arr, &ir->array_list)
      if (arr->id == id)
         return arr;
   return NULL;
}

void
ir3_find_ssa_uses(struct ir3 *ir, void *mem_ctx, bool falsedeps)
{
   /* We could do this in a single pass if we could assume that instructions
    * are always sorted, which currently might not always be true.
    * (In particular after the ir3_group pass, but maybe other places.)
    */
   foreach_block (block, &ir->block_list)
      foreach_instr (instr, &block->instr_list)
         instr->uses = NULL;

   foreach_block (block, &ir->block_list) {
      foreach_instr (instr, &block->instr_list) {
         foreach_ssa_src_n (src, n, instr) {
            if (__is_false_dep(instr, n) && !falsedeps)
               continue;
            if (!src->uses)
               src->uses = _mesa_pointer_set_create(mem_ctx);
            _mesa_set_add(src->uses, instr);
         }
      }
   }
}

/**
 * Set the destination type of an instruction, for example if a
 * conversion is folded in, handling the special cases where the
 * instruction's dest type or opcode needs to be fixed up.
 */
void
ir3_set_dst_type(struct ir3_instruction *instr, bool half)
{
   if (half) {
      instr->dsts[0]->flags |= IR3_REG_HALF;
   } else {
      instr->dsts[0]->flags &= ~IR3_REG_HALF;
   }

   switch (opc_cat(instr->opc)) {
   case 1: /* move instructions */
      if (half) {
         instr->cat1.dst_type = half_type(instr->cat1.dst_type);
      } else {
         instr->cat1.dst_type = full_type(instr->cat1.dst_type);
      }
      break;
   case 4:
      if (half) {
         instr->opc = cat4_half_opc(instr->opc);
      } else {
         instr->opc = cat4_full_opc(instr->opc);
      }
      break;
   case 5:
      if (half) {
         instr->cat5.type = half_type(instr->cat5.type);
      } else {
         instr->cat5.type = full_type(instr->cat5.type);
      }
      break;
   }
}

/**
 * One-time fixup for instruction src-types.  Other than cov's that
 * are folded, an instruction's src type does not change.
 */
void
ir3_fixup_src_type(struct ir3_instruction *instr)
{
   if (instr->srcs_count == 0)
      return;

   switch (opc_cat(instr->opc)) {
   case 1: /* move instructions */
      if (instr->srcs[0]->flags & IR3_REG_HALF) {
         instr->cat1.src_type = half_type(instr->cat1.src_type);
      } else {
         instr->cat1.src_type = full_type(instr->cat1.src_type);
      }
      break;
   case 3:
      if (instr->srcs[0]->flags & IR3_REG_HALF) {
         instr->opc = cat3_half_opc(instr->opc);
      } else {
         instr->opc = cat3_full_opc(instr->opc);
      }
      break;
   }
}

/**
 * Map a floating point immed to FLUT (float lookup table) value;
 * returns negative for immediates that cannot be mapped.
 */
int
ir3_flut(struct ir3_register *src_reg)
{
   static const struct {
      uint32_t f32;
      uint16_t f16;
   } flut[] = {
         { .f32 = 0x00000000, .f16 = 0x0000 },    /* 0.0 */
         { .f32 = 0x3f000000, .f16 = 0x3800 },    /* 0.5 */
         { .f32 = 0x3f800000, .f16 = 0x3c00 },    /* 1.0 */
         { .f32 = 0x40000000, .f16 = 0x4000 },    /* 2.0 */
         { .f32 = 0x402df854, .f16 = 0x4170 },    /* e */
         { .f32 = 0x40490fdb, .f16 = 0x4248 },    /* pi */
         { .f32 = 0x3ea2f983, .f16 = 0x3518 },    /* 1/pi */
         { .f32 = 0x3f317218, .f16 = 0x398c },    /* 1/log2(e) */
         { .f32 = 0x3fb8aa3b, .f16 = 0x3dc5 },    /* log2(e) */
         { .f32 = 0x3e9a209b, .f16 = 0x34d1 },    /* 1/log2(10) */
         { .f32 = 0x40549a78, .f16 = 0x42a5 },    /* log2(10) */
         { .f32 = 0x40800000, .f16 = 0x4400 },    /* 4.0 */
   };

   if (src_reg->flags & IR3_REG_HALF) {
      /* Note that half-float immeds are already lowered to 16b in nir: */
      uint32_t imm = src_reg->uim_val;
      for (unsigned i = 0; i < ARRAY_SIZE(flut); i++) {
         if (flut[i].f16 == imm) {
            return i;
         }
      }
   } else {
      uint32_t imm = src_reg->uim_val;
      for (unsigned i = 0; i < ARRAY_SIZE(flut); i++) {
         if (flut[i].f32 == imm) {
            return i;
         }
      }
   }

   return -1;
}
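
/* For example, a full-precision immediate of 2.0 (0x40000000) maps to FLUT
 * index 3, so a lookup like this sketch (hypothetical register) succeeds:
 *
 *    struct ir3_register imm = { .flags = IR3_REG_IMMED,
 *                                .uim_val = 0x40000000 };
 *    assert(ir3_flut(&imm) == 3);
 *
 * while 3.0 (0x40400000) is not in the table, so ir3_flut() returns -1.
 */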

static unsigned
cp_flags(unsigned flags)
{
   /* only considering these flags (at least for now): */
   flags &= (IR3_REG_CONST | IR3_REG_IMMED | IR3_REG_FNEG | IR3_REG_FABS |
             IR3_REG_SNEG | IR3_REG_SABS | IR3_REG_BNOT | IR3_REG_RELATIV |
             IR3_REG_SHARED);
   return flags;
}

bool
ir3_valid_flags(struct ir3_instruction *instr, unsigned n, unsigned flags)
{
   struct ir3_compiler *compiler = instr->block->shader->compiler;
   unsigned valid_flags;

   if ((flags & IR3_REG_SHARED) && opc_cat(instr->opc) > 3)
      return false;

   flags = cp_flags(flags);

   /* If the destination is indirect, then the source cannot be... at least
    * I don't think so...
    */
   if (instr->dsts_count > 0 && (instr->dsts[0]->flags & IR3_REG_RELATIV) &&
       (flags & IR3_REG_RELATIV))
      return false;

   if (flags & IR3_REG_RELATIV) {
      /* TODO need to test on earlier gens.. pretty sure the earlier
       * problem was just that we didn't check that the src was from
       * the same block (since we can't propagate address register values
       * across blocks currently)
       */
      if (compiler->gen < 6)
         return false;

      /* NOTE in the special try_swap_mad_two_srcs() case we can be
       * called on a src that has already had an indirect load folded
       * in, in which case ssa() returns NULL
       */
      if (instr->srcs[n]->flags & IR3_REG_SSA) {
         struct ir3_instruction *src = ssa(instr->srcs[n]);
         if (src->address->def->instr->block != instr->block)
            return false;
      }
   }

   if (is_meta(instr)) {
      /* collect and phi nodes support const/immed sources, which will be
       * turned into move instructions, but not anything else.
       */
      if (flags & ~(IR3_REG_IMMED | IR3_REG_CONST | IR3_REG_SHARED))
         return false;

      if ((flags & IR3_REG_SHARED) && !(instr->dsts[0]->flags & IR3_REG_SHARED))
         return false;

      return true;
   }

   switch (opc_cat(instr->opc)) {
   case 0: /* end, chmask */
      return flags == 0;
   case 1:
      switch (instr->opc) {
      case OPC_MOVMSK:
      case OPC_SWZ:
      case OPC_SCT:
      case OPC_GAT:
         valid_flags = IR3_REG_SHARED;
         break;
      case OPC_SCAN_MACRO:
         return flags == 0;
      default:
         valid_flags =
            IR3_REG_IMMED | IR3_REG_CONST | IR3_REG_RELATIV | IR3_REG_SHARED;
      }
      if (flags & ~valid_flags)
         return false;
      break;
   case 2:
      valid_flags = ir3_cat2_absneg(instr->opc) | IR3_REG_CONST |
                    IR3_REG_RELATIV | IR3_REG_IMMED | IR3_REG_SHARED;

      if (flags & ~valid_flags)
         return false;

      /* Allow an immediate src1 for flat.b, since it's ignored */
      if (instr->opc == OPC_FLAT_B &&
          n == 1 && flags == IR3_REG_IMMED)
         return true;

      if (flags & (IR3_REG_CONST | IR3_REG_IMMED | IR3_REG_SHARED)) {
         unsigned m = n ^ 1;
         /* cannot deal w/ const or shared in both srcs:
          * (note that some cat2 actually only have a single src)
          */
         if (m < instr->srcs_count) {
            struct ir3_register *reg = instr->srcs[m];
            if ((flags & (IR3_REG_CONST | IR3_REG_SHARED)) &&
                (reg->flags & (IR3_REG_CONST | IR3_REG_SHARED)))
               return false;
            if ((flags & IR3_REG_IMMED) && (reg->flags & IR3_REG_IMMED))
               return false;
         }
      }
      break;
   case 3:
      valid_flags =
         ir3_cat3_absneg(instr->opc) | IR3_REG_RELATIV | IR3_REG_SHARED;

      switch (instr->opc) {
      case OPC_SHRM:
      case OPC_SHLM:
      case OPC_SHRG:
      case OPC_SHLG:
      case OPC_ANDG: {
         valid_flags |= IR3_REG_IMMED;
         /* Can be RELATIV+CONST but not CONST: */
         if (flags & IR3_REG_RELATIV)
            valid_flags |= IR3_REG_CONST;
         break;
      }
      case OPC_WMM:
      case OPC_WMM_ACCU: {
         valid_flags = IR3_REG_SHARED;
         if (n == 2)
            valid_flags = IR3_REG_CONST;
         break;
      }
      case OPC_DP2ACC:
      case OPC_DP4ACC:
         break;
      default:
         valid_flags |= IR3_REG_CONST;
      }

      if (flags & ~valid_flags)
         return false;

      if (flags & (IR3_REG_CONST | IR3_REG_SHARED | IR3_REG_RELATIV)) {
         /* cannot deal w/ const/shared/relativ in 2nd src: */
         if (n == 1)
            return false;
      }

      break;
   case 4:
      /* seems like blob compiler avoids const as src.. */
      /* TODO double check if this is still the case on a4xx */
      if (flags & (IR3_REG_CONST | IR3_REG_IMMED))
         return false;
      if (flags & (IR3_REG_SABS | IR3_REG_SNEG))
         return false;
      break;
   case 5:
      /* no flags allowed */
      if (flags)
         return false;
      break;
   case 6:
      valid_flags = IR3_REG_IMMED;
      if (flags & ~valid_flags)
         return false;

      if (flags & IR3_REG_IMMED) {
         /* doesn't seem like we can have immediate src for store
          * instructions:
          *
          * TODO this restriction could also apply to load instructions,
          * but for load instructions this arg is the address (and there's
          * not really any good way to test a hard-coded immed addr src)
          */
         if (is_store(instr) && (instr->opc != OPC_STG) && (n == 1))
            return false;

         if ((instr->opc == OPC_LDL) && (n == 0))
            return false;

         if ((instr->opc == OPC_STL) && (n != 2))
            return false;

         if ((instr->opc == OPC_LDP) && (n == 0))
            return false;

         if ((instr->opc == OPC_STP) && (n != 2))
            return false;

         if (instr->opc == OPC_STLW && n == 0)
            return false;

         if (instr->opc == OPC_LDLW && n == 0)
            return false;

         /* disallow immediates in anything but the SSBO slot argument for
          * cat6 instructions:
          */
         if (is_global_a3xx_atomic(instr->opc) && (n != 0))
            return false;

         if (is_local_atomic(instr->opc) || is_global_a6xx_atomic(instr->opc) ||
             is_bindless_atomic(instr->opc))
            return false;

         if (instr->opc == OPC_STG && (n == 2))
            return false;

         if (instr->opc == OPC_STG_A && (n == 4))
            return false;

         if (instr->opc == OPC_LDG && (n == 0))
            return false;

         if (instr->opc == OPC_LDG_A && (n < 2))
            return false;

         /* as with atomics, these cat6 instrs can only have an immediate
          * for SSBO/IBO slot argument
          */
         switch (instr->opc) {
         case OPC_LDIB:
         case OPC_STIB:
         case OPC_RESINFO:
            if (n != 0)
               return false;
            break;
         default:
            break;
         }
      }

      break;
   }

   return true;
}

bool
ir3_valid_immediate(struct ir3_instruction *instr, int32_t immed)
{
   if (instr->opc == OPC_MOV || is_meta(instr))
      return true;

   if (is_mem(instr)) {
      switch (instr->opc) {
      /* Some load/store instructions have a 13-bit offset and size which must
       * always be an immediate and the rest of the sources cannot be
       * immediates, so the frontend is responsible for checking the size:
       */
      case OPC_LDL:
      case OPC_STL:
      case OPC_LDP:
      case OPC_STP:
      case OPC_LDG:
      case OPC_STG:
      case OPC_SPILL_MACRO:
      case OPC_RELOAD_MACRO:
      case OPC_LDG_A:
      case OPC_STG_A:
      case OPC_LDLW:
      case OPC_STLW:
      case OPC_LDLV:
         return true;
      default:
         /* most cat6 src immediates can only encode 8 bits: */
         return !(immed & ~0xff);
      }
   }

   /* Other than cat1 (mov) we can only encode up to 10 bits, sign-extended: */
   return !(immed & ~0x1ff) || !(-immed & ~0x1ff);
}
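
/* So for a typical non-mov ALU instruction, immediates in [-511, 511] are
 * encodable (511 passes the check above, 512 does not), while mov and the
 * wide-offset load/store opcodes listed above accept larger values.
 */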