/*
 * Copyright (C) 2018 Jonathan Marek <jonathan@marek.ca>
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 *
 * Authors:
 *    Jonathan Marek <jonathan@marek.ca>
 */

#include "ir2_private.h"

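/* ir2 assembler: lowers the scheduled ir2 instructions into the final a2xx
 * shader binary: a block of control-flow (CF) instructions followed by the
 * ALU/FETCH instruction bytecode that the EXEC CFs point into.
 */
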
static unsigned
src_swizzle(struct ir2_context *ctx, struct ir2_src *src, unsigned ncomp)
{
   struct ir2_reg_component *comps;
   unsigned swiz = 0;

   switch (src->type) {
   case IR2_SRC_SSA:
   case IR2_SRC_REG:
      break;
   default:
      return src->swizzle;
   }
   /* we need to take into account where the components were allocated */
   comps = get_reg_src(ctx, src)->comp;
   for (int i = 0; i < ncomp; i++) {
      swiz |= swiz_set(comps[swiz_get(src->swizzle, i)].c, i);
   }
   return swiz;
}

/* ALU instructions need to take into account how the output components are
 * allocated */

/* scalar ops don't need to take the dest swizzle into account */

static unsigned
alu_swizzle_scalar(struct ir2_context *ctx, struct ir2_src *reg)
{
   /* hardware seems to take from W, but swizzle everywhere just in case */
   return swiz_merge(src_swizzle(ctx, reg, 1), IR2_SWIZZLE_XXXX);
}

static unsigned
alu_swizzle(struct ir2_context *ctx, struct ir2_instr *instr,
            struct ir2_src *src)
{
   struct ir2_reg_component *comp = get_reg(instr)->comp;
   unsigned swiz0 = src_swizzle(ctx, src, src_ncomp(instr));
   unsigned swiz = 0;

   /* non-per-component special cases */
   switch (instr->alu.vector_opc) {
   case PRED_SETE_PUSHv ... PRED_SETGTE_PUSHv:
      return alu_swizzle_scalar(ctx, src);
   case DOT2ADDv:
   case DOT3v:
   case DOT4v:
   case CUBEv:
      return swiz0;
   default:
      break;
   }

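   /* build a swizzle that routes logical component i to the physical dest
    * component it was allocated to (comp[j].c), then compose it with the
    * allocation-adjusted source swizzle
    */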
   for (int i = 0, j = 0; i < dst_ncomp(instr); j++) {
      if (instr->alu.write_mask & 1 << j) {
         if (comp[j].c != 7)
            swiz |= swiz_set(i, comp[j].c);
         i++;
      }
   }
   return swiz_merge(swiz0, swiz);
}

static unsigned
alu_swizzle_scalar2(struct ir2_context *ctx, struct ir2_src *src, unsigned s1)
{
   /* hardware seems to take from ZW, but swizzle everywhere (ABAB) */
   unsigned s0 = swiz_get(src_swizzle(ctx, src, 1), 0);
   return swiz_merge(swiz_set(s0, 0) | swiz_set(s1, 1), IR2_SWIZZLE_XYXY);
}

/* the write_mask needs to be remapped according to where the dest components
 * were allocated */

static unsigned
alu_write_mask(struct ir2_context *ctx, struct ir2_instr *instr)
{
   struct ir2_reg_component *comp = get_reg(instr)->comp;
   unsigned write_mask = 0;

   for (int i = 0; i < 4; i++) {
      if (instr->alu.write_mask & 1 << i)
         write_mask |= 1 << comp[i].c;
   }

   return write_mask;
}

/* fetch instructions can swizzle the dest directly, but the src swizzle needs
 * conversion */

static unsigned
fetch_swizzle(struct ir2_context *ctx, struct ir2_src *src, unsigned ncomp)
{
   unsigned alu_swiz = src_swizzle(ctx, src, ncomp);
   unsigned swiz = 0;
   for (int i = 0; i < ncomp; i++)
      swiz |= swiz_get(alu_swiz, i) << i * 2;
   return swiz;
}

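/* the fetch dest swizzle uses 3 bits per component; components that are not
 * written keep the initial value of 7
 */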
static unsigned
fetch_dst_swiz(struct ir2_context *ctx, struct ir2_instr *instr)
{
   struct ir2_reg_component *comp = get_reg(instr)->comp;
   unsigned dst_swiz = 0xfff;
   for (int i = 0; i < dst_ncomp(instr); i++) {
      dst_swiz &= ~(7 << comp[i].c * 3);
      dst_swiz |= i << comp[i].c * 3;
   }
   return dst_swiz;
}

/* register / export # for instr */
static unsigned
dst_to_reg(struct ir2_context *ctx, struct ir2_instr *instr)
{
   if (is_export(instr))
      return instr->alu.export;

   return get_reg(instr)->idx;
}

/* register # for src */
static unsigned
src_to_reg(struct ir2_context *ctx, struct ir2_src *src)
{
   return get_reg_src(ctx, src)->idx;
}

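/* src register byte as encoded in the ALU instruction: the constant index for
 * const srcs, otherwise the register index with the abs bit (0x80)
 */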
static unsigned
src_reg_byte(struct ir2_context *ctx, struct ir2_src *src)
{
   if (src->type == IR2_SRC_CONST) {
      assert(!src->abs); /* no abs bit for const */
      return src->num;
   }
   return src_to_reg(ctx, src) | (src->abs ? 0x80 : 0);
}

/* produce the 12-byte binary instruction for a given sched_instr */
static void
fill_instr(struct ir2_context *ctx, struct ir2_sched_instr *sched, instr_t *bc,
           bool *is_fetch)
{
   struct ir2_instr *instr = sched->instr, *instr_s, *instr_v;

   *bc = (instr_t){};

   if (instr && instr->type == IR2_FETCH) {
      *is_fetch = true;

      bc->fetch.opc = instr->fetch.opc;
      bc->fetch.pred_select = !!instr->pred;
      bc->fetch.pred_condition = instr->pred & 1;

      struct ir2_src *src = instr->src;

      if (instr->fetch.opc == VTX_FETCH) {
         instr_fetch_vtx_t *vtx = &bc->fetch.vtx;

         assert(instr->fetch.vtx.const_idx <= 0x1f);
         assert(instr->fetch.vtx.const_idx_sel <= 0x3);

         vtx->src_reg = src_to_reg(ctx, src);
         vtx->src_swiz = fetch_swizzle(ctx, src, 1);
         vtx->dst_reg = dst_to_reg(ctx, instr);
         vtx->dst_swiz = fetch_dst_swiz(ctx, instr);

         vtx->must_be_one = 1;
         vtx->const_index = instr->fetch.vtx.const_idx;
         vtx->const_index_sel = instr->fetch.vtx.const_idx_sel;

         /* other fields will be patched */

         /* XXX seems like every FETCH but the first has
          * this bit set:
          */
         vtx->reserved3 = instr->idx ? 0x1 : 0x0;
         vtx->reserved0 = instr->idx ? 0x2 : 0x3;
      } else if (instr->fetch.opc == TEX_FETCH) {
         instr_fetch_tex_t *tex = &bc->fetch.tex;

         tex->src_reg = src_to_reg(ctx, src);
         tex->src_swiz = fetch_swizzle(ctx, src, 3);
         tex->dst_reg = dst_to_reg(ctx, instr);
         tex->dst_swiz = fetch_dst_swiz(ctx, instr);
         /* tex->const_idx = patch_fetches */
         tex->mag_filter = TEX_FILTER_USE_FETCH_CONST;
         tex->min_filter = TEX_FILTER_USE_FETCH_CONST;
         tex->mip_filter = TEX_FILTER_USE_FETCH_CONST;
         tex->aniso_filter = ANISO_FILTER_USE_FETCH_CONST;
         tex->arbitrary_filter = ARBITRARY_FILTER_USE_FETCH_CONST;
         tex->vol_mag_filter = TEX_FILTER_USE_FETCH_CONST;
         tex->vol_min_filter = TEX_FILTER_USE_FETCH_CONST;
         tex->use_comp_lod = ctx->so->type == MESA_SHADER_FRAGMENT;
         tex->use_reg_lod = instr->src_count == 2;
         tex->sample_location = SAMPLE_CENTER;
         tex->tx_coord_denorm = instr->fetch.tex.is_rect;
      } else if (instr->fetch.opc == TEX_SET_TEX_LOD) {
         instr_fetch_tex_t *tex = &bc->fetch.tex;

         tex->src_reg = src_to_reg(ctx, src);
         tex->src_swiz = fetch_swizzle(ctx, src, 1);
         tex->dst_reg = 0;
         tex->dst_swiz = 0xfff;

         tex->mag_filter = TEX_FILTER_USE_FETCH_CONST;
         tex->min_filter = TEX_FILTER_USE_FETCH_CONST;
         tex->mip_filter = TEX_FILTER_USE_FETCH_CONST;
         tex->aniso_filter = ANISO_FILTER_USE_FETCH_CONST;
         tex->arbitrary_filter = ARBITRARY_FILTER_USE_FETCH_CONST;
         tex->vol_mag_filter = TEX_FILTER_USE_FETCH_CONST;
         tex->vol_min_filter = TEX_FILTER_USE_FETCH_CONST;
         tex->use_comp_lod = 1;
         tex->use_reg_lod = 0;
         tex->sample_location = SAMPLE_CENTER;
      } else {
         assert(0);
      }
      return;
   }

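   /* otherwise this is an ALU instruction: a single instruction can co-issue a
    * vector op and a scalar op, scheduled here as instr (vector slot) and
    * instr_s (scalar slot)
    */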
   instr_v = sched->instr;
   instr_s = sched->instr_s;

   if (instr_v) {
      struct ir2_src src1, src2, *src3;

      src1 = instr_v->src[0];
      src2 = instr_v->src[instr_v->src_count > 1];
      src3 = instr_v->src_count == 3 ? &instr_v->src[2] : NULL;

      bc->alu.vector_opc = instr_v->alu.vector_opc;
      bc->alu.vector_write_mask = alu_write_mask(ctx, instr_v);
      bc->alu.vector_dest = dst_to_reg(ctx, instr_v);
      bc->alu.vector_clamp = instr_v->alu.saturate;
      bc->alu.export_data = instr_v->alu.export >= 0;

      /* single operand SETEv, use 0.0f as src2 */
      if (instr_v->src_count == 1 &&
          (bc->alu.vector_opc == SETEv || bc->alu.vector_opc == SETNEv ||
           bc->alu.vector_opc == SETGTv || bc->alu.vector_opc == SETGTEv))
         src2 = ir2_zero(ctx);

      /* export32 instr for a20x hw binning has this bit set...
       * it seems to do more than change the base address of constants
       * XXX this is a hack
       */
      bc->alu.relative_addr =
         (bc->alu.export_data && bc->alu.vector_dest == 32);

      bc->alu.src1_reg_byte = src_reg_byte(ctx, &src1);
      bc->alu.src1_swiz = alu_swizzle(ctx, instr_v, &src1);
      bc->alu.src1_reg_negate = src1.negate;
      bc->alu.src1_sel = src1.type != IR2_SRC_CONST;

      bc->alu.src2_reg_byte = src_reg_byte(ctx, &src2);
      bc->alu.src2_swiz = alu_swizzle(ctx, instr_v, &src2);
      bc->alu.src2_reg_negate = src2.negate;
      bc->alu.src2_sel = src2.type != IR2_SRC_CONST;

      if (src3) {
         bc->alu.src3_reg_byte = src_reg_byte(ctx, src3);
         bc->alu.src3_swiz = alu_swizzle(ctx, instr_v, src3);
         bc->alu.src3_reg_negate = src3->negate;
         bc->alu.src3_sel = src3->type != IR2_SRC_CONST;
      }

      bc->alu.pred_select = instr_v->pred;
   }

   if (instr_s) {
      struct ir2_src *src = instr_s->src;

      bc->alu.scalar_opc = instr_s->alu.scalar_opc;
      bc->alu.scalar_write_mask = alu_write_mask(ctx, instr_s);
      bc->alu.scalar_dest = dst_to_reg(ctx, instr_s);
      bc->alu.scalar_clamp = instr_s->alu.saturate;
      bc->alu.export_data = instr_s->alu.export >= 0;

      if (instr_s->src_count == 1) {
         bc->alu.src3_reg_byte = src_reg_byte(ctx, src);
         bc->alu.src3_swiz = alu_swizzle_scalar(ctx, src);
         bc->alu.src3_reg_negate = src->negate;
         bc->alu.src3_sel = src->type != IR2_SRC_CONST;
      } else {
         assert(instr_s->src_count == 2);

         bc->alu.src3_reg_byte = src_reg_byte(ctx, src);
         bc->alu.src3_swiz =
            alu_swizzle_scalar2(ctx, src, instr_s->alu.src1_swizzle);
         bc->alu.src3_reg_negate = src->negate;
         bc->alu.src3_sel = src->type != IR2_SRC_CONST;
      }

      if (instr_v)
         assert(instr_s->pred == instr_v->pred);
      bc->alu.pred_select = instr_s->pred;
   }

   *is_fetch = false;
   return;
}

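/* emit the pending EXEC CF, optionally preceded by an ALLOC CF, then reset the
 * exec state for the next group of instructions
 */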
static unsigned
write_cfs(struct ir2_context *ctx, instr_cf_t *cfs, unsigned cf_idx,
          instr_cf_alloc_t *alloc, instr_cf_exec_t *exec)
{
   assert(exec->count);

   if (alloc)
      cfs[cf_idx++].alloc = *alloc;

   /* record the offset of the memory alloc CF so it can be patched later */
   if (alloc && alloc->buffer_select == SQ_MEMORY &&
       ctx->info->mem_export_ptr == -1)
      ctx->info->mem_export_ptr = cf_idx / 2 * 3;

   cfs[cf_idx++].exec = *exec;
   exec->address += exec->count;
   exec->serialize = 0;
   exec->count = 0;

   return cf_idx;
}

/* assemble the final shader */
void
assemble(struct ir2_context *ctx, bool binning)
{
   /* hw seems to have a limit of 384 (num_cf/2+num_instr <= 384)
    * address is 9 bits, so could it be 512?
    */
   instr_cf_t cfs[384];
   instr_t bytecode[384], bc;
   unsigned block_addr[128];
   unsigned num_cf = 0;

   /* CF instr state */
   instr_cf_exec_t exec = {.opc = EXEC};
   instr_cf_alloc_t alloc = {.opc = ALLOC};

   int sync_id, sync_id_prev = -1;
   bool is_fetch = false;
   bool need_sync = true;
   bool need_alloc = false;
   unsigned block_idx = 0;

   ctx->info->mem_export_ptr = -1;
   ctx->info->num_fetch_instrs = 0;

   /* the vertex shader always needs to allocate at least one parameter;
    * emit the alloc here if no parameter export would otherwise do it
    */
   if (ctx->so->type == MESA_SHADER_VERTEX && ctx->f->inputs_count == 0) {
      alloc.buffer_select = SQ_PARAMETER_PIXEL;
      cfs[num_cf++].alloc = alloc;
   }

   block_addr[0] = 0;

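   /* walk the scheduled instructions: i counts emitted ALU/FETCH instruction
    * slots, j walks the schedule; EXEC CFs are flushed every 6 instructions or
    * when the sync type, basic block, or export allocation changes
    */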
   for (int i = 0, j = 0; j < ctx->instr_sched_count; j++) {
      struct ir2_instr *instr = ctx->instr_sched[j].instr;

      /* catch IR2_CF since it isn't a regular instruction */
      if (instr && instr->type == IR2_CF) {
         assert(!need_alloc); /* XXX */

         /* flush any exec cf before inserting jmp */
         if (exec.count)
            num_cf = write_cfs(ctx, cfs, num_cf, NULL, &exec);

         cfs[num_cf++].jmp_call = (instr_cf_jmp_call_t){
            .opc = COND_JMP,
            .address = instr->cf.block_idx, /* will be fixed later */
            .force_call = !instr->pred,
            .predicated_jmp = 1,
            .direction = instr->cf.block_idx > instr->block_idx,
            .condition = instr->pred & 1,
         };
         continue;
      }

      /* fill the 3 dwords for the instruction */
      fill_instr(ctx, &ctx->instr_sched[j], &bc, &is_fetch);

      /* we need to sync between ALU/VTX_FETCH/TEX_FETCH types */
      sync_id = 0;
      if (is_fetch)
         sync_id = bc.fetch.opc == VTX_FETCH ? 1 : 2;

      need_sync = sync_id != sync_id_prev;
      sync_id_prev = sync_id;

      unsigned block;
      {
         if (ctx->instr_sched[j].instr)
            block = ctx->instr_sched[j].instr->block_idx;
         else
            block = ctx->instr_sched[j].instr_s->block_idx;

         assert(block_idx <= block);
      }

      /* info for patching */
      if (is_fetch) {
         struct ir2_fetch_info *info =
            &ctx->info->fetch_info[ctx->info->num_fetch_instrs++];
         info->offset = i * 3; /* add cf offset later */

         if (bc.fetch.opc == VTX_FETCH) {
            info->vtx.dst_swiz = bc.fetch.vtx.dst_swiz;
         } else if (bc.fetch.opc == TEX_FETCH) {
            info->tex.samp_id = instr->fetch.tex.samp_id;
            info->tex.src_swiz = bc.fetch.tex.src_swiz;
         } else {
            ctx->info->num_fetch_instrs--;
         }
      }

      /* emit an exec cf after 6 instructions, when a sync is needed, or at a
       * basic block boundary */
      if (exec.count == 6 ||
          (exec.count && (need_sync || block != block_idx))) {
         num_cf =
            write_cfs(ctx, cfs, num_cf, need_alloc ? &alloc : NULL, &exec);
         need_alloc = false;
      }

      /* update block_addrs for jmp patching */
      while (block_idx < block)
         block_addr[++block_idx] = num_cf;

      /* export - fill alloc cf */
      if (!is_fetch && bc.alu.export_data) {
         /* get the export buffer from either vector/scalar dest */
         instr_alloc_type_t buffer = export_buf(bc.alu.vector_dest);
         if (bc.alu.scalar_write_mask) {
            if (bc.alu.vector_write_mask)
               assert(buffer == export_buf(bc.alu.scalar_dest));
            buffer = export_buf(bc.alu.scalar_dest);
         }

         /* flush previous alloc if the buffer changes */
         bool need_new_alloc = buffer != alloc.buffer_select;

         /* memory export always in 32/33 pair, new alloc on 32 */
         if (bc.alu.vector_dest == 32)
            need_new_alloc = true;

         if (need_new_alloc && exec.count) {
            num_cf =
               write_cfs(ctx, cfs, num_cf, need_alloc ? &alloc : NULL, &exec);
            need_alloc = false;
         }

         need_alloc |= need_new_alloc;

         alloc.size = 0;
         alloc.buffer_select = buffer;

         if (buffer == SQ_PARAMETER_PIXEL &&
             ctx->so->type == MESA_SHADER_VERTEX)
            alloc.size = ctx->f->inputs_count - 1;

         if (buffer == SQ_POSITION)
            alloc.size = ctx->so->writes_psize;
      }

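      /* each instruction gets two bits in the exec cf serialize field: bit 0
       * is set for fetch instructions, bit 1 when a sync is needed before it
       */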
      if (is_fetch)
         exec.serialize |= 0x1 << exec.count * 2;
      if (need_sync)
         exec.serialize |= 0x2 << exec.count * 2;

      need_sync = false;
      exec.count += 1;
      bytecode[i++] = bc;
   }

   /* final exec cf */
   exec.opc = EXEC_END;
   num_cf = write_cfs(ctx, cfs, num_cf, need_alloc ? &alloc : NULL, &exec);

   /* insert nop to get an even # of CFs */
   if (num_cf % 2)
      cfs[num_cf++] = (instr_cf_t){.opc = NOP};

   /* patch cf addrs */
   for (int idx = 0; idx < num_cf; idx++) {
      switch (cfs[idx].opc) {
      case NOP:
      case ALLOC:
         break;
      case EXEC:
      case EXEC_END:
         cfs[idx].exec.address += num_cf / 2;
         break;
      case COND_JMP:
         cfs[idx].jmp_call.address = block_addr[cfs[idx].jmp_call.address];
         break;
      default:
         assert(0);
      }
   }

   /* concatenate cfs and alu/fetch */
   uint32_t cfdwords = num_cf / 2 * 3;
   uint32_t alufetchdwords = exec.address * 3;
   uint32_t sizedwords = cfdwords + alufetchdwords;
   uint32_t *dwords = malloc(sizedwords * 4);
   assert(dwords);
   memcpy(dwords, cfs, cfdwords * 4);
   memcpy(&dwords[cfdwords], bytecode, alufetchdwords * 4);

   /* finalize ir2_shader_info */
   ctx->info->dwords = dwords;
   ctx->info->sizedwords = sizedwords;
   for (int i = 0; i < ctx->info->num_fetch_instrs; i++)
      ctx->info->fetch_info[i].offset += cfdwords;

   if (FD_DBG(DISASM)) {
      DBG("disassemble: type=%d", ctx->so->type);
      disasm_a2xx(dwords, sizedwords, 0, ctx->so->type);
   }
}