1/*
2 * Copyright © 2014 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 *
23 * This code is based on original work by Ilia Mirkin.
24 */
25
26/**
27 * \file gfx6_gs_visitor.cpp
28 *
29 * Gfx6 geometry shader implementation
30 */
31
32#include "gfx6_gs_visitor.h"
33#include "brw_eu.h"
34#include "brw_prim.h"
35
36namespace brw {
37
38void
39gfx6_gs_visitor::emit_prolog()
40{
41   vec4_gs_visitor::emit_prolog();
42
43   /* Gfx6 geometry shaders require to allocate an initial VUE handle via
44    * FF_SYNC message, however the documentation remarks that only one thread
45    * can write to the URB simultaneously and the FF_SYNC message provides the
46    * synchronization mechanism for this, so using this message effectively
47    * stalls the thread until it is its turn to write to the URB. Because of
48    * this, the best way to implement geometry shader algorithms in gfx6 is to
49    * execute the algorithm before the FF_SYNC message to maximize parallelism.
50    *
51    * To achieve this we buffer the geometry shader outputs for each emitted
52    * vertex in vertex_output during operation. Then, when we have processed
53    * the last vertex (that is, at thread end time), we send the FF_SYNC
54    * message to allocate the initial VUE handle and write all buffered vertex
55    * data to the URB in one go.
56    *
57    * For each emitted vertex, vertex_output will hold vue_map.num_slots
58    * data items plus one additional item to hold required flags
59    * (PrimType, PrimStart, PrimEnd, as expected by the URB_WRITE message)
60    * which come right after the data items for that vertex. Vertex data and
61    * flags for the next vertex come right after the data items and flags for
62    * the previous vertex.
63    */
64   this->current_annotation = "gfx6 prolog";
65   this->vertex_output = src_reg(this,
66                                 glsl_type::uint_type,
67                                 (prog_data->vue_map.num_slots + 1) *
68                                 nir->info.gs.vertices_out);
69   this->vertex_output_offset = src_reg(this, glsl_type::uint_type);
70   emit(MOV(dst_reg(this->vertex_output_offset), brw_imm_ud(0u)));
71
72   /* MRF 1 will be the header for all messages (FF_SYNC and URB_WRITES),
73    * so initialize it once to R0.
74    */
75   vec4_instruction *inst = emit(MOV(dst_reg(MRF, 1),
76                                     retype(brw_vec8_grf(0, 0),
77                                            BRW_REGISTER_TYPE_UD)));
78   inst->force_writemask_all = true;
79
80   /* This will be used as a temporary to store writeback data of FF_SYNC
81    * and URB_WRITE messages.
82    */
83   this->temp = src_reg(this, glsl_type::uint_type);
84
85   /* This will be used to know when we are processing the first vertex of
86    * a primitive. We will set this to URB_WRITE_PRIM_START only when we know
87    * that we are processing the first vertex in the primitive and to zero
88    * otherwise. This way we can use its value directly in the URB write
89    * headers.
90    */
91   this->first_vertex = src_reg(this, glsl_type::uint_type);
92   emit(MOV(dst_reg(this->first_vertex), brw_imm_ud(URB_WRITE_PRIM_START)));
93
94   /* The FF_SYNC message requires to know the number of primitives generated,
95    * so keep a counter for this.
96    */
97   this->prim_count = src_reg(this, glsl_type::uint_type);
98   emit(MOV(dst_reg(this->prim_count), brw_imm_ud(0u)));
99
100   if (gs_prog_data->num_transform_feedback_bindings) {
101      /* Create a virtual register to hold destination indices in SOL */
102      this->destination_indices = src_reg(this, glsl_type::uvec4_type);
103      /* Create a virtual register to hold number of written primitives */
104      this->sol_prim_written = src_reg(this, glsl_type::uint_type);
105      /* Create a virtual register to hold Streamed Vertex Buffer Indices */
106      this->svbi = src_reg(this, glsl_type::uvec4_type);
107      /* Create a virtual register to hold max values of SVBI */
108      this->max_svbi = src_reg(this, glsl_type::uvec4_type);
109      emit(MOV(dst_reg(this->max_svbi),
110               src_reg(retype(brw_vec1_grf(1, 4), BRW_REGISTER_TYPE_UD))));
111   }
112
113   /* PrimitveID is delivered in r0.1 of the thread payload. If the program
114    * needs it we have to move it to a separate register where we can map
115    * the attribute.
116    *
117    * Notice that we cannot use a virtual register for this, because we need to
118    * map all input attributes to hardware registers in setup_payload(),
119    * which happens before virtual registers are mapped to hardware registers.
120    * We could work around that issue if we were able to compute the first
121    * non-payload register here and move the PrimitiveID information to that
122    * register, but we can't because at this point we don't know the final
123    * number uniforms that will be included in the payload.
124    *
125    * So, what we do is to place PrimitiveID information in r1, which is always
126    * delivered as part of the payload, but its only populated with data
127    * relevant for transform feedback when we set GFX6_GS_SVBI_PAYLOAD_ENABLE
128    * in the 3DSTATE_GS state packet. That information can be obtained by other
129    * means though, so we can safely use r1 for this purpose.
130    */
131   if (gs_prog_data->include_primitive_id) {
132      this->primitive_id =
133         src_reg(retype(brw_vec8_grf(1, 0), BRW_REGISTER_TYPE_UD));
134      emit(GS_OPCODE_SET_PRIMITIVE_ID, dst_reg(this->primitive_id));
135   }
136}
137
138void
139gfx6_gs_visitor::gs_emit_vertex(int stream_id)
140{
141   this->current_annotation = "gfx6 emit vertex";
142
143   /* Buffer all output slots for this vertex in vertex_output */
144   for (int slot = 0; slot < prog_data->vue_map.num_slots; ++slot) {
145      int varying = prog_data->vue_map.slot_to_varying[slot];
146      if (varying != VARYING_SLOT_PSIZ) {
147         dst_reg dst(this->vertex_output);
148         dst.reladdr = ralloc(mem_ctx, src_reg);
149         memcpy(dst.reladdr, &this->vertex_output_offset, sizeof(src_reg));
150         emit_urb_slot(dst, varying);
151      } else {
152         /* The PSIZ slot can pack multiple varyings in different channels
153          * and emit_urb_slot() will produce a MOV instruction for each of
154          * them. Since we are writing to an array, that will translate to
155          * possibly multiple MOV instructions with an array destination and
156          * each will generate a scratch write with the same offset into
157          * scratch space (thus, each one overwriting the previous). This is
158          * not what we want. What we will do instead is emit PSIZ to a
159          * a regular temporary register, then move that register into the
160          * array. This way we only have one instruction with an array
161          * destination and we only produce a single scratch write.
162          */
163         dst_reg tmp = dst_reg(src_reg(this, glsl_type::uvec4_type));
164         emit_urb_slot(tmp, varying);
165         dst_reg dst(this->vertex_output);
166         dst.reladdr = ralloc(mem_ctx, src_reg);
167         memcpy(dst.reladdr, &this->vertex_output_offset, sizeof(src_reg));
168         vec4_instruction *inst = emit(MOV(dst, src_reg(tmp)));
169         inst->force_writemask_all = true;
170      }
171
172      emit(ADD(dst_reg(this->vertex_output_offset),
173               this->vertex_output_offset, brw_imm_ud(1u)));
174   }
175
176   /* Now buffer flags for this vertex */
177   dst_reg dst(this->vertex_output);
178   dst.reladdr = ralloc(mem_ctx, src_reg);
179   memcpy(dst.reladdr, &this->vertex_output_offset, sizeof(src_reg));
180   if (nir->info.gs.output_primitive == GL_POINTS) {
181      /* If we are outputting points, then every vertex has PrimStart and
182       * PrimEnd set.
183       */
184      emit(MOV(dst, brw_imm_d((_3DPRIM_POINTLIST << URB_WRITE_PRIM_TYPE_SHIFT) |
185                              URB_WRITE_PRIM_START | URB_WRITE_PRIM_END)));
186      emit(ADD(dst_reg(this->prim_count), this->prim_count, brw_imm_ud(1u)));
187   } else {
188      /* Otherwise, we can only set the PrimStart flag, which we have stored
189       * in the first_vertex register. We will have to wait until we execute
190       * EndPrimitive() or we end the thread to set the PrimEnd flag on a
191       * vertex.
192       */
193      emit(OR(dst, this->first_vertex,
194              brw_imm_ud(gs_prog_data->output_topology <<
195                         URB_WRITE_PRIM_TYPE_SHIFT)));
196      emit(MOV(dst_reg(this->first_vertex), brw_imm_ud(0u)));
197   }
198   emit(ADD(dst_reg(this->vertex_output_offset),
199            this->vertex_output_offset, brw_imm_ud(1u)));
200}
201
202void
203gfx6_gs_visitor::gs_end_primitive()
204{
205   this->current_annotation = "gfx6 end primitive";
206   /* Calling EndPrimitive() is optional for point output. In this case we set
207    * the PrimEnd flag when we process EmitVertex().
208    */
209   if (nir->info.gs.output_primitive == GL_POINTS)
210      return;
211
212   /* Otherwise we know that the last vertex we have processed was the last
213    * vertex in the primitive and we need to set its PrimEnd flag, so do this
214    * unless we haven't emitted that vertex at all (vertex_count != 0).
215    *
216    * Notice that we have already incremented vertex_count when we processed
217    * the last emit_vertex, so we need to take that into account in the
218    * comparison below (hence the num_output_vertices + 1 in the comparison
219    * below).
220    */
221   unsigned num_output_vertices = nir->info.gs.vertices_out;
222   emit(CMP(dst_null_ud(), this->vertex_count,
223            brw_imm_ud(num_output_vertices + 1), BRW_CONDITIONAL_L));
224   vec4_instruction *inst = emit(CMP(dst_null_ud(),
225                                     this->vertex_count, brw_imm_ud(0u),
226                                     BRW_CONDITIONAL_NEQ));
227   inst->predicate = BRW_PREDICATE_NORMAL;
228   emit(IF(BRW_PREDICATE_NORMAL));
229   {
230      /* vertex_output_offset is already pointing at the first entry of the
231       * next vertex. So subtract 1 to modify the flags for the previous
232       * vertex.
233       */
234      src_reg offset(this, glsl_type::uint_type);
235      emit(ADD(dst_reg(offset), this->vertex_output_offset, brw_imm_d(-1)));
236
237      src_reg dst(this->vertex_output);
238      dst.reladdr = ralloc(mem_ctx, src_reg);
239      memcpy(dst.reladdr, &offset, sizeof(src_reg));
240
241      emit(OR(dst_reg(dst), dst, brw_imm_d(URB_WRITE_PRIM_END)));
242      emit(ADD(dst_reg(this->prim_count), this->prim_count, brw_imm_ud(1u)));
243
244      /* Set the first vertex flag to indicate that the next vertex will start
245       * a primitive.
246       */
247      emit(MOV(dst_reg(this->first_vertex), brw_imm_d(URB_WRITE_PRIM_START)));
248   }
249   emit(BRW_OPCODE_ENDIF);
250}
251
252void
253gfx6_gs_visitor::emit_urb_write_header(int mrf)
254{
255   this->current_annotation = "gfx6 urb header";
256   /* Compute offset of the flags for the current vertex in vertex_output and
257    * write them in dw2 of the message header.
258    *
259    * Notice that by the time that emit_thread_end() calls here
260    * vertex_output_offset should point to the first data item of the current
261    * vertex in vertex_output, thus we only need to add the number of output
262    * slots per vertex to that offset to obtain the flags data offset.
263    */
264   src_reg flags_offset(this, glsl_type::uint_type);
265   emit(ADD(dst_reg(flags_offset),
266            this->vertex_output_offset,
267            brw_imm_d(prog_data->vue_map.num_slots)));
268
269   src_reg flags_data(this->vertex_output);
270   flags_data.reladdr = ralloc(mem_ctx, src_reg);
271   memcpy(flags_data.reladdr, &flags_offset, sizeof(src_reg));
272
273   emit(GS_OPCODE_SET_DWORD_2, dst_reg(MRF, mrf), flags_data);
274}
275
/**
 * Round a URB write message length up to the next odd value.
 *
 * The URB data written (which does not include the message header register)
 * must be a multiple of 256 bits, or 2 VS registers; see vol5c.5, section
 * 5.4.3.2.2: URB_INTERLEAVED. An even amount of data plus the header
 * register gives an odd total length.
 */
static unsigned
align_interleaved_urb_mlen(unsigned mlen)
{
   /* Setting bit 0 leaves odd lengths untouched and bumps even ones by 1. */
   return mlen | 1u;
}
287
288void
289gfx6_gs_visitor::emit_snb_gs_urb_write_opcode(bool complete, int base_mrf,
290                                              int last_mrf, int urb_offset)
291{
292   vec4_instruction *inst = NULL;
293
294   if (!complete) {
295      /* If the vertex is not complete we don't have to do anything special */
296      inst = emit(VEC4_GS_OPCODE_URB_WRITE);
297      inst->urb_write_flags = BRW_URB_WRITE_NO_FLAGS;
298   } else {
299      /* Otherwise we always request to allocate a new VUE handle. If this is
300       * the last write before the EOT message and the new handle never gets
301       * used it will be dereferenced when we send the EOT message. This is
302       * necessary to avoid different setups for the EOT message (one for the
303       * case when there is no output and another for the case when there is)
304       * which would require to end the program with an IF/ELSE/ENDIF block,
305       * something we do not want.
306       */
307      inst = emit(VEC4_GS_OPCODE_URB_WRITE_ALLOCATE);
308      inst->urb_write_flags = BRW_URB_WRITE_COMPLETE;
309      inst->dst = dst_reg(MRF, base_mrf);
310      inst->src[0] = this->temp;
311   }
312
313   inst->base_mrf = base_mrf;
314   inst->mlen = align_interleaved_urb_mlen(last_mrf - base_mrf);
315   inst->offset = urb_offset;
316}
317
318void
319gfx6_gs_visitor::emit_thread_end()
320{
321   /* Make sure the current primitive is ended: we know it is not ended when
322    * first_vertex is not zero. This is only relevant for outputs other than
323    * points because in the point case we set PrimEnd on all vertices.
324    */
325   if (nir->info.gs.output_primitive != GL_POINTS) {
326      emit(CMP(dst_null_ud(), this->first_vertex, brw_imm_ud(0u), BRW_CONDITIONAL_Z));
327      emit(IF(BRW_PREDICATE_NORMAL));
328      gs_end_primitive();
329      emit(BRW_OPCODE_ENDIF);
330   }
331
332   /* Here we have to:
333    * 1) Emit an FF_SYNC message to obtain an initial VUE handle.
334    * 2) Loop over all buffered vertex data and write it to corresponding
335    *    URB entries.
336    * 3) Allocate new VUE handles for all vertices other than the first.
337    * 4) Send a final EOT message.
338    */
339
340   /* MRF 0 is reserved for the debugger, so start with message header
341    * in MRF 1.
342    */
343   int base_mrf = 1;
344
345   /* In the process of generating our URB write message contents, we
346    * may need to unspill a register or load from an array.  Those
347    * reads would use MRFs 21..23
348    */
349   int max_usable_mrf = FIRST_SPILL_MRF(devinfo->ver);
350
351   /* Issue the FF_SYNC message and obtain the initial VUE handle. */
352   this->current_annotation = "gfx6 thread end: ff_sync";
353
354   vec4_instruction *inst = NULL;
355   if (gs_prog_data->num_transform_feedback_bindings) {
356      src_reg sol_temp(this, glsl_type::uvec4_type);
357      emit(GS_OPCODE_FF_SYNC_SET_PRIMITIVES,
358           dst_reg(this->svbi),
359           this->vertex_count,
360           this->prim_count,
361           sol_temp);
362      inst = emit(GS_OPCODE_FF_SYNC,
363                  dst_reg(this->temp), this->prim_count, this->svbi);
364   } else {
365      inst = emit(GS_OPCODE_FF_SYNC,
366                  dst_reg(this->temp), this->prim_count, brw_imm_ud(0u));
367   }
368   inst->base_mrf = base_mrf;
369
370   emit(CMP(dst_null_ud(), this->vertex_count, brw_imm_ud(0u), BRW_CONDITIONAL_G));
371   emit(IF(BRW_PREDICATE_NORMAL));
372   {
373      /* Loop over all buffered vertices and emit URB write messages */
374      this->current_annotation = "gfx6 thread end: urb writes init";
375      src_reg vertex(this, glsl_type::uint_type);
376      emit(MOV(dst_reg(vertex), brw_imm_ud(0u)));
377      emit(MOV(dst_reg(this->vertex_output_offset), brw_imm_ud(0u)));
378
379      this->current_annotation = "gfx6 thread end: urb writes";
380      emit(BRW_OPCODE_DO);
381      {
382         emit(CMP(dst_null_d(), vertex, this->vertex_count, BRW_CONDITIONAL_GE));
383         inst = emit(BRW_OPCODE_BREAK);
384         inst->predicate = BRW_PREDICATE_NORMAL;
385
386         /* First we prepare the message header */
387         emit_urb_write_header(base_mrf);
388
389         /* Then add vertex data to the message in interleaved fashion */
390         int slot = 0;
391         bool complete = false;
392         do {
393            int mrf = base_mrf + 1;
394
395            /* URB offset is in URB row increments, and each of our MRFs is half
396             * of one of those, since we're doing interleaved writes.
397             */
398            int urb_offset = slot / 2;
399
400            for (; slot < prog_data->vue_map.num_slots; ++slot) {
401               int varying = prog_data->vue_map.slot_to_varying[slot];
402               current_annotation = output_reg_annotation[varying];
403
404               /* Compute offset of this slot for the current vertex
405                * in vertex_output
406                */
407               src_reg data(this->vertex_output);
408               data.reladdr = ralloc(mem_ctx, src_reg);
409               memcpy(data.reladdr, &this->vertex_output_offset,
410                      sizeof(src_reg));
411
412               /* Copy this slot to the appropriate message register */
413               dst_reg reg = dst_reg(MRF, mrf);
414               reg.type = output_reg[varying][0].type;
415               data.type = reg.type;
416               inst = emit(MOV(reg, data));
417               inst->force_writemask_all = true;
418
419               mrf++;
420               emit(ADD(dst_reg(this->vertex_output_offset),
421                        this->vertex_output_offset, brw_imm_ud(1u)));
422
423               /* If this was max_usable_mrf, we can't fit anything more into
424                * this URB WRITE. Same if we reached the max. message length.
425                */
426               if (mrf > max_usable_mrf ||
427                   align_interleaved_urb_mlen(mrf - base_mrf + 1) > BRW_MAX_MSG_LENGTH) {
428                  slot++;
429                  break;
430               }
431            }
432
433            complete = slot >= prog_data->vue_map.num_slots;
434            emit_snb_gs_urb_write_opcode(complete, base_mrf, mrf, urb_offset);
435         } while (!complete);
436
437         /* Skip over the flags data item so that vertex_output_offset points
438          * to the first data item of the next vertex, so that we can start
439          * writing the next vertex.
440          */
441         emit(ADD(dst_reg(this->vertex_output_offset),
442                  this->vertex_output_offset, brw_imm_ud(1u)));
443
444         emit(ADD(dst_reg(vertex), vertex, brw_imm_ud(1u)));
445      }
446      emit(BRW_OPCODE_WHILE);
447
448      if (gs_prog_data->num_transform_feedback_bindings)
449         xfb_write();
450   }
451   emit(BRW_OPCODE_ENDIF);
452
453   /* Finally, emit EOT message.
454    *
455    * In gfx6 we need to end the thread differently depending on whether we have
456    * emitted at least one vertex or not. In case we did, the EOT message must
457    * always include the COMPLETE flag or else the GPU hangs. If we have not
458    * produced any output we can't use the COMPLETE flag.
459    *
460    * However, this would lead us to end the program with an ENDIF opcode,
461    * which we want to avoid, so what we do is that we always request a new
462    * VUE handle every time, even if GS produces no output.
463    * With this we make sure that whether we have emitted at least one vertex
464    * or none at all, we have to finish the thread without writing to the URB,
465    * which works for both cases by setting the COMPLETE and UNUSED flags in
466    * the EOT message.
467    */
468   this->current_annotation = "gfx6 thread end: EOT";
469
470   if (gs_prog_data->num_transform_feedback_bindings) {
471      /* When emitting EOT, set SONumPrimsWritten Increment Value. */
472      src_reg data(this, glsl_type::uint_type);
473      emit(AND(dst_reg(data), this->sol_prim_written, brw_imm_ud(0xffffu)));
474      emit(SHL(dst_reg(data), data, brw_imm_ud(16u)));
475      emit(GS_OPCODE_SET_DWORD_2, dst_reg(MRF, base_mrf), data);
476   }
477
478   inst = emit(GS_OPCODE_THREAD_END);
479   inst->urb_write_flags = BRW_URB_WRITE_COMPLETE | BRW_URB_WRITE_UNUSED;
480   inst->base_mrf = base_mrf;
481   inst->mlen = 1;
482}
483
484void
485gfx6_gs_visitor::setup_payload()
486{
487   int attribute_map[BRW_VARYING_SLOT_COUNT * MAX_GS_INPUT_VERTICES];
488
489   /* Attributes are going to be interleaved, so one register contains two
490    * attribute slots.
491    */
492   int attributes_per_reg = 2;
493
494   /* If a geometry shader tries to read from an input that wasn't written by
495    * the vertex shader, that produces undefined results, but it shouldn't
496    * crash anything.  So initialize attribute_map to zeros--that ensures that
497    * these undefined results are read from r0.
498    */
499   memset(attribute_map, 0, sizeof(attribute_map));
500
501   int reg = 0;
502
503   /* The payload always contains important data in r0. */
504   reg++;
505
506   /* r1 is always part of the payload and it holds information relevant
507    * for transform feedback when we set the GFX6_GS_SVBI_PAYLOAD_ENABLE bit in
508    * the 3DSTATE_GS packet. We will overwrite it with the PrimitiveID
509    * information (and move the original value to a virtual register if
510    * necessary).
511    */
512   if (gs_prog_data->include_primitive_id)
513      attribute_map[VARYING_SLOT_PRIMITIVE_ID] = attributes_per_reg * reg;
514   reg++;
515
516   reg = setup_uniforms(reg);
517
518   reg = setup_varying_inputs(reg, attributes_per_reg);
519
520   this->first_non_payload_grf = reg;
521}
522
523void
524gfx6_gs_visitor::xfb_write()
525{
526   unsigned num_verts;
527
528   switch (gs_prog_data->output_topology) {
529   case _3DPRIM_POINTLIST:
530      num_verts = 1;
531      break;
532   case _3DPRIM_LINELIST:
533   case _3DPRIM_LINESTRIP:
534   case _3DPRIM_LINELOOP:
535      num_verts = 2;
536      break;
537   case _3DPRIM_TRILIST:
538   case _3DPRIM_TRIFAN:
539   case _3DPRIM_TRISTRIP:
540   case _3DPRIM_RECTLIST:
541      num_verts = 3;
542      break;
543   case _3DPRIM_QUADLIST:
544   case _3DPRIM_QUADSTRIP:
545   case _3DPRIM_POLYGON:
546      num_verts = 3;
547      break;
548   default:
549      unreachable("Unexpected primitive type in Gfx6 SOL program.");
550   }
551
552   this->current_annotation = "gfx6 thread end: svb writes init";
553
554   emit(MOV(dst_reg(this->vertex_output_offset), brw_imm_ud(0u)));
555   emit(MOV(dst_reg(this->sol_prim_written), brw_imm_ud(0u)));
556
557   /* Check that at least one primitive can be written
558    *
559    * Note: since we use the binding table to keep track of buffer offsets
560    * and stride, the GS doesn't need to keep track of a separate pointer
561    * into each buffer; it uses a single pointer which increments by 1 for
562    * each vertex.  So we use SVBI0 for this pointer, regardless of whether
563    * transform feedback is in interleaved or separate attribs mode.
564    */
565   src_reg sol_temp(this, glsl_type::uvec4_type);
566   emit(ADD(dst_reg(sol_temp), this->svbi, brw_imm_ud(num_verts)));
567
568   /* Compare SVBI calculated number with the maximum value, which is
569    * in R1.4 (previously saved in this->max_svbi) for gfx6.
570    */
571   emit(CMP(dst_null_d(), sol_temp, this->max_svbi, BRW_CONDITIONAL_LE));
572   emit(IF(BRW_PREDICATE_NORMAL));
573   {
574      vec4_instruction *inst = emit(MOV(dst_reg(destination_indices),
575                                        brw_imm_vf4(brw_float_to_vf(0.0),
576                                                    brw_float_to_vf(1.0),
577                                                    brw_float_to_vf(2.0),
578                                                    brw_float_to_vf(0.0))));
579      inst->force_writemask_all = true;
580
581      emit(ADD(dst_reg(this->destination_indices),
582               this->destination_indices,
583               this->svbi));
584   }
585   emit(BRW_OPCODE_ENDIF);
586
587   /* Write transform feedback data for all processed vertices. */
588   for (int i = 0; i < (int)nir->info.gs.vertices_out; i++) {
589      emit(MOV(dst_reg(sol_temp), brw_imm_d(i)));
590      emit(CMP(dst_null_d(), sol_temp, this->vertex_count,
591               BRW_CONDITIONAL_L));
592      emit(IF(BRW_PREDICATE_NORMAL));
593      {
594         xfb_program(i, num_verts);
595      }
596      emit(BRW_OPCODE_ENDIF);
597   }
598}
599
600void
601gfx6_gs_visitor::xfb_program(unsigned vertex, unsigned num_verts)
602{
603   unsigned binding;
604   unsigned num_bindings = gs_prog_data->num_transform_feedback_bindings;
605   src_reg sol_temp(this, glsl_type::uvec4_type);
606
607   /* Check for buffer overflow: we need room to write the complete primitive
608    * (all vertices). Otherwise, avoid writing any vertices for it
609    */
610   emit(ADD(dst_reg(sol_temp), this->sol_prim_written, brw_imm_ud(1u)));
611   emit(MUL(dst_reg(sol_temp), sol_temp, brw_imm_ud(num_verts)));
612   emit(ADD(dst_reg(sol_temp), sol_temp, this->svbi));
613   emit(CMP(dst_null_d(), sol_temp, this->max_svbi, BRW_CONDITIONAL_LE));
614   emit(IF(BRW_PREDICATE_NORMAL));
615   {
616      /* Avoid overwriting MRF 1 as it is used as URB write message header */
617      dst_reg mrf_reg(MRF, 2);
618
619      this->current_annotation = "gfx6: emit SOL vertex data";
620      /* For each vertex, generate code to output each varying using the
621       * appropriate binding table entry.
622       */
623      for (binding = 0; binding < num_bindings; ++binding) {
624         unsigned char varying =
625            gs_prog_data->transform_feedback_bindings[binding];
626
627         /* Set up the correct destination index for this vertex */
628         vec4_instruction *inst = emit(GS_OPCODE_SVB_SET_DST_INDEX,
629                                       mrf_reg,
630                                       this->destination_indices);
631         inst->sol_vertex = vertex % num_verts;
632
633         /* From the Sandybridge PRM, Volume 2, Part 1, Section 4.5.1:
634          *
635          *   "Prior to End of Thread with a URB_WRITE, the kernel must
636          *   ensure that all writes are complete by sending the final
637          *   write as a committed write."
638          */
639         bool final_write = binding == (unsigned) num_bindings - 1 &&
640                            inst->sol_vertex == num_verts - 1;
641
642         /* Compute offset of this varying for the current vertex
643          * in vertex_output
644          */
645         this->current_annotation = output_reg_annotation[varying];
646         src_reg data(this->vertex_output);
647         data.reladdr = ralloc(mem_ctx, src_reg);
648         int offset = get_vertex_output_offset_for_varying(vertex, varying);
649         emit(MOV(dst_reg(this->vertex_output_offset), brw_imm_d(offset)));
650         memcpy(data.reladdr, &this->vertex_output_offset, sizeof(src_reg));
651         data.type = output_reg[varying][0].type;
652         data.swizzle = gs_prog_data->transform_feedback_swizzles[binding];
653
654         /* Write data */
655         inst = emit(GS_OPCODE_SVB_WRITE, mrf_reg, data, sol_temp);
656         inst->sol_binding = binding;
657         inst->sol_final_write = final_write;
658
659         if (final_write) {
660            /* This is the last vertex of the primitive, then increment
661             * SO num primitive counter and destination indices.
662             */
663            emit(ADD(dst_reg(this->destination_indices),
664                     this->destination_indices,
665                     brw_imm_ud(num_verts)));
666            emit(ADD(dst_reg(this->sol_prim_written),
667                     this->sol_prim_written, brw_imm_ud(1u)));
668         }
669
670      }
671      this->current_annotation = NULL;
672   }
673   emit(BRW_OPCODE_ENDIF);
674}
675
676int
677gfx6_gs_visitor::get_vertex_output_offset_for_varying(int vertex, int varying)
678{
679   /* Find the output slot assigned to this varying.
680    *
681    * VARYING_SLOT_LAYER and VARYING_SLOT_VIEWPORT are packed in the same slot
682    * as VARYING_SLOT_PSIZ.
683    */
684   if (varying == VARYING_SLOT_LAYER || varying == VARYING_SLOT_VIEWPORT)
685      varying = VARYING_SLOT_PSIZ;
686   int slot = prog_data->vue_map.varying_to_slot[varying];
687
688   if (slot < 0) {
689      /* This varying does not exist in the VUE so we are not writing to it
690       * and its value is undefined. We still want to return a valid offset
691       * into vertex_output though, to prevent any out-of-bound accesses into
692       * the vertex_output array. Since the value for this varying is undefined
693       * we don't really care for the value we assign to it, so any offset
694       * within the limits of vertex_output will do.
695       */
696      slot = 0;
697   }
698
699   return vertex * (prog_data->vue_map.num_slots + 1) + slot;
700}
701
702} /* namespace brw */
703