/*
 * Copyright © 2011 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#include "brw_vec4.h"
#include "brw_cfg.h"
#include "brw_eu.h"
#include "util/u_math.h"

namespace brw {

vec4_instruction::vec4_instruction(enum opcode opcode, const dst_reg &dst,
                                   const src_reg &src0, const src_reg &src1,
                                   const src_reg &src2)
{
   this->opcode = opcode;
   this->dst = dst;
   this->src[0] = src0;
   this->src[1] = src1;
   this->src[2] = src2;
   this->saturate = false;
   this->force_writemask_all = false;
   this->no_dd_clear = false;
   this->no_dd_check = false;
   this->writes_accumulator = false;
   this->conditional_mod = BRW_CONDITIONAL_NONE;
   this->predicate = BRW_PREDICATE_NONE;
   this->predicate_inverse = false;
   this->target = 0;
   this->shadow_compare = false;
   this->eot = false;
   this->ir = NULL;
   this->urb_write_flags = BRW_URB_WRITE_NO_FLAGS;
   this->header_size = 0;
   this->flag_subreg = 0;
   this->mlen = 0;
   this->base_mrf = 0;
   this->offset = 0;
   this->exec_size = 8;
   this->group = 0;
   this->size_written = (dst.file == BAD_FILE ?
                         0 : this->exec_size * type_sz(dst.type));
   this->annotation = NULL;
}

vec4_instruction *
vec4_visitor::emit(vec4_instruction *inst)
{
   inst->ir = this->base_ir;
   inst->annotation = this->current_annotation;

   this->instructions.push_tail(inst);

   return inst;
}

vec4_instruction *
vec4_visitor::emit_before(bblock_t *block, vec4_instruction *inst,
                          vec4_instruction *new_inst)
{
   new_inst->ir = inst->ir;
   new_inst->annotation = inst->annotation;

   inst->insert_before(block, new_inst);

   return inst;
}

vec4_instruction *
vec4_visitor::emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0,
                   const src_reg &src1, const src_reg &src2)
{
   return emit(new(mem_ctx) vec4_instruction(opcode, dst, src0, src1, src2));
}


vec4_instruction *
vec4_visitor::emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0,
                   const src_reg &src1)
{
   return emit(new(mem_ctx) vec4_instruction(opcode, dst, src0, src1));
}

vec4_instruction *
vec4_visitor::emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0)
{
   return emit(new(mem_ctx) vec4_instruction(opcode, dst, src0));
}

vec4_instruction *
vec4_visitor::emit(enum opcode opcode, const dst_reg &dst)
{
   return emit(new(mem_ctx) vec4_instruction(opcode, dst));
}

vec4_instruction *
vec4_visitor::emit(enum opcode opcode)
{
   return emit(new(mem_ctx) vec4_instruction(opcode, dst_reg()));
}

#define ALU1(op)                                                        \
   vec4_instruction *                                                   \
   vec4_visitor::op(const dst_reg &dst, const src_reg &src0)            \
   {                                                                    \
      return new(mem_ctx) vec4_instruction(BRW_OPCODE_##op, dst, src0); \
   }

#define ALU2(op)                                                        \
   vec4_instruction *                                                   \
   vec4_visitor::op(const dst_reg &dst, const src_reg &src0,            \
                    const src_reg &src1)                                \
   {                                                                    \
      return new(mem_ctx) vec4_instruction(BRW_OPCODE_##op, dst,        \
                                           src0, src1);                 \
   }

#define ALU2_ACC(op)                                                    \
   vec4_instruction *                                                   \
   vec4_visitor::op(const dst_reg &dst, const src_reg &src0,            \
                    const src_reg &src1)                                \
   {                                                                    \
      vec4_instruction *inst = new(mem_ctx) vec4_instruction(           \
                       BRW_OPCODE_##op, dst, src0, src1);               \
      inst->writes_accumulator = true;                                  \
      return inst;                                                      \
   }

#define ALU3(op)                                                        \
   vec4_instruction *                                                   \
   vec4_visitor::op(const dst_reg &dst, const src_reg &src0,            \
                    const src_reg &src1, const src_reg &src2)           \
   {                                                                    \
      assert(devinfo->ver >= 6);                                        \
      return new(mem_ctx) vec4_instruction(BRW_OPCODE_##op, dst,        \
                                           src0, src1, src2);           \
   }

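/* The ALUn macros above generate thin builder helpers on vec4_visitor.  As an
 * illustration (this is the expansion one would expect, not code that exists
 * elsewhere in the file), ALU2(ADD) defines:
 *
 *    vec4_instruction *
 *    vec4_visitor::ADD(const dst_reg &dst, const src_reg &src0,
 *                      const src_reg &src1)
 *    {
 *       return new(mem_ctx) vec4_instruction(BRW_OPCODE_ADD, dst, src0, src1);
 *    }
 *
 * Note that these helpers only construct the instruction; the caller still
 * has to pass the result to emit() to append it to the instruction list.
 */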
ALU1(NOT)
ALU1(MOV)
ALU1(FRC)
ALU1(RNDD)
ALU1(RNDE)
ALU1(RNDZ)
ALU1(F32TO16)
ALU1(F16TO32)
ALU2(ADD)
ALU2(MUL)
ALU2_ACC(MACH)
ALU2(AND)
ALU2(OR)
ALU2(XOR)
ALU2(DP3)
ALU2(DP4)
ALU2(DPH)
ALU2(SHL)
ALU2(SHR)
ALU2(ASR)
ALU3(LRP)
ALU1(BFREV)
ALU3(BFE)
ALU2(BFI1)
ALU3(BFI2)
ALU1(FBH)
ALU1(FBL)
ALU1(CBIT)
ALU3(MAD)
ALU2_ACC(ADDC)
ALU2_ACC(SUBB)
ALU2(MAC)
ALU1(DIM)

/** Gfx4 predicated IF. */
vec4_instruction *
vec4_visitor::IF(enum brw_predicate predicate)
{
   vec4_instruction *inst;

   inst = new(mem_ctx) vec4_instruction(BRW_OPCODE_IF);
   inst->predicate = predicate;

   return inst;
}

/** Gfx6 IF with embedded comparison. */
vec4_instruction *
vec4_visitor::IF(src_reg src0, src_reg src1,
                 enum brw_conditional_mod condition)
{
   assert(devinfo->ver == 6);

   vec4_instruction *inst;

   resolve_ud_negate(&src0);
   resolve_ud_negate(&src1);

   inst = new(mem_ctx) vec4_instruction(BRW_OPCODE_IF, dst_null_d(),
                                        src0, src1);
   inst->conditional_mod = condition;

   return inst;
}

/**
 * CMP: Sets the low bit of the destination channels with the result
 * of the comparison, while the upper bits are undefined, and updates
 * the flag register with the packed 16 bits of the result.
 */
vec4_instruction *
vec4_visitor::CMP(dst_reg dst, src_reg src0, src_reg src1,
                  enum brw_conditional_mod condition)
{
   vec4_instruction *inst;

   /* Take the instruction:
    *
    * CMP null<d> src0<f> src1<f>
    *
    * Original gfx4 does type conversion to the destination type before
    * comparison, producing garbage results for floating point comparisons.
    *
    * The destination type doesn't matter on newer generations, so we set the
    * type to match src0 so we can compact the instruction.
    */
   dst.type = src0.type;

   resolve_ud_negate(&src0);
   resolve_ud_negate(&src1);

   inst = new(mem_ctx) vec4_instruction(BRW_OPCODE_CMP, dst, src0, src1);
   inst->conditional_mod = condition;

   return inst;
}

vec4_instruction *
vec4_visitor::SCRATCH_READ(const dst_reg &dst, const src_reg &index)
{
   vec4_instruction *inst;

   inst = new(mem_ctx) vec4_instruction(SHADER_OPCODE_GFX4_SCRATCH_READ,
                                        dst, index);
   inst->base_mrf = FIRST_SPILL_MRF(devinfo->ver) + 1;
   inst->mlen = 2;

   return inst;
}

vec4_instruction *
vec4_visitor::SCRATCH_WRITE(const dst_reg &dst, const src_reg &src,
                            const src_reg &index)
{
   vec4_instruction *inst;

   inst = new(mem_ctx) vec4_instruction(SHADER_OPCODE_GFX4_SCRATCH_WRITE,
                                        dst, src, index);
   inst->base_mrf = FIRST_SPILL_MRF(devinfo->ver);
   inst->mlen = 3;

   return inst;
}

src_reg
vec4_visitor::fix_3src_operand(const src_reg &src)
{
   /* Using vec4 uniforms in SIMD4x2 programs is difficult. You'd like to be
    * able to use vertical stride of zero to replicate the vec4 uniform, like
    *
    *    g3<0;4,1>:f - [0, 4][1, 5][2, 6][3, 7]
    *
    * But you can't, since vertical stride is always four in three-source
    * instructions. Instead, insert a MOV instruction to do the replication so
    * that the three-source instruction can consume it.
    */

   /* The MOV is only needed if the source is a uniform or immediate. */
   if (src.file != UNIFORM && src.file != IMM)
      return src;

   if (src.file == UNIFORM && brw_is_single_value_swizzle(src.swizzle))
      return src;

   dst_reg expanded = dst_reg(this, glsl_type::vec4_type);
   expanded.type = src.type;
   emit(VEC4_OPCODE_UNPACK_UNIFORM, expanded, src);
   return src_reg(expanded);
}

src_reg
vec4_visitor::fix_math_operand(const src_reg &src)
{
   if (devinfo->ver < 6 || src.file == BAD_FILE)
      return src;

   /* The gfx6 math instruction ignores the source modifiers --
    * swizzle, abs, negate, and at least some parts of the register
    * region description.
    *
    * Rather than trying to enumerate all these cases, *always* expand the
    * operand to a temp GRF for gfx6.
    *
    * For gfx7, keep the operand as-is, except if immediate, which gfx7 still
    * can't use.
    */

   if (devinfo->ver == 7 && src.file != IMM)
      return src;

   dst_reg expanded = dst_reg(this, glsl_type::vec4_type);
   expanded.type = src.type;
   emit(MOV(expanded, src));
   return src_reg(expanded);
}

vec4_instruction *
vec4_visitor::emit_math(enum opcode opcode,
                        const dst_reg &dst,
                        const src_reg &src0, const src_reg &src1)
{
   vec4_instruction *math =
      emit(opcode, dst, fix_math_operand(src0), fix_math_operand(src1));

   if (devinfo->ver == 6 && dst.writemask != WRITEMASK_XYZW) {
      /* MATH on Gfx6 must be align1, so we can't do writemasks. */
      math->dst = dst_reg(this, glsl_type::vec4_type);
      math->dst.type = dst.type;
      math = emit(MOV(dst, src_reg(math->dst)));
   } else if (devinfo->ver < 6) {
      math->base_mrf = 1;
      math->mlen = src1.file == BAD_FILE ? 1 : 2;
   }

   return math;
}

void
vec4_visitor::emit_pack_half_2x16(dst_reg dst, src_reg src0)
{
   if (devinfo->ver < 7) {
      unreachable("ir_unop_pack_half_2x16 should be lowered");
   }

   assert(dst.type == BRW_REGISTER_TYPE_UD);
   assert(src0.type == BRW_REGISTER_TYPE_F);

   /* From the Ivybridge PRM, Vol4, Part3, Section 6.27 f32to16:
    *
    *   Because this instruction does not have a 16-bit floating-point type,
    *   the destination data type must be Word (W).
    *
    *   The destination must be DWord-aligned and specify a horizontal stride
    *   (HorzStride) of 2. The 16-bit result is stored in the lower word of
    *   each destination channel and the upper word is not modified.
    *
    * The above restriction implies that the f32to16 instruction must use
    * align1 mode, because only in align1 mode is it possible to specify
    * horizontal stride.  We choose here to defy the hardware docs and emit
    * align16 instructions.
    *
    * (I [chadv] did attempt to emit align1 instructions for VS f32to16
    * instructions. I was partially successful in that the code passed all
    * tests.  However, the code was dubiously correct and fragile, and the
    * tests were not harsh enough to probe that frailty. Not trusting the
    * code, I chose instead to remain in align16 mode in defiance of the hw
    * docs).
    *
    * I've [chadv] experimentally confirmed that, on gfx7 hardware and the
    * simulator, emitting a f32to16 in align16 mode with UD as destination
    * data type is safe. The behavior differs from that specified in the PRM
    * in that the upper word of each destination channel is cleared to 0.
    */

   dst_reg tmp_dst(this, glsl_type::uvec2_type);
   src_reg tmp_src(tmp_dst);

#if 0
   /* Verify the undocumented behavior on which the following instructions
    * rely.  If f32to16 fails to clear the upper word of the X and Y channels,
    * then the result of the bit-or instruction below will be incorrect.
    *
    * You should inspect the disasm output in order to verify that the MOV is
    * not optimized away.
    */
   emit(MOV(tmp_dst, brw_imm_ud(0x12345678u)));
#endif

   /* Give tmp the form below, where "." means untouched.
    *
    *     w z          y          x w z          y          x
    *   |.|.|0x0000hhhh|0x0000llll|.|.|0x0000hhhh|0x0000llll|
    *
    * The upper word of each write-channel must be 0 for the following
    * bit-shift and bit-or instructions to work correctly. Note that this
    * relies on the undocumented hardware behavior mentioned above.
    */
   tmp_dst.writemask = WRITEMASK_XY;
   emit(F32TO16(tmp_dst, src0));

   /* Give the write-channels of dst the form:
    *   0xhhhh0000
    */
   tmp_src.swizzle = BRW_SWIZZLE_YYYY;
   emit(SHL(dst, tmp_src, brw_imm_ud(16u)));

   /* Finally, give the write-channels of dst the form of packHalf2x16's
    * output:
    *   0xhhhhllll
    */
   tmp_src.swizzle = BRW_SWIZZLE_XXXX;
   emit(OR(dst, src_reg(dst), tmp_src));
}

void
vec4_visitor::emit_unpack_half_2x16(dst_reg dst, src_reg src0)
{
   if (devinfo->ver < 7) {
      unreachable("ir_unop_unpack_half_2x16 should be lowered");
   }

   assert(dst.type == BRW_REGISTER_TYPE_F);
   assert(src0.type == BRW_REGISTER_TYPE_UD);

   /* From the Ivybridge PRM, Vol4, Part3, Section 6.26 f16to32:
    *
    *   Because this instruction does not have a 16-bit floating-point type,
    *   the source data type must be Word (W). The destination type must be
    *   F (Float).
    *
    * To use W as the source data type, we must adjust horizontal strides,
    * which is only possible in align1 mode. All my [chadv] attempts at
    * emitting align1 instructions for unpackHalf2x16 failed to pass the
    * Piglit tests, so I gave up.
    *
    * I've verified that, on gfx7 hardware and the simulator, it is safe to
    * emit f16to32 in align16 mode with UD as source data type.
    */

   dst_reg tmp_dst(this, glsl_type::uvec2_type);
   src_reg tmp_src(tmp_dst);

   tmp_dst.writemask = WRITEMASK_X;
   emit(AND(tmp_dst, src0, brw_imm_ud(0xffffu)));

   tmp_dst.writemask = WRITEMASK_Y;
   emit(SHR(tmp_dst, src0, brw_imm_ud(16u)));

   dst.writemask = WRITEMASK_XY;
   emit(F16TO32(dst, tmp_src));
}

void
vec4_visitor::emit_unpack_unorm_4x8(const dst_reg &dst, src_reg src0)
{
   /* Instead of splitting the 32-bit integer, shifting, and ORing it back
    * together, we can shift it by <0, 8, 16, 24>. The packed integer immediate
    * is not suitable to generate the shift values, but we can use the packed
    * vector float and a type-converting MOV.
    */
   dst_reg shift(this, glsl_type::uvec4_type);
   emit(MOV(shift, brw_imm_vf4(0x00, 0x60, 0x70, 0x78)));

   dst_reg shifted(this, glsl_type::uvec4_type);
   src0.swizzle = BRW_SWIZZLE_XXXX;
   emit(SHR(shifted, src0, src_reg(shift)));

   shifted.type = BRW_REGISTER_TYPE_UB;
   dst_reg f(this, glsl_type::vec4_type);
   emit(VEC4_OPCODE_MOV_BYTES, f, src_reg(shifted));

   emit(MUL(dst, src_reg(f), brw_imm_f(1.0f / 255.0f)));
}

void
vec4_visitor::emit_unpack_snorm_4x8(const dst_reg &dst, src_reg src0)
{
   /* Instead of splitting the 32-bit integer, shifting, and ORing it back
    * together, we can shift it by <0, 8, 16, 24>. The packed integer immediate
    * is not suitable to generate the shift values, but we can use the packed
    * vector float and a type-converting MOV.
    */
   dst_reg shift(this, glsl_type::uvec4_type);
   emit(MOV(shift, brw_imm_vf4(0x00, 0x60, 0x70, 0x78)));

   dst_reg shifted(this, glsl_type::uvec4_type);
   src0.swizzle = BRW_SWIZZLE_XXXX;
   emit(SHR(shifted, src0, src_reg(shift)));

   shifted.type = BRW_REGISTER_TYPE_B;
   dst_reg f(this, glsl_type::vec4_type);
   emit(VEC4_OPCODE_MOV_BYTES, f, src_reg(shifted));

   dst_reg scaled(this, glsl_type::vec4_type);
   emit(MUL(scaled, src_reg(f), brw_imm_f(1.0f / 127.0f)));

   dst_reg max(this, glsl_type::vec4_type);
   emit_minmax(BRW_CONDITIONAL_GE, max, src_reg(scaled), brw_imm_f(-1.0f));
   emit_minmax(BRW_CONDITIONAL_L, dst, src_reg(max), brw_imm_f(1.0f));
}

void
vec4_visitor::emit_pack_unorm_4x8(const dst_reg &dst, const src_reg &src0)
{
   dst_reg saturated(this, glsl_type::vec4_type);
   vec4_instruction *inst = emit(MOV(saturated, src0));
   inst->saturate = true;

   dst_reg scaled(this, glsl_type::vec4_type);
   emit(MUL(scaled, src_reg(saturated), brw_imm_f(255.0f)));

   dst_reg rounded(this, glsl_type::vec4_type);
   emit(RNDE(rounded, src_reg(scaled)));

   dst_reg u(this, glsl_type::uvec4_type);
   emit(MOV(u, src_reg(rounded)));

   src_reg bytes(u);
   emit(VEC4_OPCODE_PACK_BYTES, dst, bytes);
}

void
vec4_visitor::emit_pack_snorm_4x8(const dst_reg &dst, const src_reg &src0)
{
   dst_reg max(this, glsl_type::vec4_type);
   emit_minmax(BRW_CONDITIONAL_GE, max, src0, brw_imm_f(-1.0f));

   dst_reg min(this, glsl_type::vec4_type);
   emit_minmax(BRW_CONDITIONAL_L, min, src_reg(max), brw_imm_f(1.0f));

   dst_reg scaled(this, glsl_type::vec4_type);
   emit(MUL(scaled, src_reg(min), brw_imm_f(127.0f)));

   dst_reg rounded(this, glsl_type::vec4_type);
   emit(RNDE(rounded, src_reg(scaled)));

   dst_reg i(this, glsl_type::ivec4_type);
   emit(MOV(i, src_reg(rounded)));

   src_reg bytes(i);
   emit(VEC4_OPCODE_PACK_BYTES, dst, bytes);
}

/*
 * Returns the minimum number of vec4 (as_vec4 == true) or dvec4 (as_vec4 ==
 * false) elements needed to pack a type.
 */
static int
type_size_xvec4(const struct glsl_type *type, bool as_vec4, bool bindless)
{
   unsigned int i;
   int size;

   switch (type->base_type) {
   case GLSL_TYPE_UINT:
   case GLSL_TYPE_INT:
   case GLSL_TYPE_FLOAT:
   case GLSL_TYPE_FLOAT16:
   case GLSL_TYPE_BOOL:
   case GLSL_TYPE_DOUBLE:
   case GLSL_TYPE_UINT16:
   case GLSL_TYPE_INT16:
   case GLSL_TYPE_UINT8:
   case GLSL_TYPE_INT8:
   case GLSL_TYPE_UINT64:
   case GLSL_TYPE_INT64:
      if (type->is_matrix()) {
         const glsl_type *col_type = type->column_type();
         unsigned col_slots =
            (as_vec4 && col_type->is_dual_slot()) ? 2 : 1;
         return type->matrix_columns * col_slots;
      } else {
         /* Regardless of size of vector, it gets a vec4. This is bad
          * packing for things like floats, but otherwise arrays become a
          * mess.  Hopefully a later pass over the code can pack scalars
          * down if appropriate.
          */
         return (as_vec4 && type->is_dual_slot()) ? 2 : 1;
      }
   case GLSL_TYPE_ARRAY:
      assert(type->length > 0);
      return type_size_xvec4(type->fields.array, as_vec4, bindless) *
             type->length;
   case GLSL_TYPE_STRUCT:
   case GLSL_TYPE_INTERFACE:
      size = 0;
      for (i = 0; i < type->length; i++) {
         size += type_size_xvec4(type->fields.structure[i].type, as_vec4,
                                 bindless);
      }
      return size;
   case GLSL_TYPE_SUBROUTINE:
      return 1;

   case GLSL_TYPE_SAMPLER:
   case GLSL_TYPE_TEXTURE:
      /* Samplers and textures take up no register space, since they're baked
       * in at link time.
       */
      return bindless ? 1 : 0;
   case GLSL_TYPE_ATOMIC_UINT:
      return 0;
   case GLSL_TYPE_IMAGE:
      return bindless ? 1 : DIV_ROUND_UP(BRW_IMAGE_PARAM_SIZE, 4);
   case GLSL_TYPE_VOID:
   case GLSL_TYPE_ERROR:
   case GLSL_TYPE_FUNCTION:
      unreachable("not reached");
   }

   return 0;
}

/**
 * Returns the minimum number of vec4 elements needed to pack a type.
 *
 * For simple types, it will return 1 (a single vec4); for matrices, the
 * number of columns; for array and struct, the sum of the vec4_size of
 * each of its elements; and for sampler and atomic, zero.
 *
 * This method is useful to calculate how much register space is needed to
 * store a particular type.
 */
extern "C" int
type_size_vec4(const struct glsl_type *type, bool bindless)
{
   return type_size_xvec4(type, true, bindless);
}
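
/* A few worked examples of the above, derived from type_size_xvec4 with
 * as_vec4 == true: a float, vec2 or vec4 each occupy 1 slot; a mat3 occupies
 * 3 slots (one per column); a dvec4 is dual-slot and occupies 2; and a
 * float[8] array occupies 8.
 */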

/**
 * Returns the minimum number of dvec4 elements needed to pack a type.
 *
 * For simple types, it will return 1 (a single dvec4); for matrices, the
 * number of columns; for array and struct, the sum of the dvec4_size of
 * each of its elements; and for sampler and atomic, zero.
 *
 * This method is useful to calculate how much register space is needed to
 * store a particular type.
 *
 * Measuring double-precision vertex inputs as dvec4 is required because
 * ARB_vertex_attrib_64bit states that these use the same number of locations
 * as the single-precision version. That is, two consecutive dvec4s would be
 * located at location "x" and location "x+1", not "x+2".
 *
 * In order to map vec4/dvec4 vertex inputs to the proper ATTRs,
 * remap_vs_attrs() will take into account both the location and whether the
 * type fits in one or two vec4 slots.
 */
extern "C" int
type_size_dvec4(const struct glsl_type *type, bool bindless)
{
   return type_size_xvec4(type, false, bindless);
}

src_reg::src_reg(class vec4_visitor *v, const struct glsl_type *type)
{
   init();

   this->file = VGRF;
   this->nr = v->alloc.allocate(type_size_vec4(type, false));

   if (type->is_array() || type->is_struct()) {
      this->swizzle = BRW_SWIZZLE_NOOP;
   } else {
      this->swizzle = brw_swizzle_for_size(type->vector_elements);
   }

   this->type = brw_type_for_base_type(type);
}

src_reg::src_reg(class vec4_visitor *v, const struct glsl_type *type, int size)
{
   assert(size > 0);

   init();

   this->file = VGRF;
   this->nr = v->alloc.allocate(type_size_vec4(type, false) * size);

   this->swizzle = BRW_SWIZZLE_NOOP;

   this->type = brw_type_for_base_type(type);
}

dst_reg::dst_reg(class vec4_visitor *v, const struct glsl_type *type)
{
   init();

   this->file = VGRF;
   this->nr = v->alloc.allocate(type_size_vec4(type, false));

   if (type->is_array() || type->is_struct()) {
      this->writemask = WRITEMASK_XYZW;
   } else {
      this->writemask = (1 << type->vector_elements) - 1;
   }

   this->type = brw_type_for_base_type(type);
}

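/**
 * Emit a SEL with the given conditional mod, which computes the minimum
 * (BRW_CONDITIONAL_L) or maximum (BRW_CONDITIONAL_GE) of src0 and src1.
 */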
vec4_instruction *
vec4_visitor::emit_minmax(enum brw_conditional_mod conditionalmod, dst_reg dst,
                          src_reg src0, src_reg src1)
{
   vec4_instruction *inst = emit(BRW_OPCODE_SEL, dst, src0, src1);
   inst->conditional_mod = conditionalmod;
   return inst;
}

/**
 * Emits the instructions needed to perform a pull constant load. before_block
 * and before_inst can be NULL in which case the instruction will be appended
 * to the end of the instruction list.
 */
void
vec4_visitor::emit_pull_constant_load_reg(dst_reg dst,
                                          src_reg surf_index,
                                          src_reg offset_reg,
                                          bblock_t *before_block,
                                          vec4_instruction *before_inst)
{
   assert((before_inst == NULL && before_block == NULL) ||
          (before_inst && before_block));

   vec4_instruction *pull;

   if (devinfo->ver >= 7) {
      dst_reg grf_offset = dst_reg(this, glsl_type::uint_type);

      grf_offset.type = offset_reg.type;

      pull = MOV(grf_offset, offset_reg);

      if (before_inst)
         emit_before(before_block, before_inst, pull);
      else
         emit(pull);

      pull = new(mem_ctx) vec4_instruction(VS_OPCODE_PULL_CONSTANT_LOAD_GFX7,
                                           dst,
                                           surf_index,
                                           src_reg(grf_offset));
      pull->mlen = 1;
   } else {
      pull = new(mem_ctx) vec4_instruction(VS_OPCODE_PULL_CONSTANT_LOAD,
                                           dst,
                                           surf_index,
                                           offset_reg);
      pull->base_mrf = FIRST_PULL_LOAD_MRF(devinfo->ver) + 1;
      pull->mlen = 1;
   }

   if (before_inst)
      emit_before(before_block, before_inst, pull);
   else
      emit(pull);
}

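/**
 * Copy the value in an arbitrary live channel of @src into a register that
 * holds the same value in every channel: find a live channel index and
 * broadcast @src's value from that channel.  Both instructions use
 * force_writemask_all so the result is valid wherever a scalar/uniform value
 * is required.
 */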
src_reg
vec4_visitor::emit_uniformize(const src_reg &src)
{
   const src_reg chan_index(this, glsl_type::uint_type);
   const dst_reg dst = retype(dst_reg(this, glsl_type::uint_type),
                              src.type);

   emit(SHADER_OPCODE_FIND_LIVE_CHANNEL, dst_reg(chan_index))
      ->force_writemask_all = true;
   emit(SHADER_OPCODE_BROADCAST, dst, src, chan_index)
      ->force_writemask_all = true;

   return src_reg(dst);
}

void
vec4_visitor::gs_emit_vertex(int /* stream_id */)
{
   unreachable("not reached");
}

void
vec4_visitor::gs_end_primitive()
{
   unreachable("not reached");
}

void
vec4_visitor::emit_ndc_computation()
{
   if (output_reg[VARYING_SLOT_POS][0].file == BAD_FILE)
      return;

   /* Get the position */
   src_reg pos = src_reg(output_reg[VARYING_SLOT_POS][0]);

   /* Build ndc coords, which are (x/w, y/w, z/w, 1/w) */
   dst_reg ndc = dst_reg(this, glsl_type::vec4_type);
   output_reg[BRW_VARYING_SLOT_NDC][0] = ndc;
   output_num_components[BRW_VARYING_SLOT_NDC][0] = 4;

   current_annotation = "NDC";
   dst_reg ndc_w = ndc;
   ndc_w.writemask = WRITEMASK_W;
   src_reg pos_w = pos;
   pos_w.swizzle = BRW_SWIZZLE4(SWIZZLE_W, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W);
   emit_math(SHADER_OPCODE_RCP, ndc_w, pos_w);

   dst_reg ndc_xyz = ndc;
   ndc_xyz.writemask = WRITEMASK_XYZ;

   emit(MUL(ndc_xyz, pos, src_reg(ndc_w)));
}

void
vec4_visitor::emit_psiz_and_flags(dst_reg reg)
{
   if (devinfo->ver < 6 &&
       ((prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) ||
        output_reg[VARYING_SLOT_CLIP_DIST0][0].file != BAD_FILE ||
        devinfo->has_negative_rhw_bug)) {
      dst_reg header1 = dst_reg(this, glsl_type::uvec4_type);
      dst_reg header1_w = header1;
      header1_w.writemask = WRITEMASK_W;

      emit(MOV(header1, brw_imm_ud(0u)));

      if (prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) {
         src_reg psiz = src_reg(output_reg[VARYING_SLOT_PSIZ][0]);

         current_annotation = "Point size";
         emit(MUL(header1_w, psiz, brw_imm_f((float)(1 << 11))));
         emit(AND(header1_w, src_reg(header1_w), brw_imm_d(0x7ff << 8)));
      }

      if (output_reg[VARYING_SLOT_CLIP_DIST0][0].file != BAD_FILE) {
         current_annotation = "Clipping flags";
         dst_reg flags0 = dst_reg(this, glsl_type::uint_type);

         emit(CMP(dst_null_f(), src_reg(output_reg[VARYING_SLOT_CLIP_DIST0][0]), brw_imm_f(0.0f), BRW_CONDITIONAL_L));
         emit(VS_OPCODE_UNPACK_FLAGS_SIMD4X2, flags0, brw_imm_d(0));
         emit(OR(header1_w, src_reg(header1_w), src_reg(flags0)));
      }

      if (output_reg[VARYING_SLOT_CLIP_DIST1][0].file != BAD_FILE) {
         dst_reg flags1 = dst_reg(this, glsl_type::uint_type);
         emit(CMP(dst_null_f(), src_reg(output_reg[VARYING_SLOT_CLIP_DIST1][0]), brw_imm_f(0.0f), BRW_CONDITIONAL_L));
         emit(VS_OPCODE_UNPACK_FLAGS_SIMD4X2, flags1, brw_imm_d(0));
         emit(SHL(flags1, src_reg(flags1), brw_imm_d(4)));
         emit(OR(header1_w, src_reg(header1_w), src_reg(flags1)));
      }

      /* i965 clipping workaround:
       * 1) Test for -ve rhw
       * 2) If set,
       *      set ndc = (0,0,0,0)
       *      set ucp[6] = 1
       *
       * Later, clipping will detect ucp[6] and ensure the primitive is
       * clipped against all fixed planes.
       */
      if (devinfo->has_negative_rhw_bug &&
          output_reg[BRW_VARYING_SLOT_NDC][0].file != BAD_FILE) {
         src_reg ndc_w = src_reg(output_reg[BRW_VARYING_SLOT_NDC][0]);
         ndc_w.swizzle = BRW_SWIZZLE_WWWW;
         emit(CMP(dst_null_f(), ndc_w, brw_imm_f(0.0f), BRW_CONDITIONAL_L));
         vec4_instruction *inst;
         inst = emit(OR(header1_w, src_reg(header1_w), brw_imm_ud(1u << 6)));
         inst->predicate = BRW_PREDICATE_NORMAL;
         output_reg[BRW_VARYING_SLOT_NDC][0].type = BRW_REGISTER_TYPE_F;
         inst = emit(MOV(output_reg[BRW_VARYING_SLOT_NDC][0], brw_imm_f(0.0f)));
         inst->predicate = BRW_PREDICATE_NORMAL;
      }

      emit(MOV(retype(reg, BRW_REGISTER_TYPE_UD), src_reg(header1)));
   } else if (devinfo->ver < 6) {
      emit(MOV(retype(reg, BRW_REGISTER_TYPE_UD), brw_imm_ud(0u)));
   } else {
      emit(MOV(retype(reg, BRW_REGISTER_TYPE_D), brw_imm_d(0)));
      if (output_reg[VARYING_SLOT_PSIZ][0].file != BAD_FILE) {
         dst_reg reg_w = reg;
         reg_w.writemask = WRITEMASK_W;
         src_reg reg_as_src = src_reg(output_reg[VARYING_SLOT_PSIZ][0]);
         reg_as_src.type = reg_w.type;
         reg_as_src.swizzle = brw_swizzle_for_size(1);
         emit(MOV(reg_w, reg_as_src));
      }
      if (output_reg[VARYING_SLOT_LAYER][0].file != BAD_FILE) {
         dst_reg reg_y = reg;
         reg_y.writemask = WRITEMASK_Y;
         reg_y.type = BRW_REGISTER_TYPE_D;
         output_reg[VARYING_SLOT_LAYER][0].type = reg_y.type;
         emit(MOV(reg_y, src_reg(output_reg[VARYING_SLOT_LAYER][0])));
      }
      if (output_reg[VARYING_SLOT_VIEWPORT][0].file != BAD_FILE) {
         dst_reg reg_z = reg;
         reg_z.writemask = WRITEMASK_Z;
         reg_z.type = BRW_REGISTER_TYPE_D;
         output_reg[VARYING_SLOT_VIEWPORT][0].type = reg_z.type;
         emit(MOV(reg_z, src_reg(output_reg[VARYING_SLOT_VIEWPORT][0])));
      }
   }
}

vec4_instruction *
vec4_visitor::emit_generic_urb_slot(dst_reg reg, int varying, int component)
{
   assert(varying < VARYING_SLOT_MAX);

   unsigned num_comps = output_num_components[varying][component];
   if (num_comps == 0)
      return NULL;

   assert(output_reg[varying][component].type == reg.type);
   current_annotation = output_reg_annotation[varying];
   if (output_reg[varying][component].file != BAD_FILE) {
      src_reg src = src_reg(output_reg[varying][component]);
      src.swizzle = BRW_SWZ_COMP_OUTPUT(component);
      reg.writemask =
         brw_writemask_for_component_packing(num_comps, component);
      return emit(MOV(reg, src));
   }
   return NULL;
}

void
vec4_visitor::emit_urb_slot(dst_reg reg, int varying)
{
   reg.type = BRW_REGISTER_TYPE_F;
   output_reg[varying][0].type = reg.type;

   switch (varying) {
   case VARYING_SLOT_PSIZ:
   {
      /* PSIZ is always in slot 0, and is coupled with other flags. */
      current_annotation = "indices, point width, clip flags";
      emit_psiz_and_flags(reg);
      break;
   }
   case BRW_VARYING_SLOT_NDC:
      current_annotation = "NDC";
      if (output_reg[BRW_VARYING_SLOT_NDC][0].file != BAD_FILE)
         emit(MOV(reg, src_reg(output_reg[BRW_VARYING_SLOT_NDC][0])));
      break;
   case VARYING_SLOT_POS:
      current_annotation = "gl_Position";
      if (output_reg[VARYING_SLOT_POS][0].file != BAD_FILE)
         emit(MOV(reg, src_reg(output_reg[VARYING_SLOT_POS][0])));
      break;
   case BRW_VARYING_SLOT_PAD:
      /* No need to write to this slot */
      break;
   default:
      for (int i = 0; i < 4; i++) {
         emit_generic_urb_slot(reg, varying, i);
      }
      break;
   }
}

static unsigned
align_interleaved_urb_mlen(const struct intel_device_info *devinfo,
                           unsigned mlen)
{
   if (devinfo->ver >= 6) {
      /* URB data written (does not include the message header reg) must
       * be a multiple of 256 bits, or 2 VS registers.  See vol5c.5,
       * section 5.4.3.2.2: URB_INTERLEAVED.
       *
       * URB entries are allocated on a multiple of 1024 bits, so an
       * extra 128 bits written here to make the end align to 256 is
       * no problem.
       */
      if ((mlen % 2) != 1)
         mlen++;
   }

   return mlen;
}
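
/* For example: since mlen counts the message header, mlen == 4 (header plus
 * three data registers) would leave an odd number of URB data registers, so
 * it is bumped to 5 (header plus four data registers, a 256-bit multiple);
 * mlen == 3 already leaves two data registers and is left alone.
 */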


/**
 * Generates the VUE payload plus the necessary URB write instructions to
 * output it.
 *
 * The VUE layout is documented in Volume 2a.
 */
void
vec4_visitor::emit_vertex()
{
   /* MRF 0 is reserved for the debugger, so start with message header
    * in MRF 1.
    */
   int base_mrf = 1;
   int mrf = base_mrf;
   /* In the process of generating our URB write message contents, we
    * may need to unspill a register or load from an array.  Those
    * reads would use MRFs 14-15.
    */
   int max_usable_mrf = FIRST_SPILL_MRF(devinfo->ver);

   /* The following assertion verifies that max_usable_mrf causes an
    * even-numbered amount of URB write data, which will meet gfx6's
    * requirements for length alignment.
    */
   assert ((max_usable_mrf - base_mrf) % 2 == 0);

   /* First mrf is the g0-based message header containing URB handles and
    * such.
    */
   emit_urb_write_header(mrf++);

   if (devinfo->ver < 6) {
      emit_ndc_computation();
   }

   /* We may need to split this up into several URB writes, so do them in a
    * loop.
    */
   int slot = 0;
   bool complete = false;
   do {
      /* URB offset is in URB row increments, and each of our MRFs is half of
       * one of those, since we're doing interleaved writes.
       */
      int offset = slot / 2;

      mrf = base_mrf + 1;
      for (; slot < prog_data->vue_map.num_slots; ++slot) {
         emit_urb_slot(dst_reg(MRF, mrf++),
                       prog_data->vue_map.slot_to_varying[slot]);

         /* If this was max_usable_mrf, we can't fit anything more into this
          * URB WRITE. Same thing if we reached the maximum length available.
          */
         if (mrf > max_usable_mrf ||
             align_interleaved_urb_mlen(devinfo, mrf - base_mrf + 1) > BRW_MAX_MSG_LENGTH) {
            slot++;
            break;
         }
      }

      complete = slot >= prog_data->vue_map.num_slots;
      current_annotation = "URB write";
      vec4_instruction *inst = emit_urb_write_opcode(complete);
      inst->base_mrf = base_mrf;
      inst->mlen = align_interleaved_urb_mlen(devinfo, mrf - base_mrf);
      inst->offset += offset;
   } while(!complete);
}


src_reg
vec4_visitor::get_scratch_offset(bblock_t *block, vec4_instruction *inst,
                                 src_reg *reladdr, int reg_offset)
{
   /* Because we store the values to scratch interleaved like our
    * vertex data, we need to scale the vec4 index by 2.
    */
   int message_header_scale = 2;

   /* Pre-gfx6, the message header uses byte offsets instead of vec4
    * (16-byte) offset units.
    */
   if (devinfo->ver < 6)
      message_header_scale *= 16;

   if (reladdr) {
      /* A vec4 is 16 bytes and a dvec4 is 32 bytes so for doubles we have
       * to multiply the reladdr by 2. Notice that the reg_offset part
       * is in units of 16 bytes and is used to select the low/high 16-byte
       * chunk of a full dvec4, so we don't want to multiply that part.
       */
      src_reg index = src_reg(this, glsl_type::int_type);
      if (type_sz(inst->dst.type) < 8) {
         emit_before(block, inst, ADD(dst_reg(index), *reladdr,
                                      brw_imm_d(reg_offset)));
         emit_before(block, inst, MUL(dst_reg(index), index,
                                      brw_imm_d(message_header_scale)));
      } else {
         emit_before(block, inst, MUL(dst_reg(index), *reladdr,
                                      brw_imm_d(message_header_scale * 2)));
         emit_before(block, inst, ADD(dst_reg(index), index,
                                      brw_imm_d(reg_offset * message_header_scale)));
      }
      return index;
   } else {
      return brw_imm_d(reg_offset * message_header_scale);
   }
}
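
/* As an illustration of the units involved (arithmetic only, derived from the
 * scale factors above): a constant reg_offset of 3 yields brw_imm_d(6) on
 * gfx6+ (vec4 index scaled by 2 for the interleaved layout) and brw_imm_d(96)
 * before gfx6, where the header wants a byte offset (3 * 2 * 16 bytes).
 */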

/**
 * Emits an instruction before @inst to load the value named by @orig_src
 * from scratch space at @base_offset to @temp.
 *
 * @base_offset is measured in 32-byte units (the size of a register).
 */
void
vec4_visitor::emit_scratch_read(bblock_t *block, vec4_instruction *inst,
                                dst_reg temp, src_reg orig_src,
                                int base_offset)
{
   assert(orig_src.offset % REG_SIZE == 0);
   int reg_offset = base_offset + orig_src.offset / REG_SIZE;
   src_reg index = get_scratch_offset(block, inst, orig_src.reladdr,
                                      reg_offset);

   if (type_sz(orig_src.type) < 8) {
      emit_before(block, inst, SCRATCH_READ(temp, index));
   } else {
      dst_reg shuffled = dst_reg(this, glsl_type::dvec4_type);
      dst_reg shuffled_float = retype(shuffled, BRW_REGISTER_TYPE_F);
      emit_before(block, inst, SCRATCH_READ(shuffled_float, index));
      index = get_scratch_offset(block, inst, orig_src.reladdr, reg_offset + 1);
      vec4_instruction *last_read =
         SCRATCH_READ(byte_offset(shuffled_float, REG_SIZE), index);
      emit_before(block, inst, last_read);
      shuffle_64bit_data(temp, src_reg(shuffled), false, true, block, last_read);
   }
}

/**
 * Emits an instruction after @inst to store the value to be written
 * to @orig_dst to scratch space at @base_offset, from @temp.
 *
 * @base_offset is measured in 32-byte units (the size of a register).
 */
void
vec4_visitor::emit_scratch_write(bblock_t *block, vec4_instruction *inst,
                                 int base_offset)
{
   assert(inst->dst.offset % REG_SIZE == 0);
   int reg_offset = base_offset + inst->dst.offset / REG_SIZE;
   src_reg index = get_scratch_offset(block, inst, inst->dst.reladdr,
                                      reg_offset);

   /* Create a temporary register to store *inst's result in.
    *
    * We have to be careful in MOVing from our temporary result register in
    * the scratch write.  If we swizzle from channels of the temporary that
    * weren't initialized, it will confuse live interval analysis, which will
    * make spilling fail to make progress.
    */
   bool is_64bit = type_sz(inst->dst.type) == 8;
   const glsl_type *alloc_type =
      is_64bit ? glsl_type::dvec4_type : glsl_type::vec4_type;
   const src_reg temp = swizzle(retype(src_reg(this, alloc_type),
                                       inst->dst.type),
                                brw_swizzle_for_mask(inst->dst.writemask));

   if (!is_64bit) {
      dst_reg dst = dst_reg(brw_writemask(brw_vec8_grf(0, 0),
                                          inst->dst.writemask));
      vec4_instruction *write = SCRATCH_WRITE(dst, temp, index);
      if (inst->opcode != BRW_OPCODE_SEL)
         write->predicate = inst->predicate;
      write->ir = inst->ir;
      write->annotation = inst->annotation;
      inst->insert_after(block, write);
   } else {
      dst_reg shuffled = dst_reg(this, alloc_type);
      vec4_instruction *last =
         shuffle_64bit_data(shuffled, temp, true, true, block, inst);
      src_reg shuffled_float = src_reg(retype(shuffled, BRW_REGISTER_TYPE_F));

      uint8_t mask = 0;
      if (inst->dst.writemask & WRITEMASK_X)
         mask |= WRITEMASK_XY;
      if (inst->dst.writemask & WRITEMASK_Y)
         mask |= WRITEMASK_ZW;
      if (mask) {
         dst_reg dst = dst_reg(brw_writemask(brw_vec8_grf(0, 0), mask));

         vec4_instruction *write = SCRATCH_WRITE(dst, shuffled_float, index);
         if (inst->opcode != BRW_OPCODE_SEL)
            write->predicate = inst->predicate;
         write->ir = inst->ir;
         write->annotation = inst->annotation;
         last->insert_after(block, write);
      }

      mask = 0;
      if (inst->dst.writemask & WRITEMASK_Z)
         mask |= WRITEMASK_XY;
      if (inst->dst.writemask & WRITEMASK_W)
         mask |= WRITEMASK_ZW;
      if (mask) {
         dst_reg dst = dst_reg(brw_writemask(brw_vec8_grf(0, 0), mask));

         src_reg index = get_scratch_offset(block, inst, inst->dst.reladdr,
                                            reg_offset + 1);
         vec4_instruction *write =
            SCRATCH_WRITE(dst, byte_offset(shuffled_float, REG_SIZE), index);
         if (inst->opcode != BRW_OPCODE_SEL)
            write->predicate = inst->predicate;
         write->ir = inst->ir;
         write->annotation = inst->annotation;
         last->insert_after(block, write);
      }
   }

   inst->dst.file = temp.file;
   inst->dst.nr = temp.nr;
   inst->dst.offset %= REG_SIZE;
   inst->dst.reladdr = NULL;
}

/**
 * Checks if \p src and/or \p src.reladdr require a scratch read, and if so,
 * adds the scratch read(s) before \p inst. The function also checks for
 * recursive reladdr scratch accesses, issuing the corresponding scratch
 * loads and rewriting reladdr references accordingly.
 *
 * \return \p src if it did not require a scratch load, otherwise, the
 * register holding the result of the scratch load that the caller should
 * use to rewrite src.
 */
src_reg
vec4_visitor::emit_resolve_reladdr(int scratch_loc[], bblock_t *block,
                                   vec4_instruction *inst, src_reg src)
{
   /* Resolve recursive reladdr scratch access by calling ourselves
    * with src.reladdr
    */
   if (src.reladdr)
      *src.reladdr = emit_resolve_reladdr(scratch_loc, block, inst,
                                          *src.reladdr);

   /* Now handle scratch access on src */
   if (src.file == VGRF && scratch_loc[src.nr] != -1) {
      dst_reg temp = dst_reg(this, type_sz(src.type) == 8 ?
         glsl_type::dvec4_type : glsl_type::vec4_type);
      emit_scratch_read(block, inst, temp, src, scratch_loc[src.nr]);
      src.nr = temp.nr;
      src.offset %= REG_SIZE;
      src.reladdr = NULL;
   }

   return src;
}

/**
 * We can't generally support array access in GRF space, because a
 * single instruction's destination can only span 2 contiguous
 * registers.  So, we send all GRF arrays that get variable index
 * access to scratch space.
 */
void
vec4_visitor::move_grf_array_access_to_scratch()
{
   int scratch_loc[this->alloc.count];
   memset(scratch_loc, -1, sizeof(scratch_loc));

   /* First, calculate the set of virtual GRFs that need to be punted
    * to scratch due to having any array access on them, and where in
    * scratch.
    */
   foreach_block_and_inst(block, vec4_instruction, inst, cfg) {
      if (inst->dst.file == VGRF && inst->dst.reladdr) {
         if (scratch_loc[inst->dst.nr] == -1) {
            scratch_loc[inst->dst.nr] = last_scratch;
            last_scratch += this->alloc.sizes[inst->dst.nr];
         }

         for (src_reg *iter = inst->dst.reladdr;
              iter->reladdr;
              iter = iter->reladdr) {
            if (iter->file == VGRF && scratch_loc[iter->nr] == -1) {
               scratch_loc[iter->nr] = last_scratch;
               last_scratch += this->alloc.sizes[iter->nr];
            }
         }
      }

      for (int i = 0 ; i < 3; i++) {
         for (src_reg *iter = &inst->src[i];
              iter->reladdr;
              iter = iter->reladdr) {
            if (iter->file == VGRF && scratch_loc[iter->nr] == -1) {
               scratch_loc[iter->nr] = last_scratch;
               last_scratch += this->alloc.sizes[iter->nr];
            }
         }
      }
   }

   /* Now, for anything that will be accessed through scratch, rewrite
    * it to load/store.  Note that this is a _safe list walk, because
    * we may generate a new scratch_write instruction after the one
    * we're processing.
    */
   foreach_block_and_inst_safe(block, vec4_instruction, inst, cfg) {
      /* Set up the annotation tracking for new generated instructions. */
      base_ir = inst->ir;
      current_annotation = inst->annotation;

      /* First handle scratch access on the dst. Notice we have to handle
       * the case where the dst's reladdr also points to scratch space.
       */
      if (inst->dst.reladdr)
         *inst->dst.reladdr = emit_resolve_reladdr(scratch_loc, block, inst,
                                                   *inst->dst.reladdr);

      /* Now that we have handled any (possibly recursive) reladdr scratch
       * accesses for dst we can safely do the scratch write for dst itself
       */
      if (inst->dst.file == VGRF && scratch_loc[inst->dst.nr] != -1)
         emit_scratch_write(block, inst, scratch_loc[inst->dst.nr]);

      /* Now handle scratch access on any src. In this case, since inst->src[i]
       * already is a src_reg, we can just call emit_resolve_reladdr with
       * inst->src[i] and it will take care of handling scratch loads for
       * both src and src.reladdr (recursively).
       */
      for (int i = 0 ; i < 3; i++) {
         inst->src[i] = emit_resolve_reladdr(scratch_loc, block, inst,
                                             inst->src[i]);
      }
   }
}

void
vec4_visitor::resolve_ud_negate(src_reg *reg)
{
   if (reg->type != BRW_REGISTER_TYPE_UD ||
       !reg->negate)
      return;

   src_reg temp = src_reg(this, glsl_type::uvec4_type);
   emit(BRW_OPCODE_MOV, dst_reg(temp), *reg);
   *reg = temp;
}

vec4_visitor::vec4_visitor(const struct brw_compiler *compiler,
                           void *log_data,
                           const struct brw_sampler_prog_key_data *key_tex,
                           struct brw_vue_prog_data *prog_data,
                           const nir_shader *shader,
                           void *mem_ctx,
                           bool no_spills,
                           bool debug_enabled)
   : backend_shader(compiler, log_data, mem_ctx, shader, &prog_data->base,
                    debug_enabled),
     key_tex(key_tex),
     prog_data(prog_data),
     fail_msg(NULL),
     first_non_payload_grf(0),
     ubo_push_start(),
     push_length(0),
     live_analysis(this), performance_analysis(this),
     need_all_constants_in_pull_buffer(false),
     no_spills(no_spills),
     last_scratch(0)
{
   this->failed = false;

   this->base_ir = NULL;
   this->current_annotation = NULL;
   memset(this->output_reg_annotation, 0, sizeof(this->output_reg_annotation));

   memset(this->output_num_components, 0, sizeof(this->output_num_components));

   this->max_grf = devinfo->ver >= 7 ? GFX7_MRF_HACK_START : BRW_MAX_GRF;

   this->uniforms = 0;

   this->nir_locals = NULL;
   this->nir_ssa_values = NULL;
}


void
vec4_visitor::fail(const char *format, ...)
{
   va_list va;
   char *msg;

   if (failed)
      return;

   failed = true;

   va_start(va, format);
   msg = ralloc_vasprintf(mem_ctx, format, va);
   va_end(va);
   msg = ralloc_asprintf(mem_ctx, "%s compile failed: %s\n", stage_abbrev, msg);

   this->fail_msg = msg;

   if (unlikely(debug_enabled)) {
      fprintf(stderr, "%s",  msg);
   }
}

} /* namespace brw */