1/*
2 * Copyright 2010 Jerome Glisse <glisse@freedesktop.org>
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * on the rights to use, copy, modify, merge, publish, distribute, sub
8 * license, and/or sell copies of the Software, and to permit persons to whom
9 * the Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
19 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
20 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
21 * USE OR OTHER DEALINGS IN THE SOFTWARE.
22 */
23#include "r600_sq.h"
24#include "r600_formats.h"
25#include "r600_opcodes.h"
26#include "r600_shader.h"
27#include "r600_dump.h"
28#include "r600d.h"
29#include "sfn/sfn_nir.h"
30
31#include "sb/sb_public.h"
32
33#include "pipe/p_shader_tokens.h"
34#include "tgsi/tgsi_info.h"
35#include "tgsi/tgsi_parse.h"
36#include "tgsi/tgsi_scan.h"
37#include "tgsi/tgsi_dump.h"
38#include "tgsi/tgsi_from_mesa.h"
39#include "nir/tgsi_to_nir.h"
40#include "nir/nir_to_tgsi_info.h"
41#include "compiler/nir/nir.h"
42#include "util/u_bitcast.h"
43#include "util/u_memory.h"
44#include "util/u_math.h"
45#include <stdio.h>
46#include <errno.h>
47
48/* CAYMAN notes
49Why CAYMAN got loops for lots of instructions is explained here.
50
51-These 8xx t-slot only ops are implemented in all vector slots.
52MUL_LIT, FLT_TO_UINT, INT_TO_FLT, UINT_TO_FLT
53These 8xx t-slot only opcodes become vector ops, with all four
54slots expecting the arguments on sources a and b. Result is
55broadcast to all channels.
56MULLO_INT, MULHI_INT, MULLO_UINT, MULHI_UINT, MUL_64
57These 8xx t-slot only opcodes become vector ops in the z, y, and
58x slots.
59EXP_IEEE, LOG_IEEE/CLAMPED, RECIP_IEEE/CLAMPED/FF/INT/UINT/_64/CLAMPED_64
60RECIPSQRT_IEEE/CLAMPED/FF/_64/CLAMPED_64
61SQRT_IEEE/_64
62SIN/COS
63The w slot may have an independent co-issued operation, or if the
64result is required to be in the w slot, the opcode above may be
65issued in the w slot as well.
66The compiler must issue the source argument to slots z, y, and x
67*/
68
69/* Contents of r0 on entry to various shaders
70
71 VS - .x = VertexID
72      .y = RelVertexID (??)
73      .w = InstanceID
74
75 GS - r0.xyw, r1.xyz = per-vertex offsets
76      r0.z = PrimitiveID
77
78 TCS - .x = PatchID
79       .y = RelPatchID (??)
80       .z = InvocationID
81       .w = tess factor base.
82
83 TES - .x = TessCoord.x
84     - .y = TessCoord.y
85     - .z = RelPatchID (??)
86     - .w = PrimitiveID
87
88 PS - face_gpr.z = SampleMask
89      face_gpr.w = SampleID
90*/
91#define R600_SHADER_BUFFER_INFO_SEL (512 + R600_BUFFER_INFO_OFFSET / 16)
92static int r600_shader_from_tgsi(struct r600_context *rctx,
93				 struct r600_pipe_shader *pipeshader,
94				 union r600_shader_key key);
95
96static void r600_add_gpr_array(struct r600_shader *ps, int start_gpr,
97                           int size, unsigned comp_mask) {
98
99	if (!size)
100		return;
101
102	if (ps->num_arrays == ps->max_arrays) {
103		ps->max_arrays += 64;
104		ps->arrays = realloc(ps->arrays, ps->max_arrays *
105		                     sizeof(struct r600_shader_array));
106	}
107
108	int n = ps->num_arrays;
109	++ps->num_arrays;
110
111	ps->arrays[n].comp_mask = comp_mask;
112	ps->arrays[n].gpr_start = start_gpr;
113	ps->arrays[n].gpr_count = size;
114}
115
116static void r600_dump_streamout(struct pipe_stream_output_info *so)
117{
118	unsigned i;
119
120	fprintf(stderr, "STREAMOUT\n");
121	for (i = 0; i < so->num_outputs; i++) {
122		unsigned mask = ((1 << so->output[i].num_components) - 1) <<
123				so->output[i].start_component;
124		fprintf(stderr, "  %i: MEM_STREAM%d_BUF%i[%i..%i] <- OUT[%i].%s%s%s%s%s\n",
125			i,
126			so->output[i].stream,
127			so->output[i].output_buffer,
128			so->output[i].dst_offset, so->output[i].dst_offset + so->output[i].num_components - 1,
129			so->output[i].register_index,
130			mask & 1 ? "x" : "",
131		        mask & 2 ? "y" : "",
132		        mask & 4 ? "z" : "",
133		        mask & 8 ? "w" : "",
134			so->output[i].dst_offset < so->output[i].start_component ? " (will lower)" : "");
135	}
136}
137
138static int store_shader(struct pipe_context *ctx,
139			struct r600_pipe_shader *shader)
140{
141	struct r600_context *rctx = (struct r600_context *)ctx;
142	uint32_t *ptr, i;
143
144	if (shader->bo == NULL) {
145		shader->bo = (struct r600_resource*)
146			pipe_buffer_create(ctx->screen, 0, PIPE_USAGE_IMMUTABLE, shader->shader.bc.ndw * 4);
147		if (shader->bo == NULL) {
148			return -ENOMEM;
149		}
150		ptr = r600_buffer_map_sync_with_rings(
151			&rctx->b, shader->bo,
152			PIPE_MAP_WRITE | RADEON_MAP_TEMPORARY);
153		if (R600_BIG_ENDIAN) {
154			for (i = 0; i < shader->shader.bc.ndw; ++i) {
155				ptr[i] = util_cpu_to_le32(shader->shader.bc.bytecode[i]);
156			}
157		} else {
158			memcpy(ptr, shader->shader.bc.bytecode, shader->shader.bc.ndw * sizeof(*ptr));
159		}
160		rctx->b.ws->buffer_unmap(rctx->b.ws, shader->bo->buf);
161	}
162
163	return 0;
164}
165
extern const struct nir_shader_compiler_options r600_nir_options;
/* Running count of compiled shaders; used to number the debug dumps
 * emitted by print_shader_info(). */
static int nshader = 0;
/* Compile the selected shader into r600 bytecode, optionally run the SB
 * optimizer, upload the result to a GPU buffer and build the stage-specific
 * hardware state. Returns 0 on success or a negative errno; on failure the
 * shader is destroyed before returning. */
int r600_pipe_shader_create(struct pipe_context *ctx,
			    struct r600_pipe_shader *shader,
			    union r600_shader_key key)
{
	struct r600_context *rctx = (struct r600_context *)ctx;
	struct r600_pipe_shader_selector *sel = shader->selector;
	int r;
	struct r600_screen *rscreen = (struct r600_screen *)ctx->screen;

	int processor = sel->ir_type == PIPE_SHADER_IR_TGSI ?
		tgsi_get_processor_type(sel->tokens):
		pipe_shader_type_from_mesa(sel->nir->info.stage);

	bool dump = r600_can_dump_shader(&rctx->screen->b, processor);
	/* SB (the post-bytecode optimizer) is on unless DBG_NO_SB, but can be
	 * forced back on for the NIR path with DBG_NIR_SB. */
	unsigned use_sb = !(rctx->screen->b.debug_flags & DBG_NO_SB) ||
                     (rctx->screen->b.debug_flags & DBG_NIR_SB);
	unsigned sb_disasm;
	unsigned export_shader;

	shader->shader.bc.isa = rctx->isa;

	if (!(rscreen->b.debug_flags & DBG_NIR_PREFERRED)) {
		/* Legacy path: translate TGSI directly to bytecode. */
		assert(sel->ir_type == PIPE_SHADER_IR_TGSI);
		r = r600_shader_from_tgsi(rctx, shader, key);
		if (r) {
			R600_ERR("translation from TGSI failed !\n");
			goto error;
		}
	} else {
		/* NIR path: convert TGSI input to NIR first if necessary. */
		if (sel->ir_type == PIPE_SHADER_IR_TGSI) {
			if (sel->nir)
				ralloc_free(sel->nir);
			sel->nir = tgsi_to_nir(sel->tokens, ctx->screen, true);
                        const nir_shader_compiler_options *nir_options =
                              (const nir_shader_compiler_options *)
                              ctx->screen->get_compiler_options(ctx->screen,
                                                                PIPE_SHADER_IR_NIR,
                                                                shader->shader.processor_type);
                        /* Lower int64 ops because we have some r600 built-in shaders that use it */
			if (nir_options->lower_int64_options) {
				NIR_PASS_V(sel->nir, nir_lower_regs_to_ssa);
				NIR_PASS_V(sel->nir, nir_lower_alu_to_scalar, NULL, NULL);
				NIR_PASS_V(sel->nir, nir_lower_int64);
				NIR_PASS_V(sel->nir, nir_opt_vectorize, NULL, NULL);
			}
			NIR_PASS_V(sel->nir, nir_lower_flrp, ~0, false);
		}
		nir_tgsi_scan_shader(sel->nir, &sel->info, true);

		r = r600_shader_from_nir(rctx, shader, &key);
		if (r) {
			/* Dump both IR forms of the failing shader to help debugging. */
			fprintf(stderr, "--Failed shader--------------------------------------------------\n");

			if (sel->ir_type == PIPE_SHADER_IR_TGSI) {
				fprintf(stderr, "--TGSI--------------------------------------------------------\n");
				tgsi_dump(sel->tokens, 0);
			}

			if (rscreen->b.debug_flags & (DBG_NIR_PREFERRED)) {
				fprintf(stderr, "--NIR --------------------------------------------------------\n");
				nir_print_shader(sel->nir, stderr);
			}

			R600_ERR("translation from NIR failed !\n");
			goto error;
		}
	}

	if (dump) {
		if (sel->ir_type == PIPE_SHADER_IR_TGSI) {
			fprintf(stderr, "--TGSI--------------------------------------------------------\n");
			tgsi_dump(sel->tokens, 0);
		}

		if (sel->so.num_outputs) {
			r600_dump_streamout(&sel->so);
		}
	}

	/* Disable SB for shader types / features it cannot handle correctly. */
	if (shader->shader.processor_type == PIPE_SHADER_VERTEX) {
		/* only disable for vertex shaders in tess paths */
		if (key.vs.as_ls)
			use_sb = 0;
	}
	use_sb &= (shader->shader.processor_type != PIPE_SHADER_TESS_CTRL);
	use_sb &= (shader->shader.processor_type != PIPE_SHADER_TESS_EVAL);
	use_sb &= (shader->shader.processor_type != PIPE_SHADER_COMPUTE);

	/* disable SB for shaders using doubles */
	use_sb &= !shader->shader.uses_doubles;

	use_sb &= !shader->shader.uses_atomics;
	use_sb &= !shader->shader.uses_images;
	use_sb &= !shader->shader.uses_helper_invocation;

	/* SB can't handle READ_SCRATCH properly */
	use_sb &= !(shader->shader.needs_scratch_space && rscreen->b.gfx_level < R700);

	/* sb has bugs in array reg allocation
	 * (dEQP-GLES2.functional.shaders.struct.local.struct_array_dynamic_index_fragment
	 * with NTT)
	 */
	use_sb &= !(shader->shader.indirect_files & (1 << TGSI_FILE_TEMPORARY));
	use_sb &= !(shader->shader.indirect_files & (1 << TGSI_FILE_CONSTANT));

	/* sb has scheduling assertion fails with interpolate_at. */
	use_sb &= !shader->shader.uses_interpolate_at_sample;

	/* Check if the bytecode has already been built. */
	if (!shader->shader.bc.bytecode) {
		r = r600_bytecode_build(&shader->shader.bc);
		if (r) {
			R600_ERR("building bytecode failed !\n");
			goto error;
		}
	}

	/* Run SB and/or produce a disassembly dump of the final bytecode. */
	sb_disasm = use_sb || (rctx->screen->b.debug_flags & DBG_SB_DISASM);
	if (dump && !sb_disasm) {
		fprintf(stderr, "--------------------------------------------------------------\n");
		r600_bytecode_disasm(&shader->shader.bc);
		fprintf(stderr, "______________________________________________________________\n");
	} else if ((dump && sb_disasm) || use_sb) {
                r = r600_sb_bytecode_process(rctx, &shader->shader.bc, &shader->shader,
		                             dump, use_sb);
		if (r) {
			R600_ERR("r600_sb_bytecode_process failed !\n");
			goto error;
		}
	}

	if (dump) {
		print_shader_info(stderr, nshader++, &shader->shader);
		print_pipe_info(stderr, &sel->info);
	}

	/* Geometry shaders carry a companion copy shader that feeds the VS path. */
	if (shader->gs_copy_shader) {
		if (dump) {
			// dump copy shader
			r = r600_sb_bytecode_process(rctx, &shader->gs_copy_shader->shader.bc,
						     &shader->gs_copy_shader->shader, dump, 0);
			if (r)
				goto error;
		}

		if ((r = store_shader(ctx, shader->gs_copy_shader)))
			goto error;
	}

	/* Store the shader in a buffer. */
	if ((r = store_shader(ctx, shader)))
		goto error;

	/* Build state. */
	switch (shader->shader.processor_type) {
	case PIPE_SHADER_TESS_CTRL:
		evergreen_update_hs_state(ctx, shader);
		break;
	case PIPE_SHADER_TESS_EVAL:
		if (key.tes.as_es)
			evergreen_update_es_state(ctx, shader);
		else
			evergreen_update_vs_state(ctx, shader);
		break;
	case PIPE_SHADER_GEOMETRY:
		if (rctx->b.gfx_level >= EVERGREEN) {
			evergreen_update_gs_state(ctx, shader);
			evergreen_update_vs_state(ctx, shader->gs_copy_shader);
		} else {
			r600_update_gs_state(ctx, shader);
			r600_update_vs_state(ctx, shader->gs_copy_shader);
		}
		break;
	case PIPE_SHADER_VERTEX:
		export_shader = key.vs.as_es;
		if (rctx->b.gfx_level >= EVERGREEN) {
			if (key.vs.as_ls)
				evergreen_update_ls_state(ctx, shader);
			else if (key.vs.as_es)
				evergreen_update_es_state(ctx, shader);
			else
				evergreen_update_vs_state(ctx, shader);
		} else {
			if (export_shader)
				r600_update_es_state(ctx, shader);
			else
				r600_update_vs_state(ctx, shader);
		}
		break;
	case PIPE_SHADER_FRAGMENT:
		if (rctx->b.gfx_level >= EVERGREEN) {
			evergreen_update_ps_state(ctx, shader);
		} else {
			r600_update_ps_state(ctx, shader);
		}
		break;
	case PIPE_SHADER_COMPUTE:
		evergreen_update_ls_state(ctx, shader);
		break;
	default:
		r = -EINVAL;
		goto error;
	}

	util_debug_message(&rctx->b.debug, SHADER_INFO, "%s shader: %d dw, %d gprs, %d alu_groups, %d loops, %d cf, %d stack",
		           _mesa_shader_stage_to_abbrev(tgsi_processor_to_shader_stage(processor)),
	                   shader->shader.bc.ndw,
	                   shader->shader.bc.ngpr,
			   shader->shader.bc.nalu_groups,
			   shader->shader.num_loops,
			   shader->shader.bc.ncf,
			   shader->shader.bc.nstack);

	return 0;

error:
	r600_pipe_shader_destroy(ctx, shader);
	return r;
}
387
/* Release all resources owned by a pipe shader: the GPU bytecode buffer,
 * the bytecode CF list (only if it was ever populated), and the command
 * buffer. Safe to call on a partially-constructed shader (error path). */
void r600_pipe_shader_destroy(struct pipe_context *ctx UNUSED, struct r600_pipe_shader *shader)
{
	r600_resource_reference(&shader->bo, NULL);
	/* Only clear bytecode whose cf list was actually linked/initialized. */
	if (list_is_linked(&shader->shader.bc.cf))
		r600_bytecode_clear(&shader->shader.bc);
	r600_release_command_buffer(&shader->command_buffer);
}
395
396/*
397 * tgsi -> r600 shader
398 */
399struct r600_shader_tgsi_instruction;
400
/* A TGSI source operand decoded into r600 terms: register/constant
 * selector plus per-channel swizzle and source modifiers. */
struct r600_shader_src {
	unsigned				sel;      /* register or special-value selector */
	unsigned				swizzle[4];
	unsigned				neg;      /* negate modifier */
	unsigned				abs;      /* absolute-value modifier */
	unsigned				rel;      /* relative (indirect) addressing */
	unsigned				kc_bank;  /* constant-cache bank */
	boolean					kc_rel; /* true if cache bank is indexed */
	uint32_t				value[4]; /* literal payload when sel is a literal */
};
411
/* Per interpolator-class state on evergreen: whether any input uses this
 * class and which i/j barycentric pair was assigned to it. */
struct eg_interp {
	boolean					enabled;
	unsigned				ij_index;
};
416
/* All per-compilation state of the TGSI -> r600 bytecode translator. */
struct r600_shader_ctx {
	struct tgsi_shader_info			info;
	struct tgsi_array_info			*array_infos;
	/* flag for each tgsi temp array if its been spilled or not */
	bool					*spilled_arrays;
	struct tgsi_parse_context		parse;
	const struct tgsi_token			*tokens;
	unsigned				type;          /* PIPE_SHADER_* stage being compiled */
	unsigned				file_offset[TGSI_FILE_COUNT]; /* GPR base per TGSI register file */
	unsigned				temp_reg;      /* first GPR reserved for driver temps */
	const struct r600_shader_tgsi_instruction	*inst_info;
	struct r600_bytecode			*bc;
	struct r600_shader			*shader;
	struct r600_shader_src			src[4];
	uint32_t				*literals;
	uint32_t				nliterals;
	uint32_t				max_driver_temp_used;
	/* needed for evergreen interpolation */
	struct eg_interp		eg_interpolators[6]; // indexed by Persp/Linear * 3 + sample/center/centroid
	/* evergreen/cayman also store sample mask in face register */
	int					face_gpr;
	/* sample id is .w component stored in fixed point position register */
	int					fixed_pt_position_gpr;
	int					colors_used;
	boolean                 clip_vertex_write;
	unsigned                cv_output;
	unsigned		edgeflag_output;
	int					helper_invoc_reg;
	int                                     cs_block_size_reg;
	int                                     cs_grid_size_reg;
	bool cs_block_size_loaded, cs_grid_size_loaded;
	int					fragcoord_input;
	int					next_ring_offset;
	int					gs_out_ring_offset;
	int					gs_next_vertex;
	struct r600_shader	*gs_for_vs;
	int					gs_export_gpr_tregs[4];
	int                                     gs_rotated_input[2];
	const struct pipe_stream_output_info	*gs_stream_output_info;
	unsigned				enabled_stream_buffers_mask;
	unsigned                                tess_input_info; /* temp with tess input offsets */
	unsigned                                tess_output_info; /* temp with tess input offsets */
	unsigned                                thread_id_gpr; /* temp with thread id calculated for images */
};
461
/* Dispatch-table entry: hardware opcode plus the emit callback that
 * translates one TGSI instruction of this kind. */
struct r600_shader_tgsi_instruction {
	unsigned	op;
	int (*process)(struct r600_shader_ctx *ctx);
};
466
467static int emit_gs_ring_writes(struct r600_shader_ctx *ctx, const struct pipe_stream_output_info *so, int stream, bool ind);
468static const struct r600_shader_tgsi_instruction r600_shader_tgsi_instruction[], eg_shader_tgsi_instruction[], cm_shader_tgsi_instruction[];
469static int tgsi_helper_tempx_replicate(struct r600_shader_ctx *ctx);
470static inline int callstack_push(struct r600_shader_ctx *ctx, unsigned reason);
471static void fc_pushlevel(struct r600_shader_ctx *ctx, int type);
472static int tgsi_else(struct r600_shader_ctx *ctx);
473static int tgsi_endif(struct r600_shader_ctx *ctx);
474static int tgsi_bgnloop(struct r600_shader_ctx *ctx);
475static int tgsi_endloop(struct r600_shader_ctx *ctx);
476static int tgsi_loop_brk_cont(struct r600_shader_ctx *ctx);
477static int tgsi_fetch_rel_const(struct r600_shader_ctx *ctx,
478                                unsigned int cb_idx, unsigned cb_rel, unsigned int offset, unsigned ar_chan,
479                                unsigned int dst_reg);
480static void r600_bytecode_src(struct r600_bytecode_alu_src *bc_src,
481			const struct r600_shader_src *shader_src,
482			unsigned chan);
483static int do_lds_fetch_values(struct r600_shader_ctx *ctx, unsigned temp_reg,
484			       unsigned dst_reg, unsigned mask);
485
486static bool ctx_needs_stack_workaround_8xx(struct r600_shader_ctx *ctx)
487{
488	if (ctx->bc->family == CHIP_HEMLOCK ||
489	    ctx->bc->family == CHIP_CYPRESS ||
490	    ctx->bc->family == CHIP_JUNIPER)
491		return false;
492	return true;
493}
494
/* Return the highest channel index (0..3) set in the writemask,
 * or 0 when the mask is empty. */
static int tgsi_last_instruction(unsigned writemask)
{
	int chan;

	for (chan = 3; chan > 0; chan--) {
		if (writemask & (1 << chan))
			return chan;
	}
	return 0;
}
506
/* Validate the current parsed TGSI instruction against what this backend
 * supports: at most one destination (except DFRACEXP), and 2D (dimensioned)
 * register access only for the file/stage combinations handled below.
 * Returns 0 if supported, -EINVAL otherwise. */
static int tgsi_is_supported(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *i = &ctx->parse.FullToken.FullInstruction;
	unsigned j;

	if (i->Instruction.NumDstRegs > 1 && i->Instruction.Opcode != TGSI_OPCODE_DFRACEXP) {
		R600_ERR("too many dst (%d)\n", i->Instruction.NumDstRegs);
		return -EINVAL;
	}
#if 0
	if (i->Instruction.Label) {
		R600_ERR("label unsupported\n");
		return -EINVAL;
	}
#endif
	for (j = 0; j < i->Instruction.NumSrcRegs; j++) {
		if (i->Src[j].Register.Dimension) {
			/* The case order matters: INPUT falls through to the
			 * OUTPUT check, and both fall through to the error
			 * default when the stage doesn't allow them. */
			switch (i->Src[j].Register.File) {
			case TGSI_FILE_CONSTANT:
			case TGSI_FILE_HW_ATOMIC:
				break;
			case TGSI_FILE_INPUT:
				if (ctx->type == PIPE_SHADER_GEOMETRY ||
				    ctx->type == PIPE_SHADER_TESS_CTRL ||
				    ctx->type == PIPE_SHADER_TESS_EVAL)
					break;
				FALLTHROUGH;
			case TGSI_FILE_OUTPUT:
				if (ctx->type == PIPE_SHADER_TESS_CTRL)
					break;
				FALLTHROUGH;
			default:
				R600_ERR("unsupported src %d (file %d, dimension %d)\n", j,
					 i->Src[j].Register.File,
					 i->Src[j].Register.Dimension);
				return -EINVAL;
			}
		}
	}
	/* Dimensioned destinations are only meaningful for TCS per-vertex outputs. */
	for (j = 0; j < i->Instruction.NumDstRegs; j++) {
		if (i->Dst[j].Register.Dimension) {
			if (ctx->type == PIPE_SHADER_TESS_CTRL)
				continue;
			R600_ERR("unsupported dst (dimension)\n");
			return -EINVAL;
		}
	}
	return 0;
}
556
557int eg_get_interpolator_index(unsigned interpolate, unsigned location)
558{
559	if (interpolate == TGSI_INTERPOLATE_COLOR ||
560		interpolate == TGSI_INTERPOLATE_LINEAR ||
561		interpolate == TGSI_INTERPOLATE_PERSPECTIVE)
562	{
563		int is_linear = interpolate == TGSI_INTERPOLATE_LINEAR;
564		int loc;
565
566		switch(location) {
567		case TGSI_INTERPOLATE_LOC_CENTER:
568			loc = 1;
569			break;
570		case TGSI_INTERPOLATE_LOC_CENTROID:
571			loc = 2;
572			break;
573		case TGSI_INTERPOLATE_LOC_SAMPLE:
574		default:
575			loc = 0; break;
576		}
577
578		return is_linear * 3 + loc;
579	}
580
581	return -1;
582}
583
584static void evergreen_interp_assign_ij_index(struct r600_shader_ctx *ctx,
585		int input)
586{
587	int i = eg_get_interpolator_index(
588		ctx->shader->input[input].interpolate,
589		ctx->shader->input[input].interpolate_location);
590	assert(i >= 0);
591	ctx->shader->input[input].ij_index = ctx->eg_interpolators[i].ij_index;
592}
593
/* Emit the INTERP_ZW/INTERP_XY ALU sequence that interpolates one input
 * attribute from its i/j barycentrics on evergreen. */
static int evergreen_interp_alu(struct r600_shader_ctx *ctx, int input)
{
	int i, r;
	struct r600_bytecode_alu alu;
	int gpr = 0, base_chan = 0;
	int ij_index = ctx->shader->input[input].ij_index;

	/* work out gpr and base_chan from index */
	/* Two i/j pairs appear to be packed per GPR (ij_index/2 selects the
	 * GPR, ij_index%2 the pair) — NOTE(review): layout inferred from the
	 * arithmetic below, confirm against the ij assignment code. */
	gpr = ij_index / 2;
	base_chan = (2 * (ij_index % 2)) + 1;

	/* Eight slots: the first four issue INTERP_ZW, the last four
	 * INTERP_XY; only slots 2..5 actually write the destination GPR. */
	for (i = 0; i < 8; i++) {
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));

		if (i < 4)
			alu.op = ALU_OP2_INTERP_ZW;
		else
			alu.op = ALU_OP2_INTERP_XY;

		if ((i > 1) && (i < 6)) {
			alu.dst.sel = ctx->shader->input[input].gpr;
			alu.dst.write = 1;
		}

		alu.dst.chan = i % 4;

		/* Alternate between the j and i channel of the barycentric pair. */
		alu.src[0].sel = gpr;
		alu.src[0].chan = (base_chan - (i % 2));

		alu.src[1].sel = V_SQ_ALU_SRC_PARAM_BASE + ctx->shader->input[input].lds_pos;

		/* Fixed bank swizzle required for the interpolation ops. */
		alu.bank_swizzle_force = SQ_ALU_VEC_210;
		if ((i % 4) == 3)
			alu.last = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}
	return 0;
}
634
635static int evergreen_interp_flat(struct r600_shader_ctx *ctx, int input)
636{
637	int i, r;
638	struct r600_bytecode_alu alu;
639
640	for (i = 0; i < 4; i++) {
641		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
642
643		alu.op = ALU_OP1_INTERP_LOAD_P0;
644
645		alu.dst.sel = ctx->shader->input[input].gpr;
646		alu.dst.write = 1;
647
648		alu.dst.chan = i;
649
650		alu.src[0].sel = V_SQ_ALU_SRC_PARAM_BASE + ctx->shader->input[input].lds_pos;
651		alu.src[0].chan = i;
652
653		if (i == 3)
654			alu.last = 1;
655		r = r600_bytecode_add_alu(ctx->bc, &alu);
656		if (r)
657			return r;
658	}
659	return 0;
660}
661
662/*
663 * Special export handling in shaders
664 *
665 * shader export ARRAY_BASE for EXPORT_POS:
666 * 60 is position
667 * 61 is misc vector
668 * 62, 63 are clip distance vectors
669 *
670 * The use of the values exported in 61-63 are controlled by PA_CL_VS_OUT_CNTL:
671 * VS_OUT_MISC_VEC_ENA - enables the use of all fields in export 61
672 * USE_VTX_POINT_SIZE - point size in the X channel of export 61
673 * USE_VTX_EDGE_FLAG - edge flag in the Y channel of export 61
674 * USE_VTX_RENDER_TARGET_INDX - render target index in the Z channel of export 61
675 * USE_VTX_VIEWPORT_INDX - viewport index in the W channel of export 61
676 * USE_VTX_KILL_FLAG - kill flag in the Z channel of export 61 (mutually
677 * exclusive from render target index)
678 * VS_OUT_CCDIST0_VEC_ENA/VS_OUT_CCDIST1_VEC_ENA - enable clip distance vectors
679 *
680 *
681 * shader export ARRAY_BASE for EXPORT_PIXEL:
682 * 0-7 CB targets
683 * 61 computed Z vector
684 *
685 * The use of the values exported in the computed Z vector are controlled
686 * by DB_SHADER_CONTROL:
687 * Z_EXPORT_ENABLE - Z as a float in RED
688 * STENCIL_REF_EXPORT_ENABLE - stencil ref as int in GREEN
689 * COVERAGE_TO_MASK_ENABLE - alpha to mask in ALPHA
690 * MASK_EXPORT_ENABLE - pixel sample mask in BLUE
691 * DB_SOURCE_FORMAT - export control restrictions
692 *
693 */
694
695
696/* Map name/sid pair from tgsi to the 8-bit semantic index for SPI setup */
697static int r600_spi_sid(struct r600_shader_io * io)
698{
699	int index, name = io->name;
700
701	/* These params are handled differently, they don't need
702	 * semantic indices, so we'll use 0 for them.
703	 */
704	if (name == TGSI_SEMANTIC_POSITION ||
705	    name == TGSI_SEMANTIC_PSIZE ||
706	    name == TGSI_SEMANTIC_EDGEFLAG ||
707	    name == TGSI_SEMANTIC_FACE ||
708	    name == TGSI_SEMANTIC_SAMPLEMASK)
709		index = 0;
710	else {
711		if (name == TGSI_SEMANTIC_GENERIC) {
712			/* For generic params simply use sid from tgsi */
713			index = 9 + io->sid;
714		} else if (name == TGSI_SEMANTIC_TEXCOORD) {
715			index = io->sid;
716		} else {
717			/* For non-generic params - pack name and sid into 8 bits */
718			index = 0x80 | (name<<3) | (io->sid);
719		}
720
721		/* Make sure that all really used indices have nonzero value, so
722		 * we can just compare it to 0 later instead of comparing the name
723		 * with different values to detect special cases. */
724		index++;
725	}
726
727	return index;
728};
729
730/* we need this to get a common lds index for vs/tcs/tes input/outputs */
731int r600_get_lds_unique_index(unsigned semantic_name, unsigned index)
732{
733	switch (semantic_name) {
734	case TGSI_SEMANTIC_POSITION:
735		return 0;
736	case TGSI_SEMANTIC_PSIZE:
737		return 1;
738	case TGSI_SEMANTIC_CLIPDIST:
739		assert(index <= 1);
740		return 2 + index;
741	case TGSI_SEMANTIC_TEXCOORD:
742		return 4 + index;
743	case TGSI_SEMANTIC_COLOR:
744		return 12 + index;
745	case TGSI_SEMANTIC_BCOLOR:
746		return 14 + index;
747	case TGSI_SEMANTIC_CLIPVERTEX:
748		return 16;
749	case TGSI_SEMANTIC_GENERIC:
750		if (index <= 63-17)
751			return 17 + index;
752		else
753			/* same explanation as in the default statement,
754			 * the only user hitting this is st/nine.
755			 */
756			return 0;
757
758	/* patch indices are completely separate and thus start from 0 */
759	case TGSI_SEMANTIC_TESSOUTER:
760		return 0;
761	case TGSI_SEMANTIC_TESSINNER:
762		return 1;
763	case TGSI_SEMANTIC_PATCH:
764		return 2 + index;
765
766	default:
767		/* Don't fail here. The result of this function is only used
768		 * for LS, TCS, TES, and GS, where legacy GL semantics can't
769		 * occur, but this function is called for all vertex shaders
770		 * before it's known whether LS will be compiled or not.
771		 */
772		return 0;
773	}
774}
775
776/* turn input into interpolate on EG */
777static int evergreen_interp_input(struct r600_shader_ctx *ctx, int index)
778{
779	int r = 0;
780
781	if (ctx->shader->input[index].spi_sid) {
782		ctx->shader->input[index].lds_pos = ctx->shader->nlds++;
783		if (ctx->shader->input[index].interpolate > 0) {
784			evergreen_interp_assign_ij_index(ctx, index);
785			r = evergreen_interp_alu(ctx, index);
786		} else {
787			r = evergreen_interp_flat(ctx, index);
788		}
789	}
790	return r;
791}
792
793static int select_twoside_color(struct r600_shader_ctx *ctx, int front, int back)
794{
795	struct r600_bytecode_alu alu;
796	int i, r;
797	int gpr_front = ctx->shader->input[front].gpr;
798	int gpr_back = ctx->shader->input[back].gpr;
799
800	for (i = 0; i < 4; i++) {
801		memset(&alu, 0, sizeof(alu));
802		alu.op = ALU_OP3_CNDGT;
803		alu.is_op3 = 1;
804		alu.dst.write = 1;
805		alu.dst.sel = gpr_front;
806		alu.src[0].sel = ctx->face_gpr;
807		alu.src[1].sel = gpr_front;
808		alu.src[2].sel = gpr_back;
809
810		alu.dst.chan = i;
811		alu.src[1].chan = i;
812		alu.src[2].chan = i;
813		alu.last = (i==3);
814
815		if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
816			return r;
817	}
818
819	return 0;
820}
821
822/* execute a single slot ALU calculation */
823static int single_alu_op2(struct r600_shader_ctx *ctx, int op,
824			  int dst_sel, int dst_chan,
825			  int src0_sel, unsigned src0_chan_val,
826			  int src1_sel, unsigned src1_chan_val)
827{
828	struct r600_bytecode_alu alu;
829	int r, i;
830
831	if (ctx->bc->gfx_level == CAYMAN && op == ALU_OP2_MULLO_INT) {
832		for (i = 0; i < 4; i++) {
833			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
834			alu.op = op;
835			alu.src[0].sel = src0_sel;
836			if (src0_sel == V_SQ_ALU_SRC_LITERAL)
837				alu.src[0].value = src0_chan_val;
838			else
839				alu.src[0].chan = src0_chan_val;
840			alu.src[1].sel = src1_sel;
841			if (src1_sel == V_SQ_ALU_SRC_LITERAL)
842				alu.src[1].value = src1_chan_val;
843			else
844				alu.src[1].chan = src1_chan_val;
845			alu.dst.sel = dst_sel;
846			alu.dst.chan = i;
847			alu.dst.write = i == dst_chan;
848			alu.last = (i == 3);
849			r = r600_bytecode_add_alu(ctx->bc, &alu);
850			if (r)
851				return r;
852		}
853		return 0;
854	}
855
856	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
857	alu.op = op;
858	alu.src[0].sel = src0_sel;
859	if (src0_sel == V_SQ_ALU_SRC_LITERAL)
860		alu.src[0].value = src0_chan_val;
861	else
862		alu.src[0].chan = src0_chan_val;
863	alu.src[1].sel = src1_sel;
864	if (src1_sel == V_SQ_ALU_SRC_LITERAL)
865		alu.src[1].value = src1_chan_val;
866	else
867		alu.src[1].chan = src1_chan_val;
868	alu.dst.sel = dst_sel;
869	alu.dst.chan = dst_chan;
870	alu.dst.write = 1;
871	alu.last = 1;
872	r = r600_bytecode_add_alu(ctx->bc, &alu);
873	if (r)
874		return r;
875	return 0;
876}
877
/* Execute a single slot three-source ALU calculation. Literal source
 * selectors carry their payload in .value, register selectors in .chan.
 * NOTE(review): unlike single_alu_op2 this never sets alu.dst.write —
 * presumably because OP3 encodings have no write-mask bit and always
 * write their destination; confirm against the bytecode builder. */
static int single_alu_op3(struct r600_shader_ctx *ctx, int op,
			  int dst_sel, int dst_chan,
			  int src0_sel, unsigned src0_chan_val,
			  int src1_sel, unsigned src1_chan_val,
			  int src2_sel, unsigned src2_chan_val)
{
	struct r600_bytecode_alu alu;
	int r;

	/* validate this for other ops */
	assert(op == ALU_OP3_MULADD_UINT24 || op == ALU_OP3_CNDE_INT || op == ALU_OP3_BFE_UINT);
	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
	alu.op = op;
	alu.src[0].sel = src0_sel;
	if (src0_sel == V_SQ_ALU_SRC_LITERAL)
		alu.src[0].value = src0_chan_val;
	else
		alu.src[0].chan = src0_chan_val;
	alu.src[1].sel = src1_sel;
	if (src1_sel == V_SQ_ALU_SRC_LITERAL)
		alu.src[1].value = src1_chan_val;
	else
		alu.src[1].chan = src1_chan_val;
	alu.src[2].sel = src2_sel;
	if (src2_sel == V_SQ_ALU_SRC_LITERAL)
		alu.src[2].value = src2_chan_val;
	else
		alu.src[2].chan = src2_chan_val;
	alu.dst.sel = dst_sel;
	alu.dst.chan = dst_chan;
	alu.is_op3 = 1;
	alu.last = 1;
	r = r600_bytecode_add_alu(ctx->bc, &alu);
	if (r)
		return r;
	return 0;
}
916
917/* put it in temp_reg.x */
918static int get_lds_offset0(struct r600_shader_ctx *ctx,
919			   int rel_patch_chan,
920			   int temp_reg, bool is_patch_var)
921{
922	int r;
923
924	/* MUL temp.x, patch_stride (input_vals.x), rel_patch_id (r0.y (tcs)) */
925	/* ADD
926	   Dimension - patch0_offset (input_vals.z),
927	   Non-dim - patch0_data_offset (input_vals.w)
928	*/
929	r = single_alu_op3(ctx, ALU_OP3_MULADD_UINT24,
930			   temp_reg, 0,
931			   ctx->tess_output_info, 0,
932			   0, rel_patch_chan,
933			   ctx->tess_output_info, is_patch_var ? 3 : 2);
934	if (r)
935		return r;
936	return 0;
937}
938
939static inline int get_address_file_reg(struct r600_shader_ctx *ctx, int index)
940{
941	return index > 0 ? ctx->bc->index_reg[index - 1] : ctx->bc->ar_reg;
942}
943
944static int r600_get_temp(struct r600_shader_ctx *ctx)
945{
946	return ctx->temp_reg + ctx->max_driver_temp_used++;
947}
948
949static int vs_add_primid_output(struct r600_shader_ctx *ctx, int prim_id_sid)
950{
951	int i;
952	i = ctx->shader->noutput++;
953	ctx->shader->output[i].name = TGSI_SEMANTIC_PRIMID;
954	ctx->shader->output[i].sid = 0;
955	ctx->shader->output[i].gpr = 0;
956	ctx->shader->output[i].interpolate = TGSI_INTERPOLATE_CONSTANT;
957	ctx->shader->output[i].write_mask = 0x4;
958	ctx->shader->output[i].spi_sid = prim_id_sid;
959
960	return 0;
961}
962
963static int tgsi_barrier(struct r600_shader_ctx *ctx)
964{
965	struct r600_bytecode_alu alu;
966	int r;
967
968	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
969	alu.op = ctx->inst_info->op;
970	alu.last = 1;
971
972	r = r600_bytecode_add_alu(ctx->bc, &alu);
973	if (r)
974		return r;
975
976	/* XXX: Need to implement GWS ops to sync across wavefronts */
977
978	return 0;
979}
980
981static int tgsi_membar(struct r600_shader_ctx *ctx)
982{
983	/* Wait for any SSBO/image stores to land. */
984	return r600_bytecode_wait_acks(ctx->bc);
985}
986
987static void choose_spill_arrays(struct r600_shader_ctx *ctx, int *regno, unsigned *scratch_space_needed)
988{
989	// pick largest array and spill it, repeat until the number of temps is under limit or we run out of arrays
990	unsigned n = ctx->info.array_max[TGSI_FILE_TEMPORARY];
991	unsigned narrays_left = n;
992	bool *spilled = ctx->spilled_arrays; // assumed calloc:ed
993
994	*scratch_space_needed = 0;
995	while (*regno > 124 && narrays_left) {
996		unsigned i;
997		unsigned largest = 0;
998		unsigned largest_index = 0;
999
1000		for (i = 0; i < n; i++) {
1001			unsigned size = ctx->array_infos[i].range.Last - ctx->array_infos[i].range.First + 1;
1002			if (!spilled[i] && size > largest) {
1003				largest = size;
1004				largest_index = i;
1005			}
1006		}
1007
1008		spilled[largest_index] = true;
1009		*regno -= largest;
1010		*scratch_space_needed += largest;
1011
1012		narrays_left --;
1013	}
1014
1015	if (narrays_left == 0) {
1016		ctx->info.indirect_files &= ~(1 << TGSI_FILE_TEMPORARY);
1017	}
1018}
1019
1020/* Take spilled temp arrays into account when translating tgsi register
1021 * indexes into r600 gprs if spilled is false, or scratch array offset if
1022 * spilled is true */
static int map_tgsi_reg_index_to_r600_gpr(struct r600_shader_ctx *ctx, unsigned tgsi_reg_index, bool *spilled)
{
	unsigned i;
	/* Count of vec4 slots belonging to spilled arrays that lie below
	 * tgsi_reg_index; doubles as this array's base offset in scratch
	 * (matches the accumulation in get_spilled_array_base_and_size). */
	unsigned spilled_size = 0;

	/* NOTE(review): the early 'break' below assumes array_infos[] is
	 * sorted by ascending range.First — confirm with the producer. */
	for (i = 0; i < ctx->info.array_max[TGSI_FILE_TEMPORARY]; i++) {
		if (tgsi_reg_index >= ctx->array_infos[i].range.First && tgsi_reg_index <= ctx->array_infos[i].range.Last) {
			if (ctx->spilled_arrays[i]) {
				/* vec4 index into spilled scratch memory */
				*spilled = true;
				return tgsi_reg_index - ctx->array_infos[i].range.First + spilled_size;
			}
			else {
				/* regular GPR array */
				*spilled = false;
				return tgsi_reg_index - spilled_size + ctx->file_offset[TGSI_FILE_TEMPORARY];
			}
		}

		/* Index precedes this array's range: no later array can
		 * contain it, stop scanning. */
		if (tgsi_reg_index < ctx->array_infos[i].range.First)
			break;
		if (ctx->spilled_arrays[i]) {
			spilled_size += ctx->array_infos[i].range.Last - ctx->array_infos[i].range.First + 1;
		}
	}

	/* regular GPR index, minus the holes from spilled arrays */
	*spilled = false;

	return tgsi_reg_index - spilled_size + ctx->file_offset[TGSI_FILE_TEMPORARY];
}
1054
1055/* look up spill area base offset and array size for a spilled temp array */
static void get_spilled_array_base_and_size(struct r600_shader_ctx *ctx, unsigned tgsi_reg_index,
	unsigned *array_base, unsigned *array_size)
{
	unsigned i;
	unsigned offset = 0; /* running scratch base: sum of earlier spilled array sizes */

	for (i = 0; i < ctx->info.array_max[TGSI_FILE_TEMPORARY]; i++) {
		if (ctx->spilled_arrays[i]) {
			unsigned size = ctx->array_infos[i].range.Last - ctx->array_infos[i].range.First + 1;

			if (tgsi_reg_index >= ctx->array_infos[i].range.First && tgsi_reg_index <= ctx->array_infos[i].range.Last) {
				*array_base = offset;
				*array_size = size - 1; /* hw counts from 1 */

				return;
			}

			offset += size;
		}
	}
	/* NOTE(review): if tgsi_reg_index is in no spilled array, the output
	 * parameters are left untouched — callers presumably only call this
	 * for indices already known to be spilled; verify. */
}
1077
/* Process one TGSI declaration token: record inputs/outputs in the shader
 * tables (with per-stage bookkeeping), register GPR-resident temp arrays,
 * record HW atomic ranges, and emit fetch code for some tessellation
 * system values.  Returns 0 on success or a negative error code. */
static int tgsi_declaration(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_declaration *d = &ctx->parse.FullToken.FullDeclaration;
	int r, i, j, count = d->Range.Last - d->Range.First + 1;

	switch (d->Declaration.File) {
	case TGSI_FILE_INPUT:
		for (j = 0; j < count; j++) {
			i = ctx->shader->ninput + j;
			assert(i < ARRAY_SIZE(ctx->shader->input));
			ctx->shader->input[i].name = d->Semantic.Name;
			ctx->shader->input[i].sid = d->Semantic.Index + j;
			ctx->shader->input[i].interpolate = d->Interp.Interpolate;
			ctx->shader->input[i].interpolate_location = d->Interp.Location;
			ctx->shader->input[i].gpr = ctx->file_offset[TGSI_FILE_INPUT] + d->Range.First + j;
			if (ctx->type == PIPE_SHADER_FRAGMENT) {
				ctx->shader->input[i].spi_sid = r600_spi_sid(&ctx->shader->input[i]);
				switch (ctx->shader->input[i].name) {
				case TGSI_SEMANTIC_FACE:
					if (ctx->face_gpr != -1)
						ctx->shader->input[i].gpr = ctx->face_gpr; /* already allocated by allocate_system_value_inputs */
					else
						ctx->face_gpr = ctx->shader->input[i].gpr;
					break;
				case TGSI_SEMANTIC_COLOR:
					ctx->colors_used++;
					break;
				case TGSI_SEMANTIC_POSITION:
					ctx->fragcoord_input = i;
					break;
				case TGSI_SEMANTIC_PRIMID:
					/* set this for now */
					ctx->shader->gs_prim_id_input = true;
					ctx->shader->ps_prim_id_input = i;
					break;
				}
				/* Evergreen interpolates via explicit ALU ops, set up here. */
				if (ctx->bc->gfx_level >= EVERGREEN) {
					if ((r = evergreen_interp_input(ctx, i)))
						return r;
				}
			} else if (ctx->type == PIPE_SHADER_GEOMETRY) {
				/* FIXME probably skip inputs if they aren't passed in the ring */
				ctx->shader->input[i].ring_offset = ctx->next_ring_offset;
				ctx->next_ring_offset += 16;
				if (ctx->shader->input[i].name == TGSI_SEMANTIC_PRIMID)
					ctx->shader->gs_prim_id_input = true;
			}
		}
		ctx->shader->ninput += count;
		break;
	case TGSI_FILE_OUTPUT:
		for (j = 0; j < count; j++) {
			i = ctx->shader->noutput + j;
			assert(i < ARRAY_SIZE(ctx->shader->output));
			ctx->shader->output[i].name = d->Semantic.Name;
			ctx->shader->output[i].sid = d->Semantic.Index + j;
			ctx->shader->output[i].gpr = ctx->file_offset[TGSI_FILE_OUTPUT] + d->Range.First + j;
			ctx->shader->output[i].interpolate = d->Interp.Interpolate;
			ctx->shader->output[i].write_mask = d->Declaration.UsageMask;
			if (ctx->type == PIPE_SHADER_VERTEX ||
			    ctx->type == PIPE_SHADER_GEOMETRY ||
			    ctx->type == PIPE_SHADER_TESS_EVAL) {
				ctx->shader->output[i].spi_sid = r600_spi_sid(&ctx->shader->output[i]);
				/* Flag which "misc vector" outputs this stage writes. */
				switch (d->Semantic.Name) {
				case TGSI_SEMANTIC_CLIPDIST:
					break;
				case TGSI_SEMANTIC_PSIZE:
					ctx->shader->vs_out_misc_write = 1;
					ctx->shader->vs_out_point_size = 1;
					break;
				case TGSI_SEMANTIC_EDGEFLAG:
					ctx->shader->vs_out_misc_write = 1;
					ctx->shader->vs_out_edgeflag = 1;
					ctx->edgeflag_output = i;
					break;
				case TGSI_SEMANTIC_VIEWPORT_INDEX:
					ctx->shader->vs_out_misc_write = 1;
					ctx->shader->vs_out_viewport = 1;
					break;
				case TGSI_SEMANTIC_LAYER:
					ctx->shader->vs_out_misc_write = 1;
					ctx->shader->vs_out_layer = 1;
					break;
				case TGSI_SEMANTIC_CLIPVERTEX:
					ctx->clip_vertex_write = TRUE;
					ctx->cv_output = i;
					break;
				}
				if (ctx->type == PIPE_SHADER_GEOMETRY) {
					ctx->gs_out_ring_offset += 16;
				}
			}
		}
		ctx->shader->noutput += count;
		break;
	case TGSI_FILE_TEMPORARY:
		if (ctx->info.indirect_files & (1 << TGSI_FILE_TEMPORARY)) {
			if (d->Array.ArrayID) {
				bool spilled;
				unsigned idx = map_tgsi_reg_index_to_r600_gpr(ctx,
					d->Range.First,
					&spilled);

				/* Only arrays kept in GPRs are registered; spilled
				 * arrays live in scratch and need no GPR range. */
				if (!spilled) {
					r600_add_gpr_array(ctx->shader, idx,
						d->Range.Last - d->Range.First + 1, 0x0F);
				}
			}
		}
		break;

	case TGSI_FILE_CONSTANT:
	case TGSI_FILE_SAMPLER:
	case TGSI_FILE_SAMPLER_VIEW:
	case TGSI_FILE_ADDRESS:
	case TGSI_FILE_BUFFER:
	case TGSI_FILE_IMAGE:
	case TGSI_FILE_MEMORY:
		break;

	case TGSI_FILE_HW_ATOMIC:
		i = ctx->shader->nhwatomic_ranges;
		ctx->shader->atomics[i].start = d->Range.First;
		ctx->shader->atomics[i].end = d->Range.Last;
		ctx->shader->atomics[i].hw_idx = ctx->shader->atomic_base + ctx->shader->nhwatomic;
		ctx->shader->atomics[i].buffer_id = d->Dim.Index2D;
		ctx->shader->nhwatomic_ranges++;
		ctx->shader->nhwatomic += count;
		break;

	case TGSI_FILE_SYSTEM_VALUE:
		if (d->Semantic.Name == TGSI_SEMANTIC_SAMPLEMASK ||
			d->Semantic.Name == TGSI_SEMANTIC_SAMPLEID ||
			d->Semantic.Name == TGSI_SEMANTIC_SAMPLEPOS) {
			break; /* Already handled from allocate_system_value_inputs */
		} else if (d->Semantic.Name == TGSI_SEMANTIC_INSTANCEID) {
			break;
		} else if (d->Semantic.Name == TGSI_SEMANTIC_VERTEXID)
			break;
		else if (d->Semantic.Name == TGSI_SEMANTIC_INVOCATIONID)
			break;
		else if (d->Semantic.Name == TGSI_SEMANTIC_TESSINNER ||
			 d->Semantic.Name == TGSI_SEMANTIC_TESSOUTER) {
			/* Tess factors are fetched from LDS: compute the LDS
			 * address, add the per-semantic offset, then read all
			 * four channels into the destination GPR. */
			int param = r600_get_lds_unique_index(d->Semantic.Name, 0);
			int dreg = d->Semantic.Name == TGSI_SEMANTIC_TESSINNER ? 3 : 2;
			unsigned temp_reg = r600_get_temp(ctx);

			r = get_lds_offset0(ctx, 2, temp_reg, true);
			if (r)
				return r;

			r = single_alu_op2(ctx, ALU_OP2_ADD_INT,
					   temp_reg, 0,
					   temp_reg, 0,
					   V_SQ_ALU_SRC_LITERAL, param * 16);
			if (r)
				return r;

			do_lds_fetch_values(ctx, temp_reg, dreg, 0xf);
		}
		else if (d->Semantic.Name == TGSI_SEMANTIC_TESSCOORD) {
			/* MOV r1.x, r0.x;
			   MOV r1.y, r0.y;
			*/
			for (i = 0; i < 2; i++) {
				struct r600_bytecode_alu alu;
				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
				alu.op = ALU_OP1_MOV;
				alu.src[0].sel = 0;
				alu.src[0].chan = 0 + i;
				alu.dst.sel = 1;
				alu.dst.chan = 0 + i;
				alu.dst.write = 1;
				alu.last = (i == 1) ? 1 : 0;
				if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
					return r;
			}
			/* ADD r1.z, 1.0f, -r0.x */
			struct r600_bytecode_alu alu;
			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
			alu.op = ALU_OP2_ADD;
			alu.src[0].sel = V_SQ_ALU_SRC_1;
			alu.src[1].sel = 1;
			alu.src[1].chan = 0;
			alu.src[1].neg = 1;
			alu.dst.sel = 1;
			alu.dst.chan = 2;
			alu.dst.write = 1;
			alu.last = 1;
			if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
				return r;

			/* ADD r1.z, r1.z, -r1.y */
			/* NOTE(review): 'alu' is deliberately NOT re-memset here;
			 * the second ADD reuses the previous fields and only
			 * overwrites what differs — confirm this stays in sync
			 * if the first ADD's setup ever changes. */
			alu.op = ALU_OP2_ADD;
			alu.src[0].sel = 1;
			alu.src[0].chan = 2;
			alu.src[1].sel = 1;
			alu.src[1].chan = 1;
			alu.src[1].neg = 1;
			alu.dst.sel = 1;
			alu.dst.chan = 2;
			alu.dst.write = 1;
			alu.last = 1;
			if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
				return r;
			break;
		}
		break;
	default:
		R600_ERR("unsupported file %d declaration\n", d->Declaration.File);
		return -EINVAL;
	}
	return 0;
}
1292
/* Scan the TGSI token stream for system values (sample mask / sample id /
 * sample pos) and interpolateAt* usage, then allocate dedicated input GPRs
 * for the system values that are needed, starting at gpr_offset (plus, on
 * Evergreen, the GPRs reserved for barycentric interpolators).
 * Returns the first GPR index after all allocations. */
static int allocate_system_value_inputs(struct r600_shader_ctx *ctx, int gpr_offset)
{
	struct tgsi_parse_context parse;
	struct {
		boolean enabled;
		int *reg;               /* where to store the allocated GPR */
		unsigned name, alternate_name;
	} inputs[2] = {
		{ false, &ctx->face_gpr, TGSI_SEMANTIC_SAMPLEMASK, ~0u }, /* lives in Front Face GPR.z */

		{ false, &ctx->fixed_pt_position_gpr, TGSI_SEMANTIC_SAMPLEID, TGSI_SEMANTIC_SAMPLEPOS } /* SAMPLEID is in Fixed Point Position GPR.w */
	};
	int num_regs = 0;
	unsigned k, i;

	if (tgsi_parse_init(&parse, ctx->tokens) != TGSI_PARSE_OK) {
		return 0;
	}

	/* need to scan shader for system values and interpolateAtSample/Offset/Centroid */
	while (!tgsi_parse_end_of_tokens(&parse)) {
		tgsi_parse_token(&parse);

		if (parse.FullToken.Token.Type == TGSI_TOKEN_TYPE_INSTRUCTION) {
			const struct tgsi_full_instruction *inst = &parse.FullToken.FullInstruction;
			if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_SAMPLE ||
				inst->Instruction.Opcode == TGSI_OPCODE_INTERP_OFFSET ||
				inst->Instruction.Opcode == TGSI_OPCODE_INTERP_CENTROID)
			{
				int interpolate, location, k;

				if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_SAMPLE) {
					location = TGSI_INTERPOLATE_LOC_CENTER;
				} else if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_OFFSET) {
					location = TGSI_INTERPOLATE_LOC_CENTER;
					/* Needs sample positions, currently those are always available */
				} else {
					location = TGSI_INTERPOLATE_LOC_CENTROID;
				}

				/* Mark the interpolator this instruction needs. */
				interpolate = ctx->info.input_interpolate[inst->Src[0].Register.Index];
				k = eg_get_interpolator_index(interpolate, location);
				if (k >= 0)
					ctx->eg_interpolators[k].enabled = true;
			}
		} else if (parse.FullToken.Token.Type == TGSI_TOKEN_TYPE_DECLARATION) {
			struct tgsi_full_declaration *d = &parse.FullToken.FullDeclaration;
			if (d->Declaration.File == TGSI_FILE_SYSTEM_VALUE) {
				for (k = 0; k < ARRAY_SIZE(inputs); k++) {
					if (d->Semantic.Name == inputs[k].name ||
						d->Semantic.Name == inputs[k].alternate_name) {
						inputs[k].enabled = true;
					}
				}
			}
		}
	}

	tgsi_parse_free(&parse);

	/* Per-sample shading with a sample-mask read also needs the fixed
	 * point position register (entry 1). */
	if (ctx->info.reads_samplemask &&
	    (ctx->info.uses_linear_sample || ctx->info.uses_persp_sample)) {
		inputs[1].enabled = true;
	}

	if (ctx->bc->gfx_level >= EVERGREEN) {
		int num_baryc = 0;
		/* assign gpr to each interpolator according to priority */
		for (i = 0; i < ARRAY_SIZE(ctx->eg_interpolators); i++) {
			if (ctx->eg_interpolators[i].enabled) {
				ctx->eg_interpolators[i].ij_index = num_baryc;
				num_baryc++;
			}
		}
		/* each GPR holds two ij pairs, hence round up to pairs */
		num_baryc = (num_baryc + 1) >> 1;
		gpr_offset += num_baryc;
	}

	for (i = 0; i < ARRAY_SIZE(inputs); i++) {
		boolean enabled = inputs[i].enabled;
		int *reg = inputs[i].reg;
		unsigned name = inputs[i].name;

		if (enabled) {
			int gpr = gpr_offset + num_regs++;
			ctx->shader->nsys_inputs++;

			// add to inputs, allocate a gpr
			k = ctx->shader->ninput++;
			ctx->shader->input[k].name = name;
			ctx->shader->input[k].sid = 0;
			ctx->shader->input[k].interpolate = TGSI_INTERPOLATE_CONSTANT;
			ctx->shader->input[k].interpolate_location = TGSI_INTERPOLATE_LOC_CENTER;
			*reg = ctx->shader->input[k].gpr = gpr;
		}
	}

	return gpr_offset + num_regs;
}
1392
1393/*
1394 * for evergreen we need to scan the shader to find the number of GPRs we need to
1395 * reserve for interpolation and system values
1396 *
1397 * we need to know if we are going to emit any sample or centroid inputs
1398 * if perspective and linear are required
1399*/
1400static int evergreen_gpr_count(struct r600_shader_ctx *ctx)
1401{
1402	unsigned i;
1403
1404	memset(&ctx->eg_interpolators, 0, sizeof(ctx->eg_interpolators));
1405
1406	/*
1407	 * Could get this information from the shader info. But right now
1408	 * we interpolate all declared inputs, whereas the shader info will
1409	 * only contain the bits if the inputs are actually used, so it might
1410	 * not be safe...
1411	 */
1412	for (i = 0; i < ctx->info.num_inputs; i++) {
1413		int k;
1414		/* skip position/face/mask/sampleid */
1415		if (ctx->info.input_semantic_name[i] == TGSI_SEMANTIC_POSITION ||
1416		    ctx->info.input_semantic_name[i] == TGSI_SEMANTIC_FACE ||
1417		    ctx->info.input_semantic_name[i] == TGSI_SEMANTIC_SAMPLEMASK ||
1418		    ctx->info.input_semantic_name[i] == TGSI_SEMANTIC_SAMPLEID)
1419			continue;
1420
1421		k = eg_get_interpolator_index(
1422			ctx->info.input_interpolate[i],
1423			ctx->info.input_interpolate_loc[i]);
1424		if (k >= 0)
1425			ctx->eg_interpolators[k].enabled = TRUE;
1426	}
1427
1428	/* XXX PULL MODEL and LINE STIPPLE */
1429
1430	return allocate_system_value_inputs(ctx, 0);
1431}
1432
1433/* sample_id_sel == NULL means fetch for current sample */
/* Fetch a sample position from the R600_BUFFER_INFO constant buffer into a
 * freshly allocated temp GPR via a vertex fetch indexed by the sample id.
 * sample_id == NULL means "current sample": the id is read from the fixed
 * point position GPR's .w channel.  Returns the temp GPR number on success
 * or a negative error code from bytecode emission. */
static int load_sample_position(struct r600_shader_ctx *ctx, struct r600_shader_src *sample_id, int chan_sel)
{
	struct r600_bytecode_vtx vtx;
	int r, t1;

	t1 = r600_get_temp(ctx);

	memset(&vtx, 0, sizeof(struct r600_bytecode_vtx));
	vtx.op = FETCH_OP_VFETCH;
	vtx.buffer_id = R600_BUFFER_INFO_CONST_BUFFER;
	vtx.fetch_type = SQ_VTX_FETCH_NO_INDEX_OFFSET;
	if (sample_id == NULL) {
		assert(ctx->fixed_pt_position_gpr != -1);

		vtx.src_gpr = ctx->fixed_pt_position_gpr; // SAMPLEID is in .w;
		vtx.src_sel_x = 3;
	}
	else {
		/* Copy the requested sample id channel into the temp so the
		 * fetch can index with it from channel 0. */
		struct r600_bytecode_alu alu;

		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP1_MOV;
		r600_bytecode_src(&alu.src[0], sample_id, chan_sel);
		alu.dst.sel = t1;
		alu.dst.write = 1;
		alu.last = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;

		vtx.src_gpr = t1;
		vtx.src_sel_x = 0;
	}
	vtx.mega_fetch_count = 16;
	vtx.dst_gpr = t1;
	vtx.dst_sel_x = 0;
	vtx.dst_sel_y = 1;
	vtx.dst_sel_z = 2;
	vtx.dst_sel_w = 3;
	vtx.data_format = FMT_32_32_32_32_FLOAT;
	vtx.num_format_all = 2;
	vtx.format_comp_all = 1;
	vtx.use_const_fields = 0;
	vtx.offset = 0;
	vtx.endian = r600_endian_swap(32);
	vtx.srf_mode_all = 1; /* SRF_MODE_NO_ZERO */

	r = r600_bytecode_add_vtx(ctx->bc, &vtx);
	if (r)
		return r;

	return t1;
}
1487
/* Compute the helper-invocation flag on Evergreen: first write ~0 to the
 * helper_invoc register for all lanes, then issue a resinfo fetch in VPM
 * (valid pixel mode) that writes back only for live pixels — presumably
 * leaving ~0 in helper lanes only; verify against the ISA docs. */
static int eg_load_helper_invocation(struct r600_shader_ctx *ctx)
{
	int r;
	struct r600_bytecode_alu alu;

	/* do a vtx fetch with wqm set on the vtx fetch */
	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
	alu.op = ALU_OP1_MOV;
	alu.dst.sel = ctx->helper_invoc_reg;
	alu.dst.chan = 0;
	alu.src[0].sel = V_SQ_ALU_SRC_LITERAL;
	alu.src[0].value = 0xffffffff;
	alu.dst.write = 1;
	alu.last = 1;
	r = r600_bytecode_add_alu(ctx->bc, &alu);
	if (r)
		return r;

	/* do a vtx fetch in VPM mode */
	struct r600_bytecode_vtx vtx;
	memset(&vtx, 0, sizeof(vtx));
	vtx.op = FETCH_OP_GET_BUFFER_RESINFO;
	vtx.buffer_id = R600_BUFFER_INFO_CONST_BUFFER;
	vtx.fetch_type = SQ_VTX_FETCH_NO_INDEX_OFFSET;
	vtx.src_gpr = 0;
	vtx.mega_fetch_count = 16; /* no idea here really... */
	vtx.dst_gpr = ctx->helper_invoc_reg;
	vtx.dst_sel_x = 4;		/* constant-select overwrite of .x */
	vtx.dst_sel_y = 7;		/* SEL_Y */
	vtx.dst_sel_z = 7;		/* SEL_Z */
	vtx.dst_sel_w = 7;		/* SEL_W */
	vtx.data_format = FMT_32;
	if ((r = r600_bytecode_add_vtx_tc(ctx->bc, &vtx)))
		return r;
	ctx->bc->cf_last->vpm = 1; /* restrict the fetch to valid (non-helper) pixels */
	return 0;
}
1525
1526static int cm_load_helper_invocation(struct r600_shader_ctx *ctx)
1527{
1528	int r;
1529	struct r600_bytecode_alu alu;
1530
1531	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
1532	alu.op = ALU_OP1_MOV;
1533	alu.dst.sel = ctx->helper_invoc_reg;
1534	alu.dst.chan = 0;
1535	alu.src[0].sel = V_SQ_ALU_SRC_LITERAL;
1536	alu.src[0].value = 0xffffffff;
1537	alu.dst.write = 1;
1538	alu.last = 1;
1539	r = r600_bytecode_add_alu(ctx->bc, &alu);
1540	if (r)
1541		return r;
1542
1543	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
1544	alu.op = ALU_OP1_MOV;
1545	alu.dst.sel = ctx->helper_invoc_reg;
1546	alu.dst.chan = 0;
1547	alu.src[0].sel = V_SQ_ALU_SRC_0;
1548	alu.dst.write = 1;
1549	alu.last = 1;
1550	r = r600_bytecode_add_alu_type(ctx->bc, &alu, CF_OP_ALU_VALID_PIXEL_MODE);
1551	if (r)
1552		return r;
1553
1554	return ctx->helper_invoc_reg;
1555}
1556
1557static int load_block_grid_size(struct r600_shader_ctx *ctx, bool load_block)
1558{
1559	struct r600_bytecode_vtx vtx;
1560	int r, t1;
1561
1562	if (ctx->cs_block_size_loaded)
1563		return ctx->cs_block_size_reg;
1564	if (ctx->cs_grid_size_loaded)
1565		return ctx->cs_grid_size_reg;
1566
1567	t1 = load_block ? ctx->cs_block_size_reg : ctx->cs_grid_size_reg;
1568	struct r600_bytecode_alu alu;
1569	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
1570	alu.op = ALU_OP1_MOV;
1571	alu.src[0].sel = V_SQ_ALU_SRC_0;
1572	alu.dst.sel = t1;
1573	alu.dst.write = 1;
1574	alu.last = 1;
1575	r = r600_bytecode_add_alu(ctx->bc, &alu);
1576	if (r)
1577		return r;
1578
1579	memset(&vtx, 0, sizeof(struct r600_bytecode_vtx));
1580	vtx.op = FETCH_OP_VFETCH;
1581	vtx.buffer_id = R600_BUFFER_INFO_CONST_BUFFER;
1582	vtx.fetch_type = SQ_VTX_FETCH_NO_INDEX_OFFSET;
1583	vtx.src_gpr = t1;
1584	vtx.src_sel_x = 0;
1585
1586	vtx.mega_fetch_count = 16;
1587	vtx.dst_gpr = t1;
1588	vtx.dst_sel_x = 0;
1589	vtx.dst_sel_y = 1;
1590	vtx.dst_sel_z = 2;
1591	vtx.dst_sel_w = 7;
1592	vtx.data_format = FMT_32_32_32_32;
1593	vtx.num_format_all = 1;
1594	vtx.format_comp_all = 0;
1595	vtx.use_const_fields = 0;
1596	vtx.offset = load_block ? 0 : 16; // first element is size of buffer
1597	vtx.endian = r600_endian_swap(32);
1598	vtx.srf_mode_all = 1; /* SRF_MODE_NO_ZERO */
1599
1600	r = r600_bytecode_add_vtx(ctx->bc, &vtx);
1601	if (r)
1602		return r;
1603
1604	if (load_block)
1605		ctx->cs_block_size_loaded = true;
1606	else
1607		ctx->cs_grid_size_loaded = true;
1608	return t1;
1609}
1610
/* Translate a TGSI source operand into an r600_shader_src: copies the
 * swizzle/negate/absolute modifiers, then resolves the register file —
 * temporaries (including reads back from scratch spill memory),
 * immediates, system values (mapped to dedicated GPRs/channels), and the
 * default file-offset path.  Constant-buffer dimension info is filled in
 * last. */
static void tgsi_src(struct r600_shader_ctx *ctx,
		     const struct tgsi_full_src_register *tgsi_src,
		     struct r600_shader_src *r600_src)
{
	memset(r600_src, 0, sizeof(*r600_src));
	r600_src->swizzle[0] = tgsi_src->Register.SwizzleX;
	r600_src->swizzle[1] = tgsi_src->Register.SwizzleY;
	r600_src->swizzle[2] = tgsi_src->Register.SwizzleZ;
	r600_src->swizzle[3] = tgsi_src->Register.SwizzleW;
	r600_src->neg = tgsi_src->Register.Negate;
	r600_src->abs = tgsi_src->Register.Absolute;

	if (tgsi_src->Register.File == TGSI_FILE_TEMPORARY) {
		bool spilled;
		unsigned idx;

		idx = map_tgsi_reg_index_to_r600_gpr(ctx, tgsi_src->Register.Index, &spilled);

		if (spilled) {
			/* Spilled temp: read it back from scratch into a fresh
			 * temp GPR and source from there. */
			int reg = r600_get_temp(ctx);
			int r;

			r600_src->sel = reg;

			if (ctx->bc->gfx_level < R700) {
				/* R600: scratch reads go through a CF export-read. */
				struct r600_bytecode_output cf;

				memset(&cf, 0, sizeof(struct r600_bytecode_output));
				cf.op = CF_OP_MEM_SCRATCH;
				cf.elem_size = 3;
				cf.gpr = reg;
				cf.comp_mask = 0xF;
				cf.swizzle_x = 0;
				cf.swizzle_y = 1;
				cf.swizzle_z = 2;
				cf.swizzle_w = 3;
				cf.burst_count = 1;

				get_spilled_array_base_and_size(ctx, tgsi_src->Register.Index,
					&cf.array_base, &cf.array_size);

				if (tgsi_src->Register.Indirect) {
					cf.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_READ_IND;
					cf.index_gpr = ctx->bc->ar_reg;
				}
				else {
					cf.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_READ;
					cf.array_base += idx;
					cf.array_size = 0;
				}

				r = r600_bytecode_add_output(ctx->bc, &cf);
			}
			else {
				/* R700+: scratch reads use a dedicated vertex-fetch op. */
				struct r600_bytecode_vtx vtx;

				r600_bytecode_wait_acks(ctx->bc);

				memset(&vtx, 0, sizeof(struct r600_bytecode_vtx));
				vtx.op = FETCH_OP_READ_SCRATCH;
				vtx.dst_gpr = reg;
				vtx.uncached = 1; // Must bypass cache since prior spill written in same invocation
				vtx.elem_size = 3;
				vtx.data_format = FMT_32_32_32_32;
				vtx.num_format_all = V_038010_SQ_NUM_FORMAT_INT;
				vtx.dst_sel_x = tgsi_src->Register.SwizzleX;
				vtx.dst_sel_y = tgsi_src->Register.SwizzleY;
				vtx.dst_sel_z = tgsi_src->Register.SwizzleZ;
				vtx.dst_sel_w = tgsi_src->Register.SwizzleW;

				get_spilled_array_base_and_size(ctx, tgsi_src->Register.Index,
					&vtx.array_base, &vtx.array_size);

				if (tgsi_src->Register.Indirect) {
					vtx.indexed = 1;
					vtx.src_gpr = ctx->bc->ar_reg;
				}
				else {
					vtx.array_base += idx;
					vtx.array_size = 0;
				}

				r = r600_bytecode_add_vtx(ctx->bc, &vtx);
			}

			/* NOTE(review): emission failure is silently dropped here
			 * (void function) — the caller gets an unloaded register. */
			if (r)
				return;
		}
		else {
			if (tgsi_src->Register.Indirect)
				r600_src->rel = V_SQ_REL_RELATIVE;

			r600_src->sel = idx;
		}

		return;
	}

	if (tgsi_src->Register.File == TGSI_FILE_IMMEDIATE) {
		int index;
		/* If all four channels read the same immediate component, try
		 * to encode it as an inline HW constant instead of a literal. */
		if ((tgsi_src->Register.SwizzleX == tgsi_src->Register.SwizzleY) &&
			(tgsi_src->Register.SwizzleX == tgsi_src->Register.SwizzleZ) &&
			(tgsi_src->Register.SwizzleX == tgsi_src->Register.SwizzleW)) {

			index = tgsi_src->Register.Index * 4 + tgsi_src->Register.SwizzleX;
			r600_bytecode_special_constants(ctx->literals[index], &r600_src->sel);
			if (r600_src->sel != V_SQ_ALU_SRC_LITERAL)
				return;
		}
		index = tgsi_src->Register.Index;
		r600_src->sel = V_SQ_ALU_SRC_LITERAL;
		memcpy(r600_src->value, ctx->literals + index * 4, sizeof(r600_src->value));
	} else if (tgsi_src->Register.File == TGSI_FILE_SYSTEM_VALUE) {
		/* Map each system value to its dedicated GPR/channel.  The
		 * sel/swizzle pairs below mirror where earlier setup code
		 * placed each value. */
		if (ctx->info.system_value_semantic_name[tgsi_src->Register.Index] == TGSI_SEMANTIC_SAMPLEMASK) {
			r600_src->swizzle[0] = 2; // Z value
			r600_src->swizzle[1] = 2;
			r600_src->swizzle[2] = 2;
			r600_src->swizzle[3] = 2;
			r600_src->sel = ctx->face_gpr;
		} else if (ctx->info.system_value_semantic_name[tgsi_src->Register.Index] == TGSI_SEMANTIC_SAMPLEID) {
			r600_src->swizzle[0] = 3; // W value
			r600_src->swizzle[1] = 3;
			r600_src->swizzle[2] = 3;
			r600_src->swizzle[3] = 3;
			r600_src->sel = ctx->fixed_pt_position_gpr;
		} else if (ctx->info.system_value_semantic_name[tgsi_src->Register.Index] == TGSI_SEMANTIC_SAMPLEPOS) {
			r600_src->swizzle[0] = 0;
			r600_src->swizzle[1] = 1;
			r600_src->swizzle[2] = 4;
			r600_src->swizzle[3] = 4;
			r600_src->sel = load_sample_position(ctx, NULL, -1);
		} else if (ctx->info.system_value_semantic_name[tgsi_src->Register.Index] == TGSI_SEMANTIC_INSTANCEID) {
			r600_src->swizzle[0] = 3;
			r600_src->swizzle[1] = 3;
			r600_src->swizzle[2] = 3;
			r600_src->swizzle[3] = 3;
			r600_src->sel = 0;
		} else if (ctx->info.system_value_semantic_name[tgsi_src->Register.Index] == TGSI_SEMANTIC_VERTEXID) {
			r600_src->swizzle[0] = 0;
			r600_src->swizzle[1] = 0;
			r600_src->swizzle[2] = 0;
			r600_src->swizzle[3] = 0;
			r600_src->sel = 0;
		} else if (ctx->info.system_value_semantic_name[tgsi_src->Register.Index] == TGSI_SEMANTIC_THREAD_ID) {
			r600_src->sel = 0;
		} else if (ctx->info.system_value_semantic_name[tgsi_src->Register.Index] == TGSI_SEMANTIC_BLOCK_ID) {
			r600_src->sel = 1;
		} else if (ctx->type != PIPE_SHADER_TESS_CTRL && ctx->info.system_value_semantic_name[tgsi_src->Register.Index] == TGSI_SEMANTIC_INVOCATIONID) {
			r600_src->swizzle[0] = 3;
			r600_src->swizzle[1] = 3;
			r600_src->swizzle[2] = 3;
			r600_src->swizzle[3] = 3;
			r600_src->sel = 1;
		} else if (ctx->info.system_value_semantic_name[tgsi_src->Register.Index] == TGSI_SEMANTIC_INVOCATIONID) {
			/* reached only for TESS_CTRL (previous branch excludes it) */
			r600_src->swizzle[0] = 2;
			r600_src->swizzle[1] = 2;
			r600_src->swizzle[2] = 2;
			r600_src->swizzle[3] = 2;
			r600_src->sel = 0;
		} else if (ctx->info.system_value_semantic_name[tgsi_src->Register.Index] == TGSI_SEMANTIC_TESSCOORD) {
			r600_src->sel = 1;
		} else if (ctx->info.system_value_semantic_name[tgsi_src->Register.Index] == TGSI_SEMANTIC_TESSINNER) {
			r600_src->sel = 3;
		} else if (ctx->info.system_value_semantic_name[tgsi_src->Register.Index] == TGSI_SEMANTIC_TESSOUTER) {
			r600_src->sel = 2;
		} else if (ctx->info.system_value_semantic_name[tgsi_src->Register.Index] == TGSI_SEMANTIC_VERTICESIN) {
			r600_src->sel = ctx->tess_input_info;
			r600_src->swizzle[0] = 2;
			r600_src->swizzle[1] = 2;
			r600_src->swizzle[2] = 2;
			r600_src->swizzle[3] = 2;
		} else if (ctx->type == PIPE_SHADER_TESS_CTRL && ctx->info.system_value_semantic_name[tgsi_src->Register.Index] == TGSI_SEMANTIC_PRIMID) {
			r600_src->sel = 0;
			r600_src->swizzle[0] = 0;
			r600_src->swizzle[1] = 0;
			r600_src->swizzle[2] = 0;
			r600_src->swizzle[3] = 0;
		} else if (ctx->type == PIPE_SHADER_TESS_EVAL && ctx->info.system_value_semantic_name[tgsi_src->Register.Index] == TGSI_SEMANTIC_PRIMID) {
			r600_src->sel = 0;
			r600_src->swizzle[0] = 3;
			r600_src->swizzle[1] = 3;
			r600_src->swizzle[2] = 3;
			r600_src->swizzle[3] = 3;
		} else if (ctx->info.system_value_semantic_name[tgsi_src->Register.Index] == TGSI_SEMANTIC_GRID_SIZE) {
			r600_src->sel = load_block_grid_size(ctx, false);
		} else if (ctx->info.system_value_semantic_name[tgsi_src->Register.Index] == TGSI_SEMANTIC_BLOCK_SIZE) {
			r600_src->sel = load_block_grid_size(ctx, true);
		} else if (ctx->info.system_value_semantic_name[tgsi_src->Register.Index] == TGSI_SEMANTIC_HELPER_INVOCATION) {
			r600_src->sel = ctx->helper_invoc_reg;
			r600_src->swizzle[0] = 0;
			r600_src->swizzle[1] = 0;
			r600_src->swizzle[2] = 0;
			r600_src->swizzle[3] = 0;
		}
	} else {
		/* Default path: offset the TGSI index by the file's base GPR. */
		if (tgsi_src->Register.Indirect)
			r600_src->rel = V_SQ_REL_RELATIVE;
		r600_src->sel = tgsi_src->Register.Index;
		r600_src->sel += ctx->file_offset[tgsi_src->Register.File];
	}
	if (tgsi_src->Register.File == TGSI_FILE_CONSTANT) {
		if (tgsi_src->Register.Dimension) {
			/* 2D constant: dimension index selects the kcache bank. */
			r600_src->kc_bank = tgsi_src->Dimension.Index;
			if (tgsi_src->Dimension.Indirect) {
				r600_src->kc_rel = 1;
			}
		}
	}
}
1820
/* Fetch a relatively-addressed constant (AR register + static offset) from
 * constant buffer cb_idx into dst_reg using a vertex fetch.  cb_rel selects
 * the buffer index mode for indirectly addressed constant buffers; ar_chan
 * is the AR channel carrying the index.  Returns 0 or a negative error
 * code from bytecode emission. */
static int tgsi_fetch_rel_const(struct r600_shader_ctx *ctx,
                                unsigned int cb_idx, unsigned cb_rel, unsigned int offset, unsigned ar_chan,
                                unsigned int dst_reg)
{
	struct r600_bytecode_vtx vtx;
	unsigned int ar_reg;
	int r;

	if (offset) {
		/* Non-zero static offset: add it to the AR value first,
		 * staging the sum in dst_reg (free until the fetch lands). */
		struct r600_bytecode_alu alu;

		memset(&alu, 0, sizeof(alu));

		alu.op = ALU_OP2_ADD_INT;
		alu.src[0].sel = ctx->bc->ar_reg;
		alu.src[0].chan = ar_chan;

		alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
		alu.src[1].value = offset;

		alu.dst.sel = dst_reg;
		alu.dst.chan = ar_chan;
		alu.dst.write = 1;
		alu.last = 1;

		if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
			return r;

		ar_reg = dst_reg;
	} else {
		ar_reg = ctx->bc->ar_reg;
	}

	memset(&vtx, 0, sizeof(vtx));
	vtx.buffer_id = cb_idx;
	vtx.fetch_type = SQ_VTX_FETCH_NO_INDEX_OFFSET;
	vtx.src_gpr = ar_reg;
	vtx.src_sel_x = ar_chan;
	vtx.mega_fetch_count = 16;
	vtx.dst_gpr = dst_reg;
	vtx.dst_sel_x = 0;		/* SEL_X */
	vtx.dst_sel_y = 1;		/* SEL_Y */
	vtx.dst_sel_z = 2;		/* SEL_Z */
	vtx.dst_sel_w = 3;		/* SEL_W */
	vtx.data_format = FMT_32_32_32_32_FLOAT;
	vtx.num_format_all = 2;		/* NUM_FORMAT_SCALED */
	vtx.format_comp_all = 1;	/* FORMAT_COMP_SIGNED */
	vtx.endian = r600_endian_swap(32);
	vtx.buffer_index_mode = cb_rel; // cb_rel ? V_SQ_CF_INDEX_0 : V_SQ_CF_INDEX_NONE;

	if ((r = r600_bytecode_add_vtx(ctx->bc, &vtx)))
		return r;

	return 0;
}
1876
static int fetch_gs_input(struct r600_shader_ctx *ctx, struct tgsi_full_src_register *src, unsigned int dst_reg)
{
	/* Fetch one per-vertex GS input from the ESGS ring into dst_reg.xyzw
	 * using a vertex-fetch instruction addressed by the per-vertex ring
	 * offset the hardware passes in the GS input registers. */
	struct r600_bytecode_vtx vtx;
	int r;
	unsigned index = src->Register.Index;
	unsigned vtx_id = src->Dimension.Index;
	int offset_reg = ctx->gs_rotated_input[vtx_id / 3];
	int offset_chan = vtx_id % 3;
	int t2 = 0;

	/* offsets of per-vertex data in ESGS ring are passed to GS in R0.x, R0.y,
	 * R0.w, R1.x, R1.y, R1.z (it seems R0.z is used for PrimitiveID) */

	/* Skip over the PrimitiveID channel (R0.z) when selecting the offset. */
	if (offset_reg == ctx->gs_rotated_input[0] && offset_chan == 2)
		offset_chan = 3;

	/* Indirect addressing needs a scratch register for the computed offset. */
	if (src->Dimension.Indirect || src->Register.Indirect)
		t2 = r600_get_temp(ctx);

	if (src->Dimension.Indirect) {
		/* The vertex index itself is indirect (input[addr][...]). */
		int treg[3];
		struct r600_bytecode_alu alu;
		int r, i;
		unsigned addr_reg;
		addr_reg = get_address_file_reg(ctx, src->DimIndirect.Index);
		if (src->DimIndirect.Index > 0) {
			/* The relative GPR access below indexes via bc->ar_reg;
			 * copy the selected address register there first. */
			r = single_alu_op2(ctx, ALU_OP1_MOV,
					   ctx->bc->ar_reg, 0,
					   addr_reg, 0,
					   0, 0);
			if (r)
				return r;
		}
		/*
		   we have to put the R0.x/y/w into Rt.x Rt+1.x Rt+2.x then index reg from Rt.
		   at least this is what fglrx seems to do. */
		for (i = 0; i < 3; i++) {
			treg[i] = r600_get_temp(ctx);
		}
		r600_add_gpr_array(ctx->shader, treg[0], 3, 0x0F);

		/* Scatter the three vertex ring offsets (R0.x, R0.y, R0.w —
		 * z is PrimitiveID) into the .x of three consecutive temps. */
		for (i = 0; i < 3; i++) {
			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
			alu.op = ALU_OP1_MOV;
			alu.src[0].sel = ctx->gs_rotated_input[0];
			alu.src[0].chan = i == 2 ? 3 : i;
			alu.dst.sel = treg[i];
			alu.dst.chan = 0;
			alu.dst.write = 1;
			alu.last = 1;
			r = r600_bytecode_add_alu(ctx->bc, &alu);
			if (r)
				return r;
		}
		/* t2.x = treg[AR].x — the ring offset of the selected vertex. */
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP1_MOV;
		alu.src[0].sel = treg[0];
		alu.src[0].rel = 1;
		alu.dst.sel = t2;
		alu.dst.write = 1;
		alu.last = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
		offset_reg = t2;
		offset_chan = 0;
	}

	if (src->Register.Indirect) {
		/* The attribute index is indirect: fold the relative index
		 * into the fetch offset (attributes are 16 bytes apart, so
		 * 4 dwords per step in the MULADD below). */
		int addr_reg;
		unsigned first = ctx->info.input_array_first[src->Indirect.ArrayID];

		addr_reg = get_address_file_reg(ctx, src->Indirect.Index);

		/* pull the value from index_reg */
		r = single_alu_op2(ctx, ALU_OP2_ADD_INT,
				   t2, 1,
				   addr_reg, 0,
				   V_SQ_ALU_SRC_LITERAL, first);
		if (r)
			return r;
		/* t2.x = (first + index) * 4 + per-vertex base offset */
		r = single_alu_op3(ctx, ALU_OP3_MULADD_UINT24,
				   t2, 0,
				   t2, 1,
				   V_SQ_ALU_SRC_LITERAL, 4,
				   offset_reg, offset_chan);
		if (r)
			return r;
		offset_reg = t2;
		offset_chan = 0;
		index = src->Register.Index - first;
	}

	/* Emit the ring fetch: 16 bytes at the computed offset (+ index*16)
	 * from the GS ring constant buffer. */
	memset(&vtx, 0, sizeof(vtx));
	vtx.buffer_id = R600_GS_RING_CONST_BUFFER;
	vtx.fetch_type = SQ_VTX_FETCH_NO_INDEX_OFFSET;
	vtx.src_gpr = offset_reg;
	vtx.src_sel_x = offset_chan;
	vtx.offset = index * 16; /*bytes*/
	vtx.mega_fetch_count = 16;
	vtx.dst_gpr = dst_reg;
	vtx.dst_sel_x = 0;		/* SEL_X */
	vtx.dst_sel_y = 1;		/* SEL_Y */
	vtx.dst_sel_z = 2;		/* SEL_Z */
	vtx.dst_sel_w = 3;		/* SEL_W */
	if (ctx->bc->gfx_level >= EVERGREEN) {
		/* Evergreen+ takes the format from the buffer descriptor. */
		vtx.use_const_fields = 1;
	} else {
		vtx.data_format = FMT_32_32_32_32_FLOAT;
	}

	if ((r = r600_bytecode_add_vtx(ctx->bc, &vtx)))
		return r;

	return 0;
}
1993
1994static int tgsi_split_gs_inputs(struct r600_shader_ctx *ctx)
1995{
1996	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
1997	unsigned i;
1998
1999	for (i = 0; i < inst->Instruction.NumSrcRegs; i++) {
2000		struct tgsi_full_src_register *src = &inst->Src[i];
2001
2002		if (src->Register.File == TGSI_FILE_INPUT) {
2003			if (ctx->shader->input[src->Register.Index].name == TGSI_SEMANTIC_PRIMID) {
2004				/* primitive id is in R0.z */
2005				ctx->src[i].sel = 0;
2006				ctx->src[i].swizzle[0] = 2;
2007			}
2008		}
2009		if (src->Register.File == TGSI_FILE_INPUT && src->Register.Dimension) {
2010			int treg = r600_get_temp(ctx);
2011
2012			fetch_gs_input(ctx, src, treg);
2013			ctx->src[i].sel = treg;
2014			ctx->src[i].rel = 0;
2015		}
2016	}
2017	return 0;
2018}
2019
2020
2021/* Tessellation shaders pass outputs to the next shader using LDS.
2022 *
2023 * LS outputs = TCS(HS) inputs
2024 * TCS(HS) outputs = TES(DS) inputs
2025 *
2026 * The LDS layout is:
2027 * - TCS inputs for patch 0
2028 * - TCS inputs for patch 1
2029 * - TCS inputs for patch 2		= get_tcs_in_current_patch_offset (if RelPatchID==2)
2030 * - ...
2031 * - TCS outputs for patch 0            = get_tcs_out_patch0_offset
2032 * - Per-patch TCS outputs for patch 0  = get_tcs_out_patch0_patch_data_offset
2033 * - TCS outputs for patch 1
2034 * - Per-patch TCS outputs for patch 1
2035 * - TCS outputs for patch 2            = get_tcs_out_current_patch_offset (if RelPatchID==2)
2036 * - Per-patch TCS outputs for patch 2  = get_tcs_out_current_patch_data_offset (if RelPatchID==2)
2037 * - ...
2038 *
2039 * All three shaders VS(LS), TCS, TES share the same LDS space.
2040 */
2041/* this will return with the dw address in temp_reg.x */
static int r600_get_byte_address(struct r600_shader_ctx *ctx, int temp_reg,
				 const struct tgsi_full_dst_register *dst,
				 const struct tgsi_full_src_register *src,
				 int stride_bytes_reg, int stride_bytes_chan)
{
	/* Accumulate the LDS byte address of a tess input/output register
	 * into temp_reg.x (which must already hold the base offset).
	 * Exactly one of dst/src is non-NULL; both are handled through the
	 * same dst-register description. */
	struct tgsi_full_dst_register reg;
	ubyte *name, *index, *array_first;
	int r;
	int param;
	struct tgsi_shader_info *info = &ctx->info;
	/* Set the register description. The address computation is the same
	 * for sources and destinations. */
	if (src) {
		reg.Register.File = src->Register.File;
		reg.Register.Index = src->Register.Index;
		reg.Register.Indirect = src->Register.Indirect;
		reg.Register.Dimension = src->Register.Dimension;
		reg.Indirect = src->Indirect;
		reg.Dimension = src->Dimension;
		reg.DimIndirect = src->DimIndirect;
	} else
		reg = *dst;

	/* If the register is 2-dimensional (e.g. an array of vertices
	 * in a primitive), calculate the base address of the vertex. */
	if (reg.Register.Dimension) {
		int sel, chan;
		if (reg.Dimension.Indirect) {
			unsigned addr_reg;
			assert (reg.DimIndirect.File == TGSI_FILE_ADDRESS);

			addr_reg = get_address_file_reg(ctx, reg.DimIndirect.Index);
			/* pull the value from index_reg */
			sel = addr_reg;
			chan = 0;
		} else {
			/* Constant vertex index: passed as a literal. */
			sel = V_SQ_ALU_SRC_LITERAL;
			chan = reg.Dimension.Index;
		}

		/* temp.x += vertex_index * stride_bytes */
		r = single_alu_op3(ctx, ALU_OP3_MULADD_UINT24,
				   temp_reg, 0,
				   stride_bytes_reg, stride_bytes_chan,
				   sel, chan,
				   temp_reg, 0);
		if (r)
			return r;
	}

	/* Select the semantic tables matching the register file. */
	if (reg.Register.File == TGSI_FILE_INPUT) {
		name = info->input_semantic_name;
		index = info->input_semantic_index;
		array_first = info->input_array_first;
	} else if (reg.Register.File == TGSI_FILE_OUTPUT) {
		name = info->output_semantic_name;
		index = info->output_semantic_index;
		array_first = info->output_array_first;
	} else {
		assert(0);
		return -1;
	}
	if (reg.Register.Indirect) {
		int addr_reg;
		int first;
		/* Add the relative address of the element. */
		if (reg.Indirect.ArrayID)
			first = array_first[reg.Indirect.ArrayID];
		else
			first = reg.Register.Index;

		addr_reg = get_address_file_reg(ctx, reg.Indirect.Index);

		/* pull the value from index_reg */
		/* temp.x += relative_index * 16 (each register is 16 bytes) */
		r = single_alu_op3(ctx, ALU_OP3_MULADD_UINT24,
				   temp_reg, 0,
				   V_SQ_ALU_SRC_LITERAL, 16,
				   addr_reg, 0,
				   temp_reg, 0);
		if (r)
			return r;

		param = r600_get_lds_unique_index(name[first],
						  index[first]);

	} else {
		param = r600_get_lds_unique_index(name[reg.Register.Index],
						  index[reg.Register.Index]);
	}

	/* add to base_addr - passed in temp_reg.x */
	if (param) {
		r = single_alu_op2(ctx, ALU_OP2_ADD_INT,
				   temp_reg, 0,
				   temp_reg, 0,
				   V_SQ_ALU_SRC_LITERAL, param * 16);
		if (r)
			return r;

	}
	return 0;
}
2143
static int do_lds_fetch_values(struct r600_shader_ctx *ctx, unsigned temp_reg,
			       unsigned dst_reg, unsigned mask)
{
	/* Read up to 4 dwords from LDS into dst_reg's channels selected by
	 * mask.  temp_reg.x holds the byte address of channel 0; addresses
	 * for the remaining channels are derived at 4-byte steps.  Reads go
	 * through the LDS output queue: one LDS_READ_RET per channel, then
	 * one OQ_A_POP per channel, in the same order. */
	struct r600_bytecode_alu alu;
	int r, i, lasti;

	/* Start a fresh CF if the current ALU clause is nearly full so the
	 * read/pop pairs stay in one clause. */
	if ((ctx->bc->cf_last->ndw>>1) >= 0x60)
		ctx->bc->force_add_cf = 1;

	lasti = tgsi_last_instruction(mask);
	/* temp_reg.i = temp_reg.x + 4*i for each requested channel i > 0
	 * (channel 0 reuses the base address already in temp_reg.x). */
	for (i = 1; i <= lasti; i++) {
		if (!(mask & (1 << i)))
			continue;

		r = single_alu_op2(ctx, ALU_OP2_ADD_INT,
				   temp_reg, i,
				   temp_reg, 0,
				   V_SQ_ALU_SRC_LITERAL, 4 * i);
		if (r)
			return r;
	}
	for (i = 0; i <= lasti; i++) {
		if (!(mask & (1 << i)))
			continue;

		/* emit an LDS_READ_RET */
		memset(&alu, 0, sizeof(alu));
		alu.op = LDS_OP1_LDS_READ_RET;
		alu.src[0].sel = temp_reg;
		alu.src[0].chan = i;
		alu.src[1].sel = V_SQ_ALU_SRC_0;
		alu.src[2].sel = V_SQ_ALU_SRC_0;
		alu.dst.chan = 0;
		alu.is_lds_idx_op = true;
		alu.last = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}
	for (i = 0; i <= lasti; i++) {
		if (!(mask & (1 << i)))
			continue;

		/* then read from LDS_OQ_A_POP */
		memset(&alu, 0, sizeof(alu));

		alu.op = ALU_OP1_MOV;
		alu.src[0].sel = EG_V_SQ_ALU_SRC_LDS_OQ_A_POP;
		alu.src[0].chan = 0;
		alu.dst.sel = dst_reg;
		alu.dst.chan = i;
		alu.dst.write = 1;
		alu.last = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}
	return 0;
}
2203
2204static int fetch_mask(struct tgsi_src_register *reg)
2205{
2206	int mask = 0;
2207	mask |= 1 << reg->SwizzleX;
2208	mask |= 1 << reg->SwizzleY;
2209	mask |= 1 << reg->SwizzleZ;
2210	mask |= 1 << reg->SwizzleW;
2211	return mask;
2212}
2213
2214static int fetch_tes_input(struct r600_shader_ctx *ctx, struct tgsi_full_src_register *src, unsigned int dst_reg)
2215{
2216	int r;
2217	unsigned temp_reg = r600_get_temp(ctx);
2218
2219	r = get_lds_offset0(ctx, 2, temp_reg,
2220			    src->Register.Dimension ? false : true);
2221	if (r)
2222		return r;
2223
2224	/* the base address is now in temp.x */
2225	r = r600_get_byte_address(ctx, temp_reg,
2226				  NULL, src, ctx->tess_output_info, 1);
2227	if (r)
2228		return r;
2229
2230	r = do_lds_fetch_values(ctx, temp_reg, dst_reg, fetch_mask(&src->Register));
2231	if (r)
2232		return r;
2233	return 0;
2234}
2235
2236static int fetch_tcs_input(struct r600_shader_ctx *ctx, struct tgsi_full_src_register *src, unsigned int dst_reg)
2237{
2238	int r;
2239	unsigned temp_reg = r600_get_temp(ctx);
2240
2241	/* t.x = ips * r0.y */
2242	r = single_alu_op2(ctx, ALU_OP2_MUL_UINT24,
2243			   temp_reg, 0,
2244			   ctx->tess_input_info, 0,
2245			   0, 1);
2246
2247	if (r)
2248		return r;
2249
2250	/* the base address is now in temp.x */
2251	r = r600_get_byte_address(ctx, temp_reg,
2252				  NULL, src, ctx->tess_input_info, 1);
2253	if (r)
2254		return r;
2255
2256	r = do_lds_fetch_values(ctx, temp_reg, dst_reg, fetch_mask(&src->Register));
2257	if (r)
2258		return r;
2259	return 0;
2260}
2261
2262static int fetch_tcs_output(struct r600_shader_ctx *ctx, struct tgsi_full_src_register *src, unsigned int dst_reg)
2263{
2264	int r;
2265	unsigned temp_reg = r600_get_temp(ctx);
2266
2267	r = get_lds_offset0(ctx, 1, temp_reg,
2268			    src->Register.Dimension ? false : true);
2269	if (r)
2270		return r;
2271	/* the base address is now in temp.x */
2272	r = r600_get_byte_address(ctx, temp_reg,
2273				  NULL, src,
2274				  ctx->tess_output_info, 1);
2275	if (r)
2276		return r;
2277
2278	r = do_lds_fetch_values(ctx, temp_reg, dst_reg, fetch_mask(&src->Register));
2279	if (r)
2280		return r;
2281	return 0;
2282}
2283
2284static int tgsi_split_lds_inputs(struct r600_shader_ctx *ctx)
2285{
2286	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
2287	unsigned i;
2288
2289	for (i = 0; i < inst->Instruction.NumSrcRegs; i++) {
2290		struct tgsi_full_src_register *src = &inst->Src[i];
2291
2292		if (ctx->type == PIPE_SHADER_TESS_EVAL && src->Register.File == TGSI_FILE_INPUT) {
2293			int treg = r600_get_temp(ctx);
2294			fetch_tes_input(ctx, src, treg);
2295			ctx->src[i].sel = treg;
2296			ctx->src[i].rel = 0;
2297		}
2298		if (ctx->type == PIPE_SHADER_TESS_CTRL && src->Register.File == TGSI_FILE_INPUT) {
2299			int treg = r600_get_temp(ctx);
2300			fetch_tcs_input(ctx, src, treg);
2301			ctx->src[i].sel = treg;
2302			ctx->src[i].rel = 0;
2303		}
2304		if (ctx->type == PIPE_SHADER_TESS_CTRL && src->Register.File == TGSI_FILE_OUTPUT) {
2305			int treg = r600_get_temp(ctx);
2306			fetch_tcs_output(ctx, src, treg);
2307			ctx->src[i].sel = treg;
2308			ctx->src[i].rel = 0;
2309		}
2310	}
2311	return 0;
2312}
2313
static int tgsi_split_constant(struct r600_shader_ctx *ctx)
{
	/* Copy constant-file sources into temps so that at most one
	 * constant source remains per instruction (the last one, j == 0,
	 * is left in place).  Relatively-addressed constants are always
	 * resolved into a temp via tgsi_fetch_rel_const. */
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	struct r600_bytecode_alu alu;
	int i, j, k, nconst, r;

	/* Count constant sources and translate all sources while at it. */
	for (i = 0, nconst = 0; i < inst->Instruction.NumSrcRegs; i++) {
		if (inst->Src[i].Register.File == TGSI_FILE_CONSTANT) {
			nconst++;
		}
		tgsi_src(ctx, &inst->Src[i], &ctx->src[i]);
	}
	for (i = 0, j = nconst - 1; i < inst->Instruction.NumSrcRegs; i++) {
		if (inst->Src[i].Register.File != TGSI_FILE_CONSTANT) {
			continue;
		}

		if (ctx->src[i].rel) {
			/* Indirectly addressed constant: fetch the addressed
			 * element (sel is biased by 512 for the const file). */
			int chan = inst->Src[i].Indirect.Swizzle;
			int treg = r600_get_temp(ctx);
			if ((r = tgsi_fetch_rel_const(ctx, ctx->src[i].kc_bank, ctx->src[i].kc_rel, ctx->src[i].sel - 512, chan, treg)))
				return r;

			ctx->src[i].kc_bank = 0;
			ctx->src[i].kc_rel = 0;
			ctx->src[i].sel = treg;
			ctx->src[i].rel = 0;
			j--;
		} else if (j > 0) {
			/* Copy all 4 channels of the constant to a temp;
			 * the final constant source (j == 0) stays in the
			 * constant file. */
			int treg = r600_get_temp(ctx);
			for (k = 0; k < 4; k++) {
				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
				alu.op = ALU_OP1_MOV;
				alu.src[0].sel = ctx->src[i].sel;
				alu.src[0].chan = k;
				alu.src[0].rel = ctx->src[i].rel;
				alu.src[0].kc_bank = ctx->src[i].kc_bank;
				alu.src[0].kc_rel = ctx->src[i].kc_rel;
				alu.dst.sel = treg;
				alu.dst.chan = k;
				alu.dst.write = 1;
				if (k == 3)
					alu.last = 1;
				r = r600_bytecode_add_alu(ctx->bc, &alu);
				if (r)
					return r;
			}
			ctx->src[i].sel = treg;
			ctx->src[i].rel =0;
			j--;
		}
	}
	return 0;
}
2368
2369/* need to move any immediate into a temp - for trig functions which use literal for PI stuff */
static int tgsi_split_literal_constant(struct r600_shader_ctx *ctx)
{
	/* Copy literal sources into temps so at most one literal source
	 * remains per instruction (the last one, j == 0, stays inline). */
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	struct r600_bytecode_alu alu;
	int i, j, k, nliteral, r;

	for (i = 0, nliteral = 0; i < inst->Instruction.NumSrcRegs; i++) {
		if (ctx->src[i].sel == V_SQ_ALU_SRC_LITERAL) {
			nliteral++;
		}
	}
	for (i = 0, j = nliteral - 1; i < inst->Instruction.NumSrcRegs; i++) {
		if (j > 0 && ctx->src[i].sel == V_SQ_ALU_SRC_LITERAL) {
			/* MOV all four literal channels into a fresh temp. */
			int treg = r600_get_temp(ctx);
			for (k = 0; k < 4; k++) {
				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
				alu.op = ALU_OP1_MOV;
				alu.src[0].sel = ctx->src[i].sel;
				alu.src[0].chan = k;
				alu.src[0].value = ctx->src[i].value[k];
				alu.dst.sel = treg;
				alu.dst.chan = k;
				alu.dst.write = 1;
				if (k == 3)
					alu.last = 1;
				r = r600_bytecode_add_alu(ctx->bc, &alu);
				if (r)
					return r;
			}
			ctx->src[i].sel = treg;
			j--;
		}
	}
	return 0;
}
2405
2406static int process_twoside_color_inputs(struct r600_shader_ctx *ctx)
2407{
2408	int i, r, count = ctx->shader->ninput;
2409
2410	for (i = 0; i < count; i++) {
2411		if (ctx->shader->input[i].name == TGSI_SEMANTIC_COLOR) {
2412			r = select_twoside_color(ctx, i, ctx->shader->input[i].back_color_input);
2413			if (r)
2414				return r;
2415		}
2416	}
2417	return 0;
2418}
2419
/* Emit the MEM_STREAM writes for transform feedback.  stream == -1 means
 * "all streams" (used by the GS copy shader when only ring 0 is active). */
static int emit_streamout(struct r600_shader_ctx *ctx, struct pipe_stream_output_info *so,
						  int stream, unsigned *stream_item_size UNUSED)
{
	unsigned so_gpr[PIPE_MAX_SHADER_OUTPUTS];
	unsigned start_comp[PIPE_MAX_SHADER_OUTPUTS];
	int j, r;
	unsigned i;

	/* Sanity checking. */
	if (so->num_outputs > PIPE_MAX_SO_OUTPUTS) {
		R600_ERR("Too many stream outputs: %d\n", so->num_outputs);
		r = -EINVAL;
		goto out_err;
	}
	for (i = 0; i < so->num_outputs; i++) {
		if (so->output[i].output_buffer >= 4) {
			R600_ERR("Exceeded the max number of stream output buffers, got: %d\n",
				 so->output[i].output_buffer);
			r = -EINVAL;
			goto out_err;
		}
	}

	/* Initialize locations where the outputs are stored. */
	for (i = 0; i < so->num_outputs; i++) {

		so_gpr[i] = ctx->shader->output[so->output[i].register_index].gpr;
		start_comp[i] = so->output[i].start_component;
		/* Lower outputs with dst_offset < start_component.
		 *
		 * We can only output 4D vectors with a write mask, e.g. we can
		 * only output the W component at offset 3, etc. If we want
		 * to store Y, Z, or W at buffer offset 0, we need to use MOV
		 * to move it to X and output X. */
		if (so->output[i].dst_offset < so->output[i].start_component) {
			unsigned tmp = r600_get_temp(ctx);

			/* Shift the components down to start at X in a temp. */
			for (j = 0; j < so->output[i].num_components; j++) {
				struct r600_bytecode_alu alu;
				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
				alu.op = ALU_OP1_MOV;
				alu.src[0].sel = so_gpr[i];
				alu.src[0].chan = so->output[i].start_component + j;

				alu.dst.sel = tmp;
				alu.dst.chan = j;
				alu.dst.write = 1;
				if (j == so->output[i].num_components - 1)
					alu.last = 1;
				r = r600_bytecode_add_alu(ctx->bc, &alu);
				if (r)
					return r;
			}
			start_comp[i] = 0;
			so_gpr[i] = tmp;
		}
	}

	/* Write outputs to buffers. */
	for (i = 0; i < so->num_outputs; i++) {
		struct r600_bytecode_output output;

		/* With an explicit stream, skip outputs of other streams. */
		if (stream != -1 && stream != so->output[i].stream)
			continue;

		memset(&output, 0, sizeof(struct r600_bytecode_output));
		output.gpr = so_gpr[i];
		output.elem_size = so->output[i].num_components - 1;
		if (output.elem_size == 2)
			output.elem_size = 3; // 3 not supported, write 4 with junk at end
		output.array_base = so->output[i].dst_offset - start_comp[i];
		output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_WRITE;
		output.burst_count = 1;
		/* array_size is an upper limit for the burst_count
		 * with MEM_STREAM instructions */
		output.array_size = 0xFFF;
		output.comp_mask = ((1 << so->output[i].num_components) - 1) << start_comp[i];

		if (ctx->bc->gfx_level >= EVERGREEN) {
			/* Evergreen+: per-stream, per-buffer ops, laid out as
			 * STREAM0_BUF0..3, STREAM1_BUF0..3, ... (stride 4). */
			switch (so->output[i].output_buffer) {
			case 0:
				output.op = CF_OP_MEM_STREAM0_BUF0;
				break;
			case 1:
				output.op = CF_OP_MEM_STREAM0_BUF1;
				break;
			case 2:
				output.op = CF_OP_MEM_STREAM0_BUF2;
				break;
			case 3:
				output.op = CF_OP_MEM_STREAM0_BUF3;
				break;
			}
			output.op += so->output[i].stream * 4;
			assert(output.op >= CF_OP_MEM_STREAM0_BUF0 && output.op <= CF_OP_MEM_STREAM3_BUF3);
			ctx->enabled_stream_buffers_mask |= (1 << so->output[i].output_buffer) << so->output[i].stream * 4;
		} else {
			/* R600/R700: only stream 0, one op per buffer. */
			switch (so->output[i].output_buffer) {
			case 0:
				output.op = CF_OP_MEM_STREAM0;
				break;
			case 1:
				output.op = CF_OP_MEM_STREAM1;
				break;
			case 2:
				output.op = CF_OP_MEM_STREAM2;
				break;
			case 3:
				output.op = CF_OP_MEM_STREAM3;
					break;
			}
			ctx->enabled_stream_buffers_mask |= 1 << so->output[i].output_buffer;
		}
		r = r600_bytecode_add_output(ctx->bc, &output);
		if (r)
			goto out_err;
	}
	return 0;
out_err:
	return r;
}
2541
2542static void convert_edgeflag_to_int(struct r600_shader_ctx *ctx)
2543{
2544	struct r600_bytecode_alu alu;
2545	unsigned reg;
2546
2547	if (!ctx->shader->vs_out_edgeflag)
2548		return;
2549
2550	reg = ctx->shader->output[ctx->edgeflag_output].gpr;
2551
2552	/* clamp(x, 0, 1) */
2553	memset(&alu, 0, sizeof(alu));
2554	alu.op = ALU_OP1_MOV;
2555	alu.src[0].sel = reg;
2556	alu.dst.sel = reg;
2557	alu.dst.write = 1;
2558	alu.dst.clamp = 1;
2559	alu.last = 1;
2560	r600_bytecode_add_alu(ctx->bc, &alu);
2561
2562	memset(&alu, 0, sizeof(alu));
2563	alu.op = ALU_OP1_FLT_TO_INT;
2564	alu.src[0].sel = reg;
2565	alu.dst.sel = reg;
2566	alu.dst.write = 1;
2567	alu.last = 1;
2568	r600_bytecode_add_alu(ctx->bc, &alu);
2569}
2570
/* Build the "GS copy shader": a VS-like shader that reads the vertices the
 * real GS wrote to the GSVS ring, runs streamout per stream, and exports
 * the stream-0 vertices as position/param exports. */
int generate_gs_copy_shader(struct r600_context *rctx,
                            struct r600_pipe_shader *gs,
                            struct pipe_stream_output_info *so)
{
	struct r600_shader_ctx ctx = {};
	struct r600_shader *gs_shader = &gs->shader;
	struct r600_pipe_shader *cshader;
	unsigned ocnt = gs_shader->noutput;
	struct r600_bytecode_alu alu;
	struct r600_bytecode_vtx vtx;
	struct r600_bytecode_output output;
	struct r600_bytecode_cf *cf_jump, *cf_pop,
		*last_exp_pos = NULL, *last_exp_param = NULL;
	int next_clip_pos = 61, next_param = 0;
	unsigned i, j;
	int ring;
	bool only_ring_0 = true;
	cshader = calloc(1, sizeof(struct r600_pipe_shader));
	if (!cshader)
		return 0;

	/* The copy shader exports exactly the GS's outputs. */
	memcpy(cshader->shader.output, gs_shader->output, ocnt *
	       sizeof(struct r600_shader_io));

	cshader->shader.noutput = ocnt;

	ctx.shader = &cshader->shader;
	ctx.bc = &ctx.shader->bc;
	ctx.type = ctx.bc->type = PIPE_SHADER_VERTEX;

	r600_bytecode_init(ctx.bc, rctx->b.gfx_level, rctx->b.family,
			   rctx->screen->has_compressed_msaa_texturing);

	ctx.bc->isa = rctx->isa;

	cf_jump = NULL;
	memset(cshader->shader.ring_item_sizes, 0, sizeof(cshader->shader.ring_item_sizes));

	/* R0.x carries the ring offset in its low 30 bits and the stream id
	 * in bits 30-31; split them into R0.x and R0.y. */
	/* R0.x = R0.x & 0x3fffffff */
	memset(&alu, 0, sizeof(alu));
	alu.op = ALU_OP2_AND_INT;
	alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
	alu.src[1].value = 0x3fffffff;
	alu.dst.write = 1;
	r600_bytecode_add_alu(ctx.bc, &alu);

	/* R0.y = R0.x >> 30 */
	memset(&alu, 0, sizeof(alu));
	alu.op = ALU_OP2_LSHR_INT;
	alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
	alu.src[1].value = 0x1e;
	alu.dst.chan = 1;
	alu.dst.write = 1;
	alu.last = 1;
	r600_bytecode_add_alu(ctx.bc, &alu);

	/* fetch vertex data from GSVS ring */
	for (i = 0; i < ocnt; ++i) {
		struct r600_shader_io *out = &ctx.shader->output[i];

		out->gpr = i + 1;
		out->ring_offset = i * 16;	/* 16 bytes per attribute */

		memset(&vtx, 0, sizeof(vtx));
		vtx.op = FETCH_OP_VFETCH;
		vtx.buffer_id = R600_GS_RING_CONST_BUFFER;
		vtx.fetch_type = SQ_VTX_FETCH_NO_INDEX_OFFSET;
		vtx.mega_fetch_count = 16;
		vtx.offset = out->ring_offset;
		vtx.dst_gpr = out->gpr;
		vtx.src_gpr = 0;
		vtx.dst_sel_x = 0;
		vtx.dst_sel_y = 1;
		vtx.dst_sel_z = 2;
		vtx.dst_sel_w = 3;
		if (rctx->b.gfx_level >= EVERGREEN) {
			vtx.use_const_fields = 1;
		} else {
			vtx.data_format = FMT_32_32_32_32_FLOAT;
		}

		r600_bytecode_add_vtx(ctx.bc, &vtx);
	}
	ctx.temp_reg = i + 1;
	/* Per-stream streamout: predicate on the stream id (R0.y) and jump
	 * over streams with no outputs.  Iterating 3..0 guarantees the
	 * stream-0 block is emitted last, directly before the exports. */
	for (ring = 3; ring >= 0; --ring) {
		bool enabled = false;
		for (i = 0; i < so->num_outputs; i++) {
			if (so->output[i].stream == ring) {
				enabled = true;
				if (ring > 0)
					only_ring_0 = false;
				break;
			}
		}
		if (ring != 0 && !enabled) {
			cshader->shader.ring_item_sizes[ring] = 0;
			continue;
		}

		if (cf_jump) {
			// Patch up jump label
			r600_bytecode_add_cfinst(ctx.bc, CF_OP_POP);
			cf_pop = ctx.bc->cf_last;

			cf_jump->cf_addr = cf_pop->id + 2;
			cf_jump->pop_count = 1;
			cf_pop->cf_addr = cf_pop->id + 2;
			cf_pop->pop_count = 1;
		}

		/* PRED_SETE_INT __, R0.y, ring */
		memset(&alu, 0, sizeof(alu));
		alu.op = ALU_OP2_PRED_SETE_INT;
		alu.src[0].chan = 1;
		alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
		alu.src[1].value = ring;
		alu.execute_mask = 1;
		alu.update_pred = 1;
		alu.last = 1;
		r600_bytecode_add_alu_type(ctx.bc, &alu, CF_OP_ALU_PUSH_BEFORE);

		/* Jump target is patched on the next iteration / at the end. */
		r600_bytecode_add_cfinst(ctx.bc, CF_OP_JUMP);
		cf_jump = ctx.bc->cf_last;

		if (enabled)
			emit_streamout(&ctx, so, only_ring_0 ? -1 : ring, &cshader->shader.ring_item_sizes[ring]);
		cshader->shader.ring_item_sizes[ring] = ocnt * 16;
	}

	/* bc adds nops - copy it */
	if (ctx.bc->gfx_level == R600) {
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP0_NOP;
		alu.last = 1;
		r600_bytecode_add_alu(ctx.bc, &alu);

		r600_bytecode_add_cfinst(ctx.bc, CF_OP_NOP);
	}

	/* export vertex data */
	/* XXX factor out common code with r600_shader_from_tgsi ? */
	for (i = 0; i < ocnt; ++i) {
		struct r600_shader_io *out = &ctx.shader->output[i];
		bool instream0 = true;
		if (out->name == TGSI_SEMANTIC_CLIPVERTEX)
			continue;

		/* Only export outputs that belong to stream 0. */
		for (j = 0; j < so->num_outputs; j++) {
			if (so->output[j].register_index == i) {
				if (so->output[j].stream == 0)
					break;
				if (so->output[j].stream > 0)
					instream0 = false;
			}
		}
		if (!instream0)
			continue;
		memset(&output, 0, sizeof(output));
		output.gpr = out->gpr;
		output.elem_size = 3;
		output.swizzle_x = 0;
		output.swizzle_y = 1;
		output.swizzle_z = 2;
		output.swizzle_w = 3;
		output.burst_count = 1;
		output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PARAM;
		output.op = CF_OP_EXPORT;
		/* Position exports use array_base 60; misc (psize/layer/
		 * viewport) go to 61, clip distances to 61/62 via
		 * next_clip_pos; everything else is a PARAM export. */
		switch (out->name) {
		case TGSI_SEMANTIC_POSITION:
			output.array_base = 60;
			output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS;
			break;

		case TGSI_SEMANTIC_PSIZE:
			output.array_base = 61;
			if (next_clip_pos == 61)
				next_clip_pos = 62;
			output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS;
			output.swizzle_y = 7;
			output.swizzle_z = 7;
			output.swizzle_w = 7;
			ctx.shader->vs_out_misc_write = 1;
			ctx.shader->vs_out_point_size = 1;
			break;
		case TGSI_SEMANTIC_LAYER:
			if (out->spi_sid) {
				/* duplicate it as PARAM to pass to the pixel shader */
				output.array_base = next_param++;
				r600_bytecode_add_output(ctx.bc, &output);
				last_exp_param = ctx.bc->cf_last;
			}
			output.array_base = 61;
			if (next_clip_pos == 61)
				next_clip_pos = 62;
			output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS;
			output.swizzle_x = 7;
			output.swizzle_y = 7;
			output.swizzle_z = 0;
			output.swizzle_w = 7;
			ctx.shader->vs_out_misc_write = 1;
			ctx.shader->vs_out_layer = 1;
			break;
		case TGSI_SEMANTIC_VIEWPORT_INDEX:
			if (out->spi_sid) {
				/* duplicate it as PARAM to pass to the pixel shader */
				output.array_base = next_param++;
				r600_bytecode_add_output(ctx.bc, &output);
				last_exp_param = ctx.bc->cf_last;
			}
			output.array_base = 61;
			if (next_clip_pos == 61)
				next_clip_pos = 62;
			output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS;
			ctx.shader->vs_out_misc_write = 1;
			ctx.shader->vs_out_viewport = 1;
			output.swizzle_x = 7;
			output.swizzle_y = 7;
			output.swizzle_z = 7;
			output.swizzle_w = 0;
			break;
		case TGSI_SEMANTIC_CLIPDIST:
			/* spi_sid is 0 for clipdistance outputs that were generated
			 * for clipvertex - we don't need to pass them to PS */
			ctx.shader->clip_dist_write = gs->shader.clip_dist_write;
			ctx.shader->cull_dist_write = gs->shader.cull_dist_write;
			ctx.shader->cc_dist_mask = gs->shader.cc_dist_mask;
			if (out->spi_sid) {
				/* duplicate it as PARAM to pass to the pixel shader */
				output.array_base = next_param++;
				r600_bytecode_add_output(ctx.bc, &output);
				last_exp_param = ctx.bc->cf_last;
			}
			output.array_base = next_clip_pos++;
			output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS;
			break;
		case TGSI_SEMANTIC_FOG:
			output.swizzle_y = 4; /* 0 */
			output.swizzle_z = 4; /* 0 */
			output.swizzle_w = 5; /* 1 */
			break;
		default:
			output.array_base = next_param++;
			break;
		}
		r600_bytecode_add_output(ctx.bc, &output);
		if (output.type == V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PARAM)
			last_exp_param = ctx.bc->cf_last;
		else
			last_exp_pos = ctx.bc->cf_last;
	}

	/* The HW needs at least one POS and one PARAM export; emit dummy
	 * (all-masked) ones if the loop above produced none. */
	if (!last_exp_pos) {
		memset(&output, 0, sizeof(output));
		output.gpr = 0;
		output.elem_size = 3;
		output.swizzle_x = 7;
		output.swizzle_y = 7;
		output.swizzle_z = 7;
		output.swizzle_w = 7;
		output.burst_count = 1;
		output.type = 2;
		output.op = CF_OP_EXPORT;
		output.array_base = 60;
		output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS;
		r600_bytecode_add_output(ctx.bc, &output);
		last_exp_pos = ctx.bc->cf_last;
	}

	if (!last_exp_param) {
		memset(&output, 0, sizeof(output));
		output.gpr = 0;
		output.elem_size = 3;
		output.swizzle_x = 7;
		output.swizzle_y = 7;
		output.swizzle_z = 7;
		output.swizzle_w = 7;
		output.burst_count = 1;
		output.type = 2;
		output.op = CF_OP_EXPORT;
		output.array_base = next_param++;
		output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PARAM;
		r600_bytecode_add_output(ctx.bc, &output);
		last_exp_param = ctx.bc->cf_last;
	}

	/* Mark the final export of each kind as DONE. */
	last_exp_pos->op = CF_OP_EXPORT_DONE;
	last_exp_param->op = CF_OP_EXPORT_DONE;

	/* Patch the last pending jump (from the ring-0 block) past the POP. */
	r600_bytecode_add_cfinst(ctx.bc, CF_OP_POP);
	cf_pop = ctx.bc->cf_last;

	cf_jump->cf_addr = cf_pop->id + 2;
	cf_jump->pop_count = 1;
	cf_pop->cf_addr = cf_pop->id + 2;
	cf_pop->pop_count = 1;

	if (ctx.bc->gfx_level == CAYMAN)
		cm_bytecode_add_cf_end(ctx.bc);
	else {
		r600_bytecode_add_cfinst(ctx.bc, CF_OP_NOP);
		ctx.bc->cf_last->end_of_program = 1;
	}

	gs->gs_copy_shader = cshader;
	cshader->enabled_stream_buffers_mask = ctx.enabled_stream_buffers_mask;

	/* One level of push/pop is used by the per-stream predication. */
	ctx.bc->nstack = 1;

	return r600_bytecode_build(ctx.bc);
}
2881
2882static int emit_inc_ring_offset(struct r600_shader_ctx *ctx, int idx, bool ind)
2883{
2884	if (ind) {
2885		struct r600_bytecode_alu alu;
2886		int r;
2887
2888		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2889		alu.op = ALU_OP2_ADD_INT;
2890		alu.src[0].sel = ctx->gs_export_gpr_tregs[idx];
2891		alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
2892		alu.src[1].value = ctx->gs_out_ring_offset >> 4;
2893		alu.dst.sel = ctx->gs_export_gpr_tregs[idx];
2894		alu.dst.write = 1;
2895		alu.last = 1;
2896		r = r600_bytecode_add_alu(ctx->bc, &alu);
2897		if (r)
2898			return r;
2899	}
2900	return 0;
2901}
2902
/* Write the shader's outputs to the ESGS/GSVS ring for the given stream.
 * ind selects indirect addressing via the per-stream export offset
 * register; otherwise the offset is folded in statically per vertex. */
static int emit_gs_ring_writes(struct r600_shader_ctx *ctx, const struct pipe_stream_output_info *so UNUSED, int stream, bool ind)
{
	struct r600_bytecode_output output;
	int ring_offset;
	unsigned i, k;
	int effective_stream = stream == -1 ? 0 : stream;
	int idx = 0;

	for (i = 0; i < ctx->shader->noutput; i++) {
		if (ctx->gs_for_vs) {
			/* for ES we need to lookup corresponding ring offset expected by GS
			 * (map this output to GS input by name and sid) */
			/* FIXME precompute offsets */
			ring_offset = -1;
			for(k = 0; k < ctx->gs_for_vs->ninput; ++k) {
				struct r600_shader_io *in = &ctx->gs_for_vs->input[k];
				struct r600_shader_io *out = &ctx->shader->output[i];
				if (in->name == out->name && in->sid == out->sid)
					ring_offset = in->ring_offset;
			}

			/* Output not consumed by the GS: skip the write. */
			if (ring_offset == -1)
				continue;
		} else {
			/* GS itself: dense layout, 16 bytes per output. */
			ring_offset = idx * 16;
			idx++;
		}

		if (stream > 0 && ctx->shader->output[i].name == TGSI_SEMANTIC_POSITION)
			continue;
		/* next_ring_offset after parsing input decls contains total size of
		 * single vertex data, gs_next_vertex - current vertex index */
		if (!ind)
			ring_offset += ctx->gs_out_ring_offset * ctx->gs_next_vertex;

		memset(&output, 0, sizeof(struct r600_bytecode_output));
		output.gpr = ctx->shader->output[i].gpr;
		output.elem_size = 3;
		output.comp_mask = 0xF;
		output.burst_count = 1;

		if (ind)
			output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_WRITE_IND;
		else
			output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_WRITE;

		/* One MEM_RING op per GS stream. */
		switch (stream) {
		default:
		case 0:
			output.op = CF_OP_MEM_RING; break;
		case 1:
			output.op = CF_OP_MEM_RING1; break;
		case 2:
			output.op = CF_OP_MEM_RING2; break;
		case 3:
			output.op = CF_OP_MEM_RING3; break;
		}

		if (ind) {
			output.array_base = ring_offset >> 2; /* in dwords */
			output.array_size = 0xfff;
			output.index_gpr = ctx->gs_export_gpr_tregs[effective_stream];
		} else
			output.array_base = ring_offset >> 2; /* in dwords */
		r600_bytecode_add_output(ctx->bc, &output);
	}

	++ctx->gs_next_vertex;
	return 0;
}
2973
2974
2975static int r600_fetch_tess_io_info(struct r600_shader_ctx *ctx)
2976{
2977	int r;
2978	struct r600_bytecode_vtx vtx;
2979	int temp_val = ctx->temp_reg;
2980	/* need to store the TCS output somewhere */
2981	r = single_alu_op2(ctx, ALU_OP1_MOV,
2982			   temp_val, 0,
2983			   V_SQ_ALU_SRC_LITERAL, 0,
2984			   0, 0);
2985	if (r)
2986		return r;
2987
2988	/* used by VS/TCS */
2989	if (ctx->tess_input_info) {
2990		/* fetch tcs input values into resv space */
2991		memset(&vtx, 0, sizeof(struct r600_bytecode_vtx));
2992		vtx.op = FETCH_OP_VFETCH;
2993		vtx.buffer_id = R600_LDS_INFO_CONST_BUFFER;
2994		vtx.fetch_type = SQ_VTX_FETCH_NO_INDEX_OFFSET;
2995		vtx.mega_fetch_count = 16;
2996		vtx.data_format = FMT_32_32_32_32;
2997		vtx.num_format_all = 2;
2998		vtx.format_comp_all = 1;
2999		vtx.use_const_fields = 0;
3000		vtx.endian = r600_endian_swap(32);
3001		vtx.srf_mode_all = 1;
3002		vtx.offset = 0;
3003		vtx.dst_gpr = ctx->tess_input_info;
3004		vtx.dst_sel_x = 0;
3005		vtx.dst_sel_y = 1;
3006		vtx.dst_sel_z = 2;
3007		vtx.dst_sel_w = 3;
3008		vtx.src_gpr = temp_val;
3009		vtx.src_sel_x = 0;
3010
3011		r = r600_bytecode_add_vtx(ctx->bc, &vtx);
3012		if (r)
3013			return r;
3014	}
3015
3016	/* used by TCS/TES */
3017	if (ctx->tess_output_info) {
3018		/* fetch tcs output values into resv space */
3019		memset(&vtx, 0, sizeof(struct r600_bytecode_vtx));
3020		vtx.op = FETCH_OP_VFETCH;
3021		vtx.buffer_id = R600_LDS_INFO_CONST_BUFFER;
3022		vtx.fetch_type = SQ_VTX_FETCH_NO_INDEX_OFFSET;
3023		vtx.mega_fetch_count = 16;
3024		vtx.data_format = FMT_32_32_32_32;
3025		vtx.num_format_all = 2;
3026		vtx.format_comp_all = 1;
3027		vtx.use_const_fields = 0;
3028		vtx.endian = r600_endian_swap(32);
3029		vtx.srf_mode_all = 1;
3030		vtx.offset = 16;
3031		vtx.dst_gpr = ctx->tess_output_info;
3032		vtx.dst_sel_x = 0;
3033		vtx.dst_sel_y = 1;
3034		vtx.dst_sel_z = 2;
3035		vtx.dst_sel_w = 3;
3036		vtx.src_gpr = temp_val;
3037		vtx.src_sel_x = 0;
3038
3039		r = r600_bytecode_add_vtx(ctx->bc, &vtx);
3040		if (r)
3041			return r;
3042	}
3043	return 0;
3044}
3045
/* Store all VS outputs to LDS for a later TCS stage (VS-as-LS path).
 *
 * Each output occupies one vec4 slot (16 bytes) at an index returned by
 * r600_get_lds_unique_index(); the per-vertex base address is
 * vertex_id * vertex_dw_stride (stride read from tess_input_info.y).
 * Each vec4 is written as two two-dword LDS_WRITE_REL ops (.xy then .zw).
 * Returns 0 on success or an error from bytecode emission.
 */
static int emit_lds_vs_writes(struct r600_shader_ctx *ctx)
{
	int j, r;
	int temp_reg;
	unsigned i;

	/* fetch tcs input values into input_vals */
	ctx->tess_input_info = r600_get_temp(ctx);
	ctx->tess_output_info = 0;
	r = r600_fetch_tess_io_info(ctx);
	if (r)
		return r;

	temp_reg = r600_get_temp(ctx);
	/* dst reg contains LDS address stride * idx */
	/* MUL vertexID, vertex_dw_stride */
	r = single_alu_op2(ctx, ALU_OP2_MUL_UINT24,
			   temp_reg, 0,
			   ctx->tess_input_info, 1,
			   0, 1); /* rel id in r0.y? */
	if (r)
		return r;

	for (i = 0; i < ctx->shader->noutput; i++) {
		struct r600_bytecode_alu alu;
		int param = r600_get_lds_unique_index(ctx->shader->output[i].name,
						      ctx->shader->output[i].sid);

		/* temp.y = base (temp.x) + param * 16, the byte address of this
		 * output's slot; skipped for param == 0, where temp.x is already
		 * the correct address. */
		if (param) {
			r = single_alu_op2(ctx, ALU_OP2_ADD_INT,
					   temp_reg, 1,
					   temp_reg, 0,
					   V_SQ_ALU_SRC_LITERAL, param * 16);
			if (r)
				return r;
		}

		/* temp.z = slot address + 8, for the .zw half of the vec4 */
		r = single_alu_op2(ctx, ALU_OP2_ADD_INT,
				   temp_reg, 2,
				   temp_reg, param ? 1 : 0,
				   V_SQ_ALU_SRC_LITERAL, 8);
		if (r)
			return r;


		/* j == 0 stores .xy at the slot address, j == 1 stores .zw at +8 */
		for (j = 0; j < 2; j++) {
			int chan = (j == 1) ? 2 : (param ? 1 : 0);
			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
			alu.op = LDS_OP3_LDS_WRITE_REL;
			alu.src[0].sel = temp_reg;	/* LDS byte address */
			alu.src[0].chan = chan;
			alu.src[1].sel = ctx->shader->output[i].gpr;
			alu.src[1].chan = j * 2;	/* first dword of the pair */
			alu.src[2].sel = ctx->shader->output[i].gpr;
			alu.src[2].chan = (j * 2) + 1;	/* second dword of the pair */
			alu.last = 1;
			alu.dst.chan = 0;
			/* NOTE(review): lds_idx = 1 appears to place the REL write one
			 * dword past the address — confirm against the LDS op encoding */
			alu.lds_idx = 1;
			alu.is_lds_idx_op = true;
			r = r600_bytecode_add_alu(ctx->bc, &alu);
			if (r)
				return r;
		}
	}
	return 0;
}
3112
/* Store the destination of the current TGSI instruction to LDS when it
 * is a TCS output.
 *
 * Builds a per-channel byte address in temp_reg (chan i = base + 4*i)
 * and then writes each enabled write-mask channel.  Fully-written .xy
 * or .zw pairs are combined into a single two-dword LDS_WRITE_REL;
 * lone channels use a plain LDS_WRITE.
 * Returns 0 on success (including the non-OUTPUT no-op case) or an
 * error from bytecode emission.
 */
static int r600_store_tcs_output(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	const struct tgsi_full_dst_register *dst = &inst->Dst[0];
	int i, r, lasti;
	int temp_reg = r600_get_temp(ctx);
	struct r600_bytecode_alu alu;
	unsigned write_mask = dst->Register.WriteMask;

	/* only OUTPUT-file destinations live in LDS; nothing to do otherwise */
	if (inst->Dst[0].Register.File != TGSI_FILE_OUTPUT)
		return 0;

	/* temp.x = LDS base offset; the last argument flips when the dst has
	 * a Dimension (per-vertex vs. per-patch addressing — see get_lds_offset0) */
	r = get_lds_offset0(ctx, 1, temp_reg, dst->Register.Dimension ? false : true);
	if (r)
		return r;

	/* the base address is now in temp.x */
	r = r600_get_byte_address(ctx, temp_reg,
				  &inst->Dst[0], NULL, ctx->tess_output_info, 1);
	if (r)
		return r;

	/* LDS write */
	/* temp.i = temp.x + 4*i for every written channel past x
	 * (channel x keeps the base address itself, hence i starts at 1) */
	lasti = tgsi_last_instruction(write_mask);
	for (i = 1; i <= lasti; i++) {

		if (!(write_mask & (1 << i)))
			continue;
		r = single_alu_op2(ctx, ALU_OP2_ADD_INT,
				   temp_reg, i,
				   temp_reg, 0,
				   V_SQ_ALU_SRC_LITERAL, 4 * i);
		if (r)
			return r;
	}

	for (i = 0; i <= lasti; i++) {
		if (!(write_mask & (1 << i)))
			continue;

		/* both channels of the xy (or zw) pair written: emit one
		 * two-dword LDS_WRITE_REL and skip the partner channel */
		if ((i == 0 && ((write_mask & 3) == 3)) ||
		    (i == 2 && ((write_mask & 0xc) == 0xc))) {
			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
			alu.op = LDS_OP3_LDS_WRITE_REL;
			alu.src[0].sel = temp_reg;	/* byte address for channel i */
			alu.src[0].chan = i;

			alu.src[1].sel = dst->Register.Index;
			alu.src[1].sel += ctx->file_offset[dst->Register.File];
			alu.src[1].chan = i;		/* first dword */

			alu.src[2].sel = dst->Register.Index;
			alu.src[2].sel += ctx->file_offset[dst->Register.File];
			alu.src[2].chan = i + 1;	/* second dword */
			/* NOTE(review): lds_idx = 1 appears to place the REL write one
			 * dword past the address — confirm against the LDS op encoding */
			alu.lds_idx = 1;
			alu.dst.chan = 0;
			alu.last = 1;
			alu.is_lds_idx_op = true;
			r = r600_bytecode_add_alu(ctx->bc, &alu);
			if (r)
				return r;
			i += 1;	/* partner channel handled by the REL write above */
			continue;
		}
		/* single-channel store */
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = LDS_OP2_LDS_WRITE;
		alu.src[0].sel = temp_reg;	/* byte address for channel i */
		alu.src[0].chan = i;

		alu.src[1].sel = dst->Register.Index;
		alu.src[1].sel += ctx->file_offset[dst->Register.File];
		alu.src[1].chan = i;

		alu.src[2].sel = V_SQ_ALU_SRC_0;	/* unused source slot */
		alu.dst.chan = 0;
		alu.last = 1;
		alu.is_lds_idx_op = true;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}
	return 0;
}
3196
3197static int r600_tess_factor_read(struct r600_shader_ctx *ctx,
3198				 int output_idx, int nc)
3199{
3200	int param;
3201	unsigned temp_reg = r600_get_temp(ctx);
3202	unsigned name = ctx->shader->output[output_idx].name;
3203	int dreg = ctx->shader->output[output_idx].gpr;
3204	int r;
3205
3206	param = r600_get_lds_unique_index(name, 0);
3207	r = get_lds_offset0(ctx, 1, temp_reg, true);
3208	if (r)
3209		return r;
3210
3211	if (param) {
3212		r = single_alu_op2(ctx, ALU_OP2_ADD_INT,
3213				   temp_reg, 0,
3214				   temp_reg, 0,
3215				   V_SQ_ALU_SRC_LITERAL, param * 16);
3216		if (r)
3217			return r;
3218	}
3219
3220	do_lds_fetch_values(ctx, temp_reg, dreg, ((1u << nc) - 1));
3221	return 0;
3222}
3223
/* Emit the tessellation-factor stores at the end of a TCS.
 *
 * Reads TESSOUTER/TESSINNER back from LDS and writes them to the TF
 * buffer with GDS TF_WRITE ops.  The whole sequence is predicated so
 * only one invocation per patch performs the stores, using an
 * ALU_PUSH_BEFORE predicate plus a JUMP/POP pair whose target is
 * patched once the POP's address is known.
 * Returns 0 on success, -1 for a missing factor output or unknown
 * primitive mode, or an error from bytecode emission.
 */
static int r600_emit_tess_factor(struct r600_shader_ctx *ctx)
{
	int stride, outer_comps, inner_comps;
	int tessinner_idx = -1, tessouter_idx = -1;
	int i, r;
	unsigned j;
	int temp_reg = r600_get_temp(ctx);
	int treg[3] = {-1, -1, -1};	/* hold (tf index, value) pairs, two per reg */
	struct r600_bytecode_alu alu;
	struct r600_bytecode_cf *cf_jump, *cf_pop;

	/* only execute factor emission for invocation 0 */
	/* PRED_SETE_INT __, R0.z, 0
	 * NOTE(review): the code tests channel 2 while the register-layout
	 * comment below lists InvocationID first (suggesting R0.x) — confirm
	 * which channel actually carries the invocation id. */
	memset(&alu, 0, sizeof(alu));
	alu.op = ALU_OP2_PRED_SETE_INT;
	alu.src[0].chan = 2;
	alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;	/* literal value 0 (zeroed by memset) */
	alu.execute_mask = 1;
	alu.update_pred = 1;
	alu.last = 1;
	r600_bytecode_add_alu_type(ctx->bc, &alu, CF_OP_ALU_PUSH_BEFORE);

	/* skip the TF stores for non-zero invocations; target patched below */
	r600_bytecode_add_cfinst(ctx->bc, CF_OP_JUMP);
	cf_jump = ctx->bc->cf_last;

	/* one temp per two factor components (index+value pairs) */
	treg[0] = r600_get_temp(ctx);
	switch (ctx->shader->tcs_prim_mode) {
	case PIPE_PRIM_LINES:
		stride = 8; /* 2 dwords, 1 vec2 store */
		outer_comps = 2;
		inner_comps = 0;
		break;
	case PIPE_PRIM_TRIANGLES:
		stride = 16; /* 4 dwords, 1 vec4 store */
		outer_comps = 3;
		inner_comps = 1;
		treg[1] = r600_get_temp(ctx);
		break;
	case PIPE_PRIM_QUADS:
		stride = 24; /* 6 dwords, 2 stores (vec4 + vec2) */
		outer_comps = 4;
		inner_comps = 2;
		treg[1] = r600_get_temp(ctx);
		treg[2] = r600_get_temp(ctx);
		break;
	default:
		assert(0);
		return -1;
	}

	/* R0 is InvocationID, RelPatchID, PatchID, tf_base */
	/* TF_WRITE takes index in R.x, value in R.y */
	for (j = 0; j < ctx->shader->noutput; j++) {
		if (ctx->shader->output[j].name == TGSI_SEMANTIC_TESSINNER)
			tessinner_idx = j;
		if (ctx->shader->output[j].name == TGSI_SEMANTIC_TESSOUTER)
			tessouter_idx = j;
	}

	/* outer factors are mandatory for every primitive mode */
	if (tessouter_idx == -1)
		return -1;

	/* inner factors mandatory only when the mode has inner components */
	if (tessinner_idx == -1 && inner_comps)
		return -1;

	if (tessouter_idx != -1) {
		r = r600_tess_factor_read(ctx, tessouter_idx, outer_comps);
		if (r)
			return r;
	}

	if (tessinner_idx != -1) {
		r = r600_tess_factor_read(ctx, tessinner_idx, inner_comps);
		if (r)
			return r;
	}

	/* r.x = tf_base(r0.w) + relpatchid(r0.y) * tf_stride */
	/* r.x = relpatchid(r0.y) * tf_stride */

	/* multiply incoming r0.y * stride - t.x = r0.y * stride */
	/* add incoming r0.w to it: t.x = t.x + r0.w */
	r = single_alu_op3(ctx, ALU_OP3_MULADD_UINT24,
			   temp_reg, 0,
			   0, 1,
			   V_SQ_ALU_SRC_LITERAL, stride,
			   0, 3);
	if (r)
		return r;

	/* build (tf buffer byte offset, factor value) pairs: even channels
	 * of treg[] get the offset (base + 4*i), odd channels the value */
	for (i = 0; i < outer_comps + inner_comps; i++) {
		int out_idx = i >= outer_comps ? tessinner_idx : tessouter_idx;
		int out_comp = i >= outer_comps ? i - outer_comps : i;

		/* isolines: the two outer factors are emitted swapped —
		 * presumably the hw expects them in the opposite order; confirm */
		if (ctx->shader->tcs_prim_mode == PIPE_PRIM_LINES) {
			if (out_comp == 1)
				out_comp = 0;
			else if (out_comp == 0)
				out_comp = 1;
		}

		r = single_alu_op2(ctx, ALU_OP2_ADD_INT,
				   treg[i / 2], (2 * (i % 2)),
				   temp_reg, 0,
				   V_SQ_ALU_SRC_LITERAL, 4 * i);
		if (r)
			return r;
		r = single_alu_op2(ctx, ALU_OP1_MOV,
				   treg[i / 2], 1 + (2 * (i%2)),
				   ctx->shader->output[out_idx].gpr, out_comp,
				   0, 0);
		if (r)
			return r;
	}
	/* one TF_WRITE per factor component */
	for (i = 0; i < outer_comps + inner_comps; i++) {
		struct r600_bytecode_gds gds;

		memset(&gds, 0, sizeof(struct r600_bytecode_gds));
		gds.src_gpr = treg[i / 2];
		gds.src_sel_x = 2 * (i % 2);		/* offset */
		gds.src_sel_y = 1 + (2 * (i % 2));	/* value */
		gds.src_sel_z = 4;
		gds.dst_sel_x = 7;	/* 7 = mask: TF_WRITE returns nothing */
		gds.dst_sel_y = 7;
		gds.dst_sel_z = 7;
		gds.dst_sel_w = 7;
		gds.op = FETCH_OP_TF_WRITE;
		r = r600_bytecode_add_gds(ctx->bc, &gds);
		if (r)
			return r;
	}

	// Patch up jump label
	r600_bytecode_add_cfinst(ctx->bc, CF_OP_POP);
	cf_pop = ctx->bc->cf_last;

	/* jump lands just past the POP; both pop one predicate level */
	cf_jump->cf_addr = cf_pop->id + 2;
	cf_jump->pop_count = 1;
	cf_pop->cf_addr = cf_pop->id + 2;
	cf_pop->pop_count = 1;

	return 0;
}
3367
3368/*
3369 * We have to work out the thread ID for load and atomic
3370 * operations, which store the returned value to an index
3371 * in an intermediate buffer.
3372 * The index is calculated by taking the thread id,
3373 * calculated from the MBCNT instructions.
3374 * Then the shader engine ID is multiplied by 256,
3375 * and the wave id is added.
 * Then the result is multiplied by 64 and thread id is
3377 * added.
3378 */
3379static int load_thread_id_gpr(struct r600_shader_ctx *ctx)
3380{
3381	struct r600_bytecode_alu alu;
3382	int r;
3383
3384	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3385	alu.op = ALU_OP1_MBCNT_32LO_ACCUM_PREV_INT;
3386	alu.dst.sel = ctx->temp_reg;
3387	alu.dst.chan = 0;
3388	alu.src[0].sel = V_SQ_ALU_SRC_LITERAL;
3389	alu.src[0].value = 0xffffffff;
3390	alu.dst.write = 1;
3391	r = r600_bytecode_add_alu(ctx->bc, &alu);
3392	if (r)
3393		return r;
3394
3395	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3396	alu.op = ALU_OP1_MBCNT_32HI_INT;
3397	alu.dst.sel = ctx->temp_reg;
3398	alu.dst.chan = 1;
3399	alu.src[0].sel = V_SQ_ALU_SRC_LITERAL;
3400	alu.src[0].value = 0xffffffff;
3401	alu.dst.write = 1;
3402	r = r600_bytecode_add_alu(ctx->bc, &alu);
3403	if (r)
3404		return r;
3405
3406	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3407	alu.op = ALU_OP3_MULADD_UINT24;
3408	alu.dst.sel = ctx->temp_reg;
3409	alu.dst.chan = 2;
3410	alu.src[0].sel = EG_V_SQ_ALU_SRC_SE_ID;
3411	alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
3412	alu.src[1].value = 256;
3413	alu.src[2].sel = EG_V_SQ_ALU_SRC_HW_WAVE_ID;
3414	alu.dst.write = 1;
3415	alu.is_op3 = 1;
3416	alu.last = 1;
3417	r = r600_bytecode_add_alu(ctx->bc, &alu);
3418	if (r)
3419		return r;
3420
3421	r = single_alu_op3(ctx, ALU_OP3_MULADD_UINT24,
3422			   ctx->thread_id_gpr, 1,
3423			   ctx->temp_reg, 2,
3424			   V_SQ_ALU_SRC_LITERAL, 0x40,
3425			   ctx->temp_reg, 0);
3426	if (r)
3427		return r;
3428	return 0;
3429}
3430
3431static int r600_shader_from_tgsi(struct r600_context *rctx,
3432				 struct r600_pipe_shader *pipeshader,
3433				 union r600_shader_key key)
3434{
3435	struct r600_screen *rscreen = rctx->screen;
3436	struct r600_shader *shader = &pipeshader->shader;
3437	struct tgsi_token *tokens = pipeshader->selector->tokens;
3438	struct pipe_stream_output_info so = pipeshader->selector->so;
3439	struct tgsi_full_immediate *immediate;
3440	struct r600_shader_ctx ctx;
3441	struct r600_bytecode_output output[ARRAY_SIZE(shader->output)];
3442	unsigned output_done, noutput;
3443	unsigned opcode;
3444	int j, k, r = 0;
3445	unsigned i;
3446	int next_param_base = 0, next_clip_base;
3447	int max_color_exports = MAX2(key.ps.nr_cbufs, 1);
3448	bool indirect_gprs;
3449	bool ring_outputs = false;
3450	bool lds_outputs = false;
3451	bool lds_inputs = false;
3452	bool pos_emitted = false;
3453
3454	ctx.bc = &shader->bc;
3455	ctx.shader = shader;
3456
3457	r600_bytecode_init(ctx.bc, rscreen->b.gfx_level, rscreen->b.family,
3458			   rscreen->has_compressed_msaa_texturing);
3459	ctx.tokens = tokens;
3460	tgsi_scan_shader(tokens, &ctx.info);
3461	shader->indirect_files = ctx.info.indirect_files;
3462
3463	int narrays = ctx.info.array_max[TGSI_FILE_TEMPORARY];
3464	ctx.array_infos = calloc(narrays, sizeof(*ctx.array_infos));
3465	ctx.spilled_arrays = calloc(narrays, sizeof(bool));
3466	tgsi_scan_arrays(tokens, TGSI_FILE_TEMPORARY, narrays, ctx.array_infos);
3467
3468	shader->uses_helper_invocation = false;
3469	shader->uses_doubles = ctx.info.uses_doubles;
3470	shader->uses_atomics = ctx.info.file_mask[TGSI_FILE_HW_ATOMIC];
3471	shader->num_loops = ctx.info.opcode_count[TGSI_OPCODE_BGNLOOP];
3472	shader->uses_interpolate_at_sample = ctx.info.opcode_count[TGSI_OPCODE_INTERP_SAMPLE] != 0;
3473
3474	shader->nsys_inputs = 0;
3475
3476	shader->uses_images = ctx.info.file_count[TGSI_FILE_IMAGE] > 0 ||
3477		ctx.info.file_count[TGSI_FILE_BUFFER] > 0;
3478	indirect_gprs = ctx.info.indirect_files & ~((1 << TGSI_FILE_CONSTANT) | (1 << TGSI_FILE_SAMPLER));
3479	tgsi_parse_init(&ctx.parse, tokens);
3480	ctx.type = ctx.info.processor;
3481	shader->processor_type = ctx.type;
3482	ctx.bc->type = shader->processor_type;
3483
3484	switch (ctx.type) {
3485	case PIPE_SHADER_VERTEX:
3486		shader->vs_as_gs_a = key.vs.as_gs_a;
3487		shader->vs_as_es = key.vs.as_es;
3488		shader->vs_as_ls = key.vs.as_ls;
3489		shader->atomic_base = key.vs.first_atomic_counter;
3490		if (shader->vs_as_es)
3491			ring_outputs = true;
3492		if (shader->vs_as_ls)
3493			lds_outputs = true;
3494		break;
3495	case PIPE_SHADER_GEOMETRY:
3496		ring_outputs = true;
3497		shader->atomic_base = key.gs.first_atomic_counter;
3498		shader->gs_tri_strip_adj_fix = key.gs.tri_strip_adj_fix;
3499		break;
3500	case PIPE_SHADER_TESS_CTRL:
3501		shader->tcs_prim_mode = key.tcs.prim_mode;
3502		shader->atomic_base = key.tcs.first_atomic_counter;
3503		lds_outputs = true;
3504		lds_inputs = true;
3505		break;
3506	case PIPE_SHADER_TESS_EVAL:
3507		shader->tes_as_es = key.tes.as_es;
3508		shader->atomic_base = key.tes.first_atomic_counter;
3509		lds_inputs = true;
3510		if (shader->tes_as_es)
3511			ring_outputs = true;
3512		break;
3513	case PIPE_SHADER_FRAGMENT:
3514		shader->two_side = key.ps.color_two_side;
3515		shader->atomic_base = key.ps.first_atomic_counter;
3516		shader->rat_base = key.ps.nr_cbufs;
3517		shader->image_size_const_offset = key.ps.image_size_const_offset;
3518		break;
3519	case PIPE_SHADER_COMPUTE:
3520		shader->rat_base = 0;
3521		shader->image_size_const_offset = ctx.info.file_count[TGSI_FILE_SAMPLER];
3522		break;
3523	default:
3524		break;
3525	}
3526
3527	if (shader->vs_as_es || shader->tes_as_es) {
3528		ctx.gs_for_vs = &rctx->gs_shader->current->shader;
3529	} else {
3530		ctx.gs_for_vs = NULL;
3531	}
3532
3533	ctx.next_ring_offset = 0;
3534	ctx.gs_out_ring_offset = 0;
3535	ctx.gs_next_vertex = 0;
3536	ctx.gs_stream_output_info = &so;
3537
3538	ctx.thread_id_gpr = -1;
3539	ctx.face_gpr = -1;
3540	ctx.fixed_pt_position_gpr = -1;
3541	ctx.fragcoord_input = -1;
3542	ctx.colors_used = 0;
3543	ctx.clip_vertex_write = 0;
3544
3545	ctx.helper_invoc_reg = -1;
3546	ctx.cs_block_size_reg = -1;
3547	ctx.cs_grid_size_reg = -1;
3548	ctx.cs_block_size_loaded = false;
3549	ctx.cs_grid_size_loaded = false;
3550
3551	shader->nr_ps_color_exports = 0;
3552
3553
3554	/* register allocations */
3555	/* Values [0,127] correspond to GPR[0..127].
3556	 * Values [128,159] correspond to constant buffer bank 0
3557	 * Values [160,191] correspond to constant buffer bank 1
3558	 * Values [256,511] correspond to cfile constants c[0..255]. (Gone on EG)
3559	 * Values [256,287] correspond to constant buffer bank 2 (EG)
3560	 * Values [288,319] correspond to constant buffer bank 3 (EG)
3561	 * Other special values are shown in the list below.
3562	 * 244  ALU_SRC_1_DBL_L: special constant 1.0 double-float, LSW. (RV670+)
3563	 * 245  ALU_SRC_1_DBL_M: special constant 1.0 double-float, MSW. (RV670+)
3564	 * 246  ALU_SRC_0_5_DBL_L: special constant 0.5 double-float, LSW. (RV670+)
3565	 * 247  ALU_SRC_0_5_DBL_M: special constant 0.5 double-float, MSW. (RV670+)
3566	 * 248	SQ_ALU_SRC_0: special constant 0.0.
3567	 * 249	SQ_ALU_SRC_1: special constant 1.0 float.
3568	 * 250	SQ_ALU_SRC_1_INT: special constant 1 integer.
3569	 * 251	SQ_ALU_SRC_M_1_INT: special constant -1 integer.
3570	 * 252	SQ_ALU_SRC_0_5: special constant 0.5 float.
3571	 * 253	SQ_ALU_SRC_LITERAL: literal constant.
3572	 * 254	SQ_ALU_SRC_PV: previous vector result.
3573	 * 255	SQ_ALU_SRC_PS: previous scalar result.
3574	 */
3575	for (i = 0; i < TGSI_FILE_COUNT; i++) {
3576		ctx.file_offset[i] = 0;
3577	}
3578
3579	if (ctx.type == PIPE_SHADER_VERTEX)  {
3580
3581		ctx.file_offset[TGSI_FILE_INPUT] = 1;
3582		if (ctx.info.num_inputs)
3583			r600_bytecode_add_cfinst(ctx.bc, CF_OP_CALL_FS);
3584	}
3585	if (ctx.type == PIPE_SHADER_FRAGMENT) {
3586		if (ctx.bc->gfx_level >= EVERGREEN)
3587			ctx.file_offset[TGSI_FILE_INPUT] = evergreen_gpr_count(&ctx);
3588		else
3589			ctx.file_offset[TGSI_FILE_INPUT] = allocate_system_value_inputs(&ctx, ctx.file_offset[TGSI_FILE_INPUT]);
3590
3591		for (i = 0; i < PIPE_MAX_SHADER_INPUTS; i++) {
3592			if (ctx.info.system_value_semantic_name[i] == TGSI_SEMANTIC_HELPER_INVOCATION) {
3593				ctx.helper_invoc_reg = ctx.file_offset[TGSI_FILE_INPUT]++;
3594				shader->uses_helper_invocation = true;
3595			}
3596		}
3597	}
3598	if (ctx.type == PIPE_SHADER_GEOMETRY) {
3599		/* FIXME 1 would be enough in some cases (3 or less input vertices) */
3600		ctx.file_offset[TGSI_FILE_INPUT] = 2;
3601	}
3602	if (ctx.type == PIPE_SHADER_TESS_CTRL)
3603		ctx.file_offset[TGSI_FILE_INPUT] = 1;
3604	if (ctx.type == PIPE_SHADER_TESS_EVAL) {
3605		bool add_tesscoord = false, add_tess_inout = false;
3606		ctx.file_offset[TGSI_FILE_INPUT] = 1;
3607		for (i = 0; i < PIPE_MAX_SHADER_INPUTS; i++) {
3608			/* if we have tesscoord save one reg */
3609			if (ctx.info.system_value_semantic_name[i] == TGSI_SEMANTIC_TESSCOORD)
3610				add_tesscoord = true;
3611			if (ctx.info.system_value_semantic_name[i] == TGSI_SEMANTIC_TESSINNER ||
3612			    ctx.info.system_value_semantic_name[i] == TGSI_SEMANTIC_TESSOUTER)
3613				add_tess_inout = true;
3614		}
3615		if (add_tesscoord || add_tess_inout)
3616			ctx.file_offset[TGSI_FILE_INPUT]++;
3617		if (add_tess_inout)
3618			ctx.file_offset[TGSI_FILE_INPUT]+=2;
3619	}
3620	if (ctx.type == PIPE_SHADER_COMPUTE) {
3621		ctx.file_offset[TGSI_FILE_INPUT] = 2;
3622		for (i = 0; i < PIPE_MAX_SHADER_INPUTS; i++) {
3623			if (ctx.info.system_value_semantic_name[i] == TGSI_SEMANTIC_GRID_SIZE)
3624				ctx.cs_grid_size_reg = ctx.file_offset[TGSI_FILE_INPUT]++;
3625			if (ctx.info.system_value_semantic_name[i] == TGSI_SEMANTIC_BLOCK_SIZE)
3626				ctx.cs_block_size_reg = ctx.file_offset[TGSI_FILE_INPUT]++;
3627		}
3628	}
3629
3630	ctx.file_offset[TGSI_FILE_OUTPUT] =
3631			ctx.file_offset[TGSI_FILE_INPUT] +
3632			ctx.info.file_max[TGSI_FILE_INPUT] + 1;
3633	ctx.file_offset[TGSI_FILE_TEMPORARY] = ctx.file_offset[TGSI_FILE_OUTPUT] +
3634						ctx.info.file_max[TGSI_FILE_OUTPUT] + 1;
3635
3636	/* Outside the GPR range. This will be translated to one of the
3637	 * kcache banks later. */
3638	ctx.file_offset[TGSI_FILE_CONSTANT] = 512;
3639	ctx.file_offset[TGSI_FILE_IMMEDIATE] = V_SQ_ALU_SRC_LITERAL;
3640
3641	pipeshader->scratch_space_needed = 0;
3642	int regno = ctx.file_offset[TGSI_FILE_TEMPORARY] +
3643			ctx.info.file_max[TGSI_FILE_TEMPORARY];
3644	if (regno > 124) {
3645		choose_spill_arrays(&ctx, &regno, &pipeshader->scratch_space_needed);
3646		shader->indirect_files = ctx.info.indirect_files;
3647	}
3648	shader->needs_scratch_space = pipeshader->scratch_space_needed != 0;
3649
3650	ctx.bc->ar_reg = ++regno;
3651	ctx.bc->index_reg[0] = ++regno;
3652	ctx.bc->index_reg[1] = ++regno;
3653
3654	if (ctx.type == PIPE_SHADER_TESS_CTRL) {
3655		ctx.tess_input_info = ++regno;
3656		ctx.tess_output_info = ++regno;
3657	} else if (ctx.type == PIPE_SHADER_TESS_EVAL) {
3658		ctx.tess_input_info = ++regno;
3659		ctx.tess_output_info = ++regno;
3660	} else if (ctx.type == PIPE_SHADER_GEOMETRY) {
3661		ctx.gs_export_gpr_tregs[0] = ++regno;
3662		ctx.gs_export_gpr_tregs[1] = ++regno;
3663		ctx.gs_export_gpr_tregs[2] = ++regno;
3664		ctx.gs_export_gpr_tregs[3] = ++regno;
3665		if (ctx.shader->gs_tri_strip_adj_fix) {
3666			ctx.gs_rotated_input[0] = ++regno;
3667			ctx.gs_rotated_input[1] = ++regno;
3668		} else {
3669			ctx.gs_rotated_input[0] = 0;
3670			ctx.gs_rotated_input[1] = 1;
3671		}
3672	}
3673
3674	if (shader->uses_images) {
3675		ctx.thread_id_gpr = ++regno;
3676	}
3677	ctx.temp_reg = ++regno;
3678
3679	shader->max_arrays = 0;
3680	shader->num_arrays = 0;
3681	if (indirect_gprs) {
3682
3683		if (ctx.info.indirect_files & (1 << TGSI_FILE_INPUT)) {
3684			r600_add_gpr_array(shader, ctx.file_offset[TGSI_FILE_INPUT],
3685			                   ctx.file_offset[TGSI_FILE_OUTPUT] -
3686			                   ctx.file_offset[TGSI_FILE_INPUT],
3687			                   0x0F);
3688		}
3689		if (ctx.info.indirect_files & (1 << TGSI_FILE_OUTPUT)) {
3690			r600_add_gpr_array(shader, ctx.file_offset[TGSI_FILE_OUTPUT],
3691			                   ctx.file_offset[TGSI_FILE_TEMPORARY] -
3692			                   ctx.file_offset[TGSI_FILE_OUTPUT],
3693			                   0x0F);
3694		}
3695	}
3696
3697	ctx.nliterals = 0;
3698	ctx.literals = NULL;
3699	ctx.max_driver_temp_used = 0;
3700
3701	shader->fs_write_all = ctx.info.properties[TGSI_PROPERTY_FS_COLOR0_WRITES_ALL_CBUFS] &&
3702			       ctx.info.colors_written == 1;
3703	shader->vs_position_window_space = ctx.info.properties[TGSI_PROPERTY_VS_WINDOW_SPACE_POSITION];
3704	shader->ps_conservative_z = (uint8_t)ctx.info.properties[TGSI_PROPERTY_FS_DEPTH_LAYOUT];
3705
3706	if (ctx.type == PIPE_SHADER_VERTEX ||
3707	    ctx.type == PIPE_SHADER_GEOMETRY ||
3708	    ctx.type == PIPE_SHADER_TESS_EVAL) {
3709		shader->cc_dist_mask = (1 << (ctx.info.properties[TGSI_PROPERTY_NUM_CULLDIST_ENABLED] +
3710					      ctx.info.properties[TGSI_PROPERTY_NUM_CLIPDIST_ENABLED])) - 1;
3711		shader->clip_dist_write = (1 << ctx.info.properties[TGSI_PROPERTY_NUM_CLIPDIST_ENABLED]) - 1;
3712		shader->cull_dist_write = ((1 << ctx.info.properties[TGSI_PROPERTY_NUM_CULLDIST_ENABLED]) - 1) << ctx.info.properties[TGSI_PROPERTY_NUM_CLIPDIST_ENABLED];
3713	}
3714
3715	if (shader->vs_as_gs_a)
3716		vs_add_primid_output(&ctx, key.vs.prim_id_out);
3717
3718	if (ctx.thread_id_gpr != -1) {
3719		r = load_thread_id_gpr(&ctx);
3720		if (r)
3721			return r;
3722	}
3723
3724	if (ctx.type == PIPE_SHADER_TESS_EVAL)
3725		r600_fetch_tess_io_info(&ctx);
3726
3727	while (!tgsi_parse_end_of_tokens(&ctx.parse)) {
3728		tgsi_parse_token(&ctx.parse);
3729		switch (ctx.parse.FullToken.Token.Type) {
3730		case TGSI_TOKEN_TYPE_IMMEDIATE:
3731			immediate = &ctx.parse.FullToken.FullImmediate;
3732			ctx.literals = realloc(ctx.literals, (ctx.nliterals + 1) * 16);
3733			if(ctx.literals == NULL) {
3734				r = -ENOMEM;
3735				goto out_err;
3736			}
3737			ctx.literals[ctx.nliterals * 4 + 0] = immediate->u[0].Uint;
3738			ctx.literals[ctx.nliterals * 4 + 1] = immediate->u[1].Uint;
3739			ctx.literals[ctx.nliterals * 4 + 2] = immediate->u[2].Uint;
3740			ctx.literals[ctx.nliterals * 4 + 3] = immediate->u[3].Uint;
3741			ctx.nliterals++;
3742			break;
3743		case TGSI_TOKEN_TYPE_DECLARATION:
3744			r = tgsi_declaration(&ctx);
3745			if (r)
3746				goto out_err;
3747			break;
3748		case TGSI_TOKEN_TYPE_INSTRUCTION:
3749		case TGSI_TOKEN_TYPE_PROPERTY:
3750			break;
3751		default:
3752			R600_ERR("unsupported token type %d\n", ctx.parse.FullToken.Token.Type);
3753			r = -EINVAL;
3754			goto out_err;
3755		}
3756	}
3757
3758	shader->ring_item_sizes[0] = ctx.next_ring_offset;
3759	shader->ring_item_sizes[1] = 0;
3760	shader->ring_item_sizes[2] = 0;
3761	shader->ring_item_sizes[3] = 0;
3762
3763	/* Process two side if needed */
3764	if (shader->two_side && ctx.colors_used) {
3765		int i, count = ctx.shader->ninput;
3766		unsigned next_lds_loc = ctx.shader->nlds;
3767
3768		/* additional inputs will be allocated right after the existing inputs,
3769		 * we won't need them after the color selection, so we don't need to
3770		 * reserve these gprs for the rest of the shader code and to adjust
3771		 * output offsets etc. */
3772		int gpr = ctx.file_offset[TGSI_FILE_INPUT] +
3773				ctx.info.file_max[TGSI_FILE_INPUT] + 1;
3774
3775		/* if two sided and neither face or sample mask is used by shader, ensure face_gpr is emitted */
3776		if (ctx.face_gpr == -1) {
3777			i = ctx.shader->ninput++;
3778			ctx.shader->input[i].name = TGSI_SEMANTIC_FACE;
3779			ctx.shader->input[i].spi_sid = 0;
3780			ctx.shader->input[i].gpr = gpr++;
3781			ctx.face_gpr = ctx.shader->input[i].gpr;
3782		}
3783
3784		for (i = 0; i < count; i++) {
3785			if (ctx.shader->input[i].name == TGSI_SEMANTIC_COLOR) {
3786				int ni = ctx.shader->ninput++;
3787				memcpy(&ctx.shader->input[ni],&ctx.shader->input[i], sizeof(struct r600_shader_io));
3788				ctx.shader->input[ni].name = TGSI_SEMANTIC_BCOLOR;
3789				ctx.shader->input[ni].spi_sid = r600_spi_sid(&ctx.shader->input[ni]);
3790				ctx.shader->input[ni].gpr = gpr++;
3791				// TGSI to LLVM needs to know the lds position of inputs.
3792				// Non LLVM path computes it later (in process_twoside_color)
3793				ctx.shader->input[ni].lds_pos = next_lds_loc++;
3794				ctx.shader->input[i].back_color_input = ni;
3795				if (ctx.bc->gfx_level >= EVERGREEN) {
3796					if ((r = evergreen_interp_input(&ctx, ni)))
3797						return r;
3798				}
3799			}
3800		}
3801	}
3802
3803	if (ctx.shader->uses_helper_invocation) {
3804		if (ctx.bc->gfx_level == CAYMAN)
3805			r = cm_load_helper_invocation(&ctx);
3806		else
3807			r = eg_load_helper_invocation(&ctx);
3808		if (r)
3809			return r;
3810	}
3811
3812	/*
3813	 * XXX this relies on fixed_pt_position_gpr only being present when
3814	 * this shader should be executed per sample. Should be the case for now...
3815	 */
3816	if (ctx.fixed_pt_position_gpr != -1 && ctx.info.reads_samplemask) {
3817		/*
3818		 * Fix up sample mask. The hw always gives us coverage mask for
3819		 * the pixel. However, for per-sample shading, we need the
3820		 * coverage for the shader invocation only.
3821		 * Also, with disabled msaa, only the first bit should be set
3822		 * (luckily the same fixup works for both problems).
3823		 * For now, we can only do it if we know this shader is always
3824		 * executed per sample (due to usage of bits in the shader
3825		 * forcing per-sample execution).
3826		 * If the fb is not multisampled, we'd do unnecessary work but
3827		 * it should still be correct.
3828		 * It will however do nothing for sample shading according
3829		 * to MinSampleShading.
3830		 */
3831		struct r600_bytecode_alu alu;
3832		int tmp = r600_get_temp(&ctx);
3833		assert(ctx.face_gpr != -1);
3834		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3835
3836		alu.op = ALU_OP2_LSHL_INT;
3837		alu.src[0].sel = V_SQ_ALU_SRC_LITERAL;
3838		alu.src[0].value = 0x1;
3839		alu.src[1].sel = ctx.fixed_pt_position_gpr;
3840		alu.src[1].chan = 3;
3841		alu.dst.sel = tmp;
3842		alu.dst.chan = 0;
3843		alu.dst.write = 1;
3844		alu.last = 1;
3845		if ((r = r600_bytecode_add_alu(ctx.bc, &alu)))
3846			return r;
3847
3848		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3849		alu.op = ALU_OP2_AND_INT;
3850		alu.src[0].sel = tmp;
3851		alu.src[1].sel = ctx.face_gpr;
3852		alu.src[1].chan = 2;
3853		alu.dst.sel = ctx.face_gpr;
3854		alu.dst.chan = 2;
3855		alu.dst.write = 1;
3856		alu.last = 1;
3857		if ((r = r600_bytecode_add_alu(ctx.bc, &alu)))
3858			return r;
3859	}
3860
3861	if (ctx.fragcoord_input >= 0) {
3862		if (ctx.bc->gfx_level == CAYMAN) {
3863			for (j = 0 ; j < 4; j++) {
3864				struct r600_bytecode_alu alu;
3865				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3866				alu.op = ALU_OP1_RECIP_IEEE;
3867				alu.src[0].sel = shader->input[ctx.fragcoord_input].gpr;
3868				alu.src[0].chan = 3;
3869
3870				alu.dst.sel = shader->input[ctx.fragcoord_input].gpr;
3871				alu.dst.chan = j;
3872				alu.dst.write = (j == 3);
3873				alu.last = (j == 3);
3874				if ((r = r600_bytecode_add_alu(ctx.bc, &alu)))
3875					return r;
3876			}
3877		} else {
3878			struct r600_bytecode_alu alu;
3879			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3880			alu.op = ALU_OP1_RECIP_IEEE;
3881			alu.src[0].sel = shader->input[ctx.fragcoord_input].gpr;
3882			alu.src[0].chan = 3;
3883
3884			alu.dst.sel = shader->input[ctx.fragcoord_input].gpr;
3885			alu.dst.chan = 3;
3886			alu.dst.write = 1;
3887			alu.last = 1;
3888			if ((r = r600_bytecode_add_alu(ctx.bc, &alu)))
3889				return r;
3890		}
3891	}
3892
3893	if (ctx.type == PIPE_SHADER_GEOMETRY) {
3894		struct r600_bytecode_alu alu;
3895		int r;
3896
3897		/* GS thread with no output workaround - emit a cut at start of GS */
3898		if (ctx.bc->gfx_level == R600)
3899			r600_bytecode_add_cfinst(ctx.bc, CF_OP_CUT_VERTEX);
3900
3901		for (j = 0; j < 4; j++) {
3902			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3903			alu.op = ALU_OP1_MOV;
3904			alu.src[0].sel = V_SQ_ALU_SRC_LITERAL;
3905			alu.src[0].value = 0;
3906			alu.dst.sel = ctx.gs_export_gpr_tregs[j];
3907			alu.dst.write = 1;
3908			alu.last = 1;
3909			r = r600_bytecode_add_alu(ctx.bc, &alu);
3910			if (r)
3911				return r;
3912		}
3913
3914		if (ctx.shader->gs_tri_strip_adj_fix) {
3915			r = single_alu_op2(&ctx, ALU_OP2_AND_INT,
3916					   ctx.gs_rotated_input[0], 2,
3917					   0, 2,
3918					   V_SQ_ALU_SRC_LITERAL, 1);
3919			if (r)
3920				return r;
3921
3922			for (i = 0; i < 6; i++) {
3923				int rotated = (i + 4) % 6;
3924				int offset_reg = i / 3;
3925				int offset_chan = i % 3;
3926				int rotated_offset_reg = rotated / 3;
3927				int rotated_offset_chan = rotated % 3;
3928
3929				if (offset_reg == 0 && offset_chan == 2)
3930					offset_chan = 3;
3931				if (rotated_offset_reg == 0 && rotated_offset_chan == 2)
3932					rotated_offset_chan = 3;
3933
3934				r = single_alu_op3(&ctx, ALU_OP3_CNDE_INT,
3935						   ctx.gs_rotated_input[offset_reg], offset_chan,
3936						   ctx.gs_rotated_input[0], 2,
3937						   offset_reg, offset_chan,
3938						   rotated_offset_reg, rotated_offset_chan);
3939				if (r)
3940					return r;
3941			}
3942		}
3943	}
3944
3945	if (ctx.type == PIPE_SHADER_TESS_CTRL)
3946		r600_fetch_tess_io_info(&ctx);
3947
3948	if (shader->two_side && ctx.colors_used) {
3949		if ((r = process_twoside_color_inputs(&ctx)))
3950			return r;
3951	}
3952
3953	tgsi_parse_init(&ctx.parse, tokens);
3954	while (!tgsi_parse_end_of_tokens(&ctx.parse)) {
3955		tgsi_parse_token(&ctx.parse);
3956		switch (ctx.parse.FullToken.Token.Type) {
3957		case TGSI_TOKEN_TYPE_INSTRUCTION:
3958			r = tgsi_is_supported(&ctx);
3959			if (r)
3960				goto out_err;
3961			ctx.max_driver_temp_used = 0;
3962			/* reserve first tmp for everyone */
3963			r600_get_temp(&ctx);
3964
3965			opcode = ctx.parse.FullToken.FullInstruction.Instruction.Opcode;
3966			if ((r = tgsi_split_constant(&ctx)))
3967				goto out_err;
3968			if ((r = tgsi_split_literal_constant(&ctx)))
3969				goto out_err;
3970			if (ctx.type == PIPE_SHADER_GEOMETRY) {
3971				if ((r = tgsi_split_gs_inputs(&ctx)))
3972					goto out_err;
3973			} else if (lds_inputs) {
3974				if ((r = tgsi_split_lds_inputs(&ctx)))
3975					goto out_err;
3976			}
3977			if (ctx.bc->gfx_level == CAYMAN)
3978				ctx.inst_info = &cm_shader_tgsi_instruction[opcode];
3979			else if (ctx.bc->gfx_level >= EVERGREEN)
3980				ctx.inst_info = &eg_shader_tgsi_instruction[opcode];
3981			else
3982				ctx.inst_info = &r600_shader_tgsi_instruction[opcode];
3983
3984			ctx.bc->precise |= ctx.parse.FullToken.FullInstruction.Instruction.Precise;
3985
3986			r = ctx.inst_info->process(&ctx);
3987			if (r)
3988				goto out_err;
3989
3990			if (ctx.type == PIPE_SHADER_TESS_CTRL) {
3991				r = r600_store_tcs_output(&ctx);
3992				if (r)
3993					goto out_err;
3994			}
3995			break;
3996		default:
3997			break;
3998		}
3999	}
4000
4001	/* Reset the temporary register counter. */
4002	ctx.max_driver_temp_used = 0;
4003
4004	noutput = shader->noutput;
4005
4006	if (!ring_outputs && ctx.clip_vertex_write) {
4007		unsigned clipdist_temp[2];
4008
4009		clipdist_temp[0] = r600_get_temp(&ctx);
4010		clipdist_temp[1] = r600_get_temp(&ctx);
4011
4012		/* need to convert a clipvertex write into clipdistance writes and not export
4013		   the clip vertex anymore */
4014
4015		memset(&shader->output[noutput], 0, 2*sizeof(struct r600_shader_io));
4016		shader->output[noutput].name = TGSI_SEMANTIC_CLIPDIST;
4017		shader->output[noutput].gpr = clipdist_temp[0];
4018		noutput++;
4019		shader->output[noutput].name = TGSI_SEMANTIC_CLIPDIST;
4020		shader->output[noutput].gpr = clipdist_temp[1];
4021		noutput++;
4022
4023		/* reset spi_sid for clipvertex output to avoid confusing spi */
4024		shader->output[ctx.cv_output].spi_sid = 0;
4025
4026		shader->clip_dist_write = 0xFF;
4027		shader->cc_dist_mask = 0xFF;
4028
4029		for (i = 0; i < 8; i++) {
4030			int oreg = i >> 2;
4031			int ochan = i & 3;
4032
4033			for (j = 0; j < 4; j++) {
4034				struct r600_bytecode_alu alu;
4035				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4036				alu.op = ALU_OP2_DOT4;
4037				alu.src[0].sel = shader->output[ctx.cv_output].gpr;
4038				alu.src[0].chan = j;
4039
4040				alu.src[1].sel = 512 + i;
4041				alu.src[1].kc_bank = R600_BUFFER_INFO_CONST_BUFFER;
4042				alu.src[1].chan = j;
4043
4044				alu.dst.sel = clipdist_temp[oreg];
4045				alu.dst.chan = j;
4046				alu.dst.write = (j == ochan);
4047				if (j == 3)
4048					alu.last = 1;
4049				r = r600_bytecode_add_alu(ctx.bc, &alu);
4050				if (r)
4051					return r;
4052			}
4053		}
4054	}
4055
4056	/* Add stream outputs. */
4057	if (so.num_outputs) {
4058		bool emit = false;
4059		if (!lds_outputs && !ring_outputs && ctx.type == PIPE_SHADER_VERTEX)
4060			emit = true;
4061		if (!ring_outputs && ctx.type == PIPE_SHADER_TESS_EVAL)
4062			emit = true;
4063		if (emit)
4064			emit_streamout(&ctx, &so, -1, NULL);
4065	}
4066	pipeshader->enabled_stream_buffers_mask = ctx.enabled_stream_buffers_mask;
4067	convert_edgeflag_to_int(&ctx);
4068
4069	if (ctx.type == PIPE_SHADER_TESS_CTRL)
4070		r600_emit_tess_factor(&ctx);
4071
4072	if (lds_outputs) {
4073		if (ctx.type == PIPE_SHADER_VERTEX) {
4074			if (ctx.shader->noutput)
4075				emit_lds_vs_writes(&ctx);
4076		}
4077	} else if (ring_outputs) {
4078		if (shader->vs_as_es || shader->tes_as_es) {
4079			ctx.gs_export_gpr_tregs[0] = r600_get_temp(&ctx);
4080			ctx.gs_export_gpr_tregs[1] = -1;
4081			ctx.gs_export_gpr_tregs[2] = -1;
4082			ctx.gs_export_gpr_tregs[3] = -1;
4083
4084			emit_gs_ring_writes(&ctx, &so, -1, FALSE);
4085		}
4086	} else {
4087		/* Export output */
4088		next_clip_base = shader->vs_out_misc_write ? 62 : 61;
4089
4090		for (i = 0, j = 0; i < noutput; i++, j++) {
4091			memset(&output[j], 0, sizeof(struct r600_bytecode_output));
4092			output[j].gpr = shader->output[i].gpr;
4093			output[j].elem_size = 3;
4094			output[j].swizzle_x = 0;
4095			output[j].swizzle_y = 1;
4096			output[j].swizzle_z = 2;
4097			output[j].swizzle_w = 3;
4098			output[j].burst_count = 1;
4099			output[j].type = 0xffffffff;
4100			output[j].op = CF_OP_EXPORT;
4101			switch (ctx.type) {
4102			case PIPE_SHADER_VERTEX:
4103			case PIPE_SHADER_TESS_EVAL:
4104				switch (shader->output[i].name) {
4105				case TGSI_SEMANTIC_POSITION:
4106					output[j].array_base = 60;
4107					output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS;
4108					pos_emitted = true;
4109					break;
4110
4111				case TGSI_SEMANTIC_PSIZE:
4112					output[j].array_base = 61;
4113					output[j].swizzle_y = 7;
4114					output[j].swizzle_z = 7;
4115					output[j].swizzle_w = 7;
4116					output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS;
4117					pos_emitted = true;
4118					break;
4119				case TGSI_SEMANTIC_EDGEFLAG:
4120					output[j].array_base = 61;
4121					output[j].swizzle_x = 7;
4122					output[j].swizzle_y = 0;
4123					output[j].swizzle_z = 7;
4124					output[j].swizzle_w = 7;
4125					output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS;
4126					pos_emitted = true;
4127					break;
4128				case TGSI_SEMANTIC_LAYER:
4129					/* spi_sid is 0 for outputs that are
4130					 * not consumed by PS */
4131					if (shader->output[i].spi_sid) {
4132						output[j].array_base = next_param_base++;
4133						output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PARAM;
4134						j++;
4135						memcpy(&output[j], &output[j-1], sizeof(struct r600_bytecode_output));
4136					}
4137					output[j].array_base = 61;
4138					output[j].swizzle_x = 7;
4139					output[j].swizzle_y = 7;
4140					output[j].swizzle_z = 0;
4141					output[j].swizzle_w = 7;
4142					output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS;
4143					pos_emitted = true;
4144					break;
4145				case TGSI_SEMANTIC_VIEWPORT_INDEX:
4146					/* spi_sid is 0 for outputs that are
4147					 * not consumed by PS */
4148					if (shader->output[i].spi_sid) {
4149						output[j].array_base = next_param_base++;
4150						output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PARAM;
4151						j++;
4152						memcpy(&output[j], &output[j-1], sizeof(struct r600_bytecode_output));
4153					}
4154					output[j].array_base = 61;
4155					output[j].swizzle_x = 7;
4156					output[j].swizzle_y = 7;
4157					output[j].swizzle_z = 7;
4158					output[j].swizzle_w = 0;
4159					output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS;
4160					pos_emitted = true;
4161					break;
4162				case TGSI_SEMANTIC_CLIPVERTEX:
4163					j--;
4164					break;
4165				case TGSI_SEMANTIC_CLIPDIST:
4166					output[j].array_base = next_clip_base++;
4167					output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS;
4168					pos_emitted = true;
4169					/* spi_sid is 0 for clipdistance outputs that were generated
4170					 * for clipvertex - we don't need to pass them to PS */
4171					if (shader->output[i].spi_sid) {
4172						j++;
4173						/* duplicate it as PARAM to pass to the pixel shader */
4174						memcpy(&output[j], &output[j-1], sizeof(struct r600_bytecode_output));
4175						output[j].array_base = next_param_base++;
4176						output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PARAM;
4177					}
4178					break;
4179				case TGSI_SEMANTIC_FOG:
4180					output[j].swizzle_y = 4; /* 0 */
4181					output[j].swizzle_z = 4; /* 0 */
4182					output[j].swizzle_w = 5; /* 1 */
4183					break;
4184				case TGSI_SEMANTIC_PRIMID:
4185					output[j].swizzle_x = 2;
4186					output[j].swizzle_y = 4; /* 0 */
4187					output[j].swizzle_z = 4; /* 0 */
4188					output[j].swizzle_w = 4; /* 0 */
4189					break;
4190				}
4191
4192				break;
4193			case PIPE_SHADER_FRAGMENT:
4194				if (shader->output[i].name == TGSI_SEMANTIC_COLOR) {
4195					/* never export more colors than the number of CBs */
4196					if (shader->output[i].sid >= max_color_exports) {
4197						/* skip export */
4198						j--;
4199						continue;
4200					}
4201					output[j].swizzle_w = key.ps.alpha_to_one ? 5 : 3;
4202					output[j].array_base = shader->output[i].sid;
4203					output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PIXEL;
4204					shader->nr_ps_color_exports++;
4205					shader->ps_color_export_mask |= (0xf << (shader->output[i].sid * 4));
4206
4207					/* If the i-th target format is set, all previous target formats must
4208					 * be non-zero to avoid hangs. - from radeonsi, seems to apply to eg as well.
4209					 */
4210					if (shader->output[i].sid > 0)
4211						for (unsigned x = 0; x < shader->output[i].sid; x++)
4212							shader->ps_color_export_mask |= (1 << (x*4));
4213
4214					if (shader->output[i].sid > shader->ps_export_highest)
4215						shader->ps_export_highest = shader->output[i].sid;
4216					if (shader->fs_write_all && (rscreen->b.gfx_level >= EVERGREEN)) {
4217						for (k = 1; k < max_color_exports; k++) {
4218							j++;
4219							memset(&output[j], 0, sizeof(struct r600_bytecode_output));
4220							output[j].gpr = shader->output[i].gpr;
4221							output[j].elem_size = 3;
4222							output[j].swizzle_x = 0;
4223							output[j].swizzle_y = 1;
4224							output[j].swizzle_z = 2;
4225							output[j].swizzle_w = key.ps.alpha_to_one ? 5 : 3;
4226							output[j].burst_count = 1;
4227							output[j].array_base = k;
4228							output[j].op = CF_OP_EXPORT;
4229							output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PIXEL;
4230							shader->nr_ps_color_exports++;
4231							if (k > shader->ps_export_highest)
4232								shader->ps_export_highest = k;
4233							shader->ps_color_export_mask |= (0xf << (j * 4));
4234						}
4235					}
4236				} else if (shader->output[i].name == TGSI_SEMANTIC_POSITION) {
4237					output[j].array_base = 61;
4238					output[j].swizzle_x = 2;
4239					output[j].swizzle_y = 7;
4240					output[j].swizzle_z = output[j].swizzle_w = 7;
4241					output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PIXEL;
4242				} else if (shader->output[i].name == TGSI_SEMANTIC_STENCIL) {
4243					output[j].array_base = 61;
4244					output[j].swizzle_x = 7;
4245					output[j].swizzle_y = 1;
4246					output[j].swizzle_z = output[j].swizzle_w = 7;
4247					output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PIXEL;
4248				} else if (shader->output[i].name == TGSI_SEMANTIC_SAMPLEMASK) {
4249					output[j].array_base = 61;
4250					output[j].swizzle_x = 7;
4251					output[j].swizzle_y = 7;
4252					output[j].swizzle_z = 0;
4253					output[j].swizzle_w = 7;
4254					output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PIXEL;
4255				} else {
4256					R600_ERR("unsupported fragment output name %d\n", shader->output[i].name);
4257					r = -EINVAL;
4258					goto out_err;
4259				}
4260				break;
4261			case PIPE_SHADER_TESS_CTRL:
4262				break;
4263			default:
4264				R600_ERR("unsupported processor type %d\n", ctx.type);
4265				r = -EINVAL;
4266				goto out_err;
4267			}
4268
4269			if (output[j].type == 0xffffffff) {
4270				output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PARAM;
4271				output[j].array_base = next_param_base++;
4272			}
4273		}
4274
4275		/* add fake position export */
4276		if ((ctx.type == PIPE_SHADER_VERTEX || ctx.type == PIPE_SHADER_TESS_EVAL) && pos_emitted == false) {
4277			memset(&output[j], 0, sizeof(struct r600_bytecode_output));
4278			output[j].gpr = 0;
4279			output[j].elem_size = 3;
4280			output[j].swizzle_x = 7;
4281			output[j].swizzle_y = 7;
4282			output[j].swizzle_z = 7;
4283			output[j].swizzle_w = 7;
4284			output[j].burst_count = 1;
4285			output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS;
4286			output[j].array_base = 60;
4287			output[j].op = CF_OP_EXPORT;
4288			j++;
4289		}
4290
4291		/* add fake param output for vertex shader if no param is exported */
4292		if ((ctx.type == PIPE_SHADER_VERTEX || ctx.type == PIPE_SHADER_TESS_EVAL) && next_param_base == 0) {
4293			memset(&output[j], 0, sizeof(struct r600_bytecode_output));
4294			output[j].gpr = 0;
4295			output[j].elem_size = 3;
4296			output[j].swizzle_x = 7;
4297			output[j].swizzle_y = 7;
4298			output[j].swizzle_z = 7;
4299			output[j].swizzle_w = 7;
4300			output[j].burst_count = 1;
4301			output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PARAM;
4302			output[j].array_base = 0;
4303			output[j].op = CF_OP_EXPORT;
4304			j++;
4305		}
4306
4307		/* add fake pixel export */
4308		if (ctx.type == PIPE_SHADER_FRAGMENT && shader->nr_ps_color_exports == 0) {
4309			memset(&output[j], 0, sizeof(struct r600_bytecode_output));
4310			output[j].gpr = 0;
4311			output[j].elem_size = 3;
4312			output[j].swizzle_x = 7;
4313			output[j].swizzle_y = 7;
4314			output[j].swizzle_z = 7;
4315			output[j].swizzle_w = 7;
4316			output[j].burst_count = 1;
4317			output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PIXEL;
4318			output[j].array_base = 0;
4319			output[j].op = CF_OP_EXPORT;
4320			j++;
4321			shader->nr_ps_color_exports++;
4322			shader->ps_color_export_mask = 0xf;
4323		}
4324
4325		noutput = j;
4326
4327		/* set export done on last export of each type */
4328		for (k = noutput - 1, output_done = 0; k >= 0; k--) {
4329			if (!(output_done & (1 << output[k].type))) {
4330				output_done |= (1 << output[k].type);
4331				output[k].op = CF_OP_EXPORT_DONE;
4332			}
4333		}
4334		/* add output to bytecode */
4335		for (i = 0; i < noutput; i++) {
4336			r = r600_bytecode_add_output(ctx.bc, &output[i]);
4337			if (r)
4338				goto out_err;
4339		}
4340	}
4341
4342	/* add program end */
4343	if (ctx.bc->gfx_level == CAYMAN)
4344		cm_bytecode_add_cf_end(ctx.bc);
4345	else {
4346		const struct cf_op_info *last = NULL;
4347
4348		if (ctx.bc->cf_last)
4349			last = r600_isa_cf(ctx.bc->cf_last->op);
4350
4351		/* alu clause instructions don't have EOP bit, so add NOP */
4352		if (!last || last->flags & CF_ALU || ctx.bc->cf_last->op == CF_OP_LOOP_END || ctx.bc->cf_last->op == CF_OP_POP)
4353			r600_bytecode_add_cfinst(ctx.bc, CF_OP_NOP);
4354
4355		ctx.bc->cf_last->end_of_program = 1;
4356	}
4357
4358	/* check GPR limit - we have 124 = 128 - 4
4359	 * (4 are reserved as alu clause temporary registers) */
4360	if (ctx.bc->ngpr > 124) {
4361		R600_ERR("GPR limit exceeded - shader requires %d registers\n", ctx.bc->ngpr);
4362		r = -ENOMEM;
4363		goto out_err;
4364	}
4365
4366	if (ctx.type == PIPE_SHADER_GEOMETRY) {
4367		if ((r = generate_gs_copy_shader(rctx, pipeshader, &so)))
4368			return r;
4369	}
4370
4371	free(ctx.spilled_arrays);
4372	free(ctx.array_infos);
4373	free(ctx.literals);
4374	tgsi_parse_free(&ctx.parse);
4375	return 0;
4376out_err:
4377	free(ctx.spilled_arrays);
4378	free(ctx.array_infos);
4379	free(ctx.literals);
4380	tgsi_parse_free(&ctx.parse);
4381	return r;
4382}
4383
4384static int tgsi_unsupported(struct r600_shader_ctx *ctx)
4385{
4386	const unsigned tgsi_opcode =
4387		ctx->parse.FullToken.FullInstruction.Instruction.Opcode;
4388	R600_ERR("%s tgsi opcode unsupported\n",
4389		 tgsi_get_opcode_name(tgsi_opcode));
4390	return -EINVAL;
4391}
4392
/* Handler for the END token: nothing to emit here, end-of-program
 * handling is done by the caller after the token loop. */
static int tgsi_end(struct r600_shader_ctx *ctx UNUSED)
{
	return 0;
}
4397
4398static void r600_bytecode_src(struct r600_bytecode_alu_src *bc_src,
4399			const struct r600_shader_src *shader_src,
4400			unsigned chan)
4401{
4402	bc_src->sel = shader_src->sel;
4403	bc_src->chan = shader_src->swizzle[chan];
4404	bc_src->neg = shader_src->neg;
4405	bc_src->abs = shader_src->abs;
4406	bc_src->rel = shader_src->rel;
4407	bc_src->value = shader_src->value[bc_src->chan];
4408	bc_src->kc_bank = shader_src->kc_bank;
4409	bc_src->kc_rel = shader_src->kc_rel;
4410}
4411
4412static void r600_bytecode_src_set_abs(struct r600_bytecode_alu_src *bc_src)
4413{
4414	bc_src->abs = 1;
4415	bc_src->neg = 0;
4416}
4417
4418static void r600_bytecode_src_toggle_neg(struct r600_bytecode_alu_src *bc_src)
4419{
4420	bc_src->neg = !bc_src->neg;
4421}
4422
/* Fill in a bytecode ALU destination from a TGSI destination register.
 *
 * Regular temporaries are mapped directly to a GPR index.  Spilled
 * temporaries are redirected to a scratch GPR and a pending MEM_SCRATCH
 * store (plus a write-ack) is queued so the value reaches the spill
 * area; pending outputs are flushed per instruction group.  All other
 * register files map to Index + per-file GPR offset.
 */
static void tgsi_dst(struct r600_shader_ctx *ctx,
		     const struct tgsi_full_dst_register *tgsi_dst,
		     unsigned swizzle,
		     struct r600_bytecode_alu_dst *r600_dst)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;

	if (tgsi_dst->Register.File == TGSI_FILE_TEMPORARY) {
		bool spilled;
		unsigned idx;

		idx = map_tgsi_reg_index_to_r600_gpr(ctx, tgsi_dst->Register.Index, &spilled);

		if (spilled) {
			struct r600_bytecode_output cf;
			int reg = 0;
			int r;
			bool add_pending_output = true;

			memset(&cf, 0, sizeof(struct r600_bytecode_output));
			get_spilled_array_base_and_size(ctx, tgsi_dst->Register.Index,
				&cf.array_base, &cf.array_size);

			/* If no component has spilled, reserve a register and add the spill code
			 *  ctx->bc->n_pending_outputs is cleared after each instruction group */
			if (ctx->bc->n_pending_outputs == 0) {
				reg = r600_get_temp(ctx);
			} else {
				/* If we are already spilling and the output address is the same like
				* before then just reuse the same slot */
				struct r600_bytecode_output *tmpl = &ctx->bc->pending_outputs[ctx->bc->n_pending_outputs-1];
				if ((cf.array_base + idx == tmpl->array_base) ||
				    (cf.array_base == tmpl->array_base &&
				     tmpl->index_gpr == ctx->bc->ar_reg &&
				     tgsi_dst->Register.Indirect)) {
					reg = ctx->bc->pending_outputs[0].gpr;
					add_pending_output = false;
				} else {
					reg = r600_get_temp(ctx);
				}
			}

			/* redirect the ALU write to the scratch GPR */
			r600_dst->sel = reg;
			r600_dst->chan = swizzle;
			r600_dst->write = 1;
			if (inst->Instruction.Saturate) {
				r600_dst->clamp = 1;
			}

			/* Add new outputs as pending */
			if (add_pending_output) {
				cf.op = CF_OP_MEM_SCRATCH;
				cf.elem_size = 3;
				cf.gpr = reg;
				cf.type = r600_bytecode_write_export_ack_type(ctx->bc, tgsi_dst->Register.Indirect);
				cf.mark = 1;
				cf.comp_mask = inst->Dst[0].Register.WriteMask;
				cf.swizzle_x = 0;
				cf.swizzle_y = 1;
				cf.swizzle_z = 2;
				cf.swizzle_w = 3;
				cf.burst_count = 1;

				if (tgsi_dst->Register.Indirect) {
					/* indirect store: address comes from the AR register */
					cf.index_gpr = ctx->bc->ar_reg;
				} else {
					cf.array_base += idx;
					cf.array_size = 0;
				}

				r = r600_bytecode_add_pending_output(ctx->bc, &cf);
				if (r)
					return;

				r600_bytecode_add_ack(ctx->bc);
			}
			return;
		}
		else {
			r600_dst->sel = idx;
		}
	}
	else {
		r600_dst->sel = tgsi_dst->Register.Index;
		r600_dst->sel += ctx->file_offset[tgsi_dst->Register.File];
	}
	r600_dst->chan = swizzle;
	r600_dst->write = 1;
	if (inst->Instruction.Saturate) {
		r600_dst->clamp = 1;
	}
	if (ctx->type == PIPE_SHADER_TESS_CTRL) {
		if (tgsi_dst->Register.File == TGSI_FILE_OUTPUT) {
			/* TCS outputs are stored separately after the instruction
			 * (presumably via r600_store_tcs_output) — no rel flag here */
			return;
		}
	}
	if (tgsi_dst->Register.Indirect)
		r600_dst->rel = V_SQ_REL_RELATIVE;

}
4523
/* Emit a two-source 64-bit ALU operation.
 *
 * 64-bit values occupy channel pairs (xy = one double, zw = another).
 * @singledest   the op produces a single 64-bit result: the TGSI
 *               single-component write mask is widened to the channel
 *               pair that must be computed; when source swizzle and
 *               destination pair disagree the result goes through
 *               ctx->temp_reg first (use_tmp, see the switch below).
 * @swap         exchange src0 and src1.
 * @dest_temp    if non-zero, write the result to this GPR instead of
 *               the TGSI destination.
 * @op_override  if non-zero, use this ALU opcode instead of the one
 *               from ctx->inst_info.
 */
static int tgsi_op2_64_params(struct r600_shader_ctx *ctx, bool singledest, bool swap, int dest_temp, int op_override)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	unsigned write_mask = inst->Dst[0].Register.WriteMask;
	struct r600_bytecode_alu alu;
	int i, j, r, lasti = tgsi_last_instruction(write_mask);
	int use_tmp = 0;
	int swizzle_x = inst->Src[0].Register.SwizzleX;

	if (singledest) {
		/* widen the mask to a full 64-bit pair; a non-zero use_tmp
		 * means the result is computed into ctx->temp_reg and the
		 * scalar in temp channel (use_tmp - 1) is copied out below */
		switch (write_mask) {
		case 0x1:
			if (swizzle_x == 2) {
				write_mask = 0xc;
				use_tmp = 3;
			} else
				write_mask = 0x3;
			break;
		case 0x2:
			if (swizzle_x == 2) {
				write_mask = 0xc;
				use_tmp = 3;
			} else {
				write_mask = 0x3;
				use_tmp = 1;
			}
			break;
		case 0x4:
			if (swizzle_x == 0) {
				write_mask = 0x3;
				use_tmp = 1;
			} else
				write_mask = 0xc;
			break;
		case 0x8:
			if (swizzle_x == 0) {
				write_mask = 0x3;
				use_tmp = 1;
			} else {
				write_mask = 0xc;
				use_tmp = 3;
			}
			break;
		}
	}

	lasti = tgsi_last_instruction(write_mask);
	for (i = 0; i <= lasti; i++) {

		if (!(write_mask & (1 << i)))
			continue;

		memset(&alu, 0, sizeof(struct r600_bytecode_alu));

		if (singledest) {
			if (use_tmp || dest_temp) {
				alu.dst.sel = use_tmp ? ctx->temp_reg : dest_temp;
				alu.dst.chan = i;
				alu.dst.write = 1;
			} else {
				tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
			}
			/* only the low dword of each pair is written directly */
			if (i == 1 || i == 3)
				alu.dst.write = 0;
		} else
			tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);

		alu.op = op_override ? op_override : ctx->inst_info->op;
		if (ctx->parse.FullToken.FullInstruction.Instruction.Opcode == TGSI_OPCODE_DABS) {
			r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
		} else if (!swap) {
			for (j = 0; j < inst->Instruction.NumSrcRegs; j++) {
				/* fp64_switch reorders channels for 64-bit operands */
				r600_bytecode_src(&alu.src[j], &ctx->src[j], fp64_switch(i));
			}
		} else {
			r600_bytecode_src(&alu.src[0], &ctx->src[1], fp64_switch(i));
			r600_bytecode_src(&alu.src[1], &ctx->src[0], fp64_switch(i));
		}

		/* handle some special cases */
		if (i == 1 || i == 3) {
			switch (ctx->parse.FullToken.FullInstruction.Instruction.Opcode) {
			case TGSI_OPCODE_DABS:
				/* DABS: clear the sign on the high dword only */
				r600_bytecode_src_set_abs(&alu.src[0]);
				break;
			default:
				break;
			}
		}
		if (i == lasti) {
			alu.last = 1;
		}
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}

	if (use_tmp) {
		write_mask = inst->Dst[0].Register.WriteMask;

		lasti = tgsi_last_instruction(write_mask);
		/* move result from temp to dst */
		for (i = 0; i <= lasti; i++) {
			if (!(write_mask & (1 << i)))
				continue;

			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
			alu.op = ALU_OP1_MOV;

			if (dest_temp) {
				alu.dst.sel = dest_temp;
				alu.dst.chan = i;
				alu.dst.write = 1;
			} else
				tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
			/* copy the scalar result from temp channel use_tmp-1 */
			alu.src[0].sel = ctx->temp_reg;
			alu.src[0].chan = use_tmp - 1;
			alu.last = (i == lasti);

			r = r600_bytecode_add_alu(ctx->bc, &alu);
			if (r)
				return r;
		}
	}
	return 0;
}
4650
4651static int tgsi_op2_64(struct r600_shader_ctx *ctx)
4652{
4653	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
4654	unsigned write_mask = inst->Dst[0].Register.WriteMask;
4655	/* confirm writemasking */
4656	if ((write_mask & 0x3) != 0x3 &&
4657	    (write_mask & 0xc) != 0xc) {
4658		fprintf(stderr, "illegal writemask for 64-bit: 0x%x\n", write_mask);
4659		return -1;
4660	}
4661	return tgsi_op2_64_params(ctx, false, false, 0, 0);
4662}
4663
/* 64-bit op producing a single 64-bit result, sources in instruction
 * order (tgsi_op2_64_params with singledest = true). */
static int tgsi_op2_64_single_dest(struct r600_shader_ctx *ctx)
{
	return tgsi_op2_64_params(ctx, true, false, 0, 0);
}
4668
/* Single-destination 64-bit op with the two sources swapped. */
static int tgsi_op2_64_single_dest_s(struct r600_shader_ctx *ctx)
{
	return tgsi_op2_64_params(ctx, true, true, 0, 0);
}
4673
4674static int tgsi_op3_64(struct r600_shader_ctx *ctx)
4675{
4676	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
4677	struct r600_bytecode_alu alu;
4678	int i, j, r;
4679	int lasti = 3;
4680	int tmp = r600_get_temp(ctx);
4681
4682	for (i = 0; i < lasti + 1; i++) {
4683
4684		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4685		alu.op = ctx->inst_info->op;
4686		for (j = 0; j < inst->Instruction.NumSrcRegs; j++) {
4687			r600_bytecode_src(&alu.src[j], &ctx->src[j], i == 3 ? 0 : 1);
4688		}
4689
4690		if (inst->Dst[0].Register.WriteMask & (1 << i))
4691			tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
4692		else
4693			alu.dst.sel = tmp;
4694
4695		alu.dst.chan = i;
4696		alu.is_op3 = 1;
4697		if (i == lasti) {
4698			alu.last = 1;
4699		}
4700		r = r600_bytecode_add_alu(ctx->bc, &alu);
4701		if (r)
4702			return r;
4703	}
4704	return 0;
4705}
4706
/* Generic handler for per-channel two-source ALU instructions.
 *
 * @swap        emit src1, src0 instead of src0, src1.
 * @trans_only  the op runs only in the transcendental slot: every
 *              channel closes its own instruction group, and when more
 *              than one component is written the results are staged in
 *              ctx->temp_reg and copied out afterwards.
 */
static int tgsi_op2_s(struct r600_shader_ctx *ctx, int swap, int trans_only)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	struct r600_bytecode_alu alu;
	unsigned write_mask = inst->Dst[0].Register.WriteMask;
	int i, j, r, lasti = tgsi_last_instruction(write_mask);
	/* use temp register if trans_only and more than one dst component */
	int use_tmp = trans_only && (write_mask ^ (1 << lasti));
	unsigned op = ctx->inst_info->op;

	/* legacy math rules select the non-IEEE multiply variant */
	if (op == ALU_OP2_MUL_IEEE &&
	    ctx->info.properties[TGSI_PROPERTY_LEGACY_MATH_RULES])
		op = ALU_OP2_MUL;

	/* nir_to_tgsi lowers nir_op_isub to UADD + negate, since r600 doesn't support
	 * source modifiers with integer ops we switch back to SUB_INT */
	bool src1_neg = ctx->src[1].neg;
	if (op == ALU_OP2_ADD_INT && src1_neg) {
		src1_neg = false;
		op = ALU_OP2_SUB_INT;
	}

	for (i = 0; i <= lasti; i++) {
		if (!(write_mask & (1 << i)))
			continue;

		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		if (use_tmp) {
			alu.dst.sel = ctx->temp_reg;
			alu.dst.chan = i;
			alu.dst.write = 1;
		} else
			tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);

		alu.op = op;
		if (!swap) {
			for (j = 0; j < inst->Instruction.NumSrcRegs; j++) {
				r600_bytecode_src(&alu.src[j], &ctx->src[j], i);
			}
			/* re-apply (or clear) the negate handled above */
			alu.src[1].neg = src1_neg;
		} else {
			r600_bytecode_src(&alu.src[0], &ctx->src[1], i);
			r600_bytecode_src(&alu.src[1], &ctx->src[0], i);
		}
		/* trans-slot ops end the group after every channel */
		if (i == lasti || trans_only) {
			alu.last = 1;
		}
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}

	if (use_tmp) {
		/* move result from temp to dst */
		for (i = 0; i <= lasti; i++) {
			if (!(write_mask & (1 << i)))
				continue;

			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
			alu.op = ALU_OP1_MOV;
			tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
			alu.src[0].sel = ctx->temp_reg;
			alu.src[0].chan = i;
			alu.last = (i == lasti);

			r = r600_bytecode_add_alu(ctx->bc, &alu);
			if (r)
				return r;
		}
	}
	return 0;
}
4779
/* Standard two-source op: operands in order, vector-slot capable. */
static int tgsi_op2(struct r600_shader_ctx *ctx)
{
	return tgsi_op2_s(ctx, 0, 0);
}
4784
/* Two-source op with src0 and src1 exchanged. */
static int tgsi_op2_swap(struct r600_shader_ctx *ctx)
{
	return tgsi_op2_s(ctx, 1, 0);
}
4789
/* Two-source op restricted to the transcendental (trans) slot. */
static int tgsi_op2_trans(struct r600_shader_ctx *ctx)
{
	return tgsi_op2_s(ctx, 0, 1);
}
4794
4795static int tgsi_ineg(struct r600_shader_ctx *ctx)
4796{
4797	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
4798	struct r600_bytecode_alu alu;
4799	int i, r;
4800	int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
4801
4802	for (i = 0; i < lasti + 1; i++) {
4803
4804		if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
4805			continue;
4806		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4807		alu.op = ctx->inst_info->op;
4808
4809		alu.src[0].sel = V_SQ_ALU_SRC_0;
4810
4811		r600_bytecode_src(&alu.src[1], &ctx->src[0], i);
4812
4813		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
4814
4815		if (i == lasti) {
4816			alu.last = 1;
4817		}
4818		r = r600_bytecode_add_alu(ctx->bc, &alu);
4819		if (r)
4820			return r;
4821	}
4822	return 0;
4823
4824}
4825
4826static int tgsi_dneg(struct r600_shader_ctx *ctx)
4827{
4828	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
4829	struct r600_bytecode_alu alu;
4830	int i, r;
4831	int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
4832
4833	for (i = 0; i < lasti + 1; i++) {
4834
4835		if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
4836			continue;
4837		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4838		alu.op = ALU_OP1_MOV;
4839
4840		r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
4841
4842		if (i == 1 || i == 3)
4843			r600_bytecode_src_toggle_neg(&alu.src[0]);
4844		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
4845
4846		if (i == lasti) {
4847			alu.last = 1;
4848		}
4849		r = r600_bytecode_add_alu(ctx->bc, &alu);
4850		if (r)
4851			return r;
4852	}
4853	return 0;
4854
4855}
4856
/* DFRACEXP: split a double into significand (Dst[0]) and exponent
 * (Dst[1]).  The raw op is emitted into all four temp channels, then
 * the results are copied out according to the two write masks. */
static int tgsi_dfracexp(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	struct r600_bytecode_alu alu;
	unsigned write_mask = inst->Dst[0].Register.WriteMask;
	int i, j, r;

	/* emit the op on all four channels into temp_reg */
	for (i = 0; i <= 3; i++) {
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ctx->inst_info->op;

		alu.dst.sel = ctx->temp_reg;
		alu.dst.chan = i;
		alu.dst.write = 1;
		for (j = 0; j < inst->Instruction.NumSrcRegs; j++) {
			/* fp64_switch reorders channels for 64-bit operands */
			r600_bytecode_src(&alu.src[j], &ctx->src[j], fp64_switch(i));
		}

		if (i == 3)
			alu.last = 1;

		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}

	/* Replicate significand result across channels. */
	for (i = 0; i <= 3; i++) {
		if (!(write_mask & (1 << i)))
			continue;

		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP1_MOV;
		/* significand sits in temp channels 2 (low) / 3 (high) */
		alu.src[0].chan = (i & 1) + 2;
		alu.src[0].sel = ctx->temp_reg;

		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
		alu.dst.write = 1;
		alu.last = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}

	for (i = 0; i <= 3; i++) {
		if (inst->Dst[1].Register.WriteMask & (1 << i)) {
			/* MOV third channels to writemask dst1 */
			/* NOTE(review): reads temp channel 1 for the exponent,
			 * despite the "third channels" wording above — confirm
			 * against the hardware op's output channel layout */
			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
			alu.op = ALU_OP1_MOV;
			alu.src[0].chan = 1;
			alu.src[0].sel = ctx->temp_reg;

			tgsi_dst(ctx, &inst->Dst[1], i, &alu.dst);
			alu.last = 1;
			r = r600_bytecode_add_alu(ctx->bc, &alu);
			if (r)
				return r;
			break;
		}
	}
	return 0;
}
4919
4920
/* I2D/U2D on Evergreen/Cayman: convert a 32-bit (un)signed integer to a
 * 64-bit double.  The int->float ops only preserve 24 bits of mantissa, so
 * the integer is split into a high 24-bit part and a low 8-bit part, each
 * converted to float and then to double, and the two doubles are summed. */
static int egcm_int_to_double(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	struct r600_bytecode_alu alu;
	int i, c, r;
	int write_mask = inst->Dst[0].Register.WriteMask;
	int temp_reg = r600_get_temp(ctx);

	assert(inst->Instruction.Opcode == TGSI_OPCODE_I2D ||
		inst->Instruction.Opcode == TGSI_OPCODE_U2D);

	/* Each double result occupies a channel pair; c indexes the pair. */
	for (c = 0; c < 2; c++) {
		int dchan = c * 2;
		if (write_mask & (0x3 << dchan)) {
			/* split into 24-bit int and 8-bit int */
			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
			alu.op = ALU_OP2_AND_INT;
			alu.dst.sel = temp_reg;
			alu.dst.chan = dchan;
			r600_bytecode_src(&alu.src[0], &ctx->src[0], c);
			alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
			alu.src[1].value = 0xffffff00; /* high 24 bits */
			alu.dst.write = 1;
			r = r600_bytecode_add_alu(ctx->bc, &alu);
			if (r)
				return r;

			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
			alu.op = ALU_OP2_AND_INT;
			alu.dst.sel = temp_reg;
			alu.dst.chan = dchan + 1;
			r600_bytecode_src(&alu.src[0], &ctx->src[0], c);
			alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
			alu.src[1].value = 0xff; /* low 8 bits */
			alu.dst.write = 1;
			alu.last = 1;
			r = r600_bytecode_add_alu(ctx->bc, &alu);
			if (r)
				return r;
		}
	}

	/* Convert both parts to float.  The high part uses the instruction's
	 * own conversion op (signed or unsigned); the low 8-bit part is always
	 * non-negative, so UINT_TO_FLT is used for it. */
	for (c = 0; c < 2; c++) {
		int dchan = c * 2;
		if (write_mask & (0x3 << dchan)) {
			for (i = dchan; i <= dchan + 1; i++) {
				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
				alu.op = i == dchan ? ctx->inst_info->op : ALU_OP1_UINT_TO_FLT;

				alu.src[0].sel = temp_reg;
				alu.src[0].chan = i;
				alu.dst.sel = temp_reg;
				alu.dst.chan = i;
				alu.dst.write = 1;
				if (ctx->bc->gfx_level == CAYMAN)
					alu.last = i == dchan + 1;
				else
					alu.last = 1; /* trans only ops on evergreen */

				r = r600_bytecode_add_alu(ctx->bc, &alu);
				if (r)
					return r;
			}
		}
	}

	/* Widen the two floats to doubles in ctx->temp_reg (high part in
	 * channels 0/1, low part in 2/3); the odd slots feed the literal 0
	 * as the other half of the FLT32_TO_FLT64 input. */
	for (c = 0; c < 2; c++) {
		int dchan = c * 2;
		if (write_mask & (0x3 << dchan)) {
			for (i = 0; i < 4; i++) {
				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
				alu.op = ALU_OP1_FLT32_TO_FLT64;

				alu.src[0].chan = dchan + (i / 2);
				if (i == 0 || i == 2)
					alu.src[0].sel = temp_reg;
				else {
					alu.src[0].sel = V_SQ_ALU_SRC_LITERAL;
					alu.src[0].value = 0x0;
				}
				alu.dst.sel = ctx->temp_reg;
				alu.dst.chan = i;
				alu.last = i == 3;
				alu.dst.write = 1;

				r = r600_bytecode_add_alu(ctx->bc, &alu);
				if (r)
					return r;
			}

			/* dst = high-part double + low-part double; the 64-bit
			 * add reads each pair with its halves swapped
			 * (fp64_switch). */
			for (i = 0; i <= 1; i++) {
				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
				alu.op = ALU_OP2_ADD_64;

				alu.src[0].chan = fp64_switch(i);
				alu.src[0].sel = ctx->temp_reg;

				alu.src[1].chan = fp64_switch(i + 2);
				alu.src[1].sel = ctx->temp_reg;
				tgsi_dst(ctx, &inst->Dst[0], dchan + i, &alu.dst);
				alu.last = i == 1;

				r = r600_bytecode_add_alu(ctx->bc, &alu);
				if (r)
					return r;
			}
		}
	}

	return 0;
}
5032
5033static int egcm_double_to_int(struct r600_shader_ctx *ctx)
5034{
5035	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
5036	struct r600_bytecode_alu alu;
5037	int i, r;
5038	int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
5039	int treg = r600_get_temp(ctx);
5040	assert(inst->Instruction.Opcode == TGSI_OPCODE_D2I ||
5041		inst->Instruction.Opcode == TGSI_OPCODE_D2U);
5042
5043	/* do a 64->32 into a temp register */
5044	r = tgsi_op2_64_params(ctx, true, false, treg, ALU_OP1_FLT64_TO_FLT32);
5045	if (r)
5046		return r;
5047
5048	for (i = 0; i <= lasti; i++) {
5049		if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
5050			continue;
5051		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5052		alu.op = ctx->inst_info->op;
5053
5054		alu.src[0].chan = i;
5055		alu.src[0].sel = treg;
5056		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
5057		alu.last = (i == lasti);
5058
5059		r = r600_bytecode_add_alu(ctx->bc, &alu);
5060		if (r)
5061			return r;
5062	}
5063
5064	return 0;
5065}
5066
/* Emit a unary double-precision op (e.g. RECIP_64, RECIPSQRT_64).
 *
 * @bc       bytecode stream to append to
 * @op       ALU opcode to emit
 * @dst_reg  register receiving the 64-bit result in channels X/Y
 * @src      double operand, read from channels 0/1
 * @abs      take |src| (used by DRSQ/DSQRT callers)
 */
static int cayman_emit_unary_double_raw(struct r600_bytecode *bc,
					unsigned op,
					int dst_reg,
					struct r600_shader_src *src,
					bool abs)
{
	struct r600_bytecode_alu alu;
	const int last_slot = 3;
	int r;

	/* these have to write the result to X/Y by the looks of it */
	for (int i = 0 ; i < last_slot; i++) {
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = op;

		/* the operand halves are fed swapped: chan 1 into src slot 0,
		 * chan 0 into src slot 1 */
		r600_bytecode_src(&alu.src[0], src, 1);
		r600_bytecode_src(&alu.src[1], src, 0);

		if (abs)
			r600_bytecode_src_set_abs(&alu.src[1]);

		alu.dst.sel = dst_reg;
		alu.dst.chan = i;
		/* only the X/Y pair of the result is kept */
		alu.dst.write = (i == 0 || i == 1);

		/* Cayman groups the three slots; other parts close each op */
		if (bc->gfx_level != CAYMAN || i == last_slot - 1)
			alu.last = 1;
		r = r600_bytecode_add_alu(bc, &alu);
		if (r)
			return r;
	}

	return 0;
}
5101
/* Emit a unary double op and move the X/Y result pair of ctx->temp_reg to
 * the destination channel pair selected by the write mask (XY or ZW). */
static int cayman_emit_double_instr(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	int i, r;
	struct r600_bytecode_alu alu;
	int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
	int t1 = ctx->temp_reg;

	/* should only be one src regs */
	assert(inst->Instruction.NumSrcRegs == 1);

	/* only support one double at a time */
	assert(inst->Dst[0].Register.WriteMask == TGSI_WRITEMASK_XY ||
	       inst->Dst[0].Register.WriteMask == TGSI_WRITEMASK_ZW);

	/* DRSQ/DSQRT take |src| to keep the input non-negative */
	r = cayman_emit_unary_double_raw(
		ctx->bc, ctx->inst_info->op, t1,
		&ctx->src[0],
		ctx->parse.FullToken.FullInstruction.Instruction.Opcode == TGSI_OPCODE_DRSQ ||
		ctx->parse.FullToken.FullInstruction.Instruction.Opcode == TGSI_OPCODE_DSQRT);
	if (r)
		return r;

	/* copy t1.xy to the written pair: even dest channels read t1.x,
	 * odd ones t1.y */
	for (i = 0 ; i <= lasti; i++) {
		if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
			continue;
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP1_MOV;
		alu.src[0].sel = t1;
		alu.src[0].chan = (i == 0 || i == 2) ? 0 : 1;
		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
		alu.dst.write = 1;
		if (i == lasti)
			alu.last = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}
	return 0;
}
5142
5143static int cayman_emit_float_instr(struct r600_shader_ctx *ctx)
5144{
5145	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
5146	int i, j, r;
5147	struct r600_bytecode_alu alu;
5148	int last_slot = (inst->Dst[0].Register.WriteMask & 0x8) ? 4 : 3;
5149
5150	for (i = 0 ; i < last_slot; i++) {
5151		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5152		alu.op = ctx->inst_info->op;
5153		for (j = 0; j < inst->Instruction.NumSrcRegs; j++) {
5154			r600_bytecode_src(&alu.src[j], &ctx->src[j], 0);
5155
5156			/* RSQ should take the absolute value of src */
5157			if (inst->Instruction.Opcode == TGSI_OPCODE_RSQ) {
5158				r600_bytecode_src_set_abs(&alu.src[j]);
5159			}
5160		}
5161		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
5162		alu.dst.write = (inst->Dst[0].Register.WriteMask >> i) & 1;
5163
5164		if (i == last_slot - 1)
5165			alu.last = 1;
5166		r = r600_bytecode_add_alu(ctx->bc, &alu);
5167		if (r)
5168			return r;
5169	}
5170	return 0;
5171}
5172
5173static int cayman_mul_int_instr(struct r600_shader_ctx *ctx)
5174{
5175	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
5176	int i, j, k, r;
5177	struct r600_bytecode_alu alu;
5178	int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
5179	int t1 = ctx->temp_reg;
5180
5181	for (k = 0; k <= lasti; k++) {
5182		if (!(inst->Dst[0].Register.WriteMask & (1 << k)))
5183			continue;
5184
5185		for (i = 0 ; i < 4; i++) {
5186			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5187			alu.op = ctx->inst_info->op;
5188			for (j = 0; j < inst->Instruction.NumSrcRegs; j++) {
5189				r600_bytecode_src(&alu.src[j], &ctx->src[j], k);
5190			}
5191			alu.dst.sel = t1;
5192			alu.dst.chan = i;
5193			alu.dst.write = (i == k);
5194			if (i == 3)
5195				alu.last = 1;
5196			r = r600_bytecode_add_alu(ctx->bc, &alu);
5197			if (r)
5198				return r;
5199		}
5200	}
5201
5202	for (i = 0 ; i <= lasti; i++) {
5203		if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
5204			continue;
5205		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5206		alu.op = ALU_OP1_MOV;
5207		alu.src[0].sel = t1;
5208		alu.src[0].chan = i;
5209		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
5210		alu.dst.write = 1;
5211		if (i == lasti)
5212			alu.last = 1;
5213		r = r600_bytecode_add_alu(ctx->bc, &alu);
5214		if (r)
5215			return r;
5216	}
5217
5218	return 0;
5219}
5220
5221
/* DMUL on Cayman: multiply one pair of doubles.  The MUL_64 op is issued in
 * all four slots; slots 0-2 read channel k*2+1 of each operand and slot 3
 * reads channel k*2.  The result lands in t1 and is then moved to the
 * destination channel pair. */
static int cayman_mul_double_instr(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	int i, j, k, r;
	struct r600_bytecode_alu alu;
	int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
	int t1 = ctx->temp_reg;

	/* t1 would get overwritten below if we actually tried to
	 * multiply two pairs of doubles at a time. */
	assert(inst->Dst[0].Register.WriteMask == TGSI_WRITEMASK_XY ||
	       inst->Dst[0].Register.WriteMask == TGSI_WRITEMASK_ZW);

	/* k selects the operand/destination channel pair: 0 for XY, 1 for ZW */
	k = inst->Dst[0].Register.WriteMask == TGSI_WRITEMASK_XY ? 0 : 1;

	for (i = 0; i < 4; i++) {
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ctx->inst_info->op;
		for (j = 0; j < inst->Instruction.NumSrcRegs; j++) {
			r600_bytecode_src(&alu.src[j], &ctx->src[j], k * 2 + ((i == 3) ? 0 : 1));
		}
		alu.dst.sel = t1;
		alu.dst.chan = i;
		alu.dst.write = 1;
		if (i == 3)
			alu.last = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}

	/* copy the result from t1 to the written destination channels */
	for (i = 0; i <= lasti; i++) {
		if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
			continue;
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP1_MOV;
		alu.src[0].sel = t1;
		alu.src[0].chan = i;
		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
		alu.dst.write = 1;
		if (i == lasti)
			alu.last = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}

	return 0;
}
5271
5272/*
5273 * Emit RECIP_64 + MUL_64 to implement division.
5274 */
static int cayman_ddiv_instr(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	int r;
	struct r600_bytecode_alu alu;
	int t1 = ctx->temp_reg;
	int k;

	/* Only support one double at a time. This is the same constraint as
	 * in DMUL lowering. */
	assert(inst->Dst[0].Register.WriteMask == TGSI_WRITEMASK_XY ||
	       inst->Dst[0].Register.WriteMask == TGSI_WRITEMASK_ZW);

	/* k selects the source/destination channel pair: 0 for XY, 1 for ZW */
	k = inst->Dst[0].Register.WriteMask == TGSI_WRITEMASK_XY ? 0 : 1;

	/* t1.xy = RECIP_64(src1) */
	r = cayman_emit_unary_double_raw(ctx->bc, ALU_OP2_RECIP_64, t1, &ctx->src[1], false);
	if (r)
		return r;

	/* t1 = src0 * t1: MUL_64 issued in all four slots; slots 0-2 read
	 * channel k*2+1 of src0 (and t1.y), slot 3 channel k*2 (and t1.x) */
	for (int i = 0; i < 4; i++) {
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP2_MUL_64;

		r600_bytecode_src(&alu.src[0], &ctx->src[0], k * 2 + ((i == 3) ? 0 : 1));

		alu.src[1].sel = t1;
		alu.src[1].chan = (i == 3) ? 0 : 1;

		alu.dst.sel = t1;
		alu.dst.chan = i;
		alu.dst.write = 1;
		if (i == 3)
			alu.last = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}

	/* move the quotient pair t1.xy to the destination channel pair */
	for (int i = 0; i < 2; i++) {
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP1_MOV;
		alu.src[0].sel = t1;
		alu.src[0].chan = i;
		tgsi_dst(ctx, &inst->Dst[0], k * 2 + i, &alu.dst);
		alu.dst.write = 1;
		if (i == 1)
			alu.last = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}
	return 0;
}
5328
5329/*
5330 * r600 - trunc to -PI..PI range
5331 * r700 - normalize by dividing by 2PI
5332 * see fdo bug 27901
5333 */
/* Load the reduced trig argument into temp_reg.x:
 *   tmp = fract(src * (0.5 / PI) + 0.5)      -> maps the angle into [0, 1)
 * then rescale per gfx level (see the comment block above / fdo bug 27901):
 *   R600:   tmp * 2*PI - PI                  -> [-PI, PI)
 *   others: tmp * 1.0 - 0.5                  -> [-0.5, 0.5)
 */
static int tgsi_setup_trig(struct r600_shader_ctx *ctx)
{
	int r;
	struct r600_bytecode_alu alu;

	/* tmp.x = src * (0.5 / PI) + 0.5 */
	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
	alu.op = ALU_OP3_MULADD;
	alu.is_op3 = 1;

	alu.dst.chan = 0;
	alu.dst.sel = ctx->temp_reg;
	alu.dst.write = 1;

	r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);

	alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
	alu.src[1].chan = 0;
	alu.src[1].value = u_bitcast_f2u(0.5f * M_1_PI);
	alu.src[2].sel = V_SQ_ALU_SRC_0_5;
	alu.src[2].chan = 0;
	alu.last = 1;
	r = r600_bytecode_add_alu(ctx->bc, &alu);
	if (r)
		return r;

	/* tmp.x = fract(tmp.x) */
	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
	alu.op = ALU_OP1_FRACT;

	alu.dst.chan = 0;
	alu.dst.sel = ctx->temp_reg;
	alu.dst.write = 1;

	alu.src[0].sel = ctx->temp_reg;
	alu.src[0].chan = 0;
	alu.last = 1;
	r = r600_bytecode_add_alu(ctx->bc, &alu);
	if (r)
		return r;

	/* tmp.x = tmp.x * src1 + src2, constants selected per gfx level */
	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
	alu.op = ALU_OP3_MULADD;
	alu.is_op3 = 1;

	alu.dst.chan = 0;
	alu.dst.sel = ctx->temp_reg;
	alu.dst.write = 1;

	alu.src[0].sel = ctx->temp_reg;
	alu.src[0].chan = 0;

	alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
	alu.src[1].chan = 0;
	alu.src[2].sel = V_SQ_ALU_SRC_LITERAL;
	alu.src[2].chan = 0;

	if (ctx->bc->gfx_level == R600) {
		/* r600 wants radians in [-PI, PI): tmp * 2*PI - PI */
		alu.src[1].value = u_bitcast_f2u(2.0f * M_PI);
		alu.src[2].value = u_bitcast_f2u(-M_PI);
	} else {
		/* r700+ want the normalized angle centered on 0: tmp - 0.5 */
		alu.src[1].sel = V_SQ_ALU_SRC_1;
		alu.src[2].sel = V_SQ_ALU_SRC_0_5;
		alu.src[2].neg = 1;
	}

	alu.last = 1;
	r = r600_bytecode_add_alu(ctx->bc, &alu);
	if (r)
		return r;
	return 0;
}
5404
5405static int cayman_trig(struct r600_shader_ctx *ctx)
5406{
5407	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
5408	struct r600_bytecode_alu alu;
5409	int last_slot = (inst->Dst[0].Register.WriteMask & 0x8) ? 4 : 3;
5410	int i, r;
5411
5412	r = tgsi_setup_trig(ctx);
5413	if (r)
5414		return r;
5415
5416
5417	for (i = 0; i < last_slot; i++) {
5418		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5419		alu.op = ctx->inst_info->op;
5420		alu.dst.chan = i;
5421
5422		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
5423		alu.dst.write = (inst->Dst[0].Register.WriteMask >> i) & 1;
5424
5425		alu.src[0].sel = ctx->temp_reg;
5426		alu.src[0].chan = 0;
5427		if (i == last_slot - 1)
5428			alu.last = 1;
5429		r = r600_bytecode_add_alu(ctx->bc, &alu);
5430		if (r)
5431			return r;
5432	}
5433	return 0;
5434}
5435
5436static int tgsi_trig(struct r600_shader_ctx *ctx)
5437{
5438	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
5439	struct r600_bytecode_alu alu;
5440	int i, r;
5441	int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
5442
5443	r = tgsi_setup_trig(ctx);
5444	if (r)
5445		return r;
5446
5447	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5448	alu.op = ctx->inst_info->op;
5449	alu.dst.chan = 0;
5450	alu.dst.sel = ctx->temp_reg;
5451	alu.dst.write = 1;
5452
5453	alu.src[0].sel = ctx->temp_reg;
5454	alu.src[0].chan = 0;
5455	alu.last = 1;
5456	r = r600_bytecode_add_alu(ctx->bc, &alu);
5457	if (r)
5458		return r;
5459
5460	/* replicate result */
5461	for (i = 0; i < lasti + 1; i++) {
5462		if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
5463			continue;
5464
5465		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5466		alu.op = ALU_OP1_MOV;
5467
5468		alu.src[0].sel = ctx->temp_reg;
5469		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
5470		if (i == lasti)
5471			alu.last = 1;
5472		r = r600_bytecode_add_alu(ctx->bc, &alu);
5473		if (r)
5474			return r;
5475	}
5476	return 0;
5477}
5478
5479static int tgsi_kill(struct r600_shader_ctx *ctx)
5480{
5481	const struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
5482	struct r600_bytecode_alu alu;
5483	int i, r;
5484
5485	for (i = 0; i < 4; i++) {
5486		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5487		alu.op = ctx->inst_info->op;
5488
5489		alu.dst.chan = i;
5490
5491		alu.src[0].sel = V_SQ_ALU_SRC_0;
5492
5493		if (inst->Instruction.Opcode == TGSI_OPCODE_KILL) {
5494			alu.src[1].sel = V_SQ_ALU_SRC_1;
5495			alu.src[1].neg = 1;
5496		} else {
5497			r600_bytecode_src(&alu.src[1], &ctx->src[0], i);
5498		}
5499		if (i == 3) {
5500			alu.last = 1;
5501		}
5502		r = r600_bytecode_add_alu(ctx->bc, &alu);
5503		if (r)
5504			return r;
5505	}
5506
5507	/* kill must be last in ALU */
5508	ctx->bc->force_add_cf = 1;
5509	ctx->shader->uses_kill = TRUE;
5510	return 0;
5511}
5512
/* LIT: lighting coefficients.
 *   dst.x = 1.0
 *   dst.y = max(src.x, 0.0)
 *   dst.z = exp(MUL_LIT(log_clamped(max(src.y, 0.0)), src.w, src.x))
 *   dst.w = 1.0
 * The z term is only emitted when channel z is in the write mask. */
static int tgsi_lit(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	struct r600_bytecode_alu alu;
	int r;

	/* tmp.x = max(src.y, 0.0) */
	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
	alu.op = ALU_OP2_MAX;
	r600_bytecode_src(&alu.src[0], &ctx->src[0], 1);
	alu.src[1].sel  = V_SQ_ALU_SRC_0; /*0.0*/
	alu.src[1].chan = 1;

	alu.dst.sel = ctx->temp_reg;
	alu.dst.chan = 0;
	alu.dst.write = 1;

	alu.last = 1;
	r = r600_bytecode_add_alu(ctx->bc, &alu);
	if (r)
		return r;

	if (inst->Dst[0].Register.WriteMask & (1 << 2))
	{
		int chan;
		int sel;
		unsigned i;

		if (ctx->bc->gfx_level == CAYMAN) {
			/* Cayman has no trans slot: issue the scalar op in
			 * three vector slots, keeping only the last result. */
			for (i = 0; i < 3; i++) {
				/* tmp.z = log(tmp.x) */
				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
				alu.op = ALU_OP1_LOG_CLAMPED;
				alu.src[0].sel = ctx->temp_reg;
				alu.src[0].chan = 0;
				alu.dst.sel = ctx->temp_reg;
				alu.dst.chan = i;
				if (i == 2) {
					alu.dst.write = 1;
					alu.last = 1;
				} else
					alu.dst.write = 0;

				r = r600_bytecode_add_alu(ctx->bc, &alu);
				if (r)
					return r;
			}
		} else {
			/* tmp.z = log(tmp.x) */
			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
			alu.op = ALU_OP1_LOG_CLAMPED;
			alu.src[0].sel = ctx->temp_reg;
			alu.src[0].chan = 0;
			alu.dst.sel = ctx->temp_reg;
			alu.dst.chan = 2;
			alu.dst.write = 1;
			alu.last = 1;
			r = r600_bytecode_add_alu(ctx->bc, &alu);
			if (r)
				return r;
		}

		/* remember where the log result went (chan 2 on both paths) */
		chan = alu.dst.chan;
		sel = alu.dst.sel;

		/* tmp.x = amd MUL_LIT(tmp.z, src.w, src.x ) */
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP3_MUL_LIT;
		alu.src[0].sel  = sel;
		alu.src[0].chan = chan;
		r600_bytecode_src(&alu.src[1], &ctx->src[0], 3);
		r600_bytecode_src(&alu.src[2], &ctx->src[0], 0);
		alu.dst.sel = ctx->temp_reg;
		alu.dst.chan = 0;
		alu.dst.write = 1;
		alu.is_op3 = 1;
		alu.last = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;

		if (ctx->bc->gfx_level == CAYMAN) {
			/* replicate over three slots, same as the log above */
			for (i = 0; i < 3; i++) {
				/* dst.z = exp(tmp.x) */
				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
				alu.op = ALU_OP1_EXP_IEEE;
				alu.src[0].sel = ctx->temp_reg;
				alu.src[0].chan = 0;
				tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
				if (i == 2) {
					alu.dst.write = 1;
					alu.last = 1;
				} else
					alu.dst.write = 0;
				r = r600_bytecode_add_alu(ctx->bc, &alu);
				if (r)
					return r;
			}
		} else {
			/* dst.z = exp(tmp.x) */
			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
			alu.op = ALU_OP1_EXP_IEEE;
			alu.src[0].sel = ctx->temp_reg;
			alu.src[0].chan = 0;
			tgsi_dst(ctx, &inst->Dst[0], 2, &alu.dst);
			alu.last = 1;
			r = r600_bytecode_add_alu(ctx->bc, &alu);
			if (r)
				return r;
		}
	}

	/* dst.x, <- 1.0  */
	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
	alu.op = ALU_OP1_MOV;
	alu.src[0].sel  = V_SQ_ALU_SRC_1; /*1.0*/
	alu.src[0].chan = 0;
	tgsi_dst(ctx, &inst->Dst[0], 0, &alu.dst);
	alu.dst.write = (inst->Dst[0].Register.WriteMask >> 0) & 1;
	r = r600_bytecode_add_alu(ctx->bc, &alu);
	if (r)
		return r;

	/* dst.y = max(src.x, 0.0) */
	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
	alu.op = ALU_OP2_MAX;
	r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
	alu.src[1].sel  = V_SQ_ALU_SRC_0; /*0.0*/
	alu.src[1].chan = 0;
	tgsi_dst(ctx, &inst->Dst[0], 1, &alu.dst);
	alu.dst.write = (inst->Dst[0].Register.WriteMask >> 1) & 1;
	r = r600_bytecode_add_alu(ctx->bc, &alu);
	if (r)
		return r;

	/* dst.w, <- 1.0  */
	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
	alu.op = ALU_OP1_MOV;
	alu.src[0].sel  = V_SQ_ALU_SRC_1;
	alu.src[0].chan = 0;
	tgsi_dst(ctx, &inst->Dst[0], 3, &alu.dst);
	alu.dst.write = (inst->Dst[0].Register.WriteMask >> 3) & 1;
	alu.last = 1;
	r = r600_bytecode_add_alu(ctx->bc, &alu);
	if (r)
		return r;

	return 0;
}
5662
5663static int tgsi_rsq(struct r600_shader_ctx *ctx)
5664{
5665	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
5666	struct r600_bytecode_alu alu;
5667	int i, r;
5668
5669	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5670
5671	alu.op = ALU_OP1_RECIPSQRT_IEEE;
5672
5673	for (i = 0; i < inst->Instruction.NumSrcRegs; i++) {
5674		r600_bytecode_src(&alu.src[i], &ctx->src[i], 0);
5675		r600_bytecode_src_set_abs(&alu.src[i]);
5676	}
5677	alu.dst.sel = ctx->temp_reg;
5678	alu.dst.write = 1;
5679	alu.last = 1;
5680	r = r600_bytecode_add_alu(ctx->bc, &alu);
5681	if (r)
5682		return r;
5683	/* replicate result */
5684	return tgsi_helper_tempx_replicate(ctx);
5685}
5686
5687static int tgsi_helper_tempx_replicate(struct r600_shader_ctx *ctx)
5688{
5689	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
5690	struct r600_bytecode_alu alu;
5691	int i, r;
5692
5693	for (i = 0; i < 4; i++) {
5694		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5695		alu.src[0].sel = ctx->temp_reg;
5696		alu.op = ALU_OP1_MOV;
5697		alu.dst.chan = i;
5698		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
5699		alu.dst.write = (inst->Dst[0].Register.WriteMask >> i) & 1;
5700		if (i == 3)
5701			alu.last = 1;
5702		r = r600_bytecode_add_alu(ctx->bc, &alu);
5703		if (r)
5704			return r;
5705	}
5706	return 0;
5707}
5708
5709static int tgsi_trans_srcx_replicate(struct r600_shader_ctx *ctx)
5710{
5711	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
5712	struct r600_bytecode_alu alu;
5713	int i, r;
5714
5715	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5716	alu.op = ctx->inst_info->op;
5717	for (i = 0; i < inst->Instruction.NumSrcRegs; i++) {
5718		r600_bytecode_src(&alu.src[i], &ctx->src[i], 0);
5719	}
5720	alu.dst.sel = ctx->temp_reg;
5721	alu.dst.write = 1;
5722	alu.last = 1;
5723	r = r600_bytecode_add_alu(ctx->bc, &alu);
5724	if (r)
5725		return r;
5726	/* replicate result */
5727	return tgsi_helper_tempx_replicate(ctx);
5728}
5729
5730static int cayman_pow(struct r600_shader_ctx *ctx)
5731{
5732	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
5733	int i, r;
5734	struct r600_bytecode_alu alu;
5735	int last_slot = (inst->Dst[0].Register.WriteMask & 0x8) ? 4 : 3;
5736
5737	for (i = 0; i < 3; i++) {
5738		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5739		alu.op = ALU_OP1_LOG_IEEE;
5740		r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
5741		alu.dst.sel = ctx->temp_reg;
5742		alu.dst.chan = i;
5743		alu.dst.write = 1;
5744		if (i == 2)
5745			alu.last = 1;
5746		r = r600_bytecode_add_alu(ctx->bc, &alu);
5747		if (r)
5748			return r;
5749	}
5750
5751	/* b * LOG2(a) */
5752	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5753	alu.op = ALU_OP2_MUL;
5754	r600_bytecode_src(&alu.src[0], &ctx->src[1], 0);
5755	alu.src[1].sel = ctx->temp_reg;
5756	alu.dst.sel = ctx->temp_reg;
5757	alu.dst.write = 1;
5758	alu.last = 1;
5759	r = r600_bytecode_add_alu(ctx->bc, &alu);
5760	if (r)
5761		return r;
5762
5763	for (i = 0; i < last_slot; i++) {
5764		/* POW(a,b) = EXP2(b * LOG2(a))*/
5765		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5766		alu.op = ALU_OP1_EXP_IEEE;
5767		alu.src[0].sel = ctx->temp_reg;
5768
5769		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
5770		alu.dst.write = (inst->Dst[0].Register.WriteMask >> i) & 1;
5771		if (i == last_slot - 1)
5772			alu.last = 1;
5773		r = r600_bytecode_add_alu(ctx->bc, &alu);
5774		if (r)
5775			return r;
5776	}
5777	return 0;
5778}
5779
5780static int tgsi_pow(struct r600_shader_ctx *ctx)
5781{
5782	struct r600_bytecode_alu alu;
5783	int r;
5784
5785	/* LOG2(a) */
5786	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5787	alu.op = ALU_OP1_LOG_IEEE;
5788	r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
5789	alu.dst.sel = ctx->temp_reg;
5790	alu.dst.write = 1;
5791	alu.last = 1;
5792	r = r600_bytecode_add_alu(ctx->bc, &alu);
5793	if (r)
5794		return r;
5795	/* b * LOG2(a) */
5796	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5797	alu.op = ALU_OP2_MUL;
5798	r600_bytecode_src(&alu.src[0], &ctx->src[1], 0);
5799	alu.src[1].sel = ctx->temp_reg;
5800	alu.dst.sel = ctx->temp_reg;
5801	alu.dst.write = 1;
5802	alu.last = 1;
5803	r = r600_bytecode_add_alu(ctx->bc, &alu);
5804	if (r)
5805		return r;
5806	/* POW(a,b) = EXP2(b * LOG2(a))*/
5807	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5808	alu.op = ALU_OP1_EXP_IEEE;
5809	alu.src[0].sel = ctx->temp_reg;
5810	alu.dst.sel = ctx->temp_reg;
5811	alu.dst.write = 1;
5812	alu.last = 1;
5813	r = r600_bytecode_add_alu(ctx->bc, &alu);
5814	if (r)
5815		return r;
5816	return tgsi_helper_tempx_replicate(ctx);
5817}
5818
5819static int emit_mul_int_op(struct r600_bytecode *bc,
5820			   struct r600_bytecode_alu *alu_src)
5821{
5822	struct r600_bytecode_alu alu;
5823	int i, r;
5824	alu = *alu_src;
5825	if (bc->gfx_level == CAYMAN) {
5826		for (i = 0; i < 4; i++) {
5827			alu.dst.chan = i;
5828			alu.dst.write = (i == alu_src->dst.chan);
5829			alu.last = (i == 3);
5830
5831			r = r600_bytecode_add_alu(bc, &alu);
5832			if (r)
5833				return r;
5834		}
5835	} else {
5836		alu.last = 1;
5837		r = r600_bytecode_add_alu(bc, &alu);
5838		if (r)
5839			return r;
5840	}
5841	return 0;
5842}
5843
5844static int tgsi_divmod(struct r600_shader_ctx *ctx, int mod, int signed_op)
5845{
5846	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
5847	struct r600_bytecode_alu alu;
5848	int i, r, j;
5849	unsigned write_mask = inst->Dst[0].Register.WriteMask;
5850	int lasti = tgsi_last_instruction(write_mask);
5851	int tmp0 = ctx->temp_reg;
5852	int tmp1 = r600_get_temp(ctx);
5853	int tmp2 = r600_get_temp(ctx);
5854	int tmp3 = r600_get_temp(ctx);
5855	int tmp4 = 0;
5856
5857	/* Use additional temp if dst register and src register are the same */
5858	if (inst->Src[0].Register.Index == inst->Dst[0].Register.Index ||
5859	    inst->Src[1].Register.Index == inst->Dst[0].Register.Index) {
5860		tmp4 = r600_get_temp(ctx);
5861	}
5862
5863	/* Unsigned path:
5864	 *
5865	 * we need to represent src1 as src2*q + r, where q - quotient, r - remainder
5866	 *
5867	 * 1. tmp0.x = rcp (src2)     = 2^32/src2 + e, where e is rounding error
5868	 * 2. tmp0.z = lo (tmp0.x * src2)
5869	 * 3. tmp0.w = -tmp0.z
5870	 * 4. tmp0.y = hi (tmp0.x * src2)
5871	 * 5. tmp0.z = (tmp0.y == 0 ? tmp0.w : tmp0.z)      = abs(lo(rcp*src2))
5872	 * 6. tmp0.w = hi (tmp0.z * tmp0.x)    = e, rounding error
5873	 * 7. tmp1.x = tmp0.x - tmp0.w
5874	 * 8. tmp1.y = tmp0.x + tmp0.w
5875	 * 9. tmp0.x = (tmp0.y == 0 ? tmp1.y : tmp1.x)
5876	 * 10. tmp0.z = hi(tmp0.x * src1)     = q
5877	 * 11. tmp0.y = lo (tmp0.z * src2)     = src2*q = src1 - r
5878	 *
5879	 * 12. tmp0.w = src1 - tmp0.y       = r
5880	 * 13. tmp1.x = tmp0.w >= src2		= r >= src2 (uint comparison)
5881	 * 14. tmp1.y = src1 >= tmp0.y      = r >= 0 (uint comparison)
5882	 *
5883	 * if DIV
5884	 *
5885	 *   15. tmp1.z = tmp0.z + 1			= q + 1
5886	 *   16. tmp1.w = tmp0.z - 1			= q - 1
5887	 *
5888	 * else MOD
5889	 *
5890	 *   15. tmp1.z = tmp0.w - src2			= r - src2
5891	 *   16. tmp1.w = tmp0.w + src2			= r + src2
5892	 *
5893	 * endif
5894	 *
5895	 * 17. tmp1.x = tmp1.x & tmp1.y
5896	 *
5897	 * DIV: 18. tmp0.z = tmp1.x==0 ? tmp0.z : tmp1.z
5898	 * MOD: 18. tmp0.z = tmp1.x==0 ? tmp0.w : tmp1.z
5899	 *
5900	 * 19. tmp0.z = tmp1.y==0 ? tmp1.w : tmp0.z
5901	 * 20. dst = src2==0 ? MAX_UINT : tmp0.z
5902	 *
5903	 * Signed path:
5904	 *
5905	 * Same as unsigned, using abs values of the operands,
5906	 * and fixing the sign of the result in the end.
5907	 */
5908
5909	for (i = 0; i < 4; i++) {
5910		if (!(write_mask & (1<<i)))
5911			continue;
5912
5913		if (signed_op) {
5914
5915			/* tmp2.x = -src0 */
5916			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5917			alu.op = ALU_OP2_SUB_INT;
5918
5919			alu.dst.sel = tmp2;
5920			alu.dst.chan = 0;
5921			alu.dst.write = 1;
5922
5923			alu.src[0].sel = V_SQ_ALU_SRC_0;
5924
5925			r600_bytecode_src(&alu.src[1], &ctx->src[0], i);
5926
5927			alu.last = 1;
5928			if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
5929				return r;
5930
5931			/* tmp2.y = -src1 */
5932			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5933			alu.op = ALU_OP2_SUB_INT;
5934
5935			alu.dst.sel = tmp2;
5936			alu.dst.chan = 1;
5937			alu.dst.write = 1;
5938
5939			alu.src[0].sel = V_SQ_ALU_SRC_0;
5940
5941			r600_bytecode_src(&alu.src[1], &ctx->src[1], i);
5942
5943			alu.last = 1;
5944			if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
5945				return r;
5946
5947			/* tmp2.z sign bit is set if src0 and src2 signs are different */
5948			/* it will be a sign of the quotient */
5949			if (!mod) {
5950
5951				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5952				alu.op = ALU_OP2_XOR_INT;
5953
5954				alu.dst.sel = tmp2;
5955				alu.dst.chan = 2;
5956				alu.dst.write = 1;
5957
5958				r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
5959				r600_bytecode_src(&alu.src[1], &ctx->src[1], i);
5960
5961				alu.last = 1;
5962				if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
5963					return r;
5964			}
5965
5966			/* tmp2.x = |src0| */
5967			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5968			alu.op = ALU_OP3_CNDGE_INT;
5969			alu.is_op3 = 1;
5970
5971			alu.dst.sel = tmp2;
5972			alu.dst.chan = 0;
5973			alu.dst.write = 1;
5974
5975			r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
5976			r600_bytecode_src(&alu.src[1], &ctx->src[0], i);
5977			alu.src[2].sel = tmp2;
5978			alu.src[2].chan = 0;
5979
5980			alu.last = 1;
5981			if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
5982				return r;
5983
5984			/* tmp2.y = |src1| */
5985			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5986			alu.op = ALU_OP3_CNDGE_INT;
5987			alu.is_op3 = 1;
5988
5989			alu.dst.sel = tmp2;
5990			alu.dst.chan = 1;
5991			alu.dst.write = 1;
5992
5993			r600_bytecode_src(&alu.src[0], &ctx->src[1], i);
5994			r600_bytecode_src(&alu.src[1], &ctx->src[1], i);
5995			alu.src[2].sel = tmp2;
5996			alu.src[2].chan = 1;
5997
5998			alu.last = 1;
5999			if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
6000				return r;
6001
6002		}
6003
6004		/* 1. tmp0.x = rcp_u (src2)     = 2^32/src2 + e, where e is rounding error */
6005		if (ctx->bc->gfx_level == CAYMAN) {
6006			/* tmp3.x = u2f(src2) */
6007			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6008			alu.op = ALU_OP1_UINT_TO_FLT;
6009
6010			alu.dst.sel = tmp3;
6011			alu.dst.chan = 0;
6012			alu.dst.write = 1;
6013
6014			if (signed_op) {
6015				alu.src[0].sel = tmp2;
6016				alu.src[0].chan = 1;
6017			} else {
6018				r600_bytecode_src(&alu.src[0], &ctx->src[1], i);
6019			}
6020
6021			alu.last = 1;
6022			if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
6023				return r;
6024
6025			/* tmp0.x = recip(tmp3.x) */
6026			for (j = 0 ; j < 3; j++) {
6027				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6028				alu.op = ALU_OP1_RECIP_IEEE;
6029
6030				alu.dst.sel = tmp0;
6031				alu.dst.chan = j;
6032				alu.dst.write = (j == 0);
6033
6034				alu.src[0].sel = tmp3;
6035				alu.src[0].chan = 0;
6036
6037				if (j == 2)
6038					alu.last = 1;
6039				if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
6040					return r;
6041			}
6042
6043			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6044			alu.op = ALU_OP2_MUL;
6045
6046			alu.src[0].sel = tmp0;
6047			alu.src[0].chan = 0;
6048
6049			alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
6050			alu.src[1].value = 0x4f800000;
6051
6052			alu.dst.sel = tmp3;
6053			alu.dst.write = 1;
6054			alu.last = 1;
6055			r = r600_bytecode_add_alu(ctx->bc, &alu);
6056			if (r)
6057				return r;
6058
6059			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6060			alu.op = ALU_OP1_FLT_TO_UINT;
6061
6062			alu.dst.sel = tmp0;
6063			alu.dst.chan = 0;
6064			alu.dst.write = 1;
6065
6066			alu.src[0].sel = tmp3;
6067			alu.src[0].chan = 0;
6068
6069			alu.last = 1;
6070			if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
6071				return r;
6072
6073		} else {
6074			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6075			alu.op = ALU_OP1_RECIP_UINT;
6076
6077			alu.dst.sel = tmp0;
6078			alu.dst.chan = 0;
6079			alu.dst.write = 1;
6080
6081			if (signed_op) {
6082				alu.src[0].sel = tmp2;
6083				alu.src[0].chan = 1;
6084			} else {
6085				r600_bytecode_src(&alu.src[0], &ctx->src[1], i);
6086			}
6087
6088			alu.last = 1;
6089			if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
6090				return r;
6091		}
6092
6093		/* 2. tmp0.z = lo (tmp0.x * src2) */
6094		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6095		alu.op = ALU_OP2_MULLO_UINT;
6096
6097		alu.dst.sel = tmp0;
6098		alu.dst.chan = 2;
6099		alu.dst.write = 1;
6100
6101		alu.src[0].sel = tmp0;
6102		alu.src[0].chan = 0;
6103		if (signed_op) {
6104			alu.src[1].sel = tmp2;
6105			alu.src[1].chan = 1;
6106		} else {
6107			r600_bytecode_src(&alu.src[1], &ctx->src[1], i);
6108		}
6109
6110		if ((r = emit_mul_int_op(ctx->bc, &alu)))
6111			return r;
6112
6113		/* 3. tmp0.w = -tmp0.z */
6114		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6115		alu.op = ALU_OP2_SUB_INT;
6116
6117		alu.dst.sel = tmp0;
6118		alu.dst.chan = 3;
6119		alu.dst.write = 1;
6120
6121		alu.src[0].sel = V_SQ_ALU_SRC_0;
6122		alu.src[1].sel = tmp0;
6123		alu.src[1].chan = 2;
6124
6125		alu.last = 1;
6126		if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
6127			return r;
6128
6129		/* 4. tmp0.y = hi (tmp0.x * src2) */
6130		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6131		alu.op = ALU_OP2_MULHI_UINT;
6132
6133		alu.dst.sel = tmp0;
6134		alu.dst.chan = 1;
6135		alu.dst.write = 1;
6136
6137		alu.src[0].sel = tmp0;
6138		alu.src[0].chan = 0;
6139
6140		if (signed_op) {
6141			alu.src[1].sel = tmp2;
6142			alu.src[1].chan = 1;
6143		} else {
6144			r600_bytecode_src(&alu.src[1], &ctx->src[1], i);
6145		}
6146
6147		if ((r = emit_mul_int_op(ctx->bc, &alu)))
6148			return r;
6149
6150		/* 5. tmp0.z = (tmp0.y == 0 ? tmp0.w : tmp0.z)      = abs(lo(rcp*src)) */
6151		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6152		alu.op = ALU_OP3_CNDE_INT;
6153		alu.is_op3 = 1;
6154
6155		alu.dst.sel = tmp0;
6156		alu.dst.chan = 2;
6157		alu.dst.write = 1;
6158
6159		alu.src[0].sel = tmp0;
6160		alu.src[0].chan = 1;
6161		alu.src[1].sel = tmp0;
6162		alu.src[1].chan = 3;
6163		alu.src[2].sel = tmp0;
6164		alu.src[2].chan = 2;
6165
6166		alu.last = 1;
6167		if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
6168			return r;
6169
6170		/* 6. tmp0.w = hi (tmp0.z * tmp0.x)    = e, rounding error */
6171		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6172		alu.op = ALU_OP2_MULHI_UINT;
6173
6174		alu.dst.sel = tmp0;
6175		alu.dst.chan = 3;
6176		alu.dst.write = 1;
6177
6178		alu.src[0].sel = tmp0;
6179		alu.src[0].chan = 2;
6180
6181		alu.src[1].sel = tmp0;
6182		alu.src[1].chan = 0;
6183
6184		if ((r = emit_mul_int_op(ctx->bc, &alu)))
6185				return r;
6186
6187		/* 7. tmp1.x = tmp0.x - tmp0.w */
6188		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6189		alu.op = ALU_OP2_SUB_INT;
6190
6191		alu.dst.sel = tmp1;
6192		alu.dst.chan = 0;
6193		alu.dst.write = 1;
6194
6195		alu.src[0].sel = tmp0;
6196		alu.src[0].chan = 0;
6197		alu.src[1].sel = tmp0;
6198		alu.src[1].chan = 3;
6199
6200		alu.last = 1;
6201		if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
6202			return r;
6203
6204		/* 8. tmp1.y = tmp0.x + tmp0.w */
6205		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6206		alu.op = ALU_OP2_ADD_INT;
6207
6208		alu.dst.sel = tmp1;
6209		alu.dst.chan = 1;
6210		alu.dst.write = 1;
6211
6212		alu.src[0].sel = tmp0;
6213		alu.src[0].chan = 0;
6214		alu.src[1].sel = tmp0;
6215		alu.src[1].chan = 3;
6216
6217		alu.last = 1;
6218		if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
6219			return r;
6220
6221		/* 9. tmp0.x = (tmp0.y == 0 ? tmp1.y : tmp1.x) */
6222		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6223		alu.op = ALU_OP3_CNDE_INT;
6224		alu.is_op3 = 1;
6225
6226		alu.dst.sel = tmp0;
6227		alu.dst.chan = 0;
6228		alu.dst.write = 1;
6229
6230		alu.src[0].sel = tmp0;
6231		alu.src[0].chan = 1;
6232		alu.src[1].sel = tmp1;
6233		alu.src[1].chan = 1;
6234		alu.src[2].sel = tmp1;
6235		alu.src[2].chan = 0;
6236
6237		alu.last = 1;
6238		if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
6239			return r;
6240
6241		/* 10. tmp0.z = hi(tmp0.x * src1)     = q */
6242		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6243		alu.op = ALU_OP2_MULHI_UINT;
6244
6245		alu.dst.sel = tmp0;
6246		alu.dst.chan = 2;
6247		alu.dst.write = 1;
6248
6249		alu.src[0].sel = tmp0;
6250		alu.src[0].chan = 0;
6251
6252		if (signed_op) {
6253			alu.src[1].sel = tmp2;
6254			alu.src[1].chan = 0;
6255		} else {
6256			r600_bytecode_src(&alu.src[1], &ctx->src[0], i);
6257		}
6258
6259		if ((r = emit_mul_int_op(ctx->bc, &alu)))
6260			return r;
6261
6262		/* 11. tmp0.y = lo (src2 * tmp0.z)     = src2*q = src1 - r */
6263		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6264		alu.op = ALU_OP2_MULLO_UINT;
6265
6266		alu.dst.sel = tmp0;
6267		alu.dst.chan = 1;
6268		alu.dst.write = 1;
6269
6270		if (signed_op) {
6271			alu.src[0].sel = tmp2;
6272			alu.src[0].chan = 1;
6273		} else {
6274			r600_bytecode_src(&alu.src[0], &ctx->src[1], i);
6275		}
6276
6277		alu.src[1].sel = tmp0;
6278		alu.src[1].chan = 2;
6279
6280		if ((r = emit_mul_int_op(ctx->bc, &alu)))
6281			return r;
6282
6283		/* 12. tmp0.w = src1 - tmp0.y       = r */
6284		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6285		alu.op = ALU_OP2_SUB_INT;
6286
6287		alu.dst.sel = tmp0;
6288		alu.dst.chan = 3;
6289		alu.dst.write = 1;
6290
6291		if (signed_op) {
6292			alu.src[0].sel = tmp2;
6293			alu.src[0].chan = 0;
6294		} else {
6295			r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
6296		}
6297
6298		alu.src[1].sel = tmp0;
6299		alu.src[1].chan = 1;
6300
6301		alu.last = 1;
6302		if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
6303			return r;
6304
6305		/* 13. tmp1.x = tmp0.w >= src2		= r >= src2 */
6306		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6307		alu.op = ALU_OP2_SETGE_UINT;
6308
6309		alu.dst.sel = tmp1;
6310		alu.dst.chan = 0;
6311		alu.dst.write = 1;
6312
6313		alu.src[0].sel = tmp0;
6314		alu.src[0].chan = 3;
6315		if (signed_op) {
6316			alu.src[1].sel = tmp2;
6317			alu.src[1].chan = 1;
6318		} else {
6319			r600_bytecode_src(&alu.src[1], &ctx->src[1], i);
6320		}
6321
6322		alu.last = 1;
6323		if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
6324			return r;
6325
6326		/* 14. tmp1.y = src1 >= tmp0.y       = r >= 0 */
6327		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6328		alu.op = ALU_OP2_SETGE_UINT;
6329
6330		alu.dst.sel = tmp1;
6331		alu.dst.chan = 1;
6332		alu.dst.write = 1;
6333
6334		if (signed_op) {
6335			alu.src[0].sel = tmp2;
6336			alu.src[0].chan = 0;
6337		} else {
6338			r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
6339		}
6340
6341		alu.src[1].sel = tmp0;
6342		alu.src[1].chan = 1;
6343
6344		alu.last = 1;
6345		if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
6346			return r;
6347
6348		if (mod) { /* UMOD */
6349
6350			/* 15. tmp1.z = tmp0.w - src2			= r - src2 */
6351			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6352			alu.op = ALU_OP2_SUB_INT;
6353
6354			alu.dst.sel = tmp1;
6355			alu.dst.chan = 2;
6356			alu.dst.write = 1;
6357
6358			alu.src[0].sel = tmp0;
6359			alu.src[0].chan = 3;
6360
6361			if (signed_op) {
6362				alu.src[1].sel = tmp2;
6363				alu.src[1].chan = 1;
6364			} else {
6365				r600_bytecode_src(&alu.src[1], &ctx->src[1], i);
6366			}
6367
6368			alu.last = 1;
6369			if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
6370				return r;
6371
6372			/* 16. tmp1.w = tmp0.w + src2			= r + src2 */
6373			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6374			alu.op = ALU_OP2_ADD_INT;
6375
6376			alu.dst.sel = tmp1;
6377			alu.dst.chan = 3;
6378			alu.dst.write = 1;
6379
6380			alu.src[0].sel = tmp0;
6381			alu.src[0].chan = 3;
6382			if (signed_op) {
6383				alu.src[1].sel = tmp2;
6384				alu.src[1].chan = 1;
6385			} else {
6386				r600_bytecode_src(&alu.src[1], &ctx->src[1], i);
6387			}
6388
6389			alu.last = 1;
6390			if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
6391				return r;
6392
6393		} else { /* UDIV */
6394
6395			/* 15. tmp1.z = tmp0.z + 1       = q + 1       DIV */
6396			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6397			alu.op = ALU_OP2_ADD_INT;
6398
6399			alu.dst.sel = tmp1;
6400			alu.dst.chan = 2;
6401			alu.dst.write = 1;
6402
6403			alu.src[0].sel = tmp0;
6404			alu.src[0].chan = 2;
6405			alu.src[1].sel = V_SQ_ALU_SRC_1_INT;
6406
6407			alu.last = 1;
6408			if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
6409				return r;
6410
6411			/* 16. tmp1.w = tmp0.z - 1			= q - 1 */
6412			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6413			alu.op = ALU_OP2_ADD_INT;
6414
6415			alu.dst.sel = tmp1;
6416			alu.dst.chan = 3;
6417			alu.dst.write = 1;
6418
6419			alu.src[0].sel = tmp0;
6420			alu.src[0].chan = 2;
6421			alu.src[1].sel = V_SQ_ALU_SRC_M_1_INT;
6422
6423			alu.last = 1;
6424			if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
6425				return r;
6426
6427		}
6428
6429		/* 17. tmp1.x = tmp1.x & tmp1.y */
6430		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6431		alu.op = ALU_OP2_AND_INT;
6432
6433		alu.dst.sel = tmp1;
6434		alu.dst.chan = 0;
6435		alu.dst.write = 1;
6436
6437		alu.src[0].sel = tmp1;
6438		alu.src[0].chan = 0;
6439		alu.src[1].sel = tmp1;
6440		alu.src[1].chan = 1;
6441
6442		alu.last = 1;
6443		if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
6444			return r;
6445
6446		/* 18. tmp0.z = tmp1.x==0 ? tmp0.z : tmp1.z    DIV */
6447		/* 18. tmp0.z = tmp1.x==0 ? tmp0.w : tmp1.z    MOD */
6448		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6449		alu.op = ALU_OP3_CNDE_INT;
6450		alu.is_op3 = 1;
6451
6452		alu.dst.sel = tmp0;
6453		alu.dst.chan = 2;
6454		alu.dst.write = 1;
6455
6456		alu.src[0].sel = tmp1;
6457		alu.src[0].chan = 0;
6458		alu.src[1].sel = tmp0;
6459		alu.src[1].chan = mod ? 3 : 2;
6460		alu.src[2].sel = tmp1;
6461		alu.src[2].chan = 2;
6462
6463		alu.last = 1;
6464		if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
6465			return r;
6466
6467		/* 19. tmp0.z = tmp1.y==0 ? tmp1.w : tmp0.z */
6468		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6469		alu.op = ALU_OP3_CNDE_INT;
6470		alu.is_op3 = 1;
6471
6472		if (signed_op) {
6473			alu.dst.sel = tmp0;
6474			alu.dst.chan = 2;
6475			alu.dst.write = 1;
6476		} else {
6477			if (tmp4 > 0) {
6478				alu.dst.sel = tmp4;
6479				alu.dst.chan = i;
6480				alu.dst.write = 1;
6481			} else {
6482				tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
6483			}
6484		}
6485
6486		alu.src[0].sel = tmp1;
6487		alu.src[0].chan = 1;
6488		alu.src[1].sel = tmp1;
6489		alu.src[1].chan = 3;
6490		alu.src[2].sel = tmp0;
6491		alu.src[2].chan = 2;
6492
6493		alu.last = 1;
6494		if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
6495			return r;
6496
6497		if (signed_op) {
6498
6499			/* fix the sign of the result */
6500
6501			if (mod) {
6502
6503				/* tmp0.x = -tmp0.z */
6504				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6505				alu.op = ALU_OP2_SUB_INT;
6506
6507				alu.dst.sel = tmp0;
6508				alu.dst.chan = 0;
6509				alu.dst.write = 1;
6510
6511				alu.src[0].sel = V_SQ_ALU_SRC_0;
6512				alu.src[1].sel = tmp0;
6513				alu.src[1].chan = 2;
6514
6515				alu.last = 1;
6516				if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
6517					return r;
6518
6519				/* sign of the remainder is the same as the sign of src0 */
6520				/* tmp0.x = src0>=0 ? tmp0.z : tmp0.x */
6521				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6522				alu.op = ALU_OP3_CNDGE_INT;
6523				alu.is_op3 = 1;
6524
6525				if (tmp4 > 0) {
6526					alu.dst.sel = tmp4;
6527					alu.dst.chan = i;
6528					alu.dst.write = 1;
6529				} else {
6530					tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
6531				}
6532
6533				r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
6534				alu.src[1].sel = tmp0;
6535				alu.src[1].chan = 2;
6536				alu.src[2].sel = tmp0;
6537				alu.src[2].chan = 0;
6538
6539				alu.last = 1;
6540				if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
6541					return r;
6542
6543			} else {
6544
6545				/* tmp0.x = -tmp0.z */
6546				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6547				alu.op = ALU_OP2_SUB_INT;
6548
6549				alu.dst.sel = tmp0;
6550				alu.dst.chan = 0;
6551				alu.dst.write = 1;
6552
6553				alu.src[0].sel = V_SQ_ALU_SRC_0;
6554				alu.src[1].sel = tmp0;
6555				alu.src[1].chan = 2;
6556
6557				alu.last = 1;
6558				if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
6559					return r;
6560
6561				/* fix the quotient sign (same as the sign of src0*src1) */
6562				/* tmp0.x = tmp2.z>=0 ? tmp0.z : tmp0.x */
6563				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6564				alu.op = ALU_OP3_CNDGE_INT;
6565				alu.is_op3 = 1;
6566
6567				if (tmp4 > 0) {
6568					alu.dst.sel = tmp4;
6569					alu.dst.chan = i;
6570					alu.dst.write = 1;
6571				} else {
6572					tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
6573				}
6574
6575				alu.src[0].sel = tmp2;
6576				alu.src[0].chan = 2;
6577				alu.src[1].sel = tmp0;
6578				alu.src[1].chan = 2;
6579				alu.src[2].sel = tmp0;
6580				alu.src[2].chan = 0;
6581
6582				alu.last = 1;
6583				if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
6584					return r;
6585			}
6586		}
6587	}
6588
6589	if (tmp4 > 0) {
6590		for (i = 0; i <= lasti; ++i) {
6591			if (!(write_mask & (1<<i)))
6592				continue;
6593
6594			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6595			alu.op = ALU_OP1_MOV;
6596			tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
6597			alu.src[0].sel = tmp4;
6598			alu.src[0].chan = i;
6599
6600			if (i == lasti)
6601				alu.last = 1;
6602			if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
6603				return r;
6604		}
6605	}
6606
6607	return 0;
6608}
6609
/* TGSI UDIV: unsigned integer division (quotient only, no sign fixup). */
static int tgsi_udiv(struct r600_shader_ctx *ctx)
{
	const int mod = 0;       /* want the quotient, not the remainder */
	const int signed_op = 0; /* operands are unsigned */
	return tgsi_divmod(ctx, mod, signed_op);
}
6614
/* TGSI UMOD: unsigned integer remainder. */
static int tgsi_umod(struct r600_shader_ctx *ctx)
{
	const int mod = 1;       /* want the remainder */
	const int signed_op = 0; /* operands are unsigned */
	return tgsi_divmod(ctx, mod, signed_op);
}
6619
/* TGSI IDIV: signed integer division (quotient, with sign fixup). */
static int tgsi_idiv(struct r600_shader_ctx *ctx)
{
	const int mod = 0;       /* want the quotient */
	const int signed_op = 1; /* operands are signed: divmod adds abs()/sign steps */
	return tgsi_divmod(ctx, mod, signed_op);
}
6624
/* TGSI IMOD: signed integer remainder (sign follows src0). */
static int tgsi_imod(struct r600_shader_ctx *ctx)
{
	const int mod = 1;       /* want the remainder */
	const int signed_op = 1; /* operands are signed: divmod adds abs()/sign steps */
	return tgsi_divmod(ctx, mod, signed_op);
}
6629
6630
6631static int tgsi_f2i(struct r600_shader_ctx *ctx)
6632{
6633	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
6634	struct r600_bytecode_alu alu;
6635	int i, r;
6636	unsigned write_mask = inst->Dst[0].Register.WriteMask;
6637	int last_inst = tgsi_last_instruction(write_mask);
6638
6639	for (i = 0; i < 4; i++) {
6640		if (!(write_mask & (1<<i)))
6641			continue;
6642
6643		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6644		alu.op = ALU_OP1_TRUNC;
6645
6646		alu.dst.sel = ctx->temp_reg;
6647		alu.dst.chan = i;
6648		alu.dst.write = 1;
6649
6650		r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
6651		if (i == last_inst)
6652			alu.last = 1;
6653		r = r600_bytecode_add_alu(ctx->bc, &alu);
6654		if (r)
6655			return r;
6656	}
6657
6658	for (i = 0; i < 4; i++) {
6659		if (!(write_mask & (1<<i)))
6660			continue;
6661
6662		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6663		alu.op = ctx->inst_info->op;
6664
6665		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
6666
6667		alu.src[0].sel = ctx->temp_reg;
6668		alu.src[0].chan = i;
6669
6670		if (i == last_inst || alu.op == ALU_OP1_FLT_TO_UINT)
6671			alu.last = 1;
6672		r = r600_bytecode_add_alu(ctx->bc, &alu);
6673		if (r)
6674			return r;
6675	}
6676
6677	return 0;
6678}
6679
6680static int tgsi_iabs(struct r600_shader_ctx *ctx)
6681{
6682	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
6683	struct r600_bytecode_alu alu;
6684	int i, r;
6685	unsigned write_mask = inst->Dst[0].Register.WriteMask;
6686	int last_inst = tgsi_last_instruction(write_mask);
6687
6688	/* tmp = -src */
6689	for (i = 0; i < 4; i++) {
6690		if (!(write_mask & (1<<i)))
6691			continue;
6692
6693		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6694		alu.op = ALU_OP2_SUB_INT;
6695
6696		alu.dst.sel = ctx->temp_reg;
6697		alu.dst.chan = i;
6698		alu.dst.write = 1;
6699
6700		r600_bytecode_src(&alu.src[1], &ctx->src[0], i);
6701		alu.src[0].sel = V_SQ_ALU_SRC_0;
6702
6703		if (i == last_inst)
6704			alu.last = 1;
6705		r = r600_bytecode_add_alu(ctx->bc, &alu);
6706		if (r)
6707			return r;
6708	}
6709
6710	/* dst = (src >= 0 ? src : tmp) */
6711	for (i = 0; i < 4; i++) {
6712		if (!(write_mask & (1<<i)))
6713			continue;
6714
6715		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6716		alu.op = ALU_OP3_CNDGE_INT;
6717		alu.is_op3 = 1;
6718		alu.dst.write = 1;
6719
6720		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
6721
6722		r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
6723		r600_bytecode_src(&alu.src[1], &ctx->src[0], i);
6724		alu.src[2].sel = ctx->temp_reg;
6725		alu.src[2].chan = i;
6726
6727		if (i == last_inst)
6728			alu.last = 1;
6729		r = r600_bytecode_add_alu(ctx->bc, &alu);
6730		if (r)
6731			return r;
6732	}
6733	return 0;
6734}
6735
6736static int tgsi_issg(struct r600_shader_ctx *ctx)
6737{
6738	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
6739	struct r600_bytecode_alu alu;
6740	int i, r;
6741	unsigned write_mask = inst->Dst[0].Register.WriteMask;
6742	int last_inst = tgsi_last_instruction(write_mask);
6743
6744	/* tmp = (src >= 0 ? src : -1) */
6745	for (i = 0; i < 4; i++) {
6746		if (!(write_mask & (1<<i)))
6747			continue;
6748
6749		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6750		alu.op = ALU_OP3_CNDGE_INT;
6751		alu.is_op3 = 1;
6752
6753		alu.dst.sel = ctx->temp_reg;
6754		alu.dst.chan = i;
6755		alu.dst.write = 1;
6756
6757		r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
6758		r600_bytecode_src(&alu.src[1], &ctx->src[0], i);
6759		alu.src[2].sel = V_SQ_ALU_SRC_M_1_INT;
6760
6761		if (i == last_inst)
6762			alu.last = 1;
6763		r = r600_bytecode_add_alu(ctx->bc, &alu);
6764		if (r)
6765			return r;
6766	}
6767
6768	/* dst = (tmp > 0 ? 1 : tmp) */
6769	for (i = 0; i < 4; i++) {
6770		if (!(write_mask & (1<<i)))
6771			continue;
6772
6773		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6774		alu.op = ALU_OP3_CNDGT_INT;
6775		alu.is_op3 = 1;
6776		alu.dst.write = 1;
6777
6778		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
6779
6780		alu.src[0].sel = ctx->temp_reg;
6781		alu.src[0].chan = i;
6782
6783		alu.src[1].sel = V_SQ_ALU_SRC_1_INT;
6784
6785		alu.src[2].sel = ctx->temp_reg;
6786		alu.src[2].chan = i;
6787
6788		if (i == last_inst)
6789			alu.last = 1;
6790		r = r600_bytecode_add_alu(ctx->bc, &alu);
6791		if (r)
6792			return r;
6793	}
6794	return 0;
6795}
6796
6797
6798
6799static int tgsi_ssg(struct r600_shader_ctx *ctx)
6800{
6801	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
6802	unsigned write_mask = inst->Dst[0].Register.WriteMask;
6803	int last_inst = tgsi_last_instruction(write_mask);
6804	struct r600_bytecode_alu alu;
6805	int i, r;
6806
6807	/* tmp = (src > 0 ? 1 : src) */
6808	for (i = 0; i <= last_inst; i++) {
6809		if (!(write_mask & (1 << i)))
6810			continue;
6811		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6812		alu.op = ALU_OP3_CNDGT;
6813		alu.is_op3 = 1;
6814
6815		alu.dst.sel = ctx->temp_reg;
6816		alu.dst.chan = i;
6817
6818		r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
6819		alu.src[1].sel = V_SQ_ALU_SRC_1;
6820		r600_bytecode_src(&alu.src[2], &ctx->src[0], i);
6821
6822		if (i == last_inst)
6823			alu.last = 1;
6824		r = r600_bytecode_add_alu(ctx->bc, &alu);
6825		if (r)
6826			return r;
6827	}
6828
6829	/* dst = (-tmp > 0 ? -1 : tmp) */
6830	for (i = 0; i <= last_inst; i++) {
6831		if (!(write_mask & (1 << i)))
6832			continue;
6833		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6834		alu.op = ALU_OP3_CNDGT;
6835		alu.is_op3 = 1;
6836		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
6837
6838		alu.src[0].sel = ctx->temp_reg;
6839		alu.src[0].chan = i;
6840		alu.src[0].neg = 1;
6841
6842		alu.src[1].sel = V_SQ_ALU_SRC_1;
6843		alu.src[1].neg = 1;
6844
6845		alu.src[2].sel = ctx->temp_reg;
6846		alu.src[2].chan = i;
6847
6848		if (i == last_inst)
6849			alu.last = 1;
6850		r = r600_bytecode_add_alu(ctx->bc, &alu);
6851		if (r)
6852			return r;
6853	}
6854	return 0;
6855}
6856
/* TGSI BFI: bitfield insert.
 *   dst = bfi(base = src0, insert = src1, offset = src2, bits = src3)
 * Emitted as BFM (mask build) + LSHL (shift insert into place) + BFI,
 * followed by a CNDE that substitutes the raw insert value when
 * bits >= 32 (BFM cannot express a full 32-bit mask).
 */
static int tgsi_bfi(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	struct r600_bytecode_alu alu;
	int i, r, t1, t2;

	unsigned write_mask = inst->Dst[0].Register.WriteMask;
	int last_inst = tgsi_last_instruction(write_mask);

	t1 = r600_get_temp(ctx);

	/* temp_reg = (bits >= 32), selector for the wide-field case below */
	for (i = 0; i < 4; i++) {
		if (!(write_mask & (1<<i)))
			continue;

		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP2_SETGE_INT;
		r600_bytecode_src(&alu.src[0], &ctx->src[3], i);
		alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
		alu.src[1].value = 32;
		alu.dst.sel = ctx->temp_reg;
		alu.dst.chan = i;
		alu.dst.write = 1;
		alu.last = i == last_inst;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}

	/* t1 = BFM(bits, offset): mask of 'bits' ones shifted to 'offset' */
	for (i = 0; i < 4; i++) {
		if (!(write_mask & (1<<i)))
			continue;

		/* create mask tmp */
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP2_BFM_INT;
		alu.dst.sel = t1;
		alu.dst.chan = i;
		alu.dst.write = 1;
		alu.last = i == last_inst;

		r600_bytecode_src(&alu.src[0], &ctx->src[3], i);
		r600_bytecode_src(&alu.src[1], &ctx->src[2], i);

		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}

	t2 = r600_get_temp(ctx);

	/* t2 = insert << offset */
	for (i = 0; i < 4; i++) {
		if (!(write_mask & (1<<i)))
			continue;

		/* shift insert left */
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP2_LSHL_INT;
		alu.dst.sel = t2;
		alu.dst.chan = i;
		alu.dst.write = 1;
		alu.last = i == last_inst;

		r600_bytecode_src(&alu.src[0], &ctx->src[1], i);
		r600_bytecode_src(&alu.src[1], &ctx->src[2], i);

		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}

	/* dst = BFI(mask, shifted insert, base): (t2 & t1) | (src0 & ~t1) */
	for (i = 0; i < 4; i++) {
		if (!(write_mask & (1<<i)))
			continue;

		/* actual bitfield insert */
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP3_BFI_INT;
		alu.is_op3 = 1;
		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
		alu.dst.chan = i;
		alu.dst.write = 1;
		alu.last = i == last_inst;

		alu.src[0].sel = t1;
		alu.src[0].chan = i;
		alu.src[1].sel = t2;
		alu.src[1].chan = i;
		r600_bytecode_src(&alu.src[2], &ctx->src[0], i);

		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}

	/* dst = (bits >= 32) ? insert : dst.
	 * Note: src[1] re-reads the destination register the previous BFI
	 * just wrote, so this pass must stay ordered after it. */
	for (i = 0; i < 4; i++) {
		if (!(write_mask & (1<<i)))
			continue;
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP3_CNDE_INT;
		alu.is_op3 = 1;
		alu.src[0].sel = ctx->temp_reg;
		alu.src[0].chan = i;
		r600_bytecode_src(&alu.src[2], &ctx->src[1], i);

		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);

		alu.src[1].sel = alu.dst.sel;
		alu.src[1].chan = i;

		alu.last = i == last_inst;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}
	return 0;
}
6974
/* TGSI IMSB/UMSB: index of the most significant (sign-differing for IMSB)
 * bit. The hardware FFBH counts from the MSB while TGSI counts from the
 * LSB, so the result is remapped as 31 - ffbh; a negative FFBH result
 * (no suitable bit found) is passed through unchanged via CNDGE. */
static int tgsi_msb(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	struct r600_bytecode_alu alu;
	int i, r, t1, t2;

	unsigned write_mask = inst->Dst[0].Register.WriteMask;
	int last_inst = tgsi_last_instruction(write_mask);

	assert(ctx->inst_info->op == ALU_OP1_FFBH_INT ||
		ctx->inst_info->op == ALU_OP1_FFBH_UINT);

	t1 = ctx->temp_reg;

	/* bit position is indexed from lsb by TGSI, and from msb by the hardware */
	for (i = 0; i < 4; i++) {
		if (!(write_mask & (1<<i)))
			continue;

		/* t1 = FFBH_INT / FFBH_UINT */
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ctx->inst_info->op;
		alu.dst.sel = t1;
		alu.dst.chan = i;
		alu.dst.write = 1;
		alu.last = i == last_inst;

		r600_bytecode_src(&alu.src[0], &ctx->src[0], i);

		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}

	t2 = r600_get_temp(ctx);

	/* remap the msb-relative count to an lsb-relative bit index */
	for (i = 0; i < 4; i++) {
		if (!(write_mask & (1<<i)))
			continue;

		/* t2 = 31 - t1 */
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP2_SUB_INT;
		alu.dst.sel = t2;
		alu.dst.chan = i;
		alu.dst.write = 1;
		alu.last = i == last_inst;

		alu.src[0].sel = V_SQ_ALU_SRC_LITERAL;
		alu.src[0].value = 31;
		alu.src[1].sel = t1;
		alu.src[1].chan = i;

		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}

	/* keep the raw (negative) FFBH result when no bit was found */
	for (i = 0; i < 4; i++) {
		if (!(write_mask & (1<<i)))
			continue;

		/* result = t1 >= 0 ? t2 : t1 */
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP3_CNDGE_INT;
		alu.is_op3 = 1;
		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
		alu.dst.chan = i;
		alu.dst.write = 1;
		alu.last = i == last_inst;

		alu.src[0].sel = t1;
		alu.src[0].chan = i;
		alu.src[1].sel = t2;
		alu.src[1].chan = i;
		alu.src[2].sel = t1;
		alu.src[2].chan = i;

		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}

	return 0;
}
7060
/* TGSI INTERP_CENTROID / INTERP_OFFSET / INTERP_SAMPLE for
 * Evergreen/Cayman. Selects the right i/j interpolator pair, optionally
 * adjusts the barycentrics by offset/sample-position gradients, then
 * emits the INTERP_ZW/INTERP_XY pairs and copies the result out.
 * Returns 0 on success or the bytecode-emission error code. */
static int tgsi_interp_egcm(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	struct r600_bytecode_alu alu;
	int r, i = 0, k, interp_gpr, interp_base_chan, tmp, lasti;
	unsigned location;
	const int input = inst->Src[0].Register.Index + ctx->shader->nsys_inputs;

	assert(inst->Src[0].Register.File == TGSI_FILE_INPUT);

	/* Interpolators have been marked for use already by allocate_system_value_inputs */
	if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_OFFSET ||
		inst->Instruction.Opcode == TGSI_OPCODE_INTERP_SAMPLE) {
		location = TGSI_INTERPOLATE_LOC_CENTER; /* sample offset will be added explicitly */
	}
	else {
		location = TGSI_INTERPOLATE_LOC_CENTROID;
		ctx->shader->input[input].uses_interpolate_at_centroid = 1;
	}

	/* Map (interpolate mode, location) to the allocated i/j register pair:
	 * two ij pairs are packed per GPR. */
	k = eg_get_interpolator_index(ctx->shader->input[input].interpolate, location);
	if (k < 0)
		k = 0;
	interp_gpr = ctx->eg_interpolators[k].ij_index / 2;
	interp_base_chan = 2 * (ctx->eg_interpolators[k].ij_index % 2);

	/* NOTE: currently offset is not perspective correct */
	if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_OFFSET ||
		inst->Instruction.Opcode == TGSI_OPCODE_INTERP_SAMPLE) {
		int sample_gpr = -1;
		int gradientsH, gradientsV;
		struct r600_bytecode_tex tex;

		if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_SAMPLE) {
			sample_gpr = load_sample_position(ctx, &ctx->src[1], ctx->src[1].swizzle[0]);
		}

		/* Fetch d(ij)/dx and d(ij)/dy of the barycentrics via the
		 * texture unit's gradient instructions. */
		gradientsH = r600_get_temp(ctx);
		gradientsV = r600_get_temp(ctx);
		for (i = 0; i < 2; i++) {
			memset(&tex, 0, sizeof(struct r600_bytecode_tex));
			tex.op = i == 0 ? FETCH_OP_GET_GRADIENTS_H : FETCH_OP_GET_GRADIENTS_V;
			tex.src_gpr = interp_gpr;
			tex.src_sel_x = interp_base_chan + 0;
			tex.src_sel_y = interp_base_chan + 1;
			tex.src_sel_z = 0;
			tex.src_sel_w = 0;
			tex.dst_gpr = i == 0 ? gradientsH : gradientsV;
			tex.dst_sel_x = 0;
			tex.dst_sel_y = 1;
			tex.dst_sel_z = 7;
			tex.dst_sel_w = 7;
			tex.inst_mod = 1; // Use per pixel gradient calculation
			tex.sampler_id = 0;
			tex.resource_id = tex.sampler_id;
			r = r600_bytecode_add_tex(ctx->bc, &tex);
			if (r)
				return r;
		}

		/* temp = gradientsH * offset.x + ij
		 * (for INTERP_SAMPLE the x offset comes from the sample
		 * position table, channel 2) */
		for (i = 0; i < 2; i++) {
			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
			alu.op = ALU_OP3_MULADD;
			alu.is_op3 = 1;
			alu.src[0].sel = gradientsH;
			alu.src[0].chan = i;
			if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_SAMPLE) {
				alu.src[1].sel = sample_gpr;
				alu.src[1].chan = 2;
			}
			else {
				r600_bytecode_src(&alu.src[1], &ctx->src[1], 0);
			}
			alu.src[2].sel = interp_gpr;
			alu.src[2].chan = interp_base_chan + i;
			alu.dst.sel = ctx->temp_reg;
			alu.dst.chan = i;
			alu.last = i == 1;

			r = r600_bytecode_add_alu(ctx->bc, &alu);
			if (r)
				return r;
		}

		/* temp = gradientsV * offset.y + temp */
		for (i = 0; i < 2; i++) {
			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
			alu.op = ALU_OP3_MULADD;
			alu.is_op3 = 1;
			alu.src[0].sel = gradientsV;
			alu.src[0].chan = i;
			if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_SAMPLE) {
				alu.src[1].sel = sample_gpr;
				alu.src[1].chan = 3;
			}
			else {
				r600_bytecode_src(&alu.src[1], &ctx->src[1], 1);
			}
			alu.src[2].sel = ctx->temp_reg;
			alu.src[2].chan = i;
			alu.dst.sel = ctx->temp_reg;
			alu.dst.chan = i;
			alu.last = i == 1;

			r = r600_bytecode_add_alu(ctx->bc, &alu);
			if (r)
				return r;
		}
	}

	/* Emit the interpolation proper: INTERP_ZW then INTERP_XY, each a
	 * full 4-slot group (only slots 2..5 actually write a result).
	 * The i/j operands alternate per slot (hence 1 - (i % 2)) and the
	 * parameter comes from the input's LDS position. The forced
	 * VEC_210 bank swizzle is required for the INTERP ops — see the
	 * hardware docs before touching. */
	tmp = r600_get_temp(ctx);
	for (i = 0; i < 8; i++) {
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = i < 4 ? ALU_OP2_INTERP_ZW : ALU_OP2_INTERP_XY;

		alu.dst.sel = tmp;
		if ((i > 1 && i < 6)) {
			alu.dst.write = 1;
		}
		else {
			alu.dst.write = 0;
		}
		alu.dst.chan = i % 4;

		if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_OFFSET ||
			inst->Instruction.Opcode == TGSI_OPCODE_INTERP_SAMPLE) {
			alu.src[0].sel = ctx->temp_reg;
			alu.src[0].chan = 1 - (i % 2);
		} else {
			alu.src[0].sel = interp_gpr;
			alu.src[0].chan = interp_base_chan + 1 - (i % 2);
		}
		alu.src[1].sel = V_SQ_ALU_SRC_PARAM_BASE + ctx->shader->input[input].lds_pos;
		alu.src[1].chan = 0;

		alu.last = i % 4 == 3;
		alu.bank_swizzle_force = SQ_ALU_VEC_210;

		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}

	// INTERP can't swizzle dst
	lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
	for (i = 0; i <= lasti; i++) {
		if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
			continue;

		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP1_MOV;
		alu.src[0].sel = tmp;
		alu.src[0].chan = ctx->src[0].swizzle[i];
		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
		alu.dst.write = 1;
		alu.last = i == lasti;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}

	return 0;
}
7223
7224
7225static int tgsi_helper_copy(struct r600_shader_ctx *ctx, struct tgsi_full_instruction *inst)
7226{
7227	struct r600_bytecode_alu alu;
7228	int i, r;
7229
7230	for (i = 0; i < 4; i++) {
7231		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7232		if (!(inst->Dst[0].Register.WriteMask & (1 << i))) {
7233			alu.op = ALU_OP0_NOP;
7234			alu.dst.chan = i;
7235		} else {
7236			alu.op = ALU_OP1_MOV;
7237			tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
7238			alu.src[0].sel = ctx->temp_reg;
7239			alu.src[0].chan = i;
7240		}
7241		if (i == 3) {
7242			alu.last = 1;
7243		}
7244		r = r600_bytecode_add_alu(ctx->bc, &alu);
7245		if (r)
7246			return r;
7247	}
7248	return 0;
7249}
7250
7251static int tgsi_make_src_for_op3(struct r600_shader_ctx *ctx,
7252                                 unsigned writemask,
7253                                 struct r600_bytecode_alu_src *bc_src,
7254                                 const struct r600_shader_src *shader_src)
7255{
7256	struct r600_bytecode_alu alu;
7257	int i, r;
7258	int lasti = tgsi_last_instruction(writemask);
7259	int temp_reg = 0;
7260
7261	r600_bytecode_src(&bc_src[0], shader_src, 0);
7262	r600_bytecode_src(&bc_src[1], shader_src, 1);
7263	r600_bytecode_src(&bc_src[2], shader_src, 2);
7264	r600_bytecode_src(&bc_src[3], shader_src, 3);
7265
7266	if (bc_src->abs) {
7267		temp_reg = r600_get_temp(ctx);
7268
7269		for (i = 0; i < lasti + 1; i++) {
7270			if (!(writemask & (1 << i)))
7271				continue;
7272			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7273			alu.op = ALU_OP1_MOV;
7274			alu.dst.sel = temp_reg;
7275			alu.dst.chan = i;
7276			alu.dst.write = 1;
7277			alu.src[0] = bc_src[i];
7278			if (i == lasti) {
7279				alu.last = 1;
7280			}
7281			r = r600_bytecode_add_alu(ctx->bc, &alu);
7282			if (r)
7283				return r;
7284			memset(&bc_src[i], 0, sizeof(*bc_src));
7285			bc_src[i].sel = temp_reg;
7286			bc_src[i].chan = i;
7287		}
7288	}
7289	return 0;
7290}
7291
7292static int tgsi_op3_dst(struct r600_shader_ctx *ctx, int dst)
7293{
7294	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
7295	struct r600_bytecode_alu alu;
7296	struct r600_bytecode_alu_src srcs[4][4];
7297	int i, j, r;
7298	int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
7299	unsigned op = ctx->inst_info->op;
7300
7301	if (op == ALU_OP3_MULADD_IEEE &&
7302	    ctx->info.properties[TGSI_PROPERTY_LEGACY_MATH_RULES])
7303		op = ALU_OP3_MULADD;
7304
7305	for (j = 0; j < inst->Instruction.NumSrcRegs; j++) {
7306		r = tgsi_make_src_for_op3(ctx, inst->Dst[0].Register.WriteMask,
7307					  srcs[j], &ctx->src[j]);
7308		if (r)
7309			return r;
7310	}
7311
7312	for (i = 0; i < lasti + 1; i++) {
7313		if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
7314			continue;
7315
7316		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7317		alu.op = op;
7318		for (j = 0; j < inst->Instruction.NumSrcRegs; j++) {
7319			alu.src[j] = srcs[j][i];
7320		}
7321
7322		if (dst == -1) {
7323			tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
7324		} else {
7325			alu.dst.sel = dst;
7326		}
7327		alu.dst.chan = i;
7328		alu.dst.write = 1;
7329		alu.is_op3 = 1;
7330		if (i == lasti) {
7331			alu.last = 1;
7332		}
7333		r = r600_bytecode_add_alu(ctx->bc, &alu);
7334		if (r)
7335			return r;
7336	}
7337	return 0;
7338}
7339
/* Emit a three-source ALU op targeting the instruction's own TGSI
 * destination (dst == -1 selects the TGSI-declared destination). */
static int tgsi_op3(struct r600_shader_ctx *ctx)
{
	return tgsi_op3_dst(ctx, -1);
}
7344
7345static int tgsi_dp(struct r600_shader_ctx *ctx)
7346{
7347	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
7348	struct r600_bytecode_alu alu;
7349	int i, j, r;
7350	unsigned op = ctx->inst_info->op;
7351	if (op == ALU_OP2_DOT4_IEEE &&
7352	    ctx->info.properties[TGSI_PROPERTY_LEGACY_MATH_RULES])
7353		op = ALU_OP2_DOT4;
7354
7355	for (i = 0; i < 4; i++) {
7356		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7357		alu.op = op;
7358		for (j = 0; j < inst->Instruction.NumSrcRegs; j++) {
7359			r600_bytecode_src(&alu.src[j], &ctx->src[j], i);
7360		}
7361
7362		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
7363		alu.dst.chan = i;
7364		alu.dst.write = (inst->Dst[0].Register.WriteMask >> i) & 1;
7365		/* handle some special cases */
7366		switch (inst->Instruction.Opcode) {
7367		case TGSI_OPCODE_DP2:
7368			if (i > 1) {
7369				alu.src[0].sel = alu.src[1].sel = V_SQ_ALU_SRC_0;
7370				alu.src[0].chan = alu.src[1].chan = 0;
7371			}
7372			break;
7373		case TGSI_OPCODE_DP3:
7374			if (i > 2) {
7375				alu.src[0].sel = alu.src[1].sel = V_SQ_ALU_SRC_0;
7376				alu.src[0].chan = alu.src[1].chan = 0;
7377			}
7378			break;
7379		default:
7380			break;
7381		}
7382		if (i == 3) {
7383			alu.last = 1;
7384		}
7385		r = r600_bytecode_add_alu(ctx->bc, &alu);
7386		if (r)
7387			return r;
7388	}
7389	return 0;
7390}
7391
7392static inline boolean tgsi_tex_src_requires_loading(struct r600_shader_ctx *ctx,
7393						    unsigned index)
7394{
7395	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
7396	return 	(inst->Src[index].Register.File != TGSI_FILE_TEMPORARY &&
7397		inst->Src[index].Register.File != TGSI_FILE_INPUT &&
7398		inst->Src[index].Register.File != TGSI_FILE_OUTPUT) ||
7399		ctx->src[index].neg || ctx->src[index].abs ||
7400		(inst->Src[index].Register.File == TGSI_FILE_INPUT && ctx->type == PIPE_SHADER_GEOMETRY);
7401}
7402
7403static inline unsigned tgsi_tex_get_src_gpr(struct r600_shader_ctx *ctx,
7404					unsigned index)
7405{
7406	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
7407	return ctx->file_offset[inst->Src[index].Register.File] + inst->Src[index].Register.Index;
7408}
7409
7410static int do_vtx_fetch_inst(struct r600_shader_ctx *ctx, boolean src_requires_loading)
7411{
7412	struct r600_bytecode_vtx vtx;
7413	struct r600_bytecode_alu alu;
7414	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
7415	int src_gpr, r, i;
7416	int id = tgsi_tex_get_src_gpr(ctx, 1);
7417	int sampler_index_mode = inst->Src[1].Indirect.Index == 2 ? 2 : 0; // CF_INDEX_1 : CF_INDEX_NONE
7418
7419	src_gpr = tgsi_tex_get_src_gpr(ctx, 0);
7420	if (src_requires_loading) {
7421		for (i = 0; i < 4; i++) {
7422			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7423			alu.op = ALU_OP1_MOV;
7424			r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
7425			alu.dst.sel = ctx->temp_reg;
7426			alu.dst.chan = i;
7427			if (i == 3)
7428				alu.last = 1;
7429			alu.dst.write = 1;
7430			r = r600_bytecode_add_alu(ctx->bc, &alu);
7431			if (r)
7432				return r;
7433		}
7434		src_gpr = ctx->temp_reg;
7435	}
7436
7437	memset(&vtx, 0, sizeof(vtx));
7438	vtx.op = FETCH_OP_VFETCH;
7439	vtx.buffer_id = id + R600_MAX_CONST_BUFFERS;
7440	vtx.fetch_type = SQ_VTX_FETCH_NO_INDEX_OFFSET;
7441	vtx.src_gpr = src_gpr;
7442	vtx.mega_fetch_count = 16;
7443	vtx.dst_gpr = ctx->file_offset[inst->Dst[0].Register.File] + inst->Dst[0].Register.Index;
7444	vtx.dst_sel_x = (inst->Dst[0].Register.WriteMask & 1) ? 0 : 7;		/* SEL_X */
7445	vtx.dst_sel_y = (inst->Dst[0].Register.WriteMask & 2) ? 1 : 7;		/* SEL_Y */
7446	vtx.dst_sel_z = (inst->Dst[0].Register.WriteMask & 4) ? 2 : 7;		/* SEL_Z */
7447	vtx.dst_sel_w = (inst->Dst[0].Register.WriteMask & 8) ? 3 : 7;		/* SEL_W */
7448	vtx.use_const_fields = 1;
7449	vtx.buffer_index_mode = sampler_index_mode;
7450
7451	if ((r = r600_bytecode_add_vtx(ctx->bc, &vtx)))
7452		return r;
7453
7454	if (ctx->bc->gfx_level >= EVERGREEN)
7455		return 0;
7456
7457	for (i = 0; i < 4; i++) {
7458		int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
7459		if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
7460			continue;
7461
7462		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7463		alu.op = ALU_OP2_AND_INT;
7464
7465		alu.dst.chan = i;
7466		alu.dst.sel = vtx.dst_gpr;
7467		alu.dst.write = 1;
7468
7469		alu.src[0].sel = vtx.dst_gpr;
7470		alu.src[0].chan = i;
7471
7472		alu.src[1].sel = R600_SHADER_BUFFER_INFO_SEL;
7473		alu.src[1].sel += (id * 2);
7474		alu.src[1].chan = i % 4;
7475		alu.src[1].kc_bank = R600_BUFFER_INFO_CONST_BUFFER;
7476
7477		if (i == lasti)
7478			alu.last = 1;
7479		r = r600_bytecode_add_alu(ctx->bc, &alu);
7480		if (r)
7481			return r;
7482	}
7483
7484	if (inst->Dst[0].Register.WriteMask & 3) {
7485		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7486		alu.op = ALU_OP2_OR_INT;
7487
7488		alu.dst.chan = 3;
7489		alu.dst.sel = vtx.dst_gpr;
7490		alu.dst.write = 1;
7491
7492		alu.src[0].sel = vtx.dst_gpr;
7493		alu.src[0].chan = 3;
7494
7495		alu.src[1].sel = R600_SHADER_BUFFER_INFO_SEL + (id * 2) + 1;
7496		alu.src[1].chan = 0;
7497		alu.src[1].kc_bank = R600_BUFFER_INFO_CONST_BUFFER;
7498
7499		alu.last = 1;
7500		r = r600_bytecode_add_alu(ctx->bc, &alu);
7501		if (r)
7502			return r;
7503	}
7504	return 0;
7505}
7506
7507static int r600_do_buffer_txq(struct r600_shader_ctx *ctx, int reg_idx, int offset, int eg_buffer_base)
7508{
7509	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
7510	int r;
7511	int id = tgsi_tex_get_src_gpr(ctx, reg_idx) + offset;
7512	int sampler_index_mode = inst->Src[reg_idx].Indirect.Index == 2 ? 2 : 0; // CF_INDEX_1 : CF_INDEX_NONE
7513
7514	if (ctx->bc->gfx_level < EVERGREEN) {
7515		struct r600_bytecode_alu alu;
7516		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7517		alu.op = ALU_OP1_MOV;
7518		alu.src[0].sel = R600_SHADER_BUFFER_INFO_SEL;
7519		/* r600 we have them at channel 2 of the second dword */
7520		alu.src[0].sel += (id * 2) + 1;
7521		alu.src[0].chan = 1;
7522		alu.src[0].kc_bank = R600_BUFFER_INFO_CONST_BUFFER;
7523		tgsi_dst(ctx, &inst->Dst[0], 0, &alu.dst);
7524		alu.last = 1;
7525		r = r600_bytecode_add_alu(ctx->bc, &alu);
7526		if (r)
7527			return r;
7528		return 0;
7529	} else {
7530		struct r600_bytecode_vtx vtx;
7531		memset(&vtx, 0, sizeof(vtx));
7532		vtx.op = FETCH_OP_GET_BUFFER_RESINFO;
7533		vtx.buffer_id = id + eg_buffer_base;
7534		vtx.fetch_type = SQ_VTX_FETCH_NO_INDEX_OFFSET;
7535		vtx.src_gpr = 0;
7536		vtx.mega_fetch_count = 16; /* no idea here really... */
7537		vtx.dst_gpr = ctx->file_offset[inst->Dst[0].Register.File] + inst->Dst[0].Register.Index;
7538		vtx.dst_sel_x = (inst->Dst[0].Register.WriteMask & 1) ? 0 : 7;		/* SEL_X */
7539		vtx.dst_sel_y = (inst->Dst[0].Register.WriteMask & 2) ? 4 : 7;		/* SEL_Y */
7540		vtx.dst_sel_z = (inst->Dst[0].Register.WriteMask & 4) ? 4 : 7;		/* SEL_Z */
7541		vtx.dst_sel_w = (inst->Dst[0].Register.WriteMask & 8) ? 4 : 7;		/* SEL_W */
7542		vtx.data_format = FMT_32_32_32_32;
7543		vtx.buffer_index_mode = sampler_index_mode;
7544
7545		if ((r = r600_bytecode_add_vtx_tc(ctx->bc, &vtx)))
7546			return r;
7547		return 0;
7548	}
7549}
7550
7551
7552static int tgsi_tex(struct r600_shader_ctx *ctx)
7553{
7554	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
7555	struct r600_bytecode_tex tex;
7556	struct r600_bytecode_tex grad_offs[3];
7557	struct r600_bytecode_alu alu;
7558	unsigned src_gpr;
7559	int r, i, j, n_grad_offs = 0;
7560	int opcode;
7561	bool read_compressed_msaa = ctx->bc->has_compressed_msaa_texturing &&
7562				    inst->Instruction.Opcode == TGSI_OPCODE_TXF &&
7563				    (inst->Texture.Texture == TGSI_TEXTURE_2D_MSAA ||
7564				     inst->Texture.Texture == TGSI_TEXTURE_2D_ARRAY_MSAA);
7565
7566	bool txf_add_offsets = inst->Texture.NumOffsets &&
7567			     inst->Instruction.Opcode == TGSI_OPCODE_TXF &&
7568			     inst->Texture.Texture != TGSI_TEXTURE_BUFFER;
7569
7570	/* Texture fetch instructions can only use gprs as source.
7571	 * Also they cannot negate the source or take the absolute value */
7572	const boolean src_requires_loading = (inst->Instruction.Opcode != TGSI_OPCODE_TXQS &&
7573                                              tgsi_tex_src_requires_loading(ctx, 0)) ||
7574					     read_compressed_msaa || txf_add_offsets;
7575
7576	boolean src_loaded = FALSE;
7577	unsigned sampler_src_reg = 1;
7578	int8_t offset_x = 0, offset_y = 0, offset_z = 0;
7579	boolean has_txq_cube_array_z = false;
7580	unsigned sampler_index_mode;
7581	int array_index_offset_channel = -1;
7582
7583	if (inst->Instruction.Opcode == TGSI_OPCODE_TXQ &&
7584	    ((inst->Texture.Texture == TGSI_TEXTURE_CUBE_ARRAY ||
7585	      inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE_ARRAY)))
7586		if (inst->Dst[0].Register.WriteMask & 4) {
7587			ctx->shader->has_txq_cube_array_z_comp = true;
7588			has_txq_cube_array_z = true;
7589		}
7590
7591	if (inst->Instruction.Opcode == TGSI_OPCODE_TEX2 ||
7592	    inst->Instruction.Opcode == TGSI_OPCODE_TXB2 ||
7593	    inst->Instruction.Opcode == TGSI_OPCODE_TXL2 ||
7594	    inst->Instruction.Opcode == TGSI_OPCODE_TG4)
7595		sampler_src_reg = 2;
7596
7597	/* TGSI moves the sampler to src reg 3 for TXD */
7598	if (inst->Instruction.Opcode == TGSI_OPCODE_TXD)
7599		sampler_src_reg = 3;
7600
7601	sampler_index_mode = inst->Src[sampler_src_reg].Indirect.Index == 2 ? 2 : 0; // CF_INDEX_1 : CF_INDEX_NONE
7602
7603	src_gpr = tgsi_tex_get_src_gpr(ctx, 0);
7604
7605	if (inst->Texture.Texture == TGSI_TEXTURE_BUFFER) {
7606		if (inst->Instruction.Opcode == TGSI_OPCODE_TXQ) {
7607			if (ctx->bc->gfx_level < EVERGREEN)
7608				ctx->shader->uses_tex_buffers = true;
7609			return r600_do_buffer_txq(ctx, 1, 0, R600_MAX_CONST_BUFFERS);
7610		}
7611		else if (inst->Instruction.Opcode == TGSI_OPCODE_TXF) {
7612			if (ctx->bc->gfx_level < EVERGREEN)
7613				ctx->shader->uses_tex_buffers = true;
7614			return do_vtx_fetch_inst(ctx, src_requires_loading);
7615		}
7616	}
7617
7618	if (inst->Instruction.Opcode == TGSI_OPCODE_TXP) {
7619		int out_chan;
7620		/* Add perspective divide */
7621		if (ctx->bc->gfx_level == CAYMAN) {
7622			out_chan = 2;
7623			for (i = 0; i < 3; i++) {
7624				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7625				alu.op = ALU_OP1_RECIP_IEEE;
7626				r600_bytecode_src(&alu.src[0], &ctx->src[0], 3);
7627
7628				alu.dst.sel = ctx->temp_reg;
7629				alu.dst.chan = i;
7630				if (i == 2)
7631					alu.last = 1;
7632				if (out_chan == i)
7633					alu.dst.write = 1;
7634				r = r600_bytecode_add_alu(ctx->bc, &alu);
7635				if (r)
7636					return r;
7637			}
7638
7639		} else {
7640			out_chan = 3;
7641			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7642			alu.op = ALU_OP1_RECIP_IEEE;
7643			r600_bytecode_src(&alu.src[0], &ctx->src[0], 3);
7644
7645			alu.dst.sel = ctx->temp_reg;
7646			alu.dst.chan = out_chan;
7647			alu.last = 1;
7648			alu.dst.write = 1;
7649			r = r600_bytecode_add_alu(ctx->bc, &alu);
7650			if (r)
7651				return r;
7652		}
7653
7654		for (i = 0; i < 3; i++) {
7655			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7656			alu.op = ALU_OP2_MUL;
7657			alu.src[0].sel = ctx->temp_reg;
7658			alu.src[0].chan = out_chan;
7659			r600_bytecode_src(&alu.src[1], &ctx->src[0], i);
7660			alu.dst.sel = ctx->temp_reg;
7661			alu.dst.chan = i;
7662			alu.dst.write = 1;
7663			r = r600_bytecode_add_alu(ctx->bc, &alu);
7664			if (r)
7665				return r;
7666		}
7667		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7668		alu.op = ALU_OP1_MOV;
7669		alu.src[0].sel = V_SQ_ALU_SRC_1;
7670		alu.src[0].chan = 0;
7671		alu.dst.sel = ctx->temp_reg;
7672		alu.dst.chan = 3;
7673		alu.last = 1;
7674		alu.dst.write = 1;
7675		r = r600_bytecode_add_alu(ctx->bc, &alu);
7676		if (r)
7677			return r;
7678		src_loaded = TRUE;
7679		src_gpr = ctx->temp_reg;
7680	}
7681
7682
7683	if ((inst->Texture.Texture == TGSI_TEXTURE_CUBE ||
7684	     inst->Texture.Texture == TGSI_TEXTURE_CUBE_ARRAY ||
7685	     inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE ||
7686	     inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE_ARRAY) &&
7687	    inst->Instruction.Opcode != TGSI_OPCODE_TXQ) {
7688
7689		static const unsigned src0_swizzle[] = {2, 2, 0, 1};
7690		static const unsigned src1_swizzle[] = {1, 0, 2, 2};
7691
7692		/* tmp1.xyzw = CUBE(R0.zzxy, R0.yxzz) */
7693		for (i = 0; i < 4; i++) {
7694			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7695			alu.op = ALU_OP2_CUBE;
7696			r600_bytecode_src(&alu.src[0], &ctx->src[0], src0_swizzle[i]);
7697			r600_bytecode_src(&alu.src[1], &ctx->src[0], src1_swizzle[i]);
7698			alu.dst.sel = ctx->temp_reg;
7699			alu.dst.chan = i;
7700			if (i == 3)
7701				alu.last = 1;
7702			alu.dst.write = 1;
7703			r = r600_bytecode_add_alu(ctx->bc, &alu);
7704			if (r)
7705				return r;
7706		}
7707
7708		/* tmp1.z = RCP_e(|tmp1.z|) */
7709		if (ctx->bc->gfx_level == CAYMAN) {
7710			for (i = 0; i < 3; i++) {
7711				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7712				alu.op = ALU_OP1_RECIP_IEEE;
7713				alu.src[0].sel = ctx->temp_reg;
7714				alu.src[0].chan = 2;
7715				alu.src[0].abs = 1;
7716				alu.dst.sel = ctx->temp_reg;
7717				alu.dst.chan = i;
7718				if (i == 2)
7719					alu.dst.write = 1;
7720				if (i == 2)
7721					alu.last = 1;
7722				r = r600_bytecode_add_alu(ctx->bc, &alu);
7723				if (r)
7724					return r;
7725			}
7726		} else {
7727			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7728			alu.op = ALU_OP1_RECIP_IEEE;
7729			alu.src[0].sel = ctx->temp_reg;
7730			alu.src[0].chan = 2;
7731			alu.src[0].abs = 1;
7732			alu.dst.sel = ctx->temp_reg;
7733			alu.dst.chan = 2;
7734			alu.dst.write = 1;
7735			alu.last = 1;
7736			r = r600_bytecode_add_alu(ctx->bc, &alu);
7737			if (r)
7738				return r;
7739		}
7740
7741		/* MULADD R0.x,  R0.x,  PS1,  (0x3FC00000, 1.5f).x
7742		 * MULADD R0.y,  R0.y,  PS1,  (0x3FC00000, 1.5f).x
7743		 * muladd has no writemask, have to use another temp
7744		 */
7745		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7746		alu.op = ALU_OP3_MULADD;
7747		alu.is_op3 = 1;
7748
7749		alu.src[0].sel = ctx->temp_reg;
7750		alu.src[0].chan = 0;
7751		alu.src[1].sel = ctx->temp_reg;
7752		alu.src[1].chan = 2;
7753
7754		alu.src[2].sel = V_SQ_ALU_SRC_LITERAL;
7755		alu.src[2].chan = 0;
7756		alu.src[2].value = u_bitcast_f2u(1.5f);
7757
7758		alu.dst.sel = ctx->temp_reg;
7759		alu.dst.chan = 0;
7760		alu.dst.write = 1;
7761
7762		r = r600_bytecode_add_alu(ctx->bc, &alu);
7763		if (r)
7764			return r;
7765
7766		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7767		alu.op = ALU_OP3_MULADD;
7768		alu.is_op3 = 1;
7769
7770		alu.src[0].sel = ctx->temp_reg;
7771		alu.src[0].chan = 1;
7772		alu.src[1].sel = ctx->temp_reg;
7773		alu.src[1].chan = 2;
7774
7775		alu.src[2].sel = V_SQ_ALU_SRC_LITERAL;
7776		alu.src[2].chan = 0;
7777		alu.src[2].value = u_bitcast_f2u(1.5f);
7778
7779		alu.dst.sel = ctx->temp_reg;
7780		alu.dst.chan = 1;
7781		alu.dst.write = 1;
7782
7783		alu.last = 1;
7784		r = r600_bytecode_add_alu(ctx->bc, &alu);
7785		if (r)
7786			return r;
7787		/* write initial compare value into Z component
7788		  - W src 0 for shadow cube
7789		  - X src 1 for shadow cube array */
7790		if (inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE ||
7791		    inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE_ARRAY) {
7792			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7793			alu.op = ALU_OP1_MOV;
7794			if (inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE_ARRAY)
7795				r600_bytecode_src(&alu.src[0], &ctx->src[1], 0);
7796			else
7797				r600_bytecode_src(&alu.src[0], &ctx->src[0], 3);
7798			alu.dst.sel = ctx->temp_reg;
7799			alu.dst.chan = 2;
7800			alu.dst.write = 1;
7801			alu.last = 1;
7802			r = r600_bytecode_add_alu(ctx->bc, &alu);
7803			if (r)
7804				return r;
7805		}
7806
7807		if (inst->Texture.Texture == TGSI_TEXTURE_CUBE_ARRAY ||
7808		    inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE_ARRAY) {
7809			if (ctx->bc->gfx_level >= EVERGREEN) {
7810				int mytmp = r600_get_temp(ctx);
7811				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7812				alu.op = ALU_OP1_MOV;
7813				alu.src[0].sel = ctx->temp_reg;
7814				alu.src[0].chan = 3;
7815				alu.dst.sel = mytmp;
7816				alu.dst.chan = 0;
7817				alu.dst.write = 1;
7818				alu.last = 1;
7819				r = r600_bytecode_add_alu(ctx->bc, &alu);
7820				if (r)
7821					return r;
7822
7823				/* Evaluate the array index according to floor(idx + 0.5). This
7824				 * needs to be done before merging the face select value, because
7825				 * otherwise the fractional part of the array index will interfere
7826				 * with the face select value */
7827				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7828				r600_bytecode_src(&alu.src[0], &ctx->src[0], 3);
7829				alu.op = ALU_OP1_RNDNE;
7830				alu.dst.sel = ctx->temp_reg;
7831				alu.dst.chan = 3;
7832				alu.dst.write = 1;
7833				alu.last = 1;
7834				r = r600_bytecode_add_alu(ctx->bc, &alu);
7835				if (r)
7836					return r;
7837
7838				/* Because the array slice index and the cube face index are merged
7839				 * into one value we have to make sure the array slice index is >= 0,
7840				 * otherwise the face selection will fail */
7841				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7842				alu.op = ALU_OP2_MAX;
7843				alu.src[0].sel = ctx->temp_reg;
7844				alu.src[0].chan = 3;
7845				alu.src[1].sel = V_SQ_ALU_SRC_0;
7846				alu.dst.sel = ctx->temp_reg;
7847				alu.dst.chan = 3;
7848				alu.dst.write = 1;
7849				alu.last = 1;
7850				r = r600_bytecode_add_alu(ctx->bc, &alu);
7851				if (r)
7852					return r;
7853
7854				/* have to multiply original layer by 8 and add to face id (temp.w) in Z */
7855				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7856				alu.op = ALU_OP3_MULADD;
7857				alu.is_op3 = 1;
7858				alu.src[0].sel = ctx->temp_reg;
7859				alu.src[0].chan = 3;
7860				alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
7861				alu.src[1].chan = 0;
7862				alu.src[1].value = u_bitcast_f2u(8.0f);
7863				alu.src[2].sel = mytmp;
7864				alu.src[2].chan = 0;
7865				alu.dst.sel = ctx->temp_reg;
7866				alu.dst.chan = 3;
7867				alu.dst.write = 1;
7868				alu.last = 1;
7869				r = r600_bytecode_add_alu(ctx->bc, &alu);
7870				if (r)
7871					return r;
7872			} else if (ctx->bc->gfx_level < EVERGREEN) {
7873				memset(&tex, 0, sizeof(struct r600_bytecode_tex));
7874				tex.op = FETCH_OP_SET_CUBEMAP_INDEX;
7875				tex.sampler_id = tgsi_tex_get_src_gpr(ctx, sampler_src_reg);
7876				tex.resource_id = tex.sampler_id + R600_MAX_CONST_BUFFERS;
7877				tex.src_gpr = r600_get_temp(ctx);
7878				tex.src_sel_x = 0;
7879				tex.src_sel_y = 0;
7880				tex.src_sel_z = 0;
7881				tex.src_sel_w = 0;
7882				tex.dst_sel_x = tex.dst_sel_y = tex.dst_sel_z = tex.dst_sel_w = 7;
7883				tex.coord_type_x = 1;
7884				tex.coord_type_y = 1;
7885				tex.coord_type_z = 1;
7886				tex.coord_type_w = 1;
7887				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7888				alu.op = ALU_OP1_MOV;
7889				r600_bytecode_src(&alu.src[0], &ctx->src[0], 3);
7890				alu.dst.sel = tex.src_gpr;
7891				alu.dst.chan = 0;
7892				alu.last = 1;
7893				alu.dst.write = 1;
7894				r = r600_bytecode_add_alu(ctx->bc, &alu);
7895				if (r)
7896					return r;
7897
7898				r = r600_bytecode_add_tex(ctx->bc, &tex);
7899				if (r)
7900					return r;
7901			}
7902
7903		}
7904
7905		/* for cube forms of lod and bias we need to route things */
7906		if (inst->Instruction.Opcode == TGSI_OPCODE_TXB ||
7907		    inst->Instruction.Opcode == TGSI_OPCODE_TXL ||
7908		    inst->Instruction.Opcode == TGSI_OPCODE_TXB2 ||
7909		    inst->Instruction.Opcode == TGSI_OPCODE_TXL2) {
7910			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7911			alu.op = ALU_OP1_MOV;
7912			if (inst->Instruction.Opcode == TGSI_OPCODE_TXB2 ||
7913			    inst->Instruction.Opcode == TGSI_OPCODE_TXL2)
7914				r600_bytecode_src(&alu.src[0], &ctx->src[1], 0);
7915			else
7916				r600_bytecode_src(&alu.src[0], &ctx->src[0], 3);
7917			alu.dst.sel = ctx->temp_reg;
7918			alu.dst.chan = 2;
7919			alu.last = 1;
7920			alu.dst.write = 1;
7921			r = r600_bytecode_add_alu(ctx->bc, &alu);
7922			if (r)
7923				return r;
7924		}
7925
7926		src_loaded = TRUE;
7927		src_gpr = ctx->temp_reg;
7928	}
7929
7930	if (inst->Instruction.Opcode == TGSI_OPCODE_TXD) {
7931		int temp_h = 0, temp_v = 0;
7932		int start_val = 0;
7933
7934		/* if we've already loaded the src (i.e. CUBE don't reload it). */
7935		if (src_loaded == TRUE)
7936			start_val = 1;
7937		else
7938			src_loaded = TRUE;
7939		for (i = start_val; i < 3; i++) {
7940			int treg = r600_get_temp(ctx);
7941
7942			if (i == 0)
7943				src_gpr = treg;
7944			else if (i == 1)
7945				temp_h = treg;
7946			else
7947				temp_v = treg;
7948
7949			for (j = 0; j < 4; j++) {
7950				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7951				alu.op = ALU_OP1_MOV;
7952                                r600_bytecode_src(&alu.src[0], &ctx->src[i], j);
7953                                alu.dst.sel = treg;
7954                                alu.dst.chan = j;
7955                                if (j == 3)
7956                                   alu.last = 1;
7957                                alu.dst.write = 1;
7958                                r = r600_bytecode_add_alu(ctx->bc, &alu);
7959                                if (r)
7960                                    return r;
7961			}
7962		}
7963		for (i = 1; i < 3; i++) {
7964			/* set gradients h/v */
7965			struct r600_bytecode_tex *t = &grad_offs[n_grad_offs++];
7966			memset(t, 0, sizeof(struct r600_bytecode_tex));
7967			t->op = (i == 1) ? FETCH_OP_SET_GRADIENTS_H :
7968				FETCH_OP_SET_GRADIENTS_V;
7969			t->sampler_id = tgsi_tex_get_src_gpr(ctx, sampler_src_reg);
7970			t->sampler_index_mode = sampler_index_mode;
7971			t->resource_id = t->sampler_id + R600_MAX_CONST_BUFFERS;
7972			t->resource_index_mode = sampler_index_mode;
7973
7974			t->src_gpr = (i == 1) ? temp_h : temp_v;
7975			t->src_sel_x = 0;
7976			t->src_sel_y = 1;
7977			t->src_sel_z = 2;
7978			t->src_sel_w = 3;
7979
7980			t->dst_gpr = r600_get_temp(ctx); /* just to avoid confusing the asm scheduler */
7981			t->dst_sel_x = t->dst_sel_y = t->dst_sel_z = t->dst_sel_w = 7;
7982			if (inst->Texture.Texture != TGSI_TEXTURE_RECT) {
7983				t->coord_type_x = 1;
7984				t->coord_type_y = 1;
7985				t->coord_type_z = 1;
7986				t->coord_type_w = 1;
7987			}
7988		}
7989	}
7990
7991	if (inst->Instruction.Opcode == TGSI_OPCODE_TG4) {
7992		/* Gather4 should follow the same rules as bilinear filtering, but the hardware
7993		 * incorrectly forces nearest filtering if the texture format is integer.
7994		 * The only effect it has on Gather4, which always returns 4 texels for
7995		 * bilinear filtering, is that the final coordinates are off by 0.5 of
7996		 * the texel size.
7997		 *
7998		 * The workaround is to subtract 0.5 from the unnormalized coordinates,
7999		 * or (0.5 / size) from the normalized coordinates.
8000		 */
8001		if (inst->Texture.ReturnType == TGSI_RETURN_TYPE_SINT ||
8002		    inst->Texture.ReturnType == TGSI_RETURN_TYPE_UINT) {
8003			int treg = r600_get_temp(ctx);
8004
8005			/* mov array and comparison oordinate to temp_reg if needed */
8006			if ((inst->Texture.Texture == TGSI_TEXTURE_SHADOW2D ||
8007			     inst->Texture.Texture == TGSI_TEXTURE_2D_ARRAY ||
8008			     inst->Texture.Texture == TGSI_TEXTURE_SHADOW2D_ARRAY) && !src_loaded) {
8009				int end = inst->Texture.Texture == TGSI_TEXTURE_SHADOW2D_ARRAY ? 3 : 2;
8010				for (i = 2; i <= end; i++) {
8011					memset(&alu, 0, sizeof(struct r600_bytecode_alu));
8012					alu.op = ALU_OP1_MOV;
8013					alu.dst.sel = ctx->temp_reg;
8014					alu.dst.chan = i;
8015					alu.dst.write = 1;
8016					alu.last = (i == end);
8017					r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
8018					r = r600_bytecode_add_alu(ctx->bc, &alu);
8019					if (r)
8020						return r;
8021				}
8022			}
8023
8024			if (inst->Texture.Texture == TGSI_TEXTURE_RECT ||
8025			    inst->Texture.Texture == TGSI_TEXTURE_SHADOWRECT) {
8026				for (i = 0; i < 2; i++) {
8027					memset(&alu, 0, sizeof(struct r600_bytecode_alu));
8028					alu.op = ALU_OP2_ADD;
8029					alu.dst.sel = ctx->temp_reg;
8030					alu.dst.chan = i;
8031					alu.dst.write = 1;
8032					alu.last = i == 1;
8033					if (src_loaded) {
8034						alu.src[0].sel = ctx->temp_reg;
8035						alu.src[0].chan = i;
8036					} else
8037						r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
8038					alu.src[1].sel = V_SQ_ALU_SRC_0_5;
8039					alu.src[1].neg = 1;
8040					r = r600_bytecode_add_alu(ctx->bc, &alu);
8041					if (r)
8042						return r;
8043				}
8044			} else {
8045				/* execute a TXQ */
8046				memset(&tex, 0, sizeof(struct r600_bytecode_tex));
8047				tex.op = FETCH_OP_GET_TEXTURE_RESINFO;
8048				tex.sampler_id = tgsi_tex_get_src_gpr(ctx, sampler_src_reg);
8049				tex.sampler_index_mode = sampler_index_mode;
8050				tex.resource_id = tex.sampler_id + R600_MAX_CONST_BUFFERS;
8051				tex.resource_index_mode = sampler_index_mode;
8052				tex.dst_gpr = treg;
8053				tex.src_sel_x = 4;
8054				tex.src_sel_y = 4;
8055				tex.src_sel_z = 4;
8056				tex.src_sel_w = 4;
8057				tex.dst_sel_x = 0;
8058				tex.dst_sel_y = 1;
8059				tex.dst_sel_z = 7;
8060				tex.dst_sel_w = 7;
8061				r = r600_bytecode_add_tex(ctx->bc, &tex);
8062				if (r)
8063					return r;
8064
8065				/* coord.xy = -0.5 * (1.0/int_to_flt(size)) + coord.xy */
8066				if (ctx->bc->gfx_level == CAYMAN) {
8067					/* */
8068					for (i = 0; i < 2; i++) {
8069						memset(&alu, 0, sizeof(struct r600_bytecode_alu));
8070						alu.op = ALU_OP1_INT_TO_FLT;
8071						alu.dst.sel = treg;
8072						alu.dst.chan = i;
8073						alu.dst.write = 1;
8074						alu.src[0].sel = treg;
8075						alu.src[0].chan = i;
8076						alu.last = (i == 1) ? 1 : 0;
8077						r = r600_bytecode_add_alu(ctx->bc, &alu);
8078						if (r)
8079							return r;
8080					}
8081					for (j = 0; j < 2; j++) {
8082						for (i = 0; i < 3; i++) {
8083							memset(&alu, 0, sizeof(struct r600_bytecode_alu));
8084							alu.op = ALU_OP1_RECIP_IEEE;
8085							alu.src[0].sel = treg;
8086							alu.src[0].chan = j;
8087							alu.dst.sel = treg;
8088							alu.dst.chan = i;
8089							if (i == 2)
8090								alu.last = 1;
8091							if (i == j)
8092								alu.dst.write = 1;
8093							r = r600_bytecode_add_alu(ctx->bc, &alu);
8094							if (r)
8095								return r;
8096						}
8097					}
8098				} else {
8099					for (i = 0; i < 2; i++) {
8100						memset(&alu, 0, sizeof(struct r600_bytecode_alu));
8101						alu.op = ALU_OP1_INT_TO_FLT;
8102						alu.dst.sel = treg;
8103						alu.dst.chan = i;
8104						alu.dst.write = 1;
8105						alu.src[0].sel = treg;
8106						alu.src[0].chan = i;
8107						alu.last = 1;
8108						r = r600_bytecode_add_alu(ctx->bc, &alu);
8109						if (r)
8110							return r;
8111					}
8112					for (i = 0; i < 2; i++) {
8113						memset(&alu, 0, sizeof(struct r600_bytecode_alu));
8114						alu.op = ALU_OP1_RECIP_IEEE;
8115						alu.src[0].sel = treg;
8116						alu.src[0].chan = i;
8117						alu.dst.sel = treg;
8118						alu.dst.chan = i;
8119						alu.last = 1;
8120						alu.dst.write = 1;
8121						r = r600_bytecode_add_alu(ctx->bc, &alu);
8122						if (r)
8123							return r;
8124					}
8125				}
8126				for (i = 0; i < 2; i++) {
8127					memset(&alu, 0, sizeof(struct r600_bytecode_alu));
8128					alu.op = ALU_OP3_MULADD;
8129					alu.is_op3 = 1;
8130					alu.dst.sel = ctx->temp_reg;
8131					alu.dst.chan = i;
8132					alu.dst.write = 1;
8133					alu.last = i == 1;
8134					alu.src[0].sel = treg;
8135					alu.src[0].chan = i;
8136					alu.src[1].sel = V_SQ_ALU_SRC_0_5;
8137					alu.src[1].neg = 1;
8138					if (src_loaded) {
8139						alu.src[2].sel = ctx->temp_reg;
8140						alu.src[2].chan = i;
8141					} else
8142						r600_bytecode_src(&alu.src[2], &ctx->src[0], i);
8143					r = r600_bytecode_add_alu(ctx->bc, &alu);
8144					if (r)
8145						return r;
8146				}
8147			}
8148			src_loaded = TRUE;
8149			src_gpr = ctx->temp_reg;
8150		}
8151	}
8152
8153	if (src_requires_loading && !src_loaded) {
8154		for (i = 0; i < 4; i++) {
8155			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
8156			alu.op = ALU_OP1_MOV;
8157			r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
8158			alu.dst.sel = ctx->temp_reg;
8159			alu.dst.chan = i;
8160			if (i == 3)
8161				alu.last = 1;
8162			alu.dst.write = 1;
8163			r = r600_bytecode_add_alu(ctx->bc, &alu);
8164			if (r)
8165				return r;
8166		}
8167		src_loaded = TRUE;
8168		src_gpr = ctx->temp_reg;
8169	}
8170
8171	/* get offset values */
8172	if (inst->Texture.NumOffsets) {
8173		assert(inst->Texture.NumOffsets == 1);
8174
8175		/* The texture offset feature doesn't work with the TXF instruction
8176		 * and must be emulated by adding the offset to the texture coordinates. */
8177		if (txf_add_offsets) {
8178			const struct tgsi_texture_offset *off = inst->TexOffsets;
8179
8180			switch (inst->Texture.Texture) {
8181			case TGSI_TEXTURE_3D:
8182				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
8183				alu.op = ALU_OP2_ADD_INT;
8184				alu.src[0].sel = src_gpr;
8185				alu.src[0].chan = 2;
8186				alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
8187				alu.src[1].value = ctx->literals[4 * off[0].Index + off[0].SwizzleZ];
8188				alu.dst.sel = src_gpr;
8189				alu.dst.chan = 2;
8190				alu.dst.write = 1;
8191				alu.last = 1;
8192				r = r600_bytecode_add_alu(ctx->bc, &alu);
8193				if (r)
8194					return r;
8195				FALLTHROUGH;
8196
8197			case TGSI_TEXTURE_2D:
8198			case TGSI_TEXTURE_SHADOW2D:
8199			case TGSI_TEXTURE_RECT:
8200			case TGSI_TEXTURE_SHADOWRECT:
8201			case TGSI_TEXTURE_2D_ARRAY:
8202			case TGSI_TEXTURE_SHADOW2D_ARRAY:
8203				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
8204				alu.op = ALU_OP2_ADD_INT;
8205				alu.src[0].sel = src_gpr;
8206				alu.src[0].chan = 1;
8207				alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
8208				alu.src[1].value = ctx->literals[4 * off[0].Index + off[0].SwizzleY];
8209				alu.dst.sel = src_gpr;
8210				alu.dst.chan = 1;
8211				alu.dst.write = 1;
8212				alu.last = 1;
8213				r = r600_bytecode_add_alu(ctx->bc, &alu);
8214				if (r)
8215					return r;
8216				FALLTHROUGH;
8217
8218			case TGSI_TEXTURE_1D:
8219			case TGSI_TEXTURE_SHADOW1D:
8220			case TGSI_TEXTURE_1D_ARRAY:
8221			case TGSI_TEXTURE_SHADOW1D_ARRAY:
8222				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
8223				alu.op = ALU_OP2_ADD_INT;
8224				alu.src[0].sel = src_gpr;
8225				alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
8226				alu.src[1].value = ctx->literals[4 * off[0].Index + off[0].SwizzleX];
8227				alu.dst.sel = src_gpr;
8228				alu.dst.write = 1;
8229				alu.last = 1;
8230				r = r600_bytecode_add_alu(ctx->bc, &alu);
8231				if (r)
8232					return r;
8233				break;
8234				/* texture offsets do not apply to other texture targets */
8235			}
8236		} else {
8237			switch (inst->Texture.Texture) {
8238			case TGSI_TEXTURE_3D:
8239				offset_z = ctx->literals[4 * inst->TexOffsets[0].Index + inst->TexOffsets[0].SwizzleZ] << 1;
8240				FALLTHROUGH;
8241			case TGSI_TEXTURE_2D:
8242			case TGSI_TEXTURE_SHADOW2D:
8243			case TGSI_TEXTURE_RECT:
8244			case TGSI_TEXTURE_SHADOWRECT:
8245			case TGSI_TEXTURE_2D_ARRAY:
8246			case TGSI_TEXTURE_SHADOW2D_ARRAY:
8247				offset_y = ctx->literals[4 * inst->TexOffsets[0].Index + inst->TexOffsets[0].SwizzleY] << 1;
8248				FALLTHROUGH;
8249			case TGSI_TEXTURE_1D:
8250			case TGSI_TEXTURE_SHADOW1D:
8251			case TGSI_TEXTURE_1D_ARRAY:
8252			case TGSI_TEXTURE_SHADOW1D_ARRAY:
8253				offset_x = ctx->literals[4 * inst->TexOffsets[0].Index + inst->TexOffsets[0].SwizzleX] << 1;
8254			}
8255		}
8256	}
8257
8258	/* Obtain the sample index for reading a compressed MSAA color texture.
8259	 * To read the FMASK, we use the ldfptr instruction, which tells us
8260	 * where the samples are stored.
8261	 * For uncompressed 8x MSAA surfaces, ldfptr should return 0x76543210,
8262	 * which is the identity mapping. Each nibble says which physical sample
8263	 * should be fetched to get that sample.
8264	 *
8265	 * Assume src.z contains the sample index. It should be modified like this:
8266	 *   src.z = (ldfptr() >> (src.z * 4)) & 0xF;
8267	 * Then fetch the texel with src.
8268	 */
8269	if (read_compressed_msaa) {
8270		unsigned sample_chan = 3;
8271		unsigned temp = r600_get_temp(ctx);
8272		assert(src_loaded);
8273
8274		/* temp.w = ldfptr() */
8275		memset(&tex, 0, sizeof(struct r600_bytecode_tex));
8276		tex.op = FETCH_OP_LD;
8277		tex.inst_mod = 1; /* to indicate this is ldfptr */
8278		tex.sampler_id = tgsi_tex_get_src_gpr(ctx, sampler_src_reg);
8279		tex.sampler_index_mode = sampler_index_mode;
8280		tex.resource_id = tex.sampler_id + R600_MAX_CONST_BUFFERS;
8281		tex.resource_index_mode = sampler_index_mode;
8282		tex.src_gpr = src_gpr;
8283		tex.dst_gpr = temp;
8284		tex.dst_sel_x = 7; /* mask out these components */
8285		tex.dst_sel_y = 7;
8286		tex.dst_sel_z = 7;
8287		tex.dst_sel_w = 0; /* store X */
8288		tex.src_sel_x = 0;
8289		tex.src_sel_y = 1;
8290		tex.src_sel_z = 2;
8291		tex.src_sel_w = 3;
8292		tex.offset_x = offset_x;
8293		tex.offset_y = offset_y;
8294		tex.offset_z = offset_z;
8295		r = r600_bytecode_add_tex(ctx->bc, &tex);
8296		if (r)
8297			return r;
8298
8299		/* temp.x = sample_index*4 */
8300		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
8301		alu.op = ALU_OP2_MULLO_INT;
8302		alu.src[0].sel = src_gpr;
8303		alu.src[0].chan = sample_chan;
8304		alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
8305		alu.src[1].value = 4;
8306		alu.dst.sel = temp;
8307		alu.dst.chan = 0;
8308		alu.dst.write = 1;
8309		r = emit_mul_int_op(ctx->bc, &alu);
8310		if (r)
8311			return r;
8312
8313		/* sample_index = temp.w >> temp.x */
8314		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
8315		alu.op = ALU_OP2_LSHR_INT;
8316		alu.src[0].sel = temp;
8317		alu.src[0].chan = 3;
8318		alu.src[1].sel = temp;
8319		alu.src[1].chan = 0;
8320		alu.dst.sel = src_gpr;
8321		alu.dst.chan = sample_chan;
8322		alu.dst.write = 1;
8323		alu.last = 1;
8324		r = r600_bytecode_add_alu(ctx->bc, &alu);
8325		if (r)
8326			return r;
8327
8328		/* sample_index & 0xF */
8329		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
8330		alu.op = ALU_OP2_AND_INT;
8331		alu.src[0].sel = src_gpr;
8332		alu.src[0].chan = sample_chan;
8333		alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
8334		alu.src[1].value = 0xF;
8335		alu.dst.sel = src_gpr;
8336		alu.dst.chan = sample_chan;
8337		alu.dst.write = 1;
8338		alu.last = 1;
8339		r = r600_bytecode_add_alu(ctx->bc, &alu);
8340		if (r)
8341			return r;
8342#if 0
8343		/* visualize the FMASK */
8344		for (i = 0; i < 4; i++) {
8345			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
8346			alu.op = ALU_OP1_INT_TO_FLT;
8347			alu.src[0].sel = src_gpr;
8348			alu.src[0].chan = sample_chan;
8349			alu.dst.sel = ctx->file_offset[inst->Dst[0].Register.File] + inst->Dst[0].Register.Index;
8350			alu.dst.chan = i;
8351			alu.dst.write = 1;
8352			alu.last = 1;
8353			r = r600_bytecode_add_alu(ctx->bc, &alu);
8354			if (r)
8355				return r;
8356		}
8357		return 0;
8358#endif
8359	}
8360
8361	/* does this shader want a num layers from TXQ for a cube array? */
8362	if (has_txq_cube_array_z) {
8363		int id = tgsi_tex_get_src_gpr(ctx, sampler_src_reg);
8364
8365		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
8366		alu.op = ALU_OP1_MOV;
8367
8368		alu.src[0].sel = R600_SHADER_BUFFER_INFO_SEL;
8369		if (ctx->bc->gfx_level >= EVERGREEN) {
8370			/* with eg each dword is number of cubes */
8371			alu.src[0].sel += id / 4;
8372			alu.src[0].chan = id % 4;
8373		} else {
8374			/* r600 we have them at channel 2 of the second dword */
8375			alu.src[0].sel += (id * 2) + 1;
8376			alu.src[0].chan = 2;
8377		}
8378		alu.src[0].kc_bank = R600_BUFFER_INFO_CONST_BUFFER;
8379		tgsi_dst(ctx, &inst->Dst[0], 2, &alu.dst);
8380		alu.last = 1;
8381		r = r600_bytecode_add_alu(ctx->bc, &alu);
8382		if (r)
8383			return r;
8384		/* disable writemask from texture instruction */
8385		inst->Dst[0].Register.WriteMask &= ~4;
8386	}
8387
8388	opcode = ctx->inst_info->op;
8389	if (opcode == FETCH_OP_GATHER4 &&
8390		inst->TexOffsets[0].File != TGSI_FILE_NULL &&
8391		inst->TexOffsets[0].File != TGSI_FILE_IMMEDIATE) {
8392		struct r600_bytecode_tex *t;
8393		opcode = FETCH_OP_GATHER4_O;
8394
8395		/* GATHER4_O/GATHER4_C_O use offset values loaded by
8396		   SET_TEXTURE_OFFSETS instruction. The immediate offset values
8397		   encoded in the instruction are ignored. */
8398		t = &grad_offs[n_grad_offs++];
8399		memset(t, 0, sizeof(struct r600_bytecode_tex));
8400		t->op = FETCH_OP_SET_TEXTURE_OFFSETS;
8401		t->sampler_id = tgsi_tex_get_src_gpr(ctx, sampler_src_reg);
8402		t->sampler_index_mode = sampler_index_mode;
8403		t->resource_id = t->sampler_id + R600_MAX_CONST_BUFFERS;
8404		t->resource_index_mode = sampler_index_mode;
8405
8406		t->src_gpr = ctx->file_offset[inst->TexOffsets[0].File] + inst->TexOffsets[0].Index;
8407		t->src_sel_x = inst->TexOffsets[0].SwizzleX;
8408		t->src_sel_y = inst->TexOffsets[0].SwizzleY;
8409		if (inst->Texture.Texture == TGSI_TEXTURE_2D_ARRAY ||
8410			 inst->Texture.Texture == TGSI_TEXTURE_SHADOW2D_ARRAY)
			/* make sure array index selector is 0, this is just a safety
			 * precaution because TGSI seems to emit something strange here */
8413			t->src_sel_z = 4;
8414		else
8415			t->src_sel_z = inst->TexOffsets[0].SwizzleZ;
8416
8417		t->src_sel_w = 4;
8418
8419		t->dst_sel_x = 7;
8420		t->dst_sel_y = 7;
8421		t->dst_sel_z = 7;
8422		t->dst_sel_w = 7;
8423	}
8424
8425	if (inst->Texture.Texture == TGSI_TEXTURE_SHADOW1D ||
8426	    inst->Texture.Texture == TGSI_TEXTURE_SHADOW2D ||
8427	    inst->Texture.Texture == TGSI_TEXTURE_SHADOWRECT ||
8428	    inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE ||
8429	    inst->Texture.Texture == TGSI_TEXTURE_SHADOW1D_ARRAY ||
8430	    inst->Texture.Texture == TGSI_TEXTURE_SHADOW2D_ARRAY ||
8431	    inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE_ARRAY) {
8432		switch (opcode) {
8433		case FETCH_OP_SAMPLE:
8434			opcode = FETCH_OP_SAMPLE_C;
8435			break;
8436		case FETCH_OP_SAMPLE_L:
8437			opcode = FETCH_OP_SAMPLE_C_L;
8438			break;
8439		case FETCH_OP_SAMPLE_LB:
8440			opcode = FETCH_OP_SAMPLE_C_LB;
8441			break;
8442		case FETCH_OP_SAMPLE_G:
8443			opcode = FETCH_OP_SAMPLE_C_G;
8444			break;
8445		/* Texture gather variants */
8446		case FETCH_OP_GATHER4:
8447			opcode = FETCH_OP_GATHER4_C;
8448			break;
8449		case FETCH_OP_GATHER4_O:
8450			opcode = FETCH_OP_GATHER4_C_O;
8451			break;
8452		}
8453	}
8454
8455	memset(&tex, 0, sizeof(struct r600_bytecode_tex));
8456	tex.op = opcode;
8457
8458	tex.sampler_id = tgsi_tex_get_src_gpr(ctx, sampler_src_reg);
8459	tex.sampler_index_mode = sampler_index_mode;
8460	tex.resource_id = tex.sampler_id + R600_MAX_CONST_BUFFERS;
8461	tex.resource_index_mode = sampler_index_mode;
8462	tex.src_gpr = src_gpr;
8463	tex.dst_gpr = ctx->file_offset[inst->Dst[0].Register.File] + inst->Dst[0].Register.Index;
8464
8465	if (inst->Instruction.Opcode == TGSI_OPCODE_DDX_FINE ||
8466		inst->Instruction.Opcode == TGSI_OPCODE_DDY_FINE) {
8467		tex.inst_mod = 1; /* per pixel gradient calculation instead of per 2x2 quad */
8468	}
8469
8470	if (inst->Instruction.Opcode == TGSI_OPCODE_TG4) {
8471		if (inst->Src[1].Register.File != TGSI_FILE_IMMEDIATE) {
8472			/* TGSI doesn't have a spot to put the component for
8473			 * shadowcubes, so it drops it on the floor.  Just
8474			 * assume the user wanted component 0 (it's a shadow,
8475			 * anything else would be absurd).
8476			 */
8477			assert(inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE_ARRAY);
8478			tex.inst_mod = 0;
8479		} else {
8480			int8_t texture_component_select = ctx->literals[4 * inst->Src[1].Register.Index + inst->Src[1].Register.SwizzleX];
8481			tex.inst_mod = texture_component_select;
8482		}
8483
8484		if (ctx->bc->gfx_level == CAYMAN) {
8485			tex.dst_sel_x = (inst->Dst[0].Register.WriteMask & 1) ? 0 : 7;
8486			tex.dst_sel_y = (inst->Dst[0].Register.WriteMask & 2) ? 1 : 7;
8487			tex.dst_sel_z = (inst->Dst[0].Register.WriteMask & 4) ? 2 : 7;
8488			tex.dst_sel_w = (inst->Dst[0].Register.WriteMask & 8) ? 3 : 7;
8489		} else {
8490			/* GATHER4 result order is different from TGSI TG4 */
8491			tex.dst_sel_x = (inst->Dst[0].Register.WriteMask & 1) ? 1 : 7;
8492			tex.dst_sel_y = (inst->Dst[0].Register.WriteMask & 2) ? 2 : 7;
8493			tex.dst_sel_z = (inst->Dst[0].Register.WriteMask & 4) ? 0 : 7;
8494			tex.dst_sel_w = (inst->Dst[0].Register.WriteMask & 8) ? 3 : 7;
8495		}
8496	}
8497	else if (inst->Instruction.Opcode == TGSI_OPCODE_LODQ) {
8498		tex.dst_sel_x = (inst->Dst[0].Register.WriteMask & 2) ? 1 : 7;
8499		tex.dst_sel_y = (inst->Dst[0].Register.WriteMask & 1) ? 0 : 7;
8500		tex.dst_sel_z = 7;
8501		tex.dst_sel_w = 7;
8502	}
8503	else if (inst->Instruction.Opcode == TGSI_OPCODE_TXQS) {
8504		tex.dst_sel_x = 3;
8505		tex.dst_sel_y = 7;
8506		tex.dst_sel_z = 7;
8507		tex.dst_sel_w = 7;
8508	}
8509	else {
8510		tex.dst_sel_x = (inst->Dst[0].Register.WriteMask & 1) ? 0 : 7;
8511		tex.dst_sel_y = (inst->Dst[0].Register.WriteMask & 2) ? 1 : 7;
8512		tex.dst_sel_z = (inst->Dst[0].Register.WriteMask & 4) ? 2 : 7;
8513		tex.dst_sel_w = (inst->Dst[0].Register.WriteMask & 8) ? 3 : 7;
8514	}
8515
8516
8517	if (inst->Instruction.Opcode == TGSI_OPCODE_TXQS) {
8518		tex.src_sel_x = 4;
8519		tex.src_sel_y = 4;
8520		tex.src_sel_z = 4;
8521		tex.src_sel_w = 4;
8522	} else if (src_loaded) {
8523		tex.src_sel_x = 0;
8524		tex.src_sel_y = 1;
8525		tex.src_sel_z = 2;
8526		tex.src_sel_w = 3;
8527	} else {
8528		tex.src_sel_x = ctx->src[0].swizzle[0];
8529		tex.src_sel_y = ctx->src[0].swizzle[1];
8530		tex.src_sel_z = ctx->src[0].swizzle[2];
8531		tex.src_sel_w = ctx->src[0].swizzle[3];
8532		tex.src_rel = ctx->src[0].rel;
8533	}
8534
8535	if (inst->Texture.Texture == TGSI_TEXTURE_CUBE ||
8536	    inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE ||
8537	    inst->Texture.Texture == TGSI_TEXTURE_CUBE_ARRAY ||
8538	    inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE_ARRAY) {
8539		tex.src_sel_x = 1;
8540		tex.src_sel_y = 0;
8541		tex.src_sel_z = 3;
8542		tex.src_sel_w = 2; /* route Z compare or Lod value into W */
8543	}
8544
8545	if (inst->Texture.Texture != TGSI_TEXTURE_RECT &&
8546	    inst->Texture.Texture != TGSI_TEXTURE_SHADOWRECT) {
8547		tex.coord_type_x = 1;
8548		tex.coord_type_y = 1;
8549	}
8550	tex.coord_type_z = 1;
8551	tex.coord_type_w = 1;
8552
8553	tex.offset_x = offset_x;
8554	tex.offset_y = offset_y;
8555	if (inst->Instruction.Opcode == TGSI_OPCODE_TG4 &&
8556		(inst->Texture.Texture == TGSI_TEXTURE_2D_ARRAY ||
8557		 inst->Texture.Texture == TGSI_TEXTURE_SHADOW2D_ARRAY)) {
8558		tex.offset_z = 0;
8559	}
8560	else {
8561		tex.offset_z = offset_z;
8562	}
8563
8564	/* Put the depth for comparison in W.
8565	 * TGSI_TEXTURE_SHADOW2D_ARRAY already has the depth in W.
8566	 * Some instructions expect the depth in Z. */
8567	if ((inst->Texture.Texture == TGSI_TEXTURE_SHADOW1D ||
8568	     inst->Texture.Texture == TGSI_TEXTURE_SHADOW2D ||
8569	     inst->Texture.Texture == TGSI_TEXTURE_SHADOWRECT ||
8570	     inst->Texture.Texture == TGSI_TEXTURE_SHADOW1D_ARRAY) &&
8571	    opcode != FETCH_OP_SAMPLE_C_L &&
8572	    opcode != FETCH_OP_SAMPLE_C_LB) {
8573		tex.src_sel_w = tex.src_sel_z;
8574	}
8575
8576	if (inst->Texture.Texture == TGSI_TEXTURE_1D_ARRAY ||
8577	    inst->Texture.Texture == TGSI_TEXTURE_SHADOW1D_ARRAY) {
8578		if (opcode == FETCH_OP_SAMPLE_C_L ||
8579		    opcode == FETCH_OP_SAMPLE_C_LB) {
8580			/* the array index is read from Y */
8581			tex.coord_type_y = 0;
8582			array_index_offset_channel = tex.src_sel_y;
8583		} else {
8584			/* the array index is read from Z */
8585			tex.coord_type_z = 0;
8586			tex.src_sel_z = tex.src_sel_y;
8587			array_index_offset_channel = tex.src_sel_z;
8588		}
8589	} else if (inst->Texture.Texture == TGSI_TEXTURE_2D_ARRAY ||
8590		    inst->Texture.Texture == TGSI_TEXTURE_SHADOW2D_ARRAY) {
8591		tex.coord_type_z = 0;
8592		array_index_offset_channel = tex.src_sel_z;
8593	} else if  ((inst->Texture.Texture == TGSI_TEXTURE_CUBE_ARRAY ||
8594		    inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE_ARRAY) &&
8595		    (ctx->bc->gfx_level >= EVERGREEN))
8596		/* the array index is read from Z, coordinate will be corrected elsewhere  */
8597		tex.coord_type_z = 0;
8598
8599	/* We have array access to 1D or 2D ARRAY, the coordinates are not int ->
8600	 * evaluate the array index  */
8601	if (array_index_offset_channel >= 0 &&
8602		 opcode != FETCH_OP_LD &&
8603		 opcode != FETCH_OP_GET_TEXTURE_RESINFO) {
8604		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
8605		alu.src[0].sel =  tex.src_gpr;
8606		alu.src[0].chan =  array_index_offset_channel;
8607		alu.src[0].rel = tex.src_rel;
8608		alu.op = ALU_OP1_RNDNE;
8609		alu.dst.sel = tex.src_gpr;
8610		alu.dst.chan = array_index_offset_channel;
8611		alu.dst.rel = tex.src_rel;
8612		alu.dst.write = 1;
8613		alu.last = 1;
8614		r = r600_bytecode_add_alu(ctx->bc, &alu);
8615		if (r)
8616			return r;
8617	}
8618
8619	/* mask unused source components */
8620	if (opcode == FETCH_OP_SAMPLE || opcode == FETCH_OP_GATHER4) {
8621		switch (inst->Texture.Texture) {
8622		case TGSI_TEXTURE_2D:
8623		case TGSI_TEXTURE_RECT:
8624			tex.src_sel_z = 7;
8625			tex.src_sel_w = 7;
8626			break;
8627		case TGSI_TEXTURE_1D_ARRAY:
8628			tex.src_sel_y = 7;
8629			tex.src_sel_w = 7;
8630			break;
8631		case TGSI_TEXTURE_1D:
8632			tex.src_sel_y = 7;
8633			tex.src_sel_z = 7;
8634			tex.src_sel_w = 7;
8635			break;
8636		}
8637	}
8638
8639	/* Emit set gradient and offset instructions. */
8640	for (i = 0; i < n_grad_offs; ++i) {
8641		r = r600_bytecode_add_tex(ctx->bc, &grad_offs[i]);
8642		if (r)
8643			return r;
8644	}
8645
8646	r = r600_bytecode_add_tex(ctx->bc, &tex);
8647	if (r)
8648		return r;
8649
8650	/* add shadow ambient support  - gallium doesn't do it yet */
8651	return 0;
8652}
8653
8654static int find_hw_atomic_counter(struct r600_shader_ctx *ctx,
8655				  struct tgsi_full_src_register *src)
8656{
8657	unsigned i;
8658
8659	uint32_t index = src->Register.Index;
8660	for (i = 0; i < ctx->shader->nhwatomic_ranges; i++) {
8661		if (ctx->shader->atomics[i].buffer_id != (unsigned)src->Dimension.Index)
8662			continue;
8663		if (index > ctx->shader->atomics[i].end)
8664			continue;
8665		if (index < ctx->shader->atomics[i].start)
8666			continue;
8667		uint32_t offset = (index - ctx->shader->atomics[i].start);
8668		return ctx->shader->atomics[i].hw_idx + offset;
8669	}
8670	assert(0);
8671	return -1;
8672}
8673
/* Resolve the GDS UAV id for the instruction's Src[0] atomic counter and,
 * where the hardware needs it, load the counter's byte offset into
 * temp_reg.x.
 *
 * Cayman addresses the counter through a GPR: temp_reg.x ends up holding
 * uav_id * 4 (plus the indirect ADDR value << 2 when Src[0] is indirect).
 * Pre-Cayman (Evergreen) encodes uav_id in the instruction itself and
 * handles indirection via index mode 2.
 *
 * Returns 0 on success (or the error from bytecode emission) and writes
 * the resolved id / index mode through the out parameters.
 */
static int tgsi_set_gds_temp(struct r600_shader_ctx *ctx,
			     int *uav_id_p, int *uav_index_mode_p)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	int uav_id, uav_index_mode = 0;
	int r;
	bool is_cm = (ctx->bc->gfx_level == CAYMAN);

	uav_id = find_hw_atomic_counter(ctx, &inst->Src[0]);

	if (inst->Src[0].Register.Indirect) {
		if (is_cm) {
			/* temp_reg.x = ADDR << 2 (scale the indirect index to bytes) */
			struct r600_bytecode_alu alu;
			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
			alu.op = ALU_OP2_LSHL_INT;
			alu.src[0].sel = get_address_file_reg(ctx, inst->Src[0].Indirect.Index);
			alu.src[0].chan = 0;
			alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
			alu.src[1].value = 2;
			alu.dst.sel = ctx->temp_reg;
			alu.dst.chan = 0;
			alu.dst.write = 1;
			alu.last = 1;
			r = r600_bytecode_add_alu(ctx->bc, &alu);
			if (r)
				return r;

			/* temp_reg.x += uav_id * 4 (base byte offset of the counter) */
			r = single_alu_op2(ctx, ALU_OP2_ADD_INT,
					   ctx->temp_reg, 0,
					   ctx->temp_reg, 0,
					   V_SQ_ALU_SRC_LITERAL, uav_id * 4);
			if (r)
				return r;
		} else
			uav_index_mode = 2;
	} else if (is_cm) {
		/* direct access on Cayman: temp_reg.x = uav_id * 4 */
		r = single_alu_op2(ctx, ALU_OP1_MOV,
				   ctx->temp_reg, 0,
				   V_SQ_ALU_SRC_LITERAL, uav_id * 4,
				   0, 0);
		if (r)
			return r;
	}
	*uav_id_p = uav_id;
	*uav_index_mode_p = uav_index_mode;
	return 0;
}
8721
8722static int tgsi_load_gds(struct r600_shader_ctx *ctx)
8723{
8724	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
8725	int r;
8726	struct r600_bytecode_gds gds;
8727	int uav_id = 0;
8728	int uav_index_mode = 0;
8729	bool is_cm = (ctx->bc->gfx_level == CAYMAN);
8730
8731	r = tgsi_set_gds_temp(ctx, &uav_id, &uav_index_mode);
8732	if (r)
8733		return r;
8734
8735	memset(&gds, 0, sizeof(struct r600_bytecode_gds));
8736	gds.op = FETCH_OP_GDS_READ_RET;
8737	gds.dst_gpr = ctx->file_offset[inst->Dst[0].Register.File] + inst->Dst[0].Register.Index;
8738	gds.uav_id = is_cm ? 0 : uav_id;
8739	gds.uav_index_mode = is_cm ? 0 : uav_index_mode;
8740	gds.src_gpr = ctx->temp_reg;
8741	gds.src_sel_x = (is_cm) ? 0 : 4;
8742	gds.src_sel_y = 4;
8743	gds.src_sel_z = 4;
8744	gds.dst_sel_x = 0;
8745	gds.dst_sel_y = 7;
8746	gds.dst_sel_z = 7;
8747	gds.dst_sel_w = 7;
8748	gds.src_gpr2 = 0;
8749	gds.alloc_consume = !is_cm;
8750	r = r600_bytecode_add_gds(ctx->bc, &gds);
8751	if (r)
8752		return r;
8753
8754	ctx->bc->cf_last->vpm = 1;
8755	return 0;
8756}
8757
8758/* this fixes up 1D arrays properly */
8759static int load_index_src(struct r600_shader_ctx *ctx, int src_index, int *idx_gpr)
8760{
8761	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
8762	int r, i;
8763	struct r600_bytecode_alu alu;
8764	int temp_reg = r600_get_temp(ctx);
8765
8766	for (i = 0; i < 4; i++) {
8767		bool def_val = true, write_zero = false;
8768		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
8769		alu.op = ALU_OP1_MOV;
8770		alu.dst.sel = temp_reg;
8771		alu.dst.chan = i;
8772
8773		switch (inst->Memory.Texture) {
8774		case TGSI_TEXTURE_BUFFER:
8775		case TGSI_TEXTURE_1D:
8776			if (i == 1 || i == 2 || i == 3) {
8777				write_zero = true;
8778			}
8779			break;
8780		case TGSI_TEXTURE_1D_ARRAY:
8781			if (i == 1 || i == 3)
8782				write_zero = true;
8783			else if (i == 2) {
8784				r600_bytecode_src(&alu.src[0], &ctx->src[src_index], 1);
8785				def_val = false;
8786			}
8787			break;
8788		case TGSI_TEXTURE_2D:
8789			if (i == 2 || i == 3)
8790				write_zero = true;
8791			break;
8792		default:
8793			if (i == 3)
8794				write_zero = true;
8795			break;
8796		}
8797
8798		if (write_zero) {
8799			alu.src[0].sel = V_SQ_ALU_SRC_LITERAL;
8800			alu.src[0].value = 0;
8801		} else if (def_val) {
8802			r600_bytecode_src(&alu.src[0], &ctx->src[src_index], i);
8803		}
8804
8805		if (i == 3)
8806			alu.last = 1;
8807		alu.dst.write = 1;
8808		r = r600_bytecode_add_alu(ctx->bc, &alu);
8809		if (r)
8810			return r;
8811	}
8812	*idx_gpr = temp_reg;
8813	return 0;
8814}
8815
8816static int load_buffer_coord(struct r600_shader_ctx *ctx, int src_idx,
8817			     int temp_reg)
8818{
8819	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
8820	int r;
8821	if (inst->Src[src_idx].Register.File == TGSI_FILE_IMMEDIATE) {
8822		int value = (ctx->literals[4 * inst->Src[src_idx].Register.Index + inst->Src[src_idx].Register.SwizzleX]);
8823		r = single_alu_op2(ctx, ALU_OP1_MOV,
8824				   temp_reg, 0,
8825				   V_SQ_ALU_SRC_LITERAL, value >> 2,
8826				   0, 0);
8827		if (r)
8828			return r;
8829	} else {
8830		struct r600_bytecode_alu alu;
8831		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
8832		alu.op = ALU_OP2_LSHR_INT;
8833		r600_bytecode_src(&alu.src[0], &ctx->src[src_idx], 0);
8834		alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
8835		alu.src[1].value = 2;
8836		alu.dst.sel = temp_reg;
8837		alu.dst.write = 1;
8838		alu.last = 1;
8839		r = r600_bytecode_add_alu(ctx->bc, &alu);
8840		if (r)
8841			return r;
8842	}
8843	return 0;
8844}
8845
8846/* ADDR[1,2] are stored in index_reg[0,1] on EG, and can be used for indexing
8847 * images and ssbos.  We assume that indirects are indexed by ADDR[2], as that's
8848 * what GLSL-to-TGSI emitted.
8849 */
8850static unsigned tgsi_indirect_to_rat_index_mode(struct tgsi_ind_register ind)
8851{
8852	if (ind.File == TGSI_FILE_NULL)
8853		return 0; /* CF_INDEX_NONE */
8854	else {
8855		assert(ind.Index == 2);
8856		return 2; /* CF_INDEX_1 */
8857	}
8858}
8859
8860static int tgsi_load_buffer(struct r600_shader_ctx *ctx)
8861{
8862	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
8863	/* have to work out the offset into the RAT immediate return buffer */
8864	struct r600_bytecode_vtx vtx;
8865	struct r600_bytecode_cf *cf;
8866	int r;
8867	int temp_reg = r600_get_temp(ctx);
8868	unsigned rat_index_mode = tgsi_indirect_to_rat_index_mode(inst->Src[0].Indirect);
8869	unsigned base;
8870
8871	base = R600_IMAGE_REAL_RESOURCE_OFFSET + ctx->info.file_count[TGSI_FILE_IMAGE];
8872
8873	r = load_buffer_coord(ctx, 1, temp_reg);
8874	if (r)
8875		return r;
8876	ctx->bc->cf_last->barrier = 1;
8877	memset(&vtx, 0, sizeof(struct r600_bytecode_vtx));
8878	vtx.op = FETCH_OP_VFETCH;
8879	vtx.buffer_id = inst->Src[0].Register.Index + base;
8880	vtx.buffer_index_mode = rat_index_mode;
8881	vtx.fetch_type = SQ_VTX_FETCH_NO_INDEX_OFFSET;
8882	vtx.src_gpr = temp_reg;
8883	vtx.src_sel_x = 0;
8884	vtx.dst_gpr = ctx->file_offset[inst->Dst[0].Register.File] + inst->Dst[0].Register.Index;
8885	vtx.dst_sel_x = (inst->Dst[0].Register.WriteMask & 1) ? 0 : 7;		/* SEL_X */
8886	vtx.dst_sel_y = (inst->Dst[0].Register.WriteMask & 2) ? 1 : 7;		/* SEL_Y */
8887	vtx.dst_sel_z = (inst->Dst[0].Register.WriteMask & 4) ? 2 : 7;		/* SEL_Z */
8888	vtx.dst_sel_w = (inst->Dst[0].Register.WriteMask & 8) ? 3 : 7;		/* SEL_W */
8889	vtx.num_format_all = 1;
8890	vtx.format_comp_all = 1;
8891	vtx.srf_mode_all = 0;
8892
8893	if (inst->Dst[0].Register.WriteMask & 8) {
8894		vtx.data_format = FMT_32_32_32_32;
8895		vtx.use_const_fields = 0;
8896	} else if (inst->Dst[0].Register.WriteMask & 4) {
8897		vtx.data_format = FMT_32_32_32;
8898		vtx.use_const_fields = 0;
8899	} else if (inst->Dst[0].Register.WriteMask & 2) {
8900		vtx.data_format = FMT_32_32;
8901		vtx.use_const_fields = 0;
8902	} else {
8903		vtx.data_format = FMT_32;
8904		vtx.use_const_fields = 0;
8905	}
8906
8907	r = r600_bytecode_add_vtx_tc(ctx->bc, &vtx);
8908	if (r)
8909		return r;
8910	cf = ctx->bc->cf_last;
8911	cf->barrier = 1;
8912	return 0;
8913}
8914
/* Emit a LOAD from an image (RAT).
 *
 * The sequence is: normalize the coordinate (load_index_src), issue a
 * MEM_RAT NOP_RTN which makes the RAT write the texel into the immediate
 * return buffer, wait for its ack, then VFETCH the value back out of the
 * return buffer into the destination GPR using the image format's
 * swizzle/format description.
 */
static int tgsi_load_rat(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	/* have to work out the offset into the RAT immediate return buffer */
	struct r600_bytecode_vtx vtx;
	struct r600_bytecode_cf *cf;
	int r;
	int idx_gpr;
	unsigned format, num_format, format_comp, endian;
	const struct util_format_description *desc;
	unsigned rat_index_mode = tgsi_indirect_to_rat_index_mode(inst->Src[0].Indirect);
	unsigned immed_base;

	immed_base = R600_IMAGE_IMMED_RESOURCE_OFFSET;
	/* normalized coordinate (1D-array layer fixup etc.) into idx_gpr */
	r = load_index_src(ctx, 1, &idx_gpr);
	if (r)
		return r;

	if (rat_index_mode)
		egcm_load_index_reg(ctx->bc, 1, false);

	/* RAT read: result lands in the immediate return buffer */
	r600_bytecode_add_cfinst(ctx->bc, CF_OP_MEM_RAT);
	cf = ctx->bc->cf_last;

	cf->rat.id = ctx->shader->rat_base + inst->Src[0].Register.Index;
	cf->rat.inst = V_RAT_INST_NOP_RTN;
	cf->rat.index_mode = rat_index_mode;
	cf->output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_READ_IND;
	cf->output.gpr = ctx->thread_id_gpr;
	cf->output.index_gpr = idx_gpr;
	cf->output.comp_mask = 0xf;
	cf->output.burst_count = 1;
	cf->vpm = 1;
	cf->barrier = 1;
	cf->mark = 1;
	cf->output.elem_size = 0;

	/* the VFETCH below must not start before the RAT read completed */
	r600_bytecode_add_ack(ctx->bc);
	r600_bytecode_wait_acks(ctx->bc);

	/* fetch the texel back from the return buffer, applying the image
	 * format's component swizzle and numeric format */
	desc = util_format_description(inst->Memory.Format);
	r600_vertex_data_type(inst->Memory.Format,
			      &format, &num_format, &format_comp, &endian);
	memset(&vtx, 0, sizeof(struct r600_bytecode_vtx));
	vtx.op = FETCH_OP_VFETCH;
	vtx.buffer_id = immed_base + inst->Src[0].Register.Index;
	vtx.buffer_index_mode = rat_index_mode;
	vtx.fetch_type = SQ_VTX_FETCH_NO_INDEX_OFFSET;
	vtx.src_gpr = ctx->thread_id_gpr;
	vtx.src_sel_x = 1;
	vtx.dst_gpr = ctx->file_offset[inst->Dst[0].Register.File] + inst->Dst[0].Register.Index;
	vtx.dst_sel_x = desc->swizzle[0];
	vtx.dst_sel_y = desc->swizzle[1];
	vtx.dst_sel_z = desc->swizzle[2];
	vtx.dst_sel_w = desc->swizzle[3];
	vtx.srf_mode_all = 1;
	vtx.data_format = format;
	vtx.num_format_all = num_format;
	vtx.format_comp_all = format_comp;
	vtx.endian = endian;
	vtx.offset = 0;
	vtx.mega_fetch_count = 3;
	r = r600_bytecode_add_vtx_tc(ctx->bc, &vtx);
	if (r)
		return r;
	cf = ctx->bc->cf_last;
	cf->barrier = 1;
	return 0;
}
8984
8985static int tgsi_load_lds(struct r600_shader_ctx *ctx)
8986{
8987	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
8988	struct r600_bytecode_alu alu;
8989	int r;
8990	int temp_reg = r600_get_temp(ctx);
8991
8992	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
8993	alu.op = ALU_OP1_MOV;
8994	r600_bytecode_src(&alu.src[0], &ctx->src[1], 0);
8995	alu.dst.sel = temp_reg;
8996	alu.dst.write = 1;
8997	alu.last = 1;
8998	r = r600_bytecode_add_alu(ctx->bc, &alu);
8999	if (r)
9000		return r;
9001
9002	r = do_lds_fetch_values(ctx, temp_reg,
9003				ctx->file_offset[inst->Dst[0].Register.File] + inst->Dst[0].Register.Index, inst->Dst[0].Register.WriteMask);
9004	if (r)
9005		return r;
9006	return 0;
9007}
9008
9009static int tgsi_load(struct r600_shader_ctx *ctx)
9010{
9011	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
9012	if (inst->Src[0].Register.File == TGSI_FILE_IMAGE)
9013		return tgsi_load_rat(ctx);
9014	if (inst->Src[0].Register.File == TGSI_FILE_HW_ATOMIC)
9015		return tgsi_load_gds(ctx);
9016	if (inst->Src[0].Register.File == TGSI_FILE_BUFFER)
9017		return tgsi_load_buffer(ctx);
9018	if (inst->Src[0].Register.File == TGSI_FILE_MEMORY)
9019		return tgsi_load_lds(ctx);
9020	return 0;
9021}
9022
/* Emit a STORE to an SSBO through the RAT path.
 *
 * Each enabled destination channel is written with its own MEM_RAT
 * STORE_TYPED: the dword index (base coord + channel offset) goes into
 * temp_reg.x, the value into ctx->temp_reg.x.  Only the last emitted
 * store requests an ack.
 */
static int tgsi_store_buffer_rat(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	struct r600_bytecode_cf *cf;
	int r, i;
	unsigned rat_index_mode = tgsi_indirect_to_rat_index_mode(inst->Dst[0].Indirect);
	int lasti;
	int temp_reg = r600_get_temp(ctx), treg2 = r600_get_temp(ctx);

	/* treg2.x = dword index of the first element */
	r = load_buffer_coord(ctx, 0, treg2);
	if (r)
		return r;

	if (rat_index_mode)
		egcm_load_index_reg(ctx->bc, 1, false);

	/* zero all four channels of temp_reg (the index GPR) */
	for (i = 0; i <= 3; i++) {
		struct r600_bytecode_alu alu;
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP1_MOV;
		alu.dst.sel = temp_reg;
		alu.dst.chan = i;
		alu.src[0].sel = V_SQ_ALU_SRC_0;
		alu.last = (i == 3);
		alu.dst.write = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}

	cf = NULL;
	lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
	for (i = 0; i <= lasti; i++) {
		struct r600_bytecode_alu alu;
		if (!((1 << i) & inst->Dst[0].Register.WriteMask))
			continue;

		/* temp_reg.x = treg2.x + i (index of this channel's dword) */
		r = single_alu_op2(ctx, ALU_OP2_ADD_INT,
				   temp_reg, 0,
				   treg2, 0,
				   V_SQ_ALU_SRC_LITERAL, i);
		if (r)
			return r;

		/* ctx->temp_reg.x = the value to store */
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP1_MOV;
		alu.dst.sel = ctx->temp_reg;
		alu.dst.chan = 0;

		r600_bytecode_src(&alu.src[0], &ctx->src[1], i);
		alu.last = 1;
		alu.dst.write = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;

		/* one-dword typed store to the RAT */
		r600_bytecode_add_cfinst(ctx->bc, CF_OP_MEM_RAT);
		cf = ctx->bc->cf_last;

		/* SSBO RATs come after the image RATs */
		cf->rat.id = ctx->shader->rat_base + inst->Dst[0].Register.Index + ctx->info.file_count[TGSI_FILE_IMAGE];
		cf->rat.inst = V_RAT_INST_STORE_TYPED;
		cf->rat.index_mode = rat_index_mode;
		cf->output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_WRITE_IND;
		cf->output.gpr = ctx->temp_reg;
		cf->output.index_gpr = temp_reg;
		cf->output.comp_mask = 1;
		cf->output.burst_count = 1;
		cf->vpm = 1;
		cf->barrier = 1;
		cf->output.elem_size = 0;
	}

	/* Request an ack from the last write emitted. */
	if (cf) {
		cf->mark = true;
		cf->output.type = r600_bytecode_write_export_ack_type(ctx->bc, true);
		r600_bytecode_add_ack(ctx->bc);
	}

	return 0;
}
9104
/* Emit a TGSI STORE to an image RAT.
 *
 * src[0] supplies the image coordinate (loaded into idx_gpr), src[1] the
 * value.  All four value components are written with a single
 * CF_OP_MEM_RAT STORE_TYPED; if the value does not already live in a
 * temporary GPR it is first copied into ctx->temp_reg.  The store is
 * marked and acked so it can be fenced later.  Returns 0 on success.
 */
static int tgsi_store_rat(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	struct r600_bytecode_cf *cf;
	bool src_requires_loading = false;
	int val_gpr, idx_gpr;
	int r, i;
	unsigned rat_index_mode = tgsi_indirect_to_rat_index_mode(inst->Dst[0].Indirect);

	r = load_index_src(ctx, 0, &idx_gpr);
	if (r)
		return r;

	/* Only TEMP sources are guaranteed to sit in a GPR we can export
	 * from directly; everything else is staged through ctx->temp_reg. */
	if (inst->Src[1].Register.File != TGSI_FILE_TEMPORARY)
		src_requires_loading = true;

	if (src_requires_loading) {
		struct r600_bytecode_alu alu;
		for (i = 0; i < 4; i++) {
			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
			alu.op = ALU_OP1_MOV;
			alu.dst.sel = ctx->temp_reg;
			alu.dst.chan = i;

			r600_bytecode_src(&alu.src[0], &ctx->src[1], i);
			if (i == 3)
				alu.last = 1;
			alu.dst.write = 1;
			r = r600_bytecode_add_alu(ctx->bc, &alu);
			if (r)
				return r;
		}
		val_gpr = ctx->temp_reg;
	} else
		val_gpr = tgsi_tex_get_src_gpr(ctx, 1);
	if (rat_index_mode)
		egcm_load_index_reg(ctx->bc, 1, false);

	r600_bytecode_add_cfinst(ctx->bc, CF_OP_MEM_RAT);
	cf = ctx->bc->cf_last;

	cf->rat.id = ctx->shader->rat_base + inst->Dst[0].Register.Index;
	cf->rat.inst = V_RAT_INST_STORE_TYPED;
	cf->rat.index_mode = rat_index_mode;
	cf->output.type = r600_bytecode_write_export_ack_type(ctx->bc, true);
	cf->output.gpr = val_gpr;
	cf->output.index_gpr = idx_gpr;
	cf->output.comp_mask = 0xf;
	cf->output.burst_count = 1;
	cf->vpm = 1;
	cf->barrier = 1;
	cf->output.elem_size = 0;
	cf->mark = 1;

	r600_bytecode_add_ack(ctx->bc);

	return 0;
}
9163
/* Emit a TGSI STORE to shared memory (LDS).
 *
 * src[0].x holds the base byte address, src[1] the value.  First the
 * per-component addresses are materialized in temp_reg (chan 0 = base,
 * chan i = base + 4*i), then each enabled component is written with
 * LDS_WRITE.  Two consecutive enabled components (xy or zw) are fused
 * into a single LDS_WRITE_REL that writes both dwords (lds_idx = 1
 * addresses the second one).  Returns 0 on success.
 */
static int tgsi_store_lds(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	struct r600_bytecode_alu alu;
	int r, i, lasti;
	int write_mask = inst->Dst[0].Register.WriteMask;
	int temp_reg = r600_get_temp(ctx);

	/* LDS write */
	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
	alu.op = ALU_OP1_MOV;
	r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
	alu.dst.sel = temp_reg;
	alu.dst.write = 1;
	alu.last = 1;
	r = r600_bytecode_add_alu(ctx->bc, &alu);
	if (r)
		return r;

	/* temp_reg.chan[i] = base address + 4 bytes per component */
	lasti = tgsi_last_instruction(write_mask);
	for (i = 1; i <= lasti; i++) {
		if (!(write_mask & (1 << i)))
			continue;
		r = single_alu_op2(ctx, ALU_OP2_ADD_INT,
				   temp_reg, i,
				   temp_reg, 0,
				   V_SQ_ALU_SRC_LITERAL, 4 * i);
		if (r)
			return r;
	}
	for (i = 0; i <= lasti; i++) {
		if (!(write_mask & (1 << i)))
			continue;

		/* Pair adjacent enabled components into one WRITE_REL. */
		if ((i == 0 && ((write_mask & 3) == 3)) ||
		    (i == 2 && ((write_mask & 0xc) == 0xc))) {
			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
			alu.op = LDS_OP3_LDS_WRITE_REL;

			alu.src[0].sel = temp_reg;
			alu.src[0].chan = i;
			r600_bytecode_src(&alu.src[1], &ctx->src[1], i);
			r600_bytecode_src(&alu.src[2], &ctx->src[1], i + 1);
			alu.last = 1;
			alu.is_lds_idx_op = true;
			alu.lds_idx = 1;
			r = r600_bytecode_add_alu(ctx->bc, &alu);
			if (r)
				return r;
			/* The second component of the pair is already done. */
			i += 1;
			continue;
		}
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = LDS_OP2_LDS_WRITE;

		alu.src[0].sel = temp_reg;
		alu.src[0].chan = i;
		r600_bytecode_src(&alu.src[1], &ctx->src[1], i);

		alu.last = 1;
		alu.is_lds_idx_op = true;

		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}
	return 0;
}
9232
9233static int tgsi_store(struct r600_shader_ctx *ctx)
9234{
9235	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
9236	if (inst->Dst[0].Register.File == TGSI_FILE_BUFFER)
9237		return tgsi_store_buffer_rat(ctx);
9238	else if (inst->Dst[0].Register.File == TGSI_FILE_MEMORY)
9239		return tgsi_store_lds(ctx);
9240	else
9241		return tgsi_store_rat(ctx);
9242}
9243
/* Emit an atomic operation on an image or buffer RAT.
 *
 * src[0] is the resource, src[1] the coordinate, src[2] the operand and
 * src[3] the extra operand for CMPXCHG.  The operands are staged in
 * ctx->thread_id_gpr, the atomic is issued as a CF_OP_MEM_RAT with
 * EXPORT_READ semantics, and the returned (pre-op) value is fetched back
 * from the immediate-return buffer with a VFETCH into the destination.
 * Returns 0 on success.
 */
static int tgsi_atomic_op_rat(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	/* have to work out the offset into the RAT immediate return buffer */
	struct r600_bytecode_alu alu;
	struct r600_bytecode_vtx vtx;
	struct r600_bytecode_cf *cf;
	int r;
	int idx_gpr;
	unsigned format, num_format, format_comp, endian;
	const struct util_format_description *desc;
	unsigned rat_index_mode = tgsi_indirect_to_rat_index_mode(inst->Src[0].Indirect);
	unsigned immed_base;
	unsigned rat_base;

	immed_base = R600_IMAGE_IMMED_RESOURCE_OFFSET;
	rat_base = ctx->shader->rat_base;

        if (inst->Src[0].Register.File == TGSI_FILE_BUFFER) {
		/* Buffer RATs/resources come after the image ones. */
		immed_base += ctx->info.file_count[TGSI_FILE_IMAGE];
		rat_base += ctx->info.file_count[TGSI_FILE_IMAGE];

		r = load_buffer_coord(ctx, 1, ctx->temp_reg);
		if (r)
			return r;
		idx_gpr = ctx->temp_reg;
	} else {
		r = load_index_src(ctx, 1, &idx_gpr);
		if (r)
			return r;
	}

	if (ctx->inst_info->op == V_RAT_INST_CMPXCHG_INT_RTN) {
		/* CMPXCHG takes two operands: the swap value (src[3]) in
		 * chan 0 and the compare value (src[2]) in chan 3 (chan 2 on
		 * Cayman).  NOTE(review): operand/channel layout follows the
		 * hardware expectation — confirm against the ISA doc. */
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP1_MOV;
		alu.dst.sel = ctx->thread_id_gpr;
		alu.dst.chan = 0;
		alu.dst.write = 1;
		r600_bytecode_src(&alu.src[0], &ctx->src[3], 0);
		alu.last = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;

		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP1_MOV;
		alu.dst.sel = ctx->thread_id_gpr;
		if (ctx->bc->gfx_level == CAYMAN)
			alu.dst.chan = 2;
		else
			alu.dst.chan = 3;
		alu.dst.write = 1;
		r600_bytecode_src(&alu.src[0], &ctx->src[2], 0);
		alu.last = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	} else {
		/* Single-operand atomics: operand (src[2]) goes in chan 0. */
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP1_MOV;
		alu.dst.sel = ctx->thread_id_gpr;
		alu.dst.chan = 0;
		alu.dst.write = 1;
		r600_bytecode_src(&alu.src[0], &ctx->src[2], 0);
		alu.last = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}

	if (rat_index_mode)
		egcm_load_index_reg(ctx->bc, 1, false);
	r600_bytecode_add_cfinst(ctx->bc, CF_OP_MEM_RAT);
	cf = ctx->bc->cf_last;

	cf->rat.id = rat_base + inst->Src[0].Register.Index;
	cf->rat.inst = ctx->inst_info->op;
	cf->rat.index_mode = rat_index_mode;
	cf->output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_READ_IND;
	cf->output.gpr = ctx->thread_id_gpr;
	cf->output.index_gpr = idx_gpr;
	cf->output.comp_mask = 0xf;
	cf->output.burst_count = 1;
	cf->vpm = 1;
	cf->barrier = 1;
	cf->mark = 1;
	cf->output.elem_size = 0;

	/* Wait for the atomic to land before fetching its return value. */
	r600_bytecode_add_ack(ctx->bc);
	r600_bytecode_wait_acks(ctx->bc);

	/* Read the pre-op value back from the immediate return buffer. */
	memset(&vtx, 0, sizeof(struct r600_bytecode_vtx));
	if (inst->Src[0].Register.File == TGSI_FILE_IMAGE) {
		desc = util_format_description(inst->Memory.Format);
		r600_vertex_data_type(inst->Memory.Format,
				      &format, &num_format, &format_comp, &endian);
		vtx.dst_sel_x = desc->swizzle[0];
	} else {
		format = FMT_32;
		num_format = 1;
		format_comp = 0;
		endian = 0;
		vtx.dst_sel_x = 0;
	}
	vtx.op = FETCH_OP_VFETCH;
	vtx.buffer_id = immed_base + inst->Src[0].Register.Index;
	vtx.buffer_index_mode = rat_index_mode;
	vtx.fetch_type = SQ_VTX_FETCH_NO_INDEX_OFFSET;
	vtx.src_gpr = ctx->thread_id_gpr;
	vtx.src_sel_x = 1;
	vtx.dst_gpr = ctx->file_offset[inst->Dst[0].Register.File] + inst->Dst[0].Register.Index;
	vtx.dst_sel_y = 7;
	vtx.dst_sel_z = 7;
	vtx.dst_sel_w = 7;
	vtx.use_const_fields = 0;
	vtx.srf_mode_all = 1;
	vtx.data_format = format;
	vtx.num_format_all = num_format;
	vtx.format_comp_all = format_comp;
	vtx.endian = endian;
	vtx.offset = 0;
	vtx.mega_fetch_count = 0xf;
	r = r600_bytecode_add_vtx_tc(ctx->bc, &vtx);
	if (r)
		return r;
	cf = ctx->bc->cf_last;
	cf->vpm = 1;
	cf->barrier = 1;
	return 0;
}
9374
9375static int get_gds_op(int opcode)
9376{
9377	switch (opcode) {
9378	case TGSI_OPCODE_ATOMUADD:
9379		return FETCH_OP_GDS_ADD_RET;
9380	case TGSI_OPCODE_ATOMAND:
9381		return FETCH_OP_GDS_AND_RET;
9382	case TGSI_OPCODE_ATOMOR:
9383		return FETCH_OP_GDS_OR_RET;
9384	case TGSI_OPCODE_ATOMXOR:
9385		return FETCH_OP_GDS_XOR_RET;
9386	case TGSI_OPCODE_ATOMUMIN:
9387		return FETCH_OP_GDS_MIN_UINT_RET;
9388	case TGSI_OPCODE_ATOMUMAX:
9389		return FETCH_OP_GDS_MAX_UINT_RET;
9390        case TGSI_OPCODE_ATOMIMIN:
9391		return FETCH_OP_GDS_MIN_INT_RET;
9392	case TGSI_OPCODE_ATOMIMAX:
9393		return FETCH_OP_GDS_MAX_INT_RET;
9394	case TGSI_OPCODE_ATOMXCHG:
9395		return FETCH_OP_GDS_XCHG_RET;
9396	case TGSI_OPCODE_ATOMCAS:
9397		return FETCH_OP_GDS_CMP_XCHG_RET;
9398	default:
9399		return -1;
9400	}
9401}
9402
/* Emit an atomic on a HW atomic counter via GDS.
 *
 * The operand (src[2], plus src[3] for CMP_XCHG) is staged in
 * ctx->temp_reg and the GDS instruction returns the pre-op value into the
 * destination.  Channel/selector layout differs between Cayman and the
 * other evergreen parts (Cayman has no uav id/index words, so sources
 * shift down by one channel).  Returns 0 on success, -1 for an unknown
 * opcode.
 */
static int tgsi_atomic_op_gds(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	struct r600_bytecode_gds gds;
	struct r600_bytecode_alu alu;
	int gds_op = get_gds_op(inst->Instruction.Opcode);
	int r;
	int uav_id = 0;
	int uav_index_mode = 0;
	bool is_cm = (ctx->bc->gfx_level == CAYMAN);

	if (gds_op == -1) {
		fprintf(stderr, "unknown GDS op for opcode %d\n", inst->Instruction.Opcode);
		return -1;
	}

	r = tgsi_set_gds_temp(ctx, &uav_id, &uav_index_mode);
	if (r)
		return r;

	/* CMP_XCHG needs the compare value staged as a second operand. */
	if (gds_op == FETCH_OP_GDS_CMP_XCHG_RET) {
		if (inst->Src[3].Register.File == TGSI_FILE_IMMEDIATE) {
			int value = (ctx->literals[4 * inst->Src[3].Register.Index + inst->Src[3].Register.SwizzleX]);
			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
			alu.op = ALU_OP1_MOV;
			alu.dst.sel = ctx->temp_reg;
			alu.dst.chan = is_cm ? 2 : 1;
			alu.src[0].sel = V_SQ_ALU_SRC_LITERAL;
			alu.src[0].value = value;
			alu.last = 1;
			alu.dst.write = 1;
			r = r600_bytecode_add_alu(ctx->bc, &alu);
			if (r)
				return r;
		} else {
			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
			alu.op = ALU_OP1_MOV;
			alu.dst.sel = ctx->temp_reg;
			alu.dst.chan = is_cm ? 2 : 1;
			r600_bytecode_src(&alu.src[0], &ctx->src[3], 0);
			alu.last = 1;
			alu.dst.write = 1;
			r = r600_bytecode_add_alu(ctx->bc, &alu);
			if (r)
				return r;
		}
	}
	if (inst->Src[2].Register.File == TGSI_FILE_IMMEDIATE) {
		int value = (ctx->literals[4 * inst->Src[2].Register.Index + inst->Src[2].Register.SwizzleX]);
		int abs_value = abs(value);
		/* A negative immediate add is rewritten as a SUB of the
		 * absolute value.  NOTE(review): presumably because GDS_ADD
		 * treats the operand as unsigned — confirm against the ISA. */
		if (abs_value != value && gds_op == FETCH_OP_GDS_ADD_RET)
			gds_op = FETCH_OP_GDS_SUB_RET;
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP1_MOV;
		alu.dst.sel = ctx->temp_reg;
		alu.dst.chan = is_cm ? 1 : 0;
		alu.src[0].sel = V_SQ_ALU_SRC_LITERAL;
		alu.src[0].value = abs_value;
		alu.last = 1;
		alu.dst.write = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	} else {
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP1_MOV;
		alu.dst.sel = ctx->temp_reg;
		alu.dst.chan = is_cm ? 1 : 0;
		r600_bytecode_src(&alu.src[0], &ctx->src[2], 0);
		alu.last = 1;
		alu.dst.write = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}


	memset(&gds, 0, sizeof(struct r600_bytecode_gds));
	gds.op = gds_op;
	gds.dst_gpr = ctx->file_offset[inst->Dst[0].Register.File] + inst->Dst[0].Register.Index;
	gds.uav_id = is_cm ? 0 : uav_id;
	gds.uav_index_mode = is_cm ? 0 : uav_index_mode;
	gds.src_gpr = ctx->temp_reg;
	gds.src_gpr2 = 0;
	/* On non-Cayman, src_sel_x = 4 selects the uav id word. */
	gds.src_sel_x = is_cm ? 0 : 4;
	gds.src_sel_y = is_cm ? 1 : 0;
	if (gds_op == FETCH_OP_GDS_CMP_XCHG_RET)
		gds.src_sel_z = is_cm ? 2 : 1;
	else
		gds.src_sel_z = 7;
	gds.dst_sel_x = 0;
	gds.dst_sel_y = 7;
	gds.dst_sel_z = 7;
	gds.dst_sel_w = 7;
	gds.alloc_consume = !is_cm;

	r = r600_bytecode_add_gds(ctx->bc, &gds);
	if (r)
		return r;
	ctx->bc->cf_last->vpm = 1;
	return 0;
}
9505
9506static int get_lds_op(int opcode)
9507{
9508	switch (opcode) {
9509	case TGSI_OPCODE_ATOMUADD:
9510		return LDS_OP2_LDS_ADD_RET;
9511	case TGSI_OPCODE_ATOMAND:
9512		return LDS_OP2_LDS_AND_RET;
9513	case TGSI_OPCODE_ATOMOR:
9514		return LDS_OP2_LDS_OR_RET;
9515	case TGSI_OPCODE_ATOMXOR:
9516		return LDS_OP2_LDS_XOR_RET;
9517	case TGSI_OPCODE_ATOMUMIN:
9518		return LDS_OP2_LDS_MIN_UINT_RET;
9519	case TGSI_OPCODE_ATOMUMAX:
9520		return LDS_OP2_LDS_MAX_UINT_RET;
9521	case TGSI_OPCODE_ATOMIMIN:
9522		return LDS_OP2_LDS_MIN_INT_RET;
9523	case TGSI_OPCODE_ATOMIMAX:
9524		return LDS_OP2_LDS_MAX_INT_RET;
9525	case TGSI_OPCODE_ATOMXCHG:
9526		return LDS_OP2_LDS_XCHG_RET;
9527	case TGSI_OPCODE_ATOMCAS:
9528		return LDS_OP3_LDS_CMP_XCHG_RET;
9529	default:
9530		return -1;
9531	}
9532}
9533
/* Emit an atomic on shared memory (LDS).
 *
 * src[1] is the LDS address, src[2] the operand and src[3] the compare
 * value for ATOMCAS.  The *_RET LDS ops push the previous value onto the
 * LDS output queue, which is then popped into the destination via the
 * LDS_OQ_A_POP source selector.  Returns 0 on success.
 */
static int tgsi_atomic_op_lds(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	int lds_op = get_lds_op(inst->Instruction.Opcode);
	int r;

	struct r600_bytecode_alu alu;
	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
	alu.op = lds_op;
	alu.is_lds_idx_op = true;
	alu.last = 1;
	r600_bytecode_src(&alu.src[0], &ctx->src[1], 0);
	r600_bytecode_src(&alu.src[1], &ctx->src[2], 0);
	if (lds_op == LDS_OP3_LDS_CMP_XCHG_RET)
		r600_bytecode_src(&alu.src[2], &ctx->src[3], 0);
	else
		alu.src[2].sel = V_SQ_ALU_SRC_0;
	r = r600_bytecode_add_alu(ctx->bc, &alu);
	if (r)
		return r;

	/* then read from LDS_OQ_A_POP */
	memset(&alu, 0, sizeof(alu));

	alu.op = ALU_OP1_MOV;
	alu.src[0].sel = EG_V_SQ_ALU_SRC_LDS_OQ_A_POP;
	alu.src[0].chan = 0;
	tgsi_dst(ctx, &inst->Dst[0], 0, &alu.dst);
	alu.dst.write = 1;
	alu.last = 1;
	r = r600_bytecode_add_alu(ctx->bc, &alu);
	if (r)
		return r;

	return 0;
}
9570
9571static int tgsi_atomic_op(struct r600_shader_ctx *ctx)
9572{
9573	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
9574	if (inst->Src[0].Register.File == TGSI_FILE_IMAGE)
9575		return tgsi_atomic_op_rat(ctx);
9576	if (inst->Src[0].Register.File == TGSI_FILE_HW_ATOMIC)
9577		return tgsi_atomic_op_gds(ctx);
9578	if (inst->Src[0].Register.File == TGSI_FILE_BUFFER)
9579		return tgsi_atomic_op_rat(ctx);
9580	if (inst->Src[0].Register.File == TGSI_FILE_MEMORY)
9581		return tgsi_atomic_op_lds(ctx);
9582	return 0;
9583}
9584
/* Emit a TGSI RESQ (resource size query).
 *
 * Buffers (and buffer-textured images) are answered from the driver's
 * buffer-info constants via r600_do_buffer_txq(); everything else becomes
 * a TXQ-style texture instruction.  For cube arrays the Z component
 * (number of layers) is not reported by the hardware query, so it is read
 * from the shader buffer-info constant buffer instead and masked out of
 * the texture instruction.  Returns 0 on success.
 */
static int tgsi_resq(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	unsigned sampler_index_mode;
	struct r600_bytecode_tex tex;
	int r;
	boolean has_txq_cube_array_z = false;

	if (inst->Src[0].Register.File == TGSI_FILE_BUFFER ||
	    (inst->Src[0].Register.File == TGSI_FILE_IMAGE && inst->Memory.Texture == TGSI_TEXTURE_BUFFER)) {
		if (ctx->bc->gfx_level < EVERGREEN)
			ctx->shader->uses_tex_buffers = true;
		unsigned eg_buffer_base = 0;
		eg_buffer_base = R600_IMAGE_REAL_RESOURCE_OFFSET;
		/* Buffer resources follow the image resources. */
		if (inst->Src[0].Register.File == TGSI_FILE_BUFFER)
			eg_buffer_base += ctx->info.file_count[TGSI_FILE_IMAGE];
		return r600_do_buffer_txq(ctx, 0, ctx->shader->image_size_const_offset, eg_buffer_base);
	}

	if (inst->Memory.Texture == TGSI_TEXTURE_CUBE_ARRAY &&
	    inst->Dst[0].Register.WriteMask & 4) {
		ctx->shader->has_txq_cube_array_z_comp = true;
		has_txq_cube_array_z = true;
	}

	sampler_index_mode = inst->Src[0].Indirect.Index == 2 ? 2 : 0; // CF_INDEX_1 : CF_INDEX_NONE
	if (sampler_index_mode)
		egcm_load_index_reg(ctx->bc, 1, false);


	/* does this shader want a num layers from TXQ for a cube array? */
	if (has_txq_cube_array_z) {
		int id = tgsi_tex_get_src_gpr(ctx, 0) + ctx->shader->image_size_const_offset;
		struct r600_bytecode_alu alu;

		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP1_MOV;

		alu.src[0].sel = R600_SHADER_BUFFER_INFO_SEL;
		/* with eg each dword is either number of cubes */
		alu.src[0].sel += id / 4;
		alu.src[0].chan = id % 4;
		alu.src[0].kc_bank = R600_BUFFER_INFO_CONST_BUFFER;
		tgsi_dst(ctx, &inst->Dst[0], 2, &alu.dst);
		alu.last = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
		/* disable writemask from texture instruction */
		inst->Dst[0].Register.WriteMask &= ~4;
	}
	memset(&tex, 0, sizeof(struct r600_bytecode_tex));
	tex.op = ctx->inst_info->op;
	tex.sampler_id = R600_IMAGE_REAL_RESOURCE_OFFSET + inst->Src[0].Register.Index;
	tex.sampler_index_mode = sampler_index_mode;
	tex.resource_id = tex.sampler_id;
	tex.resource_index_mode = sampler_index_mode;
	/* src sel 4 = zero; TXQ needs no coordinates. */
	tex.src_sel_x = 4;
	tex.src_sel_y = 4;
	tex.src_sel_z = 4;
	tex.src_sel_w = 4;
	tex.dst_sel_x = (inst->Dst[0].Register.WriteMask & 1) ? 0 : 7;
	tex.dst_sel_y = (inst->Dst[0].Register.WriteMask & 2) ? 1 : 7;
	tex.dst_sel_z = (inst->Dst[0].Register.WriteMask & 4) ? 2 : 7;
	tex.dst_sel_w = (inst->Dst[0].Register.WriteMask & 8) ? 3 : 7;
	tex.dst_gpr = ctx->file_offset[inst->Dst[0].Register.File] + inst->Dst[0].Register.Index;
	r = r600_bytecode_add_tex(ctx->bc, &tex);
	if (r)
		return r;

	return 0;
}
9657
/* Emit the TGSI LRP instruction:
 *   dst = src0 * src1 + (1 - src0) * src2
 * computed per enabled destination component.  When src0 is the constant
 * 0.5 this reduces to (src1 + src2) / 2, done in a single ADD with the
 * divide-by-two output modifier (omod = 3).  Otherwise (1 - src0) and
 * (1 - src0) * src2 are staged in ctx->temp_reg and the result is formed
 * with MULADD.  Returns 0 on success.
 */
static int tgsi_lrp(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	struct r600_bytecode_alu alu;
	unsigned lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
	struct r600_bytecode_alu_src srcs[2][4];
	unsigned i;
	int r;

	/* optimize if it's just an equal balance */
	if (ctx->src[0].sel == V_SQ_ALU_SRC_0_5) {
		for (i = 0; i < lasti + 1; i++) {
			if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
				continue;

			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
			alu.op = ALU_OP2_ADD;
			r600_bytecode_src(&alu.src[0], &ctx->src[1], i);
			r600_bytecode_src(&alu.src[1], &ctx->src[2], i);
			alu.omod = 3;
			tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
			alu.dst.chan = i;
			if (i == lasti) {
				alu.last = 1;
			}
			r = r600_bytecode_add_alu(ctx->bc, &alu);
			if (r)
				return r;
		}
		return 0;
	}

	/* 1 - src0 */
	for (i = 0; i < lasti + 1; i++) {
		if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
			continue;

		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP2_ADD;
		alu.src[0].sel = V_SQ_ALU_SRC_1;
		alu.src[0].chan = 0;
		r600_bytecode_src(&alu.src[1], &ctx->src[0], i);
		r600_bytecode_src_toggle_neg(&alu.src[1]);
		alu.dst.sel = ctx->temp_reg;
		alu.dst.chan = i;
		if (i == lasti) {
			alu.last = 1;
		}
		alu.dst.write = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}

	/* (1 - src0) * src2 */
	for (i = 0; i < lasti + 1; i++) {
		if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
			continue;

		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP2_MUL;
		alu.src[0].sel = ctx->temp_reg;
		alu.src[0].chan = i;
		r600_bytecode_src(&alu.src[1], &ctx->src[2], i);
		alu.dst.sel = ctx->temp_reg;
		alu.dst.chan = i;
		if (i == lasti) {
			alu.last = 1;
		}
		alu.dst.write = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}

	/* src0 * src1 + (1 - src0) * src2 */

	/* Legalize src0/src1 for use in a three-source op. */
	for (i = 0; i < 2; i++) {
		r = tgsi_make_src_for_op3(ctx, inst->Dst[0].Register.WriteMask,
					  srcs[i], &ctx->src[i]);
		if (r)
			return r;
	}

	for (i = 0; i < lasti + 1; i++) {
		if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
			continue;

		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP3_MULADD;
		alu.is_op3 = 1;
		alu.src[0] = srcs[0][i];
		alu.src[1] = srcs[1][i];
		alu.src[2].sel = ctx->temp_reg;
		alu.src[2].chan = i;

		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
		alu.dst.chan = i;
		if (i == lasti) {
			alu.last = 1;
		}
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}
	return 0;
}
9765
/* Emit the TGSI CMP instruction:
 *   dst = (src0 < 0.0) ? src1 : src2
 * implemented as CNDGE(src0, src2, src1), i.e. src0 >= 0 selects src2.
 * Special case: when src0 carries both abs and neg modifiers the operand
 * is -|x|, which is >= 0 only when x == 0, so the modifiers are dropped
 * and CNDE (compare against zero) is used instead.  Returns 0 on success.
 */
static int tgsi_cmp(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	struct r600_bytecode_alu alu;
	int i, r, j;
	int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
	struct r600_bytecode_alu_src srcs[3][4];

	unsigned op;

	if (ctx->src[0].abs && ctx->src[0].neg) {
		op = ALU_OP3_CNDE;
		ctx->src[0].abs = 0;
		ctx->src[0].neg = 0;
	} else {
		op = ALU_OP3_CNDGE;
	}

	/* Legalize all sources for use in a three-source op. */
	for (j = 0; j < inst->Instruction.NumSrcRegs; j++) {
		r = tgsi_make_src_for_op3(ctx, inst->Dst[0].Register.WriteMask,
					  srcs[j], &ctx->src[j]);
		if (r)
			return r;
	}

	for (i = 0; i < lasti + 1; i++) {
		if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
			continue;

		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = op;
		alu.src[0] = srcs[0][i];
		/* CND*(a, b, c): b is taken when the condition holds, so
		 * src2 goes in slot 1 and src1 in slot 2. */
		alu.src[1] = srcs[2][i];
		alu.src[2] = srcs[1][i];

		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
		alu.dst.chan = i;
		alu.dst.write = 1;
		alu.is_op3 = 1;
		if (i == lasti)
			alu.last = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}
	return 0;
}
9813
/* Emit the TGSI UCMP instruction:
 *   dst = (src0 != 0) ? src1 : src2
 * implemented per component as CNDE_INT(src0, src2, src1), i.e. src0 == 0
 * selects src2.  Returns 0 on success.
 */
static int tgsi_ucmp(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	struct r600_bytecode_alu alu;
	int i, r;
	int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);

	for (i = 0; i < lasti + 1; i++) {
		if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
			continue;

		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP3_CNDE_INT;
		r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
		/* Slot 1 is taken when src0 == 0, hence src2 first. */
		r600_bytecode_src(&alu.src[1], &ctx->src[2], i);
		r600_bytecode_src(&alu.src[2], &ctx->src[1], i);
		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
		alu.dst.chan = i;
		alu.dst.write = 1;
		alu.is_op3 = 1;
		if (i == lasti)
			alu.last = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}
	return 0;
}
9842
9843static int tgsi_exp(struct r600_shader_ctx *ctx)
9844{
9845	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
9846	struct r600_bytecode_alu alu;
9847	int r;
9848	unsigned i;
9849
9850	/* result.x = 2^floor(src); */
9851	if (inst->Dst[0].Register.WriteMask & 1) {
9852		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
9853
9854		alu.op = ALU_OP1_FLOOR;
9855		r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
9856
9857		alu.dst.sel = ctx->temp_reg;
9858		alu.dst.chan = 0;
9859		alu.dst.write = 1;
9860		alu.last = 1;
9861		r = r600_bytecode_add_alu(ctx->bc, &alu);
9862		if (r)
9863			return r;
9864
9865		if (ctx->bc->gfx_level == CAYMAN) {
9866			for (i = 0; i < 3; i++) {
9867				alu.op = ALU_OP1_EXP_IEEE;
9868				alu.src[0].sel = ctx->temp_reg;
9869				alu.src[0].chan = 0;
9870
9871				alu.dst.sel = ctx->temp_reg;
9872				alu.dst.chan = i;
9873				alu.dst.write = i == 0;
9874				alu.last = i == 2;
9875				r = r600_bytecode_add_alu(ctx->bc, &alu);
9876				if (r)
9877					return r;
9878			}
9879		} else {
9880			alu.op = ALU_OP1_EXP_IEEE;
9881			alu.src[0].sel = ctx->temp_reg;
9882			alu.src[0].chan = 0;
9883
9884			alu.dst.sel = ctx->temp_reg;
9885			alu.dst.chan = 0;
9886			alu.dst.write = 1;
9887			alu.last = 1;
9888			r = r600_bytecode_add_alu(ctx->bc, &alu);
9889			if (r)
9890				return r;
9891		}
9892	}
9893
9894	/* result.y = tmp - floor(tmp); */
9895	if ((inst->Dst[0].Register.WriteMask >> 1) & 1) {
9896		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
9897
9898		alu.op = ALU_OP1_FRACT;
9899		r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
9900
9901		alu.dst.sel = ctx->temp_reg;
9902#if 0
9903		r = tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
9904		if (r)
9905			return r;
9906#endif
9907		alu.dst.write = 1;
9908		alu.dst.chan = 1;
9909
9910		alu.last = 1;
9911
9912		r = r600_bytecode_add_alu(ctx->bc, &alu);
9913		if (r)
9914			return r;
9915	}
9916
9917	/* result.z = RoughApprox2ToX(tmp);*/
9918	if ((inst->Dst[0].Register.WriteMask >> 2) & 0x1) {
9919		if (ctx->bc->gfx_level == CAYMAN) {
9920			for (i = 0; i < 3; i++) {
9921				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
9922				alu.op = ALU_OP1_EXP_IEEE;
9923				r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
9924
9925				alu.dst.sel = ctx->temp_reg;
9926				alu.dst.chan = i;
9927				if (i == 2) {
9928					alu.dst.write = 1;
9929					alu.last = 1;
9930				}
9931
9932				r = r600_bytecode_add_alu(ctx->bc, &alu);
9933				if (r)
9934					return r;
9935			}
9936		} else {
9937			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
9938			alu.op = ALU_OP1_EXP_IEEE;
9939			r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
9940
9941			alu.dst.sel = ctx->temp_reg;
9942			alu.dst.write = 1;
9943			alu.dst.chan = 2;
9944
9945			alu.last = 1;
9946
9947			r = r600_bytecode_add_alu(ctx->bc, &alu);
9948			if (r)
9949				return r;
9950		}
9951	}
9952
9953	/* result.w = 1.0;*/
9954	if ((inst->Dst[0].Register.WriteMask >> 3) & 0x1) {
9955		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
9956
9957		alu.op = ALU_OP1_MOV;
9958		alu.src[0].sel = V_SQ_ALU_SRC_1;
9959		alu.src[0].chan = 0;
9960
9961		alu.dst.sel = ctx->temp_reg;
9962		alu.dst.chan = 3;
9963		alu.dst.write = 1;
9964		alu.last = 1;
9965		r = r600_bytecode_add_alu(ctx->bc, &alu);
9966		if (r)
9967			return r;
9968	}
9969	return tgsi_helper_copy(ctx, inst);
9970}
9971
9972static int tgsi_log(struct r600_shader_ctx *ctx)
9973{
9974	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
9975	struct r600_bytecode_alu alu;
9976	int r;
9977	unsigned i;
9978
9979	/* result.x = floor(log2(|src|)); */
9980	if (inst->Dst[0].Register.WriteMask & 1) {
9981		if (ctx->bc->gfx_level == CAYMAN) {
9982			for (i = 0; i < 3; i++) {
9983				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
9984
9985				alu.op = ALU_OP1_LOG_IEEE;
9986				r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
9987				r600_bytecode_src_set_abs(&alu.src[0]);
9988
9989				alu.dst.sel = ctx->temp_reg;
9990				alu.dst.chan = i;
9991				if (i == 0)
9992					alu.dst.write = 1;
9993				if (i == 2)
9994					alu.last = 1;
9995				r = r600_bytecode_add_alu(ctx->bc, &alu);
9996				if (r)
9997					return r;
9998			}
9999
10000		} else {
10001			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
10002
10003			alu.op = ALU_OP1_LOG_IEEE;
10004			r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
10005			r600_bytecode_src_set_abs(&alu.src[0]);
10006
10007			alu.dst.sel = ctx->temp_reg;
10008			alu.dst.chan = 0;
10009			alu.dst.write = 1;
10010			alu.last = 1;
10011			r = r600_bytecode_add_alu(ctx->bc, &alu);
10012			if (r)
10013				return r;
10014		}
10015
10016		alu.op = ALU_OP1_FLOOR;
10017		alu.src[0].sel = ctx->temp_reg;
10018		alu.src[0].chan = 0;
10019
10020		alu.dst.sel = ctx->temp_reg;
10021		alu.dst.chan = 0;
10022		alu.dst.write = 1;
10023		alu.last = 1;
10024
10025		r = r600_bytecode_add_alu(ctx->bc, &alu);
10026		if (r)
10027			return r;
10028	}
10029
10030	/* result.y = |src.x| / (2 ^ floor(log2(|src.x|))); */
10031	if ((inst->Dst[0].Register.WriteMask >> 1) & 1) {
10032
10033		if (ctx->bc->gfx_level == CAYMAN) {
10034			for (i = 0; i < 3; i++) {
10035				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
10036
10037				alu.op = ALU_OP1_LOG_IEEE;
10038				r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
10039				r600_bytecode_src_set_abs(&alu.src[0]);
10040
10041				alu.dst.sel = ctx->temp_reg;
10042				alu.dst.chan = i;
10043				if (i == 1)
10044					alu.dst.write = 1;
10045				if (i == 2)
10046					alu.last = 1;
10047
10048				r = r600_bytecode_add_alu(ctx->bc, &alu);
10049				if (r)
10050					return r;
10051			}
10052		} else {
10053			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
10054
10055			alu.op = ALU_OP1_LOG_IEEE;
10056			r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
10057			r600_bytecode_src_set_abs(&alu.src[0]);
10058
10059			alu.dst.sel = ctx->temp_reg;
10060			alu.dst.chan = 1;
10061			alu.dst.write = 1;
10062			alu.last = 1;
10063
10064			r = r600_bytecode_add_alu(ctx->bc, &alu);
10065			if (r)
10066				return r;
10067		}
10068
10069		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
10070
10071		alu.op = ALU_OP1_FLOOR;
10072		alu.src[0].sel = ctx->temp_reg;
10073		alu.src[0].chan = 1;
10074
10075		alu.dst.sel = ctx->temp_reg;
10076		alu.dst.chan = 1;
10077		alu.dst.write = 1;
10078		alu.last = 1;
10079
10080		r = r600_bytecode_add_alu(ctx->bc, &alu);
10081		if (r)
10082			return r;
10083
10084		if (ctx->bc->gfx_level == CAYMAN) {
10085			for (i = 0; i < 3; i++) {
10086				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
10087				alu.op = ALU_OP1_EXP_IEEE;
10088				alu.src[0].sel = ctx->temp_reg;
10089				alu.src[0].chan = 1;
10090
10091				alu.dst.sel = ctx->temp_reg;
10092				alu.dst.chan = i;
10093				if (i == 1)
10094					alu.dst.write = 1;
10095				if (i == 2)
10096					alu.last = 1;
10097
10098				r = r600_bytecode_add_alu(ctx->bc, &alu);
10099				if (r)
10100					return r;
10101			}
10102		} else {
10103			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
10104			alu.op = ALU_OP1_EXP_IEEE;
10105			alu.src[0].sel = ctx->temp_reg;
10106			alu.src[0].chan = 1;
10107
10108			alu.dst.sel = ctx->temp_reg;
10109			alu.dst.chan = 1;
10110			alu.dst.write = 1;
10111			alu.last = 1;
10112
10113			r = r600_bytecode_add_alu(ctx->bc, &alu);
10114			if (r)
10115				return r;
10116		}
10117
10118		if (ctx->bc->gfx_level == CAYMAN) {
10119			for (i = 0; i < 3; i++) {
10120				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
10121				alu.op = ALU_OP1_RECIP_IEEE;
10122				alu.src[0].sel = ctx->temp_reg;
10123				alu.src[0].chan = 1;
10124
10125				alu.dst.sel = ctx->temp_reg;
10126				alu.dst.chan = i;
10127				if (i == 1)
10128					alu.dst.write = 1;
10129				if (i == 2)
10130					alu.last = 1;
10131
10132				r = r600_bytecode_add_alu(ctx->bc, &alu);
10133				if (r)
10134					return r;
10135			}
10136		} else {
10137			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
10138			alu.op = ALU_OP1_RECIP_IEEE;
10139			alu.src[0].sel = ctx->temp_reg;
10140			alu.src[0].chan = 1;
10141
10142			alu.dst.sel = ctx->temp_reg;
10143			alu.dst.chan = 1;
10144			alu.dst.write = 1;
10145			alu.last = 1;
10146
10147			r = r600_bytecode_add_alu(ctx->bc, &alu);
10148			if (r)
10149				return r;
10150		}
10151
10152		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
10153
10154		alu.op = ALU_OP2_MUL;
10155
10156		r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
10157		r600_bytecode_src_set_abs(&alu.src[0]);
10158
10159		alu.src[1].sel = ctx->temp_reg;
10160		alu.src[1].chan = 1;
10161
10162		alu.dst.sel = ctx->temp_reg;
10163		alu.dst.chan = 1;
10164		alu.dst.write = 1;
10165		alu.last = 1;
10166
10167		r = r600_bytecode_add_alu(ctx->bc, &alu);
10168		if (r)
10169			return r;
10170	}
10171
10172	/* result.z = log2(|src|);*/
10173	if ((inst->Dst[0].Register.WriteMask >> 2) & 1) {
10174		if (ctx->bc->gfx_level == CAYMAN) {
10175			for (i = 0; i < 3; i++) {
10176				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
10177
10178				alu.op = ALU_OP1_LOG_IEEE;
10179				r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
10180				r600_bytecode_src_set_abs(&alu.src[0]);
10181
10182				alu.dst.sel = ctx->temp_reg;
10183				if (i == 2)
10184					alu.dst.write = 1;
10185				alu.dst.chan = i;
10186				if (i == 2)
10187					alu.last = 1;
10188
10189				r = r600_bytecode_add_alu(ctx->bc, &alu);
10190				if (r)
10191					return r;
10192			}
10193		} else {
10194			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
10195
10196			alu.op = ALU_OP1_LOG_IEEE;
10197			r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
10198			r600_bytecode_src_set_abs(&alu.src[0]);
10199
10200			alu.dst.sel = ctx->temp_reg;
10201			alu.dst.write = 1;
10202			alu.dst.chan = 2;
10203			alu.last = 1;
10204
10205			r = r600_bytecode_add_alu(ctx->bc, &alu);
10206			if (r)
10207				return r;
10208		}
10209	}
10210
10211	/* result.w = 1.0; */
10212	if ((inst->Dst[0].Register.WriteMask >> 3) & 1) {
10213		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
10214
10215		alu.op = ALU_OP1_MOV;
10216		alu.src[0].sel = V_SQ_ALU_SRC_1;
10217		alu.src[0].chan = 0;
10218
10219		alu.dst.sel = ctx->temp_reg;
10220		alu.dst.chan = 3;
10221		alu.dst.write = 1;
10222		alu.last = 1;
10223
10224		r = r600_bytecode_add_alu(ctx->bc, &alu);
10225		if (r)
10226			return r;
10227	}
10228
10229	return tgsi_helper_copy(ctx, inst);
10230}
10231
/* Evergreen+ ARL/ARR/UARL: load an address/index register from src,
 * converting float to integer as required by the opcode, then invalidate
 * the cached register so it is reloaded before its next use.
 * Returns 0 on success or the r600_bytecode_add_alu() error code. */
static int tgsi_eg_arl(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	struct r600_bytecode_alu alu;
	int r;
	int i, lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
	/* dst index 0 is the AR register, 1..2 are the extra index registers */
	unsigned reg = get_address_file_reg(ctx, inst->Dst[0].Register.Index);

	assert(inst->Dst[0].Register.Index < 3);
	memset(&alu, 0, sizeof(struct r600_bytecode_alu));

	switch (inst->Instruction.Opcode) {
	case TGSI_OPCODE_ARL:
		/* float -> int, rounding toward -inf */
		alu.op = ALU_OP1_FLT_TO_INT_FLOOR;
		break;
	case TGSI_OPCODE_ARR:
		alu.op = ALU_OP1_FLT_TO_INT;
		break;
	case TGSI_OPCODE_UARL:
		/* source is already an integer, just copy it */
		alu.op = ALU_OP1_MOV;
		break;
	default:
		assert(0);
		return -1;
	}

	/* emit one conversion per channel selected by the write mask */
	for (i = 0; i <= lasti; ++i) {
		if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
			continue;
		r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
		alu.last = i == lasti;
		alu.dst.sel = reg;
	        alu.dst.chan = i;
		alu.dst.write = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}

	/* mark the cached value stale so users of AR / the index regs reload */
	if (inst->Dst[0].Register.Index > 0)
		ctx->bc->index_loaded[inst->Dst[0].Register.Index - 1] = 0;
	else
		ctx->bc->ar_loaded = 0;

	return 0;
}
/* r600/r700 ARL/ARR/UARL: load the AR register from src.  Unlike the
 * Evergreen path there is no FLT_TO_INT_FLOOR op, so ARL is emitted as
 * FLOOR followed by an in-place FLT_TO_INT.  Returns 0 on success. */
static int tgsi_r600_arl(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	struct r600_bytecode_alu alu;
	int r;
	int i, lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);

	switch (inst->Instruction.Opcode) {
	case TGSI_OPCODE_ARL:
		/* step 1: ar_reg = floor(src) for each written channel */
		memset(&alu, 0, sizeof(alu));
		alu.op = ALU_OP1_FLOOR;
		alu.dst.sel = ctx->bc->ar_reg;
		alu.dst.write = 1;
		for (i = 0; i <= lasti; ++i) {
			if (inst->Dst[0].Register.WriteMask & (1 << i))  {
				alu.dst.chan = i;
				r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
				alu.last = i == lasti;
				if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
					return r;
			}
		}

		/* step 2: ar_reg = (int)ar_reg, converting in place.
		 * NOTE(review): this loop converts channels 0..lasti
		 * unconditionally, without the write-mask check used above —
		 * confirm this is intentional (unwritten channels hold
		 * stale data that is never read). */
		memset(&alu, 0, sizeof(alu));
		alu.op = ALU_OP1_FLT_TO_INT;
		alu.src[0].sel = ctx->bc->ar_reg;
		alu.dst.sel = ctx->bc->ar_reg;
		alu.dst.write = 1;
		/* FLT_TO_INT is trans-only on r600/r700 */
		alu.last = TRUE;
		for (i = 0; i <= lasti; ++i) {
			alu.dst.chan = i;
			alu.src[0].chan = i;
			if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
				return r;
		}
		break;
	case TGSI_OPCODE_ARR:
		/* ar_reg = (int)src, one op per written channel */
		memset(&alu, 0, sizeof(alu));
		alu.op = ALU_OP1_FLT_TO_INT;
		alu.dst.sel = ctx->bc->ar_reg;
		alu.dst.write = 1;
		/* FLT_TO_INT is trans-only on r600/r700 */
		alu.last = TRUE;
		for (i = 0; i <= lasti; ++i) {
			if (inst->Dst[0].Register.WriteMask & (1 << i)) {
				alu.dst.chan = i;
				r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
				if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
					return r;
			}
		}
		break;
	case TGSI_OPCODE_UARL:
		/* source is already an integer, just copy it into ar_reg */
		memset(&alu, 0, sizeof(alu));
		alu.op = ALU_OP1_MOV;
		alu.dst.sel = ctx->bc->ar_reg;
		alu.dst.write = 1;
		for (i = 0; i <= lasti; ++i) {
			if (inst->Dst[0].Register.WriteMask & (1 << i)) {
				alu.dst.chan = i;
				r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
				alu.last = i == lasti;
				if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
					return r;
			}
		}
		break;
	default:
		assert(0);
		return -1;
	}

	/* force a reload of AR before its next use */
	ctx->bc->ar_loaded = 0;
	return 0;
}
10354
10355static int tgsi_opdst(struct r600_shader_ctx *ctx)
10356{
10357	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
10358	struct r600_bytecode_alu alu;
10359	int i, r = 0;
10360
10361	for (i = 0; i < 4; i++) {
10362		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
10363
10364		alu.op = ALU_OP2_MUL;
10365		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
10366
10367		if (i == 0 || i == 3) {
10368			alu.src[0].sel = V_SQ_ALU_SRC_1;
10369		} else {
10370			r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
10371		}
10372
10373		if (i == 0 || i == 2) {
10374			alu.src[1].sel = V_SQ_ALU_SRC_1;
10375		} else {
10376			r600_bytecode_src(&alu.src[1], &ctx->src[1], i);
10377		}
10378		if (i == 3)
10379			alu.last = 1;
10380		r = r600_bytecode_add_alu(ctx->bc, &alu);
10381		if (r)
10382			return r;
10383	}
10384	return 0;
10385}
10386
10387static int emit_logic_pred(struct r600_shader_ctx *ctx, int opcode, int alu_type,
10388			   struct r600_bytecode_alu_src *src)
10389{
10390	struct r600_bytecode_alu alu;
10391	int r;
10392
10393	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
10394	alu.op = opcode;
10395	alu.execute_mask = 1;
10396	alu.update_pred = 1;
10397
10398	alu.dst.sel = ctx->temp_reg;
10399	alu.dst.write = 1;
10400	alu.dst.chan = 0;
10401
10402	alu.src[0] = *src;
10403	alu.src[1].sel = V_SQ_ALU_SRC_0;
10404	alu.src[1].chan = 0;
10405
10406	alu.last = 1;
10407
10408	r = r600_bytecode_add_alu_type(ctx->bc, &alu, alu_type);
10409	if (r)
10410		return r;
10411	return 0;
10412}
10413
/* Emit `pops` branch-stack pops.  When the previous CF instruction is an
 * ALU clause that can absorb them (ALU -> ALU_POP_AFTER / ALU_POP2_AFTER),
 * patch that clause instead of emitting a standalone CF_OP_POP. */
static int pops(struct r600_shader_ctx *ctx, int pops)
{
	unsigned force_pop = ctx->bc->force_add_cf;

	if (!force_pop) {
		/* 0/1 = pops already carried by cf_last; 3 = cannot merge */
		int alu_pop = 3;
		if (ctx->bc->cf_last) {
			if (ctx->bc->cf_last->op == CF_OP_ALU)
				alu_pop = 0;
			else if (ctx->bc->cf_last->op == CF_OP_ALU_POP_AFTER)
				alu_pop = 1;
		}
		alu_pop += pops;
		if (alu_pop == 1) {
			ctx->bc->cf_last->op = CF_OP_ALU_POP_AFTER;
			/* the clause is now final; start a new CF next time */
			ctx->bc->force_add_cf = 1;
		} else if (alu_pop == 2) {
			ctx->bc->cf_last->op = CF_OP_ALU_POP2_AFTER;
			ctx->bc->force_add_cf = 1;
		} else {
			/* more pops than an ALU clause can carry */
			force_pop = 1;
		}
	}

	if (force_pop) {
		r600_bytecode_add_cfinst(ctx->bc, CF_OP_POP);
		ctx->bc->cf_last->pop_count = pops;
		/* continue at the CF instruction right after this one */
		ctx->bc->cf_last->cf_addr = ctx->bc->cf_last->id + 2;
	}

	return 0;
}
10446
/* Recompute the worst-case branch-stack usage after a stack operation of
 * the given kind and record it (in hw stack entries) in stack->max_entries.
 * Returns the current element count, which emit_if() uses to decide on the
 * 8xx stack workaround. */
static inline int callstack_update_max_depth(struct r600_shader_ctx *ctx,
                                              unsigned reason)
{
	struct r600_stack_info *stack = &ctx->bc->stack;
	unsigned elements;
	int entries;

	unsigned entry_size = stack->entry_size;

	/* each LOOP/WQM frame takes a full entry, each VPM push one element */
	elements = (stack->loop + stack->push_wqm ) * entry_size;
	elements += stack->push;

	switch (ctx->bc->gfx_level) {
	case R600:
	case R700:
		/* pre-r8xx: if any non-WQM PUSH instruction is invoked, 2 elements on
		 * the stack must be reserved to hold the current active/continue
		 * masks */
		if (reason == FC_PUSH_VPM || stack->push > 0) {
			elements += 2;
		}
		break;

	case CAYMAN:
		/* r9xx: any stack operation on empty stack consumes 2 additional
		 * elements */
		elements += 2;

		FALLTHROUGH;
		/* FIXME: do the two elements added above cover the cases for the
		 * r8xx+ below? */

	case EVERGREEN:
		/* r8xx+: 2 extra elements are not always required, but one extra
		 * element must be added for each of the following cases:
		 * 1. There is an ALU_ELSE_AFTER instruction at the point of greatest
		 *    stack usage.
		 *    (Currently we don't use ALU_ELSE_AFTER.)
		 * 2. There are LOOP/WQM frames on the stack when any flavor of non-WQM
		 *    PUSH instruction executed.
		 *
		 *    NOTE: it seems we also need to reserve additional element in some
		 *    other cases, e.g. when we have 4 levels of PUSH_VPM in the shader,
		 *    then STACK_SIZE should be 2 instead of 1 */
		if (reason == FC_PUSH_VPM || stack->push > 0) {
			elements += 1;
		}
		break;

	default:
		assert(0);
		break;
	}

	/* NOTE: it seems STACK_SIZE is interpreted by hw as if entry_size is 4
	 * for all chips, so we use 4 in the final formula, not the real entry_size
	 * for the chip */
	entry_size = 4;

	entries = (elements + (entry_size - 1)) / entry_size;

	if (entries > stack->max_entries)
		stack->max_entries = entries;
	return elements;
}
10512
10513static inline void callstack_pop(struct r600_shader_ctx *ctx, unsigned reason)
10514{
10515	switch(reason) {
10516	case FC_PUSH_VPM:
10517		--ctx->bc->stack.push;
10518		assert(ctx->bc->stack.push >= 0);
10519		break;
10520	case FC_PUSH_WQM:
10521		--ctx->bc->stack.push_wqm;
10522		assert(ctx->bc->stack.push_wqm >= 0);
10523		break;
10524	case FC_LOOP:
10525		--ctx->bc->stack.loop;
10526		assert(ctx->bc->stack.loop >= 0);
10527		break;
10528	default:
10529		assert(0);
10530		break;
10531	}
10532}
10533
10534static inline int callstack_push(struct r600_shader_ctx *ctx, unsigned reason)
10535{
10536	switch (reason) {
10537	case FC_PUSH_VPM:
10538		++ctx->bc->stack.push;
10539		break;
10540	case FC_PUSH_WQM:
10541		++ctx->bc->stack.push_wqm;
10542		break;
10543	case FC_LOOP:
10544		++ctx->bc->stack.loop;
10545		break;
10546	default:
10547		assert(0);
10548	}
10549
10550	return callstack_update_max_depth(ctx, reason);
10551}
10552
10553static void fc_set_mid(struct r600_shader_ctx *ctx, int fc_sp)
10554{
10555	struct r600_cf_stack_entry *sp = &ctx->bc->fc_stack[fc_sp];
10556
10557	sp->mid = realloc((void *)sp->mid,
10558						sizeof(struct r600_bytecode_cf *) * (sp->num_mid + 1));
10559	sp->mid[sp->num_mid] = ctx->bc->cf_last;
10560	sp->num_mid++;
10561}
10562
10563static void fc_pushlevel(struct r600_shader_ctx *ctx, int type)
10564{
10565	assert(ctx->bc->fc_sp < ARRAY_SIZE(ctx->bc->fc_stack));
10566	ctx->bc->fc_stack[ctx->bc->fc_sp].type = type;
10567	ctx->bc->fc_stack[ctx->bc->fc_sp].start = ctx->bc->cf_last;
10568	ctx->bc->fc_sp++;
10569}
10570
10571static void fc_poplevel(struct r600_shader_ctx *ctx)
10572{
10573	struct r600_cf_stack_entry *sp = &ctx->bc->fc_stack[ctx->bc->fc_sp - 1];
10574	free(sp->mid);
10575	sp->mid = NULL;
10576	sp->num_mid = 0;
10577	sp->start = NULL;
10578	sp->type = 0;
10579	ctx->bc->fc_sp--;
10580}
10581
10582#if 0
10583static int emit_return(struct r600_shader_ctx *ctx)
10584{
10585	r600_bytecode_add_cfinst(ctx->bc, CF_OP_RETURN));
10586	return 0;
10587}
10588
10589static int emit_jump_to_offset(struct r600_shader_ctx *ctx, int pops, int offset)
10590{
10591
10592	r600_bytecode_add_cfinst(ctx->bc, CF_OP_JUMP));
10593	ctx->bc->cf_last->pop_count = pops;
10594	/* XXX work out offset */
10595	return 0;
10596}
10597
10598static int emit_setret_in_loop_flag(struct r600_shader_ctx *ctx, unsigned flag_value)
10599{
10600	return 0;
10601}
10602
10603static void emit_testflag(struct r600_shader_ctx *ctx)
10604{
10605
10606}
10607
10608static void emit_return_on_flag(struct r600_shader_ctx *ctx, unsigned ifidx)
10609{
10610	emit_testflag(ctx);
10611	emit_jump_to_offset(ctx, 1, 4);
10612	emit_setret_in_loop_flag(ctx, V_SQ_ALU_SRC_0);
10613	pops(ctx, ifidx + 1);
10614	emit_return(ctx);
10615}
10616
10617static void break_loop_on_flag(struct r600_shader_ctx *ctx, unsigned fc_sp)
10618{
10619	emit_testflag(ctx);
10620
10621	r600_bytecode_add_cfinst(ctx->bc, ctx->inst_info->op);
10622	ctx->bc->cf_last->pop_count = 1;
10623
10624	fc_set_mid(ctx, fc_sp);
10625
10626	pops(ctx, 1);
10627}
10628#endif
10629
/* Open an IF construct: account for the stack push, emit the
 * predicate-setting ALU clause (normally ALU_PUSH_BEFORE) plus the matching
 * JUMP, and push an FC_IF frame so tgsi_else/tgsi_endif can patch the jump
 * target later. */
static int emit_if(struct r600_shader_ctx *ctx, int opcode,
		   struct r600_bytecode_alu_src *src)
{
	int alu_type = CF_OP_ALU_PUSH_BEFORE;
	bool needs_workaround = false;
	int elems = callstack_push(ctx, FC_PUSH_VPM);

	if (ctx->bc->gfx_level == CAYMAN && ctx->bc->stack.loop > 1)
		needs_workaround = true;

	/* NOTE(review): on Evergreen the workaround appears to be needed when
	 * the element count sits at (or one below) an entry_size boundary —
	 * confirm against the hw erratum this encodes. */
	if (ctx->bc->gfx_level == EVERGREEN && ctx_needs_stack_workaround_8xx(ctx)) {
		unsigned dmod1 = (elems - 1) % ctx->bc->stack.entry_size;
		unsigned dmod2 = (elems) % ctx->bc->stack.entry_size;

		if (elems && (!dmod1 || !dmod2))
			needs_workaround = true;
	}

	/* There is a hardware bug on Cayman where a BREAK/CONTINUE followed by
	 * LOOP_STARTxxx for nested loops may put the branch stack into a state
	 * such that ALU_PUSH_BEFORE doesn't work as expected. Workaround this
	 * by replacing the ALU_PUSH_BEFORE with a PUSH + ALU */
	if (needs_workaround) {
		r600_bytecode_add_cfinst(ctx->bc, CF_OP_PUSH);
		ctx->bc->cf_last->cf_addr = ctx->bc->cf_last->id + 2;
		alu_type = CF_OP_ALU;
	}

	emit_logic_pred(ctx, opcode, alu_type, src);

	/* jump target is patched by tgsi_else/tgsi_endif */
	r600_bytecode_add_cfinst(ctx->bc, CF_OP_JUMP);

	fc_pushlevel(ctx, FC_IF);

	return 0;
}
10666
10667static int tgsi_if(struct r600_shader_ctx *ctx)
10668{
10669	struct r600_bytecode_alu_src alu_src;
10670	r600_bytecode_src(&alu_src, &ctx->src[0], 0);
10671
10672	return emit_if(ctx, ALU_OP2_PRED_SETNE, &alu_src);
10673}
10674
10675static int tgsi_uif(struct r600_shader_ctx *ctx)
10676{
10677	struct r600_bytecode_alu_src alu_src;
10678	r600_bytecode_src(&alu_src, &ctx->src[0], 0);
10679	return emit_if(ctx, ALU_OP2_PRED_SETNE_INT, &alu_src);
10680}
10681
10682static int tgsi_else(struct r600_shader_ctx *ctx)
10683{
10684	r600_bytecode_add_cfinst(ctx->bc, CF_OP_ELSE);
10685	ctx->bc->cf_last->pop_count = 1;
10686
10687	fc_set_mid(ctx, ctx->bc->fc_sp - 1);
10688	ctx->bc->fc_stack[ctx->bc->fc_sp - 1].start->cf_addr = ctx->bc->cf_last->id;
10689	return 0;
10690}
10691
/* ENDIF: pop the branch stack and patch the pending jump (the IF's JUMP,
 * or the ELSE if there was one) to land just past the last CF instruction. */
static int tgsi_endif(struct r600_shader_ctx *ctx)
{
	int offset = 2;	/* a CF instruction is normally 2 dwords */
	pops(ctx, 1);
	if (ctx->bc->fc_stack[ctx->bc->fc_sp - 1].type != FC_IF) {
		R600_ERR("if/endif unbalanced in shader\n");
		return -1;
	}

	/* ALU_EXTENDED needs 4 DWords instead of two, adjust jump target offset accordingly */
	if (ctx->bc->cf_last->eg_alu_extended)
			offset += 2;

	/* no ELSE recorded: patch the IF's JUMP; otherwise patch the ELSE */
	if (ctx->bc->fc_stack[ctx->bc->fc_sp - 1].mid == NULL) {
		ctx->bc->fc_stack[ctx->bc->fc_sp - 1].start->cf_addr = ctx->bc->cf_last->id + offset;
		ctx->bc->fc_stack[ctx->bc->fc_sp - 1].start->pop_count = 1;
	} else {
		ctx->bc->fc_stack[ctx->bc->fc_sp - 1].mid[0]->cf_addr = ctx->bc->cf_last->id + offset;
	}
	fc_poplevel(ctx);

	callstack_pop(ctx, FC_PUSH_VPM);
	return 0;
}
10716
10717static int tgsi_bgnloop(struct r600_shader_ctx *ctx)
10718{
10719	/* LOOP_START_DX10 ignores the LOOP_CONFIG* registers, so it is not
10720	 * limited to 4096 iterations, like the other LOOP_* instructions. */
10721	r600_bytecode_add_cfinst(ctx->bc, CF_OP_LOOP_START_DX10);
10722
10723	fc_pushlevel(ctx, FC_LOOP);
10724
10725	/* check stack depth */
10726	callstack_push(ctx, FC_LOOP);
10727	return 0;
10728}
10729
10730static int tgsi_endloop(struct r600_shader_ctx *ctx)
10731{
10732	int i;
10733
10734	r600_bytecode_add_cfinst(ctx->bc, CF_OP_LOOP_END);
10735
10736	if (ctx->bc->fc_stack[ctx->bc->fc_sp - 1].type != FC_LOOP) {
10737		R600_ERR("loop/endloop in shader code are not paired.\n");
10738		return -EINVAL;
10739	}
10740
10741	/* fixup loop pointers - from r600isa
10742	   LOOP END points to CF after LOOP START,
10743	   LOOP START point to CF after LOOP END
10744	   BRK/CONT point to LOOP END CF
10745	*/
10746	ctx->bc->cf_last->cf_addr = ctx->bc->fc_stack[ctx->bc->fc_sp - 1].start->id + 2;
10747
10748	ctx->bc->fc_stack[ctx->bc->fc_sp - 1].start->cf_addr = ctx->bc->cf_last->id + 2;
10749
10750	for (i = 0; i < ctx->bc->fc_stack[ctx->bc->fc_sp - 1].num_mid; i++) {
10751		ctx->bc->fc_stack[ctx->bc->fc_sp - 1].mid[i]->cf_addr = ctx->bc->cf_last->id;
10752	}
10753	/* XXX add LOOPRET support */
10754	fc_poplevel(ctx);
10755	callstack_pop(ctx, FC_LOOP);
10756	return 0;
10757}
10758
10759static int tgsi_loop_brk_cont(struct r600_shader_ctx *ctx)
10760{
10761	unsigned int fscp;
10762
10763	for (fscp = ctx->bc->fc_sp; fscp > 0; fscp--)
10764	{
10765		if (FC_LOOP == ctx->bc->fc_stack[fscp - 1].type)
10766			break;
10767	}
10768
10769	if (fscp == 0) {
10770		R600_ERR("Break not inside loop/endloop pair\n");
10771		return -EINVAL;
10772	}
10773
10774	r600_bytecode_add_cfinst(ctx->bc, ctx->inst_info->op);
10775
10776	fc_set_mid(ctx, fscp - 1);
10777
10778	return 0;
10779}
10780
10781static int tgsi_gs_emit(struct r600_shader_ctx *ctx)
10782{
10783	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
10784	int stream = ctx->literals[inst->Src[0].Register.Index * 4 + inst->Src[0].Register.SwizzleX];
10785	int r;
10786
10787	if (ctx->inst_info->op == CF_OP_EMIT_VERTEX)
10788		emit_gs_ring_writes(ctx, ctx->gs_stream_output_info, stream, TRUE);
10789
10790	r = r600_bytecode_add_cfinst(ctx->bc, ctx->inst_info->op);
10791	if (!r) {
10792		ctx->bc->cf_last->count = stream; // Count field for CUT/EMIT_VERTEX indicates which stream
10793		if (ctx->inst_info->op == CF_OP_EMIT_VERTEX)
10794			return emit_inc_ring_offset(ctx, stream, TRUE);
10795	}
10796	return r;
10797}
10798
/* UMAD: dst = src0 * src1 + src2 (unsigned integer).  Emitted as
 * MULLO_UINT into temp_reg followed by ADD_INT into the destination,
 * one pair of instructions per written channel. */
static int tgsi_umad(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	struct r600_bytecode_alu alu;
	int i, j, r;
	int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);

	/* src0 * src1 */
	for (i = 0; i < lasti + 1; i++) {
		if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
			continue;

		memset(&alu, 0, sizeof(struct r600_bytecode_alu));

		alu.dst.chan = i;
		alu.dst.sel = ctx->temp_reg;
		alu.dst.write = 1;

		alu.op = ALU_OP2_MULLO_UINT;
		for (j = 0; j < 2; j++) {
			r600_bytecode_src(&alu.src[j], &ctx->src[j], i);
		}

		alu.last = 1;
		/* integer multiply needs chip-specific handling */
		r = emit_mul_int_op(ctx->bc, &alu);
		if (r)
			return r;
	}


	/* dst = temp + src2 */
	for (i = 0; i < lasti + 1; i++) {
		if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
			continue;

		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);

		alu.op = ALU_OP2_ADD_INT;

		alu.src[0].sel = ctx->temp_reg;
		alu.src[0].chan = i;

		r600_bytecode_src(&alu.src[1], &ctx->src[2], i);
		if (i == lasti) {
			alu.last = 1;
		}
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}
	return 0;
}
10851
/* PK2H: pack src.xy (two f32) into one 32-bit value holding two f16,
 * low half from src.x, high half from src.y.  The combine step uses
 * MULADD_UINT24 with the literal 0x10000 as the shift. */
static int tgsi_pk2h(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	struct r600_bytecode_alu alu;
	int r, i;
	int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);

	/* temp.xy = f32_to_f16(src) */
	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
	alu.op = ALU_OP1_FLT32_TO_FLT16;
	alu.dst.chan = 0;
	alu.dst.sel = ctx->temp_reg;
	alu.dst.write = 1;
	r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
	r = r600_bytecode_add_alu(ctx->bc, &alu);
	if (r)
		return r;
	/* reuse the same instruction template for channel y */
	alu.dst.chan = 1;
	r600_bytecode_src(&alu.src[0], &ctx->src[0], 1);
	alu.last = 1;
	r = r600_bytecode_add_alu(ctx->bc, &alu);
	if (r)
		return r;

	/* dst.x = temp.y * 0x10000 + temp.x */
	for (i = 0; i < lasti + 1; i++) {
		if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
			continue;

		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP3_MULADD_UINT24;
		alu.is_op3 = 1;
		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
		alu.last = i == lasti;
		alu.src[0].sel = ctx->temp_reg;
		alu.src[0].chan = 1;
		alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
		alu.src[1].value = 0x10000;
		alu.src[2].sel = ctx->temp_reg;
		alu.src[2].chan = 0;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}

	return 0;
}
10899
/* UP2H: unpack a 32-bit value holding two f16 into two f32 results —
 * even dst channels get the low half, odd channels the high half. */
static int tgsi_up2h(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	struct r600_bytecode_alu alu;
	int r, i;
	int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);

	/* temp.x = src.x */
	/* note: no need to mask out the high bits */
	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
	alu.op = ALU_OP1_MOV;
	alu.dst.chan = 0;
	alu.dst.sel = ctx->temp_reg;
	alu.dst.write = 1;
	r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
	r = r600_bytecode_add_alu(ctx->bc, &alu);
	if (r)
		return r;

	/* temp.y = src.x >> 16 */
	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
	alu.op = ALU_OP2_LSHR_INT;
	alu.dst.chan = 1;
	alu.dst.sel = ctx->temp_reg;
	alu.dst.write = 1;
	r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
	alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
	alu.src[1].value = 16;
	alu.last = 1;
	r = r600_bytecode_add_alu(ctx->bc, &alu);
	if (r)
		return r;

	/* dst.wz = dst.xy = f16_to_f32(temp.xy) */
	for (i = 0; i < lasti + 1; i++) {
		if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
			continue;
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
		alu.op = ALU_OP1_FLT16_TO_FLT32;
		alu.src[0].sel = ctx->temp_reg;
		alu.src[0].chan = i % 2;	/* x/z read temp.x, y/w read temp.y */
		alu.last = i == lasti;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}

	return 0;
}
10950
/* BFE with the API-mandated edge case: when the bit count (src2) is >= 32
 * the result is the unmodified input rather than what the hw op produces.
 * Emit the op3 BFE first, then per channel compute (src2 >= 32) and use
 * CNDE_INT to select between the BFE result and the raw src0 value.
 * A temporary destination is used when dst aliases src0 or src2, so the
 * patch-up still reads the original operands. */
static int tgsi_bfe(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	struct r600_bytecode_alu alu;
	int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
	int r, i;
	int dst = -1;	/* -1 means the BFE wrote the real destination */

	if ((inst->Src[0].Register.File == inst->Dst[0].Register.File &&
	     inst->Src[0].Register.Index == inst->Dst[0].Register.Index) ||
	    (inst->Src[2].Register.File == inst->Dst[0].Register.File &&
	     inst->Src[2].Register.Index == inst->Dst[0].Register.Index))
		dst = r600_get_temp(ctx);

	/* emit the plain BFE (into the temp when aliasing was detected) */
	r = tgsi_op3_dst(ctx, dst);
	if (r)
		return r;

	/* temp_reg = (src2 >= 32) per channel */
	for (i = 0; i < lasti + 1; i++) {
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP2_SETGE_INT;
		r600_bytecode_src(&alu.src[0], &ctx->src[2], i);
		alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
		alu.src[1].value = 32;
		alu.dst.sel = ctx->temp_reg;
		alu.dst.chan = i;
		alu.dst.write = 1;
		if (i == lasti)
			alu.last = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}

	/* dst = (src2 >= 32) ? src0 : bfe_result */
	for (i = 0; i < lasti + 1; i++) {
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP3_CNDE_INT;
		alu.is_op3 = 1;
		alu.src[0].sel = ctx->temp_reg;
		alu.src[0].chan = i;

		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
		if (dst != -1)
			alu.src[1].sel = dst;
		else
			alu.src[1].sel = alu.dst.sel;
		alu.src[1].chan = i;
		r600_bytecode_src(&alu.src[2], &ctx->src[0], i);
		alu.dst.write = 1;
		if (i == lasti)
			alu.last = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}

	return 0;
}
11009
11010static int tgsi_clock(struct r600_shader_ctx *ctx)
11011{
11012	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
11013	struct r600_bytecode_alu alu;
11014	int r;
11015
11016	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
11017	alu.op = ALU_OP1_MOV;
11018	tgsi_dst(ctx, &inst->Dst[0], 0, &alu.dst);
11019	alu.src[0].sel = EG_V_SQ_ALU_SRC_TIME_LO;
11020	r = r600_bytecode_add_alu(ctx->bc, &alu);
11021	if (r)
11022		return r;
11023	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
11024	alu.op = ALU_OP1_MOV;
11025	tgsi_dst(ctx, &inst->Dst[0], 1, &alu.dst);
11026	alu.src[0].sel = EG_V_SQ_ALU_SRC_TIME_HI;
11027	alu.last = 1;
11028	r = r600_bytecode_add_alu(ctx->bc, &alu);
11029	if (r)
11030		return r;
11031	return 0;
11032}
11033
11034static int emit_u64add(struct r600_shader_ctx *ctx, int op,
11035		       int treg,
11036		       int src0_sel, int src0_chan,
11037		       int src1_sel, int src1_chan)
11038{
11039	struct r600_bytecode_alu alu;
11040	int r;
11041	int opc;
11042
11043	if (op == ALU_OP2_ADD_INT)
11044		opc = ALU_OP2_ADDC_UINT;
11045	else
11046		opc = ALU_OP2_SUBB_UINT;
11047
11048	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
11049	alu.op = op;            ;
11050	alu.dst.sel = treg;
11051	alu.dst.chan = 0;
11052	alu.dst.write = 1;
11053	alu.src[0].sel = src0_sel;
11054	alu.src[0].chan = src0_chan + 0;
11055	alu.src[1].sel = src1_sel;
11056	alu.src[1].chan = src1_chan + 0;
11057	alu.src[1].neg = 0;
11058	r = r600_bytecode_add_alu(ctx->bc, &alu);
11059	if (r)
11060		return r;
11061
11062	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
11063	alu.op = op;
11064	alu.dst.sel = treg;
11065	alu.dst.chan = 1;
11066	alu.dst.write = 1;
11067	alu.src[0].sel = src0_sel;
11068	alu.src[0].chan = src0_chan + 1;
11069	alu.src[1].sel = src1_sel;
11070	alu.src[1].chan = src1_chan + 1;
11071	alu.src[1].neg = 0;
11072	r = r600_bytecode_add_alu(ctx->bc, &alu);
11073	if (r)
11074		return r;
11075
11076	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
11077	alu.op = opc;
11078	alu.dst.sel = treg;
11079	alu.dst.chan = 2;
11080	alu.dst.write = 1;
11081	alu.last = 1;
11082	alu.src[0].sel = src0_sel;
11083	alu.src[0].chan = src0_chan + 0;
11084	alu.src[1].sel = src1_sel;
11085	alu.src[1].chan = src1_chan + 0;
11086	alu.src[1].neg = 0;
11087	r = r600_bytecode_add_alu(ctx->bc, &alu);
11088	if (r)
11089		return r;
11090
11091	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
11092	alu.op = op;
11093	alu.dst.sel = treg;
11094	alu.dst.chan = 1;
11095	alu.dst.write = 1;
11096	alu.src[0].sel = treg;
11097	alu.src[0].chan = 1;
11098	alu.src[1].sel = treg;
11099	alu.src[1].chan = 2;
11100	alu.last = 1;
11101	r = r600_bytecode_add_alu(ctx->bc, &alu);
11102	if (r)
11103		return r;
11104	return 0;
11105}
11106
11107static int egcm_u64add(struct r600_shader_ctx *ctx)
11108{
11109	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
11110	struct r600_bytecode_alu alu;
11111	int r;
11112	int treg = ctx->temp_reg;
11113	int op = ALU_OP2_ADD_INT, opc = ALU_OP2_ADDC_UINT;
11114
11115	if (ctx->src[1].neg) {
11116		op = ALU_OP2_SUB_INT;
11117		opc = ALU_OP2_SUBB_UINT;
11118	}
11119	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
11120	alu.op = op;            ;
11121	alu.dst.sel = treg;
11122	alu.dst.chan = 0;
11123	alu.dst.write = 1;
11124	r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
11125	r600_bytecode_src(&alu.src[1], &ctx->src[1], 0);
11126	alu.src[1].neg = 0;
11127	r = r600_bytecode_add_alu(ctx->bc, &alu);
11128	if (r)
11129		return r;
11130
11131	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
11132	alu.op = op;
11133	alu.dst.sel = treg;
11134	alu.dst.chan = 1;
11135	alu.dst.write = 1;
11136	r600_bytecode_src(&alu.src[0], &ctx->src[0], 1);
11137	r600_bytecode_src(&alu.src[1], &ctx->src[1], 1);
11138	alu.src[1].neg = 0;
11139	r = r600_bytecode_add_alu(ctx->bc, &alu);
11140	if (r)
11141		return r;
11142
11143	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
11144	alu.op = opc              ;
11145	alu.dst.sel = treg;
11146	alu.dst.chan = 2;
11147	alu.dst.write = 1;
11148	alu.last = 1;
11149	r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
11150	r600_bytecode_src(&alu.src[1], &ctx->src[1], 0);
11151	alu.src[1].neg = 0;
11152	r = r600_bytecode_add_alu(ctx->bc, &alu);
11153	if (r)
11154		return r;
11155
11156	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
11157	alu.op = op;
11158	tgsi_dst(ctx, &inst->Dst[0], 1, &alu.dst);
11159	alu.src[0].sel = treg;
11160	alu.src[0].chan = 1;
11161	alu.src[1].sel = treg;
11162	alu.src[1].chan = 2;
11163	alu.last = 1;
11164	r = r600_bytecode_add_alu(ctx->bc, &alu);
11165	if (r)
11166		return r;
11167	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
11168	alu.op = ALU_OP1_MOV;
11169	tgsi_dst(ctx, &inst->Dst[0], 0, &alu.dst);
11170	alu.src[0].sel = treg;
11171	alu.src[0].chan = 0;
11172	alu.last = 1;
11173	r = r600_bytecode_add_alu(ctx->bc, &alu);
11174	if (r)
11175		return r;
11176	return 0;
11177}
11178
11179
11180static int egcm_i64neg(struct r600_shader_ctx *ctx)
11181{
11182	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
11183	struct r600_bytecode_alu alu;
11184	int r;
11185	int treg = ctx->temp_reg;
11186	const int op = ALU_OP2_SUB_INT;
11187	const int opc = ALU_OP2_SUBB_UINT;
11188
11189	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
11190	alu.op = op;            ;
11191	alu.dst.sel = treg;
11192	alu.dst.chan = 0;
11193	alu.dst.write = 1;
11194	alu.src[0].sel = V_SQ_ALU_SRC_0;
11195	r600_bytecode_src(&alu.src[1], &ctx->src[0], 0);
11196	alu.src[1].neg = 0;
11197	r = r600_bytecode_add_alu(ctx->bc, &alu);
11198	if (r)
11199		return r;
11200
11201	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
11202	alu.op = op;
11203	alu.dst.sel = treg;
11204	alu.dst.chan = 1;
11205	alu.dst.write = 1;
11206	alu.src[0].sel = V_SQ_ALU_SRC_0;
11207	r600_bytecode_src(&alu.src[1], &ctx->src[0], 1);
11208	alu.src[1].neg = 0;
11209	r = r600_bytecode_add_alu(ctx->bc, &alu);
11210	if (r)
11211		return r;
11212
11213	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
11214	alu.op = opc              ;
11215	alu.dst.sel = treg;
11216	alu.dst.chan = 2;
11217	alu.dst.write = 1;
11218	alu.last = 1;
11219	alu.src[0].sel = V_SQ_ALU_SRC_0;
11220	r600_bytecode_src(&alu.src[1], &ctx->src[0], 0);
11221	alu.src[1].neg = 0;
11222	r = r600_bytecode_add_alu(ctx->bc, &alu);
11223	if (r)
11224		return r;
11225
11226	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
11227	alu.op = op;
11228	tgsi_dst(ctx, &inst->Dst[0], 1, &alu.dst);
11229	alu.src[0].sel = treg;
11230	alu.src[0].chan = 1;
11231	alu.src[1].sel = treg;
11232	alu.src[1].chan = 2;
11233	alu.last = 1;
11234	r = r600_bytecode_add_alu(ctx->bc, &alu);
11235	if (r)
11236		return r;
11237	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
11238	alu.op = ALU_OP1_MOV;
11239	tgsi_dst(ctx, &inst->Dst[0], 0, &alu.dst);
11240	alu.src[0].sel = treg;
11241	alu.src[0].chan = 0;
11242	alu.last = 1;
11243	r = r600_bytecode_add_alu(ctx->bc, &alu);
11244	if (r)
11245		return r;
11246	return 0;
11247}
11248
/* 64-bit unsigned multiply (low 64 bits of the product):
 *   dst.x = lo32(a.x * b.x)
 *   dst.y = hi32(a.x * b.x) + lo32(a.x * b.y) + lo32(a.y * b.x)
 * i.e. the standard schoolbook 64x64 -> 64 decomposition, where .x is the
 * low dword and .y the high dword of each operand.
 */
static int egcm_u64mul(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	struct r600_bytecode_alu alu;
	int r;
	int treg = ctx->temp_reg;

	/* temp.x = mul_lo a.x, b.x */
	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
	alu.op = ALU_OP2_MULLO_UINT;
	alu.dst.sel = treg;
	alu.dst.chan = 0;
	alu.dst.write = 1;
	r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
	r600_bytecode_src(&alu.src[1], &ctx->src[1], 0);
	r = emit_mul_int_op(ctx->bc, &alu);
	if (r)
		return r;

	/* temp.y = mul_hi a.x, b.x */
	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
	alu.op = ALU_OP2_MULHI_UINT;
	alu.dst.sel = treg;
	alu.dst.chan = 1;
	alu.dst.write = 1;
	r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
	r600_bytecode_src(&alu.src[1], &ctx->src[1], 0);
	r = emit_mul_int_op(ctx->bc, &alu);
	if (r)
		return r;

	/* temp.z = mul a.x, b.y */
	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
	alu.op = ALU_OP2_MULLO_UINT;
	alu.dst.sel = treg;
	alu.dst.chan = 2;
	alu.dst.write = 1;
	r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
	r600_bytecode_src(&alu.src[1], &ctx->src[1], 1);
	r = emit_mul_int_op(ctx->bc, &alu);
	if (r)
		return r;

	/* temp.w = mul a.y, b.x */
	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
	alu.op = ALU_OP2_MULLO_UINT;
	alu.dst.sel = treg;
	alu.dst.chan = 3;
	alu.dst.write = 1;
	r600_bytecode_src(&alu.src[0], &ctx->src[0], 1);
	r600_bytecode_src(&alu.src[1], &ctx->src[1], 0);
	r = emit_mul_int_op(ctx->bc, &alu);
	if (r)
		return r;

	/* temp.z = temp.z + temp.w (sum of the two cross terms) */
	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
	alu.op = ALU_OP2_ADD_INT;
	alu.dst.sel = treg;
	alu.dst.chan = 2;
	alu.dst.write = 1;
	alu.src[0].sel = treg;
	alu.src[0].chan = 2;
	alu.src[1].sel = treg;
	alu.src[1].chan = 3;
	alu.last = 1;
	r = r600_bytecode_add_alu(ctx->bc, &alu);
	if (r)
		return r;

	/* temp.y = temp.y + temp.z (add cross terms to the high dword) */
	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
	alu.op = ALU_OP2_ADD_INT;
	alu.dst.sel = treg;
	alu.dst.chan = 1;
	alu.dst.write = 1;
	alu.src[0].sel = treg;
	alu.src[0].chan = 1;
	alu.src[1].sel = treg;
	alu.src[1].chan = 2;
	alu.last = 1;
	r = r600_bytecode_add_alu(ctx->bc, &alu);
	if (r)
		return r;

	/* dst.x = temp.x (no alu.last: grouped with the following MOV) */
	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
	alu.op = ALU_OP1_MOV;
	tgsi_dst(ctx, &inst->Dst[0], 0, &alu.dst);
	alu.src[0].sel = treg;
	alu.src[0].chan = 0;
	r = r600_bytecode_add_alu(ctx->bc, &alu);
	if (r)
		return r;

	/* dst.y = temp.y */
	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
	alu.op = ALU_OP1_MOV;
	tgsi_dst(ctx, &inst->Dst[0], 1, &alu.dst);
	alu.src[0].sel = treg;
	alu.src[0].chan = 1;
	alu.last = 1;
	r = r600_bytecode_add_alu(ctx->bc, &alu);
	if (r)
		return r;

	return 0;
}
11361
/* Emit a 64-bit unsigned >= comparison:
 *   treg.x = (src0 >= src1)  as a 0/~0 integer result.
 * src0 is (src0_sel.[base_chan], src0_sel.[base_chan+1]) = (lo, hi),
 * and likewise for src1. treg.y and treg.z are clobbered as scratch.
 */
static int emit_u64sge(struct r600_shader_ctx *ctx,
		       int treg,
		       int src0_sel, int src0_base_chan,
		       int src1_sel, int src1_base_chan)
{
	int r;
	/* for 64-bit sge */
	/* result = (src0.y > src1.y) || ((src0.y == src1.y) && (src0.x >= src1.x)) */
	/* treg.y = src0.hi > src1.hi */
	r = single_alu_op2(ctx, ALU_OP2_SETGT_UINT,
			   treg, 1,
			   src0_sel, src0_base_chan + 1,
			   src1_sel, src1_base_chan + 1);
	if (r)
		return r;

	/* treg.x = src0.lo >= src1.lo */
	r = single_alu_op2(ctx, ALU_OP2_SETGE_UINT,
			   treg, 0,
			   src0_sel, src0_base_chan,
			   src1_sel, src1_base_chan);
	if (r)
		return r;

	/* treg.z = src0.hi == src1.hi */
	r = single_alu_op2(ctx, ALU_OP2_SETE_INT,
			   treg, 2,
			   src0_sel, src0_base_chan + 1,
			   src1_sel, src1_base_chan + 1);
	if (r)
		return r;

	/* treg.x = (lo >= lo') && (hi == hi') */
	r = single_alu_op2(ctx, ALU_OP2_AND_INT,
			   treg, 0,
			   treg, 0,
			   treg, 2);
	if (r)
		return r;

	/* treg.x |= (hi > hi') */
	r = single_alu_op2(ctx, ALU_OP2_OR_INT,
			   treg, 0,
			   treg, 0,
			   treg, 1);
	if (r)
		return r;
	return 0;
}
11406
/* 64-bit unsigned division — this isn't a complete div, it's just enough
 * for the qbo shader to work: the divisor must be a literal constant whose
 * high dword is zero, and the instruction must write exactly dst.xy.
 * Implements shift-and-subtract long division, fully unrolled with the
 * shifted divisor values computed at compile time, guarded by predicated
 * IF blocks at run time. Returns -1 if the restrictions aren't met.
 */
static int egcm_u64div(struct r600_shader_ctx *ctx)
{
	struct r600_bytecode_alu alu;
	struct r600_bytecode_alu_src alu_num_hi, alu_num_lo, alu_denom_hi, alu_denom_lo, alu_src;
	int r, i;
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;

	/* make sure we are dividing by a const with 0 in the high bits */
	if (ctx->src[1].sel != V_SQ_ALU_SRC_LITERAL)
		return -1;
	if (ctx->src[1].value[ctx->src[1].swizzle[1]] != 0)
		return -1;
	/* make sure we are doing one division */
	if (inst->Dst[0].Register.WriteMask != 0x3)
		return -1;

	/* emit_if uses ctx->temp_reg so we can't */
	int treg = r600_get_temp(ctx);
	int tmp_num = r600_get_temp(ctx);
	int sub_tmp = r600_get_temp(ctx);

	/* tmp quot are tmp_num.zw */
	/* (alu_denom_hi is initialized for symmetry but unused: its value is
	 * known to be zero from the checks above) */
	r600_bytecode_src(&alu_num_lo, &ctx->src[0], 0);
	r600_bytecode_src(&alu_num_hi, &ctx->src[0], 1);
	r600_bytecode_src(&alu_denom_lo, &ctx->src[1], 0);
	r600_bytecode_src(&alu_denom_hi, &ctx->src[1], 1);

	/* MOV tmp_num.xy, numerator */
	r = single_alu_op2(ctx, ALU_OP1_MOV,
			   tmp_num, 0,
			   alu_num_lo.sel, alu_num_lo.chan,
			   0, 0);
	if (r)
		return r;
	r = single_alu_op2(ctx, ALU_OP1_MOV,
			   tmp_num, 1,
			   alu_num_hi.sel, alu_num_hi.chan,
			   0, 0);
	if (r)
		return r;

	/* zero the running quotient in tmp_num.zw */
	r = single_alu_op2(ctx, ALU_OP1_MOV,
			   tmp_num, 2,
			   V_SQ_ALU_SRC_LITERAL, 0,
			   0, 0);
	if (r)
		return r;

	r = single_alu_op2(ctx, ALU_OP1_MOV,
			   tmp_num, 3,
			   V_SQ_ALU_SRC_LITERAL, 0,
			   0, 0);
	if (r)
		return r;

	/* treg 0 is log2_denom */
	/* normally this gets the MSB for the denom high value
	   - however we know this will always be 0 here. */
	r = single_alu_op2(ctx,
			   ALU_OP1_MOV,
			   treg, 0,
			   V_SQ_ALU_SRC_LITERAL, 32,
			   0, 0);
	if (r)
		return r;

	/* normally check denom hi for 0, but we know it is already */
	/* t0.y = num_hi >= denom_lo */
	r = single_alu_op2(ctx,
			   ALU_OP2_SETGE_UINT,
			   treg, 1,
			   alu_num_hi.sel, alu_num_hi.chan,
			   V_SQ_ALU_SRC_LITERAL, alu_denom_lo.value);
	if (r)
		return r;

	/* IF (num_hi >= denom_lo): the quotient's high dword is nonzero */
	memset(&alu_src, 0, sizeof(alu_src));
	alu_src.sel = treg;
	alu_src.chan = 1;
	r = emit_if(ctx, ALU_OP2_PRED_SETNE_INT, &alu_src);
	if (r)
		return r;

	/* for loops in here */
	/* get msb t0.x = msb(src[1].x) first */
	int msb_lo = util_last_bit(alu_denom_lo.value);
	r = single_alu_op2(ctx, ALU_OP1_MOV,
			   treg, 0,
			   V_SQ_ALU_SRC_LITERAL, msb_lo,
			   0, 0);
	if (r)
		return r;

	/* unroll the asm here: first long-division loop, producing the high
	 * quotient dword in tmp_num.w, bit (31 - i) per iteration */
	for (i = 0; i < 31; i++) {
		/* t0.z = (iteration index >= log2_denom), i.e. this bit is in range */
		r = single_alu_op2(ctx, ALU_OP2_SETGE_UINT,
				   treg, 2,
				   V_SQ_ALU_SRC_LITERAL, i,
				   treg, 0);
		if (r)
			return r;

		/* we can do this on the CPU */
		uint32_t denom_lo_shl = alu_denom_lo.value << (31 - i);
		/* t0.y = (tmp_num.y >= denom_lo_shl) */
		r = single_alu_op2(ctx, ALU_OP2_SETGE_UINT,
				   treg, 1,
				   tmp_num, 1,
				   V_SQ_ALU_SRC_LITERAL, denom_lo_shl);
		if (r)
			return r;

		/* t0.y &= t0.z: subtract only if the bit is in range too */
		r = single_alu_op2(ctx, ALU_OP2_AND_INT,
				   treg, 1,
				   treg, 1,
				   treg, 2);
		if (r)
			return r;

		memset(&alu_src, 0, sizeof(alu_src));
		alu_src.sel = treg;
		alu_src.chan = 1;
		r = emit_if(ctx, ALU_OP2_PRED_SETNE_INT, &alu_src);
		if (r)
			return r;

		/* tmp_num.y -= denom_lo_shl */
		r = single_alu_op2(ctx, ALU_OP2_SUB_INT,
				   tmp_num, 1,
				   tmp_num, 1,
				   V_SQ_ALU_SRC_LITERAL, denom_lo_shl);
		if (r)
			return r;

		/* set quotient-high bit (31 - i) */
		r = single_alu_op2(ctx, ALU_OP2_OR_INT,
				   tmp_num, 3,
				   tmp_num, 3,
				   V_SQ_ALU_SRC_LITERAL, 1U << (31 - i));
		if (r)
			return r;

		r = tgsi_endif(ctx);
		if (r)
			return r;
	}

	/* log2_denom is always <= 31, so manually peel the last loop
	 * iteration.
	 */
	r = single_alu_op2(ctx, ALU_OP2_SETGE_UINT,
			   treg, 1,
			   tmp_num, 1,
			   V_SQ_ALU_SRC_LITERAL, alu_denom_lo.value);
	if (r)
		return r;

	memset(&alu_src, 0, sizeof(alu_src));
	alu_src.sel = treg;
	alu_src.chan = 1;
	r = emit_if(ctx, ALU_OP2_PRED_SETNE_INT, &alu_src);
	if (r)
		return r;

	r = single_alu_op2(ctx, ALU_OP2_SUB_INT,
			   tmp_num, 1,
			   tmp_num, 1,
			   V_SQ_ALU_SRC_LITERAL, alu_denom_lo.value);
	if (r)
		return r;

	r = single_alu_op2(ctx, ALU_OP2_OR_INT,
			   tmp_num, 3,
			   tmp_num, 3,
			   V_SQ_ALU_SRC_LITERAL, 1U);
	if (r)
		return r;
	r = tgsi_endif(ctx);
	if (r)
		return r;

	/* close the outer IF (num_hi >= denom_lo) */
	r = tgsi_endif(ctx);
	if (r)
		return r;

	/* onto the second loop to unroll: 64-bit long division producing the
	 * low quotient dword in tmp_num.z */
	for (i = 0; i < 31; i++) {
		/* t0.y = (bit index 63-(31-i) >= log2_denom) */
		r = single_alu_op2(ctx, ALU_OP2_SETGE_UINT,
				   treg, 1,
				   V_SQ_ALU_SRC_LITERAL, (63 - (31 - i)),
				   treg, 0);
		if (r)
			return r;

		/* 64-bit shifted divisor, split across treg.zw */
		uint64_t denom_shl = (uint64_t)alu_denom_lo.value << (31 - i);
		r = single_alu_op2(ctx, ALU_OP1_MOV,
				   treg, 2,
				   V_SQ_ALU_SRC_LITERAL, (denom_shl & 0xffffffff),
				   0, 0);
		if (r)
			return r;

		r = single_alu_op2(ctx, ALU_OP1_MOV,
				   treg, 3,
				   V_SQ_ALU_SRC_LITERAL, (denom_shl >> 32),
				   0, 0);
		if (r)
			return r;

		/* sub_tmp.x = (remainder >= shifted divisor), 64-bit compare */
		r = emit_u64sge(ctx, sub_tmp,
				tmp_num, 0,
				treg, 2);
		if (r)
			return r;

		r = single_alu_op2(ctx, ALU_OP2_AND_INT,
				   treg, 1,
				   treg, 1,
				   sub_tmp, 0);
		if (r)
			return r;

		memset(&alu_src, 0, sizeof(alu_src));
		alu_src.sel = treg;
		alu_src.chan = 1;
		r = emit_if(ctx, ALU_OP2_PRED_SETNE_INT, &alu_src);
		if (r)
			return r;


		/* remainder -= shifted divisor (64-bit subtract into sub_tmp) */
		r = emit_u64add(ctx, ALU_OP2_SUB_INT,
				sub_tmp,
				tmp_num, 0,
				treg, 2);
		if (r)
			return r;

		r = single_alu_op2(ctx, ALU_OP1_MOV,
				   tmp_num, 0,
				   sub_tmp, 0,
				   0, 0);
		if (r)
			return r;

		r = single_alu_op2(ctx, ALU_OP1_MOV,
				   tmp_num, 1,
				   sub_tmp, 1,
				   0, 0);
		if (r)
			return r;

		/* set quotient-low bit (31 - i) */
		r = single_alu_op2(ctx, ALU_OP2_OR_INT,
				   tmp_num, 2,
				   tmp_num, 2,
				   V_SQ_ALU_SRC_LITERAL, 1U << (31 - i));
		if (r)
			return r;

		r = tgsi_endif(ctx);
		if (r)
			return r;
	}

	/* log2_denom is always <= 63, so manually peel the last loop
	 * iteration.
	 */
	uint64_t denom_shl = (uint64_t)alu_denom_lo.value;
	r = single_alu_op2(ctx, ALU_OP1_MOV,
			   treg, 2,
			   V_SQ_ALU_SRC_LITERAL, (denom_shl & 0xffffffff),
			   0, 0);
	if (r)
		return r;

	r = single_alu_op2(ctx, ALU_OP1_MOV,
			   treg, 3,
			   V_SQ_ALU_SRC_LITERAL, (denom_shl >> 32),
			   0, 0);
	if (r)
		return r;

	r = emit_u64sge(ctx, sub_tmp,
			tmp_num, 0,
			treg, 2);
	if (r)
		return r;

	memset(&alu_src, 0, sizeof(alu_src));
	alu_src.sel = sub_tmp;
	alu_src.chan = 0;
	r = emit_if(ctx, ALU_OP2_PRED_SETNE_INT, &alu_src);
	if (r)
		return r;

	r = emit_u64add(ctx, ALU_OP2_SUB_INT,
			sub_tmp,
			tmp_num, 0,
			treg, 2);
	if (r)
		return r;

	r = single_alu_op2(ctx, ALU_OP2_OR_INT,
			   tmp_num, 2,
			   tmp_num, 2,
			   V_SQ_ALU_SRC_LITERAL, 1U);
	if (r)
		return r;
	r = tgsi_endif(ctx);
	if (r)
		return r;

	/* dst.x = quotient low dword (tmp_num.z) */
	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
	alu.op = ALU_OP1_MOV;
	tgsi_dst(ctx, &inst->Dst[0], 0, &alu.dst);
	alu.src[0].sel = tmp_num;
	alu.src[0].chan = 2;
	r = r600_bytecode_add_alu(ctx->bc, &alu);
	if (r)
		return r;

	/* dst.y = quotient high dword (tmp_num.w) */
	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
	alu.op = ALU_OP1_MOV;
	tgsi_dst(ctx, &inst->Dst[0], 1, &alu.dst);
	alu.src[0].sel = tmp_num;
	alu.src[0].chan = 3;
	alu.last = 1;
	r = r600_bytecode_add_alu(ctx->bc, &alu);
	if (r)
		return r;
	return 0;
}
11737
11738static int egcm_u64sne(struct r600_shader_ctx *ctx)
11739{
11740	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
11741	struct r600_bytecode_alu alu;
11742	int r;
11743	int treg = ctx->temp_reg;
11744
11745	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
11746	alu.op = ALU_OP2_SETNE_INT;
11747	alu.dst.sel = treg;
11748	alu.dst.chan = 0;
11749	alu.dst.write = 1;
11750	r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
11751	r600_bytecode_src(&alu.src[1], &ctx->src[1], 0);
11752	r = r600_bytecode_add_alu(ctx->bc, &alu);
11753	if (r)
11754		return r;
11755
11756	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
11757	alu.op = ALU_OP2_SETNE_INT;
11758	alu.dst.sel = treg;
11759	alu.dst.chan = 1;
11760	alu.dst.write = 1;
11761	r600_bytecode_src(&alu.src[0], &ctx->src[0], 1);
11762	r600_bytecode_src(&alu.src[1], &ctx->src[1], 1);
11763	alu.last = 1;
11764	r = r600_bytecode_add_alu(ctx->bc, &alu);
11765	if (r)
11766		return r;
11767
11768	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
11769	alu.op = ALU_OP2_OR_INT;
11770	tgsi_dst(ctx, &inst->Dst[0], 0, &alu.dst);
11771	alu.src[0].sel = treg;
11772	alu.src[0].chan = 0;
11773	alu.src[1].sel = treg;
11774	alu.src[1].chan = 1;
11775	alu.last = 1;
11776	r = r600_bytecode_add_alu(ctx->bc, &alu);
11777	if (r)
11778		return r;
11779	return 0;
11780}
11781
/* Dispatch table for pre-evergreen (R600-class) GPUs, indexed by
 * TGSI_OPCODE_*. Each entry pairs the hardware opcode (ALU/CF/FETCH)
 * with the translation callback that emits bytecode for it. Bare
 * numeric indices correspond to opcodes removed from TGSI; they are
 * kept as tgsi_unsupported placeholders so the table stays dense. */
static const struct r600_shader_tgsi_instruction r600_shader_tgsi_instruction[] = {
	[TGSI_OPCODE_ARL]	= { ALU_OP0_NOP, tgsi_r600_arl},
	[TGSI_OPCODE_MOV]	= { ALU_OP1_MOV, tgsi_op2},
	[TGSI_OPCODE_LIT]	= { ALU_OP0_NOP, tgsi_lit},

	[TGSI_OPCODE_RCP]	= { ALU_OP1_RECIP_IEEE, tgsi_trans_srcx_replicate},

	[TGSI_OPCODE_RSQ]	= { ALU_OP0_NOP, tgsi_rsq},
	[TGSI_OPCODE_EXP]	= { ALU_OP0_NOP, tgsi_exp},
	[TGSI_OPCODE_LOG]	= { ALU_OP0_NOP, tgsi_log},
	[TGSI_OPCODE_MUL]	= { ALU_OP2_MUL_IEEE, tgsi_op2},
	[TGSI_OPCODE_ADD]	= { ALU_OP2_ADD, tgsi_op2},
	[TGSI_OPCODE_DP3]	= { ALU_OP2_DOT4_IEEE, tgsi_dp},
	[TGSI_OPCODE_DP4]	= { ALU_OP2_DOT4_IEEE, tgsi_dp},
	[TGSI_OPCODE_DST]	= { ALU_OP0_NOP, tgsi_opdst},
	/* MIN_DX10 returns non-nan result if one src is NaN, MIN returns NaN */
	[TGSI_OPCODE_MIN]	= { ALU_OP2_MIN_DX10, tgsi_op2},
	[TGSI_OPCODE_MAX]	= { ALU_OP2_MAX_DX10, tgsi_op2},
	[TGSI_OPCODE_SLT]	= { ALU_OP2_SETGT, tgsi_op2_swap},
	[TGSI_OPCODE_SGE]	= { ALU_OP2_SETGE, tgsi_op2},
	[TGSI_OPCODE_MAD]	= { ALU_OP3_MULADD_IEEE, tgsi_op3},
	[TGSI_OPCODE_LRP]	= { ALU_OP0_NOP, tgsi_lrp},
	[TGSI_OPCODE_FMA]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_SQRT]	= { ALU_OP1_SQRT_IEEE, tgsi_trans_srcx_replicate},
	[21]	= { ALU_OP0_NOP, tgsi_unsupported},
	[22]			= { ALU_OP0_NOP, tgsi_unsupported},
	[23]			= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_FRC]	= { ALU_OP1_FRACT, tgsi_op2},
	[25]			= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_FLR]	= { ALU_OP1_FLOOR, tgsi_op2},
	[TGSI_OPCODE_ROUND]	= { ALU_OP1_RNDNE, tgsi_op2},
	[TGSI_OPCODE_EX2]	= { ALU_OP1_EXP_IEEE, tgsi_trans_srcx_replicate},
	[TGSI_OPCODE_LG2]	= { ALU_OP1_LOG_IEEE, tgsi_trans_srcx_replicate},
	[TGSI_OPCODE_POW]	= { ALU_OP0_NOP, tgsi_pow},
	[31]	= { ALU_OP0_NOP, tgsi_unsupported},
	[32]			= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_CLOCK]     = { ALU_OP0_NOP, tgsi_unsupported},
	[34]			= { ALU_OP0_NOP, tgsi_unsupported},
	[35]			= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_COS]	= { ALU_OP1_COS, tgsi_trig},
	[TGSI_OPCODE_DDX]	= { FETCH_OP_GET_GRADIENTS_H, tgsi_tex},
	[TGSI_OPCODE_DDY]	= { FETCH_OP_GET_GRADIENTS_V, tgsi_tex},
	[TGSI_OPCODE_KILL]	= { ALU_OP2_KILLGT, tgsi_kill},  /* unconditional kill */
	[TGSI_OPCODE_PK2H]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_PK2US]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_PK4B]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_PK4UB]	= { ALU_OP0_NOP, tgsi_unsupported},
	[44]			= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_SEQ]	= { ALU_OP2_SETE, tgsi_op2},
	[46]			= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_SGT]	= { ALU_OP2_SETGT, tgsi_op2},
	[TGSI_OPCODE_SIN]	= { ALU_OP1_SIN, tgsi_trig},
	[TGSI_OPCODE_SLE]	= { ALU_OP2_SETGE, tgsi_op2_swap},
	[TGSI_OPCODE_SNE]	= { ALU_OP2_SETNE, tgsi_op2},
	[51]			= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_TEX]	= { FETCH_OP_SAMPLE, tgsi_tex},
	[TGSI_OPCODE_TXD]	= { FETCH_OP_SAMPLE_G, tgsi_tex},
	[TGSI_OPCODE_TXP]	= { FETCH_OP_SAMPLE, tgsi_tex},
	[TGSI_OPCODE_UP2H]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_UP2US]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_UP4B]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_UP4UB]	= { ALU_OP0_NOP, tgsi_unsupported},
	[59]			= { ALU_OP0_NOP, tgsi_unsupported},
	[60]			= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_ARR]	= { ALU_OP0_NOP, tgsi_r600_arl},
	[62]			= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_CAL]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_RET]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_SSG]	= { ALU_OP0_NOP, tgsi_ssg},
	[TGSI_OPCODE_CMP]	= { ALU_OP0_NOP, tgsi_cmp},
	[67]			= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_TXB]	= { FETCH_OP_SAMPLE_LB, tgsi_tex},
	[69]			= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_DIV]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_DP2]	= { ALU_OP2_DOT4_IEEE, tgsi_dp},
	[TGSI_OPCODE_TXL]	= { FETCH_OP_SAMPLE_L, tgsi_tex},
	[TGSI_OPCODE_BRK]	= { CF_OP_LOOP_BREAK, tgsi_loop_brk_cont},
	[TGSI_OPCODE_IF]	= { ALU_OP0_NOP, tgsi_if},
	[TGSI_OPCODE_UIF]	= { ALU_OP0_NOP, tgsi_uif},
	[76]			= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_ELSE]	= { ALU_OP0_NOP, tgsi_else},
	[TGSI_OPCODE_ENDIF]	= { ALU_OP0_NOP, tgsi_endif},
	[TGSI_OPCODE_DDX_FINE]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_DDY_FINE]	= { ALU_OP0_NOP, tgsi_unsupported},
	[81]			= { ALU_OP0_NOP, tgsi_unsupported},
	[82]			= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_CEIL]	= { ALU_OP1_CEIL, tgsi_op2},
	[TGSI_OPCODE_I2F]	= { ALU_OP1_INT_TO_FLT, tgsi_op2_trans},
	[TGSI_OPCODE_NOT]	= { ALU_OP1_NOT_INT, tgsi_op2},
	[TGSI_OPCODE_TRUNC]	= { ALU_OP1_TRUNC, tgsi_op2},
	[TGSI_OPCODE_SHL]	= { ALU_OP2_LSHL_INT, tgsi_op2_trans},
	[88]			= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_AND]	= { ALU_OP2_AND_INT, tgsi_op2},
	[TGSI_OPCODE_OR]	= { ALU_OP2_OR_INT, tgsi_op2},
	[TGSI_OPCODE_MOD]	= { ALU_OP0_NOP, tgsi_imod},
	[TGSI_OPCODE_XOR]	= { ALU_OP2_XOR_INT, tgsi_op2},
	[93]			= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_TXF]	= { FETCH_OP_LD, tgsi_tex},
	[TGSI_OPCODE_TXQ]	= { FETCH_OP_GET_TEXTURE_RESINFO, tgsi_tex},
	[TGSI_OPCODE_CONT]	= { CF_OP_LOOP_CONTINUE, tgsi_loop_brk_cont},
	[TGSI_OPCODE_EMIT]	= { CF_OP_EMIT_VERTEX, tgsi_gs_emit},
	[TGSI_OPCODE_ENDPRIM]	= { CF_OP_CUT_VERTEX, tgsi_gs_emit},
	[TGSI_OPCODE_BGNLOOP]	= { ALU_OP0_NOP, tgsi_bgnloop},
	[TGSI_OPCODE_BGNSUB]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_ENDLOOP]	= { ALU_OP0_NOP, tgsi_endloop},
	[TGSI_OPCODE_ENDSUB]	= { ALU_OP0_NOP, tgsi_unsupported},
	[103]			= { FETCH_OP_GET_TEXTURE_RESINFO, tgsi_tex},
	[TGSI_OPCODE_TXQS]	= { FETCH_OP_GET_NUMBER_OF_SAMPLES, tgsi_tex},
	[TGSI_OPCODE_RESQ]	= { ALU_OP0_NOP, tgsi_unsupported},
	[106]			= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_NOP]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_FSEQ]	= { ALU_OP2_SETE_DX10, tgsi_op2},
	[TGSI_OPCODE_FSGE]	= { ALU_OP2_SETGE_DX10, tgsi_op2},
	[TGSI_OPCODE_FSLT]	= { ALU_OP2_SETGT_DX10, tgsi_op2_swap},
	[TGSI_OPCODE_FSNE]	= { ALU_OP2_SETNE_DX10, tgsi_op2_swap},
	[TGSI_OPCODE_MEMBAR]	= { ALU_OP0_NOP, tgsi_unsupported},
	[113]	= { ALU_OP0_NOP, tgsi_unsupported},
	[114]			= { ALU_OP0_NOP, tgsi_unsupported},
	[115]			= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_KILL_IF]	= { ALU_OP2_KILLGT, tgsi_kill},  /* conditional kill */
	[TGSI_OPCODE_END]	= { ALU_OP0_NOP, tgsi_end},  /* aka HALT */
	[TGSI_OPCODE_DFMA]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_F2I]	= { ALU_OP1_FLT_TO_INT, tgsi_op2_trans},
	[TGSI_OPCODE_IDIV]	= { ALU_OP0_NOP, tgsi_idiv},
	[TGSI_OPCODE_IMAX]	= { ALU_OP2_MAX_INT, tgsi_op2},
	[TGSI_OPCODE_IMIN]	= { ALU_OP2_MIN_INT, tgsi_op2},
	[TGSI_OPCODE_INEG]	= { ALU_OP2_SUB_INT, tgsi_ineg},
	[TGSI_OPCODE_ISGE]	= { ALU_OP2_SETGE_INT, tgsi_op2},
	[TGSI_OPCODE_ISHR]	= { ALU_OP2_ASHR_INT, tgsi_op2_trans},
	[TGSI_OPCODE_ISLT]	= { ALU_OP2_SETGT_INT, tgsi_op2_swap},
	[TGSI_OPCODE_F2U]	= { ALU_OP1_FLT_TO_UINT, tgsi_op2_trans},
	[TGSI_OPCODE_U2F]	= { ALU_OP1_UINT_TO_FLT, tgsi_op2_trans},
	[TGSI_OPCODE_UADD]	= { ALU_OP2_ADD_INT, tgsi_op2},
	[TGSI_OPCODE_UDIV]	= { ALU_OP0_NOP, tgsi_udiv},
	[TGSI_OPCODE_UMAD]	= { ALU_OP0_NOP, tgsi_umad},
	[TGSI_OPCODE_UMAX]	= { ALU_OP2_MAX_UINT, tgsi_op2},
	[TGSI_OPCODE_UMIN]	= { ALU_OP2_MIN_UINT, tgsi_op2},
	[TGSI_OPCODE_UMOD]	= { ALU_OP0_NOP, tgsi_umod},
	[TGSI_OPCODE_UMUL]	= { ALU_OP2_MULLO_UINT, tgsi_op2_trans},
	[TGSI_OPCODE_USEQ]	= { ALU_OP2_SETE_INT, tgsi_op2},
	[TGSI_OPCODE_USGE]	= { ALU_OP2_SETGE_UINT, tgsi_op2},
	[TGSI_OPCODE_USHR]	= { ALU_OP2_LSHR_INT, tgsi_op2_trans},
	[TGSI_OPCODE_USLT]	= { ALU_OP2_SETGT_UINT, tgsi_op2_swap},
	[TGSI_OPCODE_USNE]	= { ALU_OP2_SETNE_INT, tgsi_op2_swap},
	[TGSI_OPCODE_SWITCH]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_CASE]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_DEFAULT]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_ENDSWITCH]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_SAMPLE]	= { 0, tgsi_unsupported},
	[TGSI_OPCODE_SAMPLE_I]	= { 0, tgsi_unsupported},
	[TGSI_OPCODE_SAMPLE_I_MS]	= { 0, tgsi_unsupported},
	[TGSI_OPCODE_SAMPLE_B]	= { 0, tgsi_unsupported},
	[TGSI_OPCODE_SAMPLE_C]	= { 0, tgsi_unsupported},
	[TGSI_OPCODE_SAMPLE_C_LZ]	= { 0, tgsi_unsupported},
	[TGSI_OPCODE_SAMPLE_D]	= { 0, tgsi_unsupported},
	[TGSI_OPCODE_SAMPLE_L]	= { 0, tgsi_unsupported},
	[TGSI_OPCODE_GATHER4]	= { 0, tgsi_unsupported},
	[TGSI_OPCODE_SVIEWINFO]	= { 0, tgsi_unsupported},
	[TGSI_OPCODE_SAMPLE_POS]	= { 0, tgsi_unsupported},
	[TGSI_OPCODE_SAMPLE_INFO]	= { 0, tgsi_unsupported},
	[TGSI_OPCODE_UARL]	= { ALU_OP1_MOVA_INT, tgsi_r600_arl},
	[TGSI_OPCODE_UCMP]	= { ALU_OP0_NOP, tgsi_ucmp},
	[TGSI_OPCODE_IABS]	= { 0, tgsi_iabs},
	[TGSI_OPCODE_ISSG]	= { 0, tgsi_issg},
	[TGSI_OPCODE_LOAD]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_STORE]	= { ALU_OP0_NOP, tgsi_unsupported},
	[163]	= { ALU_OP0_NOP, tgsi_unsupported},
	[164]	= { ALU_OP0_NOP, tgsi_unsupported},
	[165]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_BARRIER]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_ATOMUADD]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_ATOMXCHG]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_ATOMCAS]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_ATOMAND]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_ATOMOR]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_ATOMXOR]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_ATOMUMIN]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_ATOMUMAX]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_ATOMIMIN]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_ATOMIMAX]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_TEX2]	= { FETCH_OP_SAMPLE, tgsi_tex},
	[TGSI_OPCODE_TXB2]	= { FETCH_OP_SAMPLE_LB, tgsi_tex},
	[TGSI_OPCODE_TXL2]	= { FETCH_OP_SAMPLE_L, tgsi_tex},
	[TGSI_OPCODE_IMUL_HI]	= { ALU_OP2_MULHI_INT, tgsi_op2_trans},
	[TGSI_OPCODE_UMUL_HI]	= { ALU_OP2_MULHI_UINT, tgsi_op2_trans},
	[TGSI_OPCODE_TG4]	= { FETCH_OP_GATHER4, tgsi_unsupported},
	[TGSI_OPCODE_LODQ]	= { FETCH_OP_GET_LOD, tgsi_unsupported},
	[TGSI_OPCODE_IBFE]	= { ALU_OP3_BFE_INT, tgsi_unsupported},
	[TGSI_OPCODE_UBFE]	= { ALU_OP3_BFE_UINT, tgsi_unsupported},
	[TGSI_OPCODE_BFI]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_BREV]	= { ALU_OP1_BFREV_INT, tgsi_unsupported},
	[TGSI_OPCODE_POPC]	= { ALU_OP1_BCNT_INT, tgsi_unsupported},
	[TGSI_OPCODE_LSB]	= { ALU_OP1_FFBL_INT, tgsi_unsupported},
	[TGSI_OPCODE_IMSB]	= { ALU_OP1_FFBH_INT, tgsi_unsupported},
	[TGSI_OPCODE_UMSB]	= { ALU_OP1_FFBH_UINT, tgsi_unsupported},
	[TGSI_OPCODE_INTERP_CENTROID]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_INTERP_SAMPLE]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_INTERP_OFFSET]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_LAST]	= { ALU_OP0_NOP, tgsi_unsupported},
};
11982
11983static const struct r600_shader_tgsi_instruction eg_shader_tgsi_instruction[] = {
11984	[TGSI_OPCODE_ARL]	= { ALU_OP0_NOP, tgsi_eg_arl},
11985	[TGSI_OPCODE_MOV]	= { ALU_OP1_MOV, tgsi_op2},
11986	[TGSI_OPCODE_LIT]	= { ALU_OP0_NOP, tgsi_lit},
11987	[TGSI_OPCODE_RCP]	= { ALU_OP1_RECIP_IEEE, tgsi_trans_srcx_replicate},
11988	[TGSI_OPCODE_RSQ]	= { ALU_OP0_NOP, tgsi_rsq},
11989	[TGSI_OPCODE_EXP]	= { ALU_OP0_NOP, tgsi_exp},
11990	[TGSI_OPCODE_LOG]	= { ALU_OP0_NOP, tgsi_log},
11991	[TGSI_OPCODE_MUL]	= { ALU_OP2_MUL_IEEE, tgsi_op2},
11992	[TGSI_OPCODE_ADD]	= { ALU_OP2_ADD, tgsi_op2},
11993	[TGSI_OPCODE_DP3]	= { ALU_OP2_DOT4_IEEE, tgsi_dp},
11994	[TGSI_OPCODE_DP4]	= { ALU_OP2_DOT4_IEEE, tgsi_dp},
11995	[TGSI_OPCODE_DST]	= { ALU_OP0_NOP, tgsi_opdst},
11996	[TGSI_OPCODE_MIN]	= { ALU_OP2_MIN_DX10, tgsi_op2},
11997	[TGSI_OPCODE_MAX]	= { ALU_OP2_MAX_DX10, tgsi_op2},
11998	[TGSI_OPCODE_SLT]	= { ALU_OP2_SETGT, tgsi_op2_swap},
11999	[TGSI_OPCODE_SGE]	= { ALU_OP2_SETGE, tgsi_op2},
12000	[TGSI_OPCODE_MAD]	= { ALU_OP3_MULADD_IEEE, tgsi_op3},
12001	[TGSI_OPCODE_LRP]	= { ALU_OP0_NOP, tgsi_lrp},
12002	[TGSI_OPCODE_FMA]	= { ALU_OP3_FMA, tgsi_op3},
12003	[TGSI_OPCODE_SQRT]	= { ALU_OP1_SQRT_IEEE, tgsi_trans_srcx_replicate},
12004	[21]	= { ALU_OP0_NOP, tgsi_unsupported},
12005	[22]			= { ALU_OP0_NOP, tgsi_unsupported},
12006	[23]			= { ALU_OP0_NOP, tgsi_unsupported},
12007	[TGSI_OPCODE_FRC]	= { ALU_OP1_FRACT, tgsi_op2},
12008	[25]			= { ALU_OP0_NOP, tgsi_unsupported},
12009	[TGSI_OPCODE_FLR]	= { ALU_OP1_FLOOR, tgsi_op2},
12010	[TGSI_OPCODE_ROUND]	= { ALU_OP1_RNDNE, tgsi_op2},
12011	[TGSI_OPCODE_EX2]	= { ALU_OP1_EXP_IEEE, tgsi_trans_srcx_replicate},
12012	[TGSI_OPCODE_LG2]	= { ALU_OP1_LOG_IEEE, tgsi_trans_srcx_replicate},
12013	[TGSI_OPCODE_POW]	= { ALU_OP0_NOP, tgsi_pow},
12014	[31]	= { ALU_OP0_NOP, tgsi_unsupported},
12015	[32]			= { ALU_OP0_NOP, tgsi_unsupported},
12016	[TGSI_OPCODE_CLOCK]     = { ALU_OP0_NOP, tgsi_clock},
12017	[34]			= { ALU_OP0_NOP, tgsi_unsupported},
12018	[35]			= { ALU_OP0_NOP, tgsi_unsupported},
12019	[TGSI_OPCODE_COS]	= { ALU_OP1_COS, tgsi_trig},
12020	[TGSI_OPCODE_DDX]	= { FETCH_OP_GET_GRADIENTS_H, tgsi_tex},
12021	[TGSI_OPCODE_DDY]	= { FETCH_OP_GET_GRADIENTS_V, tgsi_tex},
12022	[TGSI_OPCODE_KILL]	= { ALU_OP2_KILLGT, tgsi_kill},  /* unconditional kill */
12023	[TGSI_OPCODE_PK2H]	= { ALU_OP0_NOP, tgsi_pk2h},
12024	[TGSI_OPCODE_PK2US]	= { ALU_OP0_NOP, tgsi_unsupported},
12025	[TGSI_OPCODE_PK4B]	= { ALU_OP0_NOP, tgsi_unsupported},
12026	[TGSI_OPCODE_PK4UB]	= { ALU_OP0_NOP, tgsi_unsupported},
12027	[44]			= { ALU_OP0_NOP, tgsi_unsupported},
12028	[TGSI_OPCODE_SEQ]	= { ALU_OP2_SETE, tgsi_op2},
12029	[46]			= { ALU_OP0_NOP, tgsi_unsupported},
12030	[TGSI_OPCODE_SGT]	= { ALU_OP2_SETGT, tgsi_op2},
12031	[TGSI_OPCODE_SIN]	= { ALU_OP1_SIN, tgsi_trig},
12032	[TGSI_OPCODE_SLE]	= { ALU_OP2_SETGE, tgsi_op2_swap},
12033	[TGSI_OPCODE_SNE]	= { ALU_OP2_SETNE, tgsi_op2},
12034	[51]			= { ALU_OP0_NOP, tgsi_unsupported},
12035	[TGSI_OPCODE_TEX]	= { FETCH_OP_SAMPLE, tgsi_tex},
12036	[TGSI_OPCODE_TXD]	= { FETCH_OP_SAMPLE_G, tgsi_tex},
12037	[TGSI_OPCODE_TXP]	= { FETCH_OP_SAMPLE, tgsi_tex},
12038	[TGSI_OPCODE_UP2H]	= { ALU_OP0_NOP, tgsi_up2h},
12039	[TGSI_OPCODE_UP2US]	= { ALU_OP0_NOP, tgsi_unsupported},
12040	[TGSI_OPCODE_UP4B]	= { ALU_OP0_NOP, tgsi_unsupported},
12041	[TGSI_OPCODE_UP4UB]	= { ALU_OP0_NOP, tgsi_unsupported},
12042	[59]			= { ALU_OP0_NOP, tgsi_unsupported},
12043	[60]			= { ALU_OP0_NOP, tgsi_unsupported},
12044	[TGSI_OPCODE_ARR]	= { ALU_OP0_NOP, tgsi_eg_arl},
12045	[62]			= { ALU_OP0_NOP, tgsi_unsupported},
12046	[TGSI_OPCODE_CAL]	= { ALU_OP0_NOP, tgsi_unsupported},
12047	[TGSI_OPCODE_RET]	= { ALU_OP0_NOP, tgsi_unsupported},
12048	[TGSI_OPCODE_SSG]	= { ALU_OP0_NOP, tgsi_ssg},
12049	[TGSI_OPCODE_CMP]	= { ALU_OP0_NOP, tgsi_cmp},
12050	[67]			= { ALU_OP0_NOP, tgsi_unsupported},
12051	[TGSI_OPCODE_TXB]	= { FETCH_OP_SAMPLE_LB, tgsi_tex},
12052	[69]			= { ALU_OP0_NOP, tgsi_unsupported},
12053	[TGSI_OPCODE_DIV]	= { ALU_OP0_NOP, tgsi_unsupported},
12054	[TGSI_OPCODE_DP2]	= { ALU_OP2_DOT4_IEEE, tgsi_dp},
12055	[TGSI_OPCODE_TXL]	= { FETCH_OP_SAMPLE_L, tgsi_tex},
12056	[TGSI_OPCODE_BRK]	= { CF_OP_LOOP_BREAK, tgsi_loop_brk_cont},
12057	[TGSI_OPCODE_IF]	= { ALU_OP0_NOP, tgsi_if},
12058	[TGSI_OPCODE_UIF]	= { ALU_OP0_NOP, tgsi_uif},
12059	[76]			= { ALU_OP0_NOP, tgsi_unsupported},
12060	[TGSI_OPCODE_ELSE]	= { ALU_OP0_NOP, tgsi_else},
12061	[TGSI_OPCODE_ENDIF]	= { ALU_OP0_NOP, tgsi_endif},
12062	[TGSI_OPCODE_DDX_FINE]	= { FETCH_OP_GET_GRADIENTS_H, tgsi_tex},
12063	[TGSI_OPCODE_DDY_FINE]	= { FETCH_OP_GET_GRADIENTS_V, tgsi_tex},
12064	[82]			= { ALU_OP0_NOP, tgsi_unsupported},
12065	[TGSI_OPCODE_CEIL]	= { ALU_OP1_CEIL, tgsi_op2},
12066	[TGSI_OPCODE_I2F]	= { ALU_OP1_INT_TO_FLT, tgsi_op2_trans},
12067	[TGSI_OPCODE_NOT]	= { ALU_OP1_NOT_INT, tgsi_op2},
12068	[TGSI_OPCODE_TRUNC]	= { ALU_OP1_TRUNC, tgsi_op2},
12069	[TGSI_OPCODE_SHL]	= { ALU_OP2_LSHL_INT, tgsi_op2},
12070	[88]			= { ALU_OP0_NOP, tgsi_unsupported},
12071	[TGSI_OPCODE_AND]	= { ALU_OP2_AND_INT, tgsi_op2},
12072	[TGSI_OPCODE_OR]	= { ALU_OP2_OR_INT, tgsi_op2},
12073	[TGSI_OPCODE_MOD]	= { ALU_OP0_NOP, tgsi_imod},
12074	[TGSI_OPCODE_XOR]	= { ALU_OP2_XOR_INT, tgsi_op2},
12075	[93]			= { ALU_OP0_NOP, tgsi_unsupported},
12076	[TGSI_OPCODE_TXF]	= { FETCH_OP_LD, tgsi_tex},
12077	[TGSI_OPCODE_TXQ]	= { FETCH_OP_GET_TEXTURE_RESINFO, tgsi_tex},
12078	[TGSI_OPCODE_CONT]	= { CF_OP_LOOP_CONTINUE, tgsi_loop_brk_cont},
12079	[TGSI_OPCODE_EMIT]	= { CF_OP_EMIT_VERTEX, tgsi_gs_emit},
12080	[TGSI_OPCODE_ENDPRIM]	= { CF_OP_CUT_VERTEX, tgsi_gs_emit},
12081	[TGSI_OPCODE_BGNLOOP]	= { ALU_OP0_NOP, tgsi_bgnloop},
12082	[TGSI_OPCODE_BGNSUB]	= { ALU_OP0_NOP, tgsi_unsupported},
12083	[TGSI_OPCODE_ENDLOOP]	= { ALU_OP0_NOP, tgsi_endloop},
12084	[TGSI_OPCODE_ENDSUB]	= { ALU_OP0_NOP, tgsi_unsupported},
12085	[103]			= { FETCH_OP_GET_TEXTURE_RESINFO, tgsi_tex},
12086	[TGSI_OPCODE_TXQS]	= { FETCH_OP_GET_NUMBER_OF_SAMPLES, tgsi_tex},
12087	[TGSI_OPCODE_RESQ]     	= { FETCH_OP_GET_TEXTURE_RESINFO, tgsi_resq},
12088	[106]			= { ALU_OP0_NOP, tgsi_unsupported},
12089	[TGSI_OPCODE_NOP]	= { ALU_OP0_NOP, tgsi_unsupported},
12090	[TGSI_OPCODE_FSEQ]	= { ALU_OP2_SETE_DX10, tgsi_op2},
12091	[TGSI_OPCODE_FSGE]	= { ALU_OP2_SETGE_DX10, tgsi_op2},
12092	[TGSI_OPCODE_FSLT]	= { ALU_OP2_SETGT_DX10, tgsi_op2_swap},
12093	[TGSI_OPCODE_FSNE]	= { ALU_OP2_SETNE_DX10, tgsi_op2_swap},
12094	[TGSI_OPCODE_MEMBAR]    = { ALU_OP0_NOP, tgsi_membar},
12095	[113]	= { ALU_OP0_NOP, tgsi_unsupported},
12096	[114]			= { ALU_OP0_NOP, tgsi_unsupported},
12097	[115]			= { ALU_OP0_NOP, tgsi_unsupported},
12098	[TGSI_OPCODE_KILL_IF]	= { ALU_OP2_KILLGT, tgsi_kill},  /* conditional kill */
12099	[TGSI_OPCODE_END]	= { ALU_OP0_NOP, tgsi_end},  /* aka HALT */
12100	/* Refer below for TGSI_OPCODE_DFMA */
12101	[TGSI_OPCODE_F2I]	= { ALU_OP1_FLT_TO_INT, tgsi_f2i},
12102	[TGSI_OPCODE_IDIV]	= { ALU_OP0_NOP, tgsi_idiv},
12103	[TGSI_OPCODE_IMAX]	= { ALU_OP2_MAX_INT, tgsi_op2},
12104	[TGSI_OPCODE_IMIN]	= { ALU_OP2_MIN_INT, tgsi_op2},
12105	[TGSI_OPCODE_INEG]	= { ALU_OP2_SUB_INT, tgsi_ineg},
12106	[TGSI_OPCODE_ISGE]	= { ALU_OP2_SETGE_INT, tgsi_op2},
12107	[TGSI_OPCODE_ISHR]	= { ALU_OP2_ASHR_INT, tgsi_op2},
12108	[TGSI_OPCODE_ISLT]	= { ALU_OP2_SETGT_INT, tgsi_op2_swap},
12109	[TGSI_OPCODE_F2U]	= { ALU_OP1_FLT_TO_UINT, tgsi_f2i},
12110	[TGSI_OPCODE_U2F]	= { ALU_OP1_UINT_TO_FLT, tgsi_op2_trans},
12111	[TGSI_OPCODE_UADD]	= { ALU_OP2_ADD_INT, tgsi_op2},
12112	[TGSI_OPCODE_UDIV]	= { ALU_OP0_NOP, tgsi_udiv},
12113	[TGSI_OPCODE_UMAD]	= { ALU_OP0_NOP, tgsi_umad},
12114	[TGSI_OPCODE_UMAX]	= { ALU_OP2_MAX_UINT, tgsi_op2},
12115	[TGSI_OPCODE_UMIN]	= { ALU_OP2_MIN_UINT, tgsi_op2},
12116	[TGSI_OPCODE_UMOD]	= { ALU_OP0_NOP, tgsi_umod},
12117	[TGSI_OPCODE_UMUL]	= { ALU_OP2_MULLO_UINT, tgsi_op2_trans},
12118	[TGSI_OPCODE_USEQ]	= { ALU_OP2_SETE_INT, tgsi_op2},
12119	[TGSI_OPCODE_USGE]	= { ALU_OP2_SETGE_UINT, tgsi_op2},
12120	[TGSI_OPCODE_USHR]	= { ALU_OP2_LSHR_INT, tgsi_op2},
12121	[TGSI_OPCODE_USLT]	= { ALU_OP2_SETGT_UINT, tgsi_op2_swap},
12122	[TGSI_OPCODE_USNE]	= { ALU_OP2_SETNE_INT, tgsi_op2},
12123	[TGSI_OPCODE_SWITCH]	= { ALU_OP0_NOP, tgsi_unsupported},
12124	[TGSI_OPCODE_CASE]	= { ALU_OP0_NOP, tgsi_unsupported},
12125	[TGSI_OPCODE_DEFAULT]	= { ALU_OP0_NOP, tgsi_unsupported},
12126	[TGSI_OPCODE_ENDSWITCH]	= { ALU_OP0_NOP, tgsi_unsupported},
12127	[TGSI_OPCODE_SAMPLE]	= { 0, tgsi_unsupported},
12128	[TGSI_OPCODE_SAMPLE_I]	= { 0, tgsi_unsupported},
12129	[TGSI_OPCODE_SAMPLE_I_MS]	= { 0, tgsi_unsupported},
12130	[TGSI_OPCODE_SAMPLE_B]	= { 0, tgsi_unsupported},
12131	[TGSI_OPCODE_SAMPLE_C]	= { 0, tgsi_unsupported},
12132	[TGSI_OPCODE_SAMPLE_C_LZ]	= { 0, tgsi_unsupported},
12133	[TGSI_OPCODE_SAMPLE_D]	= { 0, tgsi_unsupported},
12134	[TGSI_OPCODE_SAMPLE_L]	= { 0, tgsi_unsupported},
12135	[TGSI_OPCODE_GATHER4]	= { 0, tgsi_unsupported},
12136	[TGSI_OPCODE_SVIEWINFO]	= { 0, tgsi_unsupported},
12137	[TGSI_OPCODE_SAMPLE_POS]	= { 0, tgsi_unsupported},
12138	[TGSI_OPCODE_SAMPLE_INFO]	= { 0, tgsi_unsupported},
12139	[TGSI_OPCODE_UARL]	= { ALU_OP1_MOVA_INT, tgsi_eg_arl},
12140	[TGSI_OPCODE_UCMP]	= { ALU_OP0_NOP, tgsi_ucmp},
12141	[TGSI_OPCODE_IABS]	= { 0, tgsi_iabs},
12142	[TGSI_OPCODE_ISSG]	= { 0, tgsi_issg},
12143	[TGSI_OPCODE_LOAD]	= { ALU_OP0_NOP, tgsi_load},
12144	[TGSI_OPCODE_STORE]	= { ALU_OP0_NOP, tgsi_store},
12145	[163]	= { ALU_OP0_NOP, tgsi_unsupported},
12146	[164]	= { ALU_OP0_NOP, tgsi_unsupported},
12147	[165]	= { ALU_OP0_NOP, tgsi_unsupported},
12148	[TGSI_OPCODE_BARRIER]	= { ALU_OP0_GROUP_BARRIER, tgsi_barrier},
12149	[TGSI_OPCODE_ATOMUADD]	= { V_RAT_INST_ADD_RTN, tgsi_atomic_op},
12150	[TGSI_OPCODE_ATOMXCHG]	= { V_RAT_INST_XCHG_RTN, tgsi_atomic_op},
12151	[TGSI_OPCODE_ATOMCAS]	= { V_RAT_INST_CMPXCHG_INT_RTN, tgsi_atomic_op},
12152	[TGSI_OPCODE_ATOMAND]	= { V_RAT_INST_AND_RTN, tgsi_atomic_op},
12153	[TGSI_OPCODE_ATOMOR]	= { V_RAT_INST_OR_RTN, tgsi_atomic_op},
12154	[TGSI_OPCODE_ATOMXOR]	= { V_RAT_INST_XOR_RTN, tgsi_atomic_op},
12155	[TGSI_OPCODE_ATOMUMIN]	= { V_RAT_INST_MIN_UINT_RTN, tgsi_atomic_op},
12156	[TGSI_OPCODE_ATOMUMAX]	= { V_RAT_INST_MAX_UINT_RTN, tgsi_atomic_op},
12157	[TGSI_OPCODE_ATOMIMIN]	= { V_RAT_INST_MIN_INT_RTN, tgsi_atomic_op},
12158	[TGSI_OPCODE_ATOMIMAX]	= { V_RAT_INST_MAX_INT_RTN, tgsi_atomic_op},
12159	[TGSI_OPCODE_TEX2]	= { FETCH_OP_SAMPLE, tgsi_tex},
12160	[TGSI_OPCODE_TXB2]	= { FETCH_OP_SAMPLE_LB, tgsi_tex},
12161	[TGSI_OPCODE_TXL2]	= { FETCH_OP_SAMPLE_L, tgsi_tex},
12162	[TGSI_OPCODE_IMUL_HI]	= { ALU_OP2_MULHI_INT, tgsi_op2_trans},
12163	[TGSI_OPCODE_UMUL_HI]	= { ALU_OP2_MULHI_UINT, tgsi_op2_trans},
12164	[TGSI_OPCODE_TG4]	= { FETCH_OP_GATHER4, tgsi_tex},
12165	[TGSI_OPCODE_LODQ]	= { FETCH_OP_GET_LOD, tgsi_tex},
12166	[TGSI_OPCODE_IBFE]	= { ALU_OP3_BFE_INT, tgsi_bfe},
12167	[TGSI_OPCODE_UBFE]	= { ALU_OP3_BFE_UINT, tgsi_bfe},
12168	[TGSI_OPCODE_BFI]	= { ALU_OP0_NOP, tgsi_bfi},
12169	[TGSI_OPCODE_BREV]	= { ALU_OP1_BFREV_INT, tgsi_op2},
12170	[TGSI_OPCODE_POPC]	= { ALU_OP1_BCNT_INT, tgsi_op2},
12171	[TGSI_OPCODE_LSB]	= { ALU_OP1_FFBL_INT, tgsi_op2},
12172	[TGSI_OPCODE_IMSB]	= { ALU_OP1_FFBH_INT, tgsi_msb},
12173	[TGSI_OPCODE_UMSB]	= { ALU_OP1_FFBH_UINT, tgsi_msb},
12174	[TGSI_OPCODE_INTERP_CENTROID]	= { ALU_OP0_NOP, tgsi_interp_egcm},
12175	[TGSI_OPCODE_INTERP_SAMPLE]	= { ALU_OP0_NOP, tgsi_interp_egcm},
12176	[TGSI_OPCODE_INTERP_OFFSET]	= { ALU_OP0_NOP, tgsi_interp_egcm},
12177	[TGSI_OPCODE_F2D]	= { ALU_OP1_FLT32_TO_FLT64, tgsi_op2_64},
12178	[TGSI_OPCODE_D2F]	= { ALU_OP1_FLT64_TO_FLT32, tgsi_op2_64_single_dest},
12179	[TGSI_OPCODE_DABS]	= { ALU_OP1_MOV, tgsi_op2_64},
12180	[TGSI_OPCODE_DNEG]	= { ALU_OP2_ADD_64, tgsi_dneg},
12181	[TGSI_OPCODE_DADD]	= { ALU_OP2_ADD_64, tgsi_op2_64},
12182	[TGSI_OPCODE_DMUL]	= { ALU_OP2_MUL_64, cayman_mul_double_instr},
12183	[TGSI_OPCODE_DDIV]	= { 0, cayman_ddiv_instr },
12184	[TGSI_OPCODE_DMAX]	= { ALU_OP2_MAX_64, tgsi_op2_64},
12185	[TGSI_OPCODE_DMIN]	= { ALU_OP2_MIN_64, tgsi_op2_64},
12186	[TGSI_OPCODE_DSLT]	= { ALU_OP2_SETGT_64, tgsi_op2_64_single_dest_s},
12187	[TGSI_OPCODE_DSGE]	= { ALU_OP2_SETGE_64, tgsi_op2_64_single_dest},
12188	[TGSI_OPCODE_DSEQ]	= { ALU_OP2_SETE_64, tgsi_op2_64_single_dest},
12189	[TGSI_OPCODE_DSNE]	= { ALU_OP2_SETNE_64, tgsi_op2_64_single_dest},
12190	[TGSI_OPCODE_DRCP]	= { ALU_OP2_RECIP_64, cayman_emit_double_instr},
12191	[TGSI_OPCODE_DSQRT]	= { ALU_OP2_SQRT_64, cayman_emit_double_instr},
12192	[TGSI_OPCODE_DMAD]	= { ALU_OP3_FMA_64, tgsi_op3_64},
12193	[TGSI_OPCODE_DFMA]	= { ALU_OP3_FMA_64, tgsi_op3_64},
12194	[TGSI_OPCODE_DFRAC]	= { ALU_OP1_FRACT_64, tgsi_op2_64},
12195	[TGSI_OPCODE_DLDEXP]	= { ALU_OP2_LDEXP_64, tgsi_op2_64},
12196	[TGSI_OPCODE_DFRACEXP]	= { ALU_OP1_FREXP_64, tgsi_dfracexp},
12197	[TGSI_OPCODE_D2I]	= { ALU_OP1_FLT_TO_INT, egcm_double_to_int},
12198	[TGSI_OPCODE_I2D]	= { ALU_OP1_INT_TO_FLT, egcm_int_to_double},
12199	[TGSI_OPCODE_D2U]	= { ALU_OP1_FLT_TO_UINT, egcm_double_to_int},
12200	[TGSI_OPCODE_U2D]	= { ALU_OP1_UINT_TO_FLT, egcm_int_to_double},
12201	[TGSI_OPCODE_DRSQ]	= { ALU_OP2_RECIPSQRT_64, cayman_emit_double_instr},
12202	[TGSI_OPCODE_U64SNE]    = { ALU_OP0_NOP, egcm_u64sne },
12203	[TGSI_OPCODE_U64ADD]    = { ALU_OP0_NOP, egcm_u64add },
12204	[TGSI_OPCODE_U64MUL]    = { ALU_OP0_NOP, egcm_u64mul },
12205	[TGSI_OPCODE_U64DIV]    = { ALU_OP0_NOP, egcm_u64div },
12206	[TGSI_OPCODE_I64NEG]    = { ALU_OP0_NOP, egcm_i64neg },
12207	[TGSI_OPCODE_LAST]	= { ALU_OP0_NOP, tgsi_unsupported},
12208};
12209
/*
 * Cayman (CM) dispatch table: maps each TGSI opcode to the r600 ISA
 * opcode (or fetch/CF/RAT instruction) used to implement it, plus the
 * callback that emits the machine code for it.  Indexed by TGSI_OPCODE_*;
 * entries whose callback is tgsi_unsupported reject the opcode at shader
 * translation time.  The cayman_* emitters exist because, per the CAYMAN
 * notes at the top of this file, ops that were t-slot-only on earlier
 * chips are implemented in all vector slots on Cayman and therefore need
 * a loop over the slots when emitted.
 */
12210static const struct r600_shader_tgsi_instruction cm_shader_tgsi_instruction[] = {
12211	[TGSI_OPCODE_ARL]	= { ALU_OP0_NOP, tgsi_eg_arl},
12212	[TGSI_OPCODE_MOV]	= { ALU_OP1_MOV, tgsi_op2},
12213	[TGSI_OPCODE_LIT]	= { ALU_OP0_NOP, tgsi_lit},
12214	[TGSI_OPCODE_RCP]	= { ALU_OP1_RECIP_IEEE, cayman_emit_float_instr},
12215	[TGSI_OPCODE_RSQ]	= { ALU_OP1_RECIPSQRT_IEEE, cayman_emit_float_instr},
12216	[TGSI_OPCODE_EXP]	= { ALU_OP0_NOP, tgsi_exp},
12217	[TGSI_OPCODE_LOG]	= { ALU_OP0_NOP, tgsi_log},
12218	[TGSI_OPCODE_MUL]	= { ALU_OP2_MUL_IEEE, tgsi_op2},
12219	[TGSI_OPCODE_ADD]	= { ALU_OP2_ADD, tgsi_op2},
12220	[TGSI_OPCODE_DP3]	= { ALU_OP2_DOT4_IEEE, tgsi_dp},
12221	[TGSI_OPCODE_DP4]	= { ALU_OP2_DOT4_IEEE, tgsi_dp},
12222	[TGSI_OPCODE_DST]	= { ALU_OP0_NOP, tgsi_opdst},
12223	[TGSI_OPCODE_MIN]	= { ALU_OP2_MIN_DX10, tgsi_op2},
12224	[TGSI_OPCODE_MAX]	= { ALU_OP2_MAX_DX10, tgsi_op2},
12225	[TGSI_OPCODE_SLT]	= { ALU_OP2_SETGT, tgsi_op2_swap},
12226	[TGSI_OPCODE_SGE]	= { ALU_OP2_SETGE, tgsi_op2},
12227	[TGSI_OPCODE_MAD]	= { ALU_OP3_MULADD_IEEE, tgsi_op3},
12228	[TGSI_OPCODE_LRP]	= { ALU_OP0_NOP, tgsi_lrp},
12229	[TGSI_OPCODE_FMA]	= { ALU_OP3_FMA, tgsi_op3},
12230	[TGSI_OPCODE_SQRT]	= { ALU_OP1_SQRT_IEEE, cayman_emit_float_instr},
	/* Bare numeric indices below are holes left by retired TGSI opcodes;
	 * they are kept as explicit "unsupported" entries.  NOTE(review):
	 * presumed retired — verify against pipe/p_shader_tokens.h. */
12231	[21]	= { ALU_OP0_NOP, tgsi_unsupported},
12232	[22]			= { ALU_OP0_NOP, tgsi_unsupported},
12233	[23]			= { ALU_OP0_NOP, tgsi_unsupported},
12234	[TGSI_OPCODE_FRC]	= { ALU_OP1_FRACT, tgsi_op2},
12235	[25]			= { ALU_OP0_NOP, tgsi_unsupported},
12236	[TGSI_OPCODE_FLR]	= { ALU_OP1_FLOOR, tgsi_op2},
12237	[TGSI_OPCODE_ROUND]	= { ALU_OP1_RNDNE, tgsi_op2},
12238	[TGSI_OPCODE_EX2]	= { ALU_OP1_EXP_IEEE, cayman_emit_float_instr},
12239	[TGSI_OPCODE_LG2]	= { ALU_OP1_LOG_IEEE, cayman_emit_float_instr},
12240	[TGSI_OPCODE_POW]	= { ALU_OP0_NOP, cayman_pow},
12241	[31]	= { ALU_OP0_NOP, tgsi_unsupported},
12242	[32]			= { ALU_OP0_NOP, tgsi_unsupported},
12243	[TGSI_OPCODE_CLOCK]     = { ALU_OP0_NOP, tgsi_clock},
12244	[34]			= { ALU_OP0_NOP, tgsi_unsupported},
12245	[35]			= { ALU_OP0_NOP, tgsi_unsupported},
12246	[TGSI_OPCODE_COS]	= { ALU_OP1_COS, cayman_trig},
12247	[TGSI_OPCODE_DDX]	= { FETCH_OP_GET_GRADIENTS_H, tgsi_tex},
12248	[TGSI_OPCODE_DDY]	= { FETCH_OP_GET_GRADIENTS_V, tgsi_tex},
12249	[TGSI_OPCODE_KILL]	= { ALU_OP2_KILLGT, tgsi_kill},  /* unconditional kill */
12250	[TGSI_OPCODE_PK2H]	= { ALU_OP0_NOP, tgsi_pk2h},
12251	[TGSI_OPCODE_PK2US]	= { ALU_OP0_NOP, tgsi_unsupported},
12252	[TGSI_OPCODE_PK4B]	= { ALU_OP0_NOP, tgsi_unsupported},
12253	[TGSI_OPCODE_PK4UB]	= { ALU_OP0_NOP, tgsi_unsupported},
12254	[44]			= { ALU_OP0_NOP, tgsi_unsupported},
12255	[TGSI_OPCODE_SEQ]	= { ALU_OP2_SETE, tgsi_op2},
12256	[46]			= { ALU_OP0_NOP, tgsi_unsupported},
12257	[TGSI_OPCODE_SGT]	= { ALU_OP2_SETGT, tgsi_op2},
12258	[TGSI_OPCODE_SIN]	= { ALU_OP1_SIN, cayman_trig},
12259	[TGSI_OPCODE_SLE]	= { ALU_OP2_SETGE, tgsi_op2_swap},
12260	[TGSI_OPCODE_SNE]	= { ALU_OP2_SETNE, tgsi_op2},
12261	[51]			= { ALU_OP0_NOP, tgsi_unsupported},
12262	[TGSI_OPCODE_TEX]	= { FETCH_OP_SAMPLE, tgsi_tex},
12263	[TGSI_OPCODE_TXD]	= { FETCH_OP_SAMPLE_G, tgsi_tex},
12264	[TGSI_OPCODE_TXP]	= { FETCH_OP_SAMPLE, tgsi_tex},
12265	[TGSI_OPCODE_UP2H]	= { ALU_OP0_NOP, tgsi_up2h},
12266	[TGSI_OPCODE_UP2US]	= { ALU_OP0_NOP, tgsi_unsupported},
12267	[TGSI_OPCODE_UP4B]	= { ALU_OP0_NOP, tgsi_unsupported},
12268	[TGSI_OPCODE_UP4UB]	= { ALU_OP0_NOP, tgsi_unsupported},
12269	[59]			= { ALU_OP0_NOP, tgsi_unsupported},
12270	[60]			= { ALU_OP0_NOP, tgsi_unsupported},
12271	[TGSI_OPCODE_ARR]	= { ALU_OP0_NOP, tgsi_eg_arl},
12272	[62]			= { ALU_OP0_NOP, tgsi_unsupported},
12273	[TGSI_OPCODE_CAL]	= { ALU_OP0_NOP, tgsi_unsupported},
12274	[TGSI_OPCODE_RET]	= { ALU_OP0_NOP, tgsi_unsupported},
12275	[TGSI_OPCODE_SSG]	= { ALU_OP0_NOP, tgsi_ssg},
12276	[TGSI_OPCODE_CMP]	= { ALU_OP0_NOP, tgsi_cmp},
12277	[67]			= { ALU_OP0_NOP, tgsi_unsupported},
12278	[TGSI_OPCODE_TXB]	= { FETCH_OP_SAMPLE_LB, tgsi_tex},
12279	[69]			= { ALU_OP0_NOP, tgsi_unsupported},
12280	[TGSI_OPCODE_DIV]	= { ALU_OP0_NOP, tgsi_unsupported},
12281	[TGSI_OPCODE_DP2]	= { ALU_OP2_DOT4_IEEE, tgsi_dp},
12282	[TGSI_OPCODE_TXL]	= { FETCH_OP_SAMPLE_L, tgsi_tex},
12283	[TGSI_OPCODE_BRK]	= { CF_OP_LOOP_BREAK, tgsi_loop_brk_cont},
12284	[TGSI_OPCODE_IF]	= { ALU_OP0_NOP, tgsi_if},
12285	[TGSI_OPCODE_UIF]	= { ALU_OP0_NOP, tgsi_uif},
12286	[76]			= { ALU_OP0_NOP, tgsi_unsupported},
12287	[TGSI_OPCODE_ELSE]	= { ALU_OP0_NOP, tgsi_else},
12288	[TGSI_OPCODE_ENDIF]	= { ALU_OP0_NOP, tgsi_endif},
12289	[TGSI_OPCODE_DDX_FINE]	= { FETCH_OP_GET_GRADIENTS_H, tgsi_tex},
12290	[TGSI_OPCODE_DDY_FINE]	= { FETCH_OP_GET_GRADIENTS_V, tgsi_tex},
12291	[82]			= { ALU_OP0_NOP, tgsi_unsupported},
12292	[TGSI_OPCODE_CEIL]	= { ALU_OP1_CEIL, tgsi_op2},
12293	[TGSI_OPCODE_I2F]	= { ALU_OP1_INT_TO_FLT, tgsi_op2},
12294	[TGSI_OPCODE_NOT]	= { ALU_OP1_NOT_INT, tgsi_op2},
12295	[TGSI_OPCODE_TRUNC]	= { ALU_OP1_TRUNC, tgsi_op2},
12296	[TGSI_OPCODE_SHL]	= { ALU_OP2_LSHL_INT, tgsi_op2},
12297	[88]			= { ALU_OP0_NOP, tgsi_unsupported},
12298	[TGSI_OPCODE_AND]	= { ALU_OP2_AND_INT, tgsi_op2},
12299	[TGSI_OPCODE_OR]	= { ALU_OP2_OR_INT, tgsi_op2},
12300	[TGSI_OPCODE_MOD]	= { ALU_OP0_NOP, tgsi_imod},
12301	[TGSI_OPCODE_XOR]	= { ALU_OP2_XOR_INT, tgsi_op2},
12302	[93]			= { ALU_OP0_NOP, tgsi_unsupported},
12303	[TGSI_OPCODE_TXF]	= { FETCH_OP_LD, tgsi_tex},
12304	[TGSI_OPCODE_TXQ]	= { FETCH_OP_GET_TEXTURE_RESINFO, tgsi_tex},
12305	[TGSI_OPCODE_CONT]	= { CF_OP_LOOP_CONTINUE, tgsi_loop_brk_cont},
12306	[TGSI_OPCODE_EMIT]	= { CF_OP_EMIT_VERTEX, tgsi_gs_emit},
12307	[TGSI_OPCODE_ENDPRIM]	= { CF_OP_CUT_VERTEX, tgsi_gs_emit},
12308	[TGSI_OPCODE_BGNLOOP]	= { ALU_OP0_NOP, tgsi_bgnloop},
12309	[TGSI_OPCODE_BGNSUB]	= { ALU_OP0_NOP, tgsi_unsupported},
12310	[TGSI_OPCODE_ENDLOOP]	= { ALU_OP0_NOP, tgsi_endloop},
12311	[TGSI_OPCODE_ENDSUB]	= { ALU_OP0_NOP, tgsi_unsupported},
12312	[103]			= { FETCH_OP_GET_TEXTURE_RESINFO, tgsi_tex},
12313	[TGSI_OPCODE_TXQS]	= { FETCH_OP_GET_NUMBER_OF_SAMPLES, tgsi_tex},
12314	[TGSI_OPCODE_RESQ]     	= { FETCH_OP_GET_TEXTURE_RESINFO, tgsi_resq},
12315	[106]			= { ALU_OP0_NOP, tgsi_unsupported},
12316	[TGSI_OPCODE_NOP]	= { ALU_OP0_NOP, tgsi_unsupported},
12317	[TGSI_OPCODE_FSEQ]	= { ALU_OP2_SETE_DX10, tgsi_op2},
12318	[TGSI_OPCODE_FSGE]	= { ALU_OP2_SETGE_DX10, tgsi_op2},
12319	[TGSI_OPCODE_FSLT]	= { ALU_OP2_SETGT_DX10, tgsi_op2_swap},
12320	[TGSI_OPCODE_FSNE]	= { ALU_OP2_SETNE_DX10, tgsi_op2_swap},
12321	[TGSI_OPCODE_MEMBAR]    = { ALU_OP0_NOP, tgsi_membar},
12322	[113]	= { ALU_OP0_NOP, tgsi_unsupported},
12323	[114]			= { ALU_OP0_NOP, tgsi_unsupported},
12324	[115]			= { ALU_OP0_NOP, tgsi_unsupported},
12325	[TGSI_OPCODE_KILL_IF]	= { ALU_OP2_KILLGT, tgsi_kill},  /* conditional kill */
12326	[TGSI_OPCODE_END]	= { ALU_OP0_NOP, tgsi_end},  /* aka HALT */
12327	/* Refer below for TGSI_OPCODE_DFMA */
12328	[TGSI_OPCODE_F2I]	= { ALU_OP1_FLT_TO_INT, tgsi_op2},
12329	[TGSI_OPCODE_IDIV]	= { ALU_OP0_NOP, tgsi_idiv},
12330	[TGSI_OPCODE_IMAX]	= { ALU_OP2_MAX_INT, tgsi_op2},
12331	[TGSI_OPCODE_IMIN]	= { ALU_OP2_MIN_INT, tgsi_op2},
12332	[TGSI_OPCODE_INEG]	= { ALU_OP2_SUB_INT, tgsi_ineg},
12333	[TGSI_OPCODE_ISGE]	= { ALU_OP2_SETGE_INT, tgsi_op2},
12334	[TGSI_OPCODE_ISHR]	= { ALU_OP2_ASHR_INT, tgsi_op2},
12335	[TGSI_OPCODE_ISLT]	= { ALU_OP2_SETGT_INT, tgsi_op2_swap},
12336	[TGSI_OPCODE_F2U]	= { ALU_OP1_FLT_TO_UINT, tgsi_op2},
12337	[TGSI_OPCODE_U2F]	= { ALU_OP1_UINT_TO_FLT, tgsi_op2},
12338	[TGSI_OPCODE_UADD]	= { ALU_OP2_ADD_INT, tgsi_op2},
12339	[TGSI_OPCODE_UDIV]	= { ALU_OP0_NOP, tgsi_udiv},
12340	[TGSI_OPCODE_UMAD]	= { ALU_OP0_NOP, tgsi_umad},
12341	[TGSI_OPCODE_UMAX]	= { ALU_OP2_MAX_UINT, tgsi_op2},
12342	[TGSI_OPCODE_UMIN]	= { ALU_OP2_MIN_UINT, tgsi_op2},
12343	[TGSI_OPCODE_UMOD]	= { ALU_OP0_NOP, tgsi_umod},
	/* Integer multiplies use cayman_mul_int_instr rather than the
	 * tgsi_op2_trans path used on other chips. */
12344	[TGSI_OPCODE_UMUL]	= { ALU_OP2_MULLO_INT, cayman_mul_int_instr},
12345	[TGSI_OPCODE_USEQ]	= { ALU_OP2_SETE_INT, tgsi_op2},
12346	[TGSI_OPCODE_USGE]	= { ALU_OP2_SETGE_UINT, tgsi_op2},
12347	[TGSI_OPCODE_USHR]	= { ALU_OP2_LSHR_INT, tgsi_op2},
12348	[TGSI_OPCODE_USLT]	= { ALU_OP2_SETGT_UINT, tgsi_op2_swap},
12349	[TGSI_OPCODE_USNE]	= { ALU_OP2_SETNE_INT, tgsi_op2},
12350	[TGSI_OPCODE_SWITCH]	= { ALU_OP0_NOP, tgsi_unsupported},
12351	[TGSI_OPCODE_CASE]	= { ALU_OP0_NOP, tgsi_unsupported},
12352	[TGSI_OPCODE_DEFAULT]	= { ALU_OP0_NOP, tgsi_unsupported},
12353	[TGSI_OPCODE_ENDSWITCH]	= { ALU_OP0_NOP, tgsi_unsupported},
12354	[TGSI_OPCODE_SAMPLE]	= { 0, tgsi_unsupported},
12355	[TGSI_OPCODE_SAMPLE_I]	= { 0, tgsi_unsupported},
12356	[TGSI_OPCODE_SAMPLE_I_MS]	= { 0, tgsi_unsupported},
12357	[TGSI_OPCODE_SAMPLE_B]	= { 0, tgsi_unsupported},
12358	[TGSI_OPCODE_SAMPLE_C]	= { 0, tgsi_unsupported},
12359	[TGSI_OPCODE_SAMPLE_C_LZ]	= { 0, tgsi_unsupported},
12360	[TGSI_OPCODE_SAMPLE_D]	= { 0, tgsi_unsupported},
12361	[TGSI_OPCODE_SAMPLE_L]	= { 0, tgsi_unsupported},
12362	[TGSI_OPCODE_GATHER4]	= { 0, tgsi_unsupported},
12363	[TGSI_OPCODE_SVIEWINFO]	= { 0, tgsi_unsupported},
12364	[TGSI_OPCODE_SAMPLE_POS]	= { 0, tgsi_unsupported},
12365	[TGSI_OPCODE_SAMPLE_INFO]	= { 0, tgsi_unsupported},
12366	[TGSI_OPCODE_UARL]	= { ALU_OP1_MOVA_INT, tgsi_eg_arl},
12367	[TGSI_OPCODE_UCMP]	= { ALU_OP0_NOP, tgsi_ucmp},
12368	[TGSI_OPCODE_IABS]	= { 0, tgsi_iabs},
12369	[TGSI_OPCODE_ISSG]	= { 0, tgsi_issg},
12370	[TGSI_OPCODE_LOAD]	= { ALU_OP0_NOP, tgsi_load},
12371	[TGSI_OPCODE_STORE]	= { ALU_OP0_NOP, tgsi_store},
12372	[163]	= { ALU_OP0_NOP, tgsi_unsupported},
12373	[164]	= { ALU_OP0_NOP, tgsi_unsupported},
12374	[165]	= { ALU_OP0_NOP, tgsi_unsupported},
12375	[TGSI_OPCODE_BARRIER]	= { ALU_OP0_GROUP_BARRIER, tgsi_barrier},
	/* Atomics map to RAT (Random Access Target) *_RTN instructions,
	 * all emitted through tgsi_atomic_op. */
12376	[TGSI_OPCODE_ATOMUADD]	= { V_RAT_INST_ADD_RTN, tgsi_atomic_op},
12377	[TGSI_OPCODE_ATOMXCHG]	= { V_RAT_INST_XCHG_RTN, tgsi_atomic_op},
12378	[TGSI_OPCODE_ATOMCAS]	= { V_RAT_INST_CMPXCHG_INT_RTN, tgsi_atomic_op},
12379	[TGSI_OPCODE_ATOMAND]	= { V_RAT_INST_AND_RTN, tgsi_atomic_op},
12380	[TGSI_OPCODE_ATOMOR]	= { V_RAT_INST_OR_RTN, tgsi_atomic_op},
12381	[TGSI_OPCODE_ATOMXOR]	= { V_RAT_INST_XOR_RTN, tgsi_atomic_op},
12382	[TGSI_OPCODE_ATOMUMIN]	= { V_RAT_INST_MIN_UINT_RTN, tgsi_atomic_op},
12383	[TGSI_OPCODE_ATOMUMAX]	= { V_RAT_INST_MAX_UINT_RTN, tgsi_atomic_op},
12384	[TGSI_OPCODE_ATOMIMIN]	= { V_RAT_INST_MIN_INT_RTN, tgsi_atomic_op},
12385	[TGSI_OPCODE_ATOMIMAX]	= { V_RAT_INST_MAX_INT_RTN, tgsi_atomic_op},
12386	[TGSI_OPCODE_TEX2]	= { FETCH_OP_SAMPLE, tgsi_tex},
12387	[TGSI_OPCODE_TXB2]	= { FETCH_OP_SAMPLE_LB, tgsi_tex},
12388	[TGSI_OPCODE_TXL2]	= { FETCH_OP_SAMPLE_L, tgsi_tex},
12389	[TGSI_OPCODE_IMUL_HI]	= { ALU_OP2_MULHI_INT, cayman_mul_int_instr},
12390	[TGSI_OPCODE_UMUL_HI]	= { ALU_OP2_MULHI_UINT, cayman_mul_int_instr},
12391	[TGSI_OPCODE_TG4]	= { FETCH_OP_GATHER4, tgsi_tex},
12392	[TGSI_OPCODE_LODQ]	= { FETCH_OP_GET_LOD, tgsi_tex},
12393	[TGSI_OPCODE_IBFE]	= { ALU_OP3_BFE_INT, tgsi_bfe},
12394	[TGSI_OPCODE_UBFE]	= { ALU_OP3_BFE_UINT, tgsi_bfe},
12395	[TGSI_OPCODE_BFI]	= { ALU_OP0_NOP, tgsi_bfi},
12396	[TGSI_OPCODE_BREV]	= { ALU_OP1_BFREV_INT, tgsi_op2},
12397	[TGSI_OPCODE_POPC]	= { ALU_OP1_BCNT_INT, tgsi_op2},
12398	[TGSI_OPCODE_LSB]	= { ALU_OP1_FFBL_INT, tgsi_op2},
12399	[TGSI_OPCODE_IMSB]	= { ALU_OP1_FFBH_INT, tgsi_msb},
12400	[TGSI_OPCODE_UMSB]	= { ALU_OP1_FFBH_UINT, tgsi_msb},
12401	[TGSI_OPCODE_INTERP_CENTROID]	= { ALU_OP0_NOP, tgsi_interp_egcm},
12402	[TGSI_OPCODE_INTERP_SAMPLE]	= { ALU_OP0_NOP, tgsi_interp_egcm},
12403	[TGSI_OPCODE_INTERP_OFFSET]	= { ALU_OP0_NOP, tgsi_interp_egcm},
	/* Double-precision (64-bit) ops; operands occupy register pairs,
	 * handled by the *_64 / egcm_* / cayman_*_double emitters. */
12404	[TGSI_OPCODE_F2D]	= { ALU_OP1_FLT32_TO_FLT64, tgsi_op2_64},
12405	[TGSI_OPCODE_D2F]	= { ALU_OP1_FLT64_TO_FLT32, tgsi_op2_64_single_dest},
12406	[TGSI_OPCODE_DABS]	= { ALU_OP1_MOV, tgsi_op2_64},
12407	[TGSI_OPCODE_DNEG]	= { ALU_OP2_ADD_64, tgsi_dneg},
12408	[TGSI_OPCODE_DADD]	= { ALU_OP2_ADD_64, tgsi_op2_64},
12409	[TGSI_OPCODE_DMUL]	= { ALU_OP2_MUL_64, cayman_mul_double_instr},
12410	[TGSI_OPCODE_DDIV]	= { 0, cayman_ddiv_instr },
12411	[TGSI_OPCODE_DMAX]	= { ALU_OP2_MAX_64, tgsi_op2_64},
12412	[TGSI_OPCODE_DMIN]	= { ALU_OP2_MIN_64, tgsi_op2_64},
12413	[TGSI_OPCODE_DSLT]	= { ALU_OP2_SETGT_64, tgsi_op2_64_single_dest_s},
12414	[TGSI_OPCODE_DSGE]	= { ALU_OP2_SETGE_64, tgsi_op2_64_single_dest},
12415	[TGSI_OPCODE_DSEQ]	= { ALU_OP2_SETE_64, tgsi_op2_64_single_dest},
12416	[TGSI_OPCODE_DSNE]	= { ALU_OP2_SETNE_64, tgsi_op2_64_single_dest},
12417	[TGSI_OPCODE_DRCP]	= { ALU_OP2_RECIP_64, cayman_emit_double_instr},
12418	[TGSI_OPCODE_DSQRT]	= { ALU_OP2_SQRT_64, cayman_emit_double_instr},
12419	[TGSI_OPCODE_DMAD]	= { ALU_OP3_FMA_64, tgsi_op3_64},
12420	[TGSI_OPCODE_DFMA]	= { ALU_OP3_FMA_64, tgsi_op3_64},
12421	[TGSI_OPCODE_DFRAC]	= { ALU_OP1_FRACT_64, tgsi_op2_64},
12422	[TGSI_OPCODE_DLDEXP]	= { ALU_OP2_LDEXP_64, tgsi_op2_64},
12423	[TGSI_OPCODE_DFRACEXP]	= { ALU_OP1_FREXP_64, tgsi_dfracexp},
12424	[TGSI_OPCODE_D2I]	= { ALU_OP1_FLT_TO_INT, egcm_double_to_int},
12425	[TGSI_OPCODE_I2D]	= { ALU_OP1_INT_TO_FLT, egcm_int_to_double},
12426	[TGSI_OPCODE_D2U]	= { ALU_OP1_FLT_TO_UINT, egcm_double_to_int},
12427	[TGSI_OPCODE_U2D]	= { ALU_OP1_UINT_TO_FLT, egcm_int_to_double},
12428	[TGSI_OPCODE_DRSQ]	= { ALU_OP2_RECIPSQRT_64, cayman_emit_double_instr},
	/* 64-bit integer ops are synthesized entirely in the egcm_u64*/
	/* egcm_i64* callbacks, hence the NOP ISA opcode. */
12429	[TGSI_OPCODE_U64SNE]    = { ALU_OP0_NOP, egcm_u64sne },
12430	[TGSI_OPCODE_U64ADD]    = { ALU_OP0_NOP, egcm_u64add },
12431	[TGSI_OPCODE_U64MUL]    = { ALU_OP0_NOP, egcm_u64mul },
12432	[TGSI_OPCODE_U64DIV]    = { ALU_OP0_NOP, egcm_u64div },
12433	[TGSI_OPCODE_I64NEG]    = { ALU_OP0_NOP, egcm_i64neg },
12434	[TGSI_OPCODE_LAST]	= { ALU_OP0_NOP, tgsi_unsupported},
12435};
12436