1/*
2 * Copyright 2009 Nicolai Hähnle <nhaehnle@gmail.com>
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * on the rights to use, copy, modify, merge, publish, distribute, sub
8 * license, and/or sell copies of the Software, and to permit persons to whom
9 * the Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
19 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
20 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
21 * USE OR OTHER DEALINGS IN THE SOFTWARE. */
22
23#include "radeon_compiler.h"
24
25#include <stdbool.h>
26#include <stdio.h>
27
28#include "r300_reg.h"
29
30#include "radeon_compiler_util.h"
31#include "radeon_dataflow.h"
32#include "radeon_program.h"
33#include "radeon_program_alu.h"
34#include "radeon_swizzle.h"
35#include "radeon_emulate_branches.h"
36#include "radeon_remove_constants.h"
37
38#include "util/compiler.h"
39
/*
 * Build a PVS source operand that reuses the register index, register class
 * and relative-addressing bit of source operand `x` of the instruction `vpi`,
 * but selects swizzle `y` for all four components and applies no negation.
 * Used to turn an already-setup, valid source into a constant ZERO or ONE
 * source.
 *
 * Both `vp` and `vpi` must be in scope at the expansion site.
 */
#define __CONST(x, y) \
	(PVS_SRC_OPERAND(t_src_index(vp, &vpi->SrcReg[x]), \
			 t_swizzle(y), t_swizzle(y), \
			 t_swizzle(y), t_swizzle(y), \
			 t_src_class(vpi->SrcReg[x].File), \
			 RC_MASK_NONE) | \
	 (vpi->SrcReg[x].RelAddr << 4))
52
53
54static unsigned long t_dst_mask(unsigned int mask)
55{
56	/* RC_MASK_* is equivalent to VSF_FLAG_* */
57	return mask & RC_MASK_XYZW;
58}
59
60static unsigned long t_dst_class(rc_register_file file)
61{
62	switch (file) {
63	default:
64		fprintf(stderr, "%s: Bad register file %i\n", __FUNCTION__, file);
65		FALLTHROUGH;
66	case RC_FILE_TEMPORARY:
67		return PVS_DST_REG_TEMPORARY;
68	case RC_FILE_OUTPUT:
69		return PVS_DST_REG_OUT;
70	case RC_FILE_ADDRESS:
71		return PVS_DST_REG_A0;
72	}
73}
74
75static unsigned long t_dst_index(struct r300_vertex_program_code *vp,
76				 struct rc_dst_register *dst)
77{
78	if (dst->File == RC_FILE_OUTPUT)
79		return vp->outputs[dst->Index];
80
81	return dst->Index;
82}
83
84static unsigned long t_src_class(rc_register_file file)
85{
86	switch (file) {
87	default:
88		fprintf(stderr, "%s: Bad register file %i\n", __FUNCTION__, file);
89		FALLTHROUGH;
90	case RC_FILE_NONE:
91	case RC_FILE_TEMPORARY:
92		return PVS_SRC_REG_TEMPORARY;
93	case RC_FILE_INPUT:
94		return PVS_SRC_REG_INPUT;
95	case RC_FILE_CONSTANT:
96		return PVS_SRC_REG_CONSTANT;
97	}
98}
99
100static int t_src_conflict(struct rc_src_register a, struct rc_src_register b)
101{
102	unsigned long aclass = t_src_class(a.File);
103	unsigned long bclass = t_src_class(b.File);
104
105	if (aclass != bclass)
106		return 0;
107	if (aclass == PVS_SRC_REG_TEMPORARY)
108		return 0;
109
110	if (a.RelAddr || b.RelAddr)
111		return 1;
112	if (a.Index != b.Index)
113		return 1;
114
115	return 0;
116}
117
/**
 * Translate a compiler swizzle selector into the hardware encoding.
 *
 * This is an identity mapping: the RC_SWIZZLE_* values are all identical to
 * VSF_IN_COMPONENT_*.
 */
static inline unsigned long t_swizzle(unsigned int swizzle)
{
	return (unsigned long)swizzle;
}
123
124static unsigned long t_src_index(struct r300_vertex_program_code *vp,
125				 struct rc_src_register *src)
126{
127	if (src->File == RC_FILE_INPUT) {
128		assert(vp->inputs[src->Index] != -1);
129		return vp->inputs[src->Index];
130	} else {
131		if (src->Index < 0) {
132			fprintf(stderr,
133				"negative offsets for indirect addressing do not work.\n");
134			return 0;
135		}
136		return src->Index;
137	}
138}
139
140/* these two functions should probably be merged... */
141
142static unsigned long t_src(struct r300_vertex_program_code *vp,
143			   struct rc_src_register *src)
144{
145	/* src->Negate uses the RC_MASK_ flags from program_instruction.h,
146	 * which equal our VSF_FLAGS_ values, so it's safe to just pass it here.
147	 */
148	return PVS_SRC_OPERAND(t_src_index(vp, src),
149			       t_swizzle(GET_SWZ(src->Swizzle, 0)),
150			       t_swizzle(GET_SWZ(src->Swizzle, 1)),
151			       t_swizzle(GET_SWZ(src->Swizzle, 2)),
152			       t_swizzle(GET_SWZ(src->Swizzle, 3)),
153			       t_src_class(src->File),
154			       src->Negate) |
155	       (src->RelAddr << 4) | (src->Abs << 3);
156}
157
158static unsigned long t_src_scalar(struct r300_vertex_program_code *vp,
159				  struct rc_src_register *src)
160{
161	/* src->Negate uses the RC_MASK_ flags from program_instruction.h,
162	 * which equal our VSF_FLAGS_ values, so it's safe to just pass it here.
163	 */
164	unsigned int swz = rc_get_scalar_src_swz(src->Swizzle);
165
166	return PVS_SRC_OPERAND(t_src_index(vp, src),
167			       t_swizzle(swz),
168			       t_swizzle(swz),
169			       t_swizzle(swz),
170			       t_swizzle(swz),
171			       t_src_class(src->File),
172			       src->Negate ? RC_MASK_XYZW : RC_MASK_NONE) |
173	       (src->RelAddr << 4) | (src->Abs << 3);
174}
175
176static int valid_dst(struct r300_vertex_program_code *vp,
177			   struct rc_dst_register *dst)
178{
179	if (dst->File == RC_FILE_OUTPUT && vp->outputs[dst->Index] == -1) {
180		return 0;
181	} else if (dst->File == RC_FILE_ADDRESS) {
182		assert(dst->Index == 0);
183	}
184
185	return 1;
186}
187
188static void ei_vector1(struct r300_vertex_program_code *vp,
189				unsigned int hw_opcode,
190				struct rc_sub_instruction *vpi,
191				unsigned int * inst)
192{
193	inst[0] = PVS_OP_DST_OPERAND(hw_opcode,
194				     0,
195				     0,
196				     t_dst_index(vp, &vpi->DstReg),
197				     t_dst_mask(vpi->DstReg.WriteMask),
198				     t_dst_class(vpi->DstReg.File),
199                                     vpi->SaturateMode == RC_SATURATE_ZERO_ONE);
200	inst[1] = t_src(vp, &vpi->SrcReg[0]);
201	inst[2] = __CONST(0, RC_SWIZZLE_ZERO);
202	inst[3] = __CONST(0, RC_SWIZZLE_ZERO);
203}
204
205static void ei_vector2(struct r300_vertex_program_code *vp,
206				unsigned int hw_opcode,
207				struct rc_sub_instruction *vpi,
208				unsigned int * inst)
209{
210	inst[0] = PVS_OP_DST_OPERAND(hw_opcode,
211				     0,
212				     0,
213				     t_dst_index(vp, &vpi->DstReg),
214				     t_dst_mask(vpi->DstReg.WriteMask),
215				     t_dst_class(vpi->DstReg.File),
216                                     vpi->SaturateMode == RC_SATURATE_ZERO_ONE);
217	inst[1] = t_src(vp, &vpi->SrcReg[0]);
218	inst[2] = t_src(vp, &vpi->SrcReg[1]);
219	inst[3] = __CONST(1, RC_SWIZZLE_ZERO);
220}
221
222static void ei_math1(struct r300_vertex_program_code *vp,
223				unsigned int hw_opcode,
224				struct rc_sub_instruction *vpi,
225				unsigned int * inst)
226{
227	inst[0] = PVS_OP_DST_OPERAND(hw_opcode,
228				     1,
229				     0,
230				     t_dst_index(vp, &vpi->DstReg),
231				     t_dst_mask(vpi->DstReg.WriteMask),
232				     t_dst_class(vpi->DstReg.File),
233                                     vpi->SaturateMode == RC_SATURATE_ZERO_ONE);
234	inst[1] = t_src_scalar(vp, &vpi->SrcReg[0]);
235	inst[2] = __CONST(0, RC_SWIZZLE_ZERO);
236	inst[3] = __CONST(0, RC_SWIZZLE_ZERO);
237}
238
239static void ei_lit(struct r300_vertex_program_code *vp,
240				      struct rc_sub_instruction *vpi,
241				      unsigned int * inst)
242{
243	//LIT TMP 1.Y Z TMP 1{} {X W Z Y} TMP 1{} {Y W Z X} TMP 1{} {Y X Z W}
244
245	inst[0] = PVS_OP_DST_OPERAND(ME_LIGHT_COEFF_DX,
246				     1,
247				     0,
248				     t_dst_index(vp, &vpi->DstReg),
249				     t_dst_mask(vpi->DstReg.WriteMask),
250				     t_dst_class(vpi->DstReg.File),
251                                     vpi->SaturateMode == RC_SATURATE_ZERO_ONE);
252	/* NOTE: Users swizzling might not work. */
253	inst[1] = PVS_SRC_OPERAND(t_src_index(vp, &vpi->SrcReg[0]), t_swizzle(GET_SWZ(vpi->SrcReg[0].Swizzle, 0)),	// X
254				  t_swizzle(GET_SWZ(vpi->SrcReg[0].Swizzle, 3)),	// W
255				  PVS_SRC_SELECT_FORCE_0,	// Z
256				  t_swizzle(GET_SWZ(vpi->SrcReg[0].Swizzle, 1)),	// Y
257				  t_src_class(vpi->SrcReg[0].File),
258				  vpi->SrcReg[0].Negate ? RC_MASK_XYZW : RC_MASK_NONE) |
259	    (vpi->SrcReg[0].RelAddr << 4);
260	inst[2] = PVS_SRC_OPERAND(t_src_index(vp, &vpi->SrcReg[0]), t_swizzle(GET_SWZ(vpi->SrcReg[0].Swizzle, 1)),	// Y
261				  t_swizzle(GET_SWZ(vpi->SrcReg[0].Swizzle, 3)),	// W
262				  PVS_SRC_SELECT_FORCE_0,	// Z
263				  t_swizzle(GET_SWZ(vpi->SrcReg[0].Swizzle, 0)),	// X
264				  t_src_class(vpi->SrcReg[0].File),
265				  vpi->SrcReg[0].Negate ? RC_MASK_XYZW : RC_MASK_NONE) |
266	    (vpi->SrcReg[0].RelAddr << 4);
267	inst[3] = PVS_SRC_OPERAND(t_src_index(vp, &vpi->SrcReg[0]), t_swizzle(GET_SWZ(vpi->SrcReg[0].Swizzle, 1)),	// Y
268				  t_swizzle(GET_SWZ(vpi->SrcReg[0].Swizzle, 0)),	// X
269				  PVS_SRC_SELECT_FORCE_0,	// Z
270				  t_swizzle(GET_SWZ(vpi->SrcReg[0].Swizzle, 3)),	// W
271				  t_src_class(vpi->SrcReg[0].File),
272				  vpi->SrcReg[0].Negate ? RC_MASK_XYZW : RC_MASK_NONE) |
273	    (vpi->SrcReg[0].RelAddr << 4);
274}
275
276static void ei_mad(struct r300_vertex_program_code *vp,
277				      struct rc_sub_instruction *vpi,
278				      unsigned int * inst)
279{
280	unsigned int i;
281	/* Remarks about hardware limitations of MAD
282	 * (please preserve this comment, as this information is _NOT_
283	 * in the documentation provided by AMD).
284	 *
285	 * As described in the documentation, MAD with three unique temporary
286	 * source registers requires the use of the macro version.
287	 *
288	 * However (and this is not mentioned in the documentation), apparently
289	 * the macro version is _NOT_ a full superset of the normal version.
290	 * In particular, the macro version does not always work when relative
291	 * addressing is used in the source operands.
292	 *
293	 * This limitation caused incorrect rendering in Sauerbraten's OpenGL
294	 * assembly shader path when using medium quality animations
295	 * (i.e. animations with matrix blending instead of quaternion blending).
296	 *
297	 * Unfortunately, I (nha) have been unable to extract a Piglit regression
298	 * test for this issue - for some reason, it is possible to have vertex
299	 * programs whose prefix is *exactly* the same as the prefix of the
300	 * offending program in Sauerbraten up to the offending instruction
301	 * without causing any trouble.
302	 *
303	 * Bottom line: Only use the macro version only when really necessary;
304	 * according to AMD docs, this should improve performance by one clock
305	 * as a nice side bonus.
306	 */
307	if (vpi->SrcReg[0].File == RC_FILE_TEMPORARY &&
308	    vpi->SrcReg[1].File == RC_FILE_TEMPORARY &&
309	    vpi->SrcReg[2].File == RC_FILE_TEMPORARY &&
310	    vpi->SrcReg[0].Index != vpi->SrcReg[1].Index &&
311	    vpi->SrcReg[0].Index != vpi->SrcReg[2].Index &&
312	    vpi->SrcReg[1].Index != vpi->SrcReg[2].Index) {
313		inst[0] = PVS_OP_DST_OPERAND(PVS_MACRO_OP_2CLK_MADD,
314				0,
315				1,
316				t_dst_index(vp, &vpi->DstReg),
317				t_dst_mask(vpi->DstReg.WriteMask),
318				t_dst_class(vpi->DstReg.File),
319                                vpi->SaturateMode == RC_SATURATE_ZERO_ONE);
320	} else {
321		inst[0] = PVS_OP_DST_OPERAND(VE_MULTIPLY_ADD,
322				0,
323				0,
324				t_dst_index(vp, &vpi->DstReg),
325				t_dst_mask(vpi->DstReg.WriteMask),
326				t_dst_class(vpi->DstReg.File),
327                                vpi->SaturateMode == RC_SATURATE_ZERO_ONE);
328
329		/* Arguments with constant swizzles still count as a unique
330		 * temporary, so we should make sure these arguments share a
331		 * register index with one of the other arguments. */
332		for (i = 0; i < 3; i++) {
333			unsigned int j;
334			if (vpi->SrcReg[i].File != RC_FILE_NONE)
335				continue;
336
337			for (j = 0; j < 3; j++) {
338				if (i != j) {
339					vpi->SrcReg[i].Index =
340						vpi->SrcReg[j].Index;
341					break;
342				}
343			}
344		}
345	}
346	inst[1] = t_src(vp, &vpi->SrcReg[0]);
347	inst[2] = t_src(vp, &vpi->SrcReg[1]);
348	inst[3] = t_src(vp, &vpi->SrcReg[2]);
349}
350
351static void ei_pow(struct r300_vertex_program_code *vp,
352				      struct rc_sub_instruction *vpi,
353				      unsigned int * inst)
354{
355	inst[0] = PVS_OP_DST_OPERAND(ME_POWER_FUNC_FF,
356				     1,
357				     0,
358				     t_dst_index(vp, &vpi->DstReg),
359				     t_dst_mask(vpi->DstReg.WriteMask),
360				     t_dst_class(vpi->DstReg.File),
361                                     vpi->SaturateMode == RC_SATURATE_ZERO_ONE);
362	inst[1] = t_src_scalar(vp, &vpi->SrcReg[0]);
363	inst[2] = __CONST(0, RC_SWIZZLE_ZERO);
364	inst[3] = t_src_scalar(vp, &vpi->SrcReg[1]);
365}
366
367static void translate_vertex_program(struct radeon_compiler *c, void *user)
368{
369	struct r300_vertex_program_compiler *compiler = (struct r300_vertex_program_compiler*)c;
370	struct rc_instruction *rci;
371
372	unsigned loops[R500_PVS_MAX_LOOP_DEPTH] = {};
373	unsigned loop_depth = 0;
374	bool last_input_read_at_loop_end = false;
375	bool last_pos_write_at_loop_end = false;
376
377	compiler->code->pos_end = 0;	/* Not supported yet */
378	compiler->code->length = 0;
379	compiler->code->num_temporaries = 0;
380	compiler->code->last_input_read = 0;
381	compiler->code->last_pos_write = 0;
382
383	compiler->SetHwInputOutput(compiler);
384
385	for(rci = compiler->Base.Program.Instructions.Next; rci != &compiler->Base.Program.Instructions; rci = rci->Next) {
386		struct rc_sub_instruction *vpi = &rci->U.I;
387		unsigned int *inst = compiler->code->body.d + compiler->code->length;
388		const struct rc_opcode_info *info = rc_get_opcode_info(vpi->Opcode);
389
390		/* Skip instructions writing to non-existing destination */
391		if (!valid_dst(compiler->code, &vpi->DstReg))
392			continue;
393
394		if (info->HasDstReg) {
395			/* Neither is Saturate. */
396			if (vpi->SaturateMode != RC_SATURATE_NONE && !c->is_r500) {
397				rc_error(&compiler->Base, "Vertex program does not support the Saturate "
398					 "modifier (yet).\n");
399			}
400		}
401
402		if (compiler->code->length >= c->max_alu_insts * 4) {
403			rc_error(&compiler->Base, "Vertex program has too many instructions\n");
404			return;
405		}
406
407		assert(compiler->Base.is_r500 ||
408		       (vpi->Opcode != RC_OPCODE_SEQ &&
409			vpi->Opcode != RC_OPCODE_SNE));
410
411		switch (vpi->Opcode) {
412		case RC_OPCODE_ADD: ei_vector2(compiler->code, VE_ADD, vpi, inst); break;
413		case RC_OPCODE_ARL: ei_vector1(compiler->code, VE_FLT2FIX_DX, vpi, inst); break;
414		case RC_OPCODE_ARR: ei_vector1(compiler->code, VE_FLT2FIX_DX_RND, vpi, inst); break;
415		case RC_OPCODE_COS: ei_math1(compiler->code, ME_COS, vpi, inst); break;
416		case RC_OPCODE_DP4: ei_vector2(compiler->code, VE_DOT_PRODUCT, vpi, inst); break;
417		case RC_OPCODE_DST: ei_vector2(compiler->code, VE_DISTANCE_VECTOR, vpi, inst); break;
418		case RC_OPCODE_EX2: ei_math1(compiler->code, ME_EXP_BASE2_FULL_DX, vpi, inst); break;
419		case RC_OPCODE_EXP: ei_math1(compiler->code, ME_EXP_BASE2_DX, vpi, inst); break;
420		case RC_OPCODE_FRC: ei_vector1(compiler->code, VE_FRACTION, vpi, inst); break;
421		case RC_OPCODE_LG2: ei_math1(compiler->code, ME_LOG_BASE2_FULL_DX, vpi, inst); break;
422		case RC_OPCODE_LIT: ei_lit(compiler->code, vpi, inst); break;
423		case RC_OPCODE_LOG: ei_math1(compiler->code, ME_LOG_BASE2_DX, vpi, inst); break;
424		case RC_OPCODE_MAD: ei_mad(compiler->code, vpi, inst); break;
425		case RC_OPCODE_MAX: ei_vector2(compiler->code, VE_MAXIMUM, vpi, inst); break;
426		case RC_OPCODE_MIN: ei_vector2(compiler->code, VE_MINIMUM, vpi, inst); break;
427		case RC_OPCODE_MOV: ei_vector1(compiler->code, VE_ADD, vpi, inst); break;
428		case RC_OPCODE_MUL: ei_vector2(compiler->code, VE_MULTIPLY, vpi, inst); break;
429		case RC_OPCODE_POW: ei_pow(compiler->code, vpi, inst); break;
430		case RC_OPCODE_RCP: ei_math1(compiler->code, ME_RECIP_DX, vpi, inst); break;
431		case RC_OPCODE_RSQ: ei_math1(compiler->code, ME_RECIP_SQRT_DX, vpi, inst); break;
432		case RC_OPCODE_SEQ: ei_vector2(compiler->code, VE_SET_EQUAL, vpi, inst); break;
433		case RC_OPCODE_SGE: ei_vector2(compiler->code, VE_SET_GREATER_THAN_EQUAL, vpi, inst); break;
434		case RC_OPCODE_SIN: ei_math1(compiler->code, ME_SIN, vpi, inst); break;
435		case RC_OPCODE_SLT: ei_vector2(compiler->code, VE_SET_LESS_THAN, vpi, inst); break;
436		case RC_OPCODE_SNE: ei_vector2(compiler->code, VE_SET_NOT_EQUAL, vpi, inst); break;
437		case RC_OPCODE_BGNLOOP:
438		{
439			if ((!compiler->Base.is_r500
440				&& loop_depth >= R300_VS_MAX_LOOP_DEPTH)
441				|| loop_depth >= R500_PVS_MAX_LOOP_DEPTH) {
442				rc_error(&compiler->Base,
443						"Loops are nested too deep.");
444				return;
445			}
446			loops[loop_depth++] = ((compiler->code->length)/ 4) + 1;
447			break;
448		}
449		case RC_OPCODE_ENDLOOP:
450		{
451			unsigned int act_addr;
452			unsigned int last_addr;
453			unsigned int ret_addr;
454
455			if (loop_depth == 1 && last_input_read_at_loop_end) {
456				compiler->code->last_input_read = compiler->code->length / 4;
457				last_input_read_at_loop_end = false;
458			}
459			if (loop_depth == 1 && last_pos_write_at_loop_end) {
460				compiler->code->last_pos_write = compiler->code->length / 4;
461				last_pos_write_at_loop_end = false;
462			}
463
464			ret_addr = loops[--loop_depth];
465			act_addr = ret_addr - 1;
466			last_addr = (compiler->code->length / 4) - 1;
467
468			if (loop_depth >= R300_VS_MAX_FC_OPS) {
469				rc_error(&compiler->Base,
470					"Too many flow control instructions.");
471				return;
472			}
473			if (compiler->Base.is_r500) {
474				compiler->code->fc_op_addrs.r500
475					[compiler->code->num_fc_ops].lw =
476					R500_PVS_FC_ACT_ADRS(act_addr)
477					| R500_PVS_FC_LOOP_CNT_JMP_INST(0x00ff)
478					;
479				compiler->code->fc_op_addrs.r500
480					[compiler->code->num_fc_ops].uw =
481					R500_PVS_FC_LAST_INST(last_addr)
482					| R500_PVS_FC_RTN_INST(ret_addr)
483					;
484			} else {
485				compiler->code->fc_op_addrs.r300
486					[compiler->code->num_fc_ops] =
487					R300_PVS_FC_ACT_ADRS(act_addr)
488					| R300_PVS_FC_LOOP_CNT_JMP_INST(0xff)
489					| R300_PVS_FC_LAST_INST(last_addr)
490					| R300_PVS_FC_RTN_INST(ret_addr)
491					;
492			}
493			compiler->code->fc_loop_index[compiler->code->num_fc_ops] =
494				R300_PVS_FC_LOOP_INIT_VAL(0x0)
495				| R300_PVS_FC_LOOP_STEP_VAL(0x1)
496				;
497			compiler->code->fc_ops |= R300_VAP_PVS_FC_OPC_LOOP(
498						compiler->code->num_fc_ops);
499			compiler->code->num_fc_ops++;
500
501			break;
502		}
503
504		case RC_ME_PRED_SET_CLR:
505			ei_math1(compiler->code, ME_PRED_SET_CLR, vpi, inst);
506			break;
507
508		case RC_ME_PRED_SET_INV:
509			ei_math1(compiler->code, ME_PRED_SET_INV, vpi, inst);
510			break;
511
512		case RC_ME_PRED_SET_POP:
513			ei_math1(compiler->code, ME_PRED_SET_POP, vpi, inst);
514			break;
515
516		case RC_ME_PRED_SET_RESTORE:
517			ei_math1(compiler->code, ME_PRED_SET_RESTORE, vpi, inst);
518			break;
519
520		case RC_ME_PRED_SEQ:
521			ei_math1(compiler->code, ME_PRED_SET_EQ, vpi, inst);
522			break;
523
524		case RC_ME_PRED_SNEQ:
525			ei_math1(compiler->code, ME_PRED_SET_NEQ, vpi, inst);
526			break;
527
528		case RC_VE_PRED_SNEQ_PUSH:
529			ei_vector2(compiler->code, VE_PRED_SET_NEQ_PUSH,
530								vpi, inst);
531			break;
532
533		default:
534			rc_error(&compiler->Base, "Unknown opcode %s\n", info->Name);
535			return;
536		}
537
538		if (vpi->DstReg.Pred != RC_PRED_DISABLED) {
539			inst[0] |= (PVS_DST_PRED_ENABLE_MASK
540						<< PVS_DST_PRED_ENABLE_SHIFT);
541			if (vpi->DstReg.Pred == RC_PRED_SET) {
542				inst[0] |= (PVS_DST_PRED_SENSE_MASK
543						<< PVS_DST_PRED_SENSE_SHIFT);
544			}
545		}
546
547		/* Update the number of temporaries. */
548		if (info->HasDstReg && vpi->DstReg.File == RC_FILE_TEMPORARY &&
549		    vpi->DstReg.Index >= compiler->code->num_temporaries)
550			compiler->code->num_temporaries = vpi->DstReg.Index + 1;
551
552		/* last instruction that writes position */
553		if (info->HasDstReg && vpi->DstReg.File == RC_FILE_OUTPUT &&
554		    t_dst_index(compiler->code, &vpi->DstReg) == 0) {
555			if (loop_depth == 0)
556				compiler->code->last_pos_write = compiler->code->length / 4;
557			else
558				last_pos_write_at_loop_end = true;
559		}
560
561		for (unsigned i = 0; i < info->NumSrcRegs; i++) {
562			if (vpi->SrcReg[i].File == RC_FILE_TEMPORARY &&
563			    vpi->SrcReg[i].Index >= compiler->code->num_temporaries)
564				compiler->code->num_temporaries = vpi->SrcReg[i].Index + 1;
565			if (vpi->SrcReg[i].File == RC_FILE_INPUT) {
566				if (loop_depth == 0)
567					compiler->code->last_input_read = compiler->code->length / 4;
568				else
569					last_input_read_at_loop_end = true;
570			}
571
572		}
573
574
575		if (compiler->code->num_temporaries > compiler->Base.max_temp_regs) {
576			rc_error(&compiler->Base, "Too many temporaries.\n");
577			return;
578		}
579
580		compiler->code->length += 4;
581
582		if (compiler->Base.Error)
583			return;
584	}
585}
586
/* Per-original-temporary bookkeeping used by the register allocator. */
struct temporary_allocation {
	/* Non-zero once a hardware temporary has been assigned. */
	unsigned int Allocated:1;
	/* Index of the assigned hardware temporary. */
	unsigned int HwTemp:15;
	/* Instruction recorded as the last read of this temporary (the
	 * matching ENDLOOP for reads inside a loop); NULL if never read. */
	struct rc_instruction *LastRead;
};
592
593static int get_reg(struct radeon_compiler *c, struct temporary_allocation *ta, bool *hwtemps,
594                   unsigned int orig)
595{
596    if (!ta[orig].Allocated) {
597        int j;
598        for (j = 0; j < c->max_temp_regs; ++j)
599        {
600            if (!hwtemps[j])
601                break;
602        }
603        ta[orig].Allocated = 1;
604        ta[orig].HwTemp = j;
605        hwtemps[ta[orig].HwTemp] = true;
606    }
607
608    return ta[orig].HwTemp;
609}
610
611static void allocate_temporary_registers(struct radeon_compiler *c, void *user)
612{
613	struct r300_vertex_program_compiler *compiler = (struct r300_vertex_program_compiler*)c;
614	struct rc_instruction *inst;
615	struct rc_instruction *end_loop = NULL;
616	unsigned int num_orig_temps = 0;
617	bool hwtemps[RC_REGISTER_MAX_INDEX];
618	struct temporary_allocation * ta;
619	unsigned int i;
620
621	memset(hwtemps, 0, sizeof(hwtemps));
622
623	rc_recompute_ips(c);
624
625	/* Pass 1: Count original temporaries. */
626	for(inst = compiler->Base.Program.Instructions.Next; inst != &compiler->Base.Program.Instructions; inst = inst->Next) {
627		const struct rc_opcode_info * opcode = rc_get_opcode_info(inst->U.I.Opcode);
628
629		for (i = 0; i < opcode->NumSrcRegs; ++i) {
630			if (inst->U.I.SrcReg[i].File == RC_FILE_TEMPORARY) {
631				if (inst->U.I.SrcReg[i].Index >= num_orig_temps)
632					num_orig_temps = inst->U.I.SrcReg[i].Index + 1;
633			}
634		}
635
636		if (opcode->HasDstReg) {
637			if (inst->U.I.DstReg.File == RC_FILE_TEMPORARY) {
638				if (inst->U.I.DstReg.Index >= num_orig_temps)
639					num_orig_temps = inst->U.I.DstReg.Index + 1;
640			}
641		}
642	}
643
644	ta = (struct temporary_allocation*)memory_pool_malloc(&compiler->Base.Pool,
645			sizeof(struct temporary_allocation) * num_orig_temps);
646	memset(ta, 0, sizeof(struct temporary_allocation) * num_orig_temps);
647
648	/* Pass 2: Determine original temporary lifetimes */
649	for(inst = compiler->Base.Program.Instructions.Next; inst != &compiler->Base.Program.Instructions; inst = inst->Next) {
650		const struct rc_opcode_info * opcode = rc_get_opcode_info(inst->U.I.Opcode);
651		/* Instructions inside of loops need to use the ENDLOOP
652		 * instruction as their LastRead. */
653		if (!end_loop && inst->U.I.Opcode == RC_OPCODE_BGNLOOP)
654			end_loop = rc_match_bgnloop(inst);
655
656		if (inst == end_loop) {
657			end_loop = NULL;
658			continue;
659		}
660
661		for (i = 0; i < opcode->NumSrcRegs; ++i) {
662			if (inst->U.I.SrcReg[i].File == RC_FILE_TEMPORARY) {
663				ta[inst->U.I.SrcReg[i].Index].LastRead = end_loop ? end_loop : inst;
664			}
665		}
666	}
667
668	/* Pass 3: Register allocation */
669	for(inst = compiler->Base.Program.Instructions.Next; inst != &compiler->Base.Program.Instructions; inst = inst->Next) {
670		const struct rc_opcode_info * opcode = rc_get_opcode_info(inst->U.I.Opcode);
671
672		for (i = 0; i < opcode->NumSrcRegs; ++i) {
673			if (inst->U.I.SrcReg[i].File == RC_FILE_TEMPORARY) {
674				unsigned int orig = inst->U.I.SrcReg[i].Index;
675				inst->U.I.SrcReg[i].Index = get_reg(c, ta, hwtemps, orig);
676
677				if (ta[orig].Allocated && inst == ta[orig].LastRead)
678					hwtemps[ta[orig].HwTemp] = false;
679			}
680		}
681
682		if (opcode->HasDstReg) {
683			if (inst->U.I.DstReg.File == RC_FILE_TEMPORARY) {
684				unsigned int orig = inst->U.I.DstReg.Index;
685				inst->U.I.DstReg.Index = get_reg(c, ta, hwtemps, orig);
686			}
687		}
688	}
689}
690
691/**
692 * R3xx-R4xx vertex engine does not support the Absolute source operand modifier
693 * and the Saturate opcode modifier. Only Absolute is currently transformed.
694 */
695static int transform_nonnative_modifiers(
696	struct radeon_compiler *c,
697	struct rc_instruction *inst,
698	void* unused)
699{
700	const struct rc_opcode_info *opcode = rc_get_opcode_info(inst->U.I.Opcode);
701	unsigned i;
702
703	/* Transform ABS(a) to MAX(a, -a). */
704	for (i = 0; i < opcode->NumSrcRegs; i++) {
705		if (inst->U.I.SrcReg[i].Abs) {
706			struct rc_instruction *new_inst;
707			unsigned temp;
708
709			inst->U.I.SrcReg[i].Abs = 0;
710
711			temp = rc_find_free_temporary(c);
712
713			new_inst = rc_insert_new_instruction(c, inst->Prev);
714			new_inst->U.I.Opcode = RC_OPCODE_MAX;
715			new_inst->U.I.DstReg.File = RC_FILE_TEMPORARY;
716			new_inst->U.I.DstReg.Index = temp;
717			new_inst->U.I.SrcReg[0] = inst->U.I.SrcReg[i];
718			new_inst->U.I.SrcReg[0].Swizzle = RC_SWIZZLE_XYZW;
719			new_inst->U.I.SrcReg[1] = inst->U.I.SrcReg[i];
720			new_inst->U.I.SrcReg[1].Swizzle = RC_SWIZZLE_XYZW;
721			new_inst->U.I.SrcReg[1].Negate ^= RC_MASK_XYZW;
722
723			inst->U.I.SrcReg[i].File = RC_FILE_TEMPORARY;
724			inst->U.I.SrcReg[i].Index = temp;
725			inst->U.I.SrcReg[i].RelAddr = 0;
726		}
727	}
728	return 1;
729}
730
731/**
732 * Vertex engine cannot read two inputs or two constants at the same time.
733 * Introduce intermediate MOVs to temporary registers to account for this.
734 */
735static int transform_source_conflicts(
736	struct radeon_compiler *c,
737	struct rc_instruction* inst,
738	void* unused)
739{
740	const struct rc_opcode_info * opcode = rc_get_opcode_info(inst->U.I.Opcode);
741
742	if (opcode->NumSrcRegs == 3) {
743		if (t_src_conflict(inst->U.I.SrcReg[1], inst->U.I.SrcReg[2])
744		    || t_src_conflict(inst->U.I.SrcReg[0], inst->U.I.SrcReg[2])) {
745			int tmpreg = rc_find_free_temporary(c);
746			struct rc_instruction * inst_mov = rc_insert_new_instruction(c, inst->Prev);
747			inst_mov->U.I.Opcode = RC_OPCODE_MOV;
748			inst_mov->U.I.DstReg.File = RC_FILE_TEMPORARY;
749			inst_mov->U.I.DstReg.Index = tmpreg;
750			inst_mov->U.I.SrcReg[0] = inst->U.I.SrcReg[2];
751			inst_mov->U.I.SrcReg[0].Swizzle = RC_SWIZZLE_XYZW;
752			inst_mov->U.I.SrcReg[0].Negate = 0;
753			inst_mov->U.I.SrcReg[0].Abs = 0;
754
755			inst->U.I.SrcReg[2].File = RC_FILE_TEMPORARY;
756			inst->U.I.SrcReg[2].Index = tmpreg;
757			inst->U.I.SrcReg[2].RelAddr = false;
758		}
759	}
760
761	if (opcode->NumSrcRegs >= 2) {
762		if (t_src_conflict(inst->U.I.SrcReg[1], inst->U.I.SrcReg[0])) {
763			int tmpreg = rc_find_free_temporary(c);
764			struct rc_instruction * inst_mov = rc_insert_new_instruction(c, inst->Prev);
765			inst_mov->U.I.Opcode = RC_OPCODE_MOV;
766			inst_mov->U.I.DstReg.File = RC_FILE_TEMPORARY;
767			inst_mov->U.I.DstReg.Index = tmpreg;
768			inst_mov->U.I.SrcReg[0] = inst->U.I.SrcReg[1];
769			inst_mov->U.I.SrcReg[0].Swizzle = RC_SWIZZLE_XYZW;
770			inst_mov->U.I.SrcReg[0].Negate = 0;
771			inst_mov->U.I.SrcReg[0].Abs = 0;
772
773			inst->U.I.SrcReg[1].File = RC_FILE_TEMPORARY;
774			inst->U.I.SrcReg[1].Index = tmpreg;
775			inst->U.I.SrcReg[1].RelAddr = false;
776		}
777	}
778
779	return 1;
780}
781
782static void rc_vs_add_artificial_outputs(struct radeon_compiler *c, void *user)
783{
784	struct r300_vertex_program_compiler * compiler = (struct r300_vertex_program_compiler*)c;
785	int i;
786
787	for(i = 0; i < 32; ++i) {
788		if ((compiler->RequiredOutputs & (1U << i)) &&
789		    !(compiler->Base.Program.OutputsWritten & (1U << i))) {
790			struct rc_instruction * inst = rc_insert_new_instruction(&compiler->Base, compiler->Base.Program.Instructions.Prev);
791			inst->U.I.Opcode = RC_OPCODE_MOV;
792
793			inst->U.I.DstReg.File = RC_FILE_OUTPUT;
794			inst->U.I.DstReg.Index = i;
795			inst->U.I.DstReg.WriteMask = RC_MASK_XYZW;
796
797			inst->U.I.SrcReg[0].File = RC_FILE_CONSTANT;
798			inst->U.I.SrcReg[0].Index = 0;
799			inst->U.I.SrcReg[0].Swizzle = RC_SWIZZLE_XYZW;
800
801			compiler->Base.Program.OutputsWritten |= 1U << i;
802		}
803	}
804}
805
806static int swizzle_is_native(rc_opcode opcode, struct rc_src_register reg)
807{
808	(void) opcode;
809	(void) reg;
810
811	return 1;
812}
813
814static void transform_negative_addressing(struct r300_vertex_program_compiler *c,
815					  struct rc_instruction *arl,
816					  struct rc_instruction *end,
817					  int min_offset)
818{
819	struct rc_instruction *inst, *add;
820	unsigned const_swizzle;
821
822	/* Transform ARL/ARR */
823	add = rc_insert_new_instruction(&c->Base, arl->Prev);
824	add->U.I.Opcode = RC_OPCODE_ADD;
825	add->U.I.DstReg.File = RC_FILE_TEMPORARY;
826	add->U.I.DstReg.Index = rc_find_free_temporary(&c->Base);
827	add->U.I.DstReg.WriteMask = RC_MASK_X;
828	add->U.I.SrcReg[0] = arl->U.I.SrcReg[0];
829	add->U.I.SrcReg[1].File = RC_FILE_CONSTANT;
830	add->U.I.SrcReg[1].Index = rc_constants_add_immediate_scalar(&c->Base.Program.Constants,
831								     min_offset, &const_swizzle);
832	add->U.I.SrcReg[1].Swizzle = const_swizzle;
833
834	arl->U.I.SrcReg[0].File = RC_FILE_TEMPORARY;
835	arl->U.I.SrcReg[0].Index = add->U.I.DstReg.Index;
836	arl->U.I.SrcReg[0].Swizzle = RC_SWIZZLE_XXXX;
837
838	/* Rewrite offsets up to and excluding inst. */
839	for (inst = arl->Next; inst != end; inst = inst->Next) {
840		const struct rc_opcode_info * opcode = rc_get_opcode_info(inst->U.I.Opcode);
841
842		for (unsigned i = 0; i < opcode->NumSrcRegs; i++)
843			if (inst->U.I.SrcReg[i].RelAddr)
844				inst->U.I.SrcReg[i].Index -= min_offset;
845	}
846}
847
848static void rc_emulate_negative_addressing(struct radeon_compiler *compiler, void *user)
849{
850	struct r300_vertex_program_compiler * c = (struct r300_vertex_program_compiler*)compiler;
851	struct rc_instruction *inst, *lastARL = NULL;
852	int min_offset = 0;
853
854	for (inst = c->Base.Program.Instructions.Next; inst != &c->Base.Program.Instructions; inst = inst->Next) {
855		const struct rc_opcode_info * opcode = rc_get_opcode_info(inst->U.I.Opcode);
856
857		if (inst->U.I.Opcode == RC_OPCODE_ARL || inst->U.I.Opcode == RC_OPCODE_ARR) {
858			if (lastARL != NULL && min_offset < 0)
859				transform_negative_addressing(c, lastARL, inst, min_offset);
860
861			lastARL = inst;
862			min_offset = 0;
863			continue;
864		}
865
866		for (unsigned i = 0; i < opcode->NumSrcRegs; i++) {
867			if (inst->U.I.SrcReg[i].RelAddr &&
868			    inst->U.I.SrcReg[i].Index < 0) {
869				/* ARL must precede any indirect addressing. */
870				if (!lastARL) {
871					rc_error(&c->Base, "Vertex shader: Found relative addressing without ARL/ARR.");
872					return;
873				}
874
875				if (inst->U.I.SrcReg[i].Index < min_offset)
876					min_offset = inst->U.I.SrcReg[i].Index;
877			}
878		}
879	}
880
881	if (lastARL != NULL && min_offset < 0)
882		transform_negative_addressing(c, lastARL, inst, min_offset);
883}
884
885const struct rc_swizzle_caps r300_vertprog_swizzle_caps = {
886	.IsNative = &swizzle_is_native,
887	.Split = NULL /* should never be called */
888};
889
890void r3xx_compile_vertex_program(struct r300_vertex_program_compiler *c)
891{
892	int is_r500 = c->Base.is_r500;
893	int opt = !c->Base.disable_optimizations;
894
895	/* Lists of instruction transformations. */
896	struct radeon_program_transformation alu_rewrite_r500[] = {
897		{ &r300_transform_vertex_alu, NULL },
898		{ &r300_transform_trig_scale_vertex, NULL },
899		{ NULL, NULL }
900	};
901
902	struct radeon_program_transformation alu_rewrite_r300[] = {
903		{ &r300_transform_vertex_alu, NULL },
904		{ &r300_transform_trig_simple, NULL },
905		{ NULL, NULL }
906	};
907
908	/* Note: These passes have to be done seperately from ALU rewrite,
909	 * otherwise non-native ALU instructions with source conflits
910	 * or non-native modifiers will not be treated properly.
911	 */
912	struct radeon_program_transformation emulate_modifiers[] = {
913		{ &transform_nonnative_modifiers, NULL },
914		{ NULL, NULL }
915	};
916
917	struct radeon_program_transformation resolve_src_conflicts[] = {
918		{ &transform_source_conflicts, NULL },
919		{ NULL, NULL }
920	};
921
922	/* List of compiler passes. */
923	struct radeon_compiler_pass vs_list[] = {
924		/* NAME				DUMP PREDICATE	FUNCTION			PARAM */
925		{"add artificial outputs",	0, 1,		rc_vs_add_artificial_outputs,	NULL},
926		{"emulate branches",		1, !is_r500,	rc_emulate_branches,		NULL},
927		{"emulate negative addressing", 1, 1,		rc_emulate_negative_addressing,	NULL},
928		{"native rewrite",		1, is_r500,	rc_local_transform,		alu_rewrite_r500},
929		{"native rewrite",		1, !is_r500,	rc_local_transform,		alu_rewrite_r300},
930		{"emulate modifiers",		1, !is_r500,	rc_local_transform,		emulate_modifiers},
931		{"deadcode",			1, opt,		rc_dataflow_deadcode,		NULL},
932		{"dataflow optimize",		1, opt,		rc_optimize,			NULL},
933		/* This pass must be done after optimizations. */
934		{"source conflict resolve",	1, 1,		rc_local_transform,		resolve_src_conflicts},
935		{"register allocation",		1, opt,		allocate_temporary_registers,	NULL},
936		{"dead constants",		1, 1,		rc_remove_unused_constants,	&c->code->constants_remap_table},
937		{"lower control flow opcodes",	1, is_r500,	rc_vert_fc,			NULL},
938		{"final code validation",	0, 1,		rc_validate_final_shader,	NULL},
939		{"machine code generation",	0, 1,		translate_vertex_program,	NULL},
940		{"dump machine code",		0, c->Base.Debug & RC_DBG_LOG, r300_vertex_program_dump,	NULL},
941		{NULL, 0, 0, NULL, NULL}
942	};
943
944	c->Base.type = RC_VERTEX_PROGRAM;
945	c->Base.SwizzleCaps = &r300_vertprog_swizzle_caps;
946
947	rc_run_compiler(&c->Base, vs_list);
948
949	c->code->InputsRead = c->Base.Program.InputsRead;
950	c->code->OutputsWritten = c->Base.Program.OutputsWritten;
951	rc_constants_copy(&c->code->constants, &c->Base.Program.Constants);
952}
953