/*
 * Copyright (C) 2009 Nicolai Haehnle.
 * Copyright 2010 Tom Stellard <tstellar@gmail.com>
 *
 * All Rights Reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining
 * a copy of this software and associated documentation files (the
 * "Software"), to deal in the Software without restriction, including
 * without limitation the rights to use, copy, modify, merge, publish,
 * distribute, sublicense, and/or sell copies of the Software, and to
 * permit persons to whom the Software is furnished to do so, subject to
 * the following conditions:
 *
 * The above copyright notice and this permission notice (including the
 * next paragraph) shall be included in all copies or substantial
 * portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
 * IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
 * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
 * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
 * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 *
 */
28
29#include "util/u_math.h"
30
31#include "radeon_dataflow.h"
32
33#include "radeon_compiler.h"
34#include "radeon_compiler_util.h"
35#include "radeon_list.h"
36#include "radeon_swizzle.h"
37#include "radeon_variable.h"
38
39struct src_clobbered_reads_cb_data {
40	rc_register_file File;
41	unsigned int Index;
42	unsigned int Mask;
43	struct rc_reader_data * ReaderData;
44};
45
/* Callback invoked by presub_helper() for each reader source that is to be
 * rewritten.  Arguments, in order: the ADD instruction being replaced, the
 * instruction reading its result, and the index of the reading source. */
typedef void (*rc_presub_replace_fn)(struct rc_instruction *inst_add,
				     struct rc_instruction *inst_reader,
				     unsigned int src_index);
49
50static struct rc_src_register chain_srcregs(struct rc_src_register outer, struct rc_src_register inner)
51{
52	struct rc_src_register combine;
53	combine.File = inner.File;
54	combine.Index = inner.Index;
55	combine.RelAddr = inner.RelAddr;
56	if (outer.Abs) {
57		combine.Abs = 1;
58		combine.Negate = outer.Negate;
59	} else {
60		combine.Abs = inner.Abs;
61		combine.Negate = swizzle_mask(outer.Swizzle, inner.Negate);
62		combine.Negate ^= outer.Negate;
63	}
64	combine.Swizzle = combine_swizzles(inner.Swizzle, outer.Swizzle);
65	return combine;
66}
67
68static void copy_propagate_scan_read(void * data, struct rc_instruction * inst,
69						struct rc_src_register * src)
70{
71	rc_register_file file = src->File;
72	struct rc_reader_data * reader_data = data;
73
74	if(!rc_inst_can_use_presub(inst,
75				reader_data->Writer->U.I.PreSub.Opcode,
76				rc_swizzle_to_writemask(src->Swizzle),
77				src,
78				&reader_data->Writer->U.I.PreSub.SrcReg[0],
79				&reader_data->Writer->U.I.PreSub.SrcReg[1])) {
80		reader_data->Abort = 1;
81		return;
82	}
83
84	/* XXX This could probably be handled better. */
85	if (file == RC_FILE_ADDRESS) {
86		reader_data->Abort = 1;
87		return;
88	}
89
90	/* These instructions cannot read from the constants file.
91	 * see radeonTransformTEX()
92	 */
93	if(reader_data->Writer->U.I.SrcReg[0].File != RC_FILE_TEMPORARY &&
94			reader_data->Writer->U.I.SrcReg[0].File != RC_FILE_INPUT &&
95				(inst->U.I.Opcode == RC_OPCODE_TEX ||
96				inst->U.I.Opcode == RC_OPCODE_TXB ||
97				inst->U.I.Opcode == RC_OPCODE_TXP ||
98				inst->U.I.Opcode == RC_OPCODE_TXD ||
99				inst->U.I.Opcode == RC_OPCODE_TXL ||
100				inst->U.I.Opcode == RC_OPCODE_KIL)){
101		reader_data->Abort = 1;
102		return;
103	}
104}
105
106static void src_clobbered_reads_cb(
107	void * data,
108	struct rc_instruction * inst,
109	struct rc_src_register * src)
110{
111	struct src_clobbered_reads_cb_data * sc_data = data;
112
113	if (src->File == sc_data->File
114	    && src->Index == sc_data->Index
115	    && (rc_swizzle_to_writemask(src->Swizzle) & sc_data->Mask)) {
116
117		sc_data->ReaderData->AbortOnRead = RC_MASK_XYZW;
118	}
119
120	if (src->RelAddr && sc_data->File == RC_FILE_ADDRESS) {
121		sc_data->ReaderData->AbortOnRead = RC_MASK_XYZW;
122	}
123}
124
125static void is_src_clobbered_scan_write(
126	void * data,
127	struct rc_instruction * inst,
128	rc_register_file file,
129	unsigned int index,
130	unsigned int mask)
131{
132	struct src_clobbered_reads_cb_data sc_data;
133	struct rc_reader_data * reader_data = data;
134	sc_data.File = file;
135	sc_data.Index = index;
136	sc_data.Mask = mask;
137	sc_data.ReaderData = reader_data;
138	rc_for_all_reads_src(reader_data->Writer,
139					src_clobbered_reads_cb, &sc_data);
140}
141
142static void copy_propagate(struct radeon_compiler * c, struct rc_instruction * inst_mov)
143{
144	struct rc_reader_data reader_data;
145	unsigned int i;
146
147	if (inst_mov->U.I.DstReg.File != RC_FILE_TEMPORARY ||
148	    inst_mov->U.I.WriteALUResult)
149		return;
150
151	/* Get a list of all the readers of this MOV instruction. */
152	reader_data.ExitOnAbort = 1;
153	rc_get_readers(c, inst_mov, &reader_data,
154		       copy_propagate_scan_read, NULL,
155		       is_src_clobbered_scan_write);
156
157	if (reader_data.Abort || reader_data.ReaderCount == 0 || reader_data.ReadersAfterEndloop)
158		return;
159
160	/* We can propagate SaturateMode if all the readers are MOV instructions
161	 * without a presubtract operation, source negation and absolute.
162	 * In that case, we just move SaturateMode to all readers. */
163        if (inst_mov->U.I.SaturateMode) {
164		for (i = 0; i < reader_data.ReaderCount; i++) {
165			struct rc_instruction * inst = reader_data.Readers[i].Inst;
166
167			if (inst->U.I.Opcode != RC_OPCODE_MOV ||
168			    inst->U.I.SrcReg[0].File == RC_FILE_PRESUB ||
169			    inst->U.I.SrcReg[0].Abs ||
170			    inst->U.I.SrcReg[0].Negate) {
171				return;
172			}
173		}
174	}
175
176	/* Propagate the MOV instruction. */
177	for (i = 0; i < reader_data.ReaderCount; i++) {
178		struct rc_instruction * inst = reader_data.Readers[i].Inst;
179		*reader_data.Readers[i].U.I.Src = chain_srcregs(*reader_data.Readers[i].U.I.Src, inst_mov->U.I.SrcReg[0]);
180
181		if (inst_mov->U.I.SrcReg[0].File == RC_FILE_PRESUB)
182			inst->U.I.PreSub = inst_mov->U.I.PreSub;
183		if (!inst->U.I.SaturateMode)
184			inst->U.I.SaturateMode = inst_mov->U.I.SaturateMode;
185	}
186
187	/* Finally, remove the original MOV instruction */
188	rc_remove_instruction(inst_mov);
189}
190
191/**
192 * Check if a source register is actually always the same
193 * swizzle constant.
194 */
195static int is_src_uniform_constant(struct rc_src_register src,
196		rc_swizzle * pswz, unsigned int * pnegate)
197{
198	int have_used = 0;
199
200	if (src.File != RC_FILE_NONE) {
201		*pswz = 0;
202		return 0;
203	}
204
205	for(unsigned int chan = 0; chan < 4; ++chan) {
206		unsigned int swz = GET_SWZ(src.Swizzle, chan);
207		if (swz < 4) {
208			*pswz = 0;
209			return 0;
210		}
211		if (swz == RC_SWIZZLE_UNUSED)
212			continue;
213
214		if (!have_used) {
215			*pswz = swz;
216			*pnegate = GET_BIT(src.Negate, chan);
217			have_used = 1;
218		} else {
219			if (swz != *pswz || *pnegate != GET_BIT(src.Negate, chan)) {
220				*pswz = 0;
221				return 0;
222			}
223		}
224	}
225
226	return 1;
227}
228
229static void constant_folding_mad(struct rc_instruction * inst)
230{
231	rc_swizzle swz = 0;
232	unsigned int negate= 0;
233
234	if (is_src_uniform_constant(inst->U.I.SrcReg[2], &swz, &negate)) {
235		if (swz == RC_SWIZZLE_ZERO) {
236			inst->U.I.Opcode = RC_OPCODE_MUL;
237			return;
238		}
239	}
240
241	if (is_src_uniform_constant(inst->U.I.SrcReg[1], &swz, &negate)) {
242		if (swz == RC_SWIZZLE_ONE) {
243			inst->U.I.Opcode = RC_OPCODE_ADD;
244			if (negate)
245				inst->U.I.SrcReg[0].Negate ^= RC_MASK_XYZW;
246			inst->U.I.SrcReg[1] = inst->U.I.SrcReg[2];
247			return;
248		} else if (swz == RC_SWIZZLE_ZERO) {
249			inst->U.I.Opcode = RC_OPCODE_MOV;
250			inst->U.I.SrcReg[0] = inst->U.I.SrcReg[2];
251			return;
252		}
253	}
254
255	if (is_src_uniform_constant(inst->U.I.SrcReg[0], &swz, &negate)) {
256		if (swz == RC_SWIZZLE_ONE) {
257			inst->U.I.Opcode = RC_OPCODE_ADD;
258			if (negate)
259				inst->U.I.SrcReg[1].Negate ^= RC_MASK_XYZW;
260			inst->U.I.SrcReg[0] = inst->U.I.SrcReg[2];
261			return;
262		} else if (swz == RC_SWIZZLE_ZERO) {
263			inst->U.I.Opcode = RC_OPCODE_MOV;
264			inst->U.I.SrcReg[0] = inst->U.I.SrcReg[2];
265			return;
266		}
267	}
268}
269
270static void constant_folding_mul(struct rc_instruction * inst)
271{
272	rc_swizzle swz = 0;
273	unsigned int negate = 0;
274
275	if (is_src_uniform_constant(inst->U.I.SrcReg[0], &swz, &negate)) {
276		if (swz == RC_SWIZZLE_ONE) {
277			inst->U.I.Opcode = RC_OPCODE_MOV;
278			inst->U.I.SrcReg[0] = inst->U.I.SrcReg[1];
279			if (negate)
280				inst->U.I.SrcReg[0].Negate ^= RC_MASK_XYZW;
281			return;
282		} else if (swz == RC_SWIZZLE_ZERO) {
283			inst->U.I.Opcode = RC_OPCODE_MOV;
284			inst->U.I.SrcReg[0].Swizzle = RC_SWIZZLE_0000;
285			return;
286		}
287	}
288
289	if (is_src_uniform_constant(inst->U.I.SrcReg[1], &swz, &negate)) {
290		if (swz == RC_SWIZZLE_ONE) {
291			inst->U.I.Opcode = RC_OPCODE_MOV;
292			if (negate)
293				inst->U.I.SrcReg[0].Negate ^= RC_MASK_XYZW;
294			return;
295		} else if (swz == RC_SWIZZLE_ZERO) {
296			inst->U.I.Opcode = RC_OPCODE_MOV;
297			inst->U.I.SrcReg[0].Swizzle = RC_SWIZZLE_0000;
298			return;
299		}
300	}
301}
302
303static void constant_folding_add(struct rc_instruction * inst)
304{
305	rc_swizzle swz = 0;
306	unsigned int negate = 0;
307
308	if (is_src_uniform_constant(inst->U.I.SrcReg[0], &swz, &negate)) {
309		if (swz == RC_SWIZZLE_ZERO) {
310			inst->U.I.Opcode = RC_OPCODE_MOV;
311			inst->U.I.SrcReg[0] = inst->U.I.SrcReg[1];
312			return;
313		}
314	}
315
316	if (is_src_uniform_constant(inst->U.I.SrcReg[1], &swz, &negate)) {
317		if (swz == RC_SWIZZLE_ZERO) {
318			inst->U.I.Opcode = RC_OPCODE_MOV;
319			return;
320		}
321	}
322}
323
324/**
325 * Replace 0.0, 1.0 and 0.5 immediate constants by their
326 * respective swizzles. Simplify instructions like ADD dst, src, 0;
327 */
328static void constant_folding(struct radeon_compiler * c, struct rc_instruction * inst)
329{
330	const struct rc_opcode_info * opcode = rc_get_opcode_info(inst->U.I.Opcode);
331	unsigned int i;
332
333	/* Replace 0.0, 1.0 and 0.5 immediates by their explicit swizzles */
334	for(unsigned int src = 0; src < opcode->NumSrcRegs; ++src) {
335		struct rc_constant * constant;
336		struct rc_src_register newsrc;
337		int have_real_reference;
338		unsigned int chan;
339
340		/* If there are only 0, 0.5, 1, or _ swizzles, mark the source as a constant. */
341		for (chan = 0; chan < 4; ++chan)
342			if (GET_SWZ(inst->U.I.SrcReg[src].Swizzle, chan) <= 3)
343				break;
344		if (chan == 4) {
345			inst->U.I.SrcReg[src].File = RC_FILE_NONE;
346			continue;
347		}
348
349		/* Convert immediates to swizzles. */
350		if (inst->U.I.SrcReg[src].File != RC_FILE_CONSTANT ||
351		    inst->U.I.SrcReg[src].RelAddr ||
352		    inst->U.I.SrcReg[src].Index >= c->Program.Constants.Count)
353			continue;
354
355		constant =
356			&c->Program.Constants.Constants[inst->U.I.SrcReg[src].Index];
357
358		if (constant->Type != RC_CONSTANT_IMMEDIATE)
359			continue;
360
361		newsrc = inst->U.I.SrcReg[src];
362		have_real_reference = 0;
363		for (chan = 0; chan < 4; ++chan) {
364			unsigned int swz = GET_SWZ(newsrc.Swizzle, chan);
365			unsigned int newswz;
366			float imm;
367			float baseimm;
368
369			if (swz >= 4)
370				continue;
371
372			imm = constant->u.Immediate[swz];
373			baseimm = imm;
374			if (imm < 0.0)
375				baseimm = -baseimm;
376
377			if (baseimm == 0.0) {
378				newswz = RC_SWIZZLE_ZERO;
379			} else if (baseimm == 1.0) {
380				newswz = RC_SWIZZLE_ONE;
381			} else if (baseimm == 0.5 && c->has_half_swizzles) {
382				newswz = RC_SWIZZLE_HALF;
383			} else {
384				have_real_reference = 1;
385				continue;
386			}
387
388			SET_SWZ(newsrc.Swizzle, chan, newswz);
389			if (imm < 0.0 && !newsrc.Abs)
390				newsrc.Negate ^= 1 << chan;
391		}
392
393		if (!have_real_reference) {
394			newsrc.File = RC_FILE_NONE;
395			newsrc.Index = 0;
396		}
397
398		/* don't make the swizzle worse */
399		if (!c->SwizzleCaps->IsNative(inst->U.I.Opcode, newsrc))
400			continue;
401
402		inst->U.I.SrcReg[src] = newsrc;
403	}
404
405	/* Simplify instructions based on constants */
406	if (inst->U.I.Opcode == RC_OPCODE_MAD)
407		constant_folding_mad(inst);
408
409	/* note: MAD can simplify to MUL or ADD */
410	if (inst->U.I.Opcode == RC_OPCODE_MUL)
411		constant_folding_mul(inst);
412	else if (inst->U.I.Opcode == RC_OPCODE_ADD)
413		constant_folding_add(inst);
414
415	/* In case this instruction has been converted, make sure all of the
416	 * registers that are no longer used are empty. */
417	opcode = rc_get_opcode_info(inst->U.I.Opcode);
418	for(i = opcode->NumSrcRegs; i < 3; i++) {
419		memset(&inst->U.I.SrcReg[i], 0, sizeof(struct rc_src_register));
420	}
421}
422
423/**
424 * If src and dst use the same register, this function returns a writemask that
425 * indicates which components are read by src.  Otherwise zero is returned.
426 */
427static unsigned int src_reads_dst_mask(struct rc_src_register src,
428						struct rc_dst_register dst)
429{
430	if (dst.File != src.File || dst.Index != src.Index) {
431		return 0;
432	}
433	return rc_swizzle_to_writemask(src.Swizzle);
434}
435
436/* Return 1 if the source registers has a constant swizzle (e.g. 0, 0.5, 1.0)
437 * in any of its channels.  Return 0 otherwise. */
438static int src_has_const_swz(struct rc_src_register src) {
439	int chan;
440	for(chan = 0; chan < 4; chan++) {
441		unsigned int swz = GET_SWZ(src.Swizzle, chan);
442		if (swz == RC_SWIZZLE_ZERO || swz == RC_SWIZZLE_HALF
443						|| swz == RC_SWIZZLE_ONE) {
444			return 1;
445		}
446	}
447	return 0;
448}
449
450static void presub_scan_read(
451	void * data,
452	struct rc_instruction * inst,
453	struct rc_src_register * src)
454{
455	struct rc_reader_data * reader_data = data;
456	rc_presubtract_op * presub_opcode = reader_data->CbData;
457
458	if (!rc_inst_can_use_presub(inst, *presub_opcode,
459			reader_data->Writer->U.I.DstReg.WriteMask,
460			src,
461			&reader_data->Writer->U.I.SrcReg[0],
462			&reader_data->Writer->U.I.SrcReg[1])) {
463		reader_data->Abort = 1;
464		return;
465	}
466}
467
468static int presub_helper(
469	struct radeon_compiler * c,
470	struct rc_instruction * inst_add,
471	rc_presubtract_op presub_opcode,
472	rc_presub_replace_fn presub_replace)
473{
474	struct rc_reader_data reader_data;
475	unsigned int i;
476	rc_presubtract_op cb_op = presub_opcode;
477
478	reader_data.CbData = &cb_op;
479	reader_data.ExitOnAbort = 1;
480	rc_get_readers(c, inst_add, &reader_data, presub_scan_read, NULL,
481						is_src_clobbered_scan_write);
482
483	if (reader_data.Abort || reader_data.ReaderCount == 0)
484		return 0;
485
486	for(i = 0; i < reader_data.ReaderCount; i++) {
487		unsigned int src_index;
488		struct rc_reader reader = reader_data.Readers[i];
489		const struct rc_opcode_info * info =
490				rc_get_opcode_info(reader.Inst->U.I.Opcode);
491
492		for (src_index = 0; src_index < info->NumSrcRegs; src_index++) {
493			if (&reader.Inst->U.I.SrcReg[src_index] == reader.U.I.Src)
494				presub_replace(inst_add, reader.Inst, src_index);
495		}
496	}
497	return 1;
498}
499
500static void presub_replace_add(
501	struct rc_instruction * inst_add,
502	struct rc_instruction * inst_reader,
503	unsigned int src_index)
504{
505	rc_presubtract_op presub_opcode;
506
507	/* This function assumes that inst_add->U.I.SrcReg[0] and
508	 * inst_add->U.I.SrcReg[1] aren't both negative.
509	 */
510	assert(!(inst_add->U.I.SrcReg[1].Negate && inst_add->U.I.SrcReg[0].Negate));
511
512	if (inst_add->U.I.SrcReg[1].Negate || inst_add->U.I.SrcReg[0].Negate)
513		presub_opcode = RC_PRESUB_SUB;
514	else
515		presub_opcode = RC_PRESUB_ADD;
516
517	if (inst_add->U.I.SrcReg[1].Negate) {
518		inst_reader->U.I.PreSub.SrcReg[0] = inst_add->U.I.SrcReg[1];
519		inst_reader->U.I.PreSub.SrcReg[1] = inst_add->U.I.SrcReg[0];
520	} else {
521		inst_reader->U.I.PreSub.SrcReg[0] = inst_add->U.I.SrcReg[0];
522		inst_reader->U.I.PreSub.SrcReg[1] = inst_add->U.I.SrcReg[1];
523	}
524	inst_reader->U.I.PreSub.SrcReg[0].Negate = 0;
525	inst_reader->U.I.PreSub.SrcReg[1].Negate = 0;
526	inst_reader->U.I.PreSub.Opcode = presub_opcode;
527	inst_reader->U.I.SrcReg[src_index] =
528			chain_srcregs(inst_reader->U.I.SrcReg[src_index],
529					inst_reader->U.I.PreSub.SrcReg[0]);
530	inst_reader->U.I.SrcReg[src_index].File = RC_FILE_PRESUB;
531	inst_reader->U.I.SrcReg[src_index].Index = presub_opcode;
532}
533
534static int is_presub_candidate(
535	struct radeon_compiler * c,
536	struct rc_instruction * inst)
537{
538	const struct rc_opcode_info * info = rc_get_opcode_info(inst->U.I.Opcode);
539	unsigned int i;
540	unsigned int is_constant[2] = {0, 0};
541
542	assert(inst->U.I.Opcode == RC_OPCODE_ADD);
543
544	if (inst->U.I.PreSub.Opcode != RC_PRESUB_NONE
545			|| inst->U.I.SaturateMode
546			|| inst->U.I.WriteALUResult
547			|| inst->U.I.Omod) {
548		return 0;
549	}
550
551	/* If both sources use a constant swizzle, then we can't convert it to
552	 * a presubtract operation.  In fact for the ADD and SUB presubtract
553	 * operations neither source can contain a constant swizzle.  This
554	 * specific case is checked in peephole_add_presub_add() when
555	 * we make sure the swizzles for both sources are equal, so we
556	 * don't need to worry about it here. */
557	for (i = 0; i < 2; i++) {
558		int chan;
559		for (chan = 0; chan < 4; chan++) {
560			rc_swizzle swz =
561				get_swz(inst->U.I.SrcReg[i].Swizzle, chan);
562			if (swz == RC_SWIZZLE_ONE
563					|| swz == RC_SWIZZLE_ZERO
564					|| swz == RC_SWIZZLE_HALF) {
565				is_constant[i] = 1;
566			}
567		}
568	}
569	if (is_constant[0] && is_constant[1])
570		return 0;
571
572	for(i = 0; i < info->NumSrcRegs; i++) {
573		struct rc_src_register src = inst->U.I.SrcReg[i];
574		if (src_reads_dst_mask(src, inst->U.I.DstReg))
575			return 0;
576
577		src.File = RC_FILE_PRESUB;
578		if (!c->SwizzleCaps->IsNative(inst->U.I.Opcode, src))
579			return 0;
580	}
581	return 1;
582}
583
584static int peephole_add_presub_add(
585	struct radeon_compiler * c,
586	struct rc_instruction * inst_add)
587{
588	unsigned dstmask = inst_add->U.I.DstReg.WriteMask;
589        unsigned src0_neg = inst_add->U.I.SrcReg[0].Negate & dstmask;
590        unsigned src1_neg = inst_add->U.I.SrcReg[1].Negate & dstmask;
591
592	if (inst_add->U.I.SrcReg[0].Swizzle != inst_add->U.I.SrcReg[1].Swizzle)
593		return 0;
594
595	/* src0 and src1 can't have absolute values */
596	if (inst_add->U.I.SrcReg[0].Abs || inst_add->U.I.SrcReg[1].Abs)
597	        return 0;
598
599	/* presub_replace_add() assumes only one is negative */
600	if (inst_add->U.I.SrcReg[0].Negate && inst_add->U.I.SrcReg[1].Negate)
601	        return 0;
602
603        /* if src0 is negative, at least all bits of dstmask have to be set */
604        if (inst_add->U.I.SrcReg[0].Negate && src0_neg != dstmask)
605	        return 0;
606
607        /* if src1 is negative, at least all bits of dstmask have to be set */
608        if (inst_add->U.I.SrcReg[1].Negate && src1_neg != dstmask)
609	        return 0;
610
611	if (!is_presub_candidate(c, inst_add))
612		return 0;
613
614	if (presub_helper(c, inst_add, RC_PRESUB_ADD, presub_replace_add)) {
615		rc_remove_instruction(inst_add);
616		return 1;
617	}
618	return 0;
619}
620
621static void presub_replace_inv(
622	struct rc_instruction * inst_add,
623	struct rc_instruction * inst_reader,
624	unsigned int src_index)
625{
626	/* We must be careful not to modify inst_add, since it
627	 * is possible it will remain part of the program.*/
628	inst_reader->U.I.PreSub.SrcReg[0] = inst_add->U.I.SrcReg[1];
629	inst_reader->U.I.PreSub.SrcReg[0].Negate = 0;
630	inst_reader->U.I.PreSub.Opcode = RC_PRESUB_INV;
631	inst_reader->U.I.SrcReg[src_index] = chain_srcregs(inst_reader->U.I.SrcReg[src_index],
632						inst_reader->U.I.PreSub.SrcReg[0]);
633
634	inst_reader->U.I.SrcReg[src_index].File = RC_FILE_PRESUB;
635	inst_reader->U.I.SrcReg[src_index].Index = RC_PRESUB_INV;
636}
637
638/**
639 * PRESUB_INV: ADD TEMP[0], none.1, -TEMP[1]
640 * Use the presubtract 1 - src0 for all readers of TEMP[0].  The first source
641 * of the add instruction must have the constatnt 1 swizzle.  This function
642 * does not check const registers to see if their value is 1.0, so it should
643 * be called after the constant_folding optimization.
644 * @return
645 * 	0 if the ADD instruction is still part of the program.
646 * 	1 if the ADD instruction is no longer part of the program.
647 */
648static int peephole_add_presub_inv(
649	struct radeon_compiler * c,
650	struct rc_instruction * inst_add)
651{
652	unsigned int i, swz;
653
654	if (!is_presub_candidate(c, inst_add))
655		return 0;
656
657	/* Check if src0 is 1. */
658	/* XXX It would be nice to use is_src_uniform_constant here, but that
659	 * function only works if the register's file is RC_FILE_NONE */
660	for(i = 0; i < 4; i++ ) {
661		if (!(inst_add->U.I.DstReg.WriteMask & (1 << i)))
662			continue;
663
664		swz = GET_SWZ(inst_add->U.I.SrcReg[0].Swizzle, i);
665		if (swz != RC_SWIZZLE_ONE || inst_add->U.I.SrcReg[0].Negate & (1 << i))
666			return 0;
667	}
668
669	/* Check src1. */
670	if ((inst_add->U.I.SrcReg[1].Negate & inst_add->U.I.DstReg.WriteMask) !=
671						inst_add->U.I.DstReg.WriteMask
672		|| inst_add->U.I.SrcReg[1].Abs
673		|| (inst_add->U.I.SrcReg[1].File != RC_FILE_TEMPORARY
674			&& inst_add->U.I.SrcReg[1].File != RC_FILE_CONSTANT)
675		|| src_has_const_swz(inst_add->U.I.SrcReg[1])) {
676
677		return 0;
678	}
679
680	if (presub_helper(c, inst_add, RC_PRESUB_INV, presub_replace_inv)) {
681		rc_remove_instruction(inst_add);
682		return 1;
683	}
684	return 0;
685}
686
/* Context for the omod_filter_*_cb() callbacks used by peephole_mul_omod(). */
struct peephole_mul_cb_data {
	struct rc_dst_register *Writer;	/* destination register to watch */
	unsigned int Clobbered;		/* set to 1 once that register is read or written */
};
691
692static void omod_filter_reader_cb(
693	void * userdata,
694	struct rc_instruction * inst,
695	rc_register_file file,
696	unsigned int index,
697	unsigned int mask)
698{
699	struct peephole_mul_cb_data * d = userdata;
700	if (rc_src_reads_dst_mask(file, mask, index,
701		d->Writer->File, d->Writer->Index, d->Writer->WriteMask)) {
702
703		d->Clobbered = 1;
704	}
705}
706
707static void omod_filter_writer_cb(
708	void * userdata,
709	struct rc_instruction * inst,
710	rc_register_file file,
711	unsigned int index,
712	unsigned int mask)
713{
714	struct peephole_mul_cb_data * d = userdata;
715	if (file == d->Writer->File && index == d->Writer->Index &&
716					(mask & d->Writer->WriteMask)) {
717		d->Clobbered = 1;
718	}
719}
720
721static int peephole_mul_omod(
722	struct radeon_compiler * c,
723	struct rc_instruction * inst_mul,
724	struct rc_list * var_list)
725{
726	unsigned int chan = 0, swz, i;
727	int const_index = -1;
728	int temp_index = -1;
729	float const_value;
730	rc_omod_op omod_op = RC_OMOD_DISABLE;
731	struct rc_list * writer_list;
732	struct rc_variable * var;
733	struct peephole_mul_cb_data cb_data;
734	unsigned writemask_sum;
735
736	for (i = 0; i < 2; i++) {
737		unsigned int j;
738		if (inst_mul->U.I.SrcReg[i].File != RC_FILE_CONSTANT
739			&& inst_mul->U.I.SrcReg[i].File != RC_FILE_TEMPORARY) {
740			return 0;
741		}
742		if (inst_mul->U.I.SrcReg[i].File == RC_FILE_TEMPORARY) {
743			if (temp_index != -1) {
744				/* The instruction has two temp sources */
745				return 0;
746			} else {
747				temp_index = i;
748				continue;
749			}
750		}
751		/* If we get this far Src[i] must be a constant src */
752		if (inst_mul->U.I.SrcReg[i].Negate) {
753			return 0;
754		}
755		/* The constant src needs to read from the same swizzle */
756		swz = RC_SWIZZLE_UNUSED;
757		chan = 0;
758		for (j = 0; j < 4; j++) {
759			unsigned int j_swz =
760				GET_SWZ(inst_mul->U.I.SrcReg[i].Swizzle, j);
761			if (j_swz == RC_SWIZZLE_UNUSED) {
762				continue;
763			}
764			if (swz == RC_SWIZZLE_UNUSED) {
765				swz = j_swz;
766				chan = j;
767			} else if (j_swz != swz) {
768				return 0;
769			}
770		}
771
772		if (const_index != -1) {
773			/* The instruction has two constant sources */
774			return 0;
775		} else {
776			const_index = i;
777		}
778	}
779
780	if (!rc_src_reg_is_immediate(c, inst_mul->U.I.SrcReg[const_index].File,
781				inst_mul->U.I.SrcReg[const_index].Index)) {
782		return 0;
783	}
784	const_value = rc_get_constant_value(c,
785			inst_mul->U.I.SrcReg[const_index].Index,
786			inst_mul->U.I.SrcReg[const_index].Swizzle,
787			inst_mul->U.I.SrcReg[const_index].Negate,
788			chan);
789
790	if (const_value == 2.0f) {
791		omod_op = RC_OMOD_MUL_2;
792	} else if (const_value == 4.0f) {
793		omod_op = RC_OMOD_MUL_4;
794	} else if (const_value == 8.0f) {
795		omod_op = RC_OMOD_MUL_8;
796	} else if (const_value == (1.0f / 2.0f)) {
797		omod_op = RC_OMOD_DIV_2;
798	} else if (const_value == (1.0f / 4.0f)) {
799		omod_op = RC_OMOD_DIV_4;
800	} else if (const_value == (1.0f / 8.0f)) {
801		omod_op = RC_OMOD_DIV_8;
802	} else {
803		return 0;
804	}
805
806	writer_list = rc_variable_list_get_writers_one_reader(var_list,
807		RC_INSTRUCTION_NORMAL, &inst_mul->U.I.SrcReg[temp_index]);
808
809	if (!writer_list) {
810		return 0;
811	}
812
813	cb_data.Clobbered = 0;
814	cb_data.Writer = &inst_mul->U.I.DstReg;
815	for (var = writer_list->Item; var; var = var->Friend) {
816		struct rc_instruction * inst;
817		const struct rc_opcode_info * info = rc_get_opcode_info(
818				var->Inst->U.I.Opcode);
819		if (info->HasTexture) {
820			return 0;
821		}
822		if (var->Inst->U.I.SaturateMode != RC_SATURATE_NONE) {
823			return 0;
824		}
825		for (inst = inst_mul->Prev; inst != var->Inst;
826							inst = inst->Prev) {
827			rc_for_all_reads_mask(inst, omod_filter_reader_cb,
828								&cb_data);
829			rc_for_all_writes_mask(inst, omod_filter_writer_cb,
830								&cb_data);
831			if (cb_data.Clobbered) {
832				break;
833			}
834		}
835	}
836
837	if (cb_data.Clobbered) {
838		return 0;
839	}
840
841	writemask_sum = rc_variable_writemask_sum(writer_list->Item);
842
843	/* rc_normal_rewrite_writemask can't expand a previous writemask to store
844	 * more channels replicated.
845	 */
846	if (util_bitcount(writemask_sum) < util_bitcount(inst_mul->U.I.DstReg.WriteMask))
847		return 0;
848
849	/* Rewrite the instructions */
850	for (var = writer_list->Item; var; var = var->Friend) {
851		struct rc_variable * writer = var;
852		unsigned conversion_swizzle = rc_make_conversion_swizzle(
853					writemask_sum,
854					inst_mul->U.I.DstReg.WriteMask);
855		writer->Inst->U.I.Omod = omod_op;
856		writer->Inst->U.I.DstReg.File = inst_mul->U.I.DstReg.File;
857		writer->Inst->U.I.DstReg.Index = inst_mul->U.I.DstReg.Index;
858		rc_normal_rewrite_writemask(writer->Inst, conversion_swizzle);
859		writer->Inst->U.I.SaturateMode = inst_mul->U.I.SaturateMode;
860	}
861
862	rc_remove_instruction(inst_mul);
863
864	return 1;
865}
866
867/**
868 * @return
869 * 	0 if inst is still part of the program.
870 * 	1 if inst is no longer part of the program.
871 */
872static int peephole(struct radeon_compiler * c, struct rc_instruction * inst)
873{
874	switch(inst->U.I.Opcode){
875	case RC_OPCODE_ADD:
876		if (c->has_presub) {
877			if(peephole_add_presub_inv(c, inst))
878				return 1;
879			if(peephole_add_presub_add(c, inst))
880				return 1;
881		}
882		break;
883	default:
884		break;
885	}
886	return 0;
887}
888
889static unsigned int merge_swizzles(unsigned int swz1, unsigned int swz2) {
890	unsigned int new_swz = rc_init_swizzle(RC_SWIZZLE_UNUSED, 0);
891	for (unsigned int chan = 0; chan < 4; chan++) {
892		unsigned int swz = GET_SWZ(swz1, chan);
893		if (swz != RC_SWIZZLE_UNUSED) {
894			SET_SWZ(new_swz, chan, swz);
895			continue;
896		}
897		swz = GET_SWZ(swz2, chan);
898		SET_SWZ(new_swz, chan, swz);
899	}
900	return new_swz;
901}
902
903static int merge_movs(struct radeon_compiler * c, struct rc_instruction * inst)
904{
905	unsigned int orig_dst_reg = inst->U.I.DstReg.Index;
906	unsigned int orig_dst_file = inst->U.I.DstReg.File;
907	unsigned int orig_dst_wmask = inst->U.I.DstReg.WriteMask;
908	unsigned int orig_src_reg = inst->U.I.SrcReg[0].Index;
909	unsigned int orig_src_file = inst->U.I.SrcReg[0].File;
910
911	struct rc_instruction * cur = inst;
912	while (cur!= &c->Program.Instructions) {
913		cur = cur->Next;
914		const struct rc_opcode_info * opcode = rc_get_opcode_info(cur->U.I.Opcode);
915
916		/* Keep it simple for now and stop when encountering any
917		 * control flow.
918		 */
919		if (opcode->IsFlowControl)
920			return 0;
921
922		/* Stop when the original destination is overwritten */
923		if (orig_dst_reg == cur->U.I.DstReg.Index &&
924			orig_dst_file == cur->U.I.DstReg.File &&
925			(orig_dst_wmask & cur->U.I.DstReg.WriteMask) != 0)
926			return 0;
927
928		/* Stop the search when the original instruction destination
929		 * is used as a source for anything.
930		 */
931		for (unsigned i = 0; i < opcode->NumSrcRegs; i++) {
932			if (cur->U.I.SrcReg[i].File == orig_dst_file &&
933				cur->U.I.SrcReg[i].Index == orig_dst_reg)
934				return 0;
935		}
936
937		if (cur->U.I.Opcode == RC_OPCODE_MOV &&
938			cur->U.I.DstReg.File == orig_dst_file &&
939			cur->U.I.DstReg.Index == orig_dst_reg &&
940			(cur->U.I.DstReg.WriteMask & orig_dst_wmask) == 0) {
941
942			/* We can merge the movs if one of them is from inline constant */
943			if (cur->U.I.SrcReg[0].File == RC_FILE_NONE ||
944				orig_src_file == RC_FILE_NONE) {
945				cur->U.I.DstReg.WriteMask |= orig_dst_wmask;
946
947				if (cur->U.I.SrcReg[0].File == RC_FILE_NONE) {
948					cur->U.I.SrcReg[0].File = orig_src_file;
949					cur->U.I.SrcReg[0].Index = orig_src_reg;
950					cur->U.I.SrcReg[0].Abs = inst->U.I.SrcReg[0].Abs;
951					cur->U.I.SrcReg[0].RelAddr = inst->U.I.SrcReg[0].RelAddr;
952				}
953				cur->U.I.SrcReg[0].Swizzle =
954					merge_swizzles(cur->U.I.SrcReg[0].Swizzle,
955							inst->U.I.SrcReg[0].Swizzle);
956
957				cur->U.I.SrcReg[0].Negate |= inst->U.I.SrcReg[0].Negate;
958
959				/* finally delete the original mov */
960				rc_remove_instruction(inst);
961
962				return 1;
963			}
964		}
965	}
966	return 0;
967}
968
969void rc_optimize(struct radeon_compiler * c, void *user)
970{
971	struct rc_instruction * inst = c->Program.Instructions.Next;
972	while(inst != &c->Program.Instructions) {
973		struct rc_instruction * cur = inst;
974		inst = inst->Next;
975
976		constant_folding(c, cur);
977
978		if(peephole(c, cur))
979			continue;
980
981		if (cur->U.I.Opcode == RC_OPCODE_MOV) {
982			if (c->is_r500) {
983				if (merge_movs(c, cur))
984					continue;
985			}
986			copy_propagate(c, cur);
987			/* cur may no longer be part of the program */
988		}
989	}
990
991	if (!c->has_omod) {
992		return;
993	}
994
995	inst = c->Program.Instructions.Next;
996	struct rc_list * var_list = NULL;
997	while(inst != &c->Program.Instructions) {
998		struct rc_instruction * cur = inst;
999		inst = inst->Next;
1000		if (cur->U.I.Opcode == RC_OPCODE_MUL) {
1001			if (!var_list)
1002				var_list = rc_get_variables(c);
1003			if (peephole_mul_omod(c, cur, var_list))
1004				var_list = NULL;
1005		}
1006	}
1007}
1008