1/*
2 * Copyright 2013 Vadim Girlin <vadimgirlin@gmail.com>
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * on the rights to use, copy, modify, merge, publish, distribute, sub
8 * license, and/or sell copies of the Software, and to permit persons to whom
9 * the Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
19 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
20 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
21 * USE OR OTHER DEALINGS IN THE SOFTWARE.
22 *
23 * Authors:
24 *      Vadim Girlin
25 */
26
27#ifndef SB_PASS_H_
28#define SB_PASS_H_
29
30#include <stack>
31
32namespace r600_sb {
33
34class pass {
35protected:
36	sb_context &ctx;
37	shader &sh;
38
39public:
40	pass(shader &s);
41
42	virtual int run();
43
44	virtual ~pass() {}
45};
46
47class vpass : public pass {
48
49public:
50
51	vpass(shader &s) : pass(s) {}
52
53	virtual int init();
54	virtual int done();
55
56	virtual int run();
57	virtual void run_on(container_node &n);
58
59	virtual bool visit(node &n, bool enter);
60	virtual bool visit(container_node &n, bool enter);
61	virtual bool visit(alu_group_node &n, bool enter);
62	virtual bool visit(cf_node &n, bool enter);
63	virtual bool visit(alu_node &n, bool enter);
64	virtual bool visit(alu_packed_node &n, bool enter);
65	virtual bool visit(fetch_node &n, bool enter);
66	virtual bool visit(region_node &n, bool enter);
67	virtual bool visit(repeat_node &n, bool enter);
68	virtual bool visit(depart_node &n, bool enter);
69	virtual bool visit(if_node &n, bool enter);
70	virtual bool visit(bb_node &n, bool enter);
71
72};
73
74class rev_vpass : public vpass {
75
76public:
77	rev_vpass(shader &s) : vpass(s) {}
78
79	virtual void run_on(container_node &n);
80};
81
82
83// =================== PASSES
84
85class bytecode;
86
87class bc_dump : public vpass {
88	using vpass::visit;
89
90	uint32_t *bc_data;
91	unsigned ndw;
92
93	unsigned id;
94
95	unsigned new_group, group_index;
96
97public:
98
99	bc_dump(shader &s, bytecode *bc = NULL);
100
101	bc_dump(shader &s, uint32_t *bc_ptr, unsigned ndw) :
102		vpass(s), bc_data(bc_ptr), ndw(ndw), id(), new_group(), group_index() {}
103
104	virtual int init();
105	virtual int done();
106
107	virtual bool visit(cf_node &n, bool enter);
108	virtual bool visit(alu_node &n, bool enter);
109	virtual bool visit(fetch_node &n, bool enter);
110
111	void dump_dw(unsigned dw_id, unsigned count = 2);
112
113	void dump(cf_node& n);
114	void dump(alu_node& n);
115	void dump(fetch_node& n);
116};
117
118
119class dce_cleanup : public vpass {
120	using vpass::visit;
121
122	bool remove_unused;
123
124public:
125
126	dce_cleanup(shader &s) : vpass(s),
127		remove_unused(s.dce_flags & DF_REMOVE_UNUSED), nodes_changed(false) {}
128
129	virtual int run();
130
131	virtual bool visit(node &n, bool enter);
132	virtual bool visit(alu_group_node &n, bool enter);
133	virtual bool visit(cf_node &n, bool enter);
134	virtual bool visit(alu_node &n, bool enter);
135	virtual bool visit(alu_packed_node &n, bool enter);
136	virtual bool visit(fetch_node &n, bool enter);
137	virtual bool visit(region_node &n, bool enter);
138	virtual bool visit(container_node &n, bool enter);
139
140private:
141
142	void cleanup_dst(node &n);
143	bool cleanup_dst_vec(vvec &vv);
144
145	// Did we alter/remove nodes during a single pass?
146	bool nodes_changed;
147};
148
149
150class def_use : public pass {
151
152public:
153
154	def_use(shader &sh) : pass(sh) {}
155
156	virtual int run();
157	void run_on(node *n, bool defs);
158
159private:
160
161	void process_uses(node *n);
162	void process_defs(node *n, vvec &vv, bool arr_def);
163	void process_phi(container_node *c, bool defs, bool uses);
164};
165
166
167
168class dump : public vpass {
169	using vpass::visit;
170
171	int level;
172
173public:
174
175	dump(shader &s) : vpass(s), level(0) {}
176
177	virtual bool visit(node &n, bool enter);
178	virtual bool visit(container_node &n, bool enter);
179	virtual bool visit(alu_group_node &n, bool enter);
180	virtual bool visit(cf_node &n, bool enter);
181	virtual bool visit(alu_node &n, bool enter);
182	virtual bool visit(alu_packed_node &n, bool enter);
183	virtual bool visit(fetch_node &n, bool enter);
184	virtual bool visit(region_node &n, bool enter);
185	virtual bool visit(repeat_node &n, bool enter);
186	virtual bool visit(depart_node &n, bool enter);
187	virtual bool visit(if_node &n, bool enter);
188	virtual bool visit(bb_node &n, bool enter);
189
190
191	static void dump_op(node &n, const char *name);
192	static void dump_vec(const vvec & vv);
193	static void dump_set(shader &sh, val_set & v);
194
195	static void dump_rels(vvec & vv);
196
197	static void dump_val(value *v);
198	static void dump_op(node *n);
199
200	static void dump_op_list(container_node *c);
201	static void dump_queue(sched_queue &q);
202
203	static void dump_alu(alu_node *n);
204
205private:
206
207	void indent();
208
209	void dump_common(node &n);
210	void dump_flags(node &n);
211
212	void dump_live_values(container_node &n, bool before);
213};
214
215
216// Global Code Motion
217
218class gcm : public pass {
219
220	sched_queue bu_ready[SQ_NUM];
221	sched_queue bu_ready_next[SQ_NUM];
222	sched_queue bu_ready_early[SQ_NUM];
223	sched_queue ready;
224	sched_queue ready_above;
225
226	unsigned outstanding_lds_oq;
227	container_node pending;
228
229	struct op_info {
230		bb_node* top_bb;
231		bb_node* bottom_bb;
232		op_info() : top_bb(), bottom_bb() {}
233	};
234
235	typedef std::map<node*, op_info> op_info_map;
236
237	typedef std::map<node*, unsigned> nuc_map;
238
239	op_info_map op_map;
240	nuc_map uses;
241
242	typedef std::vector<nuc_map> nuc_stack;
243
244	nuc_stack nuc_stk;
245	unsigned ucs_level;
246
247	bb_node * bu_bb;
248
249	vvec pending_defs;
250
251	node_list pending_nodes;
252
253	unsigned cur_sq;
254
255	// for register pressure tracking in bottom-up pass
256	val_set live;
257	int live_count;
258
259	static const int rp_threshold = 100;
260
261	bool pending_exec_mask_update;
262
263public:
264
265	gcm(shader &sh) : pass(sh),
266		bu_ready(), bu_ready_next(), bu_ready_early(),
267		ready(), outstanding_lds_oq(),
268		op_map(), uses(), nuc_stk(1), ucs_level(),
269		bu_bb(), pending_defs(), pending_nodes(), cur_sq(),
270		live(), live_count(), pending_exec_mask_update() {}
271
272	virtual int run();
273
274private:
275
276	void collect_instructions(container_node *c, bool early_pass);
277
278	void sched_early(container_node *n);
279	void td_sched_bb(bb_node *bb);
280	bool td_is_ready(node *n);
281	void td_release_uses(vvec &v);
282	void td_release_val(value *v);
283	void td_schedule(bb_node *bb, node *n);
284
285	void sched_late(container_node *n);
286	void bu_sched_bb(bb_node *bb);
287	void bu_release_defs(vvec &v, bool src);
288	void bu_release_phi_defs(container_node *p, unsigned op);
289	bool bu_is_ready(node *n);
290	void bu_release_val(value *v);
291	void bu_release_op(node * n);
292	void bu_find_best_bb(node *n, op_info &oi);
293	void bu_schedule(container_node *bb, node *n);
294
295	void push_uc_stack();
296	void pop_uc_stack();
297
298	void init_def_count(nuc_map &m, container_node &s);
299	void init_use_count(nuc_map &m, container_node &s);
300	unsigned get_uc_vec(vvec &vv);
301	unsigned get_dc_vec(vvec &vv, bool src);
302
303	void add_ready(node *n);
304
305	void dump_uc_stack();
306
307	unsigned real_alu_count(sched_queue &q, unsigned max);
308
309	// check if we have not less than threshold ready alu instructions
310	bool check_alu_ready_count(unsigned threshold);
311};
312
313
314class gvn : public vpass {
315	using vpass::visit;
316
317public:
318
319	gvn(shader &sh) : vpass(sh) {}
320
321	virtual bool visit(node &n, bool enter);
322	virtual bool visit(cf_node &n, bool enter);
323	virtual bool visit(alu_node &n, bool enter);
324	virtual bool visit(alu_packed_node &n, bool enter);
325	virtual bool visit(fetch_node &n, bool enter);
326	virtual bool visit(region_node &n, bool enter);
327
328private:
329
330	void process_op(node &n, bool rewrite = true);
331
332	// returns true if the value was rewritten
333	bool process_src(value* &v, bool rewrite);
334
335
336	void process_alu_src_constants(node &n, value* &v);
337};
338
339
340class if_conversion : public pass {
341
342public:
343
344	if_conversion(shader &sh) : pass(sh) {}
345
346	virtual int run();
347
348	bool run_on(region_node *r);
349
350	void convert_kill_instructions(region_node *r, value *em, bool branch,
351	                               container_node *c);
352
353	bool check_and_convert(region_node *r);
354
355	alu_node* convert_phi(value *select, node *phi);
356
357};
358
359
360class liveness : public rev_vpass {
361	using vpass::visit;
362
363	val_set live;
364	bool live_changed;
365
366public:
367
368	liveness(shader &s) : rev_vpass(s), live_changed(false) {}
369
370	virtual int init();
371
372	virtual bool visit(node &n, bool enter);
373	virtual bool visit(bb_node &n, bool enter);
374	virtual bool visit(container_node &n, bool enter);
375	virtual bool visit(alu_group_node &n, bool enter);
376	virtual bool visit(cf_node &n, bool enter);
377	virtual bool visit(alu_node &n, bool enter);
378	virtual bool visit(alu_packed_node &n, bool enter);
379	virtual bool visit(fetch_node &n, bool enter);
380	virtual bool visit(region_node &n, bool enter);
381	virtual bool visit(repeat_node &n, bool enter);
382	virtual bool visit(depart_node &n, bool enter);
383	virtual bool visit(if_node &n, bool enter);
384
385private:
386
387	void update_interferences();
388	void process_op(node &n);
389
390	bool remove_val(value *v);
391	bool remove_vec(vvec &v);
392	bool process_outs(node& n);
393	void process_ins(node& n);
394
395	void process_phi_outs(container_node *phi);
396	void process_phi_branch(container_node *phi, unsigned id);
397
398	bool process_maydef(value *v);
399
400	bool add_vec(vvec &vv, bool src);
401
402	void update_src_vec(vvec &vv, bool src);
403};
404
405
406struct bool_op_info {
407	bool invert;
408	unsigned int_cvt;
409
410	alu_node *n;
411};
412
413class peephole : public pass {
414
415public:
416
417	peephole(shader &sh) : pass(sh) {}
418
419	virtual int run();
420
421	void run_on(container_node *c);
422
423	void optimize_cc_op(alu_node *a);
424
425	void optimize_cc_op2(alu_node *a);
426	void optimize_CNDcc_op(alu_node *a);
427
428	bool get_bool_op_info(value *b, bool_op_info& bop);
429	bool get_bool_flt_to_int_source(alu_node* &a);
430	void convert_float_setcc(alu_node *f2i, alu_node *s);
431};
432
433
434class psi_ops : public rev_vpass {
435	using rev_vpass::visit;
436
437public:
438
439	psi_ops(shader &s) : rev_vpass(s) {}
440
441	virtual bool visit(node &n, bool enter);
442	virtual bool visit(alu_node &n, bool enter);
443
444	bool try_inline(node &n);
445	bool try_reduce(node &n);
446	bool eliminate(node &n);
447
448	void unpredicate(node *n);
449};
450
451
452// check correctness of the generated code, e.g.:
453// - expected source operand value is the last value written to its gpr,
454// - all arguments of phi node should be allocated to the same gpr,
455// TODO other tests
456class ra_checker : public pass {
457
458	typedef std::map<sel_chan, value *> reg_value_map;
459
460	typedef std::vector<reg_value_map> regmap_stack;
461
462	regmap_stack rm_stack;
463	unsigned rm_stk_level;
464
465	value* prev_dst[5];
466
467public:
468
469	ra_checker(shader &sh) : pass(sh), rm_stk_level(0), prev_dst() {}
470
471	virtual int run();
472
473	void run_on(container_node *c);
474
475	void dump_error(const error_info &e);
476	void dump_all_errors();
477
478private:
479
480	reg_value_map& rmap() { return rm_stack[rm_stk_level]; }
481
482	void push_stack();
483	void pop_stack();
484
485	// when going out of the alu clause, values in the clause temporary gprs,
486	// AR, predicate values, PS/PV are destroyed
487	void kill_alu_only_regs();
488	void error(node *n, unsigned id, std::string msg);
489
490	void check_phi_src(container_node *p, unsigned id);
491	void process_phi_dst(container_node *p);
492	void check_alu_group(alu_group_node *g);
493	void process_op_dst(node *n);
494	void check_op_src(node *n);
495	void check_src_vec(node *n, unsigned id, vvec &vv, bool src);
496	void check_value_gpr(node *n, unsigned id, value *v);
497};
498
499// =======================================
500
501
502class ra_coalesce : public pass {
503
504public:
505
506	ra_coalesce(shader &sh) : pass(sh) {}
507
508	virtual int run();
509};
510
511
512
513// =======================================
514
515class ra_init : public pass {
516
517public:
518
519	ra_init(shader &sh) : pass(sh), prev_chans() {
520
521		// The parameter below affects register channels distribution.
522		// For cayman (VLIW-4) we're trying to distribute the channels
523		// uniformly, this means significantly better alu slots utilization
524		// at the expense of higher gpr usage. Hopefully this will improve
525		// performance, though it has to be proven with real benchmarks yet.
526		// For VLIW-5 this method could also slightly improve slots
527		// utilization, but increased register pressure seems more significant
528		// and overall performance effect is negative according to some
529		// benchmarks, so it's not used currently. Basically, VLIW-5 doesn't
530		// really need it because trans slot (unrestricted by register write
531		// channel) allows to consume most deviations from uniform channel
532		// distribution.
533		// Value 3 means that for new allocation we'll use channel that differs
534		// from 3 last used channels. 0 for VLIW-5 effectively turns this off.
535
536		ra_tune = sh.get_ctx().is_cayman() ? 3 : 0;
537	}
538
539	virtual int run();
540
541private:
542
543	unsigned prev_chans;
544	unsigned ra_tune;
545
546	void add_prev_chan(unsigned chan);
547	unsigned get_preferable_chan_mask();
548
549	bool ra_node(container_node *c);
550	bool process_op(node *n);
551
552	bool color(value *v);
553
554	void color_bs_constraint(ra_constraint *c);
555
556	void assign_color(value *v, sel_chan c);
557	void alloc_arrays();
558};
559
560// =======================================
561
562class ra_split : public pass {
563
564public:
565
566	ra_split(shader &sh) : pass(sh) {}
567
568	virtual int run();
569
570	void split(container_node *n);
571	void split_op(node *n);
572	void split_alu_packed(alu_packed_node *n);
573	void split_vector_inst(node *n);
574
575	void split_packed_ins(alu_packed_node *n);
576
577#if 0
578	void split_pinned_outs(node *n);
579#endif
580
581	void split_vec(vvec &vv, vvec &v1, vvec &v2, bool allow_swz);
582
583	void split_phi_src(container_node *loc, container_node *c, unsigned id,
584	                   bool loop);
585	void split_phi_dst(node *loc, container_node *c, bool loop);
586	void init_phi_constraints(container_node *c);
587};
588
589
590
591class ssa_prepare : public vpass {
592	using vpass::visit;
593
594	typedef std::vector<val_set> vd_stk;
595	vd_stk stk;
596
597	unsigned level;
598
599public:
600	ssa_prepare(shader &s) : vpass(s), level(0) {}
601
602	virtual bool visit(cf_node &n, bool enter);
603	virtual bool visit(alu_node &n, bool enter);
604	virtual bool visit(fetch_node &n, bool enter);
605	virtual bool visit(region_node &n, bool enter);
606	virtual bool visit(repeat_node &n, bool enter);
607	virtual bool visit(depart_node &n, bool enter);
608
609private:
610
611	void push_stk() {
612		++level;
613		if (level + 1 > stk.size())
614			stk.resize(level+1);
615		else
616			stk[level].clear();
617	}
618	void pop_stk() {
619		assert(level);
620		--level;
621		stk[level].add_set(stk[level + 1]);
622	}
623
624	void add_defs(node &n);
625
626	val_set & cur_set() { return stk[level]; }
627
628	container_node* create_phi_nodes(int count);
629};
630
631class ssa_rename : public vpass {
632	using vpass::visit;
633
634	typedef sb_map<value*, unsigned> def_map;
635
636	def_map def_count;
637	def_map lds_oq_count;
638	def_map lds_rw_count;
639	std::stack<def_map> rename_stack;
640	std::stack<def_map> rename_lds_oq_stack;
641	std::stack<def_map> rename_lds_rw_stack;
642
643	typedef std::map<uint32_t, value*> val_map;
644	val_map values;
645
646public:
647
648	ssa_rename(shader &s) : vpass(s) {}
649
650	virtual int init();
651
652	virtual bool visit(container_node &n, bool enter);
653	virtual bool visit(node &n, bool enter);
654	virtual bool visit(alu_group_node &n, bool enter);
655	virtual bool visit(cf_node &n, bool enter);
656	virtual bool visit(alu_node &n, bool enter);
657	virtual bool visit(alu_packed_node &n, bool enter);
658	virtual bool visit(fetch_node &n, bool enter);
659	virtual bool visit(region_node &n, bool enter);
660	virtual bool visit(repeat_node &n, bool enter);
661	virtual bool visit(depart_node &n, bool enter);
662	virtual bool visit(if_node &n, bool enter);
663
664private:
665
666	void push(node *phi);
667	void pop();
668
669	unsigned get_index(def_map& m, value* v);
670	void set_index(def_map& m, value* v, unsigned index);
671	unsigned new_index(def_map& m, value* v);
672
673	value* rename_use(node *n, value* v);
674	value* rename_def(node *def, value* v);
675
676	void rename_src_vec(node *n, vvec &vv, bool src);
677	void rename_dst_vec(node *def, vvec &vv, bool set_def);
678
679	void rename_src(node *n);
680	void rename_dst(node *n);
681
682	void rename_phi_args(container_node *phi, unsigned op, bool def);
683
684	void rename_virt(node *n);
685	void rename_virt_val(node *n, value *v);
686};
687
688class bc_finalizer : public pass {
689
690	cf_node *last_export[EXP_TYPE_COUNT];
691	cf_node *last_cf;
692
693	unsigned ngpr;
694	unsigned nstack;
695
696public:
697
698	bc_finalizer(shader &sh) : pass(sh), last_export(), last_cf(), ngpr(),
699		nstack() {}
700
701	virtual int run();
702
703	void finalize_loop(region_node *r);
704	void finalize_if(region_node *r);
705
706	void run_on(container_node *c);
707
708	void insert_rv6xx_load_ar_workaround(alu_group_node *b4);
709	void finalize_alu_group(alu_group_node *g, node *prev_node);
710	bool finalize_alu_src(alu_group_node *g, alu_node *a, alu_group_node *prev_node);
711
712	void emit_set_grad(fetch_node* f);
713	void finalize_fetch(fetch_node *f);
714
715	void finalize_cf(cf_node *c);
716
717	sel_chan translate_kcache(cf_node *alu, value *v);
718
719	void update_ngpr(unsigned gpr);
720	void update_nstack(region_node *r, unsigned add = 0);
721
722	unsigned get_stack_depth(node *n, unsigned &loops, unsigned &ifs,
723	                         unsigned add = 0);
724
725	void cf_peephole();
726
727private:
728	void copy_fetch_src(fetch_node &dst, fetch_node &src, unsigned arg_start);
729	void emit_set_texture_offsets(fetch_node &f);
730};
731
732
733} // namespace r600_sb
734
735#endif /* SB_PASS_H_ */
736