/*
 * Copyright 2013 Vadim Girlin <vadimgirlin@gmail.com>
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * on the rights to use, copy, modify, merge, publish, distribute, sub
 * license, and/or sell copies of the Software, and to permit persons to whom
 * the Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
 * USE OR OTHER DEALINGS IN THE SOFTWARE.
 *
 * Authors:
 *      Vadim Girlin
 */

#define GCM_DEBUG 0

#if GCM_DEBUG
#define GCM_DUMP(a) do { a } while (0)
#else
#define GCM_DUMP(a) do { } while (0)
#endif

#include <map>

#include "sb_bc.h"
#include "sb_shader.h"
#include "sb_pass.h"
#include "eg_sq.h" // V_SQ_CF_INDEX_NONE

namespace r600_sb {

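// Global code motion pass. Instructions are first pulled out of the basic
// blocks into the 'pending' list, then placed twice: the early (top-down)
// pass finds the earliest block where each op's inputs are available
// (op_info::top_bb), and the late (bottom-up) pass finds the latest block
// where all uses are satisfied (op_info::bottom_bb), choosing the final
// placement between the two with the lowest loop nesting level.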
int gcm::run() {

	GCM_DUMP( sblog << "==== GCM ==== \n"; sh.dump_ir(); );

	collect_instructions(sh.root, true);

	init_def_count(uses, pending);

	for (node_iterator N, I = pending.begin(), E = pending.end();
			I != E; I = N) {
		N = I;
		++N;
		node *o = *I;

		GCM_DUMP(
			sblog << "pending : ";
			dump::dump_op(o);
			sblog << "\n";
		);

		if (td_is_ready(o)) {

			GCM_DUMP(
				sblog << "  ready: ";
				dump::dump_op(o);
				sblog << "\n";
			);
			pending.remove_node(o);
			ready.push_back(o);
		}
	}

	sched_early(sh.root);

	if (!pending.empty()) {
		sblog << "##### gcm_sched_early_pass: unscheduled ops:\n";
		dump::dump_op(pending.front());
	}

	assert(pending.empty());

	GCM_DUMP( sh.dump_ir(); );

	GCM_DUMP( sblog << "\n\n ############## gcm late\n\n"; );

	collect_instructions(sh.root, false);

	init_use_count(uses, pending);

	sched_late(sh.root);
	if (!pending.empty()) {
		sblog << "##### gcm_sched_late_pass: unscheduled ops:\n";
		dump::dump_op(pending.front());
	}

	assert(ucs_level == 0);
	assert(pending.empty());

	return 0;
}

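// Moves all instructions out of the basic blocks into the 'pending' list.
// On the early pass, ops flagged NF_DONT_MOVE are pinned to their current
// block by pre-setting both top_bb and bottom_bb.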
void gcm::collect_instructions(container_node *c, bool early_pass) {
	if (c->is_bb()) {

		if (early_pass) {
			for (node_iterator I = c->begin(), E = c->end(); I != E; ++I) {
				node *n = *I;
				if (n->flags & NF_DONT_MOVE) {
					op_info &o = op_map[n];
					o.top_bb = o.bottom_bb = static_cast<bb_node*>(c);
				}
			}
		}

		pending.append_from(c);
		return;
	}

	for (node_iterator I = c->begin(), E = c->end(); I != E; ++I) {
		if (I->is_container()) {
			collect_instructions(static_cast<container_node*>(*I), early_pass);
		}
	}
}

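// Top-down walk of the control flow tree: a region's loop phis are
// processed before its body, regular phis after it, and each basic block
// is filled by td_sched_bb.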
void gcm::sched_early(container_node *n) {

	region_node *r =
			(n->type == NT_REGION) ? static_cast<region_node*>(n) : NULL;

	if (r && r->loop_phi) {
		sched_early(r->loop_phi);
	}

	for (node_iterator I = n->begin(), E = n->end(); I != E; ++I) {
		if (I->type == NT_OP) {
			node *op = *I;
			if (op->subtype == NST_PHI) {
				td_release_uses(op->dst);
			}
		} else if (I->is_container()) {
			if (I->subtype == NST_BB) {
				bb_node* bb = static_cast<bb_node*>(*I);
				td_sched_bb(bb);
			} else {
				sched_early(static_cast<container_node*>(*I));
			}
		}
	}

	if (r && r->phi) {
		sched_early(r->phi);
	}
}

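// Appends a ready op to the current block and releases the values it
// defines, which may make more pending ops ready.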
void gcm::td_schedule(bb_node *bb, node *n) {
	GCM_DUMP(
		sblog << "scheduling : ";
		dump::dump_op(n);
		sblog << "\n";
	);
	td_release_uses(n->dst);

	bb->push_back(n);

	op_map[n].top_bb = bb;
}

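// Drains the 'ready' queue into the block; scheduling one op may release
// more ops into the queue, so it's rescanned until it stays empty.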
void gcm::td_sched_bb(bb_node* bb) {
	GCM_DUMP(
	sblog << "td scheduling BB_" << bb->id << "\n";
	);

	while (!ready.empty()) {
		for (sq_iterator N, I = ready.begin(), E = ready.end(); I != E;
				I = N) {
			N = I; ++N;
			td_schedule(bb, *I);
			ready.erase(I);
		}
	}
}

bool gcm::td_is_ready(node* n) {
	return uses[n] == 0;
}

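// Decrements the pending-def count of every op that uses v; an op whose
// count reaches zero has all of its inputs available and becomes ready.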
void gcm::td_release_val(value *v) {

	GCM_DUMP(
		sblog << "td checking uses: ";
		dump::dump_val(v);
		sblog << "\n";
	);

	for (uselist::iterator I = v->uses.begin(), E = v->uses.end(); I != E; ++I) {
		node *op = *I;
		if (op->parent != &pending) {
			continue;
		}

		GCM_DUMP(
			sblog << "td    used in ";
			dump::dump_op(op);
			sblog << "\n";
		);

		assert(uses[op] > 0);
		if (--uses[op] == 0) {
			GCM_DUMP(
				sblog << "td        released : ";
				dump::dump_op(op);
				sblog << "\n";
			);

			pending.remove_node(op);
			ready.push_back(op);
		}
	}
}

void gcm::td_release_uses(vvec& v) {
	for (vvec::iterator I = v.begin(), E = v.end(); I != E; ++I) {
		value *val = *I;
		if (!val)
			continue;

		if (val->is_rel())
			td_release_uses(val->mdef);
		else
			td_release_val(val);
	}
}

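// Bottom-up walk of the control flow tree. Depart/repeat nodes open a new
// use-count level (push_uc_stack) before releasing the corresponding phi
// operands, so uses seen on one conditional path alone don't release an
// op; pop_uc_stack later merges the counts back into the parent level.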
void gcm::sched_late(container_node *n) {

	bool stack_pushed = false;

	if (n->is_depart()) {
		depart_node *d = static_cast<depart_node*>(n);
		push_uc_stack();
		stack_pushed = true;
		bu_release_phi_defs(d->target->phi, d->dep_id);
	} else if (n->is_repeat()) {
		repeat_node *r = static_cast<repeat_node*>(n);
		assert(r->target->loop_phi);
		push_uc_stack();
		stack_pushed = true;
		bu_release_phi_defs(r->target->loop_phi, r->rep_id);
	}

	for (node_riterator I = n->rbegin(), E = n->rend(); I != E; ++I) {
		if (I->is_container()) {
			if (I->subtype == NST_BB) {
				bb_node* bb = static_cast<bb_node*>(*I);
				bu_sched_bb(bb);
			} else {
				sched_late(static_cast<container_node*>(*I));
			}
		}
	}

	if (n->type == NT_IF) {
		if_node *f = static_cast<if_node*>(n);
		if (f->cond)
			pending_defs.push_back(f->cond);
	} else if (n->type == NT_REGION) {
		region_node *r = static_cast<region_node*>(n);
		if (r->loop_phi)
			bu_release_phi_defs(r->loop_phi, 0);
	}

	if (stack_pushed)
		pop_uc_stack();
}

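// Bottom-up scheduling of a single basic block: ops whose uses are all
// satisfied are pulled from the ready queues and prepended to the block,
// grouped into ALU/TEX/VTX/GDS clauses. Queue selection balances clause
// size against register pressure (live_count vs. rp_threshold).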
void gcm::bu_sched_bb(bb_node* bb) {
	GCM_DUMP(
	sblog << "bu scheduling BB_" << bb->id << "\n";
	);

	bu_bb = bb;

	if (!pending_nodes.empty()) {
		GCM_DUMP(
				sblog << "pending nodes:\n";
		);

		// TODO consider sorting the exports by array_base,
		// it may improve performance

		for (node_list::iterator I = pending_nodes.begin(),
				E = pending_nodes.end(); I != E; ++I) {
			bu_release_op(*I);
		}
		pending_nodes.clear();
		GCM_DUMP(
			sblog << "pending nodes processed...\n";
		);
	}

	if (!pending_defs.empty()) {
		for (vvec::iterator I = pending_defs.begin(), E = pending_defs.end();
				I != E; ++I) {
			bu_release_val(*I);
		}
		pending_defs.clear();
	}

	for (sched_queue::iterator N, I = ready_above.begin(), E = ready_above.end();
			I != E; I = N) {
		N = I;
		++N;
		node *n = *I;
		if (op_map[n].bottom_bb == bb) {
			add_ready(*I);
			ready_above.erase(I);
		}
	}

	unsigned cnt_ready[SQ_NUM];

	container_node *clause = NULL;
	unsigned last_inst_type = ~0;
	unsigned last_count = 0;

	bool s = true;
	while (s) {
		s = false;

		unsigned ready_mask = 0;

		for (unsigned sq = SQ_CF; sq < SQ_NUM; ++sq) {
			if (!bu_ready[sq].empty() || !bu_ready_next[sq].empty())
				ready_mask |= (1 << sq);
		}

		if (!ready_mask) {
			for (unsigned sq = SQ_CF; sq < SQ_NUM; ++sq) {
				if (!bu_ready_early[sq].empty()) {
					node *e = bu_ready_early[sq].front();
					bu_ready_early[sq].pop_front();
					bu_ready[sq].push_back(e);
					break;
				}
			}
		}

		for (unsigned sq = SQ_CF; sq < SQ_NUM; ++sq) {

			if (sq == SQ_CF && pending_exec_mask_update) {
				pending_exec_mask_update = false;
				// restart the scan from SQ_ALU on the next iteration
				sq = SQ_ALU;
				--sq;
				continue;
			}

			if (sq != SQ_ALU && outstanding_lds_oq)
				continue;

			if (!bu_ready_next[sq].empty())
				bu_ready[sq].splice(bu_ready[sq].end(), bu_ready_next[sq]);

			cnt_ready[sq] = bu_ready[sq].size();

			if ((sq == SQ_TEX || sq == SQ_VTX) && live_count <= rp_threshold &&
					cnt_ready[sq] < ctx.max_fetch/2 &&
					!bu_ready_next[SQ_ALU].empty()) {
				// prefer ALU work while the fetch queue is short and register
				// pressure is low; restart the scan from SQ_ALU
				sq = SQ_ALU;
				--sq;
				continue;
			}

			while (!bu_ready[sq].empty()) {

				if (last_inst_type != sq) {
					clause = NULL;
					last_count = 0;
					last_inst_type = sq;
				}

				// simple heuristic to limit register pressure
				if (sq == SQ_ALU && live_count > rp_threshold && !outstanding_lds_oq &&
						(!bu_ready[SQ_TEX].empty() ||
						 !bu_ready[SQ_VTX].empty() ||
						 !bu_ready_next[SQ_TEX].empty() ||
						 !bu_ready_next[SQ_VTX].empty())) {
					GCM_DUMP( sblog << "switching to fetch (regpressure)\n"; );
					break;
				}

				node *n = bu_ready[sq].front();

				// real count (e.g. SAMPLE_G will be expanded to 3 instructions:
				// 2 SET_GRAD_ + 1 SAMPLE_G)
				unsigned ncnt = 1;
				if (n->is_fetch_inst() && n->src.size() == 12) {
					ncnt = 3;
				}

				bool sampler_indexing = false;
				if (n->is_fetch_inst() &&
					static_cast<fetch_node *>(n)->bc.sampler_index_mode != V_SQ_CF_INDEX_NONE)
				{
					sampler_indexing = true; // sampler-indexed ops get their own clause
					ncnt = sh.get_ctx().is_cayman() ? 2 : 3; // MOVA + SET_CF_IDX0/1
				}

				if ((sq == SQ_TEX || sq == SQ_VTX) &&
						((last_count >= ctx.max_fetch/2 &&
						check_alu_ready_count(24)) ||
								last_count + ncnt > ctx.max_fetch))
					break;
				else if (sq == SQ_CF && last_count > 4 &&
						check_alu_ready_count(24))
					break;

				if (sq == SQ_ALU && n->consumes_lds_oq() &&
				    (bu_ready[SQ_TEX].size() || bu_ready[SQ_VTX].size() || bu_ready[SQ_GDS].size())) {
					GCM_DUMP( sblog << "switching scheduling due to lds op\n"; );
					break;
				}
				bu_ready[sq].pop_front();

				if (sq != SQ_CF) {
					if (!clause || sampler_indexing) {
						node_subtype nst;
						switch (sq) {
						case SQ_ALU:
							nst = NST_ALU_CLAUSE;
							break;
						case SQ_TEX:
							nst = NST_TEX_CLAUSE;
							break;
						case SQ_GDS:
							nst = NST_GDS_CLAUSE;
							break;
						default:
							nst = NST_VTX_CLAUSE;
							break;
						}
						clause = sh.create_clause(nst);
						bb->push_front(clause);
					}
				} else {
					clause = bb;
				}

				bu_schedule(clause, n);
				s = true;
				last_count += ncnt;
			}
		}
	}

	bu_bb = NULL;

	GCM_DUMP(
		sblog << "bu finished scheduling BB_" << bb->id << "\n";
	);
}

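// Walks a value vector bottom-up. For sources, each value is released via
// bu_release_val; for destinations, the value leaves the live set and
// live_count drops. Relative (indexed) accesses recurse into muse.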
void gcm::bu_release_defs(vvec& v, bool src) {
	for (vvec::reverse_iterator I = v.rbegin(), E = v.rend(); I != E; ++I) {
		value *val = *I;
		if (!val || val->is_readonly())
			continue;

		if (val->is_rel()) {
			if (!val->rel->is_readonly())
				bu_release_val(val->rel);
			bu_release_defs(val->muse, true);
		} else if (src)
			bu_release_val(val);
		else {
			if (live.remove_val(val)) {
				--live_count;
			}
		}
	}
}

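// Use counts for the late pass are kept in a stack of maps (nuc_stk), one
// level per nested depart/repeat, so uses seen on a conditional path are
// tracked separately until the level is merged back in pop_uc_stack.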
void gcm::push_uc_stack() {
	GCM_DUMP(
		sblog << "pushing use count stack prev_level " << ucs_level
			<< "   new level " << (ucs_level + 1) << "\n";
	);
	++ucs_level;
	if (ucs_level == nuc_stk.size()) {
		nuc_stk.resize(ucs_level + 1);
	} else {
		nuc_stk[ucs_level].clear();
	}
}

bool gcm::bu_is_ready(node* n) {
	nuc_map &cm = nuc_stk[ucs_level];
	nuc_map::iterator F = cm.find(n);
	unsigned uc = (F == cm.end() ? 0 : F->second);
	return uc == uses[n];
}

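// Prepends a scheduled op to its clause, updates the LDS output-queue
// balance, and releases the op's sources and destinations.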
void gcm::bu_schedule(container_node* c, node* n) {
	GCM_DUMP(
		sblog << "bu scheduling : ";
		dump::dump_op(n);
		sblog << "\n";
	);

	assert(op_map[n].bottom_bb == bu_bb);

	if (n->produces_lds_oq())
		outstanding_lds_oq--;
	if (n->consumes_lds_oq())
		outstanding_lds_oq++;
	bu_release_defs(n->src, true);
	bu_release_defs(n->dst, false);

	c->push_front(n);
}

void gcm::dump_uc_stack() {
	sblog << "##### uc_stk start #####\n";
	for (unsigned l = 0; l <= ucs_level; ++l) {
		nuc_map &m = nuc_stk[l];

		sblog << "nuc_stk[" << l << "] :   @" << &m << "\n";

		for (nuc_map::iterator I = m.begin(), E = m.end(); I != E; ++I) {
			sblog << "    uc " << I->second << " for ";
			dump::dump_op(I->first);
			sblog << "\n";
		}
	}
	sblog << "##### uc_stk end #####\n";
}

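// Merges the topmost use-count level into its parent. Ops whose total use
// count is reached only after the merge can't be released inside the
// conditional path alone, so they are queued on pending_nodes and released
// in the enclosing block.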
void gcm::pop_uc_stack() {
	nuc_map &pm = nuc_stk[ucs_level];
	--ucs_level;
	nuc_map &cm = nuc_stk[ucs_level];

	GCM_DUMP(
		sblog << "merging use stack from level " << (ucs_level + 1)
			<< " to " << ucs_level << "\n";
	);

	for (nuc_map::iterator I = pm.begin(), E = pm.end(); I != E; ++I) {
		node *n = I->first;

		GCM_DUMP(
			sblog << "      " << cm[n] << " += " << I->second << "  for ";
			dump::dump_op(n);
			sblog << "\n";
		);

		unsigned uc = cm[n] += I->second;

		if (n->parent == &pending && uc == uses[n]) {
			cm.erase(n);
			pending_nodes.push_back(n);
			GCM_DUMP(
				sblog << "pushed pending_node due to stack pop ";
				dump::dump_op(n);
				sblog << "\n";
			);
		}
	}
}

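// Chooses the final block for an op: starting from the current (latest
// possible) block, walk up towards top_bb and take the block with the
// lowest loop nesting level.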
void gcm::bu_find_best_bb(node *n, op_info &oi) {

	GCM_DUMP(
		sblog << "  find best bb : ";
		dump::dump_op(n);
		sblog << "\n";
	);

	if (oi.bottom_bb)
		return;

	// don't hoist generated copies
	if (n->flags & NF_DONT_HOIST) {
		oi.bottom_bb = bu_bb;
		return;
	}

	bb_node* best_bb = bu_bb;
	bb_node* top_bb = oi.top_bb;
	assert(oi.top_bb && !oi.bottom_bb);

	node *c = best_bb;

	// FIXME top_bb may be located inside the loop so we'll never enter it
	// in the loop below, and the instruction will be incorrectly placed at the
	// beginning of the shader.
	// For now just check if top_bb's loop_level is higher than that of the
	// current bb and abort the search for a better bb in such cases,
	// but this problem may require a more complete (and more expensive) fix.
	if (top_bb->loop_level <= best_bb->loop_level) {
		while (c && c != top_bb) {

			if (c->prev) {
				c = c->prev;
			} else {
				c = c->parent;
				if (!c)
					break;
				continue;
			}

			if (c->subtype == NST_BB) {
				bb_node *bb = static_cast<bb_node*>(c);
				if (bb->loop_level < best_bb->loop_level)
					best_bb = bb;
			}
		}
	}

	oi.bottom_bb = best_bb;
}

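// Routes a released op to the proper ready queue: NF_SCHEDULE_EARLY ops go
// to bu_ready_early, copy MOVs jump to the front of the ALU queue, and
// PRED_SET ops that update the exec mask set pending_exec_mask_update so
// the CF queue is skipped on the next scan.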
void gcm::add_ready(node *n) {
	sched_queue_id sq = sh.get_queue_id(n);
	if (n->flags & NF_SCHEDULE_EARLY)
		bu_ready_early[sq].push_back(n);
	else if (sq == SQ_ALU && n->is_copy_mov())
		bu_ready[sq].push_front(n);
	else if (n->is_alu_inst()) {
		alu_node *a = static_cast<alu_node*>(n);
		if ((a->bc.op_ptr->flags & AF_PRED) && a->dst[2]) {
			// PRED_SET instruction that updates exec mask
			pending_exec_mask_update = true;
		}
		bu_ready_next[sq].push_back(n);
	} else
		bu_ready_next[sq].push_back(n);
}

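// Called when all uses of an op have been seen: removes it from 'pending',
// computes its final block, and queues it either locally (add_ready) or on
// ready_above if it belongs in an earlier block.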
void gcm::bu_release_op(node * n) {
	op_info &oi = op_map[n];

	GCM_DUMP(
	sblog << "  bu release op  ";
	dump::dump_op(n);
	);

	nuc_stk[ucs_level].erase(n);
	pending.remove_node(n);

	bu_find_best_bb(n, oi);

	if (oi.bottom_bb == bu_bb) {
		GCM_DUMP( sblog << "   ready\n";);
		add_ready(n);
	} else {
		GCM_DUMP( sblog << "   ready_above\n";);
		ready_above.push_back(n);
	}
}

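// Queues the values flowing into phi operand 'op', i.e. the defs coming
// from the depart/repeat branch with that index.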
void gcm::bu_release_phi_defs(container_node* p, unsigned op)
{
	for (node_riterator I = p->rbegin(), E = p->rend(); I != E; ++I) {
		node *o = *I;
		value *v = o->src[op];
		if (v && !v->is_readonly())
			pending_defs.push_back(v);
	}
}

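// Total number of uses of the values in a destination vector; relative
// accesses are counted through their mdef vectors.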
unsigned gcm::get_uc_vec(vvec &vv) {
	unsigned c = 0;
	for (vvec::iterator I = vv.begin(), E = vv.end(); I != E; ++I) {
		value *v = *I;
		if (!v)
			continue;

		if (v->is_rel())
			c += get_uc_vec(v->mdef);
		else
			c += v->use_count();
	}
	return c;
}

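// Initializes the total use counts for the late pass. Ops with no uses
// (e.g. exports) have nothing to release them, so they are queued on
// pending_nodes up front.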
void gcm::init_use_count(nuc_map& m, container_node &s) {
	m.clear();
	for (node_iterator I = s.begin(), E = s.end(); I != E; ++I) {
		node *n = *I;
		unsigned uc = get_uc_vec(n->dst);
		GCM_DUMP(
			sblog << "uc " << uc << "  ";
			dump::dump_op(n);
			sblog << "\n";
		);
		if (!uc) {
			pending_nodes.push_back(n);
			GCM_DUMP(
				sblog << "pushed pending_node in init ";
				dump::dump_op(n);
				sblog << "\n";
			);
		} else
			m[n] = uc;
	}
}

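// Bumps the current-level use count for the def of v; when the count
// reaches the total number of uses, the defining op is released. Values
// entering the live set increase live_count (register pressure estimate).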
void gcm::bu_release_val(value* v) {
	node *n = v->any_def();

	if (n && n->parent == &pending) {
		nuc_map &m = nuc_stk[ucs_level];
		unsigned uc = ++m[n];
		unsigned uc2 = uses[n];

		if (live.add_val(v)) {
			++live_count;
			GCM_DUMP ( sblog << "live_count: " << live_count << "\n"; );
		}

		GCM_DUMP(
			sblog << "release val ";
			dump::dump_val(v);
			sblog << "  for node ";
			dump::dump_op(n);
			sblog << "    new uc=" << uc << ", total " << uc2 << "\n";
		);

		if (uc == uc2)
			bu_release_op(n);
	}
}

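// Initializes the def counts for the early pass: how many defining ops
// each instruction waits for before it can be scheduled.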
void gcm::init_def_count(nuc_map& m, container_node& s) {
	m.clear();
	for (node_iterator I = s.begin(), E = s.end(); I != E; ++I) {
		node *n = *I;
		unsigned dc = get_dc_vec(n->src, true) + get_dc_vec(n->dst, false);
		m[n] = dc;

		GCM_DUMP(
			sblog << "dc " << dc << "  ";
			dump::dump_op(n);
			sblog << "\n";
		);
	}
}

unsigned gcm::get_dc_vec(vvec& vv, bool src) {
	unsigned c = 0;
	for (vvec::iterator I = vv.begin(), E = vv.end(); I != E; ++I) {
		value *v = *I;
		if (!v || v->is_readonly())
			continue;

		if (v->is_rel()) {
			c += v->rel->def != NULL;
			c += get_dc_vec(v->muse, true);
		} else if (src) {
			c += v->def != NULL;
			c += v->adef != NULL;
		}
	}
	return c;
}

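// Counts the ALU instructions in a queue, up to 'max'. Copy MOVs from a
// GPR are not counted (presumably because they tend to be eliminated
// later), while packed ALU groups contribute their full sub-instruction
// count.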
unsigned gcm::real_alu_count(sched_queue& q, unsigned max) {
	sq_iterator I(q.begin()), E(q.end());
	unsigned c = 0;

	while (I != E && c < max) {
		node *n = *I;
		if (n->is_alu_inst()) {
			if (!n->is_copy_mov() || !n->src[0]->is_any_gpr())
				++c;
		} else if (n->is_alu_packed()) {
			c += static_cast<container_node*>(n)->count();
		}
		++I;
	}

	return c;
}

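// True if at least 'threshold' real ALU instructions are ready across the
// current and next ALU queues; used to decide when to cut fetch/CF clauses.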
bool gcm::check_alu_ready_count(unsigned threshold) {
	unsigned r = real_alu_count(bu_ready[SQ_ALU], threshold);
	if (r >= threshold)
		return true;
	r += real_alu_count(bu_ready_next[SQ_ALU], threshold - r);
	return r >= threshold;
}

} // namespace r600_sb
