1/*
2 * Copyright © 2014 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 *
23 * Authors:
24 *    Ben Widawsky <ben@bwidawsk.net>
25 *    Michel Thierry <michel.thierry@intel.com>
26 *    Thomas Daniel <thomas.daniel@intel.com>
27 *    Oscar Mateo <oscar.mateo@intel.com>
28 *
29 */
30
31/**
32 * DOC: Logical Rings, Logical Ring Contexts and Execlists
33 *
34 * Motivation:
35 * GEN8 brings an expansion of the HW contexts: "Logical Ring Contexts".
36 * These expanded contexts enable a number of new abilities, especially
37 * "Execlists" (also implemented in this file).
38 *
39 * One of the main differences with the legacy HW contexts is that logical
40 * ring contexts incorporate many more things to the context's state, like
41 * PDPs or ringbuffer control registers:
42 *
43 * The reason why PDPs are included in the context is straightforward: as
44 * PPGTTs (per-process GTTs) are actually per-context, having the PDPs
45 * contained there mean you don't need to do a ppgtt->switch_mm yourself,
46 * instead, the GPU will do it for you on the context switch.
47 *
48 * But, what about the ringbuffer control registers (head, tail, etc..)?
49 * shouldn't we just need a set of those per engine command streamer? This is
50 * where the name "Logical Rings" starts to make sense: by virtualizing the
51 * rings, the engine cs shifts to a new "ring buffer" with every context
52 * switch. When you want to submit a workload to the GPU you: A) choose your
53 * context, B) find its appropriate virtualized ring, C) write commands to it
54 * and then, finally, D) tell the GPU to switch to that context.
55 *
56 * Instead of the legacy MI_SET_CONTEXT, the way you tell the GPU to switch
57 * to a contexts is via a context execution list, ergo "Execlists".
58 *
59 * LRC implementation:
60 * Regarding the creation of contexts, we have:
61 *
62 * - One global default context.
63 * - One local default context for each opened fd.
64 * - One local extra context for each context create ioctl call.
65 *
66 * Now that ringbuffers belong per-context (and not per-engine, like before)
67 * and that contexts are uniquely tied to a given engine (and not reusable,
68 * like before) we need:
69 *
70 * - One ringbuffer per-engine inside each context.
71 * - One backing object per-engine inside each context.
72 *
73 * The global default context starts its life with these new objects fully
74 * allocated and populated. The local default context for each opened fd is
75 * more complex, because we don't know at creation time which engine is going
76 * to use them. To handle this, we have implemented a deferred creation of LR
77 * contexts:
78 *
79 * The local context starts its life as a hollow or blank holder, that only
80 * gets populated for a given engine once we receive an execbuffer. If later
81 * on we receive another execbuffer ioctl for the same context but a different
82 * engine, we allocate/populate a new ringbuffer and context backing object and
83 * so on.
84 *
85 * Finally, regarding local contexts created using the ioctl call: as they are
86 * only allowed with the render ring, we can allocate & populate them right
87 * away (no need to defer anything, at least for now).
88 *
89 * Execlists implementation:
90 * Execlists are the new method by which, on gen8+ hardware, workloads are
91 * submitted for execution (as opposed to the legacy, ringbuffer-based, method).
92 * This method works as follows:
93 *
94 * When a request is committed, its commands (the BB start and any leading or
95 * trailing commands, like the seqno breadcrumbs) are placed in the ringbuffer
96 * for the appropriate context. The tail pointer in the hardware context is not
97 * updated at this time, but instead, kept by the driver in the ringbuffer
98 * structure. A structure representing this request is added to a request queue
99 * for the appropriate engine: this structure contains a copy of the context's
100 * tail after the request was written to the ring buffer and a pointer to the
101 * context itself.
102 *
103 * If the engine's request queue was empty before the request was added, the
104 * queue is processed immediately. Otherwise the queue will be processed during
105 * a context switch interrupt. In any case, elements on the queue will get sent
106 * (in pairs) to the GPU's ExecLists Submit Port (ELSP, for short) with a
107 * globally unique 20-bits submission ID.
108 *
109 * When execution of a request completes, the GPU updates the context status
110 * buffer with a context complete event and generates a context switch interrupt.
111 * During the interrupt handling, the driver examines the events in the buffer:
112 * for each context complete event, if the announced ID matches that on the head
113 * of the request queue, then that request is retired and removed from the queue.
114 *
115 * After processing, if any requests were retired and the queue is not empty
116 * then a new execution list can be submitted. The two requests at the front of
117 * the queue are next to be submitted but since a context may not occur twice in
118 * an execution list, if subsequent requests have the same ID as the first then
119 * the two requests must be combined. This is done simply by discarding requests
120 * at the head of the queue until either only one requests is left (in which case
121 * we use a NULL second context) or the first two requests have unique IDs.
122 *
123 * By always executing the first two requests in the queue the driver ensures
124 * that the GPU is kept as busy as possible. In the case where a single context
125 * completes but a second context is still executing, the request for this second
126 * context will be at the head of the queue when we remove the first one. This
127 * request will then be resubmitted along with a new request for a different context,
128 * which will cause the hardware to continue executing the second request and queue
129 * the new request (the GPU detects the condition of a context getting preempted
130 * with the same context and optimizes the context switch flow by not doing
131 * preemption, but just sampling the new tail pointer).
132 *
133 */
134#include <linux/interrupt.h>
135
136#include "i915_drv.h"
137#include "i915_perf.h"
138#include "i915_trace.h"
139#include "i915_vgpu.h"
140#include "intel_breadcrumbs.h"
141#include "intel_context.h"
142#include "intel_engine_pm.h"
143#include "intel_gt.h"
144#include "intel_gt_pm.h"
145#include "intel_gt_requests.h"
146#include "intel_lrc_reg.h"
147#include "intel_mocs.h"
148#include "intel_reset.h"
149#include "intel_ring.h"
150#include "intel_workarounds.h"
151#include "shmem_utils.h"
152
153#define RING_EXECLIST_QFULL		(1 << 0x2)
154#define RING_EXECLIST1_VALID		(1 << 0x3)
155#define RING_EXECLIST0_VALID		(1 << 0x4)
156#define RING_EXECLIST_ACTIVE_STATUS	(3 << 0xE)
157#define RING_EXECLIST1_ACTIVE		(1 << 0x11)
158#define RING_EXECLIST0_ACTIVE		(1 << 0x12)
159
160#define GEN8_CTX_STATUS_IDLE_ACTIVE	(1 << 0)
161#define GEN8_CTX_STATUS_PREEMPTED	(1 << 1)
162#define GEN8_CTX_STATUS_ELEMENT_SWITCH	(1 << 2)
163#define GEN8_CTX_STATUS_ACTIVE_IDLE	(1 << 3)
164#define GEN8_CTX_STATUS_COMPLETE	(1 << 4)
165#define GEN8_CTX_STATUS_LITE_RESTORE	(1 << 15)
166
167#define GEN8_CTX_STATUS_COMPLETED_MASK \
168	 (GEN8_CTX_STATUS_COMPLETE | GEN8_CTX_STATUS_PREEMPTED)
169
170#define CTX_DESC_FORCE_RESTORE BIT_ULL(2)
171
172#define GEN12_CTX_STATUS_SWITCHED_TO_NEW_QUEUE	(0x1) /* lower csb dword */
173#define GEN12_CTX_SWITCH_DETAIL(csb_dw)	((csb_dw) & 0xF) /* upper csb dword */
174#define GEN12_CSB_SW_CTX_ID_MASK		GENMASK(25, 15)
175#define GEN12_IDLE_CTX_ID		0x7FF
176#define GEN12_CSB_CTX_VALID(csb_dw) \
177	(FIELD_GET(GEN12_CSB_SW_CTX_ID_MASK, csb_dw) != GEN12_IDLE_CTX_ID)
178
179/* Typical size of the average request (2 pipecontrols and a MI_BB) */
180#define EXECLISTS_REQUEST_SIZE 64 /* bytes */
181
182struct virtual_engine {
183	struct intel_engine_cs base;
184	struct intel_context context;
185	struct rcu_work rcu;
186
187	/*
188	 * We allow only a single request through the virtual engine at a time
189	 * (each request in the timeline waits for the completion fence of
190	 * the previous before being submitted). By restricting ourselves to
191	 * only submitting a single request, each request is placed on to a
192	 * physical to maximise load spreading (by virtue of the late greedy
193	 * scheduling -- each real engine takes the next available request
194	 * upon idling).
195	 */
196	struct i915_request *request;
197
198	/*
199	 * We keep a rbtree of available virtual engines inside each physical
200	 * engine, sorted by priority. Here we preallocate the nodes we need
201	 * for the virtual engine, indexed by physical_engine->id.
202	 */
203	struct ve_node {
204		struct rb_node rb;
205		int prio;
206	} nodes[I915_NUM_ENGINES];
207
208	/*
209	 * Keep track of bonded pairs -- restrictions upon on our selection
210	 * of physical engines any particular request may be submitted to.
211	 * If we receive a submit-fence from a master engine, we will only
212	 * use one of sibling_mask physical engines.
213	 */
214	struct ve_bond {
215		const struct intel_engine_cs *master;
216		intel_engine_mask_t sibling_mask;
217	} *bonds;
218	unsigned int num_bonds;
219
220	/* And finally, which physical engines this virtual engine maps onto. */
221	unsigned int num_siblings;
222	struct intel_engine_cs *siblings[];
223};
224
225static struct virtual_engine *to_virtual_engine(struct intel_engine_cs *engine)
226{
227	GEM_BUG_ON(!intel_engine_is_virtual(engine));
228	return container_of(engine, struct virtual_engine, base);
229}
230
231static int __execlists_context_alloc(struct intel_context *ce,
232				     struct intel_engine_cs *engine);
233
234static void execlists_init_reg_state(u32 *reg_state,
235				     const struct intel_context *ce,
236				     const struct intel_engine_cs *engine,
237				     const struct intel_ring *ring,
238				     bool close);
239static void
240__execlists_update_reg_state(const struct intel_context *ce,
241			     const struct intel_engine_cs *engine,
242			     u32 head);
243
244static int lrc_ring_mi_mode(const struct intel_engine_cs *engine)
245{
246	if (INTEL_GEN(engine->i915) >= 12)
247		return 0x60;
248	else if (INTEL_GEN(engine->i915) >= 9)
249		return 0x54;
250	else if (engine->class == RENDER_CLASS)
251		return 0x58;
252	else
253		return -1;
254}
255
256static int lrc_ring_gpr0(const struct intel_engine_cs *engine)
257{
258	if (INTEL_GEN(engine->i915) >= 12)
259		return 0x74;
260	else if (INTEL_GEN(engine->i915) >= 9)
261		return 0x68;
262	else if (engine->class == RENDER_CLASS)
263		return 0xd8;
264	else
265		return -1;
266}
267
268static int lrc_ring_wa_bb_per_ctx(const struct intel_engine_cs *engine)
269{
270	if (INTEL_GEN(engine->i915) >= 12)
271		return 0x12;
272	else if (INTEL_GEN(engine->i915) >= 9 || engine->class == RENDER_CLASS)
273		return 0x18;
274	else
275		return -1;
276}
277
278static int lrc_ring_indirect_ptr(const struct intel_engine_cs *engine)
279{
280	int x;
281
282	x = lrc_ring_wa_bb_per_ctx(engine);
283	if (x < 0)
284		return x;
285
286	return x + 2;
287}
288
289static int lrc_ring_indirect_offset(const struct intel_engine_cs *engine)
290{
291	int x;
292
293	x = lrc_ring_indirect_ptr(engine);
294	if (x < 0)
295		return x;
296
297	return x + 2;
298}
299
300static int lrc_ring_cmd_buf_cctl(const struct intel_engine_cs *engine)
301{
302	if (engine->class != RENDER_CLASS)
303		return -1;
304
305	if (INTEL_GEN(engine->i915) >= 12)
306		return 0xb6;
307	else if (INTEL_GEN(engine->i915) >= 11)
308		return 0xaa;
309	else
310		return -1;
311}
312
313static u32
314lrc_ring_indirect_offset_default(const struct intel_engine_cs *engine)
315{
316	switch (INTEL_GEN(engine->i915)) {
317	default:
318		MISSING_CASE(INTEL_GEN(engine->i915));
319		fallthrough;
320	case 12:
321		return GEN12_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
322	case 11:
323		return GEN11_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
324	case 10:
325		return GEN10_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
326	case 9:
327		return GEN9_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
328	case 8:
329		return GEN8_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
330	}
331}
332
333static void
334lrc_ring_setup_indirect_ctx(u32 *regs,
335			    const struct intel_engine_cs *engine,
336			    u32 ctx_bb_ggtt_addr,
337			    u32 size)
338{
339	GEM_BUG_ON(!size);
340	GEM_BUG_ON(!IS_ALIGNED(size, CACHELINE_BYTES));
341	GEM_BUG_ON(lrc_ring_indirect_ptr(engine) == -1);
342	regs[lrc_ring_indirect_ptr(engine) + 1] =
343		ctx_bb_ggtt_addr | (size / CACHELINE_BYTES);
344
345	GEM_BUG_ON(lrc_ring_indirect_offset(engine) == -1);
346	regs[lrc_ring_indirect_offset(engine) + 1] =
347		lrc_ring_indirect_offset_default(engine) << 6;
348}
349
350static u32 intel_context_get_runtime(const struct intel_context *ce)
351{
352	/*
353	 * We can use either ppHWSP[16] which is recorded before the context
354	 * switch (and so excludes the cost of context switches) or use the
355	 * value from the context image itself, which is saved/restored earlier
356	 * and so includes the cost of the save.
357	 */
358	return READ_ONCE(ce->lrc_reg_state[CTX_TIMESTAMP]);
359}
360
361static void mark_eio(struct i915_request *rq)
362{
363	if (i915_request_completed(rq))
364		return;
365
366	GEM_BUG_ON(i915_request_signaled(rq));
367
368	i915_request_set_error_once(rq, -EIO);
369	i915_request_mark_complete(rq);
370}
371
372static struct i915_request *
373active_request(const struct intel_timeline * const tl, struct i915_request *rq)
374{
375	struct i915_request *active = rq;
376
377	rcu_read_lock();
378	list_for_each_entry_continue_reverse(rq, &tl->requests, link) {
379		if (i915_request_completed(rq))
380			break;
381
382		active = rq;
383	}
384	rcu_read_unlock();
385
386	return active;
387}
388
389static inline u32 intel_hws_preempt_address(struct intel_engine_cs *engine)
390{
391	return (i915_ggtt_offset(engine->status_page.vma) +
392		I915_GEM_HWS_PREEMPT_ADDR);
393}
394
395static inline void
396ring_set_paused(const struct intel_engine_cs *engine, int state)
397{
398	/*
399	 * We inspect HWS_PREEMPT with a semaphore inside
400	 * engine->emit_fini_breadcrumb. If the dword is true,
401	 * the ring is paused as the semaphore will busywait
402	 * until the dword is false.
403	 */
404	engine->status_page.addr[I915_GEM_HWS_PREEMPT] = state;
405	if (state)
406		wmb();
407}
408
409static inline struct i915_priolist *to_priolist(struct rb_node *rb)
410{
411	return rb_entry(rb, struct i915_priolist, node);
412}
413
414static inline int rq_prio(const struct i915_request *rq)
415{
416	return READ_ONCE(rq->sched.attr.priority);
417}
418
419static int effective_prio(const struct i915_request *rq)
420{
421	int prio = rq_prio(rq);
422
423	/*
424	 * If this request is special and must not be interrupted at any
425	 * cost, so be it. Note we are only checking the most recent request
426	 * in the context and so may be masking an earlier vip request. It
427	 * is hoped that under the conditions where nopreempt is used, this
428	 * will not matter (i.e. all requests to that context will be
429	 * nopreempt for as long as desired).
430	 */
431	if (i915_request_has_nopreempt(rq))
432		prio = I915_PRIORITY_UNPREEMPTABLE;
433
434	return prio;
435}
436
437static int queue_prio(const struct intel_engine_execlists *execlists)
438{
439	struct i915_priolist *p;
440	struct rb_node *rb;
441
442	rb = rb_first_cached(&execlists->queue);
443	if (!rb)
444		return INT_MIN;
445
446	/*
447	 * As the priolist[] are inverted, with the highest priority in [0],
448	 * we have to flip the index value to become priority.
449	 */
450	p = to_priolist(rb);
451	if (!I915_USER_PRIORITY_SHIFT)
452		return p->priority;
453
454	return ((p->priority + 1) << I915_USER_PRIORITY_SHIFT) - ffs(p->used);
455}
456
457static inline bool need_preempt(const struct intel_engine_cs *engine,
458				const struct i915_request *rq,
459				struct rb_node *rb)
460{
461	int last_prio;
462
463	if (!intel_engine_has_semaphores(engine))
464		return false;
465
466	/*
467	 * Check if the current priority hint merits a preemption attempt.
468	 *
469	 * We record the highest value priority we saw during rescheduling
470	 * prior to this dequeue, therefore we know that if it is strictly
471	 * less than the current tail of ESLP[0], we do not need to force
472	 * a preempt-to-idle cycle.
473	 *
474	 * However, the priority hint is a mere hint that we may need to
475	 * preempt. If that hint is stale or we may be trying to preempt
476	 * ourselves, ignore the request.
477	 *
478	 * More naturally we would write
479	 *      prio >= max(0, last);
480	 * except that we wish to prevent triggering preemption at the same
481	 * priority level: the task that is running should remain running
482	 * to preserve FIFO ordering of dependencies.
483	 */
484	last_prio = max(effective_prio(rq), I915_PRIORITY_NORMAL - 1);
485	if (engine->execlists.queue_priority_hint <= last_prio)
486		return false;
487
488	/*
489	 * Check against the first request in ELSP[1], it will, thanks to the
490	 * power of PI, be the highest priority of that context.
491	 */
492	if (!list_is_last(&rq->sched.link, &engine->active.requests) &&
493	    rq_prio(list_next_entry(rq, sched.link)) > last_prio)
494		return true;
495
496	if (rb) {
497		struct virtual_engine *ve =
498			rb_entry(rb, typeof(*ve), nodes[engine->id].rb);
499		bool preempt = false;
500
501		if (engine == ve->siblings[0]) { /* only preempt one sibling */
502			struct i915_request *next;
503
504			rcu_read_lock();
505			next = READ_ONCE(ve->request);
506			if (next)
507				preempt = rq_prio(next) > last_prio;
508			rcu_read_unlock();
509		}
510
511		if (preempt)
512			return preempt;
513	}
514
515	/*
516	 * If the inflight context did not trigger the preemption, then maybe
517	 * it was the set of queued requests? Pick the highest priority in
518	 * the queue (the first active priolist) and see if it deserves to be
519	 * running instead of ELSP[0].
520	 *
521	 * The highest priority request in the queue can not be either
522	 * ELSP[0] or ELSP[1] as, thanks again to PI, if it was the same
523	 * context, it's priority would not exceed ELSP[0] aka last_prio.
524	 */
525	return queue_prio(&engine->execlists) > last_prio;
526}
527
528__maybe_unused static inline bool
529assert_priority_queue(const struct i915_request *prev,
530		      const struct i915_request *next)
531{
532	/*
533	 * Without preemption, the prev may refer to the still active element
534	 * which we refuse to let go.
535	 *
536	 * Even with preemption, there are times when we think it is better not
537	 * to preempt and leave an ostensibly lower priority request in flight.
538	 */
539	if (i915_request_is_active(prev))
540		return true;
541
542	return rq_prio(prev) >= rq_prio(next);
543}
544
545/*
546 * The context descriptor encodes various attributes of a context,
547 * including its GTT address and some flags. Because it's fairly
548 * expensive to calculate, we'll just do it once and cache the result,
549 * which remains valid until the context is unpinned.
550 *
551 * This is what a descriptor looks like, from LSB to MSB::
552 *
553 *      bits  0-11:    flags, GEN8_CTX_* (cached in ctx->desc_template)
554 *      bits 12-31:    LRCA, GTT address of (the HWSP of) this context
555 *      bits 32-52:    ctx ID, a globally unique tag (highest bit used by GuC)
556 *      bits 53-54:    mbz, reserved for use by hardware
557 *      bits 55-63:    group ID, currently unused and set to 0
558 *
559 * Starting from Gen11, the upper dword of the descriptor has a new format:
560 *
561 *      bits 32-36:    reserved
562 *      bits 37-47:    SW context ID
563 *      bits 48:53:    engine instance
564 *      bit 54:        mbz, reserved for use by hardware
565 *      bits 55-60:    SW counter
566 *      bits 61-63:    engine class
567 *
568 * engine info, SW context ID and SW counter need to form a unique number
569 * (Context ID) per lrc.
570 */
571static u32
572lrc_descriptor(struct intel_context *ce, struct intel_engine_cs *engine)
573{
574	u32 desc;
575
576	desc = INTEL_LEGACY_32B_CONTEXT;
577	if (i915_vm_is_4lvl(ce->vm))
578		desc = INTEL_LEGACY_64B_CONTEXT;
579	desc <<= GEN8_CTX_ADDRESSING_MODE_SHIFT;
580
581	desc |= GEN8_CTX_VALID | GEN8_CTX_PRIVILEGE;
582	if (IS_GEN(engine->i915, 8))
583		desc |= GEN8_CTX_L3LLC_COHERENT;
584
585	return i915_ggtt_offset(ce->state) | desc;
586}
587
588static inline unsigned int dword_in_page(void *addr)
589{
590	return offset_in_page(addr) / sizeof(u32);
591}
592
593static void set_offsets(u32 *regs,
594			const u8 *data,
595			const struct intel_engine_cs *engine,
596			bool clear)
597#define NOP(x) (BIT(7) | (x))
598#define LRI(count, flags) ((flags) << 6 | (count) | BUILD_BUG_ON_ZERO(count >= BIT(6)))
599#define POSTED BIT(0)
600#define REG(x) (((x) >> 2) | BUILD_BUG_ON_ZERO(x >= 0x200))
601#define REG16(x) \
602	(((x) >> 9) | BIT(7) | BUILD_BUG_ON_ZERO(x >= 0x10000)), \
603	(((x) >> 2) & 0x7f)
604#define END(total_state_size) 0, (total_state_size)
605{
606	const u32 base = engine->mmio_base;
607
608	while (*data) {
609		u8 count, flags;
610
611		if (*data & BIT(7)) { /* skip */
612			count = *data++ & ~BIT(7);
613			if (clear)
614				memset32(regs, MI_NOOP, count);
615			regs += count;
616			continue;
617		}
618
619		count = *data & 0x3f;
620		flags = *data >> 6;
621		data++;
622
623		*regs = MI_LOAD_REGISTER_IMM(count);
624		if (flags & POSTED)
625			*regs |= MI_LRI_FORCE_POSTED;
626		if (INTEL_GEN(engine->i915) >= 11)
627			*regs |= MI_LRI_LRM_CS_MMIO;
628		regs++;
629
630		GEM_BUG_ON(!count);
631		do {
632			u32 offset = 0;
633			u8 v;
634
635			do {
636				v = *data++;
637				offset <<= 7;
638				offset |= v & ~BIT(7);
639			} while (v & BIT(7));
640
641			regs[0] = base + (offset << 2);
642			if (clear)
643				regs[1] = 0;
644			regs += 2;
645		} while (--count);
646	}
647
648	if (clear) {
649		u8 count = *++data;
650
651		/* Clear past the tail for HW access */
652		GEM_BUG_ON(dword_in_page(regs) > count);
653		memset32(regs, MI_NOOP, count - dword_in_page(regs));
654
655		/* Close the batch; used mainly by live_lrc_layout() */
656		*regs = MI_BATCH_BUFFER_END;
657		if (INTEL_GEN(engine->i915) >= 10)
658			*regs |= BIT(0);
659	}
660}
661
662static const u8 gen8_xcs_offsets[] = {
663	NOP(1),
664	LRI(11, 0),
665	REG16(0x244),
666	REG(0x034),
667	REG(0x030),
668	REG(0x038),
669	REG(0x03c),
670	REG(0x168),
671	REG(0x140),
672	REG(0x110),
673	REG(0x11c),
674	REG(0x114),
675	REG(0x118),
676
677	NOP(9),
678	LRI(9, 0),
679	REG16(0x3a8),
680	REG16(0x28c),
681	REG16(0x288),
682	REG16(0x284),
683	REG16(0x280),
684	REG16(0x27c),
685	REG16(0x278),
686	REG16(0x274),
687	REG16(0x270),
688
689	NOP(13),
690	LRI(2, 0),
691	REG16(0x200),
692	REG(0x028),
693
694	END(80)
695};
696
697static const u8 gen9_xcs_offsets[] = {
698	NOP(1),
699	LRI(14, POSTED),
700	REG16(0x244),
701	REG(0x034),
702	REG(0x030),
703	REG(0x038),
704	REG(0x03c),
705	REG(0x168),
706	REG(0x140),
707	REG(0x110),
708	REG(0x11c),
709	REG(0x114),
710	REG(0x118),
711	REG(0x1c0),
712	REG(0x1c4),
713	REG(0x1c8),
714
715	NOP(3),
716	LRI(9, POSTED),
717	REG16(0x3a8),
718	REG16(0x28c),
719	REG16(0x288),
720	REG16(0x284),
721	REG16(0x280),
722	REG16(0x27c),
723	REG16(0x278),
724	REG16(0x274),
725	REG16(0x270),
726
727	NOP(13),
728	LRI(1, POSTED),
729	REG16(0x200),
730
731	NOP(13),
732	LRI(44, POSTED),
733	REG(0x028),
734	REG(0x09c),
735	REG(0x0c0),
736	REG(0x178),
737	REG(0x17c),
738	REG16(0x358),
739	REG(0x170),
740	REG(0x150),
741	REG(0x154),
742	REG(0x158),
743	REG16(0x41c),
744	REG16(0x600),
745	REG16(0x604),
746	REG16(0x608),
747	REG16(0x60c),
748	REG16(0x610),
749	REG16(0x614),
750	REG16(0x618),
751	REG16(0x61c),
752	REG16(0x620),
753	REG16(0x624),
754	REG16(0x628),
755	REG16(0x62c),
756	REG16(0x630),
757	REG16(0x634),
758	REG16(0x638),
759	REG16(0x63c),
760	REG16(0x640),
761	REG16(0x644),
762	REG16(0x648),
763	REG16(0x64c),
764	REG16(0x650),
765	REG16(0x654),
766	REG16(0x658),
767	REG16(0x65c),
768	REG16(0x660),
769	REG16(0x664),
770	REG16(0x668),
771	REG16(0x66c),
772	REG16(0x670),
773	REG16(0x674),
774	REG16(0x678),
775	REG16(0x67c),
776	REG(0x068),
777
778	END(176)
779};
780
781static const u8 gen12_xcs_offsets[] = {
782	NOP(1),
783	LRI(13, POSTED),
784	REG16(0x244),
785	REG(0x034),
786	REG(0x030),
787	REG(0x038),
788	REG(0x03c),
789	REG(0x168),
790	REG(0x140),
791	REG(0x110),
792	REG(0x1c0),
793	REG(0x1c4),
794	REG(0x1c8),
795	REG(0x180),
796	REG16(0x2b4),
797
798	NOP(5),
799	LRI(9, POSTED),
800	REG16(0x3a8),
801	REG16(0x28c),
802	REG16(0x288),
803	REG16(0x284),
804	REG16(0x280),
805	REG16(0x27c),
806	REG16(0x278),
807	REG16(0x274),
808	REG16(0x270),
809
810	END(80)
811};
812
813static const u8 gen8_rcs_offsets[] = {
814	NOP(1),
815	LRI(14, POSTED),
816	REG16(0x244),
817	REG(0x034),
818	REG(0x030),
819	REG(0x038),
820	REG(0x03c),
821	REG(0x168),
822	REG(0x140),
823	REG(0x110),
824	REG(0x11c),
825	REG(0x114),
826	REG(0x118),
827	REG(0x1c0),
828	REG(0x1c4),
829	REG(0x1c8),
830
831	NOP(3),
832	LRI(9, POSTED),
833	REG16(0x3a8),
834	REG16(0x28c),
835	REG16(0x288),
836	REG16(0x284),
837	REG16(0x280),
838	REG16(0x27c),
839	REG16(0x278),
840	REG16(0x274),
841	REG16(0x270),
842
843	NOP(13),
844	LRI(1, 0),
845	REG(0x0c8),
846
847	END(80)
848};
849
850static const u8 gen9_rcs_offsets[] = {
851	NOP(1),
852	LRI(14, POSTED),
853	REG16(0x244),
854	REG(0x34),
855	REG(0x30),
856	REG(0x38),
857	REG(0x3c),
858	REG(0x168),
859	REG(0x140),
860	REG(0x110),
861	REG(0x11c),
862	REG(0x114),
863	REG(0x118),
864	REG(0x1c0),
865	REG(0x1c4),
866	REG(0x1c8),
867
868	NOP(3),
869	LRI(9, POSTED),
870	REG16(0x3a8),
871	REG16(0x28c),
872	REG16(0x288),
873	REG16(0x284),
874	REG16(0x280),
875	REG16(0x27c),
876	REG16(0x278),
877	REG16(0x274),
878	REG16(0x270),
879
880	NOP(13),
881	LRI(1, 0),
882	REG(0xc8),
883
884	NOP(13),
885	LRI(44, POSTED),
886	REG(0x28),
887	REG(0x9c),
888	REG(0xc0),
889	REG(0x178),
890	REG(0x17c),
891	REG16(0x358),
892	REG(0x170),
893	REG(0x150),
894	REG(0x154),
895	REG(0x158),
896	REG16(0x41c),
897	REG16(0x600),
898	REG16(0x604),
899	REG16(0x608),
900	REG16(0x60c),
901	REG16(0x610),
902	REG16(0x614),
903	REG16(0x618),
904	REG16(0x61c),
905	REG16(0x620),
906	REG16(0x624),
907	REG16(0x628),
908	REG16(0x62c),
909	REG16(0x630),
910	REG16(0x634),
911	REG16(0x638),
912	REG16(0x63c),
913	REG16(0x640),
914	REG16(0x644),
915	REG16(0x648),
916	REG16(0x64c),
917	REG16(0x650),
918	REG16(0x654),
919	REG16(0x658),
920	REG16(0x65c),
921	REG16(0x660),
922	REG16(0x664),
923	REG16(0x668),
924	REG16(0x66c),
925	REG16(0x670),
926	REG16(0x674),
927	REG16(0x678),
928	REG16(0x67c),
929	REG(0x68),
930
931	END(176)
932};
933
934static const u8 gen11_rcs_offsets[] = {
935	NOP(1),
936	LRI(15, POSTED),
937	REG16(0x244),
938	REG(0x034),
939	REG(0x030),
940	REG(0x038),
941	REG(0x03c),
942	REG(0x168),
943	REG(0x140),
944	REG(0x110),
945	REG(0x11c),
946	REG(0x114),
947	REG(0x118),
948	REG(0x1c0),
949	REG(0x1c4),
950	REG(0x1c8),
951	REG(0x180),
952
953	NOP(1),
954	LRI(9, POSTED),
955	REG16(0x3a8),
956	REG16(0x28c),
957	REG16(0x288),
958	REG16(0x284),
959	REG16(0x280),
960	REG16(0x27c),
961	REG16(0x278),
962	REG16(0x274),
963	REG16(0x270),
964
965	LRI(1, POSTED),
966	REG(0x1b0),
967
968	NOP(10),
969	LRI(1, 0),
970	REG(0x0c8),
971
972	END(80)
973};
974
975static const u8 gen12_rcs_offsets[] = {
976	NOP(1),
977	LRI(13, POSTED),
978	REG16(0x244),
979	REG(0x034),
980	REG(0x030),
981	REG(0x038),
982	REG(0x03c),
983	REG(0x168),
984	REG(0x140),
985	REG(0x110),
986	REG(0x1c0),
987	REG(0x1c4),
988	REG(0x1c8),
989	REG(0x180),
990	REG16(0x2b4),
991
992	NOP(5),
993	LRI(9, POSTED),
994	REG16(0x3a8),
995	REG16(0x28c),
996	REG16(0x288),
997	REG16(0x284),
998	REG16(0x280),
999	REG16(0x27c),
1000	REG16(0x278),
1001	REG16(0x274),
1002	REG16(0x270),
1003
1004	LRI(3, POSTED),
1005	REG(0x1b0),
1006	REG16(0x5a8),
1007	REG16(0x5ac),
1008
1009	NOP(6),
1010	LRI(1, 0),
1011	REG(0x0c8),
1012	NOP(3 + 9 + 1),
1013
1014	LRI(51, POSTED),
1015	REG16(0x588),
1016	REG16(0x588),
1017	REG16(0x588),
1018	REG16(0x588),
1019	REG16(0x588),
1020	REG16(0x588),
1021	REG(0x028),
1022	REG(0x09c),
1023	REG(0x0c0),
1024	REG(0x178),
1025	REG(0x17c),
1026	REG16(0x358),
1027	REG(0x170),
1028	REG(0x150),
1029	REG(0x154),
1030	REG(0x158),
1031	REG16(0x41c),
1032	REG16(0x600),
1033	REG16(0x604),
1034	REG16(0x608),
1035	REG16(0x60c),
1036	REG16(0x610),
1037	REG16(0x614),
1038	REG16(0x618),
1039	REG16(0x61c),
1040	REG16(0x620),
1041	REG16(0x624),
1042	REG16(0x628),
1043	REG16(0x62c),
1044	REG16(0x630),
1045	REG16(0x634),
1046	REG16(0x638),
1047	REG16(0x63c),
1048	REG16(0x640),
1049	REG16(0x644),
1050	REG16(0x648),
1051	REG16(0x64c),
1052	REG16(0x650),
1053	REG16(0x654),
1054	REG16(0x658),
1055	REG16(0x65c),
1056	REG16(0x660),
1057	REG16(0x664),
1058	REG16(0x668),
1059	REG16(0x66c),
1060	REG16(0x670),
1061	REG16(0x674),
1062	REG16(0x678),
1063	REG16(0x67c),
1064	REG(0x068),
1065	REG(0x084),
1066	NOP(1),
1067
1068	END(192)
1069};
1070
1071#undef END
1072#undef REG16
1073#undef REG
1074#undef LRI
1075#undef NOP
1076
1077static const u8 *reg_offsets(const struct intel_engine_cs *engine)
1078{
1079	/*
1080	 * The gen12+ lists only have the registers we program in the basic
1081	 * default state. We rely on the context image using relative
1082	 * addressing to automatic fixup the register state between the
1083	 * physical engines for virtual engine.
1084	 */
1085	GEM_BUG_ON(INTEL_GEN(engine->i915) >= 12 &&
1086		   !intel_engine_has_relative_mmio(engine));
1087
1088	if (engine->class == RENDER_CLASS) {
1089		if (INTEL_GEN(engine->i915) >= 12)
1090			return gen12_rcs_offsets;
1091		else if (INTEL_GEN(engine->i915) >= 11)
1092			return gen11_rcs_offsets;
1093		else if (INTEL_GEN(engine->i915) >= 9)
1094			return gen9_rcs_offsets;
1095		else
1096			return gen8_rcs_offsets;
1097	} else {
1098		if (INTEL_GEN(engine->i915) >= 12)
1099			return gen12_xcs_offsets;
1100		else if (INTEL_GEN(engine->i915) >= 9)
1101			return gen9_xcs_offsets;
1102		else
1103			return gen8_xcs_offsets;
1104	}
1105}
1106
1107static struct i915_request *
1108__unwind_incomplete_requests(struct intel_engine_cs *engine)
1109{
1110	struct i915_request *rq, *rn, *active = NULL;
1111	struct list_head *pl;
1112	int prio = I915_PRIORITY_INVALID;
1113
1114	lockdep_assert_held(&engine->active.lock);
1115
1116	list_for_each_entry_safe_reverse(rq, rn,
1117					 &engine->active.requests,
1118					 sched.link) {
1119		if (i915_request_completed(rq))
1120			continue; /* XXX */
1121
1122		__i915_request_unsubmit(rq);
1123
1124		/*
1125		 * Push the request back into the queue for later resubmission.
1126		 * If this request is not native to this physical engine (i.e.
1127		 * it came from a virtual source), push it back onto the virtual
1128		 * engine so that it can be moved across onto another physical
1129		 * engine as load dictates.
1130		 */
1131		if (likely(rq->execution_mask == engine->mask)) {
1132			GEM_BUG_ON(rq_prio(rq) == I915_PRIORITY_INVALID);
1133			if (rq_prio(rq) != prio) {
1134				prio = rq_prio(rq);
1135				pl = i915_sched_lookup_priolist(engine, prio);
1136			}
1137			GEM_BUG_ON(RB_EMPTY_ROOT(&engine->execlists.queue.rb_root));
1138
1139			list_move(&rq->sched.link, pl);
1140			set_bit(I915_FENCE_FLAG_PQUEUE, &rq->fence.flags);
1141
1142			/* Check in case we rollback so far we wrap [size/2] */
1143			if (intel_ring_direction(rq->ring,
1144						 rq->tail,
1145						 rq->ring->tail + 8) > 0)
1146				rq->context->lrc.desc |= CTX_DESC_FORCE_RESTORE;
1147
1148			active = rq;
1149		} else {
1150			struct intel_engine_cs *owner = rq->context->engine;
1151
1152			WRITE_ONCE(rq->engine, owner);
1153			owner->submit_request(rq);
1154			active = NULL;
1155		}
1156	}
1157
1158	return active;
1159}
1160
1161struct i915_request *
1162execlists_unwind_incomplete_requests(struct intel_engine_execlists *execlists)
1163{
1164	struct intel_engine_cs *engine =
1165		container_of(execlists, typeof(*engine), execlists);
1166
1167	return __unwind_incomplete_requests(engine);
1168}
1169
1170static inline void
1171execlists_context_status_change(struct i915_request *rq, unsigned long status)
1172{
1173	/*
1174	 * Only used when GVT-g is enabled now. When GVT-g is disabled,
1175	 * The compiler should eliminate this function as dead-code.
1176	 */
1177	if (!IS_ENABLED(CONFIG_DRM_I915_GVT))
1178		return;
1179
1180	atomic_notifier_call_chain(&rq->engine->context_status_notifier,
1181				   status, rq);
1182}
1183
1184static void intel_engine_context_in(struct intel_engine_cs *engine)
1185{
1186	unsigned long flags;
1187
1188	if (atomic_add_unless(&engine->stats.active, 1, 0))
1189		return;
1190
1191	write_seqlock_irqsave(&engine->stats.lock, flags);
1192	if (!atomic_add_unless(&engine->stats.active, 1, 0)) {
1193		engine->stats.start = ktime_get();
1194		atomic_inc(&engine->stats.active);
1195	}
1196	write_sequnlock_irqrestore(&engine->stats.lock, flags);
1197}
1198
1199static void intel_engine_context_out(struct intel_engine_cs *engine)
1200{
1201	unsigned long flags;
1202
1203	GEM_BUG_ON(!atomic_read(&engine->stats.active));
1204
1205	if (atomic_add_unless(&engine->stats.active, -1, 1))
1206		return;
1207
1208	write_seqlock_irqsave(&engine->stats.lock, flags);
1209	if (atomic_dec_and_test(&engine->stats.active)) {
1210		engine->stats.total =
1211			ktime_add(engine->stats.total,
1212				  ktime_sub(ktime_get(), engine->stats.start));
1213	}
1214	write_sequnlock_irqrestore(&engine->stats.lock, flags);
1215}
1216
1217static void
1218execlists_check_context(const struct intel_context *ce,
1219			const struct intel_engine_cs *engine)
1220{
1221	const struct intel_ring *ring = ce->ring;
1222	u32 *regs = ce->lrc_reg_state;
1223	bool valid = true;
1224	int x;
1225
1226	if (regs[CTX_RING_START] != i915_ggtt_offset(ring->vma)) {
1227		pr_err("%s: context submitted with incorrect RING_START [%08x], expected %08x\n",
1228		       engine->name,
1229		       regs[CTX_RING_START],
1230		       i915_ggtt_offset(ring->vma));
1231		regs[CTX_RING_START] = i915_ggtt_offset(ring->vma);
1232		valid = false;
1233	}
1234
1235	if ((regs[CTX_RING_CTL] & ~(RING_WAIT | RING_WAIT_SEMAPHORE)) !=
1236	    (RING_CTL_SIZE(ring->size) | RING_VALID)) {
1237		pr_err("%s: context submitted with incorrect RING_CTL [%08x], expected %08x\n",
1238		       engine->name,
1239		       regs[CTX_RING_CTL],
1240		       (u32)(RING_CTL_SIZE(ring->size) | RING_VALID));
1241		regs[CTX_RING_CTL] = RING_CTL_SIZE(ring->size) | RING_VALID;
1242		valid = false;
1243	}
1244
1245	x = lrc_ring_mi_mode(engine);
1246	if (x != -1 && regs[x + 1] & (regs[x + 1] >> 16) & STOP_RING) {
1247		pr_err("%s: context submitted with STOP_RING [%08x] in RING_MI_MODE\n",
1248		       engine->name, regs[x + 1]);
1249		regs[x + 1] &= ~STOP_RING;
1250		regs[x + 1] |= STOP_RING << 16;
1251		valid = false;
1252	}
1253
1254	WARN_ONCE(!valid, "Invalid lrc state found before submission\n");
1255}
1256
1257static void restore_default_state(struct intel_context *ce,
1258				  struct intel_engine_cs *engine)
1259{
1260	u32 *regs;
1261
1262	regs = memset(ce->lrc_reg_state, 0, engine->context_size - PAGE_SIZE);
1263	execlists_init_reg_state(regs, ce, engine, ce->ring, true);
1264
1265	ce->runtime.last = intel_context_get_runtime(ce);
1266}
1267
1268static void reset_active(struct i915_request *rq,
1269			 struct intel_engine_cs *engine)
1270{
1271	struct intel_context * const ce = rq->context;
1272	u32 head;
1273
1274	/*
1275	 * The executing context has been cancelled. We want to prevent
1276	 * further execution along this context and propagate the error on
1277	 * to anything depending on its results.
1278	 *
1279	 * In __i915_request_submit(), we apply the -EIO and remove the
1280	 * requests' payloads for any banned requests. But first, we must
1281	 * rewind the context back to the start of the incomplete request so
1282	 * that we do not jump back into the middle of the batch.
1283	 *
1284	 * We preserve the breadcrumbs and semaphores of the incomplete
1285	 * requests so that inter-timeline dependencies (i.e other timelines)
1286	 * remain correctly ordered. And we defer to __i915_request_submit()
1287	 * so that all asynchronous waits are correctly handled.
1288	 */
1289	ENGINE_TRACE(engine, "{ rq=%llx:%lld }\n",
1290		     rq->fence.context, rq->fence.seqno);
1291
1292	/* On resubmission of the active request, payload will be scrubbed */
1293	if (i915_request_completed(rq))
1294		head = rq->tail;
1295	else
1296		head = active_request(ce->timeline, rq)->head;
1297	head = intel_ring_wrap(ce->ring, head);
1298
1299	/* Scrub the context image to prevent replaying the previous batch */
1300	restore_default_state(ce, engine);
1301	__execlists_update_reg_state(ce, engine, head);
1302
1303	/* We've switched away, so this should be a no-op, but intent matters */
1304	ce->lrc.desc |= CTX_DESC_FORCE_RESTORE;
1305}
1306
1307static void st_update_runtime_underflow(struct intel_context *ce, s32 dt)
1308{
1309#if IS_ENABLED(CONFIG_DRM_I915_SELFTEST)
1310	ce->runtime.num_underflow += dt < 0;
1311	ce->runtime.max_underflow = max_t(u32, ce->runtime.max_underflow, -dt);
1312#endif
1313}
1314
1315static void intel_context_update_runtime(struct intel_context *ce)
1316{
1317	u32 old;
1318	s32 dt;
1319
1320	if (intel_context_is_barrier(ce))
1321		return;
1322
1323	old = ce->runtime.last;
1324	ce->runtime.last = intel_context_get_runtime(ce);
1325	dt = ce->runtime.last - old;
1326
1327	if (unlikely(dt <= 0)) {
1328		CE_TRACE(ce, "runtime underflow: last=%u, new=%u, delta=%d\n",
1329			 old, ce->runtime.last, dt);
1330		st_update_runtime_underflow(ce, dt);
1331		return;
1332	}
1333
1334	ewma_runtime_add(&ce->runtime.avg, dt);
1335	ce->runtime.total += dt;
1336}
1337
1338static inline struct intel_engine_cs *
1339__execlists_schedule_in(struct i915_request *rq)
1340{
1341	struct intel_engine_cs * const engine = rq->engine;
1342	struct intel_context * const ce = rq->context;
1343
1344	intel_context_get(ce);
1345
1346	if (unlikely(intel_context_is_banned(ce)))
1347		reset_active(rq, engine);
1348
1349	if (IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM))
1350		execlists_check_context(ce, engine);
1351
1352	if (ce->tag) {
1353		/* Use a fixed tag for OA and friends */
1354		GEM_BUG_ON(ce->tag <= BITS_PER_LONG);
1355		ce->lrc.ccid = ce->tag;
1356	} else {
1357		/* We don't need a strict matching tag, just different values */
1358		unsigned int tag = ffs(READ_ONCE(engine->context_tag));
1359
1360		GEM_BUG_ON(tag == 0 || tag >= BITS_PER_LONG);
1361		clear_bit(tag - 1, &engine->context_tag);
1362		ce->lrc.ccid = tag << (GEN11_SW_CTX_ID_SHIFT - 32);
1363
1364		BUILD_BUG_ON(BITS_PER_LONG > GEN12_MAX_CONTEXT_HW_ID);
1365	}
1366
1367	ce->lrc.ccid |= engine->execlists.ccid;
1368
1369	__intel_gt_pm_get(engine->gt);
1370	if (engine->fw_domain && !atomic_fetch_inc(&engine->fw_active))
1371		intel_uncore_forcewake_get(engine->uncore, engine->fw_domain);
1372	execlists_context_status_change(rq, INTEL_CONTEXT_SCHEDULE_IN);
1373	intel_engine_context_in(engine);
1374
1375	return engine;
1376}
1377
1378static inline struct i915_request *
1379execlists_schedule_in(struct i915_request *rq, int idx)
1380{
1381	struct intel_context * const ce = rq->context;
1382	struct intel_engine_cs *old;
1383
1384	GEM_BUG_ON(!intel_engine_pm_is_awake(rq->engine));
1385	trace_i915_request_in(rq, idx);
1386
1387	old = READ_ONCE(ce->inflight);
1388	do {
1389		if (!old) {
1390			WRITE_ONCE(ce->inflight, __execlists_schedule_in(rq));
1391			break;
1392		}
1393	} while (!try_cmpxchg(&ce->inflight, &old, ptr_inc(old)));
1394
1395	GEM_BUG_ON(intel_context_inflight(ce) != rq->engine);
1396	return i915_request_get(rq);
1397}
1398
1399static void kick_siblings(struct i915_request *rq, struct intel_context *ce)
1400{
1401	struct virtual_engine *ve = container_of(ce, typeof(*ve), context);
1402	struct i915_request *next = READ_ONCE(ve->request);
1403
1404	if (next == rq || (next && next->execution_mask & ~rq->execution_mask))
1405		tasklet_hi_schedule(&ve->base.execlists.tasklet);
1406}
1407
1408static inline void
1409__execlists_schedule_out(struct i915_request *rq,
1410			 struct intel_engine_cs * const engine,
1411			 unsigned int ccid)
1412{
1413	struct intel_context * const ce = rq->context;
1414
1415	/*
1416	 * NB process_csb() is not under the engine->active.lock and hence
1417	 * schedule_out can race with schedule_in meaning that we should
1418	 * refrain from doing non-trivial work here.
1419	 */
1420
1421	/*
1422	 * If we have just completed this context, the engine may now be
1423	 * idle and we want to re-enter powersaving.
1424	 */
1425	if (list_is_last_rcu(&rq->link, &ce->timeline->requests) &&
1426	    i915_request_completed(rq))
1427		intel_engine_add_retire(engine, ce->timeline);
1428
1429	ccid >>= GEN11_SW_CTX_ID_SHIFT - 32;
1430	ccid &= GEN12_MAX_CONTEXT_HW_ID;
1431	if (ccid < BITS_PER_LONG) {
1432		GEM_BUG_ON(ccid == 0);
1433		GEM_BUG_ON(test_bit(ccid - 1, &engine->context_tag));
1434		set_bit(ccid - 1, &engine->context_tag);
1435	}
1436
1437	intel_context_update_runtime(ce);
1438	intel_engine_context_out(engine);
1439	execlists_context_status_change(rq, INTEL_CONTEXT_SCHEDULE_OUT);
1440	if (engine->fw_domain && !atomic_dec_return(&engine->fw_active))
1441		intel_uncore_forcewake_put(engine->uncore, engine->fw_domain);
1442	intel_gt_pm_put_async(engine->gt);
1443
1444	/*
1445	 * If this is part of a virtual engine, its next request may
1446	 * have been blocked waiting for access to the active context.
1447	 * We have to kick all the siblings again in case we need to
1448	 * switch (e.g. the next request is not runnable on this
1449	 * engine). Hopefully, we will already have submitted the next
1450	 * request before the tasklet runs and do not need to rebuild
1451	 * each virtual tree and kick everyone again.
1452	 */
1453	if (ce->engine != engine)
1454		kick_siblings(rq, ce);
1455
1456	intel_context_put(ce);
1457}
1458
1459static inline void
1460execlists_schedule_out(struct i915_request *rq)
1461{
1462	struct intel_context * const ce = rq->context;
1463	struct intel_engine_cs *cur, *old;
1464	u32 ccid;
1465
1466	trace_i915_request_out(rq);
1467
1468	ccid = rq->context->lrc.ccid;
1469	old = READ_ONCE(ce->inflight);
1470	do
1471		cur = ptr_unmask_bits(old, 2) ? ptr_dec(old) : NULL;
1472	while (!try_cmpxchg(&ce->inflight, &old, cur));
1473	if (!cur)
1474		__execlists_schedule_out(rq, old, ccid);
1475
1476	i915_request_put(rq);
1477}
1478
1479static u64 execlists_update_context(struct i915_request *rq)
1480{
1481	struct intel_context *ce = rq->context;
1482	u64 desc = ce->lrc.desc;
1483	u32 tail, prev;
1484
1485	/*
1486	 * WaIdleLiteRestore:bdw,skl
1487	 *
1488	 * We should never submit the context with the same RING_TAIL twice
1489	 * just in case we submit an empty ring, which confuses the HW.
1490	 *
1491	 * We append a couple of NOOPs (gen8_emit_wa_tail) after the end of
1492	 * the normal request to be able to always advance the RING_TAIL on
1493	 * subsequent resubmissions (for lite restore). Should that fail us,
1494	 * and we try and submit the same tail again, force the context
1495	 * reload.
1496	 *
1497	 * If we need to return to a preempted context, we need to skip the
1498	 * lite-restore and force it to reload the RING_TAIL. Otherwise, the
1499	 * HW has a tendency to ignore us rewinding the TAIL to the end of
1500	 * an earlier request.
1501	 */
1502	GEM_BUG_ON(ce->lrc_reg_state[CTX_RING_TAIL] != rq->ring->tail);
1503	prev = rq->ring->tail;
1504	tail = intel_ring_set_tail(rq->ring, rq->tail);
1505	if (unlikely(intel_ring_direction(rq->ring, tail, prev) <= 0))
1506		desc |= CTX_DESC_FORCE_RESTORE;
1507	ce->lrc_reg_state[CTX_RING_TAIL] = tail;
1508	rq->tail = rq->wa_tail;
1509
1510	/*
1511	 * Make sure the context image is complete before we submit it to HW.
1512	 *
1513	 * Ostensibly, writes (including the WCB) should be flushed prior to
1514	 * an uncached write such as our mmio register access, the empirical
1515	 * evidence (esp. on Braswell) suggests that the WC write into memory
1516	 * may not be visible to the HW prior to the completion of the UC
1517	 * register write and that we may begin execution from the context
1518	 * before its image is complete leading to invalid PD chasing.
1519	 */
1520	wmb();
1521
1522	ce->lrc.desc &= ~CTX_DESC_FORCE_RESTORE;
1523	return desc;
1524}
1525
1526static inline void write_desc(struct intel_engine_execlists *execlists, u64 desc, u32 port)
1527{
1528	if (execlists->ctrl_reg) {
1529		writel(lower_32_bits(desc), execlists->submit_reg + port * 2);
1530		writel(upper_32_bits(desc), execlists->submit_reg + port * 2 + 1);
1531	} else {
1532		writel(upper_32_bits(desc), execlists->submit_reg);
1533		writel(lower_32_bits(desc), execlists->submit_reg);
1534	}
1535}
1536
1537static __maybe_unused char *
1538dump_port(char *buf, int buflen, const char *prefix, struct i915_request *rq)
1539{
1540	if (!rq)
1541		return "";
1542
1543	snprintf(buf, buflen, "%sccid:%x %llx:%lld%s prio %d",
1544		 prefix,
1545		 rq->context->lrc.ccid,
1546		 rq->fence.context, rq->fence.seqno,
1547		 i915_request_completed(rq) ? "!" :
1548		 i915_request_started(rq) ? "*" :
1549		 "",
1550		 rq_prio(rq));
1551
1552	return buf;
1553}
1554
1555static __maybe_unused void
1556trace_ports(const struct intel_engine_execlists *execlists,
1557	    const char *msg,
1558	    struct i915_request * const *ports)
1559{
1560	const struct intel_engine_cs *engine =
1561		container_of(execlists, typeof(*engine), execlists);
1562	char __maybe_unused p0[40], p1[40];
1563
1564	if (!ports[0])
1565		return;
1566
1567	ENGINE_TRACE(engine, "%s { %s%s }\n", msg,
1568		     dump_port(p0, sizeof(p0), "", ports[0]),
1569		     dump_port(p1, sizeof(p1), ", ", ports[1]));
1570}
1571
1572static inline bool
1573reset_in_progress(const struct intel_engine_execlists *execlists)
1574{
1575	return unlikely(!__tasklet_is_enabled(&execlists->tasklet));
1576}
1577
1578static __maybe_unused bool
1579assert_pending_valid(const struct intel_engine_execlists *execlists,
1580		     const char *msg)
1581{
1582	struct intel_engine_cs *engine =
1583		container_of(execlists, typeof(*engine), execlists);
1584	struct i915_request * const *port, *rq;
1585	struct intel_context *ce = NULL;
1586	bool sentinel = false;
1587	u32 ccid = -1;
1588
1589	trace_ports(execlists, msg, execlists->pending);
1590
1591	/* We may be messing around with the lists during reset, lalala */
1592	if (reset_in_progress(execlists))
1593		return true;
1594
1595	if (!execlists->pending[0]) {
1596		GEM_TRACE_ERR("%s: Nothing pending for promotion!\n",
1597			      engine->name);
1598		return false;
1599	}
1600
1601	if (execlists->pending[execlists_num_ports(execlists)]) {
1602		GEM_TRACE_ERR("%s: Excess pending[%d] for promotion!\n",
1603			      engine->name, execlists_num_ports(execlists));
1604		return false;
1605	}
1606
1607	for (port = execlists->pending; (rq = *port); port++) {
1608		unsigned long flags;
1609		bool ok = true;
1610
1611		GEM_BUG_ON(!kref_read(&rq->fence.refcount));
1612		GEM_BUG_ON(!i915_request_is_active(rq));
1613
1614		if (ce == rq->context) {
1615			GEM_TRACE_ERR("%s: Dup context:%llx in pending[%zd]\n",
1616				      engine->name,
1617				      ce->timeline->fence_context,
1618				      port - execlists->pending);
1619			return false;
1620		}
1621		ce = rq->context;
1622
1623		if (ccid == ce->lrc.ccid) {
1624			GEM_TRACE_ERR("%s: Dup ccid:%x context:%llx in pending[%zd]\n",
1625				      engine->name,
1626				      ccid, ce->timeline->fence_context,
1627				      port - execlists->pending);
1628			return false;
1629		}
1630		ccid = ce->lrc.ccid;
1631
1632		/*
1633		 * Sentinels are supposed to be the last request so they flush
1634		 * the current execution off the HW. Check that they are the only
1635		 * request in the pending submission.
1636		 */
1637		if (sentinel) {
1638			GEM_TRACE_ERR("%s: context:%llx after sentinel in pending[%zd]\n",
1639				      engine->name,
1640				      ce->timeline->fence_context,
1641				      port - execlists->pending);
1642			return false;
1643		}
1644		sentinel = i915_request_has_sentinel(rq);
1645
1646		/* Hold tightly onto the lock to prevent concurrent retires! */
1647		if (!spin_trylock_irqsave(&rq->lock, flags))
1648			continue;
1649
1650		if (i915_request_completed(rq))
1651			goto unlock;
1652
1653		if (i915_active_is_idle(&ce->active) &&
1654		    !intel_context_is_barrier(ce)) {
1655			GEM_TRACE_ERR("%s: Inactive context:%llx in pending[%zd]\n",
1656				      engine->name,
1657				      ce->timeline->fence_context,
1658				      port - execlists->pending);
1659			ok = false;
1660			goto unlock;
1661		}
1662
1663		if (!i915_vma_is_pinned(ce->state)) {
1664			GEM_TRACE_ERR("%s: Unpinned context:%llx in pending[%zd]\n",
1665				      engine->name,
1666				      ce->timeline->fence_context,
1667				      port - execlists->pending);
1668			ok = false;
1669			goto unlock;
1670		}
1671
1672		if (!i915_vma_is_pinned(ce->ring->vma)) {
1673			GEM_TRACE_ERR("%s: Unpinned ring:%llx in pending[%zd]\n",
1674				      engine->name,
1675				      ce->timeline->fence_context,
1676				      port - execlists->pending);
1677			ok = false;
1678			goto unlock;
1679		}
1680
1681unlock:
1682		spin_unlock_irqrestore(&rq->lock, flags);
1683		if (!ok)
1684			return false;
1685	}
1686
1687	return ce;
1688}
1689
1690static void execlists_submit_ports(struct intel_engine_cs *engine)
1691{
1692	struct intel_engine_execlists *execlists = &engine->execlists;
1693	unsigned int n;
1694
1695	GEM_BUG_ON(!assert_pending_valid(execlists, "submit"));
1696
1697	/*
1698	 * We can skip acquiring intel_runtime_pm_get() here as it was taken
1699	 * on our behalf by the request (see i915_gem_mark_busy()) and it will
1700	 * not be relinquished until the device is idle (see
1701	 * i915_gem_idle_work_handler()). As a precaution, we make sure
1702	 * that all ELSP are drained i.e. we have processed the CSB,
1703	 * before allowing ourselves to idle and calling intel_runtime_pm_put().
1704	 */
1705	GEM_BUG_ON(!intel_engine_pm_is_awake(engine));
1706
1707	/*
1708	 * ELSQ note: the submit queue is not cleared after being submitted
1709	 * to the HW so we need to make sure we always clean it up. This is
1710	 * currently ensured by the fact that we always write the same number
1711	 * of elsq entries, keep this in mind before changing the loop below.
1712	 */
1713	for (n = execlists_num_ports(execlists); n--; ) {
1714		struct i915_request *rq = execlists->pending[n];
1715
1716		write_desc(execlists,
1717			   rq ? execlists_update_context(rq) : 0,
1718			   n);
1719	}
1720
1721	/* we need to manually load the submit queue */
1722	if (execlists->ctrl_reg)
1723		writel(EL_CTRL_LOAD, execlists->ctrl_reg);
1724}
1725
1726static bool ctx_single_port_submission(const struct intel_context *ce)
1727{
1728	return (IS_ENABLED(CONFIG_DRM_I915_GVT) &&
1729		intel_context_force_single_submission(ce));
1730}
1731
1732static bool can_merge_ctx(const struct intel_context *prev,
1733			  const struct intel_context *next)
1734{
1735	if (prev != next)
1736		return false;
1737
1738	if (ctx_single_port_submission(prev))
1739		return false;
1740
1741	return true;
1742}
1743
1744static unsigned long i915_request_flags(const struct i915_request *rq)
1745{
1746	return READ_ONCE(rq->fence.flags);
1747}
1748
1749static bool can_merge_rq(const struct i915_request *prev,
1750			 const struct i915_request *next)
1751{
1752	GEM_BUG_ON(prev == next);
1753	GEM_BUG_ON(!assert_priority_queue(prev, next));
1754
1755	/*
1756	 * We do not submit known completed requests. Therefore if the next
1757	 * request is already completed, we can pretend to merge it in
1758	 * with the previous context (and we will skip updating the ELSP
1759	 * and tracking). Thus hopefully keeping the ELSP full with active
1760	 * contexts, despite the best efforts of preempt-to-busy to confuse
1761	 * us.
1762	 */
1763	if (i915_request_completed(next))
1764		return true;
1765
1766	if (unlikely((i915_request_flags(prev) ^ i915_request_flags(next)) &
1767		     (BIT(I915_FENCE_FLAG_NOPREEMPT) |
1768		      BIT(I915_FENCE_FLAG_SENTINEL))))
1769		return false;
1770
1771	if (!can_merge_ctx(prev->context, next->context))
1772		return false;
1773
1774	GEM_BUG_ON(i915_seqno_passed(prev->fence.seqno, next->fence.seqno));
1775	return true;
1776}
1777
1778static void virtual_update_register_offsets(u32 *regs,
1779					    struct intel_engine_cs *engine)
1780{
1781	set_offsets(regs, reg_offsets(engine), engine, false);
1782}
1783
1784static bool virtual_matches(const struct virtual_engine *ve,
1785			    const struct i915_request *rq,
1786			    const struct intel_engine_cs *engine)
1787{
1788	const struct intel_engine_cs *inflight;
1789
1790	if (!(rq->execution_mask & engine->mask)) /* We peeked too soon! */
1791		return false;
1792
1793	/*
1794	 * We track when the HW has completed saving the context image
1795	 * (i.e. when we have seen the final CS event switching out of
1796	 * the context) and must not overwrite the context image before
1797	 * then. This restricts us to only using the active engine
1798	 * while the previous virtualized request is inflight (so
1799	 * we reuse the register offsets). This is a very small
1800	 * hystersis on the greedy seelction algorithm.
1801	 */
1802	inflight = intel_context_inflight(&ve->context);
1803	if (inflight && inflight != engine)
1804		return false;
1805
1806	return true;
1807}
1808
1809static void virtual_xfer_context(struct virtual_engine *ve,
1810				 struct intel_engine_cs *engine)
1811{
1812	unsigned int n;
1813
1814	if (likely(engine == ve->siblings[0]))
1815		return;
1816
1817	GEM_BUG_ON(READ_ONCE(ve->context.inflight));
1818	if (!intel_engine_has_relative_mmio(engine))
1819		virtual_update_register_offsets(ve->context.lrc_reg_state,
1820						engine);
1821
1822	/*
1823	 * Move the bound engine to the top of the list for
1824	 * future execution. We then kick this tasklet first
1825	 * before checking others, so that we preferentially
1826	 * reuse this set of bound registers.
1827	 */
1828	for (n = 1; n < ve->num_siblings; n++) {
1829		if (ve->siblings[n] == engine) {
1830			swap(ve->siblings[n], ve->siblings[0]);
1831			break;
1832		}
1833	}
1834}
1835
1836#define for_each_waiter(p__, rq__) \
1837	list_for_each_entry_lockless(p__, \
1838				     &(rq__)->sched.waiters_list, \
1839				     wait_link)
1840
1841#define for_each_signaler(p__, rq__) \
1842	list_for_each_entry_rcu(p__, \
1843				&(rq__)->sched.signalers_list, \
1844				signal_link)
1845
1846static void defer_request(struct i915_request *rq, struct list_head * const pl)
1847{
1848	LIST_HEAD(list);
1849
1850	/*
1851	 * We want to move the interrupted request to the back of
1852	 * the round-robin list (i.e. its priority level), but
1853	 * in doing so, we must then move all requests that were in
1854	 * flight and were waiting for the interrupted request to
1855	 * be run after it again.
1856	 */
1857	do {
1858		struct i915_dependency *p;
1859
1860		GEM_BUG_ON(i915_request_is_active(rq));
1861		list_move_tail(&rq->sched.link, pl);
1862
1863		for_each_waiter(p, rq) {
1864			struct i915_request *w =
1865				container_of(p->waiter, typeof(*w), sched);
1866
1867			if (p->flags & I915_DEPENDENCY_WEAK)
1868				continue;
1869
1870			/* Leave semaphores spinning on the other engines */
1871			if (w->engine != rq->engine)
1872				continue;
1873
1874			/* No waiter should start before its signaler */
1875			GEM_BUG_ON(i915_request_has_initial_breadcrumb(w) &&
1876				   i915_request_started(w) &&
1877				   !i915_request_completed(rq));
1878
1879			GEM_BUG_ON(i915_request_is_active(w));
1880			if (!i915_request_is_ready(w))
1881				continue;
1882
1883			if (rq_prio(w) < rq_prio(rq))
1884				continue;
1885
1886			GEM_BUG_ON(rq_prio(w) > rq_prio(rq));
1887			list_move_tail(&w->sched.link, &list);
1888		}
1889
1890		rq = list_first_entry_or_null(&list, typeof(*rq), sched.link);
1891	} while (rq);
1892}
1893
1894static void defer_active(struct intel_engine_cs *engine)
1895{
1896	struct i915_request *rq;
1897
1898	rq = __unwind_incomplete_requests(engine);
1899	if (!rq)
1900		return;
1901
1902	defer_request(rq, i915_sched_lookup_priolist(engine, rq_prio(rq)));
1903}
1904
1905static bool
1906need_timeslice(const struct intel_engine_cs *engine,
1907	       const struct i915_request *rq,
1908	       const struct rb_node *rb)
1909{
1910	int hint;
1911
1912	if (!intel_engine_has_timeslices(engine))
1913		return false;
1914
1915	hint = engine->execlists.queue_priority_hint;
1916
1917	if (rb) {
1918		const struct virtual_engine *ve =
1919			rb_entry(rb, typeof(*ve), nodes[engine->id].rb);
1920		const struct intel_engine_cs *inflight =
1921			intel_context_inflight(&ve->context);
1922
1923		if (!inflight || inflight == engine) {
1924			struct i915_request *next;
1925
1926			rcu_read_lock();
1927			next = READ_ONCE(ve->request);
1928			if (next)
1929				hint = max(hint, rq_prio(next));
1930			rcu_read_unlock();
1931		}
1932	}
1933
1934	if (!list_is_last(&rq->sched.link, &engine->active.requests))
1935		hint = max(hint, rq_prio(list_next_entry(rq, sched.link)));
1936
1937	GEM_BUG_ON(hint >= I915_PRIORITY_UNPREEMPTABLE);
1938	return hint >= effective_prio(rq);
1939}
1940
1941static bool
1942timeslice_yield(const struct intel_engine_execlists *el,
1943		const struct i915_request *rq)
1944{
1945	/*
1946	 * Once bitten, forever smitten!
1947	 *
1948	 * If the active context ever busy-waited on a semaphore,
1949	 * it will be treated as a hog until the end of its timeslice (i.e.
1950	 * until it is scheduled out and replaced by a new submission,
1951	 * possibly even its own lite-restore). The HW only sends an interrupt
1952	 * on the first miss, and we do know if that semaphore has been
1953	 * signaled, or even if it is now stuck on another semaphore. Play
1954	 * safe, yield if it might be stuck -- it will be given a fresh
1955	 * timeslice in the near future.
1956	 */
1957	return rq->context->lrc.ccid == READ_ONCE(el->yield);
1958}
1959
1960static bool
1961timeslice_expired(const struct intel_engine_execlists *el,
1962		  const struct i915_request *rq)
1963{
1964	return timer_expired(&el->timer) || timeslice_yield(el, rq);
1965}
1966
1967static int
1968switch_prio(struct intel_engine_cs *engine, const struct i915_request *rq)
1969{
1970	if (list_is_last(&rq->sched.link, &engine->active.requests))
1971		return engine->execlists.queue_priority_hint;
1972
1973	return rq_prio(list_next_entry(rq, sched.link));
1974}
1975
1976static inline unsigned long
1977timeslice(const struct intel_engine_cs *engine)
1978{
1979	return READ_ONCE(engine->props.timeslice_duration_ms);
1980}
1981
1982static unsigned long active_timeslice(const struct intel_engine_cs *engine)
1983{
1984	const struct intel_engine_execlists *execlists = &engine->execlists;
1985	const struct i915_request *rq = *execlists->active;
1986
1987	if (!rq || i915_request_completed(rq))
1988		return 0;
1989
1990	if (READ_ONCE(execlists->switch_priority_hint) < effective_prio(rq))
1991		return 0;
1992
1993	return timeslice(engine);
1994}
1995
1996static void set_timeslice(struct intel_engine_cs *engine)
1997{
1998	unsigned long duration;
1999
2000	if (!intel_engine_has_timeslices(engine))
2001		return;
2002
2003	duration = active_timeslice(engine);
2004	ENGINE_TRACE(engine, "bump timeslicing, interval:%lu", duration);
2005
2006	set_timer_ms(&engine->execlists.timer, duration);
2007}
2008
2009static void start_timeslice(struct intel_engine_cs *engine, int prio)
2010{
2011	struct intel_engine_execlists *execlists = &engine->execlists;
2012	unsigned long duration;
2013
2014	if (!intel_engine_has_timeslices(engine))
2015		return;
2016
2017	WRITE_ONCE(execlists->switch_priority_hint, prio);
2018	if (prio == INT_MIN)
2019		return;
2020
2021	if (timer_pending(&execlists->timer))
2022		return;
2023
2024	duration = timeslice(engine);
2025	ENGINE_TRACE(engine,
2026		     "start timeslicing, prio:%d, interval:%lu",
2027		     prio, duration);
2028
2029	set_timer_ms(&execlists->timer, duration);
2030}
2031
2032static void record_preemption(struct intel_engine_execlists *execlists)
2033{
2034	(void)I915_SELFTEST_ONLY(execlists->preempt_hang.count++);
2035}
2036
2037static unsigned long active_preempt_timeout(struct intel_engine_cs *engine,
2038					    const struct i915_request *rq)
2039{
2040	if (!rq)
2041		return 0;
2042
2043	/* Force a fast reset for terminated contexts (ignoring sysfs!) */
2044	if (unlikely(intel_context_is_banned(rq->context)))
2045		return 1;
2046
2047	return READ_ONCE(engine->props.preempt_timeout_ms);
2048}
2049
2050static void set_preempt_timeout(struct intel_engine_cs *engine,
2051				const struct i915_request *rq)
2052{
2053	if (!intel_engine_has_preempt_reset(engine))
2054		return;
2055
2056	set_timer_ms(&engine->execlists.preempt,
2057		     active_preempt_timeout(engine, rq));
2058}
2059
2060static inline void clear_ports(struct i915_request **ports, int count)
2061{
2062	memset_p((void **)ports, NULL, count);
2063}
2064
2065static inline void
2066copy_ports(struct i915_request **dst, struct i915_request **src, int count)
2067{
2068	/* A memcpy_p() would be very useful here! */
2069	while (count--)
2070		WRITE_ONCE(*dst++, *src++); /* avoid write tearing */
2071}
2072
2073static void execlists_dequeue(struct intel_engine_cs *engine)
2074{
2075	struct intel_engine_execlists * const execlists = &engine->execlists;
2076	struct i915_request **port = execlists->pending;
2077	struct i915_request ** const last_port = port + execlists->port_mask;
2078	struct i915_request * const *active;
2079	struct i915_request *last;
2080	struct rb_node *rb;
2081	bool submit = false;
2082
2083	/*
2084	 * Hardware submission is through 2 ports. Conceptually each port
2085	 * has a (RING_START, RING_HEAD, RING_TAIL) tuple. RING_START is
2086	 * static for a context, and unique to each, so we only execute
2087	 * requests belonging to a single context from each ring. RING_HEAD
2088	 * is maintained by the CS in the context image, it marks the place
2089	 * where it got up to last time, and through RING_TAIL we tell the CS
2090	 * where we want to execute up to this time.
2091	 *
2092	 * In this list the requests are in order of execution. Consecutive
2093	 * requests from the same context are adjacent in the ringbuffer. We
2094	 * can combine these requests into a single RING_TAIL update:
2095	 *
2096	 *              RING_HEAD...req1...req2
2097	 *                                    ^- RING_TAIL
2098	 * since to execute req2 the CS must first execute req1.
2099	 *
2100	 * Our goal then is to point each port to the end of a consecutive
2101	 * sequence of requests as being the most optimal (fewest wake ups
2102	 * and context switches) submission.
2103	 */
2104
2105	for (rb = rb_first_cached(&execlists->virtual); rb; ) {
2106		struct virtual_engine *ve =
2107			rb_entry(rb, typeof(*ve), nodes[engine->id].rb);
2108		struct i915_request *rq = READ_ONCE(ve->request);
2109
2110		if (!rq) { /* lazily cleanup after another engine handled rq */
2111			rb_erase_cached(rb, &execlists->virtual);
2112			RB_CLEAR_NODE(rb);
2113			rb = rb_first_cached(&execlists->virtual);
2114			continue;
2115		}
2116
2117		if (!virtual_matches(ve, rq, engine)) {
2118			rb = rb_next(rb);
2119			continue;
2120		}
2121
2122		break;
2123	}
2124
2125	/*
2126	 * If the queue is higher priority than the last
2127	 * request in the currently active context, submit afresh.
2128	 * We will resubmit again afterwards in case we need to split
2129	 * the active context to interject the preemption request,
2130	 * i.e. we will retrigger preemption following the ack in case
2131	 * of trouble.
2132	 */
2133	active = READ_ONCE(execlists->active);
2134
2135	/*
2136	 * In theory we can skip over completed contexts that have not
2137	 * yet been processed by events (as those events are in flight):
2138	 *
2139	 * while ((last = *active) && i915_request_completed(last))
2140	 *	active++;
2141	 *
2142	 * However, the GPU cannot handle this as it will ultimately
2143	 * find itself trying to jump back into a context it has just
2144	 * completed and barf.
2145	 */
2146
2147	if ((last = *active)) {
2148		if (need_preempt(engine, last, rb)) {
2149			if (i915_request_completed(last)) {
2150				tasklet_hi_schedule(&execlists->tasklet);
2151				return;
2152			}
2153
2154			ENGINE_TRACE(engine,
2155				     "preempting last=%llx:%lld, prio=%d, hint=%d\n",
2156				     last->fence.context,
2157				     last->fence.seqno,
2158				     last->sched.attr.priority,
2159				     execlists->queue_priority_hint);
2160			record_preemption(execlists);
2161
2162			/*
2163			 * Don't let the RING_HEAD advance past the breadcrumb
2164			 * as we unwind (and until we resubmit) so that we do
2165			 * not accidentally tell it to go backwards.
2166			 */
2167			ring_set_paused(engine, 1);
2168
2169			/*
2170			 * Note that we have not stopped the GPU at this point,
2171			 * so we are unwinding the incomplete requests as they
2172			 * remain inflight and so by the time we do complete
2173			 * the preemption, some of the unwound requests may
2174			 * complete!
2175			 */
2176			__unwind_incomplete_requests(engine);
2177
2178			last = NULL;
2179		} else if (need_timeslice(engine, last, rb) &&
2180			   timeslice_expired(execlists, last)) {
2181			if (i915_request_completed(last)) {
2182				tasklet_hi_schedule(&execlists->tasklet);
2183				return;
2184			}
2185
2186			ENGINE_TRACE(engine,
2187				     "expired last=%llx:%lld, prio=%d, hint=%d, yield?=%s\n",
2188				     last->fence.context,
2189				     last->fence.seqno,
2190				     last->sched.attr.priority,
2191				     execlists->queue_priority_hint,
2192				     yesno(timeslice_yield(execlists, last)));
2193
2194			ring_set_paused(engine, 1);
2195			defer_active(engine);
2196
2197			/*
2198			 * Unlike for preemption, if we rewind and continue
2199			 * executing the same context as previously active,
2200			 * the order of execution will remain the same and
2201			 * the tail will only advance. We do not need to
2202			 * force a full context restore, as a lite-restore
2203			 * is sufficient to resample the monotonic TAIL.
2204			 *
2205			 * If we switch to any other context, similarly we
2206			 * will not rewind TAIL of current context, and
2207			 * normal save/restore will preserve state and allow
2208			 * us to later continue executing the same request.
2209			 */
2210			last = NULL;
2211		} else {
2212			/*
2213			 * Otherwise if we already have a request pending
2214			 * for execution after the current one, we can
2215			 * just wait until the next CS event before
2216			 * queuing more. In either case we will force a
2217			 * lite-restore preemption event, but if we wait
2218			 * we hopefully coalesce several updates into a single
2219			 * submission.
2220			 */
2221			if (!list_is_last(&last->sched.link,
2222					  &engine->active.requests)) {
2223				/*
2224				 * Even if ELSP[1] is occupied and not worthy
2225				 * of timeslices, our queue might be.
2226				 */
2227				start_timeslice(engine, queue_prio(execlists));
2228				return;
2229			}
2230		}
2231	}
2232
2233	while (rb) { /* XXX virtual is always taking precedence */
2234		struct virtual_engine *ve =
2235			rb_entry(rb, typeof(*ve), nodes[engine->id].rb);
2236		struct i915_request *rq;
2237
2238		spin_lock(&ve->base.active.lock);
2239
2240		rq = ve->request;
2241		if (unlikely(!rq)) { /* lost the race to a sibling */
2242			spin_unlock(&ve->base.active.lock);
2243			rb_erase_cached(rb, &execlists->virtual);
2244			RB_CLEAR_NODE(rb);
2245			rb = rb_first_cached(&execlists->virtual);
2246			continue;
2247		}
2248
2249		GEM_BUG_ON(rq != ve->request);
2250		GEM_BUG_ON(rq->engine != &ve->base);
2251		GEM_BUG_ON(rq->context != &ve->context);
2252
2253		if (rq_prio(rq) >= queue_prio(execlists)) {
2254			if (!virtual_matches(ve, rq, engine)) {
2255				spin_unlock(&ve->base.active.lock);
2256				rb = rb_next(rb);
2257				continue;
2258			}
2259
2260			if (last && !can_merge_rq(last, rq)) {
2261				spin_unlock(&ve->base.active.lock);
2262				start_timeslice(engine, rq_prio(rq));
2263				return; /* leave this for another sibling */
2264			}
2265
2266			ENGINE_TRACE(engine,
2267				     "virtual rq=%llx:%lld%s, new engine? %s\n",
2268				     rq->fence.context,
2269				     rq->fence.seqno,
2270				     i915_request_completed(rq) ? "!" :
2271				     i915_request_started(rq) ? "*" :
2272				     "",
2273				     yesno(engine != ve->siblings[0]));
2274
2275			WRITE_ONCE(ve->request, NULL);
2276			WRITE_ONCE(ve->base.execlists.queue_priority_hint,
2277				   INT_MIN);
2278			rb_erase_cached(rb, &execlists->virtual);
2279			RB_CLEAR_NODE(rb);
2280
2281			GEM_BUG_ON(!(rq->execution_mask & engine->mask));
2282			WRITE_ONCE(rq->engine, engine);
2283
2284			if (__i915_request_submit(rq)) {
2285				/*
2286				 * Only after we confirm that we will submit
2287				 * this request (i.e. it has not already
2288				 * completed), do we want to update the context.
2289				 *
2290				 * This serves two purposes. It avoids
2291				 * unnecessary work if we are resubmitting an
2292				 * already completed request after timeslicing.
2293				 * But more importantly, it prevents us altering
2294				 * ve->siblings[] on an idle context, where
2295				 * we may be using ve->siblings[] in
2296				 * virtual_context_enter / virtual_context_exit.
2297				 */
2298				virtual_xfer_context(ve, engine);
2299				GEM_BUG_ON(ve->siblings[0] != engine);
2300
2301				submit = true;
2302				last = rq;
2303			}
2304			i915_request_put(rq);
2305
2306			/*
2307			 * Hmm, we have a bunch of virtual engine requests,
2308			 * but the first one was already completed (thanks
2309			 * preempt-to-busy!). Keep looking at the veng queue
2310			 * until we have no more relevant requests (i.e.
2311			 * the normal submit queue has higher priority).
2312			 */
2313			if (!submit) {
2314				spin_unlock(&ve->base.active.lock);
2315				rb = rb_first_cached(&execlists->virtual);
2316				continue;
2317			}
2318		}
2319
2320		spin_unlock(&ve->base.active.lock);
2321		break;
2322	}
2323
2324	while ((rb = rb_first_cached(&execlists->queue))) {
2325		struct i915_priolist *p = to_priolist(rb);
2326		struct i915_request *rq, *rn;
2327		int i;
2328
2329		priolist_for_each_request_consume(rq, rn, p, i) {
2330			bool merge = true;
2331
2332			/*
2333			 * Can we combine this request with the current port?
2334			 * It has to be the same context/ringbuffer and not
2335			 * have any exceptions (e.g. GVT saying never to
2336			 * combine contexts).
2337			 *
2338			 * If we can combine the requests, we can execute both
2339			 * by updating the RING_TAIL to point to the end of the
2340			 * second request, and so we never need to tell the
2341			 * hardware about the first.
2342			 */
2343			if (last && !can_merge_rq(last, rq)) {
2344				/*
2345				 * If we are on the second port and cannot
2346				 * combine this request with the last, then we
2347				 * are done.
2348				 */
2349				if (port == last_port)
2350					goto done;
2351
2352				/*
2353				 * We must not populate both ELSP[] with the
2354				 * same LRCA, i.e. we must submit 2 different
2355				 * contexts if we submit 2 ELSP.
2356				 */
2357				if (last->context == rq->context)
2358					goto done;
2359
2360				if (i915_request_has_sentinel(last))
2361					goto done;
2362
2363				/*
2364				 * If GVT overrides us we only ever submit
2365				 * port[0], leaving port[1] empty. Note that we
2366				 * also have to be careful that we don't queue
2367				 * the same context (even though a different
2368				 * request) to the second port.
2369				 */
2370				if (ctx_single_port_submission(last->context) ||
2371				    ctx_single_port_submission(rq->context))
2372					goto done;
2373
2374				merge = false;
2375			}
2376
2377			if (__i915_request_submit(rq)) {
2378				if (!merge) {
2379					*port = execlists_schedule_in(last, port - execlists->pending);
2380					port++;
2381					last = NULL;
2382				}
2383
2384				GEM_BUG_ON(last &&
2385					   !can_merge_ctx(last->context,
2386							  rq->context));
2387				GEM_BUG_ON(last &&
2388					   i915_seqno_passed(last->fence.seqno,
2389							     rq->fence.seqno));
2390
2391				submit = true;
2392				last = rq;
2393			}
2394		}
2395
2396		rb_erase_cached(&p->node, &execlists->queue);
2397		i915_priolist_free(p);
2398	}
2399
2400done:
2401	/*
2402	 * Here be a bit of magic! Or sleight-of-hand, whichever you prefer.
2403	 *
2404	 * We choose the priority hint such that if we add a request of greater
2405	 * priority than this, we kick the submission tasklet to decide on
2406	 * the right order of submitting the requests to hardware. We must
2407	 * also be prepared to reorder requests as they are in-flight on the
2408	 * HW. We derive the priority hint then as the first "hole" in
2409	 * the HW submission ports and if there are no available slots,
2410	 * the priority of the lowest executing request, i.e. last.
2411	 *
2412	 * When we do receive a higher priority request ready to run from the
2413	 * user, see queue_request(), the priority hint is bumped to that
2414	 * request triggering preemption on the next dequeue (or subsequent
2415	 * interrupt for secondary ports).
2416	 */
2417	execlists->queue_priority_hint = queue_prio(execlists);
2418
2419	if (submit) {
2420		*port = execlists_schedule_in(last, port - execlists->pending);
2421		execlists->switch_priority_hint =
2422			switch_prio(engine, *execlists->pending);
2423
2424		/*
2425		 * Skip if we ended up with exactly the same set of requests,
2426		 * e.g. trying to timeslice a pair of ordered contexts
2427		 */
2428		if (!memcmp(active, execlists->pending,
2429			    (port - execlists->pending + 1) * sizeof(*port))) {
2430			do
2431				execlists_schedule_out(fetch_and_zero(port));
2432			while (port-- != execlists->pending);
2433
2434			goto skip_submit;
2435		}
2436		clear_ports(port + 1, last_port - port);
2437
2438		WRITE_ONCE(execlists->yield, -1);
2439		set_preempt_timeout(engine, *active);
2440		execlists_submit_ports(engine);
2441	} else {
2442		start_timeslice(engine, execlists->queue_priority_hint);
2443skip_submit:
2444		ring_set_paused(engine, 0);
2445	}
2446}
2447
2448static void
2449cancel_port_requests(struct intel_engine_execlists * const execlists)
2450{
2451	struct i915_request * const *port;
2452
2453	for (port = execlists->pending; *port; port++)
2454		execlists_schedule_out(*port);
2455	clear_ports(execlists->pending, ARRAY_SIZE(execlists->pending));
2456
2457	/* Mark the end of active before we overwrite *active */
2458	for (port = xchg(&execlists->active, execlists->pending); *port; port++)
2459		execlists_schedule_out(*port);
2460	clear_ports(execlists->inflight, ARRAY_SIZE(execlists->inflight));
2461
2462	smp_wmb(); /* complete the seqlock for execlists_active() */
2463	WRITE_ONCE(execlists->active, execlists->inflight);
2464}
2465
2466static inline void
2467invalidate_csb_entries(const u64 *first, const u64 *last)
2468{
2469	clflush((void *)first);
2470	clflush((void *)last);
2471}
2472
2473/*
2474 * Starting with Gen12, the status has a new format:
2475 *
2476 *     bit  0:     switched to new queue
2477 *     bit  1:     reserved
2478 *     bit  2:     semaphore wait mode (poll or signal), only valid when
2479 *                 switch detail is set to "wait on semaphore"
2480 *     bits 3-5:   engine class
2481 *     bits 6-11:  engine instance
2482 *     bits 12-14: reserved
2483 *     bits 15-25: sw context id of the lrc the GT switched to
2484 *     bits 26-31: sw counter of the lrc the GT switched to
2485 *     bits 32-35: context switch detail
2486 *                  - 0: ctx complete
2487 *                  - 1: wait on sync flip
2488 *                  - 2: wait on vblank
2489 *                  - 3: wait on scanline
2490 *                  - 4: wait on semaphore
2491 *                  - 5: context preempted (not on SEMAPHORE_WAIT or
2492 *                       WAIT_FOR_EVENT)
2493 *     bit  36:    reserved
2494 *     bits 37-43: wait detail (for switch detail 1 to 4)
2495 *     bits 44-46: reserved
2496 *     bits 47-57: sw context id of the lrc the GT switched away from
2497 *     bits 58-63: sw counter of the lrc the GT switched away from
2498 */
2499static inline bool gen12_csb_parse(const u64 *csb)
2500{
2501	bool ctx_away_valid;
2502	bool new_queue;
2503	u64 entry;
2504
2505	/* HSD#22011248461 */
2506	entry = READ_ONCE(*csb);
2507	if (unlikely(entry == -1)) {
2508		preempt_disable();
2509		if (wait_for_atomic_us((entry = READ_ONCE(*csb)) != -1, 50))
2510			GEM_WARN_ON("50us CSB timeout");
2511		preempt_enable();
2512	}
2513	WRITE_ONCE(*(u64 *)csb, -1);
2514
2515	ctx_away_valid = GEN12_CSB_CTX_VALID(upper_32_bits(entry));
2516	new_queue =
2517		lower_32_bits(entry) & GEN12_CTX_STATUS_SWITCHED_TO_NEW_QUEUE;
2518
2519	/*
2520	 * The context switch detail is not guaranteed to be 5 when a preemption
2521	 * occurs, so we can't just check for that. The check below works for
2522	 * all the cases we care about, including preemptions of WAIT
2523	 * instructions and lite-restore. Preempt-to-idle via the CTRL register
2524	 * would require some extra handling, but we don't support that.
2525	 */
2526	if (!ctx_away_valid || new_queue) {
2527		GEM_BUG_ON(!GEN12_CSB_CTX_VALID(lower_32_bits(entry)));
2528		return true;
2529	}
2530
2531	/*
2532	 * switch detail = 5 is covered by the case above and we do not expect a
2533	 * context switch on an unsuccessful wait instruction since we always
2534	 * use polling mode.
2535	 */
2536	GEM_BUG_ON(GEN12_CTX_SWITCH_DETAIL(upper_32_bits(entry)));
2537	return false;
2538}
2539
2540static inline bool gen8_csb_parse(const u64 *csb)
2541{
2542	return *csb & (GEN8_CTX_STATUS_IDLE_ACTIVE | GEN8_CTX_STATUS_PREEMPTED);
2543}
2544
2545static void process_csb(struct intel_engine_cs *engine)
2546{
2547	struct intel_engine_execlists * const execlists = &engine->execlists;
2548	const u64 * const buf = execlists->csb_status;
2549	const u8 num_entries = execlists->csb_size;
2550	u8 head, tail;
2551
2552	/*
2553	 * As we modify our execlists state tracking we require exclusive
2554	 * access. Either we are inside the tasklet, or the tasklet is disabled
2555	 * and we assume that is only inside the reset paths and so serialised.
2556	 */
2557	GEM_BUG_ON(!tasklet_is_locked(&execlists->tasklet) &&
2558		   !reset_in_progress(execlists));
2559	GEM_BUG_ON(!intel_engine_in_execlists_submission_mode(engine));
2560
2561	/*
2562	 * Note that csb_write, csb_status may be either in HWSP or mmio.
2563	 * When reading from the csb_write mmio register, we have to be
2564	 * careful to only use the GEN8_CSB_WRITE_PTR portion, which is
2565	 * the low 4bits. As it happens we know the next 4bits are always
2566	 * zero and so we can simply masked off the low u8 of the register
2567	 * and treat it identically to reading from the HWSP (without having
2568	 * to use explicit shifting and masking, and probably bifurcating
2569	 * the code to handle the legacy mmio read).
2570	 */
2571	head = execlists->csb_head;
2572	tail = READ_ONCE(*execlists->csb_write);
2573	if (unlikely(head == tail))
2574		return;
2575
2576	/*
2577	 * We will consume all events from HW, or at least pretend to.
2578	 *
2579	 * The sequence of events from the HW is deterministic, and derived
2580	 * from our writes to the ELSP, with a smidgen of variability for
2581	 * the arrival of the asynchronous requests wrt to the inflight
2582	 * execution. If the HW sends an event that does not correspond with
2583	 * the one we are expecting, we have to abandon all hope as we lose
2584	 * all tracking of what the engine is actually executing. We will
2585	 * only detect we are out of sequence with the HW when we get an
2586	 * 'impossible' event because we have already drained our own
2587	 * preemption/promotion queue. If this occurs, we know that we likely
2588	 * lost track of execution earlier and must unwind and restart, the
2589	 * simplest way is by stop processing the event queue and force the
2590	 * engine to reset.
2591	 */
2592	execlists->csb_head = tail;
2593	ENGINE_TRACE(engine, "cs-irq head=%d, tail=%d\n", head, tail);
2594
2595	/*
2596	 * Hopefully paired with a wmb() in HW!
2597	 *
2598	 * We must complete the read of the write pointer before any reads
2599	 * from the CSB, so that we do not see stale values. Without an rmb
2600	 * (lfence) the HW may speculatively perform the CSB[] reads *before*
2601	 * we perform the READ_ONCE(*csb_write).
2602	 */
2603	rmb();
2604	do {
2605		bool promote;
2606
2607		if (++head == num_entries)
2608			head = 0;
2609
2610		/*
2611		 * We are flying near dragons again.
2612		 *
2613		 * We hold a reference to the request in execlist_port[]
2614		 * but no more than that. We are operating in softirq
2615		 * context and so cannot hold any mutex or sleep. That
2616		 * prevents us stopping the requests we are processing
2617		 * in port[] from being retired simultaneously (the
2618		 * breadcrumb will be complete before we see the
2619		 * context-switch). As we only hold the reference to the
2620		 * request, any pointer chasing underneath the request
2621		 * is subject to a potential use-after-free. Thus we
2622		 * store all of the bookkeeping within port[] as
2623		 * required, and avoid using unguarded pointers beneath
2624		 * request itself. The same applies to the atomic
2625		 * status notifier.
2626		 */
2627
2628		ENGINE_TRACE(engine, "csb[%d]: status=0x%08x:0x%08x\n",
2629			     head,
2630			     upper_32_bits(buf[head]),
2631			     lower_32_bits(buf[head]));
2632
2633		if (INTEL_GEN(engine->i915) >= 12)
2634			promote = gen12_csb_parse(buf + head);
2635		else
2636			promote = gen8_csb_parse(buf + head);
2637		if (promote) {
2638			struct i915_request * const *old = execlists->active;
2639
2640			if (GEM_WARN_ON(!*execlists->pending)) {
2641				execlists->error_interrupt |= ERROR_CSB;
2642				break;
2643			}
2644
2645			ring_set_paused(engine, 0);
2646
2647			/* Point active to the new ELSP; prevent overwriting */
2648			WRITE_ONCE(execlists->active, execlists->pending);
2649			smp_wmb(); /* notify execlists_active() */
2650
2651			/* cancel old inflight, prepare for switch */
2652			trace_ports(execlists, "preempted", old);
2653			while (*old)
2654				execlists_schedule_out(*old++);
2655
2656			/* switch pending to inflight */
2657			GEM_BUG_ON(!assert_pending_valid(execlists, "promote"));
2658			copy_ports(execlists->inflight,
2659				   execlists->pending,
2660				   execlists_num_ports(execlists));
2661			smp_wmb(); /* complete the seqlock */
2662			WRITE_ONCE(execlists->active, execlists->inflight);
2663
2664			/* XXX Magic delay for tgl */
2665			ENGINE_POSTING_READ(engine, RING_CONTEXT_STATUS_PTR);
2666
2667			WRITE_ONCE(execlists->pending[0], NULL);
2668		} else {
2669			if (GEM_WARN_ON(!*execlists->active)) {
2670				execlists->error_interrupt |= ERROR_CSB;
2671				break;
2672			}
2673
2674			/* port0 completed, advanced to port1 */
2675			trace_ports(execlists, "completed", execlists->active);
2676
2677			/*
2678			 * We rely on the hardware being strongly
2679			 * ordered, that the breadcrumb write is
2680			 * coherent (visible from the CPU) before the
2681			 * user interrupt is processed. One might assume
2682			 * that the breadcrumb write being before the
2683			 * user interrupt and the CS event for the context
2684			 * switch would therefore be before the CS event
2685			 * itself...
2686			 */
2687			if (GEM_SHOW_DEBUG() &&
2688			    !i915_request_completed(*execlists->active)) {
2689				struct i915_request *rq = *execlists->active;
2690				const u32 *regs __maybe_unused =
2691					rq->context->lrc_reg_state;
2692
2693				ENGINE_TRACE(engine,
2694					     "context completed before request!\n");
2695				ENGINE_TRACE(engine,
2696					     "ring:{start:0x%08x, head:%04x, tail:%04x, ctl:%08x, mode:%08x}\n",
2697					     ENGINE_READ(engine, RING_START),
2698					     ENGINE_READ(engine, RING_HEAD) & HEAD_ADDR,
2699					     ENGINE_READ(engine, RING_TAIL) & TAIL_ADDR,
2700					     ENGINE_READ(engine, RING_CTL),
2701					     ENGINE_READ(engine, RING_MI_MODE));
2702				ENGINE_TRACE(engine,
2703					     "rq:{start:%08x, head:%04x, tail:%04x, seqno:%llx:%d, hwsp:%d}, ",
2704					     i915_ggtt_offset(rq->ring->vma),
2705					     rq->head, rq->tail,
2706					     rq->fence.context,
2707					     lower_32_bits(rq->fence.seqno),
2708					     hwsp_seqno(rq));
2709				ENGINE_TRACE(engine,
2710					     "ctx:{start:%08x, head:%04x, tail:%04x}, ",
2711					     regs[CTX_RING_START],
2712					     regs[CTX_RING_HEAD],
2713					     regs[CTX_RING_TAIL]);
2714			}
2715
2716			execlists_schedule_out(*execlists->active++);
2717
2718			GEM_BUG_ON(execlists->active - execlists->inflight >
2719				   execlists_num_ports(execlists));
2720		}
2721	} while (head != tail);
2722
2723	set_timeslice(engine);
2724
2725	/*
2726	 * Gen11 has proven to fail wrt global observation point between
2727	 * entry and tail update, failing on the ordering and thus
2728	 * we see an old entry in the context status buffer.
2729	 *
2730	 * Forcibly evict out entries for the next gpu csb update,
2731	 * to increase the odds that we get a fresh entries with non
2732	 * working hardware. The cost for doing so comes out mostly with
2733	 * the wash as hardware, working or not, will need to do the
2734	 * invalidation before.
2735	 */
2736	invalidate_csb_entries(&buf[0], &buf[num_entries - 1]);
2737}
2738
2739static void __execlists_submission_tasklet(struct intel_engine_cs *const engine)
2740{
2741	lockdep_assert_held(&engine->active.lock);
2742	if (!READ_ONCE(engine->execlists.pending[0])) {
2743		rcu_read_lock(); /* protect peeking at execlists->active */
2744		execlists_dequeue(engine);
2745		rcu_read_unlock();
2746	}
2747}
2748
2749static void __execlists_hold(struct i915_request *rq)
2750{
2751	LIST_HEAD(list);
2752
2753	do {
2754		struct i915_dependency *p;
2755
2756		if (i915_request_is_active(rq))
2757			__i915_request_unsubmit(rq);
2758
2759		clear_bit(I915_FENCE_FLAG_PQUEUE, &rq->fence.flags);
2760		list_move_tail(&rq->sched.link, &rq->engine->active.hold);
2761		i915_request_set_hold(rq);
2762		RQ_TRACE(rq, "on hold\n");
2763
2764		for_each_waiter(p, rq) {
2765			struct i915_request *w =
2766				container_of(p->waiter, typeof(*w), sched);
2767
2768			/* Leave semaphores spinning on the other engines */
2769			if (w->engine != rq->engine)
2770				continue;
2771
2772			if (!i915_request_is_ready(w))
2773				continue;
2774
2775			if (i915_request_completed(w))
2776				continue;
2777
2778			if (i915_request_on_hold(w))
2779				continue;
2780
2781			list_move_tail(&w->sched.link, &list);
2782		}
2783
2784		rq = list_first_entry_or_null(&list, typeof(*rq), sched.link);
2785	} while (rq);
2786}
2787
2788static bool execlists_hold(struct intel_engine_cs *engine,
2789			   struct i915_request *rq)
2790{
2791	if (i915_request_on_hold(rq))
2792		return false;
2793
2794	spin_lock_irq(&engine->active.lock);
2795
2796	if (i915_request_completed(rq)) { /* too late! */
2797		rq = NULL;
2798		goto unlock;
2799	}
2800
2801	if (rq->engine != engine) { /* preempted virtual engine */
2802		struct virtual_engine *ve = to_virtual_engine(rq->engine);
2803
2804		/*
2805		 * intel_context_inflight() is only protected by virtue
2806		 * of process_csb() being called only by the tasklet (or
2807		 * directly from inside reset while the tasklet is suspended).
2808		 * Assert that neither of those are allowed to run while we
2809		 * poke at the request queues.
2810		 */
2811		GEM_BUG_ON(!reset_in_progress(&engine->execlists));
2812
2813		/*
2814		 * An unsubmitted request along a virtual engine will
2815		 * remain on the active (this) engine until we are able
2816		 * to process the context switch away (and so mark the
2817		 * context as no longer in flight). That cannot have happened
2818		 * yet, otherwise we would not be hanging!
2819		 */
2820		spin_lock(&ve->base.active.lock);
2821		GEM_BUG_ON(intel_context_inflight(rq->context) != engine);
2822		GEM_BUG_ON(ve->request != rq);
2823		ve->request = NULL;
2824		spin_unlock(&ve->base.active.lock);
2825		i915_request_put(rq);
2826
2827		rq->engine = engine;
2828	}
2829
2830	/*
2831	 * Transfer this request onto the hold queue to prevent it
2832	 * being resumbitted to HW (and potentially completed) before we have
2833	 * released it. Since we may have already submitted following
2834	 * requests, we need to remove those as well.
2835	 */
2836	GEM_BUG_ON(i915_request_on_hold(rq));
2837	GEM_BUG_ON(rq->engine != engine);
2838	__execlists_hold(rq);
2839	GEM_BUG_ON(list_empty(&engine->active.hold));
2840
2841unlock:
2842	spin_unlock_irq(&engine->active.lock);
2843	return rq;
2844}
2845
2846static bool hold_request(const struct i915_request *rq)
2847{
2848	struct i915_dependency *p;
2849	bool result = false;
2850
2851	/*
2852	 * If one of our ancestors is on hold, we must also be on hold,
2853	 * otherwise we will bypass it and execute before it.
2854	 */
2855	rcu_read_lock();
2856	for_each_signaler(p, rq) {
2857		const struct i915_request *s =
2858			container_of(p->signaler, typeof(*s), sched);
2859
2860		if (s->engine != rq->engine)
2861			continue;
2862
2863		result = i915_request_on_hold(s);
2864		if (result)
2865			break;
2866	}
2867	rcu_read_unlock();
2868
2869	return result;
2870}
2871
2872static void __execlists_unhold(struct i915_request *rq)
2873{
2874	LIST_HEAD(list);
2875
2876	do {
2877		struct i915_dependency *p;
2878
2879		RQ_TRACE(rq, "hold release\n");
2880
2881		GEM_BUG_ON(!i915_request_on_hold(rq));
2882		GEM_BUG_ON(!i915_sw_fence_signaled(&rq->submit));
2883
2884		i915_request_clear_hold(rq);
2885		list_move_tail(&rq->sched.link,
2886			       i915_sched_lookup_priolist(rq->engine,
2887							  rq_prio(rq)));
2888		set_bit(I915_FENCE_FLAG_PQUEUE, &rq->fence.flags);
2889
2890		/* Also release any children on this engine that are ready */
2891		for_each_waiter(p, rq) {
2892			struct i915_request *w =
2893				container_of(p->waiter, typeof(*w), sched);
2894
2895			/* Propagate any change in error status */
2896			if (rq->fence.error)
2897				i915_request_set_error_once(w, rq->fence.error);
2898
2899			if (w->engine != rq->engine)
2900				continue;
2901
2902			if (!i915_request_on_hold(w))
2903				continue;
2904
2905			/* Check that no other parents are also on hold */
2906			if (hold_request(w))
2907				continue;
2908
2909			list_move_tail(&w->sched.link, &list);
2910		}
2911
2912		rq = list_first_entry_or_null(&list, typeof(*rq), sched.link);
2913	} while (rq);
2914}
2915
2916static void execlists_unhold(struct intel_engine_cs *engine,
2917			     struct i915_request *rq)
2918{
2919	spin_lock_irq(&engine->active.lock);
2920
2921	/*
2922	 * Move this request back to the priority queue, and all of its
2923	 * children and grandchildren that were suspended along with it.
2924	 */
2925	__execlists_unhold(rq);
2926
2927	if (rq_prio(rq) > engine->execlists.queue_priority_hint) {
2928		engine->execlists.queue_priority_hint = rq_prio(rq);
2929		tasklet_hi_schedule(&engine->execlists.tasklet);
2930	}
2931
2932	spin_unlock_irq(&engine->active.lock);
2933}
2934
2935struct execlists_capture {
2936	struct work_struct work;
2937	struct i915_request *rq;
2938	struct i915_gpu_coredump *error;
2939};
2940
2941static void execlists_capture_work(struct work_struct *work)
2942{
2943	struct execlists_capture *cap = container_of(work, typeof(*cap), work);
2944	const gfp_t gfp = GFP_KERNEL | __GFP_RETRY_MAYFAIL | __GFP_NOWARN;
2945	struct intel_engine_cs *engine = cap->rq->engine;
2946	struct intel_gt_coredump *gt = cap->error->gt;
2947	struct intel_engine_capture_vma *vma;
2948
2949	/* Compress all the objects attached to the request, slow! */
2950	vma = intel_engine_coredump_add_request(gt->engine, cap->rq, gfp);
2951	if (vma) {
2952		struct i915_vma_compress *compress =
2953			i915_vma_capture_prepare(gt);
2954
2955		intel_engine_coredump_add_vma(gt->engine, vma, compress);
2956		i915_vma_capture_finish(gt, compress);
2957	}
2958
2959	gt->simulated = gt->engine->simulated;
2960	cap->error->simulated = gt->simulated;
2961
2962	/* Publish the error state, and announce it to the world */
2963	i915_error_state_store(cap->error);
2964	i915_gpu_coredump_put(cap->error);
2965
2966	/* Return this request and all that depend upon it for signaling */
2967	execlists_unhold(engine, cap->rq);
2968	i915_request_put(cap->rq);
2969
2970	kfree(cap);
2971}
2972
2973static struct execlists_capture *capture_regs(struct intel_engine_cs *engine)
2974{
2975	const gfp_t gfp = GFP_ATOMIC | __GFP_NOWARN;
2976	struct execlists_capture *cap;
2977
2978	cap = kmalloc(sizeof(*cap), gfp);
2979	if (!cap)
2980		return NULL;
2981
2982	cap->error = i915_gpu_coredump_alloc(engine->i915, gfp);
2983	if (!cap->error)
2984		goto err_cap;
2985
2986	cap->error->gt = intel_gt_coredump_alloc(engine->gt, gfp);
2987	if (!cap->error->gt)
2988		goto err_gpu;
2989
2990	cap->error->gt->engine = intel_engine_coredump_alloc(engine, gfp);
2991	if (!cap->error->gt->engine)
2992		goto err_gt;
2993
2994	return cap;
2995
2996err_gt:
2997	kfree(cap->error->gt);
2998err_gpu:
2999	kfree(cap->error);
3000err_cap:
3001	kfree(cap);
3002	return NULL;
3003}
3004
3005static struct i915_request *
3006active_context(struct intel_engine_cs *engine, u32 ccid)
3007{
3008	const struct intel_engine_execlists * const el = &engine->execlists;
3009	struct i915_request * const *port, *rq;
3010
3011	/*
3012	 * Use the most recent result from process_csb(), but just in case
3013	 * we trigger an error (via interrupt) before the first CS event has
3014	 * been written, peek at the next submission.
3015	 */
3016
3017	for (port = el->active; (rq = *port); port++) {
3018		if (rq->context->lrc.ccid == ccid) {
3019			ENGINE_TRACE(engine,
3020				     "ccid found at active:%zd\n",
3021				     port - el->active);
3022			return rq;
3023		}
3024	}
3025
3026	for (port = el->pending; (rq = *port); port++) {
3027		if (rq->context->lrc.ccid == ccid) {
3028			ENGINE_TRACE(engine,
3029				     "ccid found at pending:%zd\n",
3030				     port - el->pending);
3031			return rq;
3032		}
3033	}
3034
3035	ENGINE_TRACE(engine, "ccid:%x not found\n", ccid);
3036	return NULL;
3037}
3038
3039static u32 active_ccid(struct intel_engine_cs *engine)
3040{
3041	return ENGINE_READ_FW(engine, RING_EXECLIST_STATUS_HI);
3042}
3043
3044static void execlists_capture(struct intel_engine_cs *engine)
3045{
3046	struct execlists_capture *cap;
3047
3048	if (!IS_ENABLED(CONFIG_DRM_I915_CAPTURE_ERROR))
3049		return;
3050
3051	/*
3052	 * We need to _quickly_ capture the engine state before we reset.
3053	 * We are inside an atomic section (softirq) here and we are delaying
3054	 * the forced preemption event.
3055	 */
3056	cap = capture_regs(engine);
3057	if (!cap)
3058		return;
3059
3060	spin_lock_irq(&engine->active.lock);
3061	cap->rq = active_context(engine, active_ccid(engine));
3062	if (cap->rq) {
3063		cap->rq = active_request(cap->rq->context->timeline, cap->rq);
3064		cap->rq = i915_request_get_rcu(cap->rq);
3065	}
3066	spin_unlock_irq(&engine->active.lock);
3067	if (!cap->rq)
3068		goto err_free;
3069
3070	/*
3071	 * Remove the request from the execlists queue, and take ownership
3072	 * of the request. We pass it to our worker who will _slowly_ compress
3073	 * all the pages the _user_ requested for debugging their batch, after
3074	 * which we return it to the queue for signaling.
3075	 *
3076	 * By removing them from the execlists queue, we also remove the
3077	 * requests from being processed by __unwind_incomplete_requests()
3078	 * during the intel_engine_reset(), and so they will *not* be replayed
3079	 * afterwards.
3080	 *
3081	 * Note that because we have not yet reset the engine at this point,
3082	 * it is possible for the request that we have identified as being
3083	 * guilty, did in fact complete and we will then hit an arbitration
3084	 * point allowing the outstanding preemption to succeed. The likelihood
3085	 * of that is very low (as capturing of the engine registers should be
3086	 * fast enough to run inside an irq-off atomic section!), so we will
3087	 * simply hold that request accountable for being non-preemptible
3088	 * long enough to force the reset.
3089	 */
3090	if (!execlists_hold(engine, cap->rq))
3091		goto err_rq;
3092
3093	INIT_WORK(&cap->work, execlists_capture_work);
3094	schedule_work(&cap->work);
3095	return;
3096
3097err_rq:
3098	i915_request_put(cap->rq);
3099err_free:
3100	i915_gpu_coredump_put(cap->error);
3101	kfree(cap);
3102}
3103
3104static void execlists_reset(struct intel_engine_cs *engine, const char *msg)
3105{
3106	const unsigned int bit = I915_RESET_ENGINE + engine->id;
3107	unsigned long *lock = &engine->gt->reset.flags;
3108
3109	if (!intel_has_reset_engine(engine->gt))
3110		return;
3111
3112	if (test_and_set_bit(bit, lock))
3113		return;
3114
3115	ENGINE_TRACE(engine, "reset for %s\n", msg);
3116
3117	/* Mark this tasklet as disabled to avoid waiting for it to complete */
3118	tasklet_disable_nosync(&engine->execlists.tasklet);
3119
3120	ring_set_paused(engine, 1); /* Freeze the current request in place */
3121	execlists_capture(engine);
3122	intel_engine_reset(engine, msg);
3123
3124	tasklet_enable(&engine->execlists.tasklet);
3125	clear_and_wake_up_bit(bit, lock);
3126}
3127
3128static bool preempt_timeout(const struct intel_engine_cs *const engine)
3129{
3130	const struct timer_list *t = &engine->execlists.preempt;
3131
3132	if (!CONFIG_DRM_I915_PREEMPT_TIMEOUT)
3133		return false;
3134
3135	if (!timer_expired(t))
3136		return false;
3137
3138	return READ_ONCE(engine->execlists.pending[0]);
3139}
3140
3141/*
3142 * Check the unread Context Status Buffers and manage the submission of new
3143 * contexts to the ELSP accordingly.
3144 */
3145static void execlists_submission_tasklet(unsigned long data)
3146{
3147	struct intel_engine_cs * const engine = (struct intel_engine_cs *)data;
3148	bool timeout = preempt_timeout(engine);
3149
3150	process_csb(engine);
3151
3152	if (unlikely(READ_ONCE(engine->execlists.error_interrupt))) {
3153		const char *msg;
3154
3155		/* Generate the error message in priority wrt to the user! */
3156		if (engine->execlists.error_interrupt & GENMASK(15, 0))
3157			msg = "CS error"; /* thrown by a user payload */
3158		else if (engine->execlists.error_interrupt & ERROR_CSB)
3159			msg = "invalid CSB event";
3160		else
3161			msg = "internal error";
3162
3163		engine->execlists.error_interrupt = 0;
3164		execlists_reset(engine, msg);
3165	}
3166
3167	if (!READ_ONCE(engine->execlists.pending[0]) || timeout) {
3168		unsigned long flags;
3169
3170		spin_lock_irqsave(&engine->active.lock, flags);
3171		__execlists_submission_tasklet(engine);
3172		spin_unlock_irqrestore(&engine->active.lock, flags);
3173
3174		/* Recheck after serialising with direct-submission */
3175		if (unlikely(timeout && preempt_timeout(engine))) {
3176			cancel_timer(&engine->execlists.preempt);
3177			execlists_reset(engine, "preemption time out");
3178		}
3179	}
3180}
3181
3182static void __execlists_kick(struct intel_engine_execlists *execlists)
3183{
3184	/* Kick the tasklet for some interrupt coalescing and reset handling */
3185	tasklet_hi_schedule(&execlists->tasklet);
3186}
3187
3188#define execlists_kick(t, member) \
3189	__execlists_kick(container_of(t, struct intel_engine_execlists, member))
3190
3191static void execlists_timeslice(struct timer_list *timer)
3192{
3193	execlists_kick(timer, timer);
3194}
3195
3196static void execlists_preempt(struct timer_list *timer)
3197{
3198	execlists_kick(timer, preempt);
3199}
3200
3201static void queue_request(struct intel_engine_cs *engine,
3202			  struct i915_request *rq)
3203{
3204	GEM_BUG_ON(!list_empty(&rq->sched.link));
3205	list_add_tail(&rq->sched.link,
3206		      i915_sched_lookup_priolist(engine, rq_prio(rq)));
3207	set_bit(I915_FENCE_FLAG_PQUEUE, &rq->fence.flags);
3208}
3209
3210static void __submit_queue_imm(struct intel_engine_cs *engine)
3211{
3212	struct intel_engine_execlists * const execlists = &engine->execlists;
3213
3214	if (reset_in_progress(execlists))
3215		return; /* defer until we restart the engine following reset */
3216
3217	__execlists_submission_tasklet(engine);
3218}
3219
3220static void submit_queue(struct intel_engine_cs *engine,
3221			 const struct i915_request *rq)
3222{
3223	struct intel_engine_execlists *execlists = &engine->execlists;
3224
3225	if (rq_prio(rq) <= execlists->queue_priority_hint)
3226		return;
3227
3228	execlists->queue_priority_hint = rq_prio(rq);
3229	__submit_queue_imm(engine);
3230}
3231
3232static bool ancestor_on_hold(const struct intel_engine_cs *engine,
3233			     const struct i915_request *rq)
3234{
3235	GEM_BUG_ON(i915_request_on_hold(rq));
3236	return !list_empty(&engine->active.hold) && hold_request(rq);
3237}
3238
3239static void flush_csb(struct intel_engine_cs *engine)
3240{
3241	struct intel_engine_execlists *el = &engine->execlists;
3242
3243	if (READ_ONCE(el->pending[0]) && tasklet_trylock(&el->tasklet)) {
3244		if (!reset_in_progress(el))
3245			process_csb(engine);
3246		tasklet_unlock(&el->tasklet);
3247	}
3248}
3249
3250static void execlists_submit_request(struct i915_request *request)
3251{
3252	struct intel_engine_cs *engine = request->engine;
3253	unsigned long flags;
3254
3255	/* Hopefully we clear execlists->pending[] to let us through */
3256	flush_csb(engine);
3257
3258	/* Will be called from irq-context when using foreign fences. */
3259	spin_lock_irqsave(&engine->active.lock, flags);
3260
3261	if (unlikely(ancestor_on_hold(engine, request))) {
3262		RQ_TRACE(request, "ancestor on hold\n");
3263		list_add_tail(&request->sched.link, &engine->active.hold);
3264		i915_request_set_hold(request);
3265	} else {
3266		queue_request(engine, request);
3267
3268		GEM_BUG_ON(RB_EMPTY_ROOT(&engine->execlists.queue.rb_root));
3269		GEM_BUG_ON(list_empty(&request->sched.link));
3270
3271		submit_queue(engine, request);
3272	}
3273
3274	spin_unlock_irqrestore(&engine->active.lock, flags);
3275}
3276
3277static void __execlists_context_fini(struct intel_context *ce)
3278{
3279	intel_ring_put(ce->ring);
3280	i915_vma_put(ce->state);
3281}
3282
3283static void execlists_context_destroy(struct kref *kref)
3284{
3285	struct intel_context *ce = container_of(kref, typeof(*ce), ref);
3286
3287	GEM_BUG_ON(!i915_active_is_idle(&ce->active));
3288	GEM_BUG_ON(intel_context_is_pinned(ce));
3289
3290	if (ce->state)
3291		__execlists_context_fini(ce);
3292
3293	intel_context_fini(ce);
3294	intel_context_free(ce);
3295}
3296
3297static void
3298set_redzone(void *vaddr, const struct intel_engine_cs *engine)
3299{
3300	if (!IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM))
3301		return;
3302
3303	vaddr += engine->context_size;
3304
3305	memset(vaddr, CONTEXT_REDZONE, I915_GTT_PAGE_SIZE);
3306}
3307
3308static void
3309check_redzone(const void *vaddr, const struct intel_engine_cs *engine)
3310{
3311	if (!IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM))
3312		return;
3313
3314	vaddr += engine->context_size;
3315
3316	if (memchr_inv(vaddr, CONTEXT_REDZONE, I915_GTT_PAGE_SIZE))
3317		drm_err_once(&engine->i915->drm,
3318			     "%s context redzone overwritten!\n",
3319			     engine->name);
3320}
3321
3322static void execlists_context_unpin(struct intel_context *ce)
3323{
3324	check_redzone((void *)ce->lrc_reg_state - LRC_STATE_OFFSET,
3325		      ce->engine);
3326}
3327
3328static void execlists_context_post_unpin(struct intel_context *ce)
3329{
3330	i915_gem_object_unpin_map(ce->state->obj);
3331}
3332
3333static u32 *
3334gen12_emit_timestamp_wa(const struct intel_context *ce, u32 *cs)
3335{
3336	*cs++ = MI_LOAD_REGISTER_MEM_GEN8 |
3337		MI_SRM_LRM_GLOBAL_GTT |
3338		MI_LRI_LRM_CS_MMIO;
3339	*cs++ = i915_mmio_reg_offset(GEN8_RING_CS_GPR(0, 0));
3340	*cs++ = i915_ggtt_offset(ce->state) + LRC_STATE_OFFSET +
3341		CTX_TIMESTAMP * sizeof(u32);
3342	*cs++ = 0;
3343
3344	*cs++ = MI_LOAD_REGISTER_REG |
3345		MI_LRR_SOURCE_CS_MMIO |
3346		MI_LRI_LRM_CS_MMIO;
3347	*cs++ = i915_mmio_reg_offset(GEN8_RING_CS_GPR(0, 0));
3348	*cs++ = i915_mmio_reg_offset(RING_CTX_TIMESTAMP(0));
3349
3350	*cs++ = MI_LOAD_REGISTER_REG |
3351		MI_LRR_SOURCE_CS_MMIO |
3352		MI_LRI_LRM_CS_MMIO;
3353	*cs++ = i915_mmio_reg_offset(GEN8_RING_CS_GPR(0, 0));
3354	*cs++ = i915_mmio_reg_offset(RING_CTX_TIMESTAMP(0));
3355
3356	return cs;
3357}
3358
3359static u32 *
3360gen12_emit_restore_scratch(const struct intel_context *ce, u32 *cs)
3361{
3362	GEM_BUG_ON(lrc_ring_gpr0(ce->engine) == -1);
3363
3364	*cs++ = MI_LOAD_REGISTER_MEM_GEN8 |
3365		MI_SRM_LRM_GLOBAL_GTT |
3366		MI_LRI_LRM_CS_MMIO;
3367	*cs++ = i915_mmio_reg_offset(GEN8_RING_CS_GPR(0, 0));
3368	*cs++ = i915_ggtt_offset(ce->state) + LRC_STATE_OFFSET +
3369		(lrc_ring_gpr0(ce->engine) + 1) * sizeof(u32);
3370	*cs++ = 0;
3371
3372	return cs;
3373}
3374
3375static u32 *
3376gen12_emit_cmd_buf_wa(const struct intel_context *ce, u32 *cs)
3377{
3378	GEM_BUG_ON(lrc_ring_cmd_buf_cctl(ce->engine) == -1);
3379
3380	*cs++ = MI_LOAD_REGISTER_MEM_GEN8 |
3381		MI_SRM_LRM_GLOBAL_GTT |
3382		MI_LRI_LRM_CS_MMIO;
3383	*cs++ = i915_mmio_reg_offset(GEN8_RING_CS_GPR(0, 0));
3384	*cs++ = i915_ggtt_offset(ce->state) + LRC_STATE_OFFSET +
3385		(lrc_ring_cmd_buf_cctl(ce->engine) + 1) * sizeof(u32);
3386	*cs++ = 0;
3387
3388	*cs++ = MI_LOAD_REGISTER_REG |
3389		MI_LRR_SOURCE_CS_MMIO |
3390		MI_LRI_LRM_CS_MMIO;
3391	*cs++ = i915_mmio_reg_offset(GEN8_RING_CS_GPR(0, 0));
3392	*cs++ = i915_mmio_reg_offset(RING_CMD_BUF_CCTL(0));
3393
3394	return cs;
3395}
3396
3397static u32 *
3398gen12_emit_indirect_ctx_rcs(const struct intel_context *ce, u32 *cs)
3399{
3400	cs = gen12_emit_timestamp_wa(ce, cs);
3401	cs = gen12_emit_cmd_buf_wa(ce, cs);
3402	cs = gen12_emit_restore_scratch(ce, cs);
3403
3404	return cs;
3405}
3406
3407static u32 *
3408gen12_emit_indirect_ctx_xcs(const struct intel_context *ce, u32 *cs)
3409{
3410	cs = gen12_emit_timestamp_wa(ce, cs);
3411	cs = gen12_emit_restore_scratch(ce, cs);
3412
3413	return cs;
3414}
3415
3416static inline u32 context_wa_bb_offset(const struct intel_context *ce)
3417{
3418	return PAGE_SIZE * ce->wa_bb_page;
3419}
3420
3421static u32 *context_indirect_bb(const struct intel_context *ce)
3422{
3423	void *ptr;
3424
3425	GEM_BUG_ON(!ce->wa_bb_page);
3426
3427	ptr = ce->lrc_reg_state;
3428	ptr -= LRC_STATE_OFFSET; /* back to start of context image */
3429	ptr += context_wa_bb_offset(ce);
3430
3431	return ptr;
3432}
3433
3434static void
3435setup_indirect_ctx_bb(const struct intel_context *ce,
3436		      const struct intel_engine_cs *engine,
3437		      u32 *(*emit)(const struct intel_context *, u32 *))
3438{
3439	u32 * const start = context_indirect_bb(ce);
3440	u32 *cs;
3441
3442	cs = emit(ce, start);
3443	GEM_BUG_ON(cs - start > I915_GTT_PAGE_SIZE / sizeof(*cs));
3444	while ((unsigned long)cs % CACHELINE_BYTES)
3445		*cs++ = MI_NOOP;
3446
3447	lrc_ring_setup_indirect_ctx(ce->lrc_reg_state, engine,
3448				    i915_ggtt_offset(ce->state) +
3449				    context_wa_bb_offset(ce),
3450				    (cs - start) * sizeof(*cs));
3451}
3452
3453static void
3454__execlists_update_reg_state(const struct intel_context *ce,
3455			     const struct intel_engine_cs *engine,
3456			     u32 head)
3457{
3458	struct intel_ring *ring = ce->ring;
3459	u32 *regs = ce->lrc_reg_state;
3460
3461	GEM_BUG_ON(!intel_ring_offset_valid(ring, head));
3462	GEM_BUG_ON(!intel_ring_offset_valid(ring, ring->tail));
3463
3464	regs[CTX_RING_START] = i915_ggtt_offset(ring->vma);
3465	regs[CTX_RING_HEAD] = head;
3466	regs[CTX_RING_TAIL] = ring->tail;
3467	regs[CTX_RING_CTL] = RING_CTL_SIZE(ring->size) | RING_VALID;
3468
3469	/* RPCS */
3470	if (engine->class == RENDER_CLASS) {
3471		regs[CTX_R_PWR_CLK_STATE] =
3472			intel_sseu_make_rpcs(engine->gt, &ce->sseu);
3473
3474		i915_oa_init_reg_state(ce, engine);
3475	}
3476
3477	if (ce->wa_bb_page) {
3478		u32 *(*fn)(const struct intel_context *ce, u32 *cs);
3479
3480		fn = gen12_emit_indirect_ctx_xcs;
3481		if (ce->engine->class == RENDER_CLASS)
3482			fn = gen12_emit_indirect_ctx_rcs;
3483
3484		/* Mutually exclusive wrt to global indirect bb */
3485		GEM_BUG_ON(engine->wa_ctx.indirect_ctx.size);
3486		setup_indirect_ctx_bb(ce, engine, fn);
3487	}
3488}
3489
3490static int
3491execlists_context_pre_pin(struct intel_context *ce,
3492			  struct i915_gem_ww_ctx *ww, void **vaddr)
3493{
3494	GEM_BUG_ON(!ce->state);
3495	GEM_BUG_ON(!i915_vma_is_pinned(ce->state));
3496
3497	*vaddr = i915_gem_object_pin_map(ce->state->obj,
3498					i915_coherent_map_type(ce->engine->i915) |
3499					I915_MAP_OVERRIDE);
3500
3501	return PTR_ERR_OR_ZERO(*vaddr);
3502}
3503
3504static int
3505__execlists_context_pin(struct intel_context *ce,
3506			struct intel_engine_cs *engine,
3507			void *vaddr)
3508{
3509	ce->lrc.lrca = lrc_descriptor(ce, engine) | CTX_DESC_FORCE_RESTORE;
3510	ce->lrc_reg_state = vaddr + LRC_STATE_OFFSET;
3511	__execlists_update_reg_state(ce, engine, ce->ring->tail);
3512
3513	return 0;
3514}
3515
3516static int execlists_context_pin(struct intel_context *ce, void *vaddr)
3517{
3518	return __execlists_context_pin(ce, ce->engine, vaddr);
3519}
3520
3521static int execlists_context_alloc(struct intel_context *ce)
3522{
3523	return __execlists_context_alloc(ce, ce->engine);
3524}
3525
3526static void execlists_context_reset(struct intel_context *ce)
3527{
3528	CE_TRACE(ce, "reset\n");
3529	GEM_BUG_ON(!intel_context_is_pinned(ce));
3530
3531	intel_ring_reset(ce->ring, ce->ring->emit);
3532
3533	/* Scrub away the garbage */
3534	execlists_init_reg_state(ce->lrc_reg_state,
3535				 ce, ce->engine, ce->ring, true);
3536	__execlists_update_reg_state(ce, ce->engine, ce->ring->tail);
3537
3538	ce->lrc.desc |= CTX_DESC_FORCE_RESTORE;
3539}
3540
3541static const struct intel_context_ops execlists_context_ops = {
3542	.alloc = execlists_context_alloc,
3543
3544	.pre_pin = execlists_context_pre_pin,
3545	.pin = execlists_context_pin,
3546	.unpin = execlists_context_unpin,
3547	.post_unpin = execlists_context_post_unpin,
3548
3549	.enter = intel_context_enter_engine,
3550	.exit = intel_context_exit_engine,
3551
3552	.reset = execlists_context_reset,
3553	.destroy = execlists_context_destroy,
3554};
3555
3556static u32 hwsp_offset(const struct i915_request *rq)
3557{
3558	const struct intel_timeline_cacheline *cl;
3559
3560	/* Before the request is executed, the timeline/cachline is fixed */
3561
3562	cl = rcu_dereference_protected(rq->hwsp_cacheline, 1);
3563	if (cl)
3564		return cl->ggtt_offset;
3565
3566	return rcu_dereference_protected(rq->timeline, 1)->hwsp_offset;
3567}
3568
3569static int gen8_emit_init_breadcrumb(struct i915_request *rq)
3570{
3571	u32 *cs;
3572
3573	GEM_BUG_ON(i915_request_has_initial_breadcrumb(rq));
3574	if (!i915_request_timeline(rq)->has_initial_breadcrumb)
3575		return 0;
3576
3577	cs = intel_ring_begin(rq, 6);
3578	if (IS_ERR(cs))
3579		return PTR_ERR(cs);
3580
3581	/*
3582	 * Check if we have been preempted before we even get started.
3583	 *
3584	 * After this point i915_request_started() reports true, even if
3585	 * we get preempted and so are no longer running.
3586	 */
3587	*cs++ = MI_ARB_CHECK;
3588	*cs++ = MI_NOOP;
3589
3590	*cs++ = MI_STORE_DWORD_IMM_GEN4 | MI_USE_GGTT;
3591	*cs++ = hwsp_offset(rq);
3592	*cs++ = 0;
3593	*cs++ = rq->fence.seqno - 1;
3594
3595	intel_ring_advance(rq, cs);
3596
3597	/* Record the updated position of the request's payload */
3598	rq->infix = intel_ring_offset(rq, cs);
3599
3600	__set_bit(I915_FENCE_FLAG_INITIAL_BREADCRUMB, &rq->fence.flags);
3601
3602	return 0;
3603}
3604
3605static int emit_pdps(struct i915_request *rq)
3606{
3607	const struct intel_engine_cs * const engine = rq->engine;
3608	struct i915_ppgtt * const ppgtt = i915_vm_to_ppgtt(rq->context->vm);
3609	int err, i;
3610	u32 *cs;
3611
3612	GEM_BUG_ON(intel_vgpu_active(rq->engine->i915));
3613
3614	/*
3615	 * Beware ye of the dragons, this sequence is magic!
3616	 *
3617	 * Small changes to this sequence can cause anything from
3618	 * GPU hangs to forcewake errors and machine lockups!
3619	 */
3620
3621	/* Flush any residual operations from the context load */
3622	err = engine->emit_flush(rq, EMIT_FLUSH);
3623	if (err)
3624		return err;
3625
3626	/* Magic required to prevent forcewake errors! */
3627	err = engine->emit_flush(rq, EMIT_INVALIDATE);
3628	if (err)
3629		return err;
3630
3631	cs = intel_ring_begin(rq, 4 * GEN8_3LVL_PDPES + 2);
3632	if (IS_ERR(cs))
3633		return PTR_ERR(cs);
3634
3635	/* Ensure the LRI have landed before we invalidate & continue */
3636	*cs++ = MI_LOAD_REGISTER_IMM(2 * GEN8_3LVL_PDPES) | MI_LRI_FORCE_POSTED;
3637	for (i = GEN8_3LVL_PDPES; i--; ) {
3638		const dma_addr_t pd_daddr = i915_page_dir_dma_addr(ppgtt, i);
3639		u32 base = engine->mmio_base;
3640
3641		*cs++ = i915_mmio_reg_offset(GEN8_RING_PDP_UDW(base, i));
3642		*cs++ = upper_32_bits(pd_daddr);
3643		*cs++ = i915_mmio_reg_offset(GEN8_RING_PDP_LDW(base, i));
3644		*cs++ = lower_32_bits(pd_daddr);
3645	}
3646	*cs++ = MI_NOOP;
3647
3648	intel_ring_advance(rq, cs);
3649
3650	return 0;
3651}
3652
3653static int execlists_request_alloc(struct i915_request *request)
3654{
3655	int ret;
3656
3657	GEM_BUG_ON(!intel_context_is_pinned(request->context));
3658
3659	/*
3660	 * Flush enough space to reduce the likelihood of waiting after
3661	 * we start building the request - in which case we will just
3662	 * have to repeat work.
3663	 */
3664	request->reserved_space += EXECLISTS_REQUEST_SIZE;
3665
3666	/*
3667	 * Note that after this point, we have committed to using
3668	 * this request as it is being used to both track the
3669	 * state of engine initialisation and liveness of the
3670	 * golden renderstate above. Think twice before you try
3671	 * to cancel/unwind this request now.
3672	 */
3673
3674	if (!i915_vm_is_4lvl(request->context->vm)) {
3675		ret = emit_pdps(request);
3676		if (ret)
3677			return ret;
3678	}
3679
3680	/* Unconditionally invalidate GPU caches and TLBs. */
3681	ret = request->engine->emit_flush(request, EMIT_INVALIDATE);
3682	if (ret)
3683		return ret;
3684
3685	request->reserved_space -= EXECLISTS_REQUEST_SIZE;
3686	return 0;
3687}
3688
3689/*
3690 * In this WA we need to set GEN8_L3SQCREG4[21:21] and reset it after
3691 * PIPE_CONTROL instruction. This is required for the flush to happen correctly
3692 * but there is a slight complication as this is applied in WA batch where the
3693 * values are only initialized once so we cannot take register value at the
3694 * beginning and reuse it further; hence we save its value to memory, upload a
3695 * constant value with bit21 set and then we restore it back with the saved value.
3696 * To simplify the WA, a constant value is formed by using the default value
3697 * of this register. This shouldn't be a problem because we are only modifying
3698 * it for a short period and this batch in non-premptible. We can ofcourse
3699 * use additional instructions that read the actual value of the register
3700 * at that time and set our bit of interest but it makes the WA complicated.
3701 *
3702 * This WA is also required for Gen9 so extracting as a function avoids
3703 * code duplication.
3704 */
3705static u32 *
3706gen8_emit_flush_coherentl3_wa(struct intel_engine_cs *engine, u32 *batch)
3707{
3708	/* NB no one else is allowed to scribble over scratch + 256! */
3709	*batch++ = MI_STORE_REGISTER_MEM_GEN8 | MI_SRM_LRM_GLOBAL_GTT;
3710	*batch++ = i915_mmio_reg_offset(GEN8_L3SQCREG4);
3711	*batch++ = intel_gt_scratch_offset(engine->gt,
3712					   INTEL_GT_SCRATCH_FIELD_COHERENTL3_WA);
3713	*batch++ = 0;
3714
3715	*batch++ = MI_LOAD_REGISTER_IMM(1);
3716	*batch++ = i915_mmio_reg_offset(GEN8_L3SQCREG4);
3717	*batch++ = 0x40400000 | GEN8_LQSC_FLUSH_COHERENT_LINES;
3718
3719	batch = gen8_emit_pipe_control(batch,
3720				       PIPE_CONTROL_CS_STALL |
3721				       PIPE_CONTROL_DC_FLUSH_ENABLE,
3722				       0);
3723
3724	*batch++ = MI_LOAD_REGISTER_MEM_GEN8 | MI_SRM_LRM_GLOBAL_GTT;
3725	*batch++ = i915_mmio_reg_offset(GEN8_L3SQCREG4);
3726	*batch++ = intel_gt_scratch_offset(engine->gt,
3727					   INTEL_GT_SCRATCH_FIELD_COHERENTL3_WA);
3728	*batch++ = 0;
3729
3730	return batch;
3731}
3732
3733/*
3734 * Typically we only have one indirect_ctx and per_ctx batch buffer which are
3735 * initialized at the beginning and shared across all contexts but this field
3736 * helps us to have multiple batches at different offsets and select them based
3737 * on a criteria. At the moment this batch always start at the beginning of the page
3738 * and at this point we don't have multiple wa_ctx batch buffers.
3739 *
3740 * The number of WA applied are not known at the beginning; we use this field
3741 * to return the no of DWORDS written.
3742 *
3743 * It is to be noted that this batch does not contain MI_BATCH_BUFFER_END
3744 * so it adds NOOPs as padding to make it cacheline aligned.
3745 * MI_BATCH_BUFFER_END will be added to perctx batch and both of them together
3746 * makes a complete batch buffer.
3747 */
3748static u32 *gen8_init_indirectctx_bb(struct intel_engine_cs *engine, u32 *batch)
3749{
3750	/* WaDisableCtxRestoreArbitration:bdw,chv */
3751	*batch++ = MI_ARB_ON_OFF | MI_ARB_DISABLE;
3752
3753	/* WaFlushCoherentL3CacheLinesAtContextSwitch:bdw */
3754	if (IS_BROADWELL(engine->i915))
3755		batch = gen8_emit_flush_coherentl3_wa(engine, batch);
3756
3757	/* WaClearSlmSpaceAtContextSwitch:bdw,chv */
3758	/* Actual scratch location is at 128 bytes offset */
3759	batch = gen8_emit_pipe_control(batch,
3760				       PIPE_CONTROL_FLUSH_L3 |
3761				       PIPE_CONTROL_STORE_DATA_INDEX |
3762				       PIPE_CONTROL_CS_STALL |
3763				       PIPE_CONTROL_QW_WRITE,
3764				       LRC_PPHWSP_SCRATCH_ADDR);
3765
3766	*batch++ = MI_ARB_ON_OFF | MI_ARB_ENABLE;
3767
3768	/* Pad to end of cacheline */
3769	while ((unsigned long)batch % CACHELINE_BYTES)
3770		*batch++ = MI_NOOP;
3771
3772	/*
3773	 * MI_BATCH_BUFFER_END is not required in Indirect ctx BB because
3774	 * execution depends on the length specified in terms of cache lines
3775	 * in the register CTX_RCS_INDIRECT_CTX
3776	 */
3777
3778	return batch;
3779}
3780
3781struct lri {
3782	i915_reg_t reg;
3783	u32 value;
3784};
3785
3786static u32 *emit_lri(u32 *batch, const struct lri *lri, unsigned int count)
3787{
3788	GEM_BUG_ON(!count || count > 63);
3789
3790	*batch++ = MI_LOAD_REGISTER_IMM(count);
3791	do {
3792		*batch++ = i915_mmio_reg_offset(lri->reg);
3793		*batch++ = lri->value;
3794	} while (lri++, --count);
3795	*batch++ = MI_NOOP;
3796
3797	return batch;
3798}
3799
3800static u32 *gen9_init_indirectctx_bb(struct intel_engine_cs *engine, u32 *batch)
3801{
3802	static const struct lri lri[] = {
3803		/* WaDisableGatherAtSetShaderCommonSlice:skl,bxt,kbl,glk */
3804		{
3805			COMMON_SLICE_CHICKEN2,
3806			__MASKED_FIELD(GEN9_DISABLE_GATHER_AT_SET_SHADER_COMMON_SLICE,
3807				       0),
3808		},
3809
3810		/* BSpec: 11391 */
3811		{
3812			FF_SLICE_CHICKEN,
3813			__MASKED_FIELD(FF_SLICE_CHICKEN_CL_PROVOKING_VERTEX_FIX,
3814				       FF_SLICE_CHICKEN_CL_PROVOKING_VERTEX_FIX),
3815		},
3816
3817		/* BSpec: 11299 */
3818		{
3819			_3D_CHICKEN3,
3820			__MASKED_FIELD(_3D_CHICKEN_SF_PROVOKING_VERTEX_FIX,
3821				       _3D_CHICKEN_SF_PROVOKING_VERTEX_FIX),
3822		}
3823	};
3824
3825	*batch++ = MI_ARB_ON_OFF | MI_ARB_DISABLE;
3826
3827	/* WaFlushCoherentL3CacheLinesAtContextSwitch:skl,bxt,glk */
3828	batch = gen8_emit_flush_coherentl3_wa(engine, batch);
3829
3830	/* WaClearSlmSpaceAtContextSwitch:skl,bxt,kbl,glk,cfl */
3831	batch = gen8_emit_pipe_control(batch,
3832				       PIPE_CONTROL_FLUSH_L3 |
3833				       PIPE_CONTROL_STORE_DATA_INDEX |
3834				       PIPE_CONTROL_CS_STALL |
3835				       PIPE_CONTROL_QW_WRITE,
3836				       LRC_PPHWSP_SCRATCH_ADDR);
3837
3838	batch = emit_lri(batch, lri, ARRAY_SIZE(lri));
3839
3840	/* WaMediaPoolStateCmdInWABB:bxt,glk */
3841	if (HAS_POOLED_EU(engine->i915)) {
3842		/*
3843		 * EU pool configuration is setup along with golden context
3844		 * during context initialization. This value depends on
3845		 * device type (2x6 or 3x6) and needs to be updated based
3846		 * on which subslice is disabled especially for 2x6
3847		 * devices, however it is safe to load default
3848		 * configuration of 3x6 device instead of masking off
3849		 * corresponding bits because HW ignores bits of a disabled
3850		 * subslice and drops down to appropriate config. Please
3851		 * see render_state_setup() in i915_gem_render_state.c for
3852		 * possible configurations, to avoid duplication they are
3853		 * not shown here again.
3854		 */
3855		*batch++ = GEN9_MEDIA_POOL_STATE;
3856		*batch++ = GEN9_MEDIA_POOL_ENABLE;
3857		*batch++ = 0x00777000;
3858		*batch++ = 0;
3859		*batch++ = 0;
3860		*batch++ = 0;
3861	}
3862
3863	*batch++ = MI_ARB_ON_OFF | MI_ARB_ENABLE;
3864
3865	/* Pad to end of cacheline */
3866	while ((unsigned long)batch % CACHELINE_BYTES)
3867		*batch++ = MI_NOOP;
3868
3869	return batch;
3870}
3871
3872static u32 *
3873gen10_init_indirectctx_bb(struct intel_engine_cs *engine, u32 *batch)
3874{
3875	int i;
3876
3877	/*
3878	 * WaPipeControlBefore3DStateSamplePattern: cnl
3879	 *
3880	 * Ensure the engine is idle prior to programming a
3881	 * 3DSTATE_SAMPLE_PATTERN during a context restore.
3882	 */
3883	batch = gen8_emit_pipe_control(batch,
3884				       PIPE_CONTROL_CS_STALL,
3885				       0);
3886	/*
3887	 * WaPipeControlBefore3DStateSamplePattern says we need 4 dwords for
3888	 * the PIPE_CONTROL followed by 12 dwords of 0x0, so 16 dwords in
3889	 * total. However, a PIPE_CONTROL is 6 dwords long, not 4, which is
3890	 * confusing. Since gen8_emit_pipe_control() already advances the
3891	 * batch by 6 dwords, we advance the other 10 here, completing a
3892	 * cacheline. It's not clear if the workaround requires this padding
3893	 * before other commands, or if it's just the regular padding we would
3894	 * already have for the workaround bb, so leave it here for now.
3895	 */
3896	for (i = 0; i < 10; i++)
3897		*batch++ = MI_NOOP;
3898
3899	/* Pad to end of cacheline */
3900	while ((unsigned long)batch % CACHELINE_BYTES)
3901		*batch++ = MI_NOOP;
3902
3903	return batch;
3904}
3905
3906#define CTX_WA_BB_OBJ_SIZE (PAGE_SIZE)
3907
3908static int lrc_setup_wa_ctx(struct intel_engine_cs *engine)
3909{
3910	struct drm_i915_gem_object *obj;
3911	struct i915_vma *vma;
3912	int err;
3913
3914	obj = i915_gem_object_create_shmem(engine->i915, CTX_WA_BB_OBJ_SIZE);
3915	if (IS_ERR(obj))
3916		return PTR_ERR(obj);
3917
3918	vma = i915_vma_instance(obj, &engine->gt->ggtt->vm, NULL);
3919	if (IS_ERR(vma)) {
3920		err = PTR_ERR(vma);
3921		goto err;
3922	}
3923
3924	err = i915_ggtt_pin(vma, NULL, 0, PIN_HIGH);
3925	if (err)
3926		goto err;
3927
3928	engine->wa_ctx.vma = vma;
3929	return 0;
3930
3931err:
3932	i915_gem_object_put(obj);
3933	return err;
3934}
3935
3936static void lrc_destroy_wa_ctx(struct intel_engine_cs *engine)
3937{
3938	i915_vma_unpin_and_release(&engine->wa_ctx.vma, 0);
3939
3940	/* Called on error unwind, clear all flags to prevent further use */
3941	memset(&engine->wa_ctx, 0, sizeof(engine->wa_ctx));
3942}
3943
3944typedef u32 *(*wa_bb_func_t)(struct intel_engine_cs *engine, u32 *batch);
3945
3946static int intel_init_workaround_bb(struct intel_engine_cs *engine)
3947{
3948	struct i915_ctx_workarounds *wa_ctx = &engine->wa_ctx;
3949	struct i915_wa_ctx_bb *wa_bb[2] = { &wa_ctx->indirect_ctx,
3950					    &wa_ctx->per_ctx };
3951	wa_bb_func_t wa_bb_fn[2];
3952	void *batch, *batch_ptr;
3953	unsigned int i;
3954	int ret;
3955
3956	if (engine->class != RENDER_CLASS)
3957		return 0;
3958
3959	switch (INTEL_GEN(engine->i915)) {
3960	case 12:
3961	case 11:
3962		return 0;
3963	case 10:
3964		wa_bb_fn[0] = gen10_init_indirectctx_bb;
3965		wa_bb_fn[1] = NULL;
3966		break;
3967	case 9:
3968		wa_bb_fn[0] = gen9_init_indirectctx_bb;
3969		wa_bb_fn[1] = NULL;
3970		break;
3971	case 8:
3972		wa_bb_fn[0] = gen8_init_indirectctx_bb;
3973		wa_bb_fn[1] = NULL;
3974		break;
3975	default:
3976		MISSING_CASE(INTEL_GEN(engine->i915));
3977		return 0;
3978	}
3979
3980	ret = lrc_setup_wa_ctx(engine);
3981	if (ret) {
3982		drm_dbg(&engine->i915->drm,
3983			"Failed to setup context WA page: %d\n", ret);
3984		return ret;
3985	}
3986
3987	batch = i915_gem_object_pin_map(wa_ctx->vma->obj, I915_MAP_WB);
3988
3989	/*
3990	 * Emit the two workaround batch buffers, recording the offset from the
3991	 * start of the workaround batch buffer object for each and their
3992	 * respective sizes.
3993	 */
3994	batch_ptr = batch;
3995	for (i = 0; i < ARRAY_SIZE(wa_bb_fn); i++) {
3996		wa_bb[i]->offset = batch_ptr - batch;
3997		if (GEM_DEBUG_WARN_ON(!IS_ALIGNED(wa_bb[i]->offset,
3998						  CACHELINE_BYTES))) {
3999			ret = -EINVAL;
4000			break;
4001		}
4002		if (wa_bb_fn[i])
4003			batch_ptr = wa_bb_fn[i](engine, batch_ptr);
4004		wa_bb[i]->size = batch_ptr - (batch + wa_bb[i]->offset);
4005	}
4006	GEM_BUG_ON(batch_ptr - batch > CTX_WA_BB_OBJ_SIZE);
4007
4008	__i915_gem_object_flush_map(wa_ctx->vma->obj, 0, batch_ptr - batch);
4009	__i915_gem_object_release_map(wa_ctx->vma->obj);
4010	if (ret)
4011		lrc_destroy_wa_ctx(engine);
4012
4013	return ret;
4014}
4015
4016static void reset_csb_pointers(struct intel_engine_cs *engine)
4017{
4018	struct intel_engine_execlists * const execlists = &engine->execlists;
4019	const unsigned int reset_value = execlists->csb_size - 1;
4020
4021	ring_set_paused(engine, 0);
4022
4023	/*
4024	 * Sometimes Icelake forgets to reset its pointers on a GPU reset.
4025	 * Bludgeon them with a mmio update to be sure.
4026	 */
4027	ENGINE_WRITE(engine, RING_CONTEXT_STATUS_PTR,
4028		     0xffff << 16 | reset_value << 8 | reset_value);
4029	ENGINE_POSTING_READ(engine, RING_CONTEXT_STATUS_PTR);
4030
4031	/*
4032	 * After a reset, the HW starts writing into CSB entry [0]. We
4033	 * therefore have to set our HEAD pointer back one entry so that
4034	 * the *first* entry we check is entry 0. To complicate this further,
4035	 * as we don't wait for the first interrupt after reset, we have to
4036	 * fake the HW write to point back to the last entry so that our
4037	 * inline comparison of our cached head position against the last HW
4038	 * write works even before the first interrupt.
4039	 */
4040	execlists->csb_head = reset_value;
4041	WRITE_ONCE(*execlists->csb_write, reset_value);
4042	wmb(); /* Make sure this is visible to HW (paranoia?) */
4043
4044	/* Check that the GPU does indeed update the CSB entries! */
4045	memset(execlists->csb_status, -1, (reset_value + 1) * sizeof(u64));
4046	invalidate_csb_entries(&execlists->csb_status[0],
4047			       &execlists->csb_status[reset_value]);
4048
4049	/* Once more for luck and our trusty paranoia */
4050	ENGINE_WRITE(engine, RING_CONTEXT_STATUS_PTR,
4051		     0xffff << 16 | reset_value << 8 | reset_value);
4052	ENGINE_POSTING_READ(engine, RING_CONTEXT_STATUS_PTR);
4053
4054	GEM_BUG_ON(READ_ONCE(*execlists->csb_write) != reset_value);
4055}
4056
4057static void execlists_sanitize(struct intel_engine_cs *engine)
4058{
4059	/*
4060	 * Poison residual state on resume, in case the suspend didn't!
4061	 *
4062	 * We have to assume that across suspend/resume (or other loss
4063	 * of control) that the contents of our pinned buffers has been
4064	 * lost, replaced by garbage. Since this doesn't always happen,
4065	 * let's poison such state so that we more quickly spot when
4066	 * we falsely assume it has been preserved.
4067	 */
4068	if (IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM))
4069		memset(engine->status_page.addr, POISON_INUSE, PAGE_SIZE);
4070
4071	reset_csb_pointers(engine);
4072
4073	/*
4074	 * The kernel_context HWSP is stored in the status_page. As above,
4075	 * that may be lost on resume/initialisation, and so we need to
4076	 * reset the value in the HWSP.
4077	 */
4078	intel_timeline_reset_seqno(engine->kernel_context->timeline);
4079
4080	/* And scrub the dirty cachelines for the HWSP */
4081	clflush_cache_range(engine->status_page.addr, PAGE_SIZE);
4082}
4083
4084static void enable_error_interrupt(struct intel_engine_cs *engine)
4085{
4086	u32 status;
4087
4088	engine->execlists.error_interrupt = 0;
4089	ENGINE_WRITE(engine, RING_EMR, ~0u);
4090	ENGINE_WRITE(engine, RING_EIR, ~0u); /* clear all existing errors */
4091
4092	status = ENGINE_READ(engine, RING_ESR);
4093	if (unlikely(status)) {
4094		drm_err(&engine->i915->drm,
4095			"engine '%s' resumed still in error: %08x\n",
4096			engine->name, status);
4097		__intel_gt_reset(engine->gt, engine->mask);
4098	}
4099
4100	/*
4101	 * On current gen8+, we have 2 signals to play with
4102	 *
4103	 * - I915_ERROR_INSTUCTION (bit 0)
4104	 *
4105	 *    Generate an error if the command parser encounters an invalid
4106	 *    instruction
4107	 *
4108	 *    This is a fatal error.
4109	 *
4110	 * - CP_PRIV (bit 2)
4111	 *
4112	 *    Generate an error on privilege violation (where the CP replaces
4113	 *    the instruction with a no-op). This also fires for writes into
4114	 *    read-only scratch pages.
4115	 *
4116	 *    This is a non-fatal error, parsing continues.
4117	 *
4118	 * * there are a few others defined for odd HW that we do not use
4119	 *
4120	 * Since CP_PRIV fires for cases where we have chosen to ignore the
4121	 * error (as the HW is validating and suppressing the mistakes), we
4122	 * only unmask the instruction error bit.
4123	 */
4124	ENGINE_WRITE(engine, RING_EMR, ~I915_ERROR_INSTRUCTION);
4125}
4126
4127static void enable_execlists(struct intel_engine_cs *engine)
4128{
4129	u32 mode;
4130
4131	assert_forcewakes_active(engine->uncore, FORCEWAKE_ALL);
4132
4133	intel_engine_set_hwsp_writemask(engine, ~0u); /* HWSTAM */
4134
4135	if (INTEL_GEN(engine->i915) >= 11)
4136		mode = _MASKED_BIT_ENABLE(GEN11_GFX_DISABLE_LEGACY_MODE);
4137	else
4138		mode = _MASKED_BIT_ENABLE(GFX_RUN_LIST_ENABLE);
4139	ENGINE_WRITE_FW(engine, RING_MODE_GEN7, mode);
4140
4141	ENGINE_WRITE_FW(engine, RING_MI_MODE, _MASKED_BIT_DISABLE(STOP_RING));
4142
4143	ENGINE_WRITE_FW(engine,
4144			RING_HWS_PGA,
4145			i915_ggtt_offset(engine->status_page.vma));
4146	ENGINE_POSTING_READ(engine, RING_HWS_PGA);
4147
4148	enable_error_interrupt(engine);
4149
4150	engine->context_tag = GENMASK(BITS_PER_LONG - 2, 0);
4151}
4152
4153static bool unexpected_starting_state(struct intel_engine_cs *engine)
4154{
4155	bool unexpected = false;
4156
4157	if (ENGINE_READ_FW(engine, RING_MI_MODE) & STOP_RING) {
4158		drm_dbg(&engine->i915->drm,
4159			"STOP_RING still set in RING_MI_MODE\n");
4160		unexpected = true;
4161	}
4162
4163	return unexpected;
4164}
4165
4166static int execlists_resume(struct intel_engine_cs *engine)
4167{
4168	intel_mocs_init_engine(engine);
4169
4170	intel_breadcrumbs_reset(engine->breadcrumbs);
4171
4172	if (GEM_SHOW_DEBUG() && unexpected_starting_state(engine)) {
4173		struct drm_printer p = drm_debug_printer(__func__);
4174
4175		intel_engine_dump(engine, &p, NULL);
4176	}
4177
4178	enable_execlists(engine);
4179
4180	return 0;
4181}
4182
4183static void execlists_reset_prepare(struct intel_engine_cs *engine)
4184{
4185	struct intel_engine_execlists * const execlists = &engine->execlists;
4186	unsigned long flags;
4187
4188	ENGINE_TRACE(engine, "depth<-%d\n",
4189		     atomic_read(&execlists->tasklet.count));
4190
4191	/*
4192	 * Prevent request submission to the hardware until we have
4193	 * completed the reset in i915_gem_reset_finish(). If a request
4194	 * is completed by one engine, it may then queue a request
4195	 * to a second via its execlists->tasklet *just* as we are
4196	 * calling engine->resume() and also writing the ELSP.
4197	 * Turning off the execlists->tasklet until the reset is over
4198	 * prevents the race.
4199	 */
4200	__tasklet_disable_sync_once(&execlists->tasklet);
4201	GEM_BUG_ON(!reset_in_progress(execlists));
4202
4203	/* And flush any current direct submission. */
4204	spin_lock_irqsave(&engine->active.lock, flags);
4205	spin_unlock_irqrestore(&engine->active.lock, flags);
4206
4207	/*
4208	 * We stop engines, otherwise we might get failed reset and a
4209	 * dead gpu (on elk). Also as modern gpu as kbl can suffer
4210	 * from system hang if batchbuffer is progressing when
4211	 * the reset is issued, regardless of READY_TO_RESET ack.
4212	 * Thus assume it is best to stop engines on all gens
4213	 * where we have a gpu reset.
4214	 *
4215	 * WaKBLVECSSemaphoreWaitPoll:kbl (on ALL_ENGINES)
4216	 *
4217	 * FIXME: Wa for more modern gens needs to be validated
4218	 */
4219	ring_set_paused(engine, 1);
4220	intel_engine_stop_cs(engine);
4221
4222	engine->execlists.reset_ccid = active_ccid(engine);
4223}
4224
4225static void __reset_stop_ring(u32 *regs, const struct intel_engine_cs *engine)
4226{
4227	int x;
4228
4229	x = lrc_ring_mi_mode(engine);
4230	if (x != -1) {
4231		regs[x + 1] &= ~STOP_RING;
4232		regs[x + 1] |= STOP_RING << 16;
4233	}
4234}
4235
4236static void __execlists_reset_reg_state(const struct intel_context *ce,
4237					const struct intel_engine_cs *engine)
4238{
4239	u32 *regs = ce->lrc_reg_state;
4240
4241	__reset_stop_ring(regs, engine);
4242}
4243
4244static void __execlists_reset(struct intel_engine_cs *engine, bool stalled)
4245{
4246	struct intel_engine_execlists * const execlists = &engine->execlists;
4247	struct intel_context *ce;
4248	struct i915_request *rq;
4249	u32 head;
4250
4251	mb(); /* paranoia: read the CSB pointers from after the reset */
4252	clflush(execlists->csb_write);
4253	mb();
4254
4255	process_csb(engine); /* drain preemption events */
4256
4257	/* Following the reset, we need to reload the CSB read/write pointers */
4258	reset_csb_pointers(engine);
4259
4260	/*
4261	 * Save the currently executing context, even if we completed
4262	 * its request, it was still running at the time of the
4263	 * reset and will have been clobbered.
4264	 */
4265	rq = active_context(engine, engine->execlists.reset_ccid);
4266	if (!rq)
4267		goto unwind;
4268
4269	ce = rq->context;
4270	GEM_BUG_ON(!i915_vma_is_pinned(ce->state));
4271
4272	if (i915_request_completed(rq)) {
4273		/* Idle context; tidy up the ring so we can restart afresh */
4274		head = intel_ring_wrap(ce->ring, rq->tail);
4275		goto out_replay;
4276	}
4277
4278	/* We still have requests in-flight; the engine should be active */
4279	GEM_BUG_ON(!intel_engine_pm_is_awake(engine));
4280
4281	/* Context has requests still in-flight; it should not be idle! */
4282	GEM_BUG_ON(i915_active_is_idle(&ce->active));
4283
4284	rq = active_request(ce->timeline, rq);
4285	head = intel_ring_wrap(ce->ring, rq->head);
4286	GEM_BUG_ON(head == ce->ring->tail);
4287
4288	/*
4289	 * If this request hasn't started yet, e.g. it is waiting on a
4290	 * semaphore, we need to avoid skipping the request or else we
4291	 * break the signaling chain. However, if the context is corrupt
4292	 * the request will not restart and we will be stuck with a wedged
4293	 * device. It is quite often the case that if we issue a reset
4294	 * while the GPU is loading the context image, that the context
4295	 * image becomes corrupt.
4296	 *
4297	 * Otherwise, if we have not started yet, the request should replay
4298	 * perfectly and we do not need to flag the result as being erroneous.
4299	 */
4300	if (!i915_request_started(rq))
4301		goto out_replay;
4302
4303	/*
4304	 * If the request was innocent, we leave the request in the ELSP
4305	 * and will try to replay it on restarting. The context image may
4306	 * have been corrupted by the reset, in which case we may have
4307	 * to service a new GPU hang, but more likely we can continue on
4308	 * without impact.
4309	 *
4310	 * If the request was guilty, we presume the context is corrupt
4311	 * and have to at least restore the RING register in the context
4312	 * image back to the expected values to skip over the guilty request.
4313	 */
4314	__i915_request_reset(rq, stalled);
4315
4316	/*
4317	 * We want a simple context + ring to execute the breadcrumb update.
4318	 * We cannot rely on the context being intact across the GPU hang,
4319	 * so clear it and rebuild just what we need for the breadcrumb.
4320	 * All pending requests for this context will be zapped, and any
4321	 * future request will be after userspace has had the opportunity
4322	 * to recreate its own state.
4323	 */
4324out_replay:
4325	ENGINE_TRACE(engine, "replay {head:%04x, tail:%04x}\n",
4326		     head, ce->ring->tail);
4327	__execlists_reset_reg_state(ce, engine);
4328	__execlists_update_reg_state(ce, engine, head);
4329	ce->lrc.desc |= CTX_DESC_FORCE_RESTORE; /* paranoid: GPU was reset! */
4330
4331unwind:
4332	/* Push back any incomplete requests for replay after the reset. */
4333	cancel_port_requests(execlists);
4334	__unwind_incomplete_requests(engine);
4335}
4336
4337static void execlists_reset_rewind(struct intel_engine_cs *engine, bool stalled)
4338{
4339	unsigned long flags;
4340
4341	ENGINE_TRACE(engine, "\n");
4342
4343	spin_lock_irqsave(&engine->active.lock, flags);
4344
4345	__execlists_reset(engine, stalled);
4346
4347	spin_unlock_irqrestore(&engine->active.lock, flags);
4348}
4349
4350static void nop_submission_tasklet(unsigned long data)
4351{
4352	struct intel_engine_cs * const engine = (struct intel_engine_cs *)data;
4353
4354	/* The driver is wedged; don't process any more events. */
4355	WRITE_ONCE(engine->execlists.queue_priority_hint, INT_MIN);
4356}
4357
4358static void execlists_reset_cancel(struct intel_engine_cs *engine)
4359{
4360	struct intel_engine_execlists * const execlists = &engine->execlists;
4361	struct i915_request *rq, *rn;
4362	struct rb_node *rb;
4363	unsigned long flags;
4364
4365	ENGINE_TRACE(engine, "\n");
4366
4367	/*
4368	 * Before we call engine->cancel_requests(), we should have exclusive
4369	 * access to the submission state. This is arranged for us by the
4370	 * caller disabling the interrupt generation, the tasklet and other
4371	 * threads that may then access the same state, giving us a free hand
4372	 * to reset state. However, we still need to let lockdep be aware that
4373	 * we know this state may be accessed in hardirq context, so we
4374	 * disable the irq around this manipulation and we want to keep
4375	 * the spinlock focused on its duties and not accidentally conflate
4376	 * coverage to the submission's irq state. (Similarly, although we
4377	 * shouldn't need to disable irq around the manipulation of the
4378	 * submission's irq state, we also wish to remind ourselves that
4379	 * it is irq state.)
4380	 */
4381	spin_lock_irqsave(&engine->active.lock, flags);
4382
4383	__execlists_reset(engine, true);
4384
4385	/* Mark all executing requests as skipped. */
4386	list_for_each_entry(rq, &engine->active.requests, sched.link)
4387		mark_eio(rq);
4388
4389	/* Flush the queued requests to the timeline list (for retiring). */
4390	while ((rb = rb_first_cached(&execlists->queue))) {
4391		struct i915_priolist *p = to_priolist(rb);
4392		int i;
4393
4394		priolist_for_each_request_consume(rq, rn, p, i) {
4395			mark_eio(rq);
4396			__i915_request_submit(rq);
4397		}
4398
4399		rb_erase_cached(&p->node, &execlists->queue);
4400		i915_priolist_free(p);
4401	}
4402
4403	/* On-hold requests will be flushed to timeline upon their release */
4404	list_for_each_entry(rq, &engine->active.hold, sched.link)
4405		mark_eio(rq);
4406
4407	/* Cancel all attached virtual engines */
4408	while ((rb = rb_first_cached(&execlists->virtual))) {
4409		struct virtual_engine *ve =
4410			rb_entry(rb, typeof(*ve), nodes[engine->id].rb);
4411
4412		rb_erase_cached(rb, &execlists->virtual);
4413		RB_CLEAR_NODE(rb);
4414
4415		spin_lock(&ve->base.active.lock);
4416		rq = fetch_and_zero(&ve->request);
4417		if (rq) {
4418			mark_eio(rq);
4419
4420			rq->engine = engine;
4421			__i915_request_submit(rq);
4422			i915_request_put(rq);
4423
4424			ve->base.execlists.queue_priority_hint = INT_MIN;
4425		}
4426		spin_unlock(&ve->base.active.lock);
4427	}
4428
4429	/* Remaining _unready_ requests will be nop'ed when submitted */
4430
4431	execlists->queue_priority_hint = INT_MIN;
4432	execlists->queue = RB_ROOT_CACHED;
4433
4434	GEM_BUG_ON(__tasklet_is_enabled(&execlists->tasklet));
4435	execlists->tasklet.func = nop_submission_tasklet;
4436
4437	spin_unlock_irqrestore(&engine->active.lock, flags);
4438}
4439
4440static void execlists_reset_finish(struct intel_engine_cs *engine)
4441{
4442	struct intel_engine_execlists * const execlists = &engine->execlists;
4443
4444	/*
4445	 * After a GPU reset, we may have requests to replay. Do so now while
4446	 * we still have the forcewake to be sure that the GPU is not allowed
4447	 * to sleep before we restart and reload a context.
4448	 */
4449	GEM_BUG_ON(!reset_in_progress(execlists));
4450	if (!RB_EMPTY_ROOT(&execlists->queue.rb_root))
4451		execlists->tasklet.func(execlists->tasklet.data);
4452
4453	if (__tasklet_enable(&execlists->tasklet))
4454		/* And kick in case we missed a new request submission. */
4455		tasklet_hi_schedule(&execlists->tasklet);
4456	ENGINE_TRACE(engine, "depth->%d\n",
4457		     atomic_read(&execlists->tasklet.count));
4458}
4459
4460static int gen8_emit_bb_start_noarb(struct i915_request *rq,
4461				    u64 offset, u32 len,
4462				    const unsigned int flags)
4463{
4464	u32 *cs;
4465
4466	cs = intel_ring_begin(rq, 4);
4467	if (IS_ERR(cs))
4468		return PTR_ERR(cs);
4469
4470	/*
4471	 * WaDisableCtxRestoreArbitration:bdw,chv
4472	 *
4473	 * We don't need to perform MI_ARB_ENABLE as often as we do (in
4474	 * particular all the gen that do not need the w/a at all!), if we
4475	 * took care to make sure that on every switch into this context
4476	 * (both ordinary and for preemption) that arbitrartion was enabled
4477	 * we would be fine.  However, for gen8 there is another w/a that
4478	 * requires us to not preempt inside GPGPU execution, so we keep
4479	 * arbitration disabled for gen8 batches. Arbitration will be
4480	 * re-enabled before we close the request
4481	 * (engine->emit_fini_breadcrumb).
4482	 */
4483	*cs++ = MI_ARB_ON_OFF | MI_ARB_DISABLE;
4484
4485	/* FIXME(BDW+): Address space and security selectors. */
4486	*cs++ = MI_BATCH_BUFFER_START_GEN8 |
4487		(flags & I915_DISPATCH_SECURE ? 0 : BIT(8));
4488	*cs++ = lower_32_bits(offset);
4489	*cs++ = upper_32_bits(offset);
4490
4491	intel_ring_advance(rq, cs);
4492
4493	return 0;
4494}
4495
4496static int gen8_emit_bb_start(struct i915_request *rq,
4497			      u64 offset, u32 len,
4498			      const unsigned int flags)
4499{
4500	u32 *cs;
4501
4502	cs = intel_ring_begin(rq, 6);
4503	if (IS_ERR(cs))
4504		return PTR_ERR(cs);
4505
4506	*cs++ = MI_ARB_ON_OFF | MI_ARB_ENABLE;
4507
4508	*cs++ = MI_BATCH_BUFFER_START_GEN8 |
4509		(flags & I915_DISPATCH_SECURE ? 0 : BIT(8));
4510	*cs++ = lower_32_bits(offset);
4511	*cs++ = upper_32_bits(offset);
4512
4513	*cs++ = MI_ARB_ON_OFF | MI_ARB_DISABLE;
4514	*cs++ = MI_NOOP;
4515
4516	intel_ring_advance(rq, cs);
4517
4518	return 0;
4519}
4520
4521static void gen8_logical_ring_enable_irq(struct intel_engine_cs *engine)
4522{
4523	ENGINE_WRITE(engine, RING_IMR,
4524		     ~(engine->irq_enable_mask | engine->irq_keep_mask));
4525	ENGINE_POSTING_READ(engine, RING_IMR);
4526}
4527
4528static void gen8_logical_ring_disable_irq(struct intel_engine_cs *engine)
4529{
4530	ENGINE_WRITE(engine, RING_IMR, ~engine->irq_keep_mask);
4531}
4532
4533static int gen8_emit_flush(struct i915_request *request, u32 mode)
4534{
4535	u32 cmd, *cs;
4536
4537	cs = intel_ring_begin(request, 4);
4538	if (IS_ERR(cs))
4539		return PTR_ERR(cs);
4540
4541	cmd = MI_FLUSH_DW + 1;
4542
4543	/* We always require a command barrier so that subsequent
4544	 * commands, such as breadcrumb interrupts, are strictly ordered
4545	 * wrt the contents of the write cache being flushed to memory
4546	 * (and thus being coherent from the CPU).
4547	 */
4548	cmd |= MI_FLUSH_DW_STORE_INDEX | MI_FLUSH_DW_OP_STOREDW;
4549
4550	if (mode & EMIT_INVALIDATE) {
4551		cmd |= MI_INVALIDATE_TLB;
4552		if (request->engine->class == VIDEO_DECODE_CLASS)
4553			cmd |= MI_INVALIDATE_BSD;
4554	}
4555
4556	*cs++ = cmd;
4557	*cs++ = LRC_PPHWSP_SCRATCH_ADDR;
4558	*cs++ = 0; /* upper addr */
4559	*cs++ = 0; /* value */
4560	intel_ring_advance(request, cs);
4561
4562	return 0;
4563}
4564
4565static int gen8_emit_flush_render(struct i915_request *request,
4566				  u32 mode)
4567{
4568	bool vf_flush_wa = false, dc_flush_wa = false;
4569	u32 *cs, flags = 0;
4570	int len;
4571
4572	flags |= PIPE_CONTROL_CS_STALL;
4573
4574	if (mode & EMIT_FLUSH) {
4575		flags |= PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH;
4576		flags |= PIPE_CONTROL_DEPTH_CACHE_FLUSH;
4577		flags |= PIPE_CONTROL_DC_FLUSH_ENABLE;
4578		flags |= PIPE_CONTROL_FLUSH_ENABLE;
4579	}
4580
4581	if (mode & EMIT_INVALIDATE) {
4582		flags |= PIPE_CONTROL_TLB_INVALIDATE;
4583		flags |= PIPE_CONTROL_INSTRUCTION_CACHE_INVALIDATE;
4584		flags |= PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE;
4585		flags |= PIPE_CONTROL_VF_CACHE_INVALIDATE;
4586		flags |= PIPE_CONTROL_CONST_CACHE_INVALIDATE;
4587		flags |= PIPE_CONTROL_STATE_CACHE_INVALIDATE;
4588		flags |= PIPE_CONTROL_QW_WRITE;
4589		flags |= PIPE_CONTROL_STORE_DATA_INDEX;
4590
4591		/*
4592		 * On GEN9: before VF_CACHE_INVALIDATE we need to emit a NULL
4593		 * pipe control.
4594		 */
4595		if (IS_GEN(request->engine->i915, 9))
4596			vf_flush_wa = true;
4597
4598		/* WaForGAMHang:kbl */
4599		if (IS_KBL_GT_REVID(request->engine->i915, 0, KBL_REVID_B0))
4600			dc_flush_wa = true;
4601	}
4602
4603	len = 6;
4604
4605	if (vf_flush_wa)
4606		len += 6;
4607
4608	if (dc_flush_wa)
4609		len += 12;
4610
4611	cs = intel_ring_begin(request, len);
4612	if (IS_ERR(cs))
4613		return PTR_ERR(cs);
4614
4615	if (vf_flush_wa)
4616		cs = gen8_emit_pipe_control(cs, 0, 0);
4617
4618	if (dc_flush_wa)
4619		cs = gen8_emit_pipe_control(cs, PIPE_CONTROL_DC_FLUSH_ENABLE,
4620					    0);
4621
4622	cs = gen8_emit_pipe_control(cs, flags, LRC_PPHWSP_SCRATCH_ADDR);
4623
4624	if (dc_flush_wa)
4625		cs = gen8_emit_pipe_control(cs, PIPE_CONTROL_CS_STALL, 0);
4626
4627	intel_ring_advance(request, cs);
4628
4629	return 0;
4630}
4631
4632static int gen11_emit_flush_render(struct i915_request *request,
4633				   u32 mode)
4634{
4635	if (mode & EMIT_FLUSH) {
4636		u32 *cs;
4637		u32 flags = 0;
4638
4639		flags |= PIPE_CONTROL_CS_STALL;
4640
4641		flags |= PIPE_CONTROL_TILE_CACHE_FLUSH;
4642		flags |= PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH;
4643		flags |= PIPE_CONTROL_DEPTH_CACHE_FLUSH;
4644		flags |= PIPE_CONTROL_DC_FLUSH_ENABLE;
4645		flags |= PIPE_CONTROL_FLUSH_ENABLE;
4646		flags |= PIPE_CONTROL_QW_WRITE;
4647		flags |= PIPE_CONTROL_STORE_DATA_INDEX;
4648
4649		cs = intel_ring_begin(request, 6);
4650		if (IS_ERR(cs))
4651			return PTR_ERR(cs);
4652
4653		cs = gen8_emit_pipe_control(cs, flags, LRC_PPHWSP_SCRATCH_ADDR);
4654		intel_ring_advance(request, cs);
4655	}
4656
4657	if (mode & EMIT_INVALIDATE) {
4658		u32 *cs;
4659		u32 flags = 0;
4660
4661		flags |= PIPE_CONTROL_CS_STALL;
4662
4663		flags |= PIPE_CONTROL_COMMAND_CACHE_INVALIDATE;
4664		flags |= PIPE_CONTROL_TLB_INVALIDATE;
4665		flags |= PIPE_CONTROL_INSTRUCTION_CACHE_INVALIDATE;
4666		flags |= PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE;
4667		flags |= PIPE_CONTROL_VF_CACHE_INVALIDATE;
4668		flags |= PIPE_CONTROL_CONST_CACHE_INVALIDATE;
4669		flags |= PIPE_CONTROL_STATE_CACHE_INVALIDATE;
4670		flags |= PIPE_CONTROL_QW_WRITE;
4671		flags |= PIPE_CONTROL_STORE_DATA_INDEX;
4672
4673		cs = intel_ring_begin(request, 6);
4674		if (IS_ERR(cs))
4675			return PTR_ERR(cs);
4676
4677		cs = gen8_emit_pipe_control(cs, flags, LRC_PPHWSP_SCRATCH_ADDR);
4678		intel_ring_advance(request, cs);
4679	}
4680
4681	return 0;
4682}
4683
4684static u32 preparser_disable(bool state)
4685{
4686	return MI_ARB_CHECK | 1 << 8 | state;
4687}
4688
4689static i915_reg_t aux_inv_reg(const struct intel_engine_cs *engine)
4690{
4691	static const i915_reg_t vd[] = {
4692		GEN12_VD0_AUX_NV,
4693		GEN12_VD1_AUX_NV,
4694		GEN12_VD2_AUX_NV,
4695		GEN12_VD3_AUX_NV,
4696	};
4697
4698	static const i915_reg_t ve[] = {
4699		GEN12_VE0_AUX_NV,
4700		GEN12_VE1_AUX_NV,
4701	};
4702
4703	if (engine->class == VIDEO_DECODE_CLASS)
4704		return vd[engine->instance];
4705
4706	if (engine->class == VIDEO_ENHANCEMENT_CLASS)
4707		return ve[engine->instance];
4708
4709	GEM_BUG_ON("unknown aux_inv_reg\n");
4710
4711	return INVALID_MMIO_REG;
4712}
4713
4714static u32 *
4715gen12_emit_aux_table_inv(const i915_reg_t inv_reg, u32 *cs)
4716{
4717	*cs++ = MI_LOAD_REGISTER_IMM(1);
4718	*cs++ = i915_mmio_reg_offset(inv_reg);
4719	*cs++ = AUX_INV;
4720	*cs++ = MI_NOOP;
4721
4722	return cs;
4723}
4724
4725static int gen12_emit_flush_render(struct i915_request *request,
4726				   u32 mode)
4727{
4728	if (mode & EMIT_FLUSH) {
4729		u32 flags = 0;
4730		u32 *cs;
4731
4732		flags |= PIPE_CONTROL_TILE_CACHE_FLUSH;
4733		flags |= PIPE_CONTROL_FLUSH_L3;
4734		flags |= PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH;
4735		flags |= PIPE_CONTROL_DEPTH_CACHE_FLUSH;
4736		/* Wa_1409600907:tgl */
4737		flags |= PIPE_CONTROL_DEPTH_STALL;
4738		flags |= PIPE_CONTROL_DC_FLUSH_ENABLE;
4739		flags |= PIPE_CONTROL_FLUSH_ENABLE;
4740
4741		flags |= PIPE_CONTROL_STORE_DATA_INDEX;
4742		flags |= PIPE_CONTROL_QW_WRITE;
4743
4744		flags |= PIPE_CONTROL_CS_STALL;
4745
4746		cs = intel_ring_begin(request, 6);
4747		if (IS_ERR(cs))
4748			return PTR_ERR(cs);
4749
4750		cs = gen12_emit_pipe_control(cs,
4751					     PIPE_CONTROL0_HDC_PIPELINE_FLUSH,
4752					     flags, LRC_PPHWSP_SCRATCH_ADDR);
4753		intel_ring_advance(request, cs);
4754	}
4755
4756	if (mode & EMIT_INVALIDATE) {
4757		u32 flags = 0;
4758		u32 *cs;
4759
4760		flags |= PIPE_CONTROL_COMMAND_CACHE_INVALIDATE;
4761		flags |= PIPE_CONTROL_TLB_INVALIDATE;
4762		flags |= PIPE_CONTROL_INSTRUCTION_CACHE_INVALIDATE;
4763		flags |= PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE;
4764		flags |= PIPE_CONTROL_VF_CACHE_INVALIDATE;
4765		flags |= PIPE_CONTROL_CONST_CACHE_INVALIDATE;
4766		flags |= PIPE_CONTROL_STATE_CACHE_INVALIDATE;
4767
4768		flags |= PIPE_CONTROL_STORE_DATA_INDEX;
4769		flags |= PIPE_CONTROL_QW_WRITE;
4770
4771		flags |= PIPE_CONTROL_CS_STALL;
4772
4773		cs = intel_ring_begin(request, 8 + 4);
4774		if (IS_ERR(cs))
4775			return PTR_ERR(cs);
4776
4777		/*
4778		 * Prevent the pre-parser from skipping past the TLB
4779		 * invalidate and loading a stale page for the batch
4780		 * buffer / request payload.
4781		 */
4782		*cs++ = preparser_disable(true);
4783
4784		cs = gen8_emit_pipe_control(cs, flags, LRC_PPHWSP_SCRATCH_ADDR);
4785
4786		/* hsdes: 1809175790 */
4787		cs = gen12_emit_aux_table_inv(GEN12_GFX_CCS_AUX_NV, cs);
4788
4789		*cs++ = preparser_disable(false);
4790		intel_ring_advance(request, cs);
4791	}
4792
4793	return 0;
4794}
4795
4796static int gen12_emit_flush(struct i915_request *request, u32 mode)
4797{
4798	intel_engine_mask_t aux_inv = 0;
4799	u32 cmd, *cs;
4800
4801	cmd = 4;
4802	if (mode & EMIT_INVALIDATE)
4803		cmd += 2;
4804	if (mode & EMIT_INVALIDATE)
4805		aux_inv = request->engine->mask & ~BIT(BCS0);
4806	if (aux_inv)
4807		cmd += 2 * hweight8(aux_inv) + 2;
4808
4809	cs = intel_ring_begin(request, cmd);
4810	if (IS_ERR(cs))
4811		return PTR_ERR(cs);
4812
4813	if (mode & EMIT_INVALIDATE)
4814		*cs++ = preparser_disable(true);
4815
4816	cmd = MI_FLUSH_DW + 1;
4817
4818	/* We always require a command barrier so that subsequent
4819	 * commands, such as breadcrumb interrupts, are strictly ordered
4820	 * wrt the contents of the write cache being flushed to memory
4821	 * (and thus being coherent from the CPU).
4822	 */
4823	cmd |= MI_FLUSH_DW_STORE_INDEX | MI_FLUSH_DW_OP_STOREDW;
4824
4825	if (mode & EMIT_INVALIDATE) {
4826		cmd |= MI_INVALIDATE_TLB;
4827		if (request->engine->class == VIDEO_DECODE_CLASS)
4828			cmd |= MI_INVALIDATE_BSD;
4829	}
4830
4831	*cs++ = cmd;
4832	*cs++ = LRC_PPHWSP_SCRATCH_ADDR;
4833	*cs++ = 0; /* upper addr */
4834	*cs++ = 0; /* value */
4835
4836	if (aux_inv) { /* hsdes: 1809175790 */
4837		struct intel_engine_cs *engine;
4838		unsigned int tmp;
4839
4840		*cs++ = MI_LOAD_REGISTER_IMM(hweight8(aux_inv));
4841		for_each_engine_masked(engine, request->engine->gt,
4842				       aux_inv, tmp) {
4843			*cs++ = i915_mmio_reg_offset(aux_inv_reg(engine));
4844			*cs++ = AUX_INV;
4845		}
4846		*cs++ = MI_NOOP;
4847	}
4848
4849	if (mode & EMIT_INVALIDATE)
4850		*cs++ = preparser_disable(false);
4851
4852	intel_ring_advance(request, cs);
4853
4854	return 0;
4855}
4856
4857static void assert_request_valid(struct i915_request *rq)
4858{
4859	struct intel_ring *ring __maybe_unused = rq->ring;
4860
4861	/* Can we unwind this request without appearing to go forwards? */
4862	GEM_BUG_ON(intel_ring_direction(ring, rq->wa_tail, rq->head) <= 0);
4863}
4864
4865/*
4866 * Reserve space for 2 NOOPs at the end of each request to be
4867 * used as a workaround for not being allowed to do lite
4868 * restore with HEAD==TAIL (WaIdleLiteRestore).
4869 */
4870static u32 *gen8_emit_wa_tail(struct i915_request *request, u32 *cs)
4871{
4872	/* Ensure there's always at least one preemption point per-request. */
4873	*cs++ = MI_ARB_CHECK;
4874	*cs++ = MI_NOOP;
4875	request->wa_tail = intel_ring_offset(request, cs);
4876
4877	/* Check that entire request is less than half the ring */
4878	assert_request_valid(request);
4879
4880	return cs;
4881}
4882
4883static u32 *emit_preempt_busywait(struct i915_request *request, u32 *cs)
4884{
4885	*cs++ = MI_SEMAPHORE_WAIT |
4886		MI_SEMAPHORE_GLOBAL_GTT |
4887		MI_SEMAPHORE_POLL |
4888		MI_SEMAPHORE_SAD_EQ_SDD;
4889	*cs++ = 0;
4890	*cs++ = intel_hws_preempt_address(request->engine);
4891	*cs++ = 0;
4892
4893	return cs;
4894}
4895
4896static __always_inline u32*
4897gen8_emit_fini_breadcrumb_tail(struct i915_request *request, u32 *cs)
4898{
4899	*cs++ = MI_USER_INTERRUPT;
4900
4901	*cs++ = MI_ARB_ON_OFF | MI_ARB_ENABLE;
4902	if (intel_engine_has_semaphores(request->engine))
4903		cs = emit_preempt_busywait(request, cs);
4904
4905	request->tail = intel_ring_offset(request, cs);
4906	assert_ring_tail_valid(request->ring, request->tail);
4907
4908	return gen8_emit_wa_tail(request, cs);
4909}
4910
4911static u32 *emit_xcs_breadcrumb(struct i915_request *rq, u32 *cs)
4912{
4913	return gen8_emit_ggtt_write(cs, rq->fence.seqno, hwsp_offset(rq), 0);
4914}
4915
4916static u32 *gen8_emit_fini_breadcrumb(struct i915_request *rq, u32 *cs)
4917{
4918	return gen8_emit_fini_breadcrumb_tail(rq, emit_xcs_breadcrumb(rq, cs));
4919}
4920
4921static u32 *gen8_emit_fini_breadcrumb_rcs(struct i915_request *request, u32 *cs)
4922{
4923	cs = gen8_emit_pipe_control(cs,
4924				    PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH |
4925				    PIPE_CONTROL_DEPTH_CACHE_FLUSH |
4926				    PIPE_CONTROL_DC_FLUSH_ENABLE,
4927				    0);
4928
4929	/* XXX flush+write+CS_STALL all in one upsets gem_concurrent_blt:kbl */
4930	cs = gen8_emit_ggtt_write_rcs(cs,
4931				      request->fence.seqno,
4932				      hwsp_offset(request),
4933				      PIPE_CONTROL_FLUSH_ENABLE |
4934				      PIPE_CONTROL_CS_STALL);
4935
4936	return gen8_emit_fini_breadcrumb_tail(request, cs);
4937}
4938
4939static u32 *
4940gen11_emit_fini_breadcrumb_rcs(struct i915_request *request, u32 *cs)
4941{
4942	cs = gen8_emit_ggtt_write_rcs(cs,
4943				      request->fence.seqno,
4944				      hwsp_offset(request),
4945				      PIPE_CONTROL_CS_STALL |
4946				      PIPE_CONTROL_TILE_CACHE_FLUSH |
4947				      PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH |
4948				      PIPE_CONTROL_DEPTH_CACHE_FLUSH |
4949				      PIPE_CONTROL_DC_FLUSH_ENABLE |
4950				      PIPE_CONTROL_FLUSH_ENABLE);
4951
4952	return gen8_emit_fini_breadcrumb_tail(request, cs);
4953}
4954
4955/*
4956 * Note that the CS instruction pre-parser will not stall on the breadcrumb
4957 * flush and will continue pre-fetching the instructions after it before the
4958 * memory sync is completed. On pre-gen12 HW, the pre-parser will stop at
4959 * BB_START/END instructions, so, even though we might pre-fetch the pre-amble
4960 * of the next request before the memory has been flushed, we're guaranteed that
4961 * we won't access the batch itself too early.
4962 * However, on gen12+ the parser can pre-fetch across the BB_START/END commands,
4963 * so, if the current request is modifying an instruction in the next request on
4964 * the same intel_context, we might pre-fetch and then execute the pre-update
4965 * instruction. To avoid this, the users of self-modifying code should either
4966 * disable the parser around the code emitting the memory writes, via a new flag
4967 * added to MI_ARB_CHECK, or emit the writes from a different intel_context. For
4968 * the in-kernel use-cases we've opted to use a separate context, see
4969 * reloc_gpu() as an example.
4970 * All the above applies only to the instructions themselves. Non-inline data
4971 * used by the instructions is not pre-fetched.
4972 */
4973
4974static u32 *gen12_emit_preempt_busywait(struct i915_request *request, u32 *cs)
4975{
4976	*cs++ = MI_SEMAPHORE_WAIT_TOKEN |
4977		MI_SEMAPHORE_GLOBAL_GTT |
4978		MI_SEMAPHORE_POLL |
4979		MI_SEMAPHORE_SAD_EQ_SDD;
4980	*cs++ = 0;
4981	*cs++ = intel_hws_preempt_address(request->engine);
4982	*cs++ = 0;
4983	*cs++ = 0;
4984	*cs++ = MI_NOOP;
4985
4986	return cs;
4987}
4988
4989static __always_inline u32*
4990gen12_emit_fini_breadcrumb_tail(struct i915_request *request, u32 *cs)
4991{
4992	*cs++ = MI_USER_INTERRUPT;
4993
4994	*cs++ = MI_ARB_ON_OFF | MI_ARB_ENABLE;
4995	if (intel_engine_has_semaphores(request->engine))
4996		cs = gen12_emit_preempt_busywait(request, cs);
4997
4998	request->tail = intel_ring_offset(request, cs);
4999	assert_ring_tail_valid(request->ring, request->tail);
5000
5001	return gen8_emit_wa_tail(request, cs);
5002}
5003
5004static u32 *gen12_emit_fini_breadcrumb(struct i915_request *rq, u32 *cs)
5005{
5006	/* XXX Stalling flush before seqno write; post-sync not */
5007	cs = emit_xcs_breadcrumb(rq, __gen8_emit_flush_dw(cs, 0, 0, 0));
5008	return gen12_emit_fini_breadcrumb_tail(rq, cs);
5009}
5010
5011static u32 *
5012gen12_emit_fini_breadcrumb_rcs(struct i915_request *request, u32 *cs)
5013{
5014	cs = gen12_emit_ggtt_write_rcs(cs,
5015				       request->fence.seqno,
5016				       hwsp_offset(request),
5017				       PIPE_CONTROL0_HDC_PIPELINE_FLUSH,
5018				       PIPE_CONTROL_CS_STALL |
5019				       PIPE_CONTROL_TILE_CACHE_FLUSH |
5020				       PIPE_CONTROL_FLUSH_L3 |
5021				       PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH |
5022				       PIPE_CONTROL_DEPTH_CACHE_FLUSH |
5023				       /* Wa_1409600907:tgl */
5024				       PIPE_CONTROL_DEPTH_STALL |
5025				       PIPE_CONTROL_DC_FLUSH_ENABLE |
5026				       PIPE_CONTROL_FLUSH_ENABLE);
5027
5028	return gen12_emit_fini_breadcrumb_tail(request, cs);
5029}
5030
5031static void execlists_park(struct intel_engine_cs *engine)
5032{
5033	cancel_timer(&engine->execlists.timer);
5034	cancel_timer(&engine->execlists.preempt);
5035}
5036
5037void intel_execlists_set_default_submission(struct intel_engine_cs *engine)
5038{
5039	engine->submit_request = execlists_submit_request;
5040	engine->schedule = i915_schedule;
5041	engine->execlists.tasklet.func = execlists_submission_tasklet;
5042
5043	engine->reset.prepare = execlists_reset_prepare;
5044	engine->reset.rewind = execlists_reset_rewind;
5045	engine->reset.cancel = execlists_reset_cancel;
5046	engine->reset.finish = execlists_reset_finish;
5047
5048	engine->park = execlists_park;
5049	engine->unpark = NULL;
5050
5051	engine->flags |= I915_ENGINE_SUPPORTS_STATS;
5052	if (!intel_vgpu_active(engine->i915)) {
5053		engine->flags |= I915_ENGINE_HAS_SEMAPHORES;
5054		if (HAS_LOGICAL_RING_PREEMPTION(engine->i915)) {
5055			engine->flags |= I915_ENGINE_HAS_PREEMPTION;
5056			if (IS_ACTIVE(CONFIG_DRM_I915_TIMESLICE_DURATION))
5057				engine->flags |= I915_ENGINE_HAS_TIMESLICES;
5058		}
5059	}
5060
5061	if (INTEL_GEN(engine->i915) >= 12)
5062		engine->flags |= I915_ENGINE_HAS_RELATIVE_MMIO;
5063
5064	if (intel_engine_has_preemption(engine))
5065		engine->emit_bb_start = gen8_emit_bb_start;
5066	else
5067		engine->emit_bb_start = gen8_emit_bb_start_noarb;
5068}
5069
5070static void execlists_shutdown(struct intel_engine_cs *engine)
5071{
5072	/* Synchronise with residual timers and any softirq they raise */
5073	del_timer_sync(&engine->execlists.timer);
5074	del_timer_sync(&engine->execlists.preempt);
5075	tasklet_kill(&engine->execlists.tasklet);
5076}
5077
5078static void execlists_release(struct intel_engine_cs *engine)
5079{
5080	engine->sanitize = NULL; /* no longer in control, nothing to sanitize */
5081
5082	execlists_shutdown(engine);
5083
5084	intel_engine_cleanup_common(engine);
5085	lrc_destroy_wa_ctx(engine);
5086}
5087
5088static void
5089logical_ring_default_vfuncs(struct intel_engine_cs *engine)
5090{
5091	/* Default vfuncs which can be overriden by each engine. */
5092
5093	engine->resume = execlists_resume;
5094
5095	engine->cops = &execlists_context_ops;
5096	engine->request_alloc = execlists_request_alloc;
5097
5098	engine->emit_flush = gen8_emit_flush;
5099	engine->emit_init_breadcrumb = gen8_emit_init_breadcrumb;
5100	engine->emit_fini_breadcrumb = gen8_emit_fini_breadcrumb;
5101	if (INTEL_GEN(engine->i915) >= 12) {
5102		engine->emit_fini_breadcrumb = gen12_emit_fini_breadcrumb;
5103		engine->emit_flush = gen12_emit_flush;
5104	}
5105	engine->set_default_submission = intel_execlists_set_default_submission;
5106
5107	if (INTEL_GEN(engine->i915) < 11) {
5108		engine->irq_enable = gen8_logical_ring_enable_irq;
5109		engine->irq_disable = gen8_logical_ring_disable_irq;
5110	} else {
5111		/*
5112		 * TODO: On Gen11 interrupt masks need to be clear
5113		 * to allow C6 entry. Keep interrupts enabled at
5114		 * and take the hit of generating extra interrupts
5115		 * until a more refined solution exists.
5116		 */
5117	}
5118}
5119
5120static inline void
5121logical_ring_default_irqs(struct intel_engine_cs *engine)
5122{
5123	unsigned int shift = 0;
5124
5125	if (INTEL_GEN(engine->i915) < 11) {
5126		const u8 irq_shifts[] = {
5127			[RCS0]  = GEN8_RCS_IRQ_SHIFT,
5128			[BCS0]  = GEN8_BCS_IRQ_SHIFT,
5129			[VCS0]  = GEN8_VCS0_IRQ_SHIFT,
5130			[VCS1]  = GEN8_VCS1_IRQ_SHIFT,
5131			[VECS0] = GEN8_VECS_IRQ_SHIFT,
5132		};
5133
5134		shift = irq_shifts[engine->id];
5135	}
5136
5137	engine->irq_enable_mask = GT_RENDER_USER_INTERRUPT << shift;
5138	engine->irq_keep_mask = GT_CONTEXT_SWITCH_INTERRUPT << shift;
5139	engine->irq_keep_mask |= GT_CS_MASTER_ERROR_INTERRUPT << shift;
5140	engine->irq_keep_mask |= GT_WAIT_SEMAPHORE_INTERRUPT << shift;
5141}
5142
5143static void rcs_submission_override(struct intel_engine_cs *engine)
5144{
5145	switch (INTEL_GEN(engine->i915)) {
5146	case 12:
5147		engine->emit_flush = gen12_emit_flush_render;
5148		engine->emit_fini_breadcrumb = gen12_emit_fini_breadcrumb_rcs;
5149		break;
5150	case 11:
5151		engine->emit_flush = gen11_emit_flush_render;
5152		engine->emit_fini_breadcrumb = gen11_emit_fini_breadcrumb_rcs;
5153		break;
5154	default:
5155		engine->emit_flush = gen8_emit_flush_render;
5156		engine->emit_fini_breadcrumb = gen8_emit_fini_breadcrumb_rcs;
5157		break;
5158	}
5159}
5160
5161int intel_execlists_submission_setup(struct intel_engine_cs *engine)
5162{
5163	struct intel_engine_execlists * const execlists = &engine->execlists;
5164	struct drm_i915_private *i915 = engine->i915;
5165	struct intel_uncore *uncore = engine->uncore;
5166	u32 base = engine->mmio_base;
5167
5168	tasklet_init(&engine->execlists.tasklet,
5169		     execlists_submission_tasklet, (unsigned long)engine);
5170	timer_setup(&engine->execlists.timer, execlists_timeslice, 0);
5171	timer_setup(&engine->execlists.preempt, execlists_preempt, 0);
5172
5173	logical_ring_default_vfuncs(engine);
5174	logical_ring_default_irqs(engine);
5175
5176	if (engine->class == RENDER_CLASS)
5177		rcs_submission_override(engine);
5178
5179	if (intel_init_workaround_bb(engine))
5180		/*
5181		 * We continue even if we fail to initialize WA batch
5182		 * because we only expect rare glitches but nothing
5183		 * critical to prevent us from using GPU
5184		 */
5185		drm_err(&i915->drm, "WA batch buffer initialization failed\n");
5186
5187	if (HAS_LOGICAL_RING_ELSQ(i915)) {
5188		execlists->submit_reg = uncore->regs +
5189			i915_mmio_reg_offset(RING_EXECLIST_SQ_CONTENTS(base));
5190		execlists->ctrl_reg = uncore->regs +
5191			i915_mmio_reg_offset(RING_EXECLIST_CONTROL(base));
5192	} else {
5193		execlists->submit_reg = uncore->regs +
5194			i915_mmio_reg_offset(RING_ELSP(base));
5195	}
5196
5197	execlists->csb_status =
5198		(u64 *)&engine->status_page.addr[I915_HWS_CSB_BUF0_INDEX];
5199
5200	execlists->csb_write =
5201		&engine->status_page.addr[intel_hws_csb_write_index(i915)];
5202
5203	if (INTEL_GEN(i915) < 11)
5204		execlists->csb_size = GEN8_CSB_ENTRIES;
5205	else
5206		execlists->csb_size = GEN11_CSB_ENTRIES;
5207
5208	if (INTEL_GEN(engine->i915) >= 11) {
5209		execlists->ccid |= engine->instance << (GEN11_ENGINE_INSTANCE_SHIFT - 32);
5210		execlists->ccid |= engine->class << (GEN11_ENGINE_CLASS_SHIFT - 32);
5211	}
5212
5213	/* Finally, take ownership and responsibility for cleanup! */
5214	engine->sanitize = execlists_sanitize;
5215	engine->release = execlists_release;
5216
5217	return 0;
5218}
5219
5220static void init_common_reg_state(u32 * const regs,
5221				  const struct intel_engine_cs *engine,
5222				  const struct intel_ring *ring,
5223				  bool inhibit)
5224{
5225	u32 ctl;
5226
5227	ctl = _MASKED_BIT_ENABLE(CTX_CTRL_INHIBIT_SYN_CTX_SWITCH);
5228	ctl |= _MASKED_BIT_DISABLE(CTX_CTRL_ENGINE_CTX_RESTORE_INHIBIT);
5229	if (inhibit)
5230		ctl |= CTX_CTRL_ENGINE_CTX_RESTORE_INHIBIT;
5231	if (INTEL_GEN(engine->i915) < 11)
5232		ctl |= _MASKED_BIT_DISABLE(CTX_CTRL_ENGINE_CTX_SAVE_INHIBIT |
5233					   CTX_CTRL_RS_CTX_ENABLE);
5234	regs[CTX_CONTEXT_CONTROL] = ctl;
5235
5236	regs[CTX_RING_CTL] = RING_CTL_SIZE(ring->size) | RING_VALID;
5237	regs[CTX_TIMESTAMP] = 0;
5238}
5239
5240static void init_wa_bb_reg_state(u32 * const regs,
5241				 const struct intel_engine_cs *engine)
5242{
5243	const struct i915_ctx_workarounds * const wa_ctx = &engine->wa_ctx;
5244
5245	if (wa_ctx->per_ctx.size) {
5246		const u32 ggtt_offset = i915_ggtt_offset(wa_ctx->vma);
5247
5248		GEM_BUG_ON(lrc_ring_wa_bb_per_ctx(engine) == -1);
5249		regs[lrc_ring_wa_bb_per_ctx(engine) + 1] =
5250			(ggtt_offset + wa_ctx->per_ctx.offset) | 0x01;
5251	}
5252
5253	if (wa_ctx->indirect_ctx.size) {
5254		lrc_ring_setup_indirect_ctx(regs, engine,
5255					    i915_ggtt_offset(wa_ctx->vma) +
5256					    wa_ctx->indirect_ctx.offset,
5257					    wa_ctx->indirect_ctx.size);
5258	}
5259}
5260
5261static void init_ppgtt_reg_state(u32 *regs, const struct i915_ppgtt *ppgtt)
5262{
5263	if (i915_vm_is_4lvl(&ppgtt->vm)) {
5264		/* 64b PPGTT (48bit canonical)
5265		 * PDP0_DESCRIPTOR contains the base address to PML4 and
5266		 * other PDP Descriptors are ignored.
5267		 */
5268		ASSIGN_CTX_PML4(ppgtt, regs);
5269	} else {
5270		ASSIGN_CTX_PDP(ppgtt, regs, 3);
5271		ASSIGN_CTX_PDP(ppgtt, regs, 2);
5272		ASSIGN_CTX_PDP(ppgtt, regs, 1);
5273		ASSIGN_CTX_PDP(ppgtt, regs, 0);
5274	}
5275}
5276
5277static struct i915_ppgtt *vm_alias(struct i915_address_space *vm)
5278{
5279	if (i915_is_ggtt(vm))
5280		return i915_vm_to_ggtt(vm)->alias;
5281	else
5282		return i915_vm_to_ppgtt(vm);
5283}
5284
5285static void execlists_init_reg_state(u32 *regs,
5286				     const struct intel_context *ce,
5287				     const struct intel_engine_cs *engine,
5288				     const struct intel_ring *ring,
5289				     bool inhibit)
5290{
5291	/*
5292	 * A context is actually a big batch buffer with several
5293	 * MI_LOAD_REGISTER_IMM commands followed by (reg, value) pairs. The
5294	 * values we are setting here are only for the first context restore:
5295	 * on a subsequent save, the GPU will recreate this batchbuffer with new
5296	 * values (including all the missing MI_LOAD_REGISTER_IMM commands that
5297	 * we are not initializing here).
5298	 *
5299	 * Must keep consistent with virtual_update_register_offsets().
5300	 */
5301	set_offsets(regs, reg_offsets(engine), engine, inhibit);
5302
5303	init_common_reg_state(regs, engine, ring, inhibit);
5304	init_ppgtt_reg_state(regs, vm_alias(ce->vm));
5305
5306	init_wa_bb_reg_state(regs, engine);
5307
5308	__reset_stop_ring(regs, engine);
5309}
5310
5311static int
5312populate_lr_context(struct intel_context *ce,
5313		    struct drm_i915_gem_object *ctx_obj,
5314		    struct intel_engine_cs *engine,
5315		    struct intel_ring *ring)
5316{
5317	bool inhibit = true;
5318	void *vaddr;
5319
5320	vaddr = i915_gem_object_pin_map(ctx_obj, I915_MAP_WB);
5321	if (IS_ERR(vaddr)) {
5322		drm_dbg(&engine->i915->drm, "Could not map object pages!\n");
5323		return PTR_ERR(vaddr);
5324	}
5325
5326	set_redzone(vaddr, engine);
5327
5328	if (engine->default_state) {
5329		shmem_read(engine->default_state, 0,
5330			   vaddr, engine->context_size);
5331		__set_bit(CONTEXT_VALID_BIT, &ce->flags);
5332		inhibit = false;
5333	}
5334
5335	/* Clear the ppHWSP (inc. per-context counters) */
5336	memset(vaddr, 0, PAGE_SIZE);
5337
5338	/*
5339	 * The second page of the context object contains some registers which
5340	 * must be set up prior to the first execution.
5341	 */
5342	execlists_init_reg_state(vaddr + LRC_STATE_OFFSET,
5343				 ce, engine, ring, inhibit);
5344
5345	__i915_gem_object_flush_map(ctx_obj, 0, engine->context_size);
5346	i915_gem_object_unpin_map(ctx_obj);
5347	return 0;
5348}
5349
5350static struct intel_timeline *pinned_timeline(struct intel_context *ce)
5351{
5352	struct intel_timeline *tl = fetch_and_zero(&ce->timeline);
5353
5354	return intel_timeline_create_from_engine(ce->engine,
5355						 page_unmask_bits(tl));
5356}
5357
5358static int __execlists_context_alloc(struct intel_context *ce,
5359				     struct intel_engine_cs *engine)
5360{
5361	struct drm_i915_gem_object *ctx_obj;
5362	struct intel_ring *ring;
5363	struct i915_vma *vma;
5364	u32 context_size;
5365	int ret;
5366
5367	GEM_BUG_ON(ce->state);
5368	context_size = round_up(engine->context_size, I915_GTT_PAGE_SIZE);
5369
5370	if (IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM))
5371		context_size += I915_GTT_PAGE_SIZE; /* for redzone */
5372
5373	if (INTEL_GEN(engine->i915) == 12) {
5374		ce->wa_bb_page = context_size / PAGE_SIZE;
5375		context_size += PAGE_SIZE;
5376	}
5377
5378	ctx_obj = i915_gem_object_create_shmem(engine->i915, context_size);
5379	if (IS_ERR(ctx_obj))
5380		return PTR_ERR(ctx_obj);
5381
5382	vma = i915_vma_instance(ctx_obj, &engine->gt->ggtt->vm, NULL);
5383	if (IS_ERR(vma)) {
5384		ret = PTR_ERR(vma);
5385		goto error_deref_obj;
5386	}
5387
5388	if (!page_mask_bits(ce->timeline)) {
5389		struct intel_timeline *tl;
5390
5391		/*
5392		 * Use the static global HWSP for the kernel context, and
5393		 * a dynamically allocated cacheline for everyone else.
5394		 */
5395		if (unlikely(ce->timeline))
5396			tl = pinned_timeline(ce);
5397		else
5398			tl = intel_timeline_create(engine->gt);
5399		if (IS_ERR(tl)) {
5400			ret = PTR_ERR(tl);
5401			goto error_deref_obj;
5402		}
5403
5404		ce->timeline = tl;
5405	}
5406
5407	ring = intel_engine_create_ring(engine, (unsigned long)ce->ring);
5408	if (IS_ERR(ring)) {
5409		ret = PTR_ERR(ring);
5410		goto error_deref_obj;
5411	}
5412
5413	ret = populate_lr_context(ce, ctx_obj, engine, ring);
5414	if (ret) {
5415		drm_dbg(&engine->i915->drm,
5416			"Failed to populate LRC: %d\n", ret);
5417		goto error_ring_free;
5418	}
5419
5420	ce->ring = ring;
5421	ce->state = vma;
5422
5423	return 0;
5424
5425error_ring_free:
5426	intel_ring_put(ring);
5427error_deref_obj:
5428	i915_gem_object_put(ctx_obj);
5429	return ret;
5430}
5431
5432static struct list_head *virtual_queue(struct virtual_engine *ve)
5433{
5434	return &ve->base.execlists.default_priolist.requests[0];
5435}
5436
5437static void rcu_virtual_context_destroy(struct work_struct *wrk)
5438{
5439	struct virtual_engine *ve =
5440		container_of(wrk, typeof(*ve), rcu.work);
5441	unsigned int n;
5442
5443	GEM_BUG_ON(ve->context.inflight);
5444
5445	/* Preempt-to-busy may leave a stale request behind. */
5446	if (unlikely(ve->request)) {
5447		struct i915_request *old;
5448
5449		spin_lock_irq(&ve->base.active.lock);
5450
5451		old = fetch_and_zero(&ve->request);
5452		if (old) {
5453			GEM_BUG_ON(!i915_request_completed(old));
5454			__i915_request_submit(old);
5455			i915_request_put(old);
5456		}
5457
5458		spin_unlock_irq(&ve->base.active.lock);
5459	}
5460
5461	/*
5462	 * Flush the tasklet in case it is still running on another core.
5463	 *
5464	 * This needs to be done before we remove ourselves from the siblings'
5465	 * rbtrees as in the case it is running in parallel, it may reinsert
5466	 * the rb_node into a sibling.
5467	 */
5468	tasklet_kill(&ve->base.execlists.tasklet);
5469
5470	/* Decouple ourselves from the siblings, no more access allowed. */
5471	for (n = 0; n < ve->num_siblings; n++) {
5472		struct intel_engine_cs *sibling = ve->siblings[n];
5473		struct rb_node *node = &ve->nodes[sibling->id].rb;
5474
5475		if (RB_EMPTY_NODE(node))
5476			continue;
5477
5478		spin_lock_irq(&sibling->active.lock);
5479
5480		/* Detachment is lazily performed in the execlists tasklet */
5481		if (!RB_EMPTY_NODE(node))
5482			rb_erase_cached(node, &sibling->execlists.virtual);
5483
5484		spin_unlock_irq(&sibling->active.lock);
5485	}
5486	GEM_BUG_ON(__tasklet_is_scheduled(&ve->base.execlists.tasklet));
5487	GEM_BUG_ON(!list_empty(virtual_queue(ve)));
5488
5489	if (ve->context.state)
5490		__execlists_context_fini(&ve->context);
5491	intel_context_fini(&ve->context);
5492
5493	intel_breadcrumbs_free(ve->base.breadcrumbs);
5494	intel_engine_free_request_pool(&ve->base);
5495
5496	kfree(ve->bonds);
5497	kfree(ve);
5498}
5499
5500static void virtual_context_destroy(struct kref *kref)
5501{
5502	struct virtual_engine *ve =
5503		container_of(kref, typeof(*ve), context.ref);
5504
5505	GEM_BUG_ON(!list_empty(&ve->context.signals));
5506
5507	/*
5508	 * When destroying the virtual engine, we have to be aware that
5509	 * it may still be in use from an hardirq/softirq context causing
5510	 * the resubmission of a completed request (background completion
5511	 * due to preempt-to-busy). Before we can free the engine, we need
5512	 * to flush the submission code and tasklets that are still potentially
5513	 * accessing the engine. Flushing the tasklets requires process context,
5514	 * and since we can guard the resubmit onto the engine with an RCU read
5515	 * lock, we can delegate the free of the engine to an RCU worker.
5516	 */
5517	INIT_RCU_WORK(&ve->rcu, rcu_virtual_context_destroy);
5518	queue_rcu_work(system_wq, &ve->rcu);
5519}
5520
5521static void virtual_engine_initial_hint(struct virtual_engine *ve)
5522{
5523	int swp;
5524
5525	/*
5526	 * Pick a random sibling on starting to help spread the load around.
5527	 *
5528	 * New contexts are typically created with exactly the same order
5529	 * of siblings, and often started in batches. Due to the way we iterate
5530	 * the array of sibling when submitting requests, sibling[0] is
5531	 * prioritised for dequeuing. If we make sure that sibling[0] is fairly
5532	 * randomised across the system, we also help spread the load by the
5533	 * first engine we inspect being different each time.
5534	 *
5535	 * NB This does not force us to execute on this engine, it will just
5536	 * typically be the first we inspect for submission.
5537	 */
5538	swp = prandom_u32_max(ve->num_siblings);
5539	if (swp)
5540		swap(ve->siblings[swp], ve->siblings[0]);
5541}
5542
5543static int virtual_context_alloc(struct intel_context *ce)
5544{
5545	struct virtual_engine *ve = container_of(ce, typeof(*ve), context);
5546
5547	return __execlists_context_alloc(ce, ve->siblings[0]);
5548}
5549
5550static int virtual_context_pin(struct intel_context *ce, void *vaddr)
5551{
5552	struct virtual_engine *ve = container_of(ce, typeof(*ve), context);
5553
5554	/* Note: we must use a real engine class for setting up reg state */
5555	return __execlists_context_pin(ce, ve->siblings[0], vaddr);
5556}
5557
5558static void virtual_context_enter(struct intel_context *ce)
5559{
5560	struct virtual_engine *ve = container_of(ce, typeof(*ve), context);
5561	unsigned int n;
5562
5563	for (n = 0; n < ve->num_siblings; n++)
5564		intel_engine_pm_get(ve->siblings[n]);
5565
5566	intel_timeline_enter(ce->timeline);
5567}
5568
5569static void virtual_context_exit(struct intel_context *ce)
5570{
5571	struct virtual_engine *ve = container_of(ce, typeof(*ve), context);
5572	unsigned int n;
5573
5574	intel_timeline_exit(ce->timeline);
5575
5576	for (n = 0; n < ve->num_siblings; n++)
5577		intel_engine_pm_put(ve->siblings[n]);
5578}
5579
5580static const struct intel_context_ops virtual_context_ops = {
5581	.alloc = virtual_context_alloc,
5582
5583	.pre_pin = execlists_context_pre_pin,
5584	.pin = virtual_context_pin,
5585	.unpin = execlists_context_unpin,
5586	.post_unpin = execlists_context_post_unpin,
5587
5588	.enter = virtual_context_enter,
5589	.exit = virtual_context_exit,
5590
5591	.destroy = virtual_context_destroy,
5592};
5593
5594static intel_engine_mask_t virtual_submission_mask(struct virtual_engine *ve)
5595{
5596	struct i915_request *rq;
5597	intel_engine_mask_t mask;
5598
5599	rq = READ_ONCE(ve->request);
5600	if (!rq)
5601		return 0;
5602
5603	/* The rq is ready for submission; rq->execution_mask is now stable. */
5604	mask = rq->execution_mask;
5605	if (unlikely(!mask)) {
5606		/* Invalid selection, submit to a random engine in error */
5607		i915_request_set_error_once(rq, -ENODEV);
5608		mask = ve->siblings[0]->mask;
5609	}
5610
5611	ENGINE_TRACE(&ve->base, "rq=%llx:%lld, mask=%x, prio=%d\n",
5612		     rq->fence.context, rq->fence.seqno,
5613		     mask, ve->base.execlists.queue_priority_hint);
5614
5615	return mask;
5616}
5617
5618static void virtual_submission_tasklet(unsigned long data)
5619{
5620	struct virtual_engine * const ve = (struct virtual_engine *)data;
5621	const int prio = READ_ONCE(ve->base.execlists.queue_priority_hint);
5622	intel_engine_mask_t mask;
5623	unsigned int n;
5624
5625	rcu_read_lock();
5626	mask = virtual_submission_mask(ve);
5627	rcu_read_unlock();
5628	if (unlikely(!mask))
5629		return;
5630
5631	local_irq_disable();
5632	for (n = 0; n < ve->num_siblings; n++) {
5633		struct intel_engine_cs *sibling = READ_ONCE(ve->siblings[n]);
5634		struct ve_node * const node = &ve->nodes[sibling->id];
5635		struct rb_node **parent, *rb;
5636		bool first;
5637
5638		if (!READ_ONCE(ve->request))
5639			break; /* already handled by a sibling's tasklet */
5640
5641		if (unlikely(!(mask & sibling->mask))) {
5642			if (!RB_EMPTY_NODE(&node->rb)) {
5643				spin_lock(&sibling->active.lock);
5644				rb_erase_cached(&node->rb,
5645						&sibling->execlists.virtual);
5646				RB_CLEAR_NODE(&node->rb);
5647				spin_unlock(&sibling->active.lock);
5648			}
5649			continue;
5650		}
5651
5652		spin_lock(&sibling->active.lock);
5653
5654		if (!RB_EMPTY_NODE(&node->rb)) {
5655			/*
5656			 * Cheat and avoid rebalancing the tree if we can
5657			 * reuse this node in situ.
5658			 */
5659			first = rb_first_cached(&sibling->execlists.virtual) ==
5660				&node->rb;
5661			if (prio == node->prio || (prio > node->prio && first))
5662				goto submit_engine;
5663
5664			rb_erase_cached(&node->rb, &sibling->execlists.virtual);
5665		}
5666
5667		rb = NULL;
5668		first = true;
5669		parent = &sibling->execlists.virtual.rb_root.rb_node;
5670		while (*parent) {
5671			struct ve_node *other;
5672
5673			rb = *parent;
5674			other = rb_entry(rb, typeof(*other), rb);
5675			if (prio > other->prio) {
5676				parent = &rb->rb_left;
5677			} else {
5678				parent = &rb->rb_right;
5679				first = false;
5680			}
5681		}
5682
5683		rb_link_node(&node->rb, rb, parent);
5684		rb_insert_color_cached(&node->rb,
5685				       &sibling->execlists.virtual,
5686				       first);
5687
5688submit_engine:
5689		GEM_BUG_ON(RB_EMPTY_NODE(&node->rb));
5690		node->prio = prio;
5691		if (first && prio > sibling->execlists.queue_priority_hint)
5692			tasklet_hi_schedule(&sibling->execlists.tasklet);
5693
5694		spin_unlock(&sibling->active.lock);
5695	}
5696	local_irq_enable();
5697}
5698
5699static void virtual_submit_request(struct i915_request *rq)
5700{
5701	struct virtual_engine *ve = to_virtual_engine(rq->engine);
5702	struct i915_request *old;
5703	unsigned long flags;
5704
5705	ENGINE_TRACE(&ve->base, "rq=%llx:%lld\n",
5706		     rq->fence.context,
5707		     rq->fence.seqno);
5708
5709	GEM_BUG_ON(ve->base.submit_request != virtual_submit_request);
5710
5711	spin_lock_irqsave(&ve->base.active.lock, flags);
5712
5713	old = ve->request;
5714	if (old) { /* background completion event from preempt-to-busy */
5715		GEM_BUG_ON(!i915_request_completed(old));
5716		__i915_request_submit(old);
5717		i915_request_put(old);
5718	}
5719
5720	if (i915_request_completed(rq)) {
5721		__i915_request_submit(rq);
5722
5723		ve->base.execlists.queue_priority_hint = INT_MIN;
5724		ve->request = NULL;
5725	} else {
5726		ve->base.execlists.queue_priority_hint = rq_prio(rq);
5727		ve->request = i915_request_get(rq);
5728
5729		GEM_BUG_ON(!list_empty(virtual_queue(ve)));
5730		list_move_tail(&rq->sched.link, virtual_queue(ve));
5731
5732		tasklet_hi_schedule(&ve->base.execlists.tasklet);
5733	}
5734
5735	spin_unlock_irqrestore(&ve->base.active.lock, flags);
5736}
5737
5738static struct ve_bond *
5739virtual_find_bond(struct virtual_engine *ve,
5740		  const struct intel_engine_cs *master)
5741{
5742	int i;
5743
5744	for (i = 0; i < ve->num_bonds; i++) {
5745		if (ve->bonds[i].master == master)
5746			return &ve->bonds[i];
5747	}
5748
5749	return NULL;
5750}
5751
5752static void
5753virtual_bond_execute(struct i915_request *rq, struct dma_fence *signal)
5754{
5755	struct virtual_engine *ve = to_virtual_engine(rq->engine);
5756	intel_engine_mask_t allowed, exec;
5757	struct ve_bond *bond;
5758
5759	allowed = ~to_request(signal)->engine->mask;
5760
5761	bond = virtual_find_bond(ve, to_request(signal)->engine);
5762	if (bond)
5763		allowed &= bond->sibling_mask;
5764
5765	/* Restrict the bonded request to run on only the available engines */
5766	exec = READ_ONCE(rq->execution_mask);
5767	while (!try_cmpxchg(&rq->execution_mask, &exec, exec & allowed))
5768		;
5769
5770	/* Prevent the master from being re-run on the bonded engines */
5771	to_request(signal)->execution_mask &= ~allowed;
5772}
5773
5774struct intel_context *
5775intel_execlists_create_virtual(struct intel_engine_cs **siblings,
5776			       unsigned int count)
5777{
5778	struct virtual_engine *ve;
5779	unsigned int n;
5780	int err;
5781
5782	if (count == 0)
5783		return ERR_PTR(-EINVAL);
5784
5785	if (count == 1)
5786		return intel_context_create(siblings[0]);
5787
5788	ve = kzalloc(struct_size(ve, siblings, count), GFP_KERNEL);
5789	if (!ve)
5790		return ERR_PTR(-ENOMEM);
5791
5792	ve->base.i915 = siblings[0]->i915;
5793	ve->base.gt = siblings[0]->gt;
5794	ve->base.uncore = siblings[0]->uncore;
5795	ve->base.id = -1;
5796
5797	ve->base.class = OTHER_CLASS;
5798	ve->base.uabi_class = I915_ENGINE_CLASS_INVALID;
5799	ve->base.instance = I915_ENGINE_CLASS_INVALID_VIRTUAL;
5800	ve->base.uabi_instance = I915_ENGINE_CLASS_INVALID_VIRTUAL;
5801
5802	/*
5803	 * The decision on whether to submit a request using semaphores
5804	 * depends on the saturated state of the engine. We only compute
5805	 * this during HW submission of the request, and we need for this
5806	 * state to be globally applied to all requests being submitted
5807	 * to this engine. Virtual engines encompass more than one physical
5808	 * engine and so we cannot accurately tell in advance if one of those
5809	 * engines is already saturated and so cannot afford to use a semaphore
5810	 * and be pessimized in priority for doing so -- if we are the only
5811	 * context using semaphores after all other clients have stopped, we
5812	 * will be starved on the saturated system. Such a global switch for
5813	 * semaphores is less than ideal, but alas is the current compromise.
5814	 */
5815	ve->base.saturated = ALL_ENGINES;
5816
5817	snprintf(ve->base.name, sizeof(ve->base.name), "virtual");
5818
5819	intel_engine_init_active(&ve->base, ENGINE_VIRTUAL);
5820	intel_engine_init_execlists(&ve->base);
5821
5822	ve->base.cops = &virtual_context_ops;
5823	ve->base.request_alloc = execlists_request_alloc;
5824
5825	ve->base.schedule = i915_schedule;
5826	ve->base.submit_request = virtual_submit_request;
5827	ve->base.bond_execute = virtual_bond_execute;
5828
5829	INIT_LIST_HEAD(virtual_queue(ve));
5830	ve->base.execlists.queue_priority_hint = INT_MIN;
5831	tasklet_init(&ve->base.execlists.tasklet,
5832		     virtual_submission_tasklet,
5833		     (unsigned long)ve);
5834
5835	intel_context_init(&ve->context, &ve->base);
5836
5837	ve->base.breadcrumbs = intel_breadcrumbs_create(NULL);
5838	if (!ve->base.breadcrumbs) {
5839		err = -ENOMEM;
5840		goto err_put;
5841	}
5842
5843	for (n = 0; n < count; n++) {
5844		struct intel_engine_cs *sibling = siblings[n];
5845
5846		GEM_BUG_ON(!is_power_of_2(sibling->mask));
5847		if (sibling->mask & ve->base.mask) {
5848			DRM_DEBUG("duplicate %s entry in load balancer\n",
5849				  sibling->name);
5850			err = -EINVAL;
5851			goto err_put;
5852		}
5853
5854		/*
5855		 * The virtual engine implementation is tightly coupled to
5856		 * the execlists backend -- we push out request directly
5857		 * into a tree inside each physical engine. We could support
5858		 * layering if we handle cloning of the requests and
5859		 * submitting a copy into each backend.
5860		 */
5861		if (sibling->execlists.tasklet.func !=
5862		    execlists_submission_tasklet) {
5863			err = -ENODEV;
5864			goto err_put;
5865		}
5866
5867		GEM_BUG_ON(RB_EMPTY_NODE(&ve->nodes[sibling->id].rb));
5868		RB_CLEAR_NODE(&ve->nodes[sibling->id].rb);
5869
5870		ve->siblings[ve->num_siblings++] = sibling;
5871		ve->base.mask |= sibling->mask;
5872
5873		/*
5874		 * All physical engines must be compatible for their emission
5875		 * functions (as we build the instructions during request
5876		 * construction and do not alter them before submission
5877		 * on the physical engine). We use the engine class as a guide
5878		 * here, although that could be refined.
5879		 */
5880		if (ve->base.class != OTHER_CLASS) {
5881			if (ve->base.class != sibling->class) {
5882				DRM_DEBUG("invalid mixing of engine class, sibling %d, already %d\n",
5883					  sibling->class, ve->base.class);
5884				err = -EINVAL;
5885				goto err_put;
5886			}
5887			continue;
5888		}
5889
5890		ve->base.class = sibling->class;
5891		ve->base.uabi_class = sibling->uabi_class;
5892		snprintf(ve->base.name, sizeof(ve->base.name),
5893			 "v%dx%d", ve->base.class, count);
5894		ve->base.context_size = sibling->context_size;
5895
5896		ve->base.emit_bb_start = sibling->emit_bb_start;
5897		ve->base.emit_flush = sibling->emit_flush;
5898		ve->base.emit_init_breadcrumb = sibling->emit_init_breadcrumb;
5899		ve->base.emit_fini_breadcrumb = sibling->emit_fini_breadcrumb;
5900		ve->base.emit_fini_breadcrumb_dw =
5901			sibling->emit_fini_breadcrumb_dw;
5902
5903		ve->base.flags = sibling->flags;
5904	}
5905
5906	ve->base.flags |= I915_ENGINE_IS_VIRTUAL;
5907
5908	virtual_engine_initial_hint(ve);
5909	return &ve->context;
5910
5911err_put:
5912	intel_context_put(&ve->context);
5913	return ERR_PTR(err);
5914}
5915
5916struct intel_context *
5917intel_execlists_clone_virtual(struct intel_engine_cs *src)
5918{
5919	struct virtual_engine *se = to_virtual_engine(src);
5920	struct intel_context *dst;
5921
5922	dst = intel_execlists_create_virtual(se->siblings,
5923					     se->num_siblings);
5924	if (IS_ERR(dst))
5925		return dst;
5926
5927	if (se->num_bonds) {
5928		struct virtual_engine *de = to_virtual_engine(dst->engine);
5929
5930		de->bonds = kmemdup(se->bonds,
5931				    sizeof(*se->bonds) * se->num_bonds,
5932				    GFP_KERNEL);
5933		if (!de->bonds) {
5934			intel_context_put(dst);
5935			return ERR_PTR(-ENOMEM);
5936		}
5937
5938		de->num_bonds = se->num_bonds;
5939	}
5940
5941	return dst;
5942}
5943
5944int intel_virtual_engine_attach_bond(struct intel_engine_cs *engine,
5945				     const struct intel_engine_cs *master,
5946				     const struct intel_engine_cs *sibling)
5947{
5948	struct virtual_engine *ve = to_virtual_engine(engine);
5949	struct ve_bond *bond;
5950	int n;
5951
5952	/* Sanity check the sibling is part of the virtual engine */
5953	for (n = 0; n < ve->num_siblings; n++)
5954		if (sibling == ve->siblings[n])
5955			break;
5956	if (n == ve->num_siblings)
5957		return -EINVAL;
5958
5959	bond = virtual_find_bond(ve, master);
5960	if (bond) {
5961		bond->sibling_mask |= sibling->mask;
5962		return 0;
5963	}
5964
5965	bond = krealloc(ve->bonds,
5966			sizeof(*bond) * (ve->num_bonds + 1),
5967			GFP_KERNEL);
5968	if (!bond)
5969		return -ENOMEM;
5970
5971	bond[ve->num_bonds].master = master;
5972	bond[ve->num_bonds].sibling_mask = sibling->mask;
5973
5974	ve->bonds = bond;
5975	ve->num_bonds++;
5976
5977	return 0;
5978}
5979
5980struct intel_engine_cs *
5981intel_virtual_engine_get_sibling(struct intel_engine_cs *engine,
5982				 unsigned int sibling)
5983{
5984	struct virtual_engine *ve = to_virtual_engine(engine);
5985
5986	if (sibling >= ve->num_siblings)
5987		return NULL;
5988
5989	return ve->siblings[sibling];
5990}
5991
5992void intel_execlists_show_requests(struct intel_engine_cs *engine,
5993				   struct drm_printer *m,
5994				   void (*show_request)(struct drm_printer *m,
5995							struct i915_request *rq,
5996							const char *prefix),
5997				   unsigned int max)
5998{
5999	const struct intel_engine_execlists *execlists = &engine->execlists;
6000	struct i915_request *rq, *last;
6001	unsigned long flags;
6002	unsigned int count;
6003	struct rb_node *rb;
6004
6005	spin_lock_irqsave(&engine->active.lock, flags);
6006
6007	last = NULL;
6008	count = 0;
6009	list_for_each_entry(rq, &engine->active.requests, sched.link) {
6010		if (count++ < max - 1)
6011			show_request(m, rq, "\t\tE ");
6012		else
6013			last = rq;
6014	}
6015	if (last) {
6016		if (count > max) {
6017			drm_printf(m,
6018				   "\t\t...skipping %d executing requests...\n",
6019				   count - max);
6020		}
6021		show_request(m, last, "\t\tE ");
6022	}
6023
6024	if (execlists->switch_priority_hint != INT_MIN)
6025		drm_printf(m, "\t\tSwitch priority hint: %d\n",
6026			   READ_ONCE(execlists->switch_priority_hint));
6027	if (execlists->queue_priority_hint != INT_MIN)
6028		drm_printf(m, "\t\tQueue priority hint: %d\n",
6029			   READ_ONCE(execlists->queue_priority_hint));
6030
6031	last = NULL;
6032	count = 0;
6033	for (rb = rb_first_cached(&execlists->queue); rb; rb = rb_next(rb)) {
6034		struct i915_priolist *p = rb_entry(rb, typeof(*p), node);
6035		int i;
6036
6037		priolist_for_each_request(rq, p, i) {
6038			if (count++ < max - 1)
6039				show_request(m, rq, "\t\tQ ");
6040			else
6041				last = rq;
6042		}
6043	}
6044	if (last) {
6045		if (count > max) {
6046			drm_printf(m,
6047				   "\t\t...skipping %d queued requests...\n",
6048				   count - max);
6049		}
6050		show_request(m, last, "\t\tQ ");
6051	}
6052
6053	last = NULL;
6054	count = 0;
6055	for (rb = rb_first_cached(&execlists->virtual); rb; rb = rb_next(rb)) {
6056		struct virtual_engine *ve =
6057			rb_entry(rb, typeof(*ve), nodes[engine->id].rb);
6058		struct i915_request *rq = READ_ONCE(ve->request);
6059
6060		if (rq) {
6061			if (count++ < max - 1)
6062				show_request(m, rq, "\t\tV ");
6063			else
6064				last = rq;
6065		}
6066	}
6067	if (last) {
6068		if (count > max) {
6069			drm_printf(m,
6070				   "\t\t...skipping %d virtual requests...\n",
6071				   count - max);
6072		}
6073		show_request(m, last, "\t\tV ");
6074	}
6075
6076	spin_unlock_irqrestore(&engine->active.lock, flags);
6077}
6078
6079void intel_lr_context_reset(struct intel_engine_cs *engine,
6080			    struct intel_context *ce,
6081			    u32 head,
6082			    bool scrub)
6083{
6084	GEM_BUG_ON(!intel_context_is_pinned(ce));
6085
6086	/*
6087	 * We want a simple context + ring to execute the breadcrumb update.
6088	 * We cannot rely on the context being intact across the GPU hang,
6089	 * so clear it and rebuild just what we need for the breadcrumb.
6090	 * All pending requests for this context will be zapped, and any
6091	 * future request will be after userspace has had the opportunity
6092	 * to recreate its own state.
6093	 */
6094	if (scrub)
6095		restore_default_state(ce, engine);
6096
6097	/* Rerun the request; its payload has been neutered (if guilty). */
6098	__execlists_update_reg_state(ce, engine, head);
6099}
6100
6101bool
6102intel_engine_in_execlists_submission_mode(const struct intel_engine_cs *engine)
6103{
6104	return engine->set_default_submission ==
6105	       intel_execlists_set_default_submission;
6106}
6107
6108#if IS_ENABLED(CONFIG_DRM_I915_SELFTEST)
6109#include "selftest_lrc.c"
6110#endif
6111