/*
 * Copyright © 2016 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 *
 */

#include <linux/kthread.h>

#include "gem/i915_gem_context.h"

#include "intel_gt.h"
#include "intel_engine_heartbeat.h"
#include "intel_engine_pm.h"
#include "selftest_engine_heartbeat.h"

#include "i915_selftest.h"
#include "selftests/i915_random.h"
#include "selftests/igt_flush_test.h"
#include "selftests/igt_reset.h"
#include "selftests/igt_atomic.h"

#include "selftests/mock_drm.h"

#include "gem/selftests/mock_context.h"
#include "gem/selftests/igt_gem_utils.h"

#define IGT_IDLE_TIMEOUT 50 /* ms; time to wait after flushing between tests */

struct hang {
	struct intel_gt *gt;
	struct drm_i915_gem_object *hws;
	struct drm_i915_gem_object *obj;
	struct i915_gem_context *ctx;
	u32 *seqno;
	u32 *batch;
};

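/*
 * Set up the scratch state shared by the hang tests: a non-bannable kernel
 * context for submission, a HWS page into which the hanging batches report
 * their breadcrumbs, and an object to hold the spinning batch itself.
 */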
static int hang_init(struct hang *h, struct intel_gt *gt)
{
	void *vaddr;
	int err;

	memset(h, 0, sizeof(*h));
	h->gt = gt;

	h->ctx = kernel_context(gt->i915);
	if (IS_ERR(h->ctx))
		return PTR_ERR(h->ctx);

	GEM_BUG_ON(i915_gem_context_is_bannable(h->ctx));

	h->hws = i915_gem_object_create_internal(gt->i915, PAGE_SIZE);
	if (IS_ERR(h->hws)) {
		err = PTR_ERR(h->hws);
		goto err_ctx;
	}

	h->obj = i915_gem_object_create_internal(gt->i915, PAGE_SIZE);
	if (IS_ERR(h->obj)) {
		err = PTR_ERR(h->obj);
		goto err_hws;
	}

	i915_gem_object_set_cache_coherency(h->hws, I915_CACHE_LLC);
	vaddr = i915_gem_object_pin_map(h->hws, I915_MAP_WB);
	if (IS_ERR(vaddr)) {
		err = PTR_ERR(vaddr);
		goto err_obj;
	}
	h->seqno = memset(vaddr, 0xff, PAGE_SIZE);

	vaddr = i915_gem_object_pin_map(h->obj,
					i915_coherent_map_type(gt->i915));
	if (IS_ERR(vaddr)) {
		err = PTR_ERR(vaddr);
		goto err_unpin_hws;
	}
	h->batch = vaddr;

	return 0;

err_unpin_hws:
	i915_gem_object_unpin_map(h->hws);
err_obj:
	i915_gem_object_put(h->obj);
err_hws:
	i915_gem_object_put(h->hws);
err_ctx:
	kernel_context_close(h->ctx);
	return err;
}

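/* Each fence context is assigned its own breadcrumb slot within the HWS page */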
static u64 hws_address(const struct i915_vma *hws,
		       const struct i915_request *rq)
{
	return hws->node.start + offset_in_page(sizeof(u32)*rq->fence.context);
}

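/* Serialise the request against the vma and track the vma as active */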
static int move_to_active(struct i915_vma *vma,
			  struct i915_request *rq,
			  unsigned int flags)
{
	int err;

	i915_vma_lock(vma);
	err = i915_request_await_object(rq, vma->obj,
					flags & EXEC_OBJECT_WRITE);
	if (err == 0)
		err = i915_vma_move_to_active(vma, rq, flags);
	i915_vma_unlock(vma);

	return err;
}

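/*
 * Build a request whose batch writes its breadcrumb into the HWS page and
 * then spins forever by branching back to its own start; the trailing
 * MI_BATCH_BUFFER_END is only reached after the batch has been rewritten.
 * A fresh batch object is allocated each call so that any previous hang
 * keeps spinning undisturbed.
 */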
static struct i915_request *
hang_create_request(struct hang *h, struct intel_engine_cs *engine)
{
	struct intel_gt *gt = h->gt;
	struct i915_address_space *vm = i915_gem_context_get_vm_rcu(h->ctx);
	struct drm_i915_gem_object *obj;
	struct i915_request *rq = NULL;
	struct i915_vma *hws, *vma;
	unsigned int flags;
	void *vaddr;
	u32 *batch;
	int err;

	obj = i915_gem_object_create_internal(gt->i915, PAGE_SIZE);
	if (IS_ERR(obj)) {
		i915_vm_put(vm);
		return ERR_CAST(obj);
	}

	vaddr = i915_gem_object_pin_map(obj, i915_coherent_map_type(gt->i915));
	if (IS_ERR(vaddr)) {
		i915_gem_object_put(obj);
		i915_vm_put(vm);
		return ERR_CAST(vaddr);
	}

	i915_gem_object_unpin_map(h->obj);
	i915_gem_object_put(h->obj);

	h->obj = obj;
	h->batch = vaddr;

	vma = i915_vma_instance(h->obj, vm, NULL);
	if (IS_ERR(vma)) {
		i915_vm_put(vm);
		return ERR_CAST(vma);
	}

	hws = i915_vma_instance(h->hws, vm, NULL);
	if (IS_ERR(hws)) {
		i915_vm_put(vm);
		return ERR_CAST(hws);
	}

	err = i915_vma_pin(vma, 0, 0, PIN_USER);
	if (err) {
		i915_vm_put(vm);
		return ERR_PTR(err);
	}

	err = i915_vma_pin(hws, 0, 0, PIN_USER);
	if (err)
		goto unpin_vma;

	rq = igt_request_alloc(h->ctx, engine);
	if (IS_ERR(rq)) {
		err = PTR_ERR(rq);
		goto unpin_hws;
	}

	err = move_to_active(vma, rq, 0);
	if (err)
		goto cancel_rq;

	err = move_to_active(hws, rq, 0);
	if (err)
		goto cancel_rq;

	batch = h->batch;
	if (INTEL_GEN(gt->i915) >= 8) {
		*batch++ = MI_STORE_DWORD_IMM_GEN4;
		*batch++ = lower_32_bits(hws_address(hws, rq));
		*batch++ = upper_32_bits(hws_address(hws, rq));
		*batch++ = rq->fence.seqno;
		*batch++ = MI_NOOP;

		memset(batch, 0, 1024);
		batch += 1024 / sizeof(*batch);

		*batch++ = MI_NOOP;
		*batch++ = MI_BATCH_BUFFER_START | 1 << 8 | 1;
		*batch++ = lower_32_bits(vma->node.start);
		*batch++ = upper_32_bits(vma->node.start);
	} else if (INTEL_GEN(gt->i915) >= 6) {
		*batch++ = MI_STORE_DWORD_IMM_GEN4;
		*batch++ = 0;
		*batch++ = lower_32_bits(hws_address(hws, rq));
		*batch++ = rq->fence.seqno;
		*batch++ = MI_NOOP;

		memset(batch, 0, 1024);
		batch += 1024 / sizeof(*batch);

		*batch++ = MI_NOOP;
		*batch++ = MI_BATCH_BUFFER_START | 1 << 8;
		*batch++ = lower_32_bits(vma->node.start);
	} else if (INTEL_GEN(gt->i915) >= 4) {
		*batch++ = MI_STORE_DWORD_IMM_GEN4 | MI_USE_GGTT;
		*batch++ = 0;
		*batch++ = lower_32_bits(hws_address(hws, rq));
		*batch++ = rq->fence.seqno;
		*batch++ = MI_NOOP;

		memset(batch, 0, 1024);
		batch += 1024 / sizeof(*batch);

		*batch++ = MI_NOOP;
		*batch++ = MI_BATCH_BUFFER_START | 2 << 6;
		*batch++ = lower_32_bits(vma->node.start);
	} else {
		*batch++ = MI_STORE_DWORD_IMM | MI_MEM_VIRTUAL;
		*batch++ = lower_32_bits(hws_address(hws, rq));
		*batch++ = rq->fence.seqno;
		*batch++ = MI_NOOP;

		memset(batch, 0, 1024);
		batch += 1024 / sizeof(*batch);

		*batch++ = MI_NOOP;
		*batch++ = MI_BATCH_BUFFER_START | 2 << 6;
		*batch++ = lower_32_bits(vma->node.start);
	}
	*batch++ = MI_BATCH_BUFFER_END; /* not reached */
	intel_gt_chipset_flush(engine->gt);

	if (rq->engine->emit_init_breadcrumb) {
		err = rq->engine->emit_init_breadcrumb(rq);
		if (err)
			goto cancel_rq;
	}

	flags = 0;
	if (INTEL_GEN(gt->i915) <= 5)
		flags |= I915_DISPATCH_SECURE;

	err = rq->engine->emit_bb_start(rq, vma->node.start, PAGE_SIZE, flags);

cancel_rq:
	if (err) {
		i915_request_set_error_once(rq, err);
		i915_request_add(rq);
	}
unpin_hws:
	i915_vma_unpin(hws);
unpin_vma:
	i915_vma_unpin(vma);
	i915_vm_put(vm);
	return err ? ERR_PTR(err) : rq;
}

static u32 hws_seqno(const struct hang *h, const struct i915_request *rq)
{
	return READ_ONCE(h->seqno[rq->fence.context % (PAGE_SIZE/sizeof(u32))]);
}

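/* Stop the spinner by terminating the batch, then release the hang state */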
static void hang_fini(struct hang *h)
{
	*h->batch = MI_BATCH_BUFFER_END;
	intel_gt_chipset_flush(h->gt);

	i915_gem_object_unpin_map(h->obj);
	i915_gem_object_put(h->obj);

	i915_gem_object_unpin_map(h->hws);
	i915_gem_object_put(h->hws);

	kernel_context_close(h->ctx);

	igt_flush_test(h->gt->i915);
}

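/*
 * Wait for the breadcrumb at the head of the hanging batch to land in the
 * HWS, i.e. for the spinner to begin executing on the GPU.
 */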
static bool wait_until_running(struct hang *h, struct i915_request *rq)
{
	return !(wait_for_us(i915_seqno_passed(hws_seqno(h, rq),
					       rq->fence.seqno),
			     10) &&
		 wait_for(i915_seqno_passed(hws_seqno(h, rq),
					    rq->fence.seqno),
			  1000));
}

static int igt_hang_sanitycheck(void *arg)
{
	struct intel_gt *gt = arg;
	struct i915_request *rq;
	struct intel_engine_cs *engine;
	enum intel_engine_id id;
	struct hang h;
	int err;

	/* Basic check that we can execute our hanging batch */

	err = hang_init(&h, gt);
	if (err)
		return err;

	for_each_engine(engine, gt, id) {
		struct intel_wedge_me w;
		long timeout;

		if (!intel_engine_can_store_dword(engine))
			continue;

		rq = hang_create_request(&h, engine);
		if (IS_ERR(rq)) {
			err = PTR_ERR(rq);
			pr_err("Failed to create request for %s, err=%d\n",
			       engine->name, err);
			goto fini;
		}

		i915_request_get(rq);

		*h.batch = MI_BATCH_BUFFER_END;
		intel_gt_chipset_flush(engine->gt);

		i915_request_add(rq);

		timeout = 0;
		intel_wedge_on_timeout(&w, gt, HZ / 10 /* 100ms */)
			timeout = i915_request_wait(rq, 0,
						    MAX_SCHEDULE_TIMEOUT);
		if (intel_gt_is_wedged(gt))
			timeout = -EIO;

		i915_request_put(rq);

		if (timeout < 0) {
			err = timeout;
			pr_err("Wait for request failed on %s, err=%d\n",
			       engine->name, err);
			goto fini;
		}
	}

fini:
	hang_fini(&h);
	return err;
}

static bool wait_for_idle(struct intel_engine_cs *engine)
{
	return wait_for(intel_engine_is_idle(engine), IGT_IDLE_TIMEOUT) == 0;
}

static int igt_reset_nop(void *arg)
{
	struct intel_gt *gt = arg;
	struct i915_gpu_error *global = &gt->i915->gpu_error;
	struct intel_engine_cs *engine;
	unsigned int reset_count, count;
	enum intel_engine_id id;
	IGT_TIMEOUT(end_time);
	int err = 0;

	/* Check that we can reset during non-user portions of requests */

	reset_count = i915_reset_count(global);
	count = 0;
	do {
		for_each_engine(engine, gt, id) {
			struct intel_context *ce;
			int i;

			ce = intel_context_create(engine);
			if (IS_ERR(ce)) {
				err = PTR_ERR(ce);
				break;
			}

			for (i = 0; i < 16; i++) {
				struct i915_request *rq;

				rq = intel_context_create_request(ce);
				if (IS_ERR(rq)) {
					err = PTR_ERR(rq);
					break;
				}

				i915_request_add(rq);
			}

			intel_context_put(ce);
		}

		igt_global_reset_lock(gt);
		intel_gt_reset(gt, ALL_ENGINES, NULL);
		igt_global_reset_unlock(gt);

		if (intel_gt_is_wedged(gt)) {
			err = -EIO;
			break;
		}

		if (i915_reset_count(global) != reset_count + ++count) {
			pr_err("Full GPU reset not recorded!\n");
			err = -EINVAL;
			break;
		}

		err = igt_flush_test(gt->i915);
		if (err)
			break;
	} while (time_before(jiffies, end_time));
	pr_info("%s: %d resets\n", __func__, count);

	if (igt_flush_test(gt->i915))
		err = -EIO;
	return err;
}

static int igt_reset_nop_engine(void *arg)
{
	struct intel_gt *gt = arg;
	struct i915_gpu_error *global = &gt->i915->gpu_error;
	struct intel_engine_cs *engine;
	enum intel_engine_id id;

	/* Check that we can engine-reset during non-user portions */

	if (!intel_has_reset_engine(gt))
		return 0;

	for_each_engine(engine, gt, id) {
		unsigned int reset_count, reset_engine_count, count;
		struct intel_context *ce;
		IGT_TIMEOUT(end_time);
		int err;

		ce = intel_context_create(engine);
		if (IS_ERR(ce))
			return PTR_ERR(ce);

		reset_count = i915_reset_count(global);
		reset_engine_count = i915_reset_engine_count(global, engine);
		count = 0;

		st_engine_heartbeat_disable(engine);
		set_bit(I915_RESET_ENGINE + id, &gt->reset.flags);
		do {
			int i;

			if (!wait_for_idle(engine)) {
				pr_err("%s failed to idle before reset\n",
				       engine->name);
				err = -EIO;
				break;
			}

			for (i = 0; i < 16; i++) {
				struct i915_request *rq;

				rq = intel_context_create_request(ce);
				if (IS_ERR(rq)) {
					struct drm_printer p =
						drm_info_printer(gt->i915->drm.dev);
					intel_engine_dump(engine, &p,
							  "%s(%s): failed to submit request\n",
							  __func__,
							  engine->name);

					GEM_TRACE("%s(%s): failed to submit request\n",
						  __func__,
						  engine->name);
					GEM_TRACE_DUMP();

					intel_gt_set_wedged(gt);

					err = PTR_ERR(rq);
					break;
				}

				i915_request_add(rq);
			}
			err = intel_engine_reset(engine, NULL);
			if (err) {
				pr_err("i915_reset_engine failed\n");
				break;
			}

			if (i915_reset_count(global) != reset_count) {
				pr_err("Full GPU reset recorded! (engine reset expected)\n");
				err = -EINVAL;
				break;
			}

			if (i915_reset_engine_count(global, engine) !=
			    reset_engine_count + ++count) {
				pr_err("%s engine reset not recorded!\n",
				       engine->name);
				err = -EINVAL;
				break;
			}
		} while (time_before(jiffies, end_time));
		clear_bit(I915_RESET_ENGINE + id, &gt->reset.flags);
		st_engine_heartbeat_enable(engine);

		pr_info("%s(%s): %d resets\n", __func__, engine->name, count);

		intel_context_put(ce);
		if (igt_flush_test(gt->i915))
			err = -EIO;
		if (err)
			return err;
	}

	return 0;
}

static int __igt_reset_engine(struct intel_gt *gt, bool active)
{
	struct i915_gpu_error *global = &gt->i915->gpu_error;
	struct intel_engine_cs *engine;
	enum intel_engine_id id;
	struct hang h;
	int err = 0;

	/*
	 * Check that we can issue an engine reset, either on an idle engine
	 * (a no-op) or while it is busy running our hanging batch.
	 */

	if (!intel_has_reset_engine(gt))
		return 0;

	if (active) {
		err = hang_init(&h, gt);
		if (err)
			return err;
	}

	for_each_engine(engine, gt, id) {
		unsigned int reset_count, reset_engine_count;
		IGT_TIMEOUT(end_time);

		if (active && !intel_engine_can_store_dword(engine))
			continue;

		if (!wait_for_idle(engine)) {
			pr_err("%s failed to idle before reset\n",
			       engine->name);
			err = -EIO;
			break;
		}

		reset_count = i915_reset_count(global);
		reset_engine_count = i915_reset_engine_count(global, engine);

		st_engine_heartbeat_disable(engine);
		set_bit(I915_RESET_ENGINE + id, &gt->reset.flags);
		do {
			if (active) {
				struct i915_request *rq;

				rq = hang_create_request(&h, engine);
				if (IS_ERR(rq)) {
					err = PTR_ERR(rq);
					break;
				}

				i915_request_get(rq);
				i915_request_add(rq);

				if (!wait_until_running(&h, rq)) {
					struct drm_printer p = drm_info_printer(gt->i915->drm.dev);

					pr_err("%s: Failed to start request %llx, at %x\n",
					       __func__, rq->fence.seqno, hws_seqno(&h, rq));
					intel_engine_dump(engine, &p,
							  "%s\n", engine->name);

					i915_request_put(rq);
					err = -EIO;
					break;
				}

				i915_request_put(rq);
			}

			err = intel_engine_reset(engine, NULL);
			if (err) {
				pr_err("i915_reset_engine failed\n");
				break;
			}

			if (i915_reset_count(global) != reset_count) {
				pr_err("Full GPU reset recorded! (engine reset expected)\n");
				err = -EINVAL;
				break;
			}

			if (i915_reset_engine_count(global, engine) !=
			    ++reset_engine_count) {
				pr_err("%s engine reset not recorded!\n",
				       engine->name);
				err = -EINVAL;
				break;
			}
		} while (time_before(jiffies, end_time));
		clear_bit(I915_RESET_ENGINE + id, &gt->reset.flags);
		st_engine_heartbeat_enable(engine);

		if (err)
			break;

		err = igt_flush_test(gt->i915);
		if (err)
			break;
	}

	if (intel_gt_is_wedged(gt))
		err = -EIO;

	if (active)
		hang_fini(&h);

	return err;
}

static int igt_reset_idle_engine(void *arg)
{
	return __igt_reset_engine(arg, false);
}

static int igt_reset_active_engine(void *arg)
{
	return __igt_reset_engine(arg, true);
}

struct active_engine {
	struct task_struct *task;
	struct intel_engine_cs *engine;
	unsigned long resets;
	unsigned int flags;
};

#define TEST_ACTIVE	BIT(0)
#define TEST_OTHERS	BIT(1)
#define TEST_SELF	BIT(2)
#define TEST_PRIORITY	BIT(3)

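/*
 * Reap a background request; if it fails to complete within 5s, declare
 * the GT wedged so the test aborts rather than hangs.
 */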
static int active_request_put(struct i915_request *rq)
{
	int err = 0;

	if (!rq)
		return 0;

	if (i915_request_wait(rq, 0, 5 * HZ) < 0) {
		GEM_TRACE("%s timed out waiting for completion of fence %llx:%lld\n",
			  rq->engine->name,
			  rq->fence.context,
			  rq->fence.seqno);
		GEM_TRACE_DUMP();

		intel_gt_set_wedged(rq->engine->gt);
		err = -EIO;
	}

	i915_request_put(rq);

	return err;
}

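/*
 * kthread body used to keep an engine busy during the reset tests: cycle a
 * small ring of requests across private contexts, optionally applying
 * random priorities, until asked to stop.
 */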
static int active_engine(void *data)
{
	I915_RND_STATE(prng);
	struct active_engine *arg = data;
	struct intel_engine_cs *engine = arg->engine;
	struct i915_request *rq[8] = {};
	struct intel_context *ce[ARRAY_SIZE(rq)];
	unsigned long count;
	int err = 0;

	for (count = 0; count < ARRAY_SIZE(ce); count++) {
		ce[count] = intel_context_create(engine);
		if (IS_ERR(ce[count])) {
			err = PTR_ERR(ce[count]);
			while (count--)
				intel_context_put(ce[count]);
			return err;
		}
	}

	count = 0;
	while (!kthread_should_stop()) {
		unsigned int idx = count++ & (ARRAY_SIZE(rq) - 1);
		struct i915_request *old = rq[idx];
		struct i915_request *new;

		new = intel_context_create_request(ce[idx]);
		if (IS_ERR(new)) {
			err = PTR_ERR(new);
			break;
		}

		rq[idx] = i915_request_get(new);
		i915_request_add(new);

		if (engine->schedule && arg->flags & TEST_PRIORITY) {
			struct i915_sched_attr attr = {
				.priority =
					i915_prandom_u32_max_state(512, &prng),
			};
			engine->schedule(rq[idx], &attr);
		}

		err = active_request_put(old);
		if (err)
			break;

		cond_resched();
	}

	for (count = 0; count < ARRAY_SIZE(rq); count++) {
		int err__ = active_request_put(rq[count]);

		/* Keep the first error */
		if (!err)
			err = err__;

		intel_context_put(ce[count]);
	}

	return err;
}

static int __igt_reset_engines(struct intel_gt *gt,
			       const char *test_name,
			       unsigned int flags)
{
	struct i915_gpu_error *global = &gt->i915->gpu_error;
	struct intel_engine_cs *engine, *other;
	enum intel_engine_id id, tmp;
	struct hang h;
	int err = 0;

	/* Check that issuing a reset on one engine does not interfere
	 * with any other engine.
	 */

	if (!intel_has_reset_engine(gt))
		return 0;

	if (flags & TEST_ACTIVE) {
		err = hang_init(&h, gt);
		if (err)
			return err;

		if (flags & TEST_PRIORITY)
			h.ctx->sched.priority = 1024;
	}

	for_each_engine(engine, gt, id) {
		struct active_engine threads[I915_NUM_ENGINES] = {};
		unsigned long device = i915_reset_count(global);
		unsigned long count = 0, reported;
		IGT_TIMEOUT(end_time);

		if (flags & TEST_ACTIVE &&
		    !intel_engine_can_store_dword(engine))
			continue;

		if (!wait_for_idle(engine)) {
			pr_err("i915_reset_engine(%s:%s): failed to idle before reset\n",
			       engine->name, test_name);
			err = -EIO;
			break;
		}

		memset(threads, 0, sizeof(threads));
		for_each_engine(other, gt, tmp) {
			struct task_struct *tsk;

			threads[tmp].resets =
				i915_reset_engine_count(global, other);

			if (other == engine && !(flags & TEST_SELF))
				continue;

			if (other != engine && !(flags & TEST_OTHERS))
				continue;

			threads[tmp].engine = other;
			threads[tmp].flags = flags;

			tsk = kthread_run(active_engine, &threads[tmp],
					  "igt/%s", other->name);
			if (IS_ERR(tsk)) {
				err = PTR_ERR(tsk);
				goto unwind;
			}

			threads[tmp].task = tsk;
			get_task_struct(tsk);
		}

		yield(); /* start all threads before we begin */

		st_engine_heartbeat_disable(engine);
		set_bit(I915_RESET_ENGINE + id, &gt->reset.flags);
		do {
			struct i915_request *rq = NULL;

			if (flags & TEST_ACTIVE) {
				rq = hang_create_request(&h, engine);
				if (IS_ERR(rq)) {
					err = PTR_ERR(rq);
					break;
				}

				i915_request_get(rq);
				i915_request_add(rq);

				if (!wait_until_running(&h, rq)) {
					struct drm_printer p = drm_info_printer(gt->i915->drm.dev);

					pr_err("%s: Failed to start request %llx, at %x\n",
					       __func__, rq->fence.seqno, hws_seqno(&h, rq));
					intel_engine_dump(engine, &p,
							  "%s\n", engine->name);

					i915_request_put(rq);
					err = -EIO;
					break;
				}
			}

			err = intel_engine_reset(engine, NULL);
			if (err) {
				pr_err("i915_reset_engine(%s:%s): failed, err=%d\n",
				       engine->name, test_name, err);
				break;
			}

			count++;

			if (rq) {
				if (rq->fence.error != -EIO) {
					pr_err("i915_reset_engine(%s:%s):"
					       " failed to reset request %llx:%lld\n",
					       engine->name, test_name,
					       rq->fence.context,
					       rq->fence.seqno);
					i915_request_put(rq);

					GEM_TRACE_DUMP();
					intel_gt_set_wedged(gt);
					err = -EIO;
					break;
				}

				if (i915_request_wait(rq, 0, HZ / 5) < 0) {
					struct drm_printer p =
						drm_info_printer(gt->i915->drm.dev);

					pr_err("i915_reset_engine(%s:%s):"
					       " failed to complete request %llx:%lld after reset\n",
					       engine->name, test_name,
					       rq->fence.context,
					       rq->fence.seqno);
					intel_engine_dump(engine, &p,
							  "%s\n", engine->name);
					i915_request_put(rq);

					GEM_TRACE_DUMP();
					intel_gt_set_wedged(gt);
					err = -EIO;
					break;
				}

				i915_request_put(rq);
			}

			if (!(flags & TEST_SELF) && !wait_for_idle(engine)) {
				struct drm_printer p =
					drm_info_printer(gt->i915->drm.dev);

				pr_err("i915_reset_engine(%s:%s):"
				       " failed to idle after reset\n",
				       engine->name, test_name);
				intel_engine_dump(engine, &p,
						  "%s\n", engine->name);

				err = -EIO;
				break;
			}
		} while (time_before(jiffies, end_time));
		clear_bit(I915_RESET_ENGINE + id, &gt->reset.flags);
		st_engine_heartbeat_enable(engine);

		pr_info("i915_reset_engine(%s:%s): %lu resets\n",
			engine->name, test_name, count);

		reported = i915_reset_engine_count(global, engine);
		reported -= threads[engine->id].resets;
		if (reported != count) {
			pr_err("i915_reset_engine(%s:%s): reset %lu times, but reported %lu\n",
			       engine->name, test_name, count, reported);
			if (!err)
				err = -EINVAL;
		}

unwind:
		for_each_engine(other, gt, tmp) {
			int ret;

			if (!threads[tmp].task)
				continue;

			ret = kthread_stop(threads[tmp].task);
			if (ret) {
				pr_err("kthread for other engine %s failed, err=%d\n",
				       other->name, ret);
				if (!err)
					err = ret;
			}
			put_task_struct(threads[tmp].task);

			if (other->uabi_class != engine->uabi_class &&
			    threads[tmp].resets !=
			    i915_reset_engine_count(global, other)) {
				pr_err("Innocent engine %s was reset (count=%ld)\n",
				       other->name,
				       i915_reset_engine_count(global, other) -
				       threads[tmp].resets);
				if (!err)
					err = -EINVAL;
			}
		}

		if (device != i915_reset_count(global)) {
			pr_err("Global reset (count=%ld)!\n",
			       i915_reset_count(global) - device);
			if (!err)
				err = -EINVAL;
		}

		if (err)
			break;

		err = igt_flush_test(gt->i915);
		if (err)
			break;
	}

	if (intel_gt_is_wedged(gt))
		err = -EIO;

	if (flags & TEST_ACTIVE)
		hang_fini(&h);

	return err;
}

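/*
 * Run the per-engine reset test through each phase below: idle or active
 * on the target engine, with the other engines (or the engine itself)
 * kept busy, and optionally with randomised priorities.
 */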
static int igt_reset_engines(void *arg)
{
	static const struct {
		const char *name;
		unsigned int flags;
	} phases[] = {
		{ "idle", 0 },
		{ "active", TEST_ACTIVE },
		{ "others-idle", TEST_OTHERS },
		{ "others-active", TEST_OTHERS | TEST_ACTIVE },
		{
			"others-priority",
			TEST_OTHERS | TEST_ACTIVE | TEST_PRIORITY
		},
		{
			"self-priority",
			TEST_ACTIVE | TEST_PRIORITY | TEST_SELF,
		},
		{ }
	};
	struct intel_gt *gt = arg;
	typeof(*phases) *p;
	int err;

	for (p = phases; p->name; p++) {
		if (p->flags & TEST_PRIORITY) {
			if (!(gt->i915->caps.scheduler & I915_SCHEDULER_CAP_PRIORITY))
				continue;
		}

		err = __igt_reset_engines(arg, p->name, p->flags);
		if (err)
			return err;
	}

	return 0;
}

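/*
 * Pretend hangcheck fired: reset the engines in @mask directly and return
 * the global reset count sampled beforehand for the caller to compare.
 */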
static u32 fake_hangcheck(struct intel_gt *gt, intel_engine_mask_t mask)
{
	u32 count = i915_reset_count(&gt->i915->gpu_error);

	intel_gt_reset(gt, mask, NULL);

	return count;
}

static int igt_reset_wait(void *arg)
{
	struct intel_gt *gt = arg;
	struct i915_gpu_error *global = &gt->i915->gpu_error;
	struct intel_engine_cs *engine = gt->engine[RCS0];
	struct i915_request *rq;
	unsigned int reset_count;
	struct hang h;
	long timeout;
	int err;

	if (!engine || !intel_engine_can_store_dword(engine))
		return 0;

	/* Check that we detect a stuck waiter and issue a reset */

	igt_global_reset_lock(gt);

	err = hang_init(&h, gt);
	if (err)
		goto unlock;

	rq = hang_create_request(&h, engine);
	if (IS_ERR(rq)) {
		err = PTR_ERR(rq);
		goto fini;
	}

	i915_request_get(rq);
	i915_request_add(rq);

	if (!wait_until_running(&h, rq)) {
		struct drm_printer p = drm_info_printer(gt->i915->drm.dev);

		pr_err("%s: Failed to start request %llx, at %x\n",
		       __func__, rq->fence.seqno, hws_seqno(&h, rq));
		intel_engine_dump(rq->engine, &p, "%s\n", rq->engine->name);

		intel_gt_set_wedged(gt);

		err = -EIO;
		goto out_rq;
	}

	reset_count = fake_hangcheck(gt, ALL_ENGINES);

	timeout = i915_request_wait(rq, 0, 10);
	if (timeout < 0) {
		pr_err("i915_request_wait failed on a stuck request: err=%ld\n",
		       timeout);
		err = timeout;
		goto out_rq;
	}

	if (i915_reset_count(global) == reset_count) {
		pr_err("No GPU reset recorded!\n");
		err = -EINVAL;
		goto out_rq;
	}

out_rq:
	i915_request_put(rq);
fini:
	hang_fini(&h);
unlock:
	igt_global_reset_unlock(gt);

	if (intel_gt_is_wedged(gt))
		return -EIO;

	return err;
}

struct evict_vma {
	struct completion completion;
	struct i915_vma *vma;
};

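/*
 * kthread body: evict the target node from its address space; this has to
 * wait for the hanging request that is keeping the vma busy, and should be
 * unblocked by the subsequent reset.
 */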
static int evict_vma(void *data)
{
	struct evict_vma *arg = data;
	struct i915_address_space *vm = arg->vma->vm;
	struct drm_mm_node evict = arg->vma->node;
	int err;

	complete(&arg->completion);

	mutex_lock(&vm->mutex);
	err = i915_gem_evict_for_node(vm, &evict, 0);
	mutex_unlock(&vm->mutex);

	return err;
}

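/*
 * kthread body: mark the object's tiling dirty and acquire a fence on the
 * target vma, blocking behind the hanging request until the reset runs.
 */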
static int evict_fence(void *data)
{
	struct evict_vma *arg = data;
	int err;

	complete(&arg->completion);

	/* Mark the fence register as dirty to force the mmio update. */
	err = i915_gem_object_set_tiling(arg->vma->obj, I915_TILING_Y, 512);
	if (err) {
		pr_err("Invalid Y-tiling settings; err:%d\n", err);
		return err;
	}

	err = i915_vma_pin(arg->vma, 0, 0, PIN_GLOBAL | PIN_MAPPABLE);
	if (err) {
		pr_err("Unable to pin vma for Y-tiled fence; err:%d\n", err);
		return err;
	}

	err = i915_vma_pin_fence(arg->vma);
	i915_vma_unpin(arg->vma);
	if (err) {
		pr_err("Unable to pin Y-tiled fence; err:%d\n", err);
		return err;
	}

	i915_vma_unpin_fence(arg->vma);

	return 0;
}

static int __igt_reset_evict_vma(struct intel_gt *gt,
				 struct i915_address_space *vm,
				 int (*fn)(void *),
				 unsigned int flags)
{
	struct intel_engine_cs *engine = gt->engine[RCS0];
	struct drm_i915_gem_object *obj;
	struct task_struct *tsk = NULL;
	struct i915_request *rq;
	struct evict_vma arg;
	struct hang h;
	unsigned int pin_flags;
	int err;

	if (!gt->ggtt->num_fences && flags & EXEC_OBJECT_NEEDS_FENCE)
		return 0;

	if (!engine || !intel_engine_can_store_dword(engine))
		return 0;

	/* Check that we can recover an unbind stuck on a hanging request */

	err = hang_init(&h, gt);
	if (err)
		return err;

	obj = i915_gem_object_create_internal(gt->i915, SZ_1M);
	if (IS_ERR(obj)) {
		err = PTR_ERR(obj);
		goto fini;
	}

	if (flags & EXEC_OBJECT_NEEDS_FENCE) {
		err = i915_gem_object_set_tiling(obj, I915_TILING_X, 512);
		if (err) {
			pr_err("Invalid X-tiling settings; err:%d\n", err);
			goto out_obj;
		}
	}

	arg.vma = i915_vma_instance(obj, vm, NULL);
	if (IS_ERR(arg.vma)) {
		err = PTR_ERR(arg.vma);
		goto out_obj;
	}

	rq = hang_create_request(&h, engine);
	if (IS_ERR(rq)) {
		err = PTR_ERR(rq);
		goto out_obj;
	}

	pin_flags = i915_vma_is_ggtt(arg.vma) ? PIN_GLOBAL : PIN_USER;

	if (flags & EXEC_OBJECT_NEEDS_FENCE)
		pin_flags |= PIN_MAPPABLE;

	err = i915_vma_pin(arg.vma, 0, 0, pin_flags);
	if (err) {
		i915_request_add(rq);
		goto out_obj;
	}

	if (flags & EXEC_OBJECT_NEEDS_FENCE) {
		err = i915_vma_pin_fence(arg.vma);
		if (err) {
			pr_err("Unable to pin X-tiled fence; err:%d\n", err);
			i915_vma_unpin(arg.vma);
			i915_request_add(rq);
			goto out_obj;
		}
	}

	i915_vma_lock(arg.vma);
	err = i915_request_await_object(rq, arg.vma->obj,
					flags & EXEC_OBJECT_WRITE);
	if (err == 0)
		err = i915_vma_move_to_active(arg.vma, rq, flags);
	i915_vma_unlock(arg.vma);

	if (flags & EXEC_OBJECT_NEEDS_FENCE)
		i915_vma_unpin_fence(arg.vma);
	i915_vma_unpin(arg.vma);

	i915_request_get(rq);
	i915_request_add(rq);
	if (err)
		goto out_rq;

	if (!wait_until_running(&h, rq)) {
		struct drm_printer p = drm_info_printer(gt->i915->drm.dev);

		pr_err("%s: Failed to start request %llx, at %x\n",
		       __func__, rq->fence.seqno, hws_seqno(&h, rq));
		intel_engine_dump(rq->engine, &p, "%s\n", rq->engine->name);

		intel_gt_set_wedged(gt);
		goto out_reset;
	}

	init_completion(&arg.completion);

	tsk = kthread_run(fn, &arg, "igt/evict_vma");
	if (IS_ERR(tsk)) {
		err = PTR_ERR(tsk);
		tsk = NULL;
		goto out_reset;
	}
	get_task_struct(tsk);

	wait_for_completion(&arg.completion);

	if (wait_for(!list_empty(&rq->fence.cb_list), 10)) {
		struct drm_printer p = drm_info_printer(gt->i915->drm.dev);

		pr_err("igt/evict_vma kthread did not wait\n");
		intel_engine_dump(rq->engine, &p, "%s\n", rq->engine->name);

		intel_gt_set_wedged(gt);
		goto out_reset;
	}

out_reset:
	igt_global_reset_lock(gt);
	fake_hangcheck(gt, rq->engine->mask);
	igt_global_reset_unlock(gt);

	if (tsk) {
		struct intel_wedge_me w;

		/* The reset, even indirectly, should take less than 10ms. */
		intel_wedge_on_timeout(&w, gt, HZ / 10 /* 100ms */)
			err = kthread_stop(tsk);

		put_task_struct(tsk);
	}

out_rq:
	i915_request_put(rq);
out_obj:
	i915_gem_object_put(obj);
fini:
	hang_fini(&h);
	if (intel_gt_is_wedged(gt))
		return -EIO;

	return err;
}

static int igt_reset_evict_ggtt(void *arg)
{
	struct intel_gt *gt = arg;

	return __igt_reset_evict_vma(gt, &gt->ggtt->vm,
				     evict_vma, EXEC_OBJECT_WRITE);
}

static int igt_reset_evict_ppgtt(void *arg)
{
	struct intel_gt *gt = arg;
	struct i915_ppgtt *ppgtt;
	int err;

	/* aliasing == global gtt locking, covered above */
	if (INTEL_PPGTT(gt->i915) < INTEL_PPGTT_FULL)
		return 0;

	ppgtt = i915_ppgtt_create(gt);
	if (IS_ERR(ppgtt))
		return PTR_ERR(ppgtt);

	err = __igt_reset_evict_vma(gt, &ppgtt->vm,
				    evict_vma, EXEC_OBJECT_WRITE);
	i915_vm_put(&ppgtt->vm);

	return err;
}

static int igt_reset_evict_fence(void *arg)
{
	struct intel_gt *gt = arg;

	return __igt_reset_evict_vma(gt, &gt->ggtt->vm,
				     evict_fence, EXEC_OBJECT_NEEDS_FENCE);
}

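/* Wait for every engine other than @exclude to settle back to idle */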
static int wait_for_others(struct intel_gt *gt,
			   struct intel_engine_cs *exclude)
{
	struct intel_engine_cs *engine;
	enum intel_engine_id id;

	for_each_engine(engine, gt, id) {
		if (engine == exclude)
			continue;

		if (!wait_for_idle(engine))
			return -EIO;
	}

	return 0;
}

static int igt_reset_queue(void *arg)
{
	struct intel_gt *gt = arg;
	struct i915_gpu_error *global = &gt->i915->gpu_error;
	struct intel_engine_cs *engine;
	enum intel_engine_id id;
	struct hang h;
	int err;

	/* Check that we replay pending requests following a hang */

	igt_global_reset_lock(gt);

	err = hang_init(&h, gt);
	if (err)
		goto unlock;

	for_each_engine(engine, gt, id) {
		struct i915_request *prev;
		IGT_TIMEOUT(end_time);
		unsigned int count;

		if (!intel_engine_can_store_dword(engine))
			continue;

		prev = hang_create_request(&h, engine);
		if (IS_ERR(prev)) {
			err = PTR_ERR(prev);
			goto fini;
		}

		i915_request_get(prev);
		i915_request_add(prev);

		count = 0;
		do {
			struct i915_request *rq;
			unsigned int reset_count;

			rq = hang_create_request(&h, engine);
			if (IS_ERR(rq)) {
				err = PTR_ERR(rq);
				goto fini;
			}

			i915_request_get(rq);
			i915_request_add(rq);

			/*
			 * XXX We don't handle resetting the kernel context
			 * very well. If we trigger a device reset twice in
			 * quick succession while the kernel context is
			 * executing, we may end up skipping the breadcrumb.
			 * This is really only a problem for the selftest as
			 * normally there is a large interlude between resets
			 * (hangcheck), or we focus on resetting just one
			 * engine and so avoid repeatedly resetting innocents.
			 */
			err = wait_for_others(gt, engine);
			if (err) {
				pr_err("%s(%s): Failed to idle other inactive engines after device reset\n",
				       __func__, engine->name);
				i915_request_put(rq);
				i915_request_put(prev);

				GEM_TRACE_DUMP();
				intel_gt_set_wedged(gt);
				goto fini;
			}

			if (!wait_until_running(&h, prev)) {
				struct drm_printer p = drm_info_printer(gt->i915->drm.dev);

				pr_err("%s(%s): Failed to start request %llx, at %x\n",
				       __func__, engine->name,
				       prev->fence.seqno, hws_seqno(&h, prev));
				intel_engine_dump(engine, &p,
						  "%s\n", engine->name);

				i915_request_put(rq);
				i915_request_put(prev);

				intel_gt_set_wedged(gt);

				err = -EIO;
				goto fini;
			}

			reset_count = fake_hangcheck(gt, BIT(id));

			if (prev->fence.error != -EIO) {
				pr_err("GPU reset not recorded on hanging request [fence.error=%d]!\n",
				       prev->fence.error);
				i915_request_put(rq);
				i915_request_put(prev);
				err = -EINVAL;
				goto fini;
			}

			if (rq->fence.error) {
				pr_err("Fence error status not zero [%d] after unrelated reset\n",
				       rq->fence.error);
				i915_request_put(rq);
				i915_request_put(prev);
				err = -EINVAL;
				goto fini;
			}

			if (i915_reset_count(global) == reset_count) {
				pr_err("No GPU reset recorded!\n");
				i915_request_put(rq);
				i915_request_put(prev);
				err = -EINVAL;
				goto fini;
			}

			i915_request_put(prev);
			prev = rq;
			count++;
		} while (time_before(jiffies, end_time));
		pr_info("%s: Completed %d resets\n", engine->name, count);

		*h.batch = MI_BATCH_BUFFER_END;
		intel_gt_chipset_flush(engine->gt);

		i915_request_put(prev);

		err = igt_flush_test(gt->i915);
		if (err)
			break;
	}

fini:
	hang_fini(&h);
unlock:
	igt_global_reset_unlock(gt);

	if (intel_gt_is_wedged(gt))
		return -EIO;

	return err;
}

static int igt_handle_error(void *arg)
{
	struct intel_gt *gt = arg;
	struct i915_gpu_error *global = &gt->i915->gpu_error;
	struct intel_engine_cs *engine = gt->engine[RCS0];
	struct hang h;
	struct i915_request *rq;
	struct i915_gpu_coredump *error;
	int err;

	/* Check that we can issue a global GPU and engine reset */

	if (!intel_has_reset_engine(gt))
		return 0;

	if (!engine || !intel_engine_can_store_dword(engine))
		return 0;

	err = hang_init(&h, gt);
	if (err)
		return err;

	rq = hang_create_request(&h, engine);
	if (IS_ERR(rq)) {
		err = PTR_ERR(rq);
		goto err_fini;
	}

	i915_request_get(rq);
	i915_request_add(rq);

	if (!wait_until_running(&h, rq)) {
		struct drm_printer p = drm_info_printer(gt->i915->drm.dev);

		pr_err("%s: Failed to start request %llx, at %x\n",
		       __func__, rq->fence.seqno, hws_seqno(&h, rq));
		intel_engine_dump(rq->engine, &p, "%s\n", rq->engine->name);

		intel_gt_set_wedged(gt);

		err = -EIO;
		goto err_request;
	}

	/* Temporarily disable error capture */
	error = xchg(&global->first_error, (void *)-1);

	intel_gt_handle_error(gt, engine->mask, 0, NULL);

	xchg(&global->first_error, error);

	if (rq->fence.error != -EIO) {
		pr_err("Guilty request not identified!\n");
		err = -EINVAL;
		goto err_request;
	}

err_request:
	i915_request_put(rq);
err_fini:
	hang_fini(&h);
	return err;
}

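/*
 * Perform an engine reset from inside the atomic section described by @p,
 * with the submission tasklet disabled, to check that the reset never
 * needs to sleep.
 */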
static int __igt_atomic_reset_engine(struct intel_engine_cs *engine,
				     const struct igt_atomic_section *p,
				     const char *mode)
{
	struct tasklet_struct * const t = &engine->execlists.tasklet;
	int err;

	GEM_TRACE("i915_reset_engine(%s:%s) under %s\n",
		  engine->name, mode, p->name);

	tasklet_disable(t);
	p->critical_section_begin();

	err = intel_engine_reset(engine, NULL);

	p->critical_section_end();
	tasklet_enable(t);

	if (err)
		pr_err("i915_reset_engine(%s:%s) failed under %s\n",
		       engine->name, mode, p->name);

	return err;
}

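/*
 * Check the atomic engine reset both while the engine is idle and while it
 * is executing a hanging batch, then wait for the batch to be cleaned up.
 */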
static int igt_atomic_reset_engine(struct intel_engine_cs *engine,
				   const struct igt_atomic_section *p)
{
	struct i915_request *rq;
	struct hang h;
	int err;

	err = __igt_atomic_reset_engine(engine, p, "idle");
	if (err)
		return err;

	err = hang_init(&h, engine->gt);
	if (err)
		return err;

	rq = hang_create_request(&h, engine);
	if (IS_ERR(rq)) {
		err = PTR_ERR(rq);
		goto out;
	}

	i915_request_get(rq);
	i915_request_add(rq);

	if (wait_until_running(&h, rq)) {
		err = __igt_atomic_reset_engine(engine, p, "active");
	} else {
		pr_err("%s(%s): Failed to start request %llx, at %x\n",
		       __func__, engine->name,
		       rq->fence.seqno, hws_seqno(&h, rq));
		intel_gt_set_wedged(engine->gt);
		err = -EIO;
	}

	if (err == 0) {
		struct intel_wedge_me w;

		intel_wedge_on_timeout(&w, engine->gt, HZ / 20 /* 50ms */)
			i915_request_wait(rq, 0, MAX_SCHEDULE_TIMEOUT);
		if (intel_gt_is_wedged(engine->gt))
			err = -EIO;
	}

	i915_request_put(rq);
out:
	hang_fini(&h);
	return err;
}

static int igt_reset_engines_atomic(void *arg)
{
	struct intel_gt *gt = arg;
	const typeof(*igt_atomic_phases) *p;
	int err = 0;

	/* Check that engine resets are usable from atomic context */

	if (!intel_has_reset_engine(gt))
		return 0;

	if (intel_uc_uses_guc_submission(&gt->uc))
		return 0;

	igt_global_reset_lock(gt);

	/* Flush any requests before we get started and check basics */
	if (!igt_force_reset(gt))
		goto unlock;

	for (p = igt_atomic_phases; p->name; p++) {
		struct intel_engine_cs *engine;
		enum intel_engine_id id;

		for_each_engine(engine, gt, id) {
			err = igt_atomic_reset_engine(engine, p);
			if (err)
				goto out;
		}
	}

out:
	/* As we poke around the guts, do a full reset before continuing. */
	igt_force_reset(gt);
unlock:
	igt_global_reset_unlock(gt);

	return err;
}

int intel_hangcheck_live_selftests(struct drm_i915_private *i915)
{
	static const struct i915_subtest tests[] = {
		SUBTEST(igt_hang_sanitycheck),
		SUBTEST(igt_reset_nop),
		SUBTEST(igt_reset_nop_engine),
		SUBTEST(igt_reset_idle_engine),
		SUBTEST(igt_reset_active_engine),
		SUBTEST(igt_reset_engines),
		SUBTEST(igt_reset_engines_atomic),
		SUBTEST(igt_reset_queue),
		SUBTEST(igt_reset_wait),
		SUBTEST(igt_reset_evict_ggtt),
		SUBTEST(igt_reset_evict_ppgtt),
		SUBTEST(igt_reset_evict_fence),
		SUBTEST(igt_handle_error),
	};
	struct intel_gt *gt = &i915->gt;
	intel_wakeref_t wakeref;
	int err;

	if (!intel_has_gpu_reset(gt))
		return 0;

	if (intel_gt_is_wedged(gt))
		return -EIO; /* we're long past hope of a successful reset */

	wakeref = intel_runtime_pm_get(gt->uncore->rpm);

	err = intel_gt_live_subtests(tests, gt);

	intel_runtime_pm_put(gt->uncore->rpm, wakeref);

	return err;
}