18c2ecf20Sopenharmony_ci/*
28c2ecf20Sopenharmony_ci * SPDX-License-Identifier: MIT
38c2ecf20Sopenharmony_ci *
48c2ecf20Sopenharmony_ci * Copyright © 2008-2018 Intel Corporation
58c2ecf20Sopenharmony_ci */
68c2ecf20Sopenharmony_ci
78c2ecf20Sopenharmony_ci#include <linux/sched/mm.h>
88c2ecf20Sopenharmony_ci#include <linux/stop_machine.h>
98c2ecf20Sopenharmony_ci
108c2ecf20Sopenharmony_ci#include "display/intel_display_types.h"
118c2ecf20Sopenharmony_ci#include "display/intel_overlay.h"
128c2ecf20Sopenharmony_ci
138c2ecf20Sopenharmony_ci#include "gem/i915_gem_context.h"
148c2ecf20Sopenharmony_ci
158c2ecf20Sopenharmony_ci#include "i915_drv.h"
168c2ecf20Sopenharmony_ci#include "i915_gpu_error.h"
178c2ecf20Sopenharmony_ci#include "i915_irq.h"
188c2ecf20Sopenharmony_ci#include "intel_breadcrumbs.h"
198c2ecf20Sopenharmony_ci#include "intel_engine_pm.h"
208c2ecf20Sopenharmony_ci#include "intel_gt.h"
218c2ecf20Sopenharmony_ci#include "intel_gt_pm.h"
228c2ecf20Sopenharmony_ci#include "intel_reset.h"
238c2ecf20Sopenharmony_ci
248c2ecf20Sopenharmony_ci#include "uc/intel_guc.h"
258c2ecf20Sopenharmony_ci#include "uc/intel_guc_submission.h"
268c2ecf20Sopenharmony_ci
278c2ecf20Sopenharmony_ci#define RESET_MAX_RETRIES 3
288c2ecf20Sopenharmony_ci
298c2ecf20Sopenharmony_ci/* XXX How to handle concurrent GGTT updates using tiling registers? */
308c2ecf20Sopenharmony_ci#define RESET_UNDER_STOP_MACHINE 0
318c2ecf20Sopenharmony_ci
/* Set @set bits in @reg using an unlocked (caller holds forcewake) RMW. */
static void rmw_set_fw(struct intel_uncore *uncore, i915_reg_t reg, u32 set)
{
	intel_uncore_rmw_fw(uncore, reg, 0, set);
}
368c2ecf20Sopenharmony_ci
/* Clear @clr bits in @reg using an unlocked (caller holds forcewake) RMW. */
static void rmw_clear_fw(struct intel_uncore *uncore, i915_reg_t reg, u32 clr)
{
	intel_uncore_rmw_fw(uncore, reg, clr, 0);
}
418c2ecf20Sopenharmony_ci
/*
 * Skip (turn into no-ops, error -EIO) all requests queued on this engine
 * after @rq that belong to the same (hung) context, so a banned context
 * makes no further progress past the point of the hang.
 */
static void engine_skip_context(struct i915_request *rq)
{
	struct intel_engine_cs *engine = rq->engine;
	struct intel_context *hung_ctx = rq->context;

	/* Only requests already submitted sit on the engine's active list. */
	if (!i915_request_is_active(rq))
		return;

	lockdep_assert_held(&engine->active.lock);
	/* Continue from @rq: the hung request itself is handled by the caller. */
	list_for_each_entry_continue(rq, &engine->active.requests, sched.link)
		if (rq->context == hung_ctx) {
			i915_request_set_error_once(rq, -EIO);
			__i915_request_skip(rq);
		}
}
578c2ecf20Sopenharmony_ci
588c2ecf20Sopenharmony_cistatic void client_mark_guilty(struct i915_gem_context *ctx, bool banned)
598c2ecf20Sopenharmony_ci{
608c2ecf20Sopenharmony_ci	struct drm_i915_file_private *file_priv = ctx->file_priv;
618c2ecf20Sopenharmony_ci	unsigned long prev_hang;
628c2ecf20Sopenharmony_ci	unsigned int score;
638c2ecf20Sopenharmony_ci
648c2ecf20Sopenharmony_ci	if (IS_ERR_OR_NULL(file_priv))
658c2ecf20Sopenharmony_ci		return;
668c2ecf20Sopenharmony_ci
678c2ecf20Sopenharmony_ci	score = 0;
688c2ecf20Sopenharmony_ci	if (banned)
698c2ecf20Sopenharmony_ci		score = I915_CLIENT_SCORE_CONTEXT_BAN;
708c2ecf20Sopenharmony_ci
718c2ecf20Sopenharmony_ci	prev_hang = xchg(&file_priv->hang_timestamp, jiffies);
728c2ecf20Sopenharmony_ci	if (time_before(jiffies, prev_hang + I915_CLIENT_FAST_HANG_JIFFIES))
738c2ecf20Sopenharmony_ci		score += I915_CLIENT_SCORE_HANG_FAST;
748c2ecf20Sopenharmony_ci
758c2ecf20Sopenharmony_ci	if (score) {
768c2ecf20Sopenharmony_ci		atomic_add(score, &file_priv->ban_score);
778c2ecf20Sopenharmony_ci
788c2ecf20Sopenharmony_ci		drm_dbg(&ctx->i915->drm,
798c2ecf20Sopenharmony_ci			"client %s: gained %u ban score, now %u\n",
808c2ecf20Sopenharmony_ci			ctx->name, score,
818c2ecf20Sopenharmony_ci			atomic_read(&file_priv->ban_score));
828c2ecf20Sopenharmony_ci	}
838c2ecf20Sopenharmony_ci}
848c2ecf20Sopenharmony_ci
/*
 * Blame the context that submitted the hung request: bump its guilty
 * count, record the hang timestamp, and decide whether to ban it from
 * further submission. Returns true if the context ends up banned.
 */
static bool mark_guilty(struct i915_request *rq)
{
	struct i915_gem_context *ctx;
	unsigned long prev_hang;
	bool banned;
	int i;

	/* A closed context can never be reused; ban it outright. */
	if (intel_context_is_closed(rq->context)) {
		intel_context_set_banned(rq->context);
		return true;
	}

	/*
	 * The GEM context may be freed concurrently; only proceed if we
	 * can still take a reference on it under RCU.
	 */
	rcu_read_lock();
	ctx = rcu_dereference(rq->context->gem_context);
	if (ctx && !kref_get_unless_zero(&ctx->ref))
		ctx = NULL;
	rcu_read_unlock();
	if (!ctx)
		return intel_context_is_banned(rq->context);

	atomic_inc(&ctx->guilty_count);

	/* Cool contexts are too cool to be banned! (Used for reset testing.) */
	if (!i915_gem_context_is_bannable(ctx)) {
		banned = false;
		goto out;
	}

	drm_notice(&ctx->i915->drm,
		   "%s context reset due to GPU hang\n",
		   ctx->name);

	/* Record the timestamp for the last N hangs */
	prev_hang = ctx->hang_timestamp[0];
	for (i = 0; i < ARRAY_SIZE(ctx->hang_timestamp) - 1; i++)
		ctx->hang_timestamp[i] = ctx->hang_timestamp[i + 1];
	ctx->hang_timestamp[i] = jiffies;

	/* If we have hung N+1 times in rapid succession, we ban the context! */
	banned = !i915_gem_context_is_recoverable(ctx);
	if (time_before(jiffies, prev_hang + CONTEXT_FAST_HANG_JIFFIES))
		banned = true;
	if (banned) {
		drm_dbg(&ctx->i915->drm, "context %s: guilty %d, banned\n",
			ctx->name, atomic_read(&ctx->guilty_count));
		intel_context_set_banned(rq->context);
	}

	/* Propagate the blame to the owning client's ban score. */
	client_mark_guilty(ctx, banned);

out:
	i915_gem_context_put(ctx);
	return banned;
}
1398c2ecf20Sopenharmony_ci
/*
 * The request's context was not at fault for the reset; count the reset
 * against its active (innocent) tally rather than its guilty one.
 */
static void mark_innocent(struct i915_request *rq)
{
	struct i915_gem_context *ctx;

	/* RCU protects against concurrent freeing of the GEM context. */
	rcu_read_lock();
	ctx = rcu_dereference(rq->context->gem_context);
	if (ctx)
		atomic_inc(&ctx->active_count);
	rcu_read_unlock();
}
1508c2ecf20Sopenharmony_ci
/**
 * __i915_request_reset - handle an incomplete request after an engine reset
 * @rq: the oldest incomplete request on the hung engine
 * @guilty: true if @rq is blamed for the hang
 *
 * If guilty, @rq is skipped (turned into a no-op with error -EIO) and, if
 * its context becomes banned as a result, every later request of that
 * context on the engine is skipped too. If innocent, @rq is marked -EAGAIN
 * so it may be resubmitted once the engine recovers.
 */
void __i915_request_reset(struct i915_request *rq, bool guilty)
{
	RQ_TRACE(rq, "guilty? %s\n", yesno(guilty));

	/* A completed request needs no handling; caller must not pass one. */
	GEM_BUG_ON(i915_request_completed(rq));

	rcu_read_lock(); /* protect the GEM context */
	if (guilty) {
		i915_request_set_error_once(rq, -EIO);
		__i915_request_skip(rq);
		if (mark_guilty(rq))
			engine_skip_context(rq);
	} else {
		i915_request_set_error_once(rq, -EAGAIN);
		mark_innocent(rq);
	}
	rcu_read_unlock();
}
1698c2ecf20Sopenharmony_ci
1708c2ecf20Sopenharmony_cistatic bool i915_in_reset(struct pci_dev *pdev)
1718c2ecf20Sopenharmony_ci{
1728c2ecf20Sopenharmony_ci	u8 gdrst;
1738c2ecf20Sopenharmony_ci
1748c2ecf20Sopenharmony_ci	pci_read_config_byte(pdev, I915_GDRST, &gdrst);
1758c2ecf20Sopenharmony_ci	return gdrst & GRDOM_RESET_STATUS;
1768c2ecf20Sopenharmony_ci}
1778c2ecf20Sopenharmony_ci
/* gen3: full-chip reset driven through the GDRST byte in PCI config space. */
static int i915_do_reset(struct intel_gt *gt,
			 intel_engine_mask_t engine_mask,
			 unsigned int retry)
{
	struct pci_dev *pdev = gt->i915->drm.pdev;
	int err;

	/* Assert reset for at least 20 usec, and wait for acknowledgement. */
	pci_write_config_byte(pdev, I915_GDRST, GRDOM_RESET_ENABLE);
	udelay(50);
	err = wait_for_atomic(i915_in_reset(pdev), 50);

	/* Clear the reset request. */
	pci_write_config_byte(pdev, I915_GDRST, 0);
	udelay(50);
	/* Only wait for deassertion if the assertion was acknowledged. */
	if (!err)
		err = wait_for_atomic(!i915_in_reset(pdev), 50);

	return err;
}
1988c2ecf20Sopenharmony_ci
1998c2ecf20Sopenharmony_cistatic bool g4x_reset_complete(struct pci_dev *pdev)
2008c2ecf20Sopenharmony_ci{
2018c2ecf20Sopenharmony_ci	u8 gdrst;
2028c2ecf20Sopenharmony_ci
2038c2ecf20Sopenharmony_ci	pci_read_config_byte(pdev, I915_GDRST, &gdrst);
2048c2ecf20Sopenharmony_ci	return (gdrst & GRDOM_RESET_ENABLE) == 0;
2058c2ecf20Sopenharmony_ci}
2068c2ecf20Sopenharmony_ci
/* g33/pineview: request a full-chip reset and poll for HW completion. */
static int g33_do_reset(struct intel_gt *gt,
			intel_engine_mask_t engine_mask,
			unsigned int retry)
{
	struct pci_dev *pdev = gt->i915->drm.pdev;

	pci_write_config_byte(pdev, I915_GDRST, GRDOM_RESET_ENABLE);
	return wait_for_atomic(g4x_reset_complete(pdev), 50);
}
2168c2ecf20Sopenharmony_ci
/*
 * g4x (ctg/elk): reset the media and render domains in turn via GDRST,
 * with the VCP clock-gating workaround applied around the sequence.
 */
static int g4x_do_reset(struct intel_gt *gt,
			intel_engine_mask_t engine_mask,
			unsigned int retry)
{
	struct pci_dev *pdev = gt->i915->drm.pdev;
	struct intel_uncore *uncore = gt->uncore;
	int ret;

	/* WaVcpClkGateDisableForMediaReset:ctg,elk */
	rmw_set_fw(uncore, VDECCLK_GATE_D, VCP_UNIT_CLOCK_GATE_DISABLE);
	intel_uncore_posting_read_fw(uncore, VDECCLK_GATE_D);

	pci_write_config_byte(pdev, I915_GDRST,
			      GRDOM_MEDIA | GRDOM_RESET_ENABLE);
	ret =  wait_for_atomic(g4x_reset_complete(pdev), 50);
	if (ret) {
		drm_dbg(&gt->i915->drm, "Wait for media reset failed\n");
		goto out;
	}

	pci_write_config_byte(pdev, I915_GDRST,
			      GRDOM_RENDER | GRDOM_RESET_ENABLE);
	ret =  wait_for_atomic(g4x_reset_complete(pdev), 50);
	if (ret) {
		drm_dbg(&gt->i915->drm, "Wait for render reset failed\n");
		goto out;
	}

out:
	/* Always withdraw the reset request and undo the workaround. */
	pci_write_config_byte(pdev, I915_GDRST, 0);

	rmw_clear_fw(uncore, VDECCLK_GATE_D, VCP_UNIT_CLOCK_GATE_DISABLE);
	intel_uncore_posting_read_fw(uncore, VDECCLK_GATE_D);

	return ret;
}
2538c2ecf20Sopenharmony_ci
/* gen5 (ironlake): reset render then media domains via the ILK_GDSR MMIO. */
static int ilk_do_reset(struct intel_gt *gt, intel_engine_mask_t engine_mask,
			unsigned int retry)
{
	struct intel_uncore *uncore = gt->uncore;
	int ret;

	intel_uncore_write_fw(uncore, ILK_GDSR,
			      ILK_GRDOM_RENDER | ILK_GRDOM_RESET_ENABLE);
	/* The HW clears the enable bit once the domain reset completes. */
	ret = __intel_wait_for_register_fw(uncore, ILK_GDSR,
					   ILK_GRDOM_RESET_ENABLE, 0,
					   5000, 0,
					   NULL);
	if (ret) {
		drm_dbg(&gt->i915->drm, "Wait for render reset failed\n");
		goto out;
	}

	intel_uncore_write_fw(uncore, ILK_GDSR,
			      ILK_GRDOM_MEDIA | ILK_GRDOM_RESET_ENABLE);
	ret = __intel_wait_for_register_fw(uncore, ILK_GDSR,
					   ILK_GRDOM_RESET_ENABLE, 0,
					   5000, 0,
					   NULL);
	if (ret) {
		drm_dbg(&gt->i915->drm, "Wait for media reset failed\n");
		goto out;
	}

out:
	/* Always clear the request and post the write before returning. */
	intel_uncore_write_fw(uncore, ILK_GDSR, 0);
	intel_uncore_posting_read_fw(uncore, ILK_GDSR);
	return ret;
}
2878c2ecf20Sopenharmony_ci
/*
 * Reset the hardware domains (GENX_GRDOM_*) specified by mask.
 * Returns 0 on success or the error from waiting for the GDRST ack.
 */
static int gen6_hw_domain_reset(struct intel_gt *gt, u32 hw_domain_mask)
{
	struct intel_uncore *uncore = gt->uncore;
	int loops = 2;
	int err;

	/*
	 * GEN6_GDRST is not in the gt power well, no need to check
	 * for fifo space for the write or forcewake the chip for
	 * the read
	 */
	do {
		intel_uncore_write_fw(uncore, GEN6_GDRST, hw_domain_mask);

		/*
		 * Wait for the device to ack the reset requests.
		 *
		 * On some platforms, e.g. Jasperlake, we see that the
		 * engine register state is not cleared until shortly after
		 * GDRST reports completion, causing a failure as we try
		 * to immediately resume while the internal state is still
		 * in flux. If we immediately repeat the reset, the second
		 * reset appears to serialise with the first, and since
		 * it is a no-op, the registers should retain their reset
		 * value. However, there is still a concern that upon
		 * leaving the second reset, the internal engine state
		 * is still in flux and not ready for resuming.
		 */
		err = __intel_wait_for_register_fw(uncore, GEN6_GDRST,
						   hw_domain_mask, 0,
						   2000, 0,
						   NULL);
	} while (err == 0 && --loops);
	if (err)
		drm_dbg(&gt->i915->drm,
			"Wait for 0x%08x engines reset failed\n",
			hw_domain_mask);

	/*
	 * As we have observed that the engine state is still volatile
	 * after GDRST is acked, impose a small delay to let everything settle.
	 */
	udelay(50);

	return err;
}
3358c2ecf20Sopenharmony_ci
3368c2ecf20Sopenharmony_cistatic int gen6_reset_engines(struct intel_gt *gt,
3378c2ecf20Sopenharmony_ci			      intel_engine_mask_t engine_mask,
3388c2ecf20Sopenharmony_ci			      unsigned int retry)
3398c2ecf20Sopenharmony_ci{
3408c2ecf20Sopenharmony_ci	static const u32 hw_engine_mask[] = {
3418c2ecf20Sopenharmony_ci		[RCS0]  = GEN6_GRDOM_RENDER,
3428c2ecf20Sopenharmony_ci		[BCS0]  = GEN6_GRDOM_BLT,
3438c2ecf20Sopenharmony_ci		[VCS0]  = GEN6_GRDOM_MEDIA,
3448c2ecf20Sopenharmony_ci		[VCS1]  = GEN8_GRDOM_MEDIA2,
3458c2ecf20Sopenharmony_ci		[VECS0] = GEN6_GRDOM_VECS,
3468c2ecf20Sopenharmony_ci	};
3478c2ecf20Sopenharmony_ci	struct intel_engine_cs *engine;
3488c2ecf20Sopenharmony_ci	u32 hw_mask;
3498c2ecf20Sopenharmony_ci
3508c2ecf20Sopenharmony_ci	if (engine_mask == ALL_ENGINES) {
3518c2ecf20Sopenharmony_ci		hw_mask = GEN6_GRDOM_FULL;
3528c2ecf20Sopenharmony_ci	} else {
3538c2ecf20Sopenharmony_ci		intel_engine_mask_t tmp;
3548c2ecf20Sopenharmony_ci
3558c2ecf20Sopenharmony_ci		hw_mask = 0;
3568c2ecf20Sopenharmony_ci		for_each_engine_masked(engine, gt, engine_mask, tmp) {
3578c2ecf20Sopenharmony_ci			GEM_BUG_ON(engine->id >= ARRAY_SIZE(hw_engine_mask));
3588c2ecf20Sopenharmony_ci			hw_mask |= hw_engine_mask[engine->id];
3598c2ecf20Sopenharmony_ci		}
3608c2ecf20Sopenharmony_ci	}
3618c2ecf20Sopenharmony_ci
3628c2ecf20Sopenharmony_ci	return gen6_hw_domain_reset(gt, hw_mask);
3638c2ecf20Sopenharmony_ci}
3648c2ecf20Sopenharmony_ci
/*
 * Lock the Scaler & Format Converter shared by a video engine before a
 * per-engine reset. If the SFC is in use by @engine, force-lock it and
 * add its reset-domain bit to @hw_mask so the SFC is reset alongside the
 * engine. Returns 0 on success (including "no SFC involved") or the
 * error from waiting for the lock acknowledgement.
 */
static int gen11_lock_sfc(struct intel_engine_cs *engine, u32 *hw_mask)
{
	struct intel_uncore *uncore = engine->uncore;
	u8 vdbox_sfc_access = engine->gt->info.vdbox_sfc_access;
	i915_reg_t sfc_forced_lock, sfc_forced_lock_ack;
	u32 sfc_forced_lock_bit, sfc_forced_lock_ack_bit;
	i915_reg_t sfc_usage;
	u32 sfc_usage_bit;
	u32 sfc_reset_bit;
	int ret;

	/* Pick the register set for this engine class; others have no SFC. */
	switch (engine->class) {
	case VIDEO_DECODE_CLASS:
		/* Only some VDBOX instances are fused with SFC access. */
		if ((BIT(engine->instance) & vdbox_sfc_access) == 0)
			return 0;

		sfc_forced_lock = GEN11_VCS_SFC_FORCED_LOCK(engine);
		sfc_forced_lock_bit = GEN11_VCS_SFC_FORCED_LOCK_BIT;

		sfc_forced_lock_ack = GEN11_VCS_SFC_LOCK_STATUS(engine);
		sfc_forced_lock_ack_bit  = GEN11_VCS_SFC_LOCK_ACK_BIT;

		sfc_usage = GEN11_VCS_SFC_LOCK_STATUS(engine);
		sfc_usage_bit = GEN11_VCS_SFC_USAGE_BIT;
		sfc_reset_bit = GEN11_VCS_SFC_RESET_BIT(engine->instance);
		break;

	case VIDEO_ENHANCEMENT_CLASS:
		sfc_forced_lock = GEN11_VECS_SFC_FORCED_LOCK(engine);
		sfc_forced_lock_bit = GEN11_VECS_SFC_FORCED_LOCK_BIT;

		sfc_forced_lock_ack = GEN11_VECS_SFC_LOCK_ACK(engine);
		sfc_forced_lock_ack_bit  = GEN11_VECS_SFC_LOCK_ACK_BIT;

		sfc_usage = GEN11_VECS_SFC_USAGE(engine);
		sfc_usage_bit = GEN11_VECS_SFC_USAGE_BIT;
		sfc_reset_bit = GEN11_VECS_SFC_RESET_BIT(engine->instance);
		break;

	default:
		return 0;
	}

	/*
	 * If the engine is using a SFC, tell the engine that a software reset
	 * is going to happen. The engine will then try to force lock the SFC.
	 * If SFC ends up being locked to the engine we want to reset, we have
	 * to reset it as well (we will unlock it once the reset sequence is
	 * completed).
	 */
	if (!(intel_uncore_read_fw(uncore, sfc_usage) & sfc_usage_bit))
		return 0;

	rmw_set_fw(uncore, sfc_forced_lock, sfc_forced_lock_bit);

	ret = __intel_wait_for_register_fw(uncore,
					   sfc_forced_lock_ack,
					   sfc_forced_lock_ack_bit,
					   sfc_forced_lock_ack_bit,
					   1000, 0, NULL);

	/* Was the SFC released while we were trying to lock it? */
	/*
	 * NOTE(review): the usage re-check deliberately precedes the ret
	 * check so a released SFC short-circuits to success; the forced-lock
	 * bit is cleaned up later by gen11_unlock_sfc() regardless.
	 */
	if (!(intel_uncore_read_fw(uncore, sfc_usage) & sfc_usage_bit))
		return 0;

	if (ret) {
		drm_dbg(&engine->i915->drm,
			"Wait for SFC forced lock ack failed\n");
		return ret;
	}

	/* SFC is locked to us: it must be reset together with the engine. */
	*hw_mask |= sfc_reset_bit;
	return 0;
}
4398c2ecf20Sopenharmony_ci
/*
 * Release the forced SFC lock taken (or attempted) by gen11_lock_sfc()
 * once the reset sequence has completed. No-op for engines without SFC.
 */
static void gen11_unlock_sfc(struct intel_engine_cs *engine)
{
	struct intel_uncore *uncore = engine->uncore;
	u8 vdbox_sfc_access = engine->gt->info.vdbox_sfc_access;
	i915_reg_t sfc_forced_lock;
	u32 sfc_forced_lock_bit;

	switch (engine->class) {
	case VIDEO_DECODE_CLASS:
		/* Mirror the fusing check in gen11_lock_sfc(). */
		if ((BIT(engine->instance) & vdbox_sfc_access) == 0)
			return;

		sfc_forced_lock = GEN11_VCS_SFC_FORCED_LOCK(engine);
		sfc_forced_lock_bit = GEN11_VCS_SFC_FORCED_LOCK_BIT;
		break;

	case VIDEO_ENHANCEMENT_CLASS:
		sfc_forced_lock = GEN11_VECS_SFC_FORCED_LOCK(engine);
		sfc_forced_lock_bit = GEN11_VECS_SFC_FORCED_LOCK_BIT;
		break;

	default:
		return;
	}

	rmw_clear_fw(uncore, sfc_forced_lock, sfc_forced_lock_bit);
}
4678c2ecf20Sopenharmony_ci
/*
 * gen11+: build the GDRST domain mask for the requested engines, locking
 * any SFC units they use so those are reset too, then perform the reset
 * and release the SFC locks.
 */
static int gen11_reset_engines(struct intel_gt *gt,
			       intel_engine_mask_t engine_mask,
			       unsigned int retry)
{
	static const u32 hw_engine_mask[] = {
		[RCS0]  = GEN11_GRDOM_RENDER,
		[BCS0]  = GEN11_GRDOM_BLT,
		[VCS0]  = GEN11_GRDOM_MEDIA,
		[VCS1]  = GEN11_GRDOM_MEDIA2,
		[VCS2]  = GEN11_GRDOM_MEDIA3,
		[VCS3]  = GEN11_GRDOM_MEDIA4,
		[VECS0] = GEN11_GRDOM_VECS,
		[VECS1] = GEN11_GRDOM_VECS2,
	};
	struct intel_engine_cs *engine;
	intel_engine_mask_t tmp;
	u32 hw_mask;
	int ret;

	if (engine_mask == ALL_ENGINES) {
		/* Full-chip reset covers the SFCs; no per-engine locking. */
		hw_mask = GEN11_GRDOM_FULL;
	} else {
		hw_mask = 0;
		for_each_engine_masked(engine, gt, engine_mask, tmp) {
			GEM_BUG_ON(engine->id >= ARRAY_SIZE(hw_engine_mask));
			hw_mask |= hw_engine_mask[engine->id];
			ret = gen11_lock_sfc(engine, &hw_mask);
			if (ret)
				goto sfc_unlock;
		}
	}

	ret = gen6_hw_domain_reset(gt, hw_mask);

sfc_unlock:
	/*
	 * We unlock the SFC based on the lock status and not the result of
	 * gen11_lock_sfc to make sure that we clean properly if something
	 * wrong happened during the lock (e.g. lock acquired after timeout
	 * expiration).
	 */
	if (engine_mask != ALL_ENGINES)
		for_each_engine_masked(engine, gt, engine_mask, tmp)
			gen11_unlock_sfc(engine);

	return ret;
}
5158c2ecf20Sopenharmony_ci
/*
 * Perform the gen8+ ready-for-reset handshake with the engine: request a
 * reset via RING_RESET_CTL and wait for the engine to report it is ready.
 * Returns 0 when ready (or when the handshake was already complete).
 */
static int gen8_engine_reset_prepare(struct intel_engine_cs *engine)
{
	struct intel_uncore *uncore = engine->uncore;
	const i915_reg_t reg = RING_RESET_CTL(engine->mmio_base);
	u32 request, mask, ack;
	int ret;

	ack = intel_uncore_read_fw(uncore, reg);
	if (ack & RESET_CTL_CAT_ERROR) {
		/*
		 * For catastrophic errors, ready-for-reset sequence
		 * needs to be bypassed: HAS#396813
		 */
		request = RESET_CTL_CAT_ERROR;
		mask = RESET_CTL_CAT_ERROR;

		/* Catastrophic errors need to be cleared by HW */
		ack = 0;
	} else if (!(ack & RESET_CTL_READY_TO_RESET)) {
		request = RESET_CTL_REQUEST_RESET;
		mask = RESET_CTL_READY_TO_RESET;
		ack = RESET_CTL_READY_TO_RESET;
	} else {
		/* Engine already signalled ready-to-reset; nothing to do. */
		return 0;
	}

	/* RESET_CTL is a masked register: set the request bit and poll. */
	intel_uncore_write_fw(uncore, reg, _MASKED_BIT_ENABLE(request));
	ret = __intel_wait_for_register_fw(uncore, reg, mask, ack,
					   700, 0, NULL);
	if (ret)
		drm_err(&engine->i915->drm,
			"%s reset request timed out: {request: %08x, RESET_CTL: %08x}\n",
			engine->name, request,
			intel_uncore_read_fw(uncore, reg));

	return ret;
}
5538c2ecf20Sopenharmony_ci
5548c2ecf20Sopenharmony_cistatic void gen8_engine_reset_cancel(struct intel_engine_cs *engine)
5558c2ecf20Sopenharmony_ci{
5568c2ecf20Sopenharmony_ci	intel_uncore_write_fw(engine->uncore,
5578c2ecf20Sopenharmony_ci			      RING_RESET_CTL(engine->mmio_base),
5588c2ecf20Sopenharmony_ci			      _MASKED_BIT_DISABLE(RESET_CTL_REQUEST_RESET));
5598c2ecf20Sopenharmony_ci}
5608c2ecf20Sopenharmony_ci
/*
 * gen8+: run the ready-for-reset handshake on each engine, then dispatch
 * to the generation-specific domain reset, and finally cancel the
 * handshake requests. On retries, a non-ready engine no longer aborts
 * the reset (see comment below).
 */
static int gen8_reset_engines(struct intel_gt *gt,
			      intel_engine_mask_t engine_mask,
			      unsigned int retry)
{
	struct intel_engine_cs *engine;
	const bool reset_non_ready = retry >= 1;
	intel_engine_mask_t tmp;
	int ret;

	for_each_engine_masked(engine, gt, engine_mask, tmp) {
		ret = gen8_engine_reset_prepare(engine);
		if (ret && !reset_non_ready)
			goto skip_reset;

		/*
		 * If this is not the first failed attempt to prepare,
		 * we decide to proceed anyway.
		 *
		 * By doing so we risk context corruption and with
		 * some gens (kbl), possible system hang if reset
		 * happens during active bb execution.
		 *
		 * We rather take context corruption instead of
		 * failed reset with a wedged driver/gpu. And
		 * active bb execution case should be covered by
		 * stop_engines() we have before the reset.
		 */
	}

	if (INTEL_GEN(gt->i915) >= 11)
		ret = gen11_reset_engines(gt, engine_mask, retry);
	else
		ret = gen6_reset_engines(gt, engine_mask, retry);

skip_reset:
	/* Always withdraw the reset requests, even on failure. */
	for_each_engine_masked(engine, gt, engine_mask, tmp)
		gen8_engine_reset_cancel(engine);

	return ret;
}
6018c2ecf20Sopenharmony_ci
/* No-op reset used by mock GTs in selftests; always reports success. */
static int mock_reset(struct intel_gt *gt,
		      intel_engine_mask_t mask,
		      unsigned int retry)
{
	return 0;
}
6088c2ecf20Sopenharmony_ci
6098c2ecf20Sopenharmony_citypedef int (*reset_func)(struct intel_gt *,
6108c2ecf20Sopenharmony_ci			  intel_engine_mask_t engine_mask,
6118c2ecf20Sopenharmony_ci			  unsigned int retry);
6128c2ecf20Sopenharmony_ci
6138c2ecf20Sopenharmony_cistatic reset_func intel_get_gpu_reset(const struct intel_gt *gt)
6148c2ecf20Sopenharmony_ci{
6158c2ecf20Sopenharmony_ci	struct drm_i915_private *i915 = gt->i915;
6168c2ecf20Sopenharmony_ci
6178c2ecf20Sopenharmony_ci	if (is_mock_gt(gt))
6188c2ecf20Sopenharmony_ci		return mock_reset;
6198c2ecf20Sopenharmony_ci	else if (INTEL_GEN(i915) >= 8)
6208c2ecf20Sopenharmony_ci		return gen8_reset_engines;
6218c2ecf20Sopenharmony_ci	else if (INTEL_GEN(i915) >= 6)
6228c2ecf20Sopenharmony_ci		return gen6_reset_engines;
6238c2ecf20Sopenharmony_ci	else if (INTEL_GEN(i915) >= 5)
6248c2ecf20Sopenharmony_ci		return ilk_do_reset;
6258c2ecf20Sopenharmony_ci	else if (IS_G4X(i915))
6268c2ecf20Sopenharmony_ci		return g4x_do_reset;
6278c2ecf20Sopenharmony_ci	else if (IS_G33(i915) || IS_PINEVIEW(i915))
6288c2ecf20Sopenharmony_ci		return g33_do_reset;
6298c2ecf20Sopenharmony_ci	else if (INTEL_GEN(i915) >= 3)
6308c2ecf20Sopenharmony_ci		return i915_do_reset;
6318c2ecf20Sopenharmony_ci	else
6328c2ecf20Sopenharmony_ci		return NULL;
6338c2ecf20Sopenharmony_ci}
6348c2ecf20Sopenharmony_ci
/*
 * Perform the hardware reset of @engine_mask, retrying (for a full-GPU
 * reset only) while the hardware keeps timing out. Returns 0 on success,
 * -ENODEV if this device has no reset method, or the backend's error.
 */
int __intel_gt_reset(struct intel_gt *gt, intel_engine_mask_t engine_mask)
{
	/* Only a full-GPU reset is worth retrying; per-engine gets one shot. */
	const int retries = engine_mask == ALL_ENGINES ? RESET_MAX_RETRIES : 1;
	reset_func reset;
	int ret = -ETIMEDOUT;
	int retry;

	reset = intel_get_gpu_reset(gt);
	if (!reset)
		return -ENODEV;

	/*
	 * If the power well sleeps during the reset, the reset
	 * request may be dropped and never completes (causing -EIO).
	 */
	intel_uncore_forcewake_get(gt->uncore, FORCEWAKE_ALL);
	for (retry = 0; ret == -ETIMEDOUT && retry < retries; retry++) {
		GT_TRACE(gt, "engine_mask=%x\n", engine_mask);
		/*
		 * NOTE(review): preemption is disabled around the backend,
		 * presumably so the request/ack handshake with the hardware
		 * is not stretched past its timeout — confirm.
		 */
		preempt_disable();
		ret = reset(gt, engine_mask, retry);
		preempt_enable();
	}
	intel_uncore_forcewake_put(gt->uncore, FORCEWAKE_ALL);

	return ret;
}
6618c2ecf20Sopenharmony_ci
6628c2ecf20Sopenharmony_cibool intel_has_gpu_reset(const struct intel_gt *gt)
6638c2ecf20Sopenharmony_ci{
6648c2ecf20Sopenharmony_ci	if (!gt->i915->params.reset)
6658c2ecf20Sopenharmony_ci		return NULL;
6668c2ecf20Sopenharmony_ci
6678c2ecf20Sopenharmony_ci	return intel_get_gpu_reset(gt);
6688c2ecf20Sopenharmony_ci}
6698c2ecf20Sopenharmony_ci
6708c2ecf20Sopenharmony_cibool intel_has_reset_engine(const struct intel_gt *gt)
6718c2ecf20Sopenharmony_ci{
6728c2ecf20Sopenharmony_ci	if (gt->i915->params.reset < 2)
6738c2ecf20Sopenharmony_ci		return false;
6748c2ecf20Sopenharmony_ci
6758c2ecf20Sopenharmony_ci	return INTEL_INFO(gt->i915)->has_reset_engine;
6768c2ecf20Sopenharmony_ci}
6778c2ecf20Sopenharmony_ci
6788c2ecf20Sopenharmony_ciint intel_reset_guc(struct intel_gt *gt)
6798c2ecf20Sopenharmony_ci{
6808c2ecf20Sopenharmony_ci	u32 guc_domain =
6818c2ecf20Sopenharmony_ci		INTEL_GEN(gt->i915) >= 11 ? GEN11_GRDOM_GUC : GEN9_GRDOM_GUC;
6828c2ecf20Sopenharmony_ci	int ret;
6838c2ecf20Sopenharmony_ci
6848c2ecf20Sopenharmony_ci	GEM_BUG_ON(!HAS_GT_UC(gt->i915));
6858c2ecf20Sopenharmony_ci
6868c2ecf20Sopenharmony_ci	intel_uncore_forcewake_get(gt->uncore, FORCEWAKE_ALL);
6878c2ecf20Sopenharmony_ci	ret = gen6_hw_domain_reset(gt, guc_domain);
6888c2ecf20Sopenharmony_ci	intel_uncore_forcewake_put(gt->uncore, FORCEWAKE_ALL);
6898c2ecf20Sopenharmony_ci
6908c2ecf20Sopenharmony_ci	return ret;
6918c2ecf20Sopenharmony_ci}
6928c2ecf20Sopenharmony_ci
/*
 * Quiesce one engine before reset: ensure the irq handler finishes and
 * does not run again. The forcewake taken here is dropped again in
 * reset_finish_engine().
 */
static void reset_prepare_engine(struct intel_engine_cs *engine)
{
	/*
	 * During the reset sequence, we must prevent the engine from
	 * entering RC6. As the context state is undefined until we restart
	 * the engine, if it does enter RC6 during the reset, the state
	 * written to the powercontext is undefined and so we may lose
	 * GPU state upon resume, i.e. fail to restart after a reset.
	 */
	intel_uncore_forcewake_get(engine->uncore, FORCEWAKE_ALL);
	/* Optional backend hook to stop submission on this engine. */
	if (engine->reset.prepare)
		engine->reset.prepare(engine);
}
7108c2ecf20Sopenharmony_ci
/*
 * Zap all userspace mmaps of fenced (tiled) GGTT ranges so that any access
 * after the reset refaults and observes the restored fence state.
 */
static void revoke_mmaps(struct intel_gt *gt)
{
	int i;

	for (i = 0; i < gt->ggtt->num_fences; i++) {
		struct drm_vma_offset_node *node;
		struct i915_vma *vma;
		u64 vma_offset;

		/* Lockless snapshot of the fence owner. */
		vma = READ_ONCE(gt->ggtt->fence_regs[i].vma);
		if (!vma)
			continue;

		/* Only bother if userspace actually has it faulted in. */
		if (!i915_vma_has_userfault(vma))
			continue;

		GEM_BUG_ON(vma->fence != &gt->ggtt->fence_regs[i]);

		if (!vma->mmo)
			continue;

		node = &vma->mmo->vma_node;
		/* Partial views start at an offset within the object. */
		vma_offset = vma->ggtt_view.partial.offset << PAGE_SHIFT;

		/* Final '1' = even_cows: also drop private COW copies. */
		unmap_mapping_range(gt->i915->drm.anon_inode->i_mapping,
				    drm_vma_node_offset_addr(node) + vma_offset,
				    vma->size,
				    1);
	}
}
7418c2ecf20Sopenharmony_ci
/*
 * Quiesce all engines (and the uC) ahead of a reset. Returns the mask of
 * engines that were awake — for each of those we now hold an extra pm
 * reference, released again in reset_finish(). Note reset_prepare_engine()
 * runs for every engine, awake or not.
 */
static intel_engine_mask_t reset_prepare(struct intel_gt *gt)
{
	struct intel_engine_cs *engine;
	intel_engine_mask_t awake = 0;
	enum intel_engine_id id;

	for_each_engine(engine, gt, id) {
		if (intel_engine_pm_get_if_awake(engine))
			awake |= engine->mask;
		reset_prepare_engine(engine);
	}

	intel_uc_reset_prepare(&gt->uc);

	return awake;
}
7588c2ecf20Sopenharmony_ci
/* Revoke userspace mmaps before the reset clobbers GGTT/fence state. */
static void gt_revoke(struct intel_gt *gt)
{
	revoke_mmaps(gt);
}
7638c2ecf20Sopenharmony_ci
/*
 * Restore software/hardware state after a successful hardware reset.
 * @stalled_mask selects the engines that had a guilty (stalled) request;
 * each engine's reset handler uses that to decide how to treat the
 * request it was executing.
 */
static int gt_reset(struct intel_gt *gt, intel_engine_mask_t stalled_mask)
{
	struct intel_engine_cs *engine;
	enum intel_engine_id id;
	int err;

	/*
	 * Everything depends on having the GTT running, so we need to start
	 * there.
	 */
	err = i915_ggtt_enable_hw(gt->i915);
	if (err)
		return err;

	for_each_engine(engine, gt, id)
		__intel_engine_reset(engine, stalled_mask & engine->mask);

	/* Rewrite the fence registers invalidated by the reset. */
	intel_ggtt_restore_fences(gt->ggtt);

	return err;
}
7858c2ecf20Sopenharmony_ci
/* Undo reset_prepare_engine(): restart the backend and drop forcewake. */
static void reset_finish_engine(struct intel_engine_cs *engine)
{
	if (engine->reset.finish)
		engine->reset.finish(engine);
	intel_uncore_forcewake_put(engine->uncore, FORCEWAKE_ALL);

	/* Kick the breadcrumbs for any requests completed by the reset. */
	intel_engine_signal_breadcrumbs(engine);
}
7948c2ecf20Sopenharmony_ci
/*
 * Undo reset_prepare(): finish every engine and drop the extra pm
 * reference taken for each engine recorded in @awake.
 */
static void reset_finish(struct intel_gt *gt, intel_engine_mask_t awake)
{
	struct intel_engine_cs *engine;
	enum intel_engine_id id;

	for_each_engine(engine, gt, id) {
		reset_finish_engine(engine);
		if (awake & engine->mask)
			intel_engine_pm_put(engine);
	}
}
8068c2ecf20Sopenharmony_ci
/*
 * Replacement for engine->submit_request while wedged: immediately fail
 * the request with -EIO and mark it complete so that waiters are released
 * rather than blocking on a dead GPU.
 */
static void nop_submit_request(struct i915_request *request)
{
	struct intel_engine_cs *engine = request->engine;
	unsigned long flags;

	RQ_TRACE(request, "-EIO\n");
	i915_request_set_error_once(request, -EIO);

	/* Submit and complete under the engine lock, as real submission would. */
	spin_lock_irqsave(&engine->active.lock, flags);
	__i915_request_submit(request);
	i915_request_mark_complete(request);
	spin_unlock_irqrestore(&engine->active.lock, flags);

	/* Wake anyone waiting on the now-completed request. */
	intel_engine_signal_breadcrumbs(engine);
}
8228c2ecf20Sopenharmony_ci
/*
 * Declare the GPU wedged: stop the engines, fail all in-flight requests
 * with -EIO and route all future submission through nop_submit_request().
 * Caller holds gt->reset.mutex.
 */
static void __intel_gt_set_wedged(struct intel_gt *gt)
{
	struct intel_engine_cs *engine;
	intel_engine_mask_t awake;
	enum intel_engine_id id;

	/* Already wedged; nothing more to do. */
	if (test_bit(I915_WEDGED, &gt->reset.flags))
		return;

	GT_TRACE(gt, "start\n");

	/*
	 * First, stop submission to hw, but do not yet complete requests by
	 * rolling the global seqno forward (since this would complete requests
	 * for which we haven't set the fence error to EIO yet).
	 */
	awake = reset_prepare(gt);

	/* Even if the GPU reset fails, it should still stop the engines */
	if (!INTEL_INFO(gt->i915)->gpu_reset_clobbers_display)
		__intel_gt_reset(gt, ALL_ENGINES);

	for_each_engine(engine, gt, id)
		engine->submit_request = nop_submit_request;

	/*
	 * Make sure no request can slip through without getting completed by
	 * either this call here to intel_engine_write_global_seqno, or the one
	 * in nop_submit_request.
	 */
	synchronize_rcu_expedited();
	set_bit(I915_WEDGED, &gt->reset.flags);

	/* Mark all executing requests as skipped */
	for_each_engine(engine, gt, id)
		if (engine->reset.cancel)
			engine->reset.cancel(engine);

	reset_finish(gt, awake);

	GT_TRACE(gt, "end\n");
}
8658c2ecf20Sopenharmony_ci
/*
 * intel_gt_set_wedged - mark the GT as irrecoverably hung
 *
 * Public wrapper around __intel_gt_set_wedged(): takes a runtime-pm
 * wakeref and the reset mutex, and on debug builds dumps the state of
 * every busy engine before wedging.
 */
void intel_gt_set_wedged(struct intel_gt *gt)
{
	intel_wakeref_t wakeref;

	/* Unlocked early-out; checked again under the mutex. */
	if (test_bit(I915_WEDGED, &gt->reset.flags))
		return;

	wakeref = intel_runtime_pm_get(gt->uncore->rpm);
	mutex_lock(&gt->reset.mutex);

	if (GEM_SHOW_DEBUG()) {
		struct drm_printer p = drm_debug_printer(__func__);
		struct intel_engine_cs *engine;
		enum intel_engine_id id;

		drm_printf(&p, "called from %pS\n", (void *)_RET_IP_);
		for_each_engine(engine, gt, id) {
			if (intel_engine_is_idle(engine))
				continue;

			intel_engine_dump(engine, &p, "%s\n", engine->name);
		}
	}

	__intel_gt_set_wedged(gt);

	mutex_unlock(&gt->reset.mutex);
	intel_runtime_pm_put(gt->uncore->rpm, wakeref);
}
8958c2ecf20Sopenharmony_ci
/*
 * Attempt to recover from a wedged state: flush all pending requests,
 * reset the GPU and restore normal submission. Returns true on success,
 * false if the device cannot be recovered. Caller holds gt->reset.mutex.
 */
static bool __intel_gt_unset_wedged(struct intel_gt *gt)
{
	struct intel_gt_timelines *timelines = &gt->timelines;
	struct intel_timeline *tl;
	bool ok;

	if (!test_bit(I915_WEDGED, &gt->reset.flags))
		return true;

	/* Never fully initialised, recovery impossible */
	if (intel_gt_has_unrecoverable_error(gt))
		return false;

	GT_TRACE(gt, "start\n");

	/*
	 * Before unwedging, make sure that all pending operations
	 * are flushed and errored out - we may have requests waiting upon
	 * third party fences. We marked all inflight requests as EIO, and
	 * every execbuf since returned EIO, for consistency we want all
	 * the currently pending requests to also be marked as EIO, which
	 * is done inside our nop_submit_request - and so we must wait.
	 *
	 * No more can be submitted until we reset the wedged bit.
	 */
	spin_lock(&timelines->lock);
	list_for_each_entry(tl, &timelines->active_list, link) {
		struct dma_fence *fence;

		fence = i915_active_fence_get(&tl->last_request);
		if (!fence)
			continue;

		/* Drop the lock: we cannot sleep on the fence while holding it. */
		spin_unlock(&timelines->lock);

		/*
		 * All internal dependencies (i915_requests) will have
		 * been flushed by the set-wedge, but we may be stuck waiting
		 * for external fences. These should all be capped to 10s
		 * (I915_FENCE_TIMEOUT) so this wait should not be unbounded
		 * in the worst case.
		 */
		dma_fence_default_wait(fence, false, MAX_SCHEDULE_TIMEOUT);
		dma_fence_put(fence);

		/* Restart iteration after dropping lock */
		spin_lock(&timelines->lock);
		tl = list_entry(&timelines->active_list, typeof(*tl), link);
	}
	spin_unlock(&timelines->lock);

	/* We must reset pending GPU events before restoring our submission */
	ok = !HAS_EXECLISTS(gt->i915); /* XXX better agnosticism desired */
	if (!INTEL_INFO(gt->i915)->gpu_reset_clobbers_display)
		ok = __intel_gt_reset(gt, ALL_ENGINES) == 0;
	if (!ok) {
		/*
		 * Warn CI about the unrecoverable wedged condition.
		 * Time for a reboot.
		 */
		add_taint_for_CI(gt->i915, TAINT_WARN);
		return false;
	}

	/*
	 * Undo nop_submit_request. We prevent all new i915 requests from
	 * being queued (by disallowing execbuf whilst wedged) so having
	 * waited for all active requests above, we know the system is idle
	 * and do not have to worry about a thread being inside
	 * engine->submit_request() as we swap over. So unlike installing
	 * the nop_submit_request on reset, we can do this from normal
	 * context and do not require stop_machine().
	 */
	intel_engines_reset_default_submission(gt);

	GT_TRACE(gt, "end\n");

	smp_mb__before_atomic(); /* complete takeover before enabling execbuf */
	clear_bit(I915_WEDGED, &gt->reset.flags);

	return true;
}
9788c2ecf20Sopenharmony_ci
9798c2ecf20Sopenharmony_cibool intel_gt_unset_wedged(struct intel_gt *gt)
9808c2ecf20Sopenharmony_ci{
9818c2ecf20Sopenharmony_ci	bool result;
9828c2ecf20Sopenharmony_ci
9838c2ecf20Sopenharmony_ci	mutex_lock(&gt->reset.mutex);
9848c2ecf20Sopenharmony_ci	result = __intel_gt_unset_wedged(gt);
9858c2ecf20Sopenharmony_ci	mutex_unlock(&gt->reset.mutex);
9868c2ecf20Sopenharmony_ci
9878c2ecf20Sopenharmony_ci	return result;
9888c2ecf20Sopenharmony_ci}
9898c2ecf20Sopenharmony_ci
/*
 * Revoke user mmaps, perform the full hardware reset (with escalating
 * back-off retries), then restore the per-engine software state via
 * gt_reset(). Returns 0 on success or a negative error code.
 */
static int do_reset(struct intel_gt *gt, intel_engine_mask_t stalled_mask)
{
	int err, i;

	gt_revoke(gt);

	/* Retry with increasing delays while the hardware fails to reset. */
	err = __intel_gt_reset(gt, ALL_ENGINES);
	for (i = 0; err && i < RESET_MAX_RETRIES; i++) {
		msleep(10 * (i + 1));
		err = __intel_gt_reset(gt, ALL_ENGINES);
	}
	if (err)
		return err;

	return gt_reset(gt, stalled_mask);
}
10068c2ecf20Sopenharmony_ci
10078c2ecf20Sopenharmony_cistatic int resume(struct intel_gt *gt)
10088c2ecf20Sopenharmony_ci{
10098c2ecf20Sopenharmony_ci	struct intel_engine_cs *engine;
10108c2ecf20Sopenharmony_ci	enum intel_engine_id id;
10118c2ecf20Sopenharmony_ci	int ret;
10128c2ecf20Sopenharmony_ci
10138c2ecf20Sopenharmony_ci	for_each_engine(engine, gt, id) {
10148c2ecf20Sopenharmony_ci		ret = intel_engine_resume(engine);
10158c2ecf20Sopenharmony_ci		if (ret)
10168c2ecf20Sopenharmony_ci			return ret;
10178c2ecf20Sopenharmony_ci	}
10188c2ecf20Sopenharmony_ci
10198c2ecf20Sopenharmony_ci	return 0;
10208c2ecf20Sopenharmony_ci}
10218c2ecf20Sopenharmony_ci
/**
 * intel_gt_reset - reset chip after a hang
 * @gt: #intel_gt to reset
 * @stalled_mask: mask of the stalled engines with the guilty requests
 * @reason: user error message for why we are resetting
 *
 * Reset the chip.  Useful if a hang is detected. Marks the device as wedged
 * on failure.
 *
 * Procedure is fairly simple:
 *   - reset the chip using the reset reg
 *   - re-init context state
 *   - re-init hardware status page
 *   - re-init ring buffer
 *   - re-init interrupt state
 *   - re-init display
 *
 * Caller must hold I915_RESET_BACKOFF; gt->reset.mutex is taken here.
 */
void intel_gt_reset(struct intel_gt *gt,
		    intel_engine_mask_t stalled_mask,
		    const char *reason)
{
	intel_engine_mask_t awake;
	int ret;

	GT_TRACE(gt, "flags=%lx\n", gt->reset.flags);

	might_sleep();
	GEM_BUG_ON(!test_bit(I915_RESET_BACKOFF, &gt->reset.flags));
	mutex_lock(&gt->reset.mutex);

	/* Clear any previous failed attempts at recovery. Time to try again. */
	if (!__intel_gt_unset_wedged(gt))
		goto unlock;

	if (reason)
		drm_notice(&gt->i915->drm,
			   "Resetting chip for %s\n", reason);
	atomic_inc(&gt->i915->gpu_error.reset_count);

	awake = reset_prepare(gt);

	if (!intel_has_gpu_reset(gt)) {
		if (gt->i915->params.reset)
			drm_err(&gt->i915->drm, "GPU reset not supported\n");
		else
			drm_dbg(&gt->i915->drm, "GPU reset disabled\n");
		goto error;
	}

	/* Display shares the reset domain on older parts; quiesce its irqs. */
	if (INTEL_INFO(gt->i915)->gpu_reset_clobbers_display)
		intel_runtime_pm_disable_interrupts(gt->i915);

	if (do_reset(gt, stalled_mask)) {
		drm_err(&gt->i915->drm, "Failed to reset chip\n");
		goto taint;
	}

	if (INTEL_INFO(gt->i915)->gpu_reset_clobbers_display)
		intel_runtime_pm_enable_interrupts(gt->i915);

	intel_overlay_reset(gt->i915);

	/*
	 * Next we need to restore the context, but we don't use those
	 * yet either...
	 *
	 * Ring buffer needs to be re-initialized in the KMS case, or if X
	 * was running at the time of the reset (i.e. we weren't VT
	 * switched away).
	 */
	ret = intel_gt_init_hw(gt);
	if (ret) {
		drm_err(&gt->i915->drm,
			"Failed to initialise HW following reset (%d)\n",
			ret);
		goto taint;
	}

	ret = resume(gt);
	if (ret)
		goto taint;

	/* Success and (via error:) failure paths both rejoin here. */
finish:
	reset_finish(gt, awake);
unlock:
	mutex_unlock(&gt->reset.mutex);
	return;

taint:
	/*
	 * History tells us that if we cannot reset the GPU now, we
	 * never will. This then impacts everything that is run
	 * subsequently. On failing the reset, we mark the driver
	 * as wedged, preventing further execution on the GPU.
	 * We also want to go one step further and add a taint to the
	 * kernel so that any subsequent faults can be traced back to
	 * this failure. This is important for CI, where if the
	 * GPU/driver fails we would like to reboot and restart testing
	 * rather than continue on into oblivion. For everyone else,
	 * the system should still plod along, but they have been warned!
	 */
	add_taint_for_CI(gt->i915, TAINT_WARN);
error:
	__intel_gt_set_wedged(gt);
	goto finish;
}
11288c2ecf20Sopenharmony_ci
/* Reset just this engine through the common hardware-reset machinery. */
static inline int intel_gt_reset_engine(struct intel_engine_cs *engine)
{
	return __intel_gt_reset(engine->gt, engine->mask);
}
11338c2ecf20Sopenharmony_ci
/**
 * intel_engine_reset - reset GPU engine to recover from a hang
 * @engine: engine to reset
 * @msg: reason for GPU reset; or NULL for no drm_notice()
 *
 * Reset a specific GPU engine. Useful if a hang is detected.
 * Returns zero on successful reset or otherwise an error code.
 *
 * Caller must hold the I915_RESET_ENGINE bit for this engine.
 *
 * Procedure is:
 *  - identifies the request that caused the hang and it is dropped
 *  - reset engine (which will force the engine to idle)
 *  - re-init/configure engine
 */
int intel_engine_reset(struct intel_engine_cs *engine, const char *msg)
{
	struct intel_gt *gt = engine->gt;
	bool uses_guc = intel_engine_in_guc_submission_mode(engine);
	int ret;

	ENGINE_TRACE(engine, "flags=%lx\n", gt->reset.flags);
	GEM_BUG_ON(!test_bit(I915_RESET_ENGINE + engine->id, &gt->reset.flags));

	/* An idle (parked) engine has nothing to reset. */
	if (!intel_engine_pm_get_if_awake(engine))
		return 0;

	reset_prepare_engine(engine);

	if (msg)
		drm_notice(&engine->i915->drm,
			   "Resetting %s for %s\n", engine->name, msg);
	atomic_inc(&engine->i915->gpu_error.reset_engine_count[engine->uabi_class]);

	/* Under GuC submission, the GuC performs the engine reset for us. */
	if (!uses_guc)
		ret = intel_gt_reset_engine(engine);
	else
		ret = intel_guc_reset_engine(&engine->gt->uc.guc, engine);
	if (ret) {
		/* If we fail here, we expect to fallback to a global reset */
		drm_dbg(&gt->i915->drm, "%sFailed to reset %s, ret=%d\n",
			uses_guc ? "GuC " : "", engine->name, ret);
		goto out;
	}

	/*
	 * The request that caused the hang is stuck on elsp, we know the
	 * active request and can drop it, adjust head to skip the offending
	 * request to resume executing remaining requests in the queue.
	 */
	__intel_engine_reset(engine, true);

	/*
	 * The engine and its registers (and workarounds in case of render)
	 * have been reset to their default values. Follow the init_ring
	 * process to program RING_MODE, HWSP and re-enable submission.
	 */
	ret = intel_engine_resume(engine);

out:
	intel_engine_cancel_stop_cs(engine);
	reset_finish_engine(engine);
	intel_engine_pm_put_async(engine);
	return ret;
}
11978c2ecf20Sopenharmony_ci
/*
 * Drive a full-chip reset with userspace notification: emit the error and
 * reset uevents, run the reset under a wedge-on-timeout watchdog, and emit
 * the reset-done uevent if we did not end up wedged.
 */
static void intel_gt_reset_global(struct intel_gt *gt,
				  u32 engine_mask,
				  const char *reason)
{
	struct kobject *kobj = &gt->i915->drm.primary->kdev->kobj;
	char *error_event[] = { I915_ERROR_UEVENT "=1", NULL };
	char *reset_event[] = { I915_RESET_UEVENT "=1", NULL };
	char *reset_done_event[] = { I915_ERROR_UEVENT "=0", NULL };
	struct intel_wedge_me w;

	kobject_uevent_env(kobj, KOBJ_CHANGE, error_event);

	drm_dbg(&gt->i915->drm, "resetting chip, engines=%x\n", engine_mask);
	kobject_uevent_env(kobj, KOBJ_CHANGE, reset_event);

	/* Use a watchdog to ensure that our reset completes */
	intel_wedge_on_timeout(&w, gt, 5 * HZ) {
		intel_prepare_reset(gt->i915);

		/* Flush everyone using a resource about to be clobbered */
		synchronize_srcu_expedited(&gt->reset.backoff_srcu);

		intel_gt_reset(gt, engine_mask, reason);

		intel_finish_reset(gt->i915);
	}

	if (!test_bit(I915_WEDGED, &gt->reset.flags))
		kobject_uevent_env(kobj, KOBJ_CHANGE, reset_done_event);
}
12288c2ecf20Sopenharmony_ci
/**
 * intel_gt_handle_error - handle a gpu error
 * @gt: the intel_gt
 * @engine_mask: mask representing engines that are hung
 * @flags: control flags (e.g. I915_ERROR_CAPTURE to record an error state)
 * @fmt: Error message format string, or NULL for no message
 *
 * Do some basic checking of register state at error time and
 * dump it to the syslog.  Also call i915_capture_error_state() to make
 * sure we get a record and make it available in debugfs.  Fire a uevent
 * so userspace knows something bad happened (should trigger collection
 * of a ring dump etc.).
 *
 * Engine resets are attempted first, per engine in @engine_mask; any
 * engine that cannot be recovered individually escalates to a full
 * device reset. May sleep.
 */
void intel_gt_handle_error(struct intel_gt *gt,
			   intel_engine_mask_t engine_mask,
			   unsigned long flags,
			   const char *fmt, ...)
{
	struct intel_engine_cs *engine;
	intel_wakeref_t wakeref;
	intel_engine_mask_t tmp;
	char error_msg[80];
	char *msg = NULL;

	/* Format the caller-supplied reason, if any, for logging below. */
	if (fmt) {
		va_list args;

		va_start(args, fmt);
		vscnprintf(error_msg, sizeof(error_msg), fmt, args);
		va_end(args);

		msg = error_msg;
	}

	/*
	 * In most cases it's guaranteed that we get here with an RPM
	 * reference held, for example because there is a pending GPU
	 * request that won't finish until the reset is done. This
	 * isn't the case at least when we get here by doing a
	 * simulated reset via debugfs, so get an RPM reference.
	 */
	wakeref = intel_runtime_pm_get(gt->uncore->rpm);

	/* Ignore engines this device does not have. */
	engine_mask &= gt->info.engine_mask;

	if (flags & I915_ERROR_CAPTURE) {
		i915_capture_error_state(gt->i915);
		intel_gt_clear_error_registers(gt, engine_mask);
	}

	/*
	 * Try engine reset when available. We fall back to full reset if
	 * single reset fails.
	 */
	if (intel_has_reset_engine(gt) && !intel_gt_is_wedged(gt)) {
		for_each_engine_masked(engine, gt, engine_mask, tmp) {
			/* The engine bits must sit above the modeset bit. */
			BUILD_BUG_ON(I915_RESET_MODESET >= I915_RESET_ENGINE);
			/*
			 * If someone else already owns this engine's reset
			 * bit, skip it; they will perform the reset for us.
			 */
			if (test_and_set_bit(I915_RESET_ENGINE + engine->id,
					     &gt->reset.flags))
				continue;

			/* On success, drop the engine from the fallback mask. */
			if (intel_engine_reset(engine, msg) == 0)
				engine_mask &= ~engine->mask;

			clear_and_wake_up_bit(I915_RESET_ENGINE + engine->id,
					      &gt->reset.flags);
		}
	}

	/* All engines recovered individually; no full reset required. */
	if (!engine_mask)
		goto out;

	/* Full reset needs the mutex, stop any other user trying to do so. */
	if (test_and_set_bit(I915_RESET_BACKOFF, &gt->reset.flags)) {
		wait_event(gt->reset.queue,
			   !test_bit(I915_RESET_BACKOFF, &gt->reset.flags));
		goto out; /* piggy-back on the other reset */
	}

	/*
	 * Make sure i915_reset_trylock() sees the I915_RESET_BACKOFF;
	 * pairs with the rcu_read_lock() in intel_gt_reset_trylock().
	 */
	synchronize_rcu_expedited();

	/* Prevent any other reset-engine attempt. */
	for_each_engine(engine, gt, tmp) {
		while (test_and_set_bit(I915_RESET_ENGINE + engine->id,
					&gt->reset.flags))
			wait_on_bit(&gt->reset.flags,
				    I915_RESET_ENGINE + engine->id,
				    TASK_UNINTERRUPTIBLE);
	}

	intel_gt_reset_global(gt, engine_mask, msg);

	/* Release the per-engine bits, then BACKOFF, then wake waiters. */
	for_each_engine(engine, gt, tmp)
		clear_bit_unlock(I915_RESET_ENGINE + engine->id,
				 &gt->reset.flags);
	clear_bit_unlock(I915_RESET_BACKOFF, &gt->reset.flags);
	smp_mb__after_atomic(); /* order flag clears before the wakeup */
	wake_up_all(&gt->reset.queue);

out:
	intel_runtime_pm_put(gt->uncore->rpm, wakeref);
}
13328c2ecf20Sopenharmony_ci
/*
 * Acquire a read-side reference on the reset backoff SRCU, waiting for
 * any in-progress full reset (I915_RESET_BACKOFF) to complete first.
 *
 * On success returns 0 and stores the SRCU cookie in @srcu; the caller
 * must release it with intel_gt_reset_unlock(). Returns -EINTR if the
 * wait for a pending reset is interrupted by a signal.
 */
int intel_gt_reset_trylock(struct intel_gt *gt, int *srcu)
{
	might_lock(&gt->reset.backoff_srcu);
	might_sleep();

	/*
	 * The rcu_read_lock() pairs with synchronize_rcu_expedited() in
	 * intel_gt_handle_error(): a resetter cannot proceed until we have
	 * either observed BACKOFF or taken the SRCU read lock below.
	 */
	rcu_read_lock();
	while (test_bit(I915_RESET_BACKOFF, &gt->reset.flags)) {
		rcu_read_unlock();

		/* A reset is in flight; wait for it to finish (or a signal). */
		if (wait_event_interruptible(gt->reset.queue,
					     !test_bit(I915_RESET_BACKOFF,
						       &gt->reset.flags)))
			return -EINTR;

		rcu_read_lock();
	}
	*srcu = srcu_read_lock(&gt->reset.backoff_srcu);
	rcu_read_unlock();

	return 0;
}
13548c2ecf20Sopenharmony_ci
/*
 * Drop the SRCU read-side reference taken by intel_gt_reset_trylock();
 * @tag is the cookie returned through its @srcu argument.
 */
void intel_gt_reset_unlock(struct intel_gt *gt, int tag)
__releases(&gt->reset.backoff_srcu)
{
	srcu_read_unlock(&gt->reset.backoff_srcu, tag);
}
13608c2ecf20Sopenharmony_ci
/*
 * Report whether the GT is irrecoverably wedged.
 *
 * Returns 0 if the device is usable (not wedged, or recovered after
 * waiting out an in-progress reset), -EIO if it is terminally wedged,
 * and -EINTR if the wait for a pending reset was interrupted by a
 * signal. May sleep.
 */
int intel_gt_terminally_wedged(struct intel_gt *gt)
{
	might_sleep();

	if (!intel_gt_is_wedged(gt))
		return 0;

	/* Wedged with an unrecoverable error: no point waiting. */
	if (intel_gt_has_unrecoverable_error(gt))
		return -EIO;

	/* Reset still in progress? Maybe we will recover? */
	if (wait_event_interruptible(gt->reset.queue,
				     !test_bit(I915_RESET_BACKOFF,
					       &gt->reset.flags)))
		return -EINTR;

	/* Re-check: the reset may have either recovered us or given up. */
	return intel_gt_is_wedged(gt) ? -EIO : 0;
}
13798c2ecf20Sopenharmony_ci
/*
 * Mark the GT as wedged during driver initialisation. Unlike a runtime
 * wedge, this state is permanent for the life of the driver.
 */
void intel_gt_set_wedged_on_init(struct intel_gt *gt)
{
	/*
	 * The WEDGED_ON_INIT bit must lie above the block of per-engine
	 * reset bits in gt->reset.flags.
	 */
	BUILD_BUG_ON(I915_RESET_ENGINE + I915_NUM_ENGINES >
		     I915_WEDGED_ON_INIT);
	intel_gt_set_wedged(gt);
	set_bit(I915_WEDGED_ON_INIT, &gt->reset.flags);

	/* Wedged on init is non-recoverable */
	add_taint_for_CI(gt->i915, TAINT_WARN);
}
13908c2ecf20Sopenharmony_ci
/*
 * Wedge the GT during driver teardown, flagging the wedge as being part
 * of fini so later checks can distinguish it from a runtime failure.
 */
void intel_gt_set_wedged_on_fini(struct intel_gt *gt)
{
	intel_gt_set_wedged(gt);
	set_bit(I915_WEDGED_ON_FINI, &gt->reset.flags);
}
13968c2ecf20Sopenharmony_ci
/*
 * One-time initialisation of the GT reset machinery: waitqueue, mutex
 * and the backoff SRCU. The GT starts out wedged and stays unusable
 * until initialisation completes and the flag is cleared.
 */
void intel_gt_init_reset(struct intel_gt *gt)
{
	init_waitqueue_head(&gt->reset.queue);
	mutex_init(&gt->reset.mutex);
	init_srcu_struct(&gt->reset.backoff_srcu);

	/* no GPU until we are ready! */
	__set_bit(I915_WEDGED, &gt->reset.flags);
}
14068c2ecf20Sopenharmony_ci
/* Tear down the reset machinery; counterpart to intel_gt_init_reset(). */
void intel_gt_fini_reset(struct intel_gt *gt)
{
	cleanup_srcu_struct(&gt->reset.backoff_srcu);
}
14118c2ecf20Sopenharmony_ci
/*
 * Delayed-work handler armed by __intel_init_wedge(): if the guarded
 * operation (see intel_wedge_on_timeout()) has not finished before the
 * timeout expires, give up and wedge the GT.
 */
static void intel_wedge_me(struct work_struct *work)
{
	struct intel_wedge_me *w = container_of(work, typeof(*w), work.work);

	drm_err(&w->gt->i915->drm,
		"%s timed out, cancelling all in-flight rendering.\n",
		w->name);
	intel_gt_set_wedged(w->gt);
}
14218c2ecf20Sopenharmony_ci
/*
 * Arm an on-stack wedge watchdog: if __intel_fini_wedge() is not called
 * within @timeout jiffies, intel_wedge_me() fires and wedges @gt.
 * @name is used in the timeout error message.
 *
 * @w lives on the caller's stack, hence the _ONSTACK work initialiser.
 */
void __intel_init_wedge(struct intel_wedge_me *w,
			struct intel_gt *gt,
			long timeout,
			const char *name)
{
	w->gt = gt;
	w->name = name;

	INIT_DELAYED_WORK_ONSTACK(&w->work, intel_wedge_me);
	schedule_delayed_work(&w->work, timeout);
}
14338c2ecf20Sopenharmony_ci
/*
 * Disarm the wedge watchdog set up by __intel_init_wedge(), waiting for
 * a concurrently-executing timeout handler to finish before the on-stack
 * work item goes out of scope.
 */
void __intel_fini_wedge(struct intel_wedge_me *w)
{
	cancel_delayed_work_sync(&w->work);
	destroy_delayed_work_on_stack(&w->work);
	w->gt = NULL;
}
14408c2ecf20Sopenharmony_ci
14418c2ecf20Sopenharmony_ci#if IS_ENABLED(CONFIG_DRM_I915_SELFTEST)
14428c2ecf20Sopenharmony_ci#include "selftest_reset.c"
14438c2ecf20Sopenharmony_ci#include "selftest_hangcheck.c"
14448c2ecf20Sopenharmony_ci#endif
1445