18c2ecf20Sopenharmony_ci/* 28c2ecf20Sopenharmony_ci * SPDX-License-Identifier: MIT 38c2ecf20Sopenharmony_ci * 48c2ecf20Sopenharmony_ci * Copyright © 2008-2018 Intel Corporation 58c2ecf20Sopenharmony_ci */ 68c2ecf20Sopenharmony_ci 78c2ecf20Sopenharmony_ci#include <linux/sched/mm.h> 88c2ecf20Sopenharmony_ci#include <linux/stop_machine.h> 98c2ecf20Sopenharmony_ci 108c2ecf20Sopenharmony_ci#include "display/intel_display_types.h" 118c2ecf20Sopenharmony_ci#include "display/intel_overlay.h" 128c2ecf20Sopenharmony_ci 138c2ecf20Sopenharmony_ci#include "gem/i915_gem_context.h" 148c2ecf20Sopenharmony_ci 158c2ecf20Sopenharmony_ci#include "i915_drv.h" 168c2ecf20Sopenharmony_ci#include "i915_gpu_error.h" 178c2ecf20Sopenharmony_ci#include "i915_irq.h" 188c2ecf20Sopenharmony_ci#include "intel_breadcrumbs.h" 198c2ecf20Sopenharmony_ci#include "intel_engine_pm.h" 208c2ecf20Sopenharmony_ci#include "intel_gt.h" 218c2ecf20Sopenharmony_ci#include "intel_gt_pm.h" 228c2ecf20Sopenharmony_ci#include "intel_reset.h" 238c2ecf20Sopenharmony_ci 248c2ecf20Sopenharmony_ci#include "uc/intel_guc.h" 258c2ecf20Sopenharmony_ci#include "uc/intel_guc_submission.h" 268c2ecf20Sopenharmony_ci 278c2ecf20Sopenharmony_ci#define RESET_MAX_RETRIES 3 288c2ecf20Sopenharmony_ci 298c2ecf20Sopenharmony_ci/* XXX How to handle concurrent GGTT updates using tiling registers? 
*/ 308c2ecf20Sopenharmony_ci#define RESET_UNDER_STOP_MACHINE 0 318c2ecf20Sopenharmony_ci 328c2ecf20Sopenharmony_cistatic void rmw_set_fw(struct intel_uncore *uncore, i915_reg_t reg, u32 set) 338c2ecf20Sopenharmony_ci{ 348c2ecf20Sopenharmony_ci intel_uncore_rmw_fw(uncore, reg, 0, set); 358c2ecf20Sopenharmony_ci} 368c2ecf20Sopenharmony_ci 378c2ecf20Sopenharmony_cistatic void rmw_clear_fw(struct intel_uncore *uncore, i915_reg_t reg, u32 clr) 388c2ecf20Sopenharmony_ci{ 398c2ecf20Sopenharmony_ci intel_uncore_rmw_fw(uncore, reg, clr, 0); 408c2ecf20Sopenharmony_ci} 418c2ecf20Sopenharmony_ci 428c2ecf20Sopenharmony_cistatic void engine_skip_context(struct i915_request *rq) 438c2ecf20Sopenharmony_ci{ 448c2ecf20Sopenharmony_ci struct intel_engine_cs *engine = rq->engine; 458c2ecf20Sopenharmony_ci struct intel_context *hung_ctx = rq->context; 468c2ecf20Sopenharmony_ci 478c2ecf20Sopenharmony_ci if (!i915_request_is_active(rq)) 488c2ecf20Sopenharmony_ci return; 498c2ecf20Sopenharmony_ci 508c2ecf20Sopenharmony_ci lockdep_assert_held(&engine->active.lock); 518c2ecf20Sopenharmony_ci list_for_each_entry_continue(rq, &engine->active.requests, sched.link) 528c2ecf20Sopenharmony_ci if (rq->context == hung_ctx) { 538c2ecf20Sopenharmony_ci i915_request_set_error_once(rq, -EIO); 548c2ecf20Sopenharmony_ci __i915_request_skip(rq); 558c2ecf20Sopenharmony_ci } 568c2ecf20Sopenharmony_ci} 578c2ecf20Sopenharmony_ci 588c2ecf20Sopenharmony_cistatic void client_mark_guilty(struct i915_gem_context *ctx, bool banned) 598c2ecf20Sopenharmony_ci{ 608c2ecf20Sopenharmony_ci struct drm_i915_file_private *file_priv = ctx->file_priv; 618c2ecf20Sopenharmony_ci unsigned long prev_hang; 628c2ecf20Sopenharmony_ci unsigned int score; 638c2ecf20Sopenharmony_ci 648c2ecf20Sopenharmony_ci if (IS_ERR_OR_NULL(file_priv)) 658c2ecf20Sopenharmony_ci return; 668c2ecf20Sopenharmony_ci 678c2ecf20Sopenharmony_ci score = 0; 688c2ecf20Sopenharmony_ci if (banned) 698c2ecf20Sopenharmony_ci score = I915_CLIENT_SCORE_CONTEXT_BAN; 
708c2ecf20Sopenharmony_ci 718c2ecf20Sopenharmony_ci prev_hang = xchg(&file_priv->hang_timestamp, jiffies); 728c2ecf20Sopenharmony_ci if (time_before(jiffies, prev_hang + I915_CLIENT_FAST_HANG_JIFFIES)) 738c2ecf20Sopenharmony_ci score += I915_CLIENT_SCORE_HANG_FAST; 748c2ecf20Sopenharmony_ci 758c2ecf20Sopenharmony_ci if (score) { 768c2ecf20Sopenharmony_ci atomic_add(score, &file_priv->ban_score); 778c2ecf20Sopenharmony_ci 788c2ecf20Sopenharmony_ci drm_dbg(&ctx->i915->drm, 798c2ecf20Sopenharmony_ci "client %s: gained %u ban score, now %u\n", 808c2ecf20Sopenharmony_ci ctx->name, score, 818c2ecf20Sopenharmony_ci atomic_read(&file_priv->ban_score)); 828c2ecf20Sopenharmony_ci } 838c2ecf20Sopenharmony_ci} 848c2ecf20Sopenharmony_ci 858c2ecf20Sopenharmony_cistatic bool mark_guilty(struct i915_request *rq) 868c2ecf20Sopenharmony_ci{ 878c2ecf20Sopenharmony_ci struct i915_gem_context *ctx; 888c2ecf20Sopenharmony_ci unsigned long prev_hang; 898c2ecf20Sopenharmony_ci bool banned; 908c2ecf20Sopenharmony_ci int i; 918c2ecf20Sopenharmony_ci 928c2ecf20Sopenharmony_ci if (intel_context_is_closed(rq->context)) { 938c2ecf20Sopenharmony_ci intel_context_set_banned(rq->context); 948c2ecf20Sopenharmony_ci return true; 958c2ecf20Sopenharmony_ci } 968c2ecf20Sopenharmony_ci 978c2ecf20Sopenharmony_ci rcu_read_lock(); 988c2ecf20Sopenharmony_ci ctx = rcu_dereference(rq->context->gem_context); 998c2ecf20Sopenharmony_ci if (ctx && !kref_get_unless_zero(&ctx->ref)) 1008c2ecf20Sopenharmony_ci ctx = NULL; 1018c2ecf20Sopenharmony_ci rcu_read_unlock(); 1028c2ecf20Sopenharmony_ci if (!ctx) 1038c2ecf20Sopenharmony_ci return intel_context_is_banned(rq->context); 1048c2ecf20Sopenharmony_ci 1058c2ecf20Sopenharmony_ci atomic_inc(&ctx->guilty_count); 1068c2ecf20Sopenharmony_ci 1078c2ecf20Sopenharmony_ci /* Cool contexts are too cool to be banned! (Used for reset testing.) 
*/ 1088c2ecf20Sopenharmony_ci if (!i915_gem_context_is_bannable(ctx)) { 1098c2ecf20Sopenharmony_ci banned = false; 1108c2ecf20Sopenharmony_ci goto out; 1118c2ecf20Sopenharmony_ci } 1128c2ecf20Sopenharmony_ci 1138c2ecf20Sopenharmony_ci drm_notice(&ctx->i915->drm, 1148c2ecf20Sopenharmony_ci "%s context reset due to GPU hang\n", 1158c2ecf20Sopenharmony_ci ctx->name); 1168c2ecf20Sopenharmony_ci 1178c2ecf20Sopenharmony_ci /* Record the timestamp for the last N hangs */ 1188c2ecf20Sopenharmony_ci prev_hang = ctx->hang_timestamp[0]; 1198c2ecf20Sopenharmony_ci for (i = 0; i < ARRAY_SIZE(ctx->hang_timestamp) - 1; i++) 1208c2ecf20Sopenharmony_ci ctx->hang_timestamp[i] = ctx->hang_timestamp[i + 1]; 1218c2ecf20Sopenharmony_ci ctx->hang_timestamp[i] = jiffies; 1228c2ecf20Sopenharmony_ci 1238c2ecf20Sopenharmony_ci /* If we have hung N+1 times in rapid succession, we ban the context! */ 1248c2ecf20Sopenharmony_ci banned = !i915_gem_context_is_recoverable(ctx); 1258c2ecf20Sopenharmony_ci if (time_before(jiffies, prev_hang + CONTEXT_FAST_HANG_JIFFIES)) 1268c2ecf20Sopenharmony_ci banned = true; 1278c2ecf20Sopenharmony_ci if (banned) { 1288c2ecf20Sopenharmony_ci drm_dbg(&ctx->i915->drm, "context %s: guilty %d, banned\n", 1298c2ecf20Sopenharmony_ci ctx->name, atomic_read(&ctx->guilty_count)); 1308c2ecf20Sopenharmony_ci intel_context_set_banned(rq->context); 1318c2ecf20Sopenharmony_ci } 1328c2ecf20Sopenharmony_ci 1338c2ecf20Sopenharmony_ci client_mark_guilty(ctx, banned); 1348c2ecf20Sopenharmony_ci 1358c2ecf20Sopenharmony_ciout: 1368c2ecf20Sopenharmony_ci i915_gem_context_put(ctx); 1378c2ecf20Sopenharmony_ci return banned; 1388c2ecf20Sopenharmony_ci} 1398c2ecf20Sopenharmony_ci 1408c2ecf20Sopenharmony_cistatic void mark_innocent(struct i915_request *rq) 1418c2ecf20Sopenharmony_ci{ 1428c2ecf20Sopenharmony_ci struct i915_gem_context *ctx; 1438c2ecf20Sopenharmony_ci 1448c2ecf20Sopenharmony_ci rcu_read_lock(); 1458c2ecf20Sopenharmony_ci ctx = rcu_dereference(rq->context->gem_context); 
1468c2ecf20Sopenharmony_ci if (ctx) 1478c2ecf20Sopenharmony_ci atomic_inc(&ctx->active_count); 1488c2ecf20Sopenharmony_ci rcu_read_unlock(); 1498c2ecf20Sopenharmony_ci} 1508c2ecf20Sopenharmony_ci 1518c2ecf20Sopenharmony_civoid __i915_request_reset(struct i915_request *rq, bool guilty) 1528c2ecf20Sopenharmony_ci{ 1538c2ecf20Sopenharmony_ci RQ_TRACE(rq, "guilty? %s\n", yesno(guilty)); 1548c2ecf20Sopenharmony_ci 1558c2ecf20Sopenharmony_ci GEM_BUG_ON(i915_request_completed(rq)); 1568c2ecf20Sopenharmony_ci 1578c2ecf20Sopenharmony_ci rcu_read_lock(); /* protect the GEM context */ 1588c2ecf20Sopenharmony_ci if (guilty) { 1598c2ecf20Sopenharmony_ci i915_request_set_error_once(rq, -EIO); 1608c2ecf20Sopenharmony_ci __i915_request_skip(rq); 1618c2ecf20Sopenharmony_ci if (mark_guilty(rq)) 1628c2ecf20Sopenharmony_ci engine_skip_context(rq); 1638c2ecf20Sopenharmony_ci } else { 1648c2ecf20Sopenharmony_ci i915_request_set_error_once(rq, -EAGAIN); 1658c2ecf20Sopenharmony_ci mark_innocent(rq); 1668c2ecf20Sopenharmony_ci } 1678c2ecf20Sopenharmony_ci rcu_read_unlock(); 1688c2ecf20Sopenharmony_ci} 1698c2ecf20Sopenharmony_ci 1708c2ecf20Sopenharmony_cistatic bool i915_in_reset(struct pci_dev *pdev) 1718c2ecf20Sopenharmony_ci{ 1728c2ecf20Sopenharmony_ci u8 gdrst; 1738c2ecf20Sopenharmony_ci 1748c2ecf20Sopenharmony_ci pci_read_config_byte(pdev, I915_GDRST, &gdrst); 1758c2ecf20Sopenharmony_ci return gdrst & GRDOM_RESET_STATUS; 1768c2ecf20Sopenharmony_ci} 1778c2ecf20Sopenharmony_ci 1788c2ecf20Sopenharmony_cistatic int i915_do_reset(struct intel_gt *gt, 1798c2ecf20Sopenharmony_ci intel_engine_mask_t engine_mask, 1808c2ecf20Sopenharmony_ci unsigned int retry) 1818c2ecf20Sopenharmony_ci{ 1828c2ecf20Sopenharmony_ci struct pci_dev *pdev = gt->i915->drm.pdev; 1838c2ecf20Sopenharmony_ci int err; 1848c2ecf20Sopenharmony_ci 1858c2ecf20Sopenharmony_ci /* Assert reset for at least 20 usec, and wait for acknowledgement. 
*/ 1868c2ecf20Sopenharmony_ci pci_write_config_byte(pdev, I915_GDRST, GRDOM_RESET_ENABLE); 1878c2ecf20Sopenharmony_ci udelay(50); 1888c2ecf20Sopenharmony_ci err = wait_for_atomic(i915_in_reset(pdev), 50); 1898c2ecf20Sopenharmony_ci 1908c2ecf20Sopenharmony_ci /* Clear the reset request. */ 1918c2ecf20Sopenharmony_ci pci_write_config_byte(pdev, I915_GDRST, 0); 1928c2ecf20Sopenharmony_ci udelay(50); 1938c2ecf20Sopenharmony_ci if (!err) 1948c2ecf20Sopenharmony_ci err = wait_for_atomic(!i915_in_reset(pdev), 50); 1958c2ecf20Sopenharmony_ci 1968c2ecf20Sopenharmony_ci return err; 1978c2ecf20Sopenharmony_ci} 1988c2ecf20Sopenharmony_ci 1998c2ecf20Sopenharmony_cistatic bool g4x_reset_complete(struct pci_dev *pdev) 2008c2ecf20Sopenharmony_ci{ 2018c2ecf20Sopenharmony_ci u8 gdrst; 2028c2ecf20Sopenharmony_ci 2038c2ecf20Sopenharmony_ci pci_read_config_byte(pdev, I915_GDRST, &gdrst); 2048c2ecf20Sopenharmony_ci return (gdrst & GRDOM_RESET_ENABLE) == 0; 2058c2ecf20Sopenharmony_ci} 2068c2ecf20Sopenharmony_ci 2078c2ecf20Sopenharmony_cistatic int g33_do_reset(struct intel_gt *gt, 2088c2ecf20Sopenharmony_ci intel_engine_mask_t engine_mask, 2098c2ecf20Sopenharmony_ci unsigned int retry) 2108c2ecf20Sopenharmony_ci{ 2118c2ecf20Sopenharmony_ci struct pci_dev *pdev = gt->i915->drm.pdev; 2128c2ecf20Sopenharmony_ci 2138c2ecf20Sopenharmony_ci pci_write_config_byte(pdev, I915_GDRST, GRDOM_RESET_ENABLE); 2148c2ecf20Sopenharmony_ci return wait_for_atomic(g4x_reset_complete(pdev), 50); 2158c2ecf20Sopenharmony_ci} 2168c2ecf20Sopenharmony_ci 2178c2ecf20Sopenharmony_cistatic int g4x_do_reset(struct intel_gt *gt, 2188c2ecf20Sopenharmony_ci intel_engine_mask_t engine_mask, 2198c2ecf20Sopenharmony_ci unsigned int retry) 2208c2ecf20Sopenharmony_ci{ 2218c2ecf20Sopenharmony_ci struct pci_dev *pdev = gt->i915->drm.pdev; 2228c2ecf20Sopenharmony_ci struct intel_uncore *uncore = gt->uncore; 2238c2ecf20Sopenharmony_ci int ret; 2248c2ecf20Sopenharmony_ci 2258c2ecf20Sopenharmony_ci /* 
WaVcpClkGateDisableForMediaReset:ctg,elk */ 2268c2ecf20Sopenharmony_ci rmw_set_fw(uncore, VDECCLK_GATE_D, VCP_UNIT_CLOCK_GATE_DISABLE); 2278c2ecf20Sopenharmony_ci intel_uncore_posting_read_fw(uncore, VDECCLK_GATE_D); 2288c2ecf20Sopenharmony_ci 2298c2ecf20Sopenharmony_ci pci_write_config_byte(pdev, I915_GDRST, 2308c2ecf20Sopenharmony_ci GRDOM_MEDIA | GRDOM_RESET_ENABLE); 2318c2ecf20Sopenharmony_ci ret = wait_for_atomic(g4x_reset_complete(pdev), 50); 2328c2ecf20Sopenharmony_ci if (ret) { 2338c2ecf20Sopenharmony_ci drm_dbg(>->i915->drm, "Wait for media reset failed\n"); 2348c2ecf20Sopenharmony_ci goto out; 2358c2ecf20Sopenharmony_ci } 2368c2ecf20Sopenharmony_ci 2378c2ecf20Sopenharmony_ci pci_write_config_byte(pdev, I915_GDRST, 2388c2ecf20Sopenharmony_ci GRDOM_RENDER | GRDOM_RESET_ENABLE); 2398c2ecf20Sopenharmony_ci ret = wait_for_atomic(g4x_reset_complete(pdev), 50); 2408c2ecf20Sopenharmony_ci if (ret) { 2418c2ecf20Sopenharmony_ci drm_dbg(>->i915->drm, "Wait for render reset failed\n"); 2428c2ecf20Sopenharmony_ci goto out; 2438c2ecf20Sopenharmony_ci } 2448c2ecf20Sopenharmony_ci 2458c2ecf20Sopenharmony_ciout: 2468c2ecf20Sopenharmony_ci pci_write_config_byte(pdev, I915_GDRST, 0); 2478c2ecf20Sopenharmony_ci 2488c2ecf20Sopenharmony_ci rmw_clear_fw(uncore, VDECCLK_GATE_D, VCP_UNIT_CLOCK_GATE_DISABLE); 2498c2ecf20Sopenharmony_ci intel_uncore_posting_read_fw(uncore, VDECCLK_GATE_D); 2508c2ecf20Sopenharmony_ci 2518c2ecf20Sopenharmony_ci return ret; 2528c2ecf20Sopenharmony_ci} 2538c2ecf20Sopenharmony_ci 2548c2ecf20Sopenharmony_cistatic int ilk_do_reset(struct intel_gt *gt, intel_engine_mask_t engine_mask, 2558c2ecf20Sopenharmony_ci unsigned int retry) 2568c2ecf20Sopenharmony_ci{ 2578c2ecf20Sopenharmony_ci struct intel_uncore *uncore = gt->uncore; 2588c2ecf20Sopenharmony_ci int ret; 2598c2ecf20Sopenharmony_ci 2608c2ecf20Sopenharmony_ci intel_uncore_write_fw(uncore, ILK_GDSR, 2618c2ecf20Sopenharmony_ci ILK_GRDOM_RENDER | ILK_GRDOM_RESET_ENABLE); 2628c2ecf20Sopenharmony_ci ret = 
__intel_wait_for_register_fw(uncore, ILK_GDSR, 2638c2ecf20Sopenharmony_ci ILK_GRDOM_RESET_ENABLE, 0, 2648c2ecf20Sopenharmony_ci 5000, 0, 2658c2ecf20Sopenharmony_ci NULL); 2668c2ecf20Sopenharmony_ci if (ret) { 2678c2ecf20Sopenharmony_ci drm_dbg(>->i915->drm, "Wait for render reset failed\n"); 2688c2ecf20Sopenharmony_ci goto out; 2698c2ecf20Sopenharmony_ci } 2708c2ecf20Sopenharmony_ci 2718c2ecf20Sopenharmony_ci intel_uncore_write_fw(uncore, ILK_GDSR, 2728c2ecf20Sopenharmony_ci ILK_GRDOM_MEDIA | ILK_GRDOM_RESET_ENABLE); 2738c2ecf20Sopenharmony_ci ret = __intel_wait_for_register_fw(uncore, ILK_GDSR, 2748c2ecf20Sopenharmony_ci ILK_GRDOM_RESET_ENABLE, 0, 2758c2ecf20Sopenharmony_ci 5000, 0, 2768c2ecf20Sopenharmony_ci NULL); 2778c2ecf20Sopenharmony_ci if (ret) { 2788c2ecf20Sopenharmony_ci drm_dbg(>->i915->drm, "Wait for media reset failed\n"); 2798c2ecf20Sopenharmony_ci goto out; 2808c2ecf20Sopenharmony_ci } 2818c2ecf20Sopenharmony_ci 2828c2ecf20Sopenharmony_ciout: 2838c2ecf20Sopenharmony_ci intel_uncore_write_fw(uncore, ILK_GDSR, 0); 2848c2ecf20Sopenharmony_ci intel_uncore_posting_read_fw(uncore, ILK_GDSR); 2858c2ecf20Sopenharmony_ci return ret; 2868c2ecf20Sopenharmony_ci} 2878c2ecf20Sopenharmony_ci 2888c2ecf20Sopenharmony_ci/* Reset the hardware domains (GENX_GRDOM_*) specified by mask */ 2898c2ecf20Sopenharmony_cistatic int gen6_hw_domain_reset(struct intel_gt *gt, u32 hw_domain_mask) 2908c2ecf20Sopenharmony_ci{ 2918c2ecf20Sopenharmony_ci struct intel_uncore *uncore = gt->uncore; 2928c2ecf20Sopenharmony_ci int loops = 2; 2938c2ecf20Sopenharmony_ci int err; 2948c2ecf20Sopenharmony_ci 2958c2ecf20Sopenharmony_ci /* 2968c2ecf20Sopenharmony_ci * GEN6_GDRST is not in the gt power well, no need to check 2978c2ecf20Sopenharmony_ci * for fifo space for the write or forcewake the chip for 2988c2ecf20Sopenharmony_ci * the read 2998c2ecf20Sopenharmony_ci */ 3008c2ecf20Sopenharmony_ci do { 3018c2ecf20Sopenharmony_ci intel_uncore_write_fw(uncore, GEN6_GDRST, hw_domain_mask); 
3028c2ecf20Sopenharmony_ci 3038c2ecf20Sopenharmony_ci /* 3048c2ecf20Sopenharmony_ci * Wait for the device to ack the reset requests. 3058c2ecf20Sopenharmony_ci * 3068c2ecf20Sopenharmony_ci * On some platforms, e.g. Jasperlake, we see that the 3078c2ecf20Sopenharmony_ci * engine register state is not cleared until shortly after 3088c2ecf20Sopenharmony_ci * GDRST reports completion, causing a failure as we try 3098c2ecf20Sopenharmony_ci * to immediately resume while the internal state is still 3108c2ecf20Sopenharmony_ci * in flux. If we immediately repeat the reset, the second 3118c2ecf20Sopenharmony_ci * reset appears to serialise with the first, and since 3128c2ecf20Sopenharmony_ci * it is a no-op, the registers should retain their reset 3138c2ecf20Sopenharmony_ci * value. However, there is still a concern that upon 3148c2ecf20Sopenharmony_ci * leaving the second reset, the internal engine state 3158c2ecf20Sopenharmony_ci * is still in flux and not ready for resuming. 3168c2ecf20Sopenharmony_ci */ 3178c2ecf20Sopenharmony_ci err = __intel_wait_for_register_fw(uncore, GEN6_GDRST, 3188c2ecf20Sopenharmony_ci hw_domain_mask, 0, 3198c2ecf20Sopenharmony_ci 2000, 0, 3208c2ecf20Sopenharmony_ci NULL); 3218c2ecf20Sopenharmony_ci } while (err == 0 && --loops); 3228c2ecf20Sopenharmony_ci if (err) 3238c2ecf20Sopenharmony_ci drm_dbg(>->i915->drm, 3248c2ecf20Sopenharmony_ci "Wait for 0x%08x engines reset failed\n", 3258c2ecf20Sopenharmony_ci hw_domain_mask); 3268c2ecf20Sopenharmony_ci 3278c2ecf20Sopenharmony_ci /* 3288c2ecf20Sopenharmony_ci * As we have observed that the engine state is still volatile 3298c2ecf20Sopenharmony_ci * after GDRST is acked, impose a small delay to let everything settle. 
3308c2ecf20Sopenharmony_ci */ 3318c2ecf20Sopenharmony_ci udelay(50); 3328c2ecf20Sopenharmony_ci 3338c2ecf20Sopenharmony_ci return err; 3348c2ecf20Sopenharmony_ci} 3358c2ecf20Sopenharmony_ci 3368c2ecf20Sopenharmony_cistatic int gen6_reset_engines(struct intel_gt *gt, 3378c2ecf20Sopenharmony_ci intel_engine_mask_t engine_mask, 3388c2ecf20Sopenharmony_ci unsigned int retry) 3398c2ecf20Sopenharmony_ci{ 3408c2ecf20Sopenharmony_ci static const u32 hw_engine_mask[] = { 3418c2ecf20Sopenharmony_ci [RCS0] = GEN6_GRDOM_RENDER, 3428c2ecf20Sopenharmony_ci [BCS0] = GEN6_GRDOM_BLT, 3438c2ecf20Sopenharmony_ci [VCS0] = GEN6_GRDOM_MEDIA, 3448c2ecf20Sopenharmony_ci [VCS1] = GEN8_GRDOM_MEDIA2, 3458c2ecf20Sopenharmony_ci [VECS0] = GEN6_GRDOM_VECS, 3468c2ecf20Sopenharmony_ci }; 3478c2ecf20Sopenharmony_ci struct intel_engine_cs *engine; 3488c2ecf20Sopenharmony_ci u32 hw_mask; 3498c2ecf20Sopenharmony_ci 3508c2ecf20Sopenharmony_ci if (engine_mask == ALL_ENGINES) { 3518c2ecf20Sopenharmony_ci hw_mask = GEN6_GRDOM_FULL; 3528c2ecf20Sopenharmony_ci } else { 3538c2ecf20Sopenharmony_ci intel_engine_mask_t tmp; 3548c2ecf20Sopenharmony_ci 3558c2ecf20Sopenharmony_ci hw_mask = 0; 3568c2ecf20Sopenharmony_ci for_each_engine_masked(engine, gt, engine_mask, tmp) { 3578c2ecf20Sopenharmony_ci GEM_BUG_ON(engine->id >= ARRAY_SIZE(hw_engine_mask)); 3588c2ecf20Sopenharmony_ci hw_mask |= hw_engine_mask[engine->id]; 3598c2ecf20Sopenharmony_ci } 3608c2ecf20Sopenharmony_ci } 3618c2ecf20Sopenharmony_ci 3628c2ecf20Sopenharmony_ci return gen6_hw_domain_reset(gt, hw_mask); 3638c2ecf20Sopenharmony_ci} 3648c2ecf20Sopenharmony_ci 3658c2ecf20Sopenharmony_cistatic int gen11_lock_sfc(struct intel_engine_cs *engine, u32 *hw_mask) 3668c2ecf20Sopenharmony_ci{ 3678c2ecf20Sopenharmony_ci struct intel_uncore *uncore = engine->uncore; 3688c2ecf20Sopenharmony_ci u8 vdbox_sfc_access = engine->gt->info.vdbox_sfc_access; 3698c2ecf20Sopenharmony_ci i915_reg_t sfc_forced_lock, sfc_forced_lock_ack; 3708c2ecf20Sopenharmony_ci u32 
sfc_forced_lock_bit, sfc_forced_lock_ack_bit; 3718c2ecf20Sopenharmony_ci i915_reg_t sfc_usage; 3728c2ecf20Sopenharmony_ci u32 sfc_usage_bit; 3738c2ecf20Sopenharmony_ci u32 sfc_reset_bit; 3748c2ecf20Sopenharmony_ci int ret; 3758c2ecf20Sopenharmony_ci 3768c2ecf20Sopenharmony_ci switch (engine->class) { 3778c2ecf20Sopenharmony_ci case VIDEO_DECODE_CLASS: 3788c2ecf20Sopenharmony_ci if ((BIT(engine->instance) & vdbox_sfc_access) == 0) 3798c2ecf20Sopenharmony_ci return 0; 3808c2ecf20Sopenharmony_ci 3818c2ecf20Sopenharmony_ci sfc_forced_lock = GEN11_VCS_SFC_FORCED_LOCK(engine); 3828c2ecf20Sopenharmony_ci sfc_forced_lock_bit = GEN11_VCS_SFC_FORCED_LOCK_BIT; 3838c2ecf20Sopenharmony_ci 3848c2ecf20Sopenharmony_ci sfc_forced_lock_ack = GEN11_VCS_SFC_LOCK_STATUS(engine); 3858c2ecf20Sopenharmony_ci sfc_forced_lock_ack_bit = GEN11_VCS_SFC_LOCK_ACK_BIT; 3868c2ecf20Sopenharmony_ci 3878c2ecf20Sopenharmony_ci sfc_usage = GEN11_VCS_SFC_LOCK_STATUS(engine); 3888c2ecf20Sopenharmony_ci sfc_usage_bit = GEN11_VCS_SFC_USAGE_BIT; 3898c2ecf20Sopenharmony_ci sfc_reset_bit = GEN11_VCS_SFC_RESET_BIT(engine->instance); 3908c2ecf20Sopenharmony_ci break; 3918c2ecf20Sopenharmony_ci 3928c2ecf20Sopenharmony_ci case VIDEO_ENHANCEMENT_CLASS: 3938c2ecf20Sopenharmony_ci sfc_forced_lock = GEN11_VECS_SFC_FORCED_LOCK(engine); 3948c2ecf20Sopenharmony_ci sfc_forced_lock_bit = GEN11_VECS_SFC_FORCED_LOCK_BIT; 3958c2ecf20Sopenharmony_ci 3968c2ecf20Sopenharmony_ci sfc_forced_lock_ack = GEN11_VECS_SFC_LOCK_ACK(engine); 3978c2ecf20Sopenharmony_ci sfc_forced_lock_ack_bit = GEN11_VECS_SFC_LOCK_ACK_BIT; 3988c2ecf20Sopenharmony_ci 3998c2ecf20Sopenharmony_ci sfc_usage = GEN11_VECS_SFC_USAGE(engine); 4008c2ecf20Sopenharmony_ci sfc_usage_bit = GEN11_VECS_SFC_USAGE_BIT; 4018c2ecf20Sopenharmony_ci sfc_reset_bit = GEN11_VECS_SFC_RESET_BIT(engine->instance); 4028c2ecf20Sopenharmony_ci break; 4038c2ecf20Sopenharmony_ci 4048c2ecf20Sopenharmony_ci default: 4058c2ecf20Sopenharmony_ci return 0; 4068c2ecf20Sopenharmony_ci } 
4078c2ecf20Sopenharmony_ci 4088c2ecf20Sopenharmony_ci /* 4098c2ecf20Sopenharmony_ci * If the engine is using a SFC, tell the engine that a software reset 4108c2ecf20Sopenharmony_ci * is going to happen. The engine will then try to force lock the SFC. 4118c2ecf20Sopenharmony_ci * If SFC ends up being locked to the engine we want to reset, we have 4128c2ecf20Sopenharmony_ci * to reset it as well (we will unlock it once the reset sequence is 4138c2ecf20Sopenharmony_ci * completed). 4148c2ecf20Sopenharmony_ci */ 4158c2ecf20Sopenharmony_ci if (!(intel_uncore_read_fw(uncore, sfc_usage) & sfc_usage_bit)) 4168c2ecf20Sopenharmony_ci return 0; 4178c2ecf20Sopenharmony_ci 4188c2ecf20Sopenharmony_ci rmw_set_fw(uncore, sfc_forced_lock, sfc_forced_lock_bit); 4198c2ecf20Sopenharmony_ci 4208c2ecf20Sopenharmony_ci ret = __intel_wait_for_register_fw(uncore, 4218c2ecf20Sopenharmony_ci sfc_forced_lock_ack, 4228c2ecf20Sopenharmony_ci sfc_forced_lock_ack_bit, 4238c2ecf20Sopenharmony_ci sfc_forced_lock_ack_bit, 4248c2ecf20Sopenharmony_ci 1000, 0, NULL); 4258c2ecf20Sopenharmony_ci 4268c2ecf20Sopenharmony_ci /* Was the SFC released while we were trying to lock it? 
*/ 4278c2ecf20Sopenharmony_ci if (!(intel_uncore_read_fw(uncore, sfc_usage) & sfc_usage_bit)) 4288c2ecf20Sopenharmony_ci return 0; 4298c2ecf20Sopenharmony_ci 4308c2ecf20Sopenharmony_ci if (ret) { 4318c2ecf20Sopenharmony_ci drm_dbg(&engine->i915->drm, 4328c2ecf20Sopenharmony_ci "Wait for SFC forced lock ack failed\n"); 4338c2ecf20Sopenharmony_ci return ret; 4348c2ecf20Sopenharmony_ci } 4358c2ecf20Sopenharmony_ci 4368c2ecf20Sopenharmony_ci *hw_mask |= sfc_reset_bit; 4378c2ecf20Sopenharmony_ci return 0; 4388c2ecf20Sopenharmony_ci} 4398c2ecf20Sopenharmony_ci 4408c2ecf20Sopenharmony_cistatic void gen11_unlock_sfc(struct intel_engine_cs *engine) 4418c2ecf20Sopenharmony_ci{ 4428c2ecf20Sopenharmony_ci struct intel_uncore *uncore = engine->uncore; 4438c2ecf20Sopenharmony_ci u8 vdbox_sfc_access = engine->gt->info.vdbox_sfc_access; 4448c2ecf20Sopenharmony_ci i915_reg_t sfc_forced_lock; 4458c2ecf20Sopenharmony_ci u32 sfc_forced_lock_bit; 4468c2ecf20Sopenharmony_ci 4478c2ecf20Sopenharmony_ci switch (engine->class) { 4488c2ecf20Sopenharmony_ci case VIDEO_DECODE_CLASS: 4498c2ecf20Sopenharmony_ci if ((BIT(engine->instance) & vdbox_sfc_access) == 0) 4508c2ecf20Sopenharmony_ci return; 4518c2ecf20Sopenharmony_ci 4528c2ecf20Sopenharmony_ci sfc_forced_lock = GEN11_VCS_SFC_FORCED_LOCK(engine); 4538c2ecf20Sopenharmony_ci sfc_forced_lock_bit = GEN11_VCS_SFC_FORCED_LOCK_BIT; 4548c2ecf20Sopenharmony_ci break; 4558c2ecf20Sopenharmony_ci 4568c2ecf20Sopenharmony_ci case VIDEO_ENHANCEMENT_CLASS: 4578c2ecf20Sopenharmony_ci sfc_forced_lock = GEN11_VECS_SFC_FORCED_LOCK(engine); 4588c2ecf20Sopenharmony_ci sfc_forced_lock_bit = GEN11_VECS_SFC_FORCED_LOCK_BIT; 4598c2ecf20Sopenharmony_ci break; 4608c2ecf20Sopenharmony_ci 4618c2ecf20Sopenharmony_ci default: 4628c2ecf20Sopenharmony_ci return; 4638c2ecf20Sopenharmony_ci } 4648c2ecf20Sopenharmony_ci 4658c2ecf20Sopenharmony_ci rmw_clear_fw(uncore, sfc_forced_lock, sfc_forced_lock_bit); 4668c2ecf20Sopenharmony_ci} 4678c2ecf20Sopenharmony_ci 
4688c2ecf20Sopenharmony_cistatic int gen11_reset_engines(struct intel_gt *gt, 4698c2ecf20Sopenharmony_ci intel_engine_mask_t engine_mask, 4708c2ecf20Sopenharmony_ci unsigned int retry) 4718c2ecf20Sopenharmony_ci{ 4728c2ecf20Sopenharmony_ci static const u32 hw_engine_mask[] = { 4738c2ecf20Sopenharmony_ci [RCS0] = GEN11_GRDOM_RENDER, 4748c2ecf20Sopenharmony_ci [BCS0] = GEN11_GRDOM_BLT, 4758c2ecf20Sopenharmony_ci [VCS0] = GEN11_GRDOM_MEDIA, 4768c2ecf20Sopenharmony_ci [VCS1] = GEN11_GRDOM_MEDIA2, 4778c2ecf20Sopenharmony_ci [VCS2] = GEN11_GRDOM_MEDIA3, 4788c2ecf20Sopenharmony_ci [VCS3] = GEN11_GRDOM_MEDIA4, 4798c2ecf20Sopenharmony_ci [VECS0] = GEN11_GRDOM_VECS, 4808c2ecf20Sopenharmony_ci [VECS1] = GEN11_GRDOM_VECS2, 4818c2ecf20Sopenharmony_ci }; 4828c2ecf20Sopenharmony_ci struct intel_engine_cs *engine; 4838c2ecf20Sopenharmony_ci intel_engine_mask_t tmp; 4848c2ecf20Sopenharmony_ci u32 hw_mask; 4858c2ecf20Sopenharmony_ci int ret; 4868c2ecf20Sopenharmony_ci 4878c2ecf20Sopenharmony_ci if (engine_mask == ALL_ENGINES) { 4888c2ecf20Sopenharmony_ci hw_mask = GEN11_GRDOM_FULL; 4898c2ecf20Sopenharmony_ci } else { 4908c2ecf20Sopenharmony_ci hw_mask = 0; 4918c2ecf20Sopenharmony_ci for_each_engine_masked(engine, gt, engine_mask, tmp) { 4928c2ecf20Sopenharmony_ci GEM_BUG_ON(engine->id >= ARRAY_SIZE(hw_engine_mask)); 4938c2ecf20Sopenharmony_ci hw_mask |= hw_engine_mask[engine->id]; 4948c2ecf20Sopenharmony_ci ret = gen11_lock_sfc(engine, &hw_mask); 4958c2ecf20Sopenharmony_ci if (ret) 4968c2ecf20Sopenharmony_ci goto sfc_unlock; 4978c2ecf20Sopenharmony_ci } 4988c2ecf20Sopenharmony_ci } 4998c2ecf20Sopenharmony_ci 5008c2ecf20Sopenharmony_ci ret = gen6_hw_domain_reset(gt, hw_mask); 5018c2ecf20Sopenharmony_ci 5028c2ecf20Sopenharmony_cisfc_unlock: 5038c2ecf20Sopenharmony_ci /* 5048c2ecf20Sopenharmony_ci * We unlock the SFC based on the lock status and not the result of 5058c2ecf20Sopenharmony_ci * gen11_lock_sfc to make sure that we clean properly if something 5068c2ecf20Sopenharmony_ci * 
wrong happened during the lock (e.g. lock acquired after timeout 5078c2ecf20Sopenharmony_ci * expiration). 5088c2ecf20Sopenharmony_ci */ 5098c2ecf20Sopenharmony_ci if (engine_mask != ALL_ENGINES) 5108c2ecf20Sopenharmony_ci for_each_engine_masked(engine, gt, engine_mask, tmp) 5118c2ecf20Sopenharmony_ci gen11_unlock_sfc(engine); 5128c2ecf20Sopenharmony_ci 5138c2ecf20Sopenharmony_ci return ret; 5148c2ecf20Sopenharmony_ci} 5158c2ecf20Sopenharmony_ci 5168c2ecf20Sopenharmony_cistatic int gen8_engine_reset_prepare(struct intel_engine_cs *engine) 5178c2ecf20Sopenharmony_ci{ 5188c2ecf20Sopenharmony_ci struct intel_uncore *uncore = engine->uncore; 5198c2ecf20Sopenharmony_ci const i915_reg_t reg = RING_RESET_CTL(engine->mmio_base); 5208c2ecf20Sopenharmony_ci u32 request, mask, ack; 5218c2ecf20Sopenharmony_ci int ret; 5228c2ecf20Sopenharmony_ci 5238c2ecf20Sopenharmony_ci ack = intel_uncore_read_fw(uncore, reg); 5248c2ecf20Sopenharmony_ci if (ack & RESET_CTL_CAT_ERROR) { 5258c2ecf20Sopenharmony_ci /* 5268c2ecf20Sopenharmony_ci * For catastrophic errors, ready-for-reset sequence 5278c2ecf20Sopenharmony_ci * needs to be bypassed: HAS#396813 5288c2ecf20Sopenharmony_ci */ 5298c2ecf20Sopenharmony_ci request = RESET_CTL_CAT_ERROR; 5308c2ecf20Sopenharmony_ci mask = RESET_CTL_CAT_ERROR; 5318c2ecf20Sopenharmony_ci 5328c2ecf20Sopenharmony_ci /* Catastrophic errors need to be cleared by HW */ 5338c2ecf20Sopenharmony_ci ack = 0; 5348c2ecf20Sopenharmony_ci } else if (!(ack & RESET_CTL_READY_TO_RESET)) { 5358c2ecf20Sopenharmony_ci request = RESET_CTL_REQUEST_RESET; 5368c2ecf20Sopenharmony_ci mask = RESET_CTL_READY_TO_RESET; 5378c2ecf20Sopenharmony_ci ack = RESET_CTL_READY_TO_RESET; 5388c2ecf20Sopenharmony_ci } else { 5398c2ecf20Sopenharmony_ci return 0; 5408c2ecf20Sopenharmony_ci } 5418c2ecf20Sopenharmony_ci 5428c2ecf20Sopenharmony_ci intel_uncore_write_fw(uncore, reg, _MASKED_BIT_ENABLE(request)); 5438c2ecf20Sopenharmony_ci ret = __intel_wait_for_register_fw(uncore, reg, mask, ack, 
5448c2ecf20Sopenharmony_ci 700, 0, NULL); 5458c2ecf20Sopenharmony_ci if (ret) 5468c2ecf20Sopenharmony_ci drm_err(&engine->i915->drm, 5478c2ecf20Sopenharmony_ci "%s reset request timed out: {request: %08x, RESET_CTL: %08x}\n", 5488c2ecf20Sopenharmony_ci engine->name, request, 5498c2ecf20Sopenharmony_ci intel_uncore_read_fw(uncore, reg)); 5508c2ecf20Sopenharmony_ci 5518c2ecf20Sopenharmony_ci return ret; 5528c2ecf20Sopenharmony_ci} 5538c2ecf20Sopenharmony_ci 5548c2ecf20Sopenharmony_cistatic void gen8_engine_reset_cancel(struct intel_engine_cs *engine) 5558c2ecf20Sopenharmony_ci{ 5568c2ecf20Sopenharmony_ci intel_uncore_write_fw(engine->uncore, 5578c2ecf20Sopenharmony_ci RING_RESET_CTL(engine->mmio_base), 5588c2ecf20Sopenharmony_ci _MASKED_BIT_DISABLE(RESET_CTL_REQUEST_RESET)); 5598c2ecf20Sopenharmony_ci} 5608c2ecf20Sopenharmony_ci 5618c2ecf20Sopenharmony_cistatic int gen8_reset_engines(struct intel_gt *gt, 5628c2ecf20Sopenharmony_ci intel_engine_mask_t engine_mask, 5638c2ecf20Sopenharmony_ci unsigned int retry) 5648c2ecf20Sopenharmony_ci{ 5658c2ecf20Sopenharmony_ci struct intel_engine_cs *engine; 5668c2ecf20Sopenharmony_ci const bool reset_non_ready = retry >= 1; 5678c2ecf20Sopenharmony_ci intel_engine_mask_t tmp; 5688c2ecf20Sopenharmony_ci int ret; 5698c2ecf20Sopenharmony_ci 5708c2ecf20Sopenharmony_ci for_each_engine_masked(engine, gt, engine_mask, tmp) { 5718c2ecf20Sopenharmony_ci ret = gen8_engine_reset_prepare(engine); 5728c2ecf20Sopenharmony_ci if (ret && !reset_non_ready) 5738c2ecf20Sopenharmony_ci goto skip_reset; 5748c2ecf20Sopenharmony_ci 5758c2ecf20Sopenharmony_ci /* 5768c2ecf20Sopenharmony_ci * If this is not the first failed attempt to prepare, 5778c2ecf20Sopenharmony_ci * we decide to proceed anyway. 5788c2ecf20Sopenharmony_ci * 5798c2ecf20Sopenharmony_ci * By doing so we risk context corruption and with 5808c2ecf20Sopenharmony_ci * some gens (kbl), possible system hang if reset 5818c2ecf20Sopenharmony_ci * happens during active bb execution. 
5828c2ecf20Sopenharmony_ci * 5838c2ecf20Sopenharmony_ci * We rather take context corruption instead of 5848c2ecf20Sopenharmony_ci * failed reset with a wedged driver/gpu. And 5858c2ecf20Sopenharmony_ci * active bb execution case should be covered by 5868c2ecf20Sopenharmony_ci * stop_engines() we have before the reset. 5878c2ecf20Sopenharmony_ci */ 5888c2ecf20Sopenharmony_ci } 5898c2ecf20Sopenharmony_ci 5908c2ecf20Sopenharmony_ci if (INTEL_GEN(gt->i915) >= 11) 5918c2ecf20Sopenharmony_ci ret = gen11_reset_engines(gt, engine_mask, retry); 5928c2ecf20Sopenharmony_ci else 5938c2ecf20Sopenharmony_ci ret = gen6_reset_engines(gt, engine_mask, retry); 5948c2ecf20Sopenharmony_ci 5958c2ecf20Sopenharmony_ciskip_reset: 5968c2ecf20Sopenharmony_ci for_each_engine_masked(engine, gt, engine_mask, tmp) 5978c2ecf20Sopenharmony_ci gen8_engine_reset_cancel(engine); 5988c2ecf20Sopenharmony_ci 5998c2ecf20Sopenharmony_ci return ret; 6008c2ecf20Sopenharmony_ci} 6018c2ecf20Sopenharmony_ci 6028c2ecf20Sopenharmony_cistatic int mock_reset(struct intel_gt *gt, 6038c2ecf20Sopenharmony_ci intel_engine_mask_t mask, 6048c2ecf20Sopenharmony_ci unsigned int retry) 6058c2ecf20Sopenharmony_ci{ 6068c2ecf20Sopenharmony_ci return 0; 6078c2ecf20Sopenharmony_ci} 6088c2ecf20Sopenharmony_ci 6098c2ecf20Sopenharmony_citypedef int (*reset_func)(struct intel_gt *, 6108c2ecf20Sopenharmony_ci intel_engine_mask_t engine_mask, 6118c2ecf20Sopenharmony_ci unsigned int retry); 6128c2ecf20Sopenharmony_ci 6138c2ecf20Sopenharmony_cistatic reset_func intel_get_gpu_reset(const struct intel_gt *gt) 6148c2ecf20Sopenharmony_ci{ 6158c2ecf20Sopenharmony_ci struct drm_i915_private *i915 = gt->i915; 6168c2ecf20Sopenharmony_ci 6178c2ecf20Sopenharmony_ci if (is_mock_gt(gt)) 6188c2ecf20Sopenharmony_ci return mock_reset; 6198c2ecf20Sopenharmony_ci else if (INTEL_GEN(i915) >= 8) 6208c2ecf20Sopenharmony_ci return gen8_reset_engines; 6218c2ecf20Sopenharmony_ci else if (INTEL_GEN(i915) >= 6) 6228c2ecf20Sopenharmony_ci return 
gen6_reset_engines; 6238c2ecf20Sopenharmony_ci else if (INTEL_GEN(i915) >= 5) 6248c2ecf20Sopenharmony_ci return ilk_do_reset; 6258c2ecf20Sopenharmony_ci else if (IS_G4X(i915)) 6268c2ecf20Sopenharmony_ci return g4x_do_reset; 6278c2ecf20Sopenharmony_ci else if (IS_G33(i915) || IS_PINEVIEW(i915)) 6288c2ecf20Sopenharmony_ci return g33_do_reset; 6298c2ecf20Sopenharmony_ci else if (INTEL_GEN(i915) >= 3) 6308c2ecf20Sopenharmony_ci return i915_do_reset; 6318c2ecf20Sopenharmony_ci else 6328c2ecf20Sopenharmony_ci return NULL; 6338c2ecf20Sopenharmony_ci} 6348c2ecf20Sopenharmony_ci 6358c2ecf20Sopenharmony_ciint __intel_gt_reset(struct intel_gt *gt, intel_engine_mask_t engine_mask) 6368c2ecf20Sopenharmony_ci{ 6378c2ecf20Sopenharmony_ci const int retries = engine_mask == ALL_ENGINES ? RESET_MAX_RETRIES : 1; 6388c2ecf20Sopenharmony_ci reset_func reset; 6398c2ecf20Sopenharmony_ci int ret = -ETIMEDOUT; 6408c2ecf20Sopenharmony_ci int retry; 6418c2ecf20Sopenharmony_ci 6428c2ecf20Sopenharmony_ci reset = intel_get_gpu_reset(gt); 6438c2ecf20Sopenharmony_ci if (!reset) 6448c2ecf20Sopenharmony_ci return -ENODEV; 6458c2ecf20Sopenharmony_ci 6468c2ecf20Sopenharmony_ci /* 6478c2ecf20Sopenharmony_ci * If the power well sleeps during the reset, the reset 6488c2ecf20Sopenharmony_ci * request may be dropped and never completes (causing -EIO). 
6498c2ecf20Sopenharmony_ci */ 6508c2ecf20Sopenharmony_ci intel_uncore_forcewake_get(gt->uncore, FORCEWAKE_ALL); 6518c2ecf20Sopenharmony_ci for (retry = 0; ret == -ETIMEDOUT && retry < retries; retry++) { 6528c2ecf20Sopenharmony_ci GT_TRACE(gt, "engine_mask=%x\n", engine_mask); 6538c2ecf20Sopenharmony_ci preempt_disable(); 6548c2ecf20Sopenharmony_ci ret = reset(gt, engine_mask, retry); 6558c2ecf20Sopenharmony_ci preempt_enable(); 6568c2ecf20Sopenharmony_ci } 6578c2ecf20Sopenharmony_ci intel_uncore_forcewake_put(gt->uncore, FORCEWAKE_ALL); 6588c2ecf20Sopenharmony_ci 6598c2ecf20Sopenharmony_ci return ret; 6608c2ecf20Sopenharmony_ci} 6618c2ecf20Sopenharmony_ci 6628c2ecf20Sopenharmony_cibool intel_has_gpu_reset(const struct intel_gt *gt) 6638c2ecf20Sopenharmony_ci{ 6648c2ecf20Sopenharmony_ci if (!gt->i915->params.reset) 6658c2ecf20Sopenharmony_ci return NULL; 6668c2ecf20Sopenharmony_ci 6678c2ecf20Sopenharmony_ci return intel_get_gpu_reset(gt); 6688c2ecf20Sopenharmony_ci} 6698c2ecf20Sopenharmony_ci 6708c2ecf20Sopenharmony_cibool intel_has_reset_engine(const struct intel_gt *gt) 6718c2ecf20Sopenharmony_ci{ 6728c2ecf20Sopenharmony_ci if (gt->i915->params.reset < 2) 6738c2ecf20Sopenharmony_ci return false; 6748c2ecf20Sopenharmony_ci 6758c2ecf20Sopenharmony_ci return INTEL_INFO(gt->i915)->has_reset_engine; 6768c2ecf20Sopenharmony_ci} 6778c2ecf20Sopenharmony_ci 6788c2ecf20Sopenharmony_ciint intel_reset_guc(struct intel_gt *gt) 6798c2ecf20Sopenharmony_ci{ 6808c2ecf20Sopenharmony_ci u32 guc_domain = 6818c2ecf20Sopenharmony_ci INTEL_GEN(gt->i915) >= 11 ? 
GEN11_GRDOM_GUC : GEN9_GRDOM_GUC; 6828c2ecf20Sopenharmony_ci int ret; 6838c2ecf20Sopenharmony_ci 6848c2ecf20Sopenharmony_ci GEM_BUG_ON(!HAS_GT_UC(gt->i915)); 6858c2ecf20Sopenharmony_ci 6868c2ecf20Sopenharmony_ci intel_uncore_forcewake_get(gt->uncore, FORCEWAKE_ALL); 6878c2ecf20Sopenharmony_ci ret = gen6_hw_domain_reset(gt, guc_domain); 6888c2ecf20Sopenharmony_ci intel_uncore_forcewake_put(gt->uncore, FORCEWAKE_ALL); 6898c2ecf20Sopenharmony_ci 6908c2ecf20Sopenharmony_ci return ret; 6918c2ecf20Sopenharmony_ci} 6928c2ecf20Sopenharmony_ci 6938c2ecf20Sopenharmony_ci/* 6948c2ecf20Sopenharmony_ci * Ensure irq handler finishes, and not run again. 6958c2ecf20Sopenharmony_ci * Also return the active request so that we only search for it once. 6968c2ecf20Sopenharmony_ci */ 6978c2ecf20Sopenharmony_cistatic void reset_prepare_engine(struct intel_engine_cs *engine) 6988c2ecf20Sopenharmony_ci{ 6998c2ecf20Sopenharmony_ci /* 7008c2ecf20Sopenharmony_ci * During the reset sequence, we must prevent the engine from 7018c2ecf20Sopenharmony_ci * entering RC6. As the context state is undefined until we restart 7028c2ecf20Sopenharmony_ci * the engine, if it does enter RC6 during the reset, the state 7038c2ecf20Sopenharmony_ci * written to the powercontext is undefined and so we may lose 7048c2ecf20Sopenharmony_ci * GPU state upon resume, i.e. fail to restart after a reset. 
7058c2ecf20Sopenharmony_ci */ 7068c2ecf20Sopenharmony_ci intel_uncore_forcewake_get(engine->uncore, FORCEWAKE_ALL); 7078c2ecf20Sopenharmony_ci if (engine->reset.prepare) 7088c2ecf20Sopenharmony_ci engine->reset.prepare(engine); 7098c2ecf20Sopenharmony_ci} 7108c2ecf20Sopenharmony_ci 7118c2ecf20Sopenharmony_cistatic void revoke_mmaps(struct intel_gt *gt) 7128c2ecf20Sopenharmony_ci{ 7138c2ecf20Sopenharmony_ci int i; 7148c2ecf20Sopenharmony_ci 7158c2ecf20Sopenharmony_ci for (i = 0; i < gt->ggtt->num_fences; i++) { 7168c2ecf20Sopenharmony_ci struct drm_vma_offset_node *node; 7178c2ecf20Sopenharmony_ci struct i915_vma *vma; 7188c2ecf20Sopenharmony_ci u64 vma_offset; 7198c2ecf20Sopenharmony_ci 7208c2ecf20Sopenharmony_ci vma = READ_ONCE(gt->ggtt->fence_regs[i].vma); 7218c2ecf20Sopenharmony_ci if (!vma) 7228c2ecf20Sopenharmony_ci continue; 7238c2ecf20Sopenharmony_ci 7248c2ecf20Sopenharmony_ci if (!i915_vma_has_userfault(vma)) 7258c2ecf20Sopenharmony_ci continue; 7268c2ecf20Sopenharmony_ci 7278c2ecf20Sopenharmony_ci GEM_BUG_ON(vma->fence != >->ggtt->fence_regs[i]); 7288c2ecf20Sopenharmony_ci 7298c2ecf20Sopenharmony_ci if (!vma->mmo) 7308c2ecf20Sopenharmony_ci continue; 7318c2ecf20Sopenharmony_ci 7328c2ecf20Sopenharmony_ci node = &vma->mmo->vma_node; 7338c2ecf20Sopenharmony_ci vma_offset = vma->ggtt_view.partial.offset << PAGE_SHIFT; 7348c2ecf20Sopenharmony_ci 7358c2ecf20Sopenharmony_ci unmap_mapping_range(gt->i915->drm.anon_inode->i_mapping, 7368c2ecf20Sopenharmony_ci drm_vma_node_offset_addr(node) + vma_offset, 7378c2ecf20Sopenharmony_ci vma->size, 7388c2ecf20Sopenharmony_ci 1); 7398c2ecf20Sopenharmony_ci } 7408c2ecf20Sopenharmony_ci} 7418c2ecf20Sopenharmony_ci 7428c2ecf20Sopenharmony_cistatic intel_engine_mask_t reset_prepare(struct intel_gt *gt) 7438c2ecf20Sopenharmony_ci{ 7448c2ecf20Sopenharmony_ci struct intel_engine_cs *engine; 7458c2ecf20Sopenharmony_ci intel_engine_mask_t awake = 0; 7468c2ecf20Sopenharmony_ci enum intel_engine_id id; 7478c2ecf20Sopenharmony_ci 
7488c2ecf20Sopenharmony_ci for_each_engine(engine, gt, id) { 7498c2ecf20Sopenharmony_ci if (intel_engine_pm_get_if_awake(engine)) 7508c2ecf20Sopenharmony_ci awake |= engine->mask; 7518c2ecf20Sopenharmony_ci reset_prepare_engine(engine); 7528c2ecf20Sopenharmony_ci } 7538c2ecf20Sopenharmony_ci 7548c2ecf20Sopenharmony_ci intel_uc_reset_prepare(>->uc); 7558c2ecf20Sopenharmony_ci 7568c2ecf20Sopenharmony_ci return awake; 7578c2ecf20Sopenharmony_ci} 7588c2ecf20Sopenharmony_ci 7598c2ecf20Sopenharmony_cistatic void gt_revoke(struct intel_gt *gt) 7608c2ecf20Sopenharmony_ci{ 7618c2ecf20Sopenharmony_ci revoke_mmaps(gt); 7628c2ecf20Sopenharmony_ci} 7638c2ecf20Sopenharmony_ci 7648c2ecf20Sopenharmony_cistatic int gt_reset(struct intel_gt *gt, intel_engine_mask_t stalled_mask) 7658c2ecf20Sopenharmony_ci{ 7668c2ecf20Sopenharmony_ci struct intel_engine_cs *engine; 7678c2ecf20Sopenharmony_ci enum intel_engine_id id; 7688c2ecf20Sopenharmony_ci int err; 7698c2ecf20Sopenharmony_ci 7708c2ecf20Sopenharmony_ci /* 7718c2ecf20Sopenharmony_ci * Everything depends on having the GTT running, so we need to start 7728c2ecf20Sopenharmony_ci * there. 
7738c2ecf20Sopenharmony_ci */ 7748c2ecf20Sopenharmony_ci err = i915_ggtt_enable_hw(gt->i915); 7758c2ecf20Sopenharmony_ci if (err) 7768c2ecf20Sopenharmony_ci return err; 7778c2ecf20Sopenharmony_ci 7788c2ecf20Sopenharmony_ci for_each_engine(engine, gt, id) 7798c2ecf20Sopenharmony_ci __intel_engine_reset(engine, stalled_mask & engine->mask); 7808c2ecf20Sopenharmony_ci 7818c2ecf20Sopenharmony_ci intel_ggtt_restore_fences(gt->ggtt); 7828c2ecf20Sopenharmony_ci 7838c2ecf20Sopenharmony_ci return err; 7848c2ecf20Sopenharmony_ci} 7858c2ecf20Sopenharmony_ci 7868c2ecf20Sopenharmony_cistatic void reset_finish_engine(struct intel_engine_cs *engine) 7878c2ecf20Sopenharmony_ci{ 7888c2ecf20Sopenharmony_ci if (engine->reset.finish) 7898c2ecf20Sopenharmony_ci engine->reset.finish(engine); 7908c2ecf20Sopenharmony_ci intel_uncore_forcewake_put(engine->uncore, FORCEWAKE_ALL); 7918c2ecf20Sopenharmony_ci 7928c2ecf20Sopenharmony_ci intel_engine_signal_breadcrumbs(engine); 7938c2ecf20Sopenharmony_ci} 7948c2ecf20Sopenharmony_ci 7958c2ecf20Sopenharmony_cistatic void reset_finish(struct intel_gt *gt, intel_engine_mask_t awake) 7968c2ecf20Sopenharmony_ci{ 7978c2ecf20Sopenharmony_ci struct intel_engine_cs *engine; 7988c2ecf20Sopenharmony_ci enum intel_engine_id id; 7998c2ecf20Sopenharmony_ci 8008c2ecf20Sopenharmony_ci for_each_engine(engine, gt, id) { 8018c2ecf20Sopenharmony_ci reset_finish_engine(engine); 8028c2ecf20Sopenharmony_ci if (awake & engine->mask) 8038c2ecf20Sopenharmony_ci intel_engine_pm_put(engine); 8048c2ecf20Sopenharmony_ci } 8058c2ecf20Sopenharmony_ci} 8068c2ecf20Sopenharmony_ci 8078c2ecf20Sopenharmony_cistatic void nop_submit_request(struct i915_request *request) 8088c2ecf20Sopenharmony_ci{ 8098c2ecf20Sopenharmony_ci struct intel_engine_cs *engine = request->engine; 8108c2ecf20Sopenharmony_ci unsigned long flags; 8118c2ecf20Sopenharmony_ci 8128c2ecf20Sopenharmony_ci RQ_TRACE(request, "-EIO\n"); 8138c2ecf20Sopenharmony_ci i915_request_set_error_once(request, -EIO); 

	spin_lock_irqsave(&engine->active.lock, flags);
	__i915_request_submit(request);
	i915_request_mark_complete(request);
	spin_unlock_irqrestore(&engine->active.lock, flags);

	intel_engine_signal_breadcrumbs(engine);
}

/*
 * Mark the whole GT as terminally wedged: complete all requests with -EIO
 * and replace submission so future requests fail immediately. Idempotent.
 */
static void __intel_gt_set_wedged(struct intel_gt *gt)
{
	struct intel_engine_cs *engine;
	intel_engine_mask_t awake;
	enum intel_engine_id id;

	if (test_bit(I915_WEDGED, &gt->reset.flags))
		return;

	GT_TRACE(gt, "start\n");

	/*
	 * First, stop submission to hw, but do not yet complete requests by
	 * rolling the global seqno forward (since this would complete requests
	 * for which we haven't set the fence error to EIO yet).
	 */
	awake = reset_prepare(gt);

	/* Even if the GPU reset fails, it should still stop the engines */
	if (!INTEL_INFO(gt->i915)->gpu_reset_clobbers_display)
		__intel_gt_reset(gt, ALL_ENGINES);

	for_each_engine(engine, gt, id)
		engine->submit_request = nop_submit_request;

	/*
	 * Make sure no request can slip through without getting completed by
	 * either this call here to intel_engine_write_global_seqno, or the one
	 * in nop_submit_request.
	 */
	synchronize_rcu_expedited();
	set_bit(I915_WEDGED, &gt->reset.flags);

	/* Mark all executing requests as skipped */
	for_each_engine(engine, gt, id)
		if (engine->reset.cancel)
			engine->reset.cancel(engine);

	reset_finish(gt, awake);

	GT_TRACE(gt, "end\n");
}

/* Locked wrapper around __intel_gt_set_wedged(), with debug engine dumps. */
void intel_gt_set_wedged(struct intel_gt *gt)
{
	intel_wakeref_t wakeref;

	if (test_bit(I915_WEDGED, &gt->reset.flags))
		return;

	wakeref = intel_runtime_pm_get(gt->uncore->rpm);
	mutex_lock(&gt->reset.mutex);

	if (GEM_SHOW_DEBUG()) {
		struct drm_printer p = drm_debug_printer(__func__);
		struct intel_engine_cs *engine;
		enum intel_engine_id id;

		drm_printf(&p, "called from %pS\n", (void *)_RET_IP_);
		for_each_engine(engine, gt, id) {
			if (intel_engine_is_idle(engine))
				continue;

			intel_engine_dump(engine, &p, "%s\n", engine->name);
		}
	}

	__intel_gt_set_wedged(gt);

	mutex_unlock(&gt->reset.mutex);
	intel_runtime_pm_put(gt->uncore->rpm, wakeref);
}

/*
 * Attempt to recover from a wedged state. Returns true when the GT is
 * usable again, false when the wedge is unrecoverable (caller should give
 * up). Must be called under gt->reset.mutex.
 */
static bool __intel_gt_unset_wedged(struct intel_gt *gt)
{
	struct intel_gt_timelines *timelines = &gt->timelines;
	struct intel_timeline *tl;
	bool ok;

	if (!test_bit(I915_WEDGED, &gt->reset.flags))
		return true;

	/* Never fully initialised, recovery impossible */
	if (intel_gt_has_unrecoverable_error(gt))
		return false;

	GT_TRACE(gt, "start\n");

	/*
	 * Before unwedging, make sure that all pending operations
	 * are flushed and errored out - we may have requests waiting upon
	 * third party fences. We marked all inflight requests as EIO, and
	 * every execbuf since returned EIO, for consistency we want all
	 * the currently pending requests to also be marked as EIO, which
	 * is done inside our nop_submit_request - and so we must wait.
	 *
	 * No more can be submitted until we reset the wedged bit.
	 */
	spin_lock(&timelines->lock);
	list_for_each_entry(tl, &timelines->active_list, link) {
		struct dma_fence *fence;

		fence = i915_active_fence_get(&tl->last_request);
		if (!fence)
			continue;

		/* Drop the lock while we block on the fence. */
		spin_unlock(&timelines->lock);

		/*
		 * All internal dependencies (i915_requests) will have
		 * been flushed by the set-wedge, but we may be stuck waiting
		 * for external fences. These should all be capped to 10s
		 * (I915_FENCE_TIMEOUT) so this wait should not be unbounded
		 * in the worst case.
		 */
		dma_fence_default_wait(fence, false, MAX_SCHEDULE_TIMEOUT);
		dma_fence_put(fence);

		/* Restart iteration after dropping lock */
		spin_lock(&timelines->lock);
		tl = list_entry(&timelines->active_list, typeof(*tl), link);
	}
	spin_unlock(&timelines->lock);

	/* We must reset pending GPU events before restoring our submission */
	ok = !HAS_EXECLISTS(gt->i915); /* XXX better agnosticism desired */
	if (!INTEL_INFO(gt->i915)->gpu_reset_clobbers_display)
		ok = __intel_gt_reset(gt, ALL_ENGINES) == 0;
	if (!ok) {
		/*
		 * Warn CI about the unrecoverable wedged condition.
		 * Time for a reboot.
		 */
		add_taint_for_CI(gt->i915, TAINT_WARN);
		return false;
	}

	/*
	 * Undo nop_submit_request. We prevent all new i915 requests from
	 * being queued (by disallowing execbuf whilst wedged) so having
	 * waited for all active requests above, we know the system is idle
	 * and do not have to worry about a thread being inside
	 * engine->submit_request() as we swap over. So unlike installing
	 * the nop_submit_request on reset, we can do this from normal
	 * context and do not require stop_machine().
9688c2ecf20Sopenharmony_ci */ 9698c2ecf20Sopenharmony_ci intel_engines_reset_default_submission(gt); 9708c2ecf20Sopenharmony_ci 9718c2ecf20Sopenharmony_ci GT_TRACE(gt, "end\n"); 9728c2ecf20Sopenharmony_ci 9738c2ecf20Sopenharmony_ci smp_mb__before_atomic(); /* complete takeover before enabling execbuf */ 9748c2ecf20Sopenharmony_ci clear_bit(I915_WEDGED, >->reset.flags); 9758c2ecf20Sopenharmony_ci 9768c2ecf20Sopenharmony_ci return true; 9778c2ecf20Sopenharmony_ci} 9788c2ecf20Sopenharmony_ci 9798c2ecf20Sopenharmony_cibool intel_gt_unset_wedged(struct intel_gt *gt) 9808c2ecf20Sopenharmony_ci{ 9818c2ecf20Sopenharmony_ci bool result; 9828c2ecf20Sopenharmony_ci 9838c2ecf20Sopenharmony_ci mutex_lock(>->reset.mutex); 9848c2ecf20Sopenharmony_ci result = __intel_gt_unset_wedged(gt); 9858c2ecf20Sopenharmony_ci mutex_unlock(>->reset.mutex); 9868c2ecf20Sopenharmony_ci 9878c2ecf20Sopenharmony_ci return result; 9888c2ecf20Sopenharmony_ci} 9898c2ecf20Sopenharmony_ci 9908c2ecf20Sopenharmony_cistatic int do_reset(struct intel_gt *gt, intel_engine_mask_t stalled_mask) 9918c2ecf20Sopenharmony_ci{ 9928c2ecf20Sopenharmony_ci int err, i; 9938c2ecf20Sopenharmony_ci 9948c2ecf20Sopenharmony_ci gt_revoke(gt); 9958c2ecf20Sopenharmony_ci 9968c2ecf20Sopenharmony_ci err = __intel_gt_reset(gt, ALL_ENGINES); 9978c2ecf20Sopenharmony_ci for (i = 0; err && i < RESET_MAX_RETRIES; i++) { 9988c2ecf20Sopenharmony_ci msleep(10 * (i + 1)); 9998c2ecf20Sopenharmony_ci err = __intel_gt_reset(gt, ALL_ENGINES); 10008c2ecf20Sopenharmony_ci } 10018c2ecf20Sopenharmony_ci if (err) 10028c2ecf20Sopenharmony_ci return err; 10038c2ecf20Sopenharmony_ci 10048c2ecf20Sopenharmony_ci return gt_reset(gt, stalled_mask); 10058c2ecf20Sopenharmony_ci} 10068c2ecf20Sopenharmony_ci 10078c2ecf20Sopenharmony_cistatic int resume(struct intel_gt *gt) 10088c2ecf20Sopenharmony_ci{ 10098c2ecf20Sopenharmony_ci struct intel_engine_cs *engine; 10108c2ecf20Sopenharmony_ci enum intel_engine_id id; 10118c2ecf20Sopenharmony_ci int ret; 

	for_each_engine(engine, gt, id) {
		ret = intel_engine_resume(engine);
		if (ret)
			return ret;
	}

	return 0;
}

/**
 * intel_gt_reset - reset chip after a hang
 * @gt: #intel_gt to reset
 * @stalled_mask: mask of the stalled engines with the guilty requests
 * @reason: user error message for why we are resetting
 *
 * Reset the chip.  Useful if a hang is detected. Marks the device as wedged
 * on failure.
 *
 * Procedure is fairly simple:
 *   - reset the chip using the reset reg
 *   - re-init context state
 *   - re-init hardware status page
 *   - re-init ring buffer
 *   - re-init interrupt state
 *   - re-init display
 */
void intel_gt_reset(struct intel_gt *gt,
		    intel_engine_mask_t stalled_mask,
		    const char *reason)
{
	intel_engine_mask_t awake;
	int ret;

	GT_TRACE(gt, "flags=%lx\n", gt->reset.flags);

	might_sleep();
	/* Caller must have claimed the reset backoff before calling us. */
	GEM_BUG_ON(!test_bit(I915_RESET_BACKOFF, &gt->reset.flags));
	mutex_lock(&gt->reset.mutex);

	/* Clear any previous failed attempts at recovery. Time to try again. */
	if (!__intel_gt_unset_wedged(gt))
		goto unlock;

	if (reason)
		drm_notice(&gt->i915->drm,
			   "Resetting chip for %s\n", reason);
	atomic_inc(&gt->i915->gpu_error.reset_count);

	awake = reset_prepare(gt);

	if (!intel_has_gpu_reset(gt)) {
		if (gt->i915->params.reset)
			drm_err(&gt->i915->drm, "GPU reset not supported\n");
		else
			drm_dbg(&gt->i915->drm, "GPU reset disabled\n");
		goto error;
	}

	/* Display state survives only on platforms where reset spares it. */
	if (INTEL_INFO(gt->i915)->gpu_reset_clobbers_display)
		intel_runtime_pm_disable_interrupts(gt->i915);

	if (do_reset(gt, stalled_mask)) {
		drm_err(&gt->i915->drm, "Failed to reset chip\n");
		goto taint;
	}

	if (INTEL_INFO(gt->i915)->gpu_reset_clobbers_display)
		intel_runtime_pm_enable_interrupts(gt->i915);

	intel_overlay_reset(gt->i915);

	/*
	 * Next we need to restore the context, but we don't use those
	 * yet either...
	 *
	 * Ring buffer needs to be re-initialized in the KMS case, or if X
	 * was running at the time of the reset (i.e. we weren't VT
	 * switched away).
	 */
	ret = intel_gt_init_hw(gt);
	if (ret) {
		drm_err(&gt->i915->drm,
			"Failed to initialise HW following reset (%d)\n",
			ret);
		goto taint;
	}

	ret = resume(gt);
	if (ret)
		goto taint;

finish:
	reset_finish(gt, awake);
unlock:
	mutex_unlock(&gt->reset.mutex);
	return;

taint:
	/*
	 * History tells us that if we cannot reset the GPU now, we
	 * never will. This then impacts everything that is run
	 * subsequently. On failing the reset, we mark the driver
	 * as wedged, preventing further execution on the GPU.
	 * We also want to go one step further and add a taint to the
	 * kernel so that any subsequent faults can be traced back to
	 * this failure. This is important for CI, where if the
	 * GPU/driver fails we would like to reboot and restart testing
	 * rather than continue on into oblivion. For everyone else,
	 * the system should still plod along, but they have been warned!
	 */
	add_taint_for_CI(gt->i915, TAINT_WARN);
error:
	__intel_gt_set_wedged(gt);
	goto finish;
}

/* Reset just the engines covered by @engine->mask. */
static inline int intel_gt_reset_engine(struct intel_engine_cs *engine)
{
	return __intel_gt_reset(engine->gt, engine->mask);
}

/**
 * intel_engine_reset - reset GPU engine to recover from a hang
 * @engine: engine to reset
 * @msg: reason for GPU reset; or NULL for no drm_notice()
 *
 * Reset a specific GPU engine. Useful if a hang is detected.
 * Returns zero on successful reset or otherwise an error code.
 *
 * Procedure is:
 *  - identifies the request that caused the hang and it is dropped
 *  - reset engine (which will force the engine to idle)
 *  - re-init/configure engine
 */
int intel_engine_reset(struct intel_engine_cs *engine, const char *msg)
{
	struct intel_gt *gt = engine->gt;
	bool uses_guc = intel_engine_in_guc_submission_mode(engine);
	int ret;

	ENGINE_TRACE(engine, "flags=%lx\n", gt->reset.flags);
	/* Caller must hold this engine's reset bit (see intel_gt_handle_error). */
	GEM_BUG_ON(!test_bit(I915_RESET_ENGINE + engine->id, &gt->reset.flags));

	/* An idle engine holds no state worth resetting. */
	if (!intel_engine_pm_get_if_awake(engine))
		return 0;

	reset_prepare_engine(engine);

	if (msg)
		drm_notice(&engine->i915->drm,
			   "Resetting %s for %s\n", engine->name, msg);
	atomic_inc(&engine->i915->gpu_error.reset_engine_count[engine->uabi_class]);

	/* GuC submission routes the reset request through the GuC firmware. */
	if (!uses_guc)
		ret = intel_gt_reset_engine(engine);
	else
		ret = intel_guc_reset_engine(&engine->gt->uc.guc, engine);
	if (ret) {
		/* If we fail here, we expect to fallback to a global reset */
		drm_dbg(&gt->i915->drm, "%sFailed to reset %s, ret=%d\n",
			uses_guc ? "GuC " : "", engine->name, ret);
		goto out;
	}

	/*
	 * The request that caused the hang is stuck on elsp, we know the
	 * active request and can drop it, adjust head to skip the offending
	 * request to resume executing remaining requests in the queue.
	 */
	__intel_engine_reset(engine, true);

	/*
	 * The engine and its registers (and workarounds in case of render)
	 * have been reset to their default values. Follow the init_ring
	 * process to program RING_MODE, HWSP and re-enable submission.
	 */
	ret = intel_engine_resume(engine);

out:
	intel_engine_cancel_stop_cs(engine);
	reset_finish_engine(engine);
	intel_engine_pm_put_async(engine);
	return ret;
}

/*
 * Perform a full-chip reset under the wedge watchdog, notifying userspace
 * via uevents before and after.
 */
static void intel_gt_reset_global(struct intel_gt *gt,
				  u32 engine_mask,
				  const char *reason)
{
	struct kobject *kobj = &gt->i915->drm.primary->kdev->kobj;
	char *error_event[] = { I915_ERROR_UEVENT "=1", NULL };
	char *reset_event[] = { I915_RESET_UEVENT "=1", NULL };
	char *reset_done_event[] = { I915_ERROR_UEVENT "=0", NULL };
	struct intel_wedge_me w;

	kobject_uevent_env(kobj,
			   KOBJ_CHANGE, error_event);

	drm_dbg(&gt->i915->drm, "resetting chip, engines=%x\n", engine_mask);
	kobject_uevent_env(kobj, KOBJ_CHANGE, reset_event);

	/* Use a watchdog to ensure that our reset completes */
	intel_wedge_on_timeout(&w, gt, 5 * HZ) {
		intel_prepare_reset(gt->i915);

		/* Flush everyone using a resource about to be clobbered */
		synchronize_srcu_expedited(&gt->reset.backoff_srcu);

		intel_gt_reset(gt, engine_mask, reason);

		intel_finish_reset(gt->i915);
	}

	/* Only report success if we did not end up wedged. */
	if (!test_bit(I915_WEDGED, &gt->reset.flags))
		kobject_uevent_env(kobj, KOBJ_CHANGE, reset_done_event);
}

/**
 * intel_gt_handle_error - handle a gpu error
 * @gt: the intel_gt
 * @engine_mask: mask representing engines that are hung
 * @flags: control flags
 * @fmt: Error message format string
 *
 * Do some basic checking of register state at error time and
 * dump it to the syslog.  Also call i915_capture_error_state() to make
 * sure we get a record and make it available in debugfs. Fire a uevent
 * so userspace knows something bad happened (should trigger collection
 * of a ring dump etc.).
 */
void intel_gt_handle_error(struct intel_gt *gt,
			   intel_engine_mask_t engine_mask,
			   unsigned long flags,
			   const char *fmt, ...)
{
	struct intel_engine_cs *engine;
	intel_wakeref_t wakeref;
	intel_engine_mask_t tmp;
	char error_msg[80];
	char *msg = NULL;

	if (fmt) {
		va_list args;

		va_start(args, fmt);
		vscnprintf(error_msg, sizeof(error_msg), fmt, args);
		va_end(args);

		msg = error_msg;
	}

	/*
	 * In most cases it's guaranteed that we get here with an RPM
	 * reference held, for example because there is a pending GPU
	 * request that won't finish until the reset is done. This
	 * isn't the case at least when we get here by doing a
	 * simulated reset via debugfs, so get an RPM reference.
	 */
	wakeref = intel_runtime_pm_get(gt->uncore->rpm);

	/* Ignore engines this GT does not actually have. */
	engine_mask &= gt->info.engine_mask;

	if (flags & I915_ERROR_CAPTURE) {
		i915_capture_error_state(gt->i915);
		intel_gt_clear_error_registers(gt, engine_mask);
	}

	/*
	 * Try engine reset when available. We fall back to full reset if
	 * single reset fails.
	 */
	if (intel_has_reset_engine(gt) && !intel_gt_is_wedged(gt)) {
		for_each_engine_masked(engine, gt, engine_mask, tmp) {
			BUILD_BUG_ON(I915_RESET_MODESET >= I915_RESET_ENGINE);
			/* Skip if someone else is already resetting this engine. */
			if (test_and_set_bit(I915_RESET_ENGINE + engine->id,
					     &gt->reset.flags))
				continue;

			/* Drop successfully-reset engines from the full-reset mask. */
			if (intel_engine_reset(engine, msg) == 0)
				engine_mask &= ~engine->mask;

			clear_and_wake_up_bit(I915_RESET_ENGINE + engine->id,
					      &gt->reset.flags);
		}
	}

	if (!engine_mask)
		goto out;

	/* Full reset needs the mutex, stop any other user trying to do so.
*/ 13028c2ecf20Sopenharmony_ci if (test_and_set_bit(I915_RESET_BACKOFF, >->reset.flags)) { 13038c2ecf20Sopenharmony_ci wait_event(gt->reset.queue, 13048c2ecf20Sopenharmony_ci !test_bit(I915_RESET_BACKOFF, >->reset.flags)); 13058c2ecf20Sopenharmony_ci goto out; /* piggy-back on the other reset */ 13068c2ecf20Sopenharmony_ci } 13078c2ecf20Sopenharmony_ci 13088c2ecf20Sopenharmony_ci /* Make sure i915_reset_trylock() sees the I915_RESET_BACKOFF */ 13098c2ecf20Sopenharmony_ci synchronize_rcu_expedited(); 13108c2ecf20Sopenharmony_ci 13118c2ecf20Sopenharmony_ci /* Prevent any other reset-engine attempt. */ 13128c2ecf20Sopenharmony_ci for_each_engine(engine, gt, tmp) { 13138c2ecf20Sopenharmony_ci while (test_and_set_bit(I915_RESET_ENGINE + engine->id, 13148c2ecf20Sopenharmony_ci >->reset.flags)) 13158c2ecf20Sopenharmony_ci wait_on_bit(>->reset.flags, 13168c2ecf20Sopenharmony_ci I915_RESET_ENGINE + engine->id, 13178c2ecf20Sopenharmony_ci TASK_UNINTERRUPTIBLE); 13188c2ecf20Sopenharmony_ci } 13198c2ecf20Sopenharmony_ci 13208c2ecf20Sopenharmony_ci intel_gt_reset_global(gt, engine_mask, msg); 13218c2ecf20Sopenharmony_ci 13228c2ecf20Sopenharmony_ci for_each_engine(engine, gt, tmp) 13238c2ecf20Sopenharmony_ci clear_bit_unlock(I915_RESET_ENGINE + engine->id, 13248c2ecf20Sopenharmony_ci >->reset.flags); 13258c2ecf20Sopenharmony_ci clear_bit_unlock(I915_RESET_BACKOFF, >->reset.flags); 13268c2ecf20Sopenharmony_ci smp_mb__after_atomic(); 13278c2ecf20Sopenharmony_ci wake_up_all(>->reset.queue); 13288c2ecf20Sopenharmony_ci 13298c2ecf20Sopenharmony_ciout: 13308c2ecf20Sopenharmony_ci intel_runtime_pm_put(gt->uncore->rpm, wakeref); 13318c2ecf20Sopenharmony_ci} 13328c2ecf20Sopenharmony_ci 13338c2ecf20Sopenharmony_ciint intel_gt_reset_trylock(struct intel_gt *gt, int *srcu) 13348c2ecf20Sopenharmony_ci{ 13358c2ecf20Sopenharmony_ci might_lock(>->reset.backoff_srcu); 13368c2ecf20Sopenharmony_ci might_sleep(); 13378c2ecf20Sopenharmony_ci 13388c2ecf20Sopenharmony_ci rcu_read_lock(); 
13398c2ecf20Sopenharmony_ci while (test_bit(I915_RESET_BACKOFF, >->reset.flags)) { 13408c2ecf20Sopenharmony_ci rcu_read_unlock(); 13418c2ecf20Sopenharmony_ci 13428c2ecf20Sopenharmony_ci if (wait_event_interruptible(gt->reset.queue, 13438c2ecf20Sopenharmony_ci !test_bit(I915_RESET_BACKOFF, 13448c2ecf20Sopenharmony_ci >->reset.flags))) 13458c2ecf20Sopenharmony_ci return -EINTR; 13468c2ecf20Sopenharmony_ci 13478c2ecf20Sopenharmony_ci rcu_read_lock(); 13488c2ecf20Sopenharmony_ci } 13498c2ecf20Sopenharmony_ci *srcu = srcu_read_lock(>->reset.backoff_srcu); 13508c2ecf20Sopenharmony_ci rcu_read_unlock(); 13518c2ecf20Sopenharmony_ci 13528c2ecf20Sopenharmony_ci return 0; 13538c2ecf20Sopenharmony_ci} 13548c2ecf20Sopenharmony_ci 13558c2ecf20Sopenharmony_civoid intel_gt_reset_unlock(struct intel_gt *gt, int tag) 13568c2ecf20Sopenharmony_ci__releases(>->reset.backoff_srcu) 13578c2ecf20Sopenharmony_ci{ 13588c2ecf20Sopenharmony_ci srcu_read_unlock(>->reset.backoff_srcu, tag); 13598c2ecf20Sopenharmony_ci} 13608c2ecf20Sopenharmony_ci 13618c2ecf20Sopenharmony_ciint intel_gt_terminally_wedged(struct intel_gt *gt) 13628c2ecf20Sopenharmony_ci{ 13638c2ecf20Sopenharmony_ci might_sleep(); 13648c2ecf20Sopenharmony_ci 13658c2ecf20Sopenharmony_ci if (!intel_gt_is_wedged(gt)) 13668c2ecf20Sopenharmony_ci return 0; 13678c2ecf20Sopenharmony_ci 13688c2ecf20Sopenharmony_ci if (intel_gt_has_unrecoverable_error(gt)) 13698c2ecf20Sopenharmony_ci return -EIO; 13708c2ecf20Sopenharmony_ci 13718c2ecf20Sopenharmony_ci /* Reset still in progress? Maybe we will recover? */ 13728c2ecf20Sopenharmony_ci if (wait_event_interruptible(gt->reset.queue, 13738c2ecf20Sopenharmony_ci !test_bit(I915_RESET_BACKOFF, 13748c2ecf20Sopenharmony_ci >->reset.flags))) 13758c2ecf20Sopenharmony_ci return -EINTR; 13768c2ecf20Sopenharmony_ci 13778c2ecf20Sopenharmony_ci return intel_gt_is_wedged(gt) ? 
-EIO : 0; 13788c2ecf20Sopenharmony_ci} 13798c2ecf20Sopenharmony_ci 13808c2ecf20Sopenharmony_civoid intel_gt_set_wedged_on_init(struct intel_gt *gt) 13818c2ecf20Sopenharmony_ci{ 13828c2ecf20Sopenharmony_ci BUILD_BUG_ON(I915_RESET_ENGINE + I915_NUM_ENGINES > 13838c2ecf20Sopenharmony_ci I915_WEDGED_ON_INIT); 13848c2ecf20Sopenharmony_ci intel_gt_set_wedged(gt); 13858c2ecf20Sopenharmony_ci set_bit(I915_WEDGED_ON_INIT, >->reset.flags); 13868c2ecf20Sopenharmony_ci 13878c2ecf20Sopenharmony_ci /* Wedged on init is non-recoverable */ 13888c2ecf20Sopenharmony_ci add_taint_for_CI(gt->i915, TAINT_WARN); 13898c2ecf20Sopenharmony_ci} 13908c2ecf20Sopenharmony_ci 13918c2ecf20Sopenharmony_civoid intel_gt_set_wedged_on_fini(struct intel_gt *gt) 13928c2ecf20Sopenharmony_ci{ 13938c2ecf20Sopenharmony_ci intel_gt_set_wedged(gt); 13948c2ecf20Sopenharmony_ci set_bit(I915_WEDGED_ON_FINI, >->reset.flags); 13958c2ecf20Sopenharmony_ci} 13968c2ecf20Sopenharmony_ci 13978c2ecf20Sopenharmony_civoid intel_gt_init_reset(struct intel_gt *gt) 13988c2ecf20Sopenharmony_ci{ 13998c2ecf20Sopenharmony_ci init_waitqueue_head(>->reset.queue); 14008c2ecf20Sopenharmony_ci mutex_init(>->reset.mutex); 14018c2ecf20Sopenharmony_ci init_srcu_struct(>->reset.backoff_srcu); 14028c2ecf20Sopenharmony_ci 14038c2ecf20Sopenharmony_ci /* no GPU until we are ready! 
*/ 14048c2ecf20Sopenharmony_ci __set_bit(I915_WEDGED, >->reset.flags); 14058c2ecf20Sopenharmony_ci} 14068c2ecf20Sopenharmony_ci 14078c2ecf20Sopenharmony_civoid intel_gt_fini_reset(struct intel_gt *gt) 14088c2ecf20Sopenharmony_ci{ 14098c2ecf20Sopenharmony_ci cleanup_srcu_struct(>->reset.backoff_srcu); 14108c2ecf20Sopenharmony_ci} 14118c2ecf20Sopenharmony_ci 14128c2ecf20Sopenharmony_cistatic void intel_wedge_me(struct work_struct *work) 14138c2ecf20Sopenharmony_ci{ 14148c2ecf20Sopenharmony_ci struct intel_wedge_me *w = container_of(work, typeof(*w), work.work); 14158c2ecf20Sopenharmony_ci 14168c2ecf20Sopenharmony_ci drm_err(&w->gt->i915->drm, 14178c2ecf20Sopenharmony_ci "%s timed out, cancelling all in-flight rendering.\n", 14188c2ecf20Sopenharmony_ci w->name); 14198c2ecf20Sopenharmony_ci intel_gt_set_wedged(w->gt); 14208c2ecf20Sopenharmony_ci} 14218c2ecf20Sopenharmony_ci 14228c2ecf20Sopenharmony_civoid __intel_init_wedge(struct intel_wedge_me *w, 14238c2ecf20Sopenharmony_ci struct intel_gt *gt, 14248c2ecf20Sopenharmony_ci long timeout, 14258c2ecf20Sopenharmony_ci const char *name) 14268c2ecf20Sopenharmony_ci{ 14278c2ecf20Sopenharmony_ci w->gt = gt; 14288c2ecf20Sopenharmony_ci w->name = name; 14298c2ecf20Sopenharmony_ci 14308c2ecf20Sopenharmony_ci INIT_DELAYED_WORK_ONSTACK(&w->work, intel_wedge_me); 14318c2ecf20Sopenharmony_ci schedule_delayed_work(&w->work, timeout); 14328c2ecf20Sopenharmony_ci} 14338c2ecf20Sopenharmony_ci 14348c2ecf20Sopenharmony_civoid __intel_fini_wedge(struct intel_wedge_me *w) 14358c2ecf20Sopenharmony_ci{ 14368c2ecf20Sopenharmony_ci cancel_delayed_work_sync(&w->work); 14378c2ecf20Sopenharmony_ci destroy_delayed_work_on_stack(&w->work); 14388c2ecf20Sopenharmony_ci w->gt = NULL; 14398c2ecf20Sopenharmony_ci} 14408c2ecf20Sopenharmony_ci 14418c2ecf20Sopenharmony_ci#if IS_ENABLED(CONFIG_DRM_I915_SELFTEST) 14428c2ecf20Sopenharmony_ci#include "selftest_reset.c" 14438c2ecf20Sopenharmony_ci#include "selftest_hangcheck.c" 14448c2ecf20Sopenharmony_ci#endif 
1445