diff --git a/drivers/gpu/drm/xe/xe_device.c b/drivers/gpu/drm/xe/xe_device.c
index 76a7b37a4a53..d45db6ff1fa3 100644
--- a/drivers/gpu/drm/xe/xe_device.c
+++ b/drivers/gpu/drm/xe/xe_device.c
@@ -764,3 +764,32 @@ u64 xe_device_uncanonicalize_addr(struct xe_device *xe, u64 address)
 {
 	return address & GENMASK_ULL(xe->info.va_bits - 1, 0);
 }
+
+/**
+ * xe_device_declare_wedged - Declare device wedged
+ * @xe: xe device instance
+ *
+ * This is a final state that can only be cleared with a device re-probe
+ * (driver unbind + bind).
+ * In this state every IOCTL will be blocked so the GT cannot be used.
+ * In general it will be called upon any critical error such as a GT reset
+ * failure or a GuC loading failure.
+ * If the xe.wedged_mode module parameter is set to 2, this function will be
+ * called on every single execution timeout (a.k.a. GPU hang) right after the
+ * devcoredump snapshot capture. In this mode, GT reset won't be attempted, so
+ * the failing state is preserved for further debugging.
+ */
+void xe_device_declare_wedged(struct xe_device *xe)
+{
+	if (xe_modparam.wedged_mode == 0)
+		return;
+
+	if (!atomic_xchg(&xe->wedged, 1)) {
+		xe->needs_flr_on_fini = true;
+		drm_err(&xe->drm,
+			"CRITICAL: Xe has declared device %s as wedged.\n"
+			"IOCTLs and executions are blocked. Only a rebind may clear the failure.\n"
+			"Please file a _new_ bug report at https://gitlab.freedesktop.org/drm/xe/kernel/issues/new\n",
+			dev_name(xe->drm.dev));
+	}
+}
diff --git a/drivers/gpu/drm/xe/xe_device.h b/drivers/gpu/drm/xe/xe_device.h
index d2e4249d37ce..9ede45fc062a 100644
--- a/drivers/gpu/drm/xe/xe_device.h
+++ b/drivers/gpu/drm/xe/xe_device.h
@@ -172,19 +172,6 @@ static inline bool xe_device_wedged(struct xe_device *xe)
 	return atomic_read(&xe->wedged);
 }
 
-static inline void xe_device_declare_wedged(struct xe_device *xe)
-{
-	if (!atomic_xchg(&xe->wedged, 1)) {
-		xe->needs_flr_on_fini = true;
-		drm_err(&xe->drm,
-			"CRITICAL: Xe has declared device %s as wedged.\n"
-			"IOCTLs and executions are blocked until device is probed again with unbind and bind operations:\n"
-			"echo '%s' > /sys/bus/pci/drivers/xe/unbind\n"
-			"echo '%s' > /sys/bus/pci/drivers/xe/bind\n"
-			"Please file a _new_ bug report at https://gitlab.freedesktop.org/drm/xe/kernel/issues/new\n",
-			dev_name(xe->drm.dev), dev_name(xe->drm.dev),
-			dev_name(xe->drm.dev));
-	}
-}
+void xe_device_declare_wedged(struct xe_device *xe);
 
 #endif
diff --git a/drivers/gpu/drm/xe/xe_exec_queue.h b/drivers/gpu/drm/xe/xe_exec_queue.h
index 02ce8d204622..48f6da53a292 100644
--- a/drivers/gpu/drm/xe/xe_exec_queue.h
+++ b/drivers/gpu/drm/xe/xe_exec_queue.h
@@ -26,6 +26,15 @@ void xe_exec_queue_fini(struct xe_exec_queue *q);
 void xe_exec_queue_destroy(struct kref *ref);
 void xe_exec_queue_assign_name(struct xe_exec_queue *q, u32 instance);
 
+static inline struct xe_exec_queue *
+xe_exec_queue_get_unless_zero(struct xe_exec_queue *q)
+{
+	if (kref_get_unless_zero(&q->refcount))
+		return q;
+
+	return NULL;
+}
+
 struct xe_exec_queue *xe_exec_queue_lookup(struct xe_file *xef, u32 id);
 
 static inline struct xe_exec_queue *xe_exec_queue_get(struct xe_exec_queue *q)
diff --git a/drivers/gpu/drm/xe/xe_gt_tlb_invalidation.c b/drivers/gpu/drm/xe/xe_gt_tlb_invalidation.c
index 93df2d7969b3..8e9c4b990fbb 100644
--- a/drivers/gpu/drm/xe/xe_gt_tlb_invalidation.c
+++ b/drivers/gpu/drm/xe/xe_gt_tlb_invalidation.c
@@ -245,7 +245,7 @@ int xe_gt_tlb_invalidation_ggtt(struct xe_gt *gt)
 			return seqno;
 
 		xe_gt_tlb_invalidation_wait(gt, seqno);
-	} else if (xe_device_uc_enabled(xe)) {
+	} else if (xe_device_uc_enabled(xe) && !xe_device_wedged(xe)) {
 		xe_gt_WARN_ON(gt, xe_force_wake_get(gt_to_fw(gt), XE_FW_GT));
 		if (xe->info.platform == XE_PVC || GRAPHICS_VER(xe) >= 20) {
 			xe_mmio_write32(gt, PVC_GUC_TLB_INV_DESC1,
diff --git a/drivers/gpu/drm/xe/xe_guc_ads.c b/drivers/gpu/drm/xe/xe_guc_ads.c
index 1aafa486edec..db817a46f157 100644
--- a/drivers/gpu/drm/xe/xe_guc_ads.c
+++ b/drivers/gpu/drm/xe/xe_guc_ads.c
@@ -20,6 +20,7 @@
 #include "xe_lrc.h"
 #include "xe_map.h"
 #include "xe_mmio.h"
+#include "xe_module.h"
 #include "xe_platform_types.h"
 #include "xe_wa.h"
 
@@ -440,11 +441,17 @@ int xe_guc_ads_init_post_hwconfig(struct xe_guc_ads *ads)
 
 static void guc_policies_init(struct xe_guc_ads *ads)
 {
+	u32 global_flags = 0;
+
 	ads_blob_write(ads, policies.dpc_promote_time,
 		       GLOBAL_POLICY_DEFAULT_DPC_PROMOTE_TIME_US);
 	ads_blob_write(ads, policies.max_num_work_items,
 		       GLOBAL_POLICY_MAX_NUM_WI);
-	ads_blob_write(ads, policies.global_flags, 0);
+
+	if (xe_modparam.wedged_mode == 2)
+		global_flags |= GLOBAL_POLICY_DISABLE_ENGINE_RESET;
+
+	ads_blob_write(ads, policies.global_flags, global_flags);
 	ads_blob_write(ads, policies.is_valid, 1);
 }
diff --git a/drivers/gpu/drm/xe/xe_guc_submit.c b/drivers/gpu/drm/xe/xe_guc_submit.c
index c7d38469fb46..0bea17536659 100644
--- a/drivers/gpu/drm/xe/xe_guc_submit.c
+++ b/drivers/gpu/drm/xe/xe_guc_submit.c
@@ -35,6 +35,7 @@
 #include "xe_macros.h"
 #include "xe_map.h"
 #include "xe_mocs.h"
+#include "xe_module.h"
 #include "xe_ring_ops_types.h"
 #include "xe_sched_job.h"
 #include "xe_trace.h"
@@ -59,6 +60,7 @@ exec_queue_to_guc(struct xe_exec_queue *q)
 #define ENGINE_STATE_SUSPENDED		(1 << 5)
 #define EXEC_QUEUE_STATE_RESET		(1 << 6)
 #define ENGINE_STATE_KILLED		(1 << 7)
+#define EXEC_QUEUE_STATE_WEDGED		(1 << 8)
 
 static bool exec_queue_registered(struct xe_exec_queue *q)
 {
@@ -175,9 +177,20 @@ static void set_exec_queue_killed(struct xe_exec_queue *q)
 	atomic_or(ENGINE_STATE_KILLED, &q->guc->state);
 }
 
-static bool exec_queue_killed_or_banned(struct xe_exec_queue *q)
+static bool exec_queue_wedged(struct xe_exec_queue *q)
 {
-	return exec_queue_killed(q) || exec_queue_banned(q);
+	return atomic_read(&q->guc->state) & EXEC_QUEUE_STATE_WEDGED;
+}
+
+static void set_exec_queue_wedged(struct xe_exec_queue *q)
+{
+	atomic_or(EXEC_QUEUE_STATE_WEDGED, &q->guc->state);
+}
+
+static bool exec_queue_killed_or_banned_or_wedged(struct xe_exec_queue *q)
+{
+	return exec_queue_banned(q) || (atomic_read(&q->guc->state) &
+		(EXEC_QUEUE_STATE_WEDGED | ENGINE_STATE_KILLED));
 }
 
 #ifdef CONFIG_PROVE_LOCKING
@@ -240,6 +253,17 @@ static void guc_submit_fini(struct drm_device *drm, void *arg)
 	free_submit_wq(guc);
 }
 
+static void guc_submit_wedged_fini(struct drm_device *drm, void *arg)
+{
+	struct xe_guc *guc = arg;
+	struct xe_exec_queue *q;
+	unsigned long index;
+
+	xa_for_each(&guc->submission_state.exec_queue_lookup, index, q)
+		if (exec_queue_wedged(q))
+			xe_exec_queue_put(q);
+}
+
 static const struct xe_exec_queue_ops guc_exec_queue_ops;
 
 static void primelockdep(struct xe_guc *guc)
@@ -708,7 +732,7 @@ guc_exec_queue_run_job(struct drm_sched_job *drm_job)
 
 	trace_xe_sched_job_run(job);
 
-	if (!exec_queue_killed_or_banned(q) && !xe_sched_job_is_error(job)) {
+	if (!exec_queue_killed_or_banned_or_wedged(q) && !xe_sched_job_is_error(job)) {
 		if (!exec_queue_registered(q))
 			register_engine(q);
 		if (!lr)	/* LR jobs are emitted in the exec IOCTL */
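For context on the pieces above: xe_device_declare_wedged() only sets xe->wedged; the kernel-doc promise that "every IOCTL will be blocked" relies on the driver's IOCTL entry point checking that flag, just as the TLB-invalidation hunk above does before poking the hardware. A minimal sketch of such a guard, with a hypothetical wrapper name (xe_drm_ioctl_sketch) and an assumed error code; the actual entry point is not part of this diff:

	/* Hedged sketch, not part of this series: honoring the wedged flag
	 * at the IOCTL boundary. -ECANCELED is an assumption here.
	 */
	static long xe_drm_ioctl_sketch(struct file *file, unsigned int cmd,
					unsigned long arg)
	{
		struct drm_file *fpriv = file->private_data;
		struct xe_device *xe = to_xe_device(fpriv->minor->dev);

		/* Reject all new work once the device is declared wedged */
		if (xe_device_wedged(xe))
			return -ECANCELED;

		return drm_ioctl(file, cmd, arg);
	}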
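The new EXEC_QUEUE_STATE_WEDGED bit follows the same lock-free pattern as the existing queue-state flags: set once via atomic_or(), tested via atomic_read(), never cleared. A stand-alone user-space analogue of that pattern, using C11 atomics in place of the kernel's atomic_t (illustrative only; the bit values mirror the defines above):

	#include <stdatomic.h>
	#include <stdio.h>

	#define STATE_KILLED	(1 << 7)
	#define STATE_WEDGED	(1 << 8)

	int main(void)
	{
		atomic_int state = 0;

		/* analogue of set_exec_queue_wedged() */
		atomic_fetch_or(&state, STATE_WEDGED);

		/* analogue of exec_queue_killed_or_banned_or_wedged() */
		if (atomic_load(&state) & (STATE_WEDGED | STATE_KILLED))
			printf("queue blocked from further scheduling changes\n");

		return 0;
	}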
@@ -844,6 +868,28 @@ static void xe_guc_exec_queue_trigger_cleanup(struct xe_exec_queue *q)
 		xe_sched_tdr_queue_imm(&q->guc->sched);
 }
 
+static void guc_submit_wedged(struct xe_guc *guc)
+{
+	struct xe_exec_queue *q;
+	unsigned long index;
+	int err;
+
+	xe_device_declare_wedged(guc_to_xe(guc));
+	xe_guc_submit_reset_prepare(guc);
+	xe_guc_ct_stop(&guc->ct);
+
+	err = drmm_add_action_or_reset(&guc_to_xe(guc)->drm,
+				       guc_submit_wedged_fini, guc);
+	if (err)
+		return;
+
+	mutex_lock(&guc->submission_state.lock);
+	xa_for_each(&guc->submission_state.exec_queue_lookup, index, q)
+		if (xe_exec_queue_get_unless_zero(q))
+			set_exec_queue_wedged(q);
+	mutex_unlock(&guc->submission_state.lock);
+}
+
 static void xe_guc_exec_queue_lr_cleanup(struct work_struct *w)
 {
 	struct xe_guc_exec_queue *ge =
@@ -852,10 +898,16 @@ static void xe_guc_exec_queue_lr_cleanup(struct work_struct *w)
 	struct xe_guc *guc = exec_queue_to_guc(q);
 	struct xe_device *xe = guc_to_xe(guc);
 	struct xe_gpu_scheduler *sched = &ge->sched;
+	bool wedged = xe_device_wedged(xe);
 
 	xe_assert(xe, xe_exec_queue_is_lr(q));
 	trace_xe_exec_queue_lr_cleanup(q);
 
+	if (!wedged && xe_modparam.wedged_mode == 2) {
+		guc_submit_wedged(exec_queue_to_guc(q));
+		wedged = true;
+	}
+
 	/* Kill the run_job / process_msg entry points */
 	xe_sched_submission_stop(sched);
 
@@ -870,7 +922,7 @@ static void xe_guc_exec_queue_lr_cleanup(struct work_struct *w)
 	 * xe_guc_deregister_done_handler() which treats it as an unexpected
 	 * state.
 	 */
-	if (exec_queue_registered(q) && !exec_queue_destroyed(q)) {
+	if (!wedged && exec_queue_registered(q) && !exec_queue_destroyed(q)) {
 		struct xe_guc *guc = exec_queue_to_guc(q);
 		int ret;
 
@@ -905,6 +957,7 @@ guc_exec_queue_timedout_job(struct drm_sched_job *drm_job)
 	struct xe_device *xe = guc_to_xe(exec_queue_to_guc(q));
 	int err = -ETIME;
 	int i = 0;
+	bool wedged = xe_device_wedged(xe);
 
 	/*
 	 * TDR has fired before free job worker. Common if exec queue
@@ -928,6 +981,11 @@ guc_exec_queue_timedout_job(struct drm_sched_job *drm_job)
 
 	trace_xe_sched_job_timedout(job);
 
+	if (!wedged && xe_modparam.wedged_mode == 2) {
+		guc_submit_wedged(exec_queue_to_guc(q));
+		wedged = true;
+	}
+
 	/* Kill the run_job entry point */
 	xe_sched_submission_stop(sched);
 
@@ -935,8 +993,8 @@ guc_exec_queue_timedout_job(struct drm_sched_job *drm_job)
 	/*
	 * Kernel jobs should never fail, nor should VM jobs; if they do,
	 * something has gone wrong and the GT needs a reset
 	 */
-	if (q->flags & EXEC_QUEUE_FLAG_KERNEL ||
-	    (q->flags & EXEC_QUEUE_FLAG_VM && !exec_queue_killed(q))) {
+	if (!wedged && (q->flags & EXEC_QUEUE_FLAG_KERNEL ||
+	    (q->flags & EXEC_QUEUE_FLAG_VM && !exec_queue_killed(q)))) {
 		if (!xe_sched_invalidate_job(job, 2)) {
 			xe_sched_add_pending_job(sched, job);
 			xe_sched_submission_start(sched);
@@ -946,7 +1004,7 @@ guc_exec_queue_timedout_job(struct drm_sched_job *drm_job)
 	}
 
 	/* Engine state now stable, disable scheduling if needed */
-	if (exec_queue_registered(q)) {
+	if (!wedged && exec_queue_registered(q)) {
 		struct xe_guc *guc = exec_queue_to_guc(q);
 		int ret;
 
@@ -989,6 +1047,7 @@ guc_exec_queue_timedout_job(struct drm_sched_job *drm_job)
 	 */
 	xe_sched_add_pending_job(sched, job);
 	xe_sched_submission_start(sched);
+
 	xe_guc_exec_queue_trigger_cleanup(q);
 
 	/* Mark all outstanding jobs as bad, thus completing them */
@@ -1028,7 +1087,7 @@ static void guc_exec_queue_fini_async(struct xe_exec_queue *q)
 	INIT_WORK(&q->guc->fini_async, __guc_exec_queue_fini_async);
 
 	/* We must block on kernel engines so slabs are empty on driver unload */
-	if (q->flags & EXEC_QUEUE_FLAG_PERMANENT)
+	if (q->flags & EXEC_QUEUE_FLAG_PERMANENT || exec_queue_wedged(q))
 		__guc_exec_queue_fini_async(&q->guc->fini_async);
 	else
 		queue_work(system_wq, &q->guc->fini_async);
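A note on why the walk in guc_submit_wedged() above pairs xe_exec_queue_get_unless_zero() with guc_submit_wedged_fini(): queues may be dropping their final reference concurrently, so only queues whose refcount is still non-zero may be pinned and marked, and the drmm action later releases exactly those pinned references, keeping wedged queues alive for post-mortem inspection. A simplified stand-alone model of the pattern, with hypothetical names and a plain int in place of struct kref:

	#include <stdbool.h>
	#include <stdio.h>

	struct queue {
		int refcount;	/* stand-in for struct kref */
		bool wedged;
	};

	/* model of kref_get_unless_zero(): refuses an object already dying */
	static bool get_unless_zero(struct queue *q)
	{
		if (q->refcount == 0)
			return false;
		q->refcount++;
		return true;
	}

	/* model of the guc_submit_wedged() loop body: pin, then mark */
	static void wedge(struct queue *q)
	{
		if (get_unless_zero(q))
			q->wedged = true;	/* extra ref dropped by the fini action */
	}

	int main(void)
	{
		struct queue live = { .refcount = 1 }, dying = { .refcount = 0 };

		wedge(&live);	/* pinned and marked */
		wedge(&dying);	/* skipped: destruction already in flight */
		printf("live: ref=%d wedged=%d\n", live.refcount, live.wedged);
		return 0;
	}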
@@ -1063,7 +1122,7 @@ static void __guc_exec_queue_process_msg_cleanup(struct xe_sched_msg *msg)
 
 static bool guc_exec_queue_allowed_to_change_state(struct xe_exec_queue *q)
 {
-	return !exec_queue_killed_or_banned(q) && exec_queue_registered(q);
+	return !exec_queue_killed_or_banned_or_wedged(q) && exec_queue_registered(q);
 }
 
 static void __guc_exec_queue_process_msg_set_sched_props(struct xe_sched_msg *msg)
@@ -1274,7 +1333,7 @@ static void guc_exec_queue_fini(struct xe_exec_queue *q)
 {
 	struct xe_sched_msg *msg = q->guc->static_msgs + STATIC_MSG_CLEANUP;
 
-	if (!(q->flags & EXEC_QUEUE_FLAG_PERMANENT))
+	if (!(q->flags & EXEC_QUEUE_FLAG_PERMANENT) && !exec_queue_wedged(q))
 		guc_exec_queue_add_msg(q, msg, CLEANUP);
 	else
 		__guc_exec_queue_fini(exec_queue_to_guc(q), q);
@@ -1285,7 +1344,8 @@ static int guc_exec_queue_set_priority(struct xe_exec_queue *q,
 {
 	struct xe_sched_msg *msg;
 
-	if (q->sched_props.priority == priority || exec_queue_killed_or_banned(q))
+	if (q->sched_props.priority == priority ||
+	    exec_queue_killed_or_banned_or_wedged(q))
 		return 0;
 
 	msg = kmalloc(sizeof(*msg), GFP_KERNEL);
@@ -1303,7 +1363,7 @@ static int guc_exec_queue_set_timeslice(struct xe_exec_queue *q, u32 timeslice_us)
 	struct xe_sched_msg *msg;
 
 	if (q->sched_props.timeslice_us == timeslice_us ||
-	    exec_queue_killed_or_banned(q))
+	    exec_queue_killed_or_banned_or_wedged(q))
 		return 0;
 
 	msg = kmalloc(sizeof(*msg), GFP_KERNEL);
@@ -1322,7 +1382,7 @@ static int guc_exec_queue_set_preempt_timeout(struct xe_exec_queue *q,
 	struct xe_sched_msg *msg;
 
 	if (q->sched_props.preempt_timeout_us == preempt_timeout_us ||
-	    exec_queue_killed_or_banned(q))
+	    exec_queue_killed_or_banned_or_wedged(q))
 		return 0;
 
 	msg = kmalloc(sizeof(*msg), GFP_KERNEL);
@@ -1339,7 +1399,7 @@ static int guc_exec_queue_suspend(struct xe_exec_queue *q)
 {
 	struct xe_sched_msg *msg = q->guc->static_msgs + STATIC_MSG_SUSPEND;
 
-	if (exec_queue_killed_or_banned(q) || q->guc->suspend_pending)
+	if (exec_queue_killed_or_banned_or_wedged(q) || q->guc->suspend_pending)
 		return -EINVAL;
 
 	q->guc->suspend_pending = true;
@@ -1485,7 +1545,7 @@ static void guc_exec_queue_start(struct xe_exec_queue *q)
 {
 	struct xe_gpu_scheduler *sched = &q->guc->sched;
 
-	if (!exec_queue_killed_or_banned(q)) {
+	if (!exec_queue_killed_or_banned_or_wedged(q)) {
 		int i;
 
 		trace_xe_exec_queue_resubmit(q);
diff --git a/drivers/gpu/drm/xe/xe_module.c b/drivers/gpu/drm/xe/xe_module.c
index ceb8345cbca6..3edeb30d5ccb 100644
--- a/drivers/gpu/drm/xe/xe_module.c
+++ b/drivers/gpu/drm/xe/xe_module.c
@@ -17,6 +17,7 @@ struct xe_modparam xe_modparam = {
 	.enable_display = true,
 	.guc_log_level = 5,
 	.force_probe = CONFIG_DRM_XE_FORCE_PROBE,
+	.wedged_mode = 1,
 	/* the rest are 0 by default */
 };
 
@@ -55,6 +56,10 @@ MODULE_PARM_DESC(max_vfs,
 		 "(0 = no VFs [default]; N = allow up to N VFs)");
 #endif
 
+module_param_named_unsafe(wedged_mode, xe_modparam.wedged_mode, int, 0600);
+MODULE_PARM_DESC(wedged_mode,
+		 "Module's default policy for the wedged mode - 0=never, 1=upon-critical-errors[default], 2=upon-any-hang");
+
 struct init_funcs {
 	int (*init)(void);
 	void (*exit)(void);
diff --git a/drivers/gpu/drm/xe/xe_module.h b/drivers/gpu/drm/xe/xe_module.h
index b369984f08ec..61a0d28a28c8 100644
--- a/drivers/gpu/drm/xe/xe_module.h
+++ b/drivers/gpu/drm/xe/xe_module.h
@@ -21,6 +21,7 @@ struct xe_modparam {
 #ifdef CONFIG_PCI_IOV
 	unsigned int max_vfs;
 #endif
+	int wedged_mode;
 };
 
 extern struct xe_modparam xe_modparam;
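On the semantics the checks in this series imply: xe_device_declare_wedged() returns early only for wedged_mode == 0, and the hang paths and guc_policies_init() test only for == 2, so any other value behaves like the default 1. Since guc_policies_init() runs when the GuC is (re)initialized, flipping the parameter to 2 at runtime (it is root-writable, e.g. echo 2 > /sys/module/xe/parameters/wedged_mode, or modprobe xe wedged_mode=2 at load) arms the declare-on-hang path immediately but only disables the GuC engine-reset policy at the next GuC load. A hypothetical helper summarizing the resulting table (not part of the patch):

	/* Hypothetical summary of how this series interprets wedged_mode. */
	static const char *wedged_mode_policy(int mode)
	{
		if (mode == 0)
			return "never declare wedged";
		if (mode == 2)
			return "wedge on any hang; GuC engine reset disabled";
		return "wedge upon critical errors only";	/* 1 and any other value */
	}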