From 408c2f14a5d3d7ac4824b96e52693ab271efb738 Mon Sep 17 00:00:00 2001
From: Ashutosh Dixit <ashutosh.dixit@intel.com>
Date: Thu, 11 Jul 2024 14:12:03 -0700
Subject: [PATCH 1/4] drm/xe/exec: Fix minor bug related to
 xe_sync_entry_cleanup

Increment num_syncs after xe_sync_entry_parse() is successful to ensure
the xe_sync_entry_cleanup() logic under "err_syncs" label works correctly.

v2: Use the same pattern as that in xe_vm.c (Matt Brost)

Fixes: dd08ebf6c352 ("drm/xe: Introduce a new DRM driver for Intel GPUs")
Signed-off-by: Ashutosh Dixit <ashutosh.dixit@intel.com>
Reviewed-by: Matthew Brost <matthew.brost@intel.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20240711211203.3728180-1-ashutosh.dixit@intel.com
(cherry picked from commit 43a6faa6d9b5e9139758200a79fe9c8f4aaa0c8d)
Signed-off-by: Rodrigo Vivi <rodrigo.vivi@intel.com>
---
 drivers/gpu/drm/xe/xe_exec.c | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/drivers/gpu/drm/xe/xe_exec.c b/drivers/gpu/drm/xe/xe_exec.c
index 2d72cdec3a0b..f36980aa26e6 100644
--- a/drivers/gpu/drm/xe/xe_exec.c
+++ b/drivers/gpu/drm/xe/xe_exec.c
@@ -118,7 +118,7 @@ int xe_exec_ioctl(struct drm_device *dev, void *data, struct drm_file *file)
 	u64 addresses[XE_HW_ENGINE_MAX_INSTANCE];
 	struct drm_gpuvm_exec vm_exec = {.extra.fn = xe_exec_fn};
 	struct drm_exec *exec = &vm_exec.exec;
-	u32 i, num_syncs = 0, num_ufence = 0;
+	u32 i, num_syncs, num_ufence = 0;
 	struct xe_sched_job *job;
 	struct xe_vm *vm;
 	bool write_locked, skip_retry = false;
@@ -156,15 +156,15 @@ int xe_exec_ioctl(struct drm_device *dev, void *data, struct drm_file *file)
 
 	vm = q->vm;
 
-	for (i = 0; i < args->num_syncs; i++) {
-		err = xe_sync_entry_parse(xe, xef, &syncs[num_syncs++],
-					  &syncs_user[i], SYNC_PARSE_FLAG_EXEC |
+	for (num_syncs = 0; num_syncs < args->num_syncs; num_syncs++) {
+		err = xe_sync_entry_parse(xe, xef, &syncs[num_syncs],
+					  &syncs_user[num_syncs], SYNC_PARSE_FLAG_EXEC |
 					  (xe_vm_in_lr_mode(vm) ?
 					   SYNC_PARSE_FLAG_LR_MODE : 0));
 		if (err)
 			goto err_syncs;
 
-		if (xe_sync_is_ufence(&syncs[i]))
+		if (xe_sync_is_ufence(&syncs[num_syncs]))
 			num_ufence++;
 	}
 
@@ -325,8 +325,8 @@ err_unlock_list:
 	if (err == -EAGAIN && !skip_retry)
 		goto retry;
 err_syncs:
-	for (i = 0; i < num_syncs; i++)
-		xe_sync_entry_cleanup(&syncs[i]);
+	while (num_syncs--)
+		xe_sync_entry_cleanup(&syncs[num_syncs]);
 	kfree(syncs);
 err_exec_queue:
 	xe_exec_queue_put(q);

From bf07ca963d4fd11c88a9d4b058f2bd62e8d46a98 Mon Sep 17 00:00:00 2001
From: Michal Wajdeczko <michal.wajdeczko@intel.com>
Date: Thu, 11 Jul 2024 21:23:19 +0200
Subject: [PATCH 2/4] drm/xe/pf: Limit fair VF LMEM provisioning
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Due to the current design of the BO and VRAM manager, any object
with XE_BO_FLAG_PINNED flag, which the PF driver uses during VF
LMEM provisionining, is created with the TTM_PL_FLAG_CONTIGUOUS
flag, which may cause VRAM fragmentation that prevents subsequent
allocations of larger objects, like fair VF LMEM provisioning.

To avoid such failures, round down fair VF LMEM provisioning size
to next power of two size, to compensate what xe_ttm_vram_mgr is
doing to achieve contiguous allocations.

Fixes: ac6598aed1b3 ("drm/xe/pf: Add support to configure SR-IOV VFs")
Signed-off-by: Michal Wajdeczko <michal.wajdeczko@intel.com>
Reviewed-by: Piotr Piórkowski <piotr.piorkowski@intel.com>
Reviewed-by: Jonathan Cavitt <jonathan.cavitt@intel.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20240711192320.1198-2-michal.wajdeczko@intel.com
Signed-off-by: Lucas De Marchi <lucas.demarchi@intel.com>
(cherry picked from commit 4c3fe5eae46b92e2fd961b19f7779608352e5368)
Signed-off-by: Rodrigo Vivi <rodrigo.vivi@intel.com>
---
 drivers/gpu/drm/xe/xe_gt_sriov_pf_config.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/gpu/drm/xe/xe_gt_sriov_pf_config.c b/drivers/gpu/drm/xe/xe_gt_sriov_pf_config.c
index db6c213da847..4699b7836001 100644
--- a/drivers/gpu/drm/xe/xe_gt_sriov_pf_config.c
+++ b/drivers/gpu/drm/xe/xe_gt_sriov_pf_config.c
@@ -1543,6 +1543,7 @@ static u64 pf_estimate_fair_lmem(struct xe_gt *gt, unsigned int num_vfs)
 	u64 fair;
 
 	fair = div_u64(available, num_vfs);
+	fair = rounddown_pow_of_two(fair);	/* XXX: ttm_vram_mgr & drm_buddy limitation */
 	fair = ALIGN_DOWN(fair, alignment);
 #ifdef MAX_FAIR_LMEM
 	fair = min_t(u64, MAX_FAIR_LMEM, fair);

From c9474b726b932b5d555effd9ed0ae19f4da2367c Mon Sep 17 00:00:00 2001
From: Matthew Brost <matthew.brost@intel.com>
Date: Mon, 15 Jul 2024 23:39:01 -0700
Subject: [PATCH 3/4] drm/xe: Wedge the entire device

Wedge the entire device, not just GT which may have triggered the wedge.
To implement this, cleanup the layering so xe_device_declare_wedged()
calls into the lower layers (GT) to ensure entire device is wedged.

While we are here, also signal any pending GT TLB invalidations upon
wedging device.

Lastly, short circuit reset wait if device is wedged.

v2:
 - Short circuit reset wait if device is wedged (Local testing)

Fixes: 8ed9aaae39f3 ("drm/xe: Force wedged state and block GT reset upon any GPU hang")
Cc: Rodrigo Vivi <rodrigo.vivi@intel.com>
Signed-off-by: Matthew Brost <matthew.brost@intel.com>
Reviewed-by: Jonathan Cavitt <jonathan.cavitt@intel.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20240716063902.1390130-1-matthew.brost@intel.com
(cherry picked from commit 7dbe8af13c189f5937e87e9fb924d5bbc49e6f71)
Signed-off-by: Rodrigo Vivi <rodrigo.vivi@intel.com>
---
 drivers/gpu/drm/xe/xe_device.c     |  6 ++++
 drivers/gpu/drm/xe/xe_gt.c         | 15 ++++++++++
 drivers/gpu/drm/xe/xe_gt.h         |  1 +
 drivers/gpu/drm/xe/xe_guc.c        | 16 ++++++++++
 drivers/gpu/drm/xe/xe_guc.h        |  1 +
 drivers/gpu/drm/xe/xe_guc_submit.c | 48 +++++++++++++++++++-----------
 drivers/gpu/drm/xe/xe_guc_submit.h |  1 +
 drivers/gpu/drm/xe/xe_uc.c         | 14 +++++++++
 drivers/gpu/drm/xe/xe_uc.h         |  1 +
 9 files changed, 85 insertions(+), 18 deletions(-)

diff --git a/drivers/gpu/drm/xe/xe_device.c b/drivers/gpu/drm/xe/xe_device.c
index 03492fbcb8fb..41548eff953d 100644
--- a/drivers/gpu/drm/xe/xe_device.c
+++ b/drivers/gpu/drm/xe/xe_device.c
@@ -870,6 +870,9 @@ u64 xe_device_uncanonicalize_addr(struct xe_device *xe, u64 address)
  */
 void xe_device_declare_wedged(struct xe_device *xe)
 {
+	struct xe_gt *gt;
+	u8 id;
+
 	if (xe->wedged.mode == 0) {
 		drm_dbg(&xe->drm, "Wedged mode is forcibly disabled\n");
 		return;
@@ -883,4 +886,7 @@ void xe_device_declare_wedged(struct xe_device *xe)
 			"Please file a _new_ bug report at https://gitlab.freedesktop.org/drm/xe/kernel/issues/new\n",
 			dev_name(xe->drm.dev));
 	}
+
+	for_each_gt(gt, xe, id)
+		xe_gt_declare_wedged(gt);
 }
diff --git a/drivers/gpu/drm/xe/xe_gt.c b/drivers/gpu/drm/xe/xe_gt.c
index 0ba2e2d0289b..31b2e64c70c6 100644
--- a/drivers/gpu/drm/xe/xe_gt.c
+++ b/drivers/gpu/drm/xe/xe_gt.c
@@ -904,3 +904,18 @@ struct xe_hw_engine *xe_gt_any_hw_engine(struct xe_gt *gt)
 
 	return NULL;
 }
+
+/**
+ * xe_gt_declare_wedged() - Declare GT wedged
+ * @gt: the GT object
+ *
+ * Wedge the GT which stops all submission, saves desired debug state, and
+ * cleans up anything which could timeout.
+ */
+void xe_gt_declare_wedged(struct xe_gt *gt)
+{
+	xe_gt_assert(gt, gt_to_xe(gt)->wedged.mode);
+
+	xe_uc_declare_wedged(&gt->uc);
+	xe_gt_tlb_invalidation_reset(gt);
+}
diff --git a/drivers/gpu/drm/xe/xe_gt.h b/drivers/gpu/drm/xe/xe_gt.h
index 1123fdfc4ebc..8b1a5027dcf2 100644
--- a/drivers/gpu/drm/xe/xe_gt.h
+++ b/drivers/gpu/drm/xe/xe_gt.h
@@ -37,6 +37,7 @@ struct xe_gt *xe_gt_alloc(struct xe_tile *tile);
 int xe_gt_init_hwconfig(struct xe_gt *gt);
 int xe_gt_init_early(struct xe_gt *gt);
 int xe_gt_init(struct xe_gt *gt);
+void xe_gt_declare_wedged(struct xe_gt *gt);
 int xe_gt_record_default_lrcs(struct xe_gt *gt);
 
 /**
diff --git a/drivers/gpu/drm/xe/xe_guc.c b/drivers/gpu/drm/xe/xe_guc.c
index eb655cee19f7..de0fe9e65746 100644
--- a/drivers/gpu/drm/xe/xe_guc.c
+++ b/drivers/gpu/drm/xe/xe_guc.c
@@ -1178,3 +1178,19 @@ void xe_guc_print_info(struct xe_guc *guc, struct drm_printer *p)
 	xe_guc_ct_print(&guc->ct, p, false);
 	xe_guc_submit_print(guc, p);
 }
+
+/**
+ * xe_guc_declare_wedged() - Declare GuC wedged
+ * @guc: the GuC object
+ *
+ * Wedge the GuC which stops all submission, saves desired debug state, and
+ * cleans up anything which could timeout.
+ */
+void xe_guc_declare_wedged(struct xe_guc *guc)
+{
+	xe_gt_assert(guc_to_gt(guc), guc_to_xe(guc)->wedged.mode);
+
+	xe_guc_reset_prepare(guc);
+	xe_guc_ct_stop(&guc->ct);
+	xe_guc_submit_wedge(guc);
+}
diff --git a/drivers/gpu/drm/xe/xe_guc.h b/drivers/gpu/drm/xe/xe_guc.h
index af59c9545753..e0bbf98f849d 100644
--- a/drivers/gpu/drm/xe/xe_guc.h
+++ b/drivers/gpu/drm/xe/xe_guc.h
@@ -37,6 +37,7 @@ void xe_guc_reset_wait(struct xe_guc *guc);
 void xe_guc_stop_prepare(struct xe_guc *guc);
 void xe_guc_stop(struct xe_guc *guc);
 int xe_guc_start(struct xe_guc *guc);
+void xe_guc_declare_wedged(struct xe_guc *guc);
 
 static inline u16 xe_engine_class_to_guc_class(enum xe_engine_class class)
 {
diff --git a/drivers/gpu/drm/xe/xe_guc_submit.c b/drivers/gpu/drm/xe/xe_guc_submit.c
index 373447758a60..8d7e7f4bbff7 100644
--- a/drivers/gpu/drm/xe/xe_guc_submit.c
+++ b/drivers/gpu/drm/xe/xe_guc_submit.c
@@ -861,13 +861,40 @@ static void xe_guc_exec_queue_trigger_cleanup(struct xe_exec_queue *q)
 		xe_sched_tdr_queue_imm(&q->guc->sched);
 }
 
-static bool guc_submit_hint_wedged(struct xe_guc *guc)
+/**
+ * xe_guc_submit_wedge() - Wedge GuC submission
+ * @guc: the GuC object
+ *
+ * Save exec queue's registered with GuC state by taking a ref to each queue.
+ * Register a DRMM handler to drop refs upon driver unload.
+ */
+void xe_guc_submit_wedge(struct xe_guc *guc)
 {
 	struct xe_device *xe = guc_to_xe(guc);
 	struct xe_exec_queue *q;
 	unsigned long index;
 	int err;
 
+	xe_gt_assert(guc_to_gt(guc), guc_to_xe(guc)->wedged.mode);
+
+	err = drmm_add_action_or_reset(&guc_to_xe(guc)->drm,
+				       guc_submit_wedged_fini, guc);
+	if (err) {
+		drm_err(&xe->drm, "Failed to register xe_guc_submit clean-up on wedged.mode=2. Although device is wedged.\n");
+		return;
+	}
+
+	mutex_lock(&guc->submission_state.lock);
+	xa_for_each(&guc->submission_state.exec_queue_lookup, index, q)
+		if (xe_exec_queue_get_unless_zero(q))
+			set_exec_queue_wedged(q);
+	mutex_unlock(&guc->submission_state.lock);
+}
+
+static bool guc_submit_hint_wedged(struct xe_guc *guc)
+{
+	struct xe_device *xe = guc_to_xe(guc);
+
 	if (xe->wedged.mode != 2)
 		return false;
 
@@ -876,22 +903,6 @@ static bool guc_submit_hint_wedged(struct xe_guc *guc)
 
 	xe_device_declare_wedged(xe);
 
-	xe_guc_submit_reset_prepare(guc);
-	xe_guc_ct_stop(&guc->ct);
-
-	err = drmm_add_action_or_reset(&guc_to_xe(guc)->drm,
-				       guc_submit_wedged_fini, guc);
-	if (err) {
-		drm_err(&xe->drm, "Failed to register xe_guc_submit clean-up on wedged.mode=2. Although device is wedged.\n");
-		return true; /* Device is wedged anyway */
-	}
-
-	mutex_lock(&guc->submission_state.lock);
-	xa_for_each(&guc->submission_state.exec_queue_lookup, index, q)
-		if (xe_exec_queue_get_unless_zero(q))
-			set_exec_queue_wedged(q);
-	mutex_unlock(&guc->submission_state.lock);
-
 	return true;
 }
 
@@ -1677,7 +1688,8 @@ int xe_guc_submit_reset_prepare(struct xe_guc *guc)
 
 void xe_guc_submit_reset_wait(struct xe_guc *guc)
 {
-	wait_event(guc->ct.wq, !guc_read_stopped(guc));
+	wait_event(guc->ct.wq, xe_device_wedged(guc_to_xe(guc)) ||
+		   !guc_read_stopped(guc));
 }
 
 void xe_guc_submit_stop(struct xe_guc *guc)
diff --git a/drivers/gpu/drm/xe/xe_guc_submit.h b/drivers/gpu/drm/xe/xe_guc_submit.h
index 4ad5f4c1b084..bdf8c9f3d24a 100644
--- a/drivers/gpu/drm/xe/xe_guc_submit.h
+++ b/drivers/gpu/drm/xe/xe_guc_submit.h
@@ -18,6 +18,7 @@ int xe_guc_submit_reset_prepare(struct xe_guc *guc);
 void xe_guc_submit_reset_wait(struct xe_guc *guc);
 void xe_guc_submit_stop(struct xe_guc *guc);
 int xe_guc_submit_start(struct xe_guc *guc);
+void xe_guc_submit_wedge(struct xe_guc *guc);
 
 int xe_guc_sched_done_handler(struct xe_guc *guc, u32 *msg, u32 len);
 int xe_guc_deregister_done_handler(struct xe_guc *guc, u32 *msg, u32 len);
diff --git a/drivers/gpu/drm/xe/xe_uc.c b/drivers/gpu/drm/xe/xe_uc.c
index 0f240534fb72..0d073a9987c2 100644
--- a/drivers/gpu/drm/xe/xe_uc.c
+++ b/drivers/gpu/drm/xe/xe_uc.c
@@ -300,3 +300,17 @@ void xe_uc_remove(struct xe_uc *uc)
 {
 	xe_gsc_remove(&uc->gsc);
 }
+
+/**
+ * xe_uc_declare_wedged() - Declare UC wedged
+ * @uc: the UC object
+ *
+ * Wedge the UC which stops all submission, saves desired debug state, and
+ * cleans up anything which could timeout.
+ */
+void xe_uc_declare_wedged(struct xe_uc *uc)
+{
+	xe_gt_assert(uc_to_gt(uc), uc_to_xe(uc)->wedged.mode);
+
+	xe_guc_declare_wedged(&uc->guc);
+}
diff --git a/drivers/gpu/drm/xe/xe_uc.h b/drivers/gpu/drm/xe/xe_uc.h
index 11856f24e6f9..506517c11333 100644
--- a/drivers/gpu/drm/xe/xe_uc.h
+++ b/drivers/gpu/drm/xe/xe_uc.h
@@ -21,5 +21,6 @@ int xe_uc_start(struct xe_uc *uc);
 int xe_uc_suspend(struct xe_uc *uc);
 int xe_uc_sanitize_reset(struct xe_uc *uc);
 void xe_uc_remove(struct xe_uc *uc);
+void xe_uc_declare_wedged(struct xe_uc *uc);
 
 #endif

From 90936a0a4c54f0a1cdf4538f9128821ad70c36ab Mon Sep 17 00:00:00 2001
From: Matthew Brost <matthew.brost@intel.com>
Date: Mon, 15 Jul 2024 23:39:02 -0700
Subject: [PATCH 4/4] drm/xe: Don't suspend device upon wedge

When wedging a device we shouldn't be suspending device as state for
debug will be lost.

Also this appears to not work as the below stack trace pops upon trying
to resume a wedged device:

[  304.245044] INFO: task cat:12115 blocked for more than 151 seconds.
[  304.251333]       Tainted: G        W          6.10.0-rc7-xe+ #3518
[  304.257617] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
[  304.265459] task:cat             state:D stack:13384 pid:12115 tgid:12115 ppid:3986   flags:0x00000006
[  304.265465] Call Trace:
[  304.265467]  <TASK>
[  304.265469]  __schedule+0x3c4/0xdf0
[  304.265478]  schedule+0x3c/0x140
[  304.265481]  rpm_resume+0x1cc/0x740
[  304.265484]  ? __pfx_autoremove_wake_function+0x10/0x10
[  304.265489]  __pm_runtime_resume+0x49/0x80
[  304.265494]  guc_info+0x6b/0xb0 [xe]
[  304.265538]  ? __pfx___drm_printfn_seq_file+0x10/0x10
[  304.265541]  ? __pfx___drm_puts_seq_file+0x10/0x10
[  304.265545]  seq_read_iter+0x111/0x4c0
[  304.265551]  seq_read+0xfc/0x140
[  304.265556]  full_proxy_read+0x58/0x80
[  304.265560]  vfs_read+0xa7/0x360
[  304.265563]  ? find_held_lock+0x2b/0x80
[  304.265568]  ksys_read+0x64/0xe0
[  304.265571]  do_syscall_64+0x68/0x140
[  304.265575]  entry_SYSCALL_64_after_hwframe+0x76/0x7e
[  304.265578] RIP: 0033:0x7f4254d14992
[  304.265580] RSP: 002b:00007ffc558666f8 EFLAGS: 00000246 ORIG_RAX: 0000000000000000
[  304.265583] RAX: ffffffffffffffda RBX: 0000000000020000 RCX: 00007f4254d14992
[  304.265584] RDX: 0000000000020000 RSI: 00007f4254ebb000 RDI: 0000000000000003
[  304.265586] RBP: 00007f4254ebb000 R08: 00007f4254eba010 R09: 00007f4254eba010
[  304.265587] R10: 0000000000000022 R11: 0000000000000246 R12: 0000000000022000
[  304.265588] R13: 0000000000000003 R14: 0000000000020000 R15: 0000000000020000
[  304.265593]  </TASK>
[  304.265594]
               Showing all locks held in the system:
[  304.265598] 1 lock held by khungtaskd/57:
[  304.265599]  #0: ffffffff8273b860 (rcu_read_lock){....}-{1:2}, at: debug_show_all_locks+0x36/0x1c0
[  304.265607] 3 locks held by kworker/6:1/90:
[  304.265610] 1 lock held by in:imklog/547:
[  304.265611]  #0: ffff88810498cd88 (&f->f_pos_lock){+.+.}-{3:3}, at: __fdget_pos+0x76/0xc0
[  304.265620] 1 lock held by dmesg/1310:

v2: Drop local 'err' variable (Jonathan)

Fixes: 8ed9aaae39f3 ("drm/xe: Force wedged state and block GT reset upon any GPU hang")
Cc: Rodrigo Vivi <rodrigo.vivi@intel.com>
Signed-off-by: Matthew Brost <matthew.brost@intel.com>
Reviewed-by: Jonathan Cavitt <jonathan.cavitt@intel.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20240716063902.1390130-2-matthew.brost@intel.com
(cherry picked from commit 452bca0edbd0764ca0284239d5438b3edd305ab3)
Signed-off-by: Rodrigo Vivi <rodrigo.vivi@intel.com>
---
 drivers/gpu/drm/xe/xe_device.c | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

diff --git a/drivers/gpu/drm/xe/xe_device.c b/drivers/gpu/drm/xe/xe_device.c
index 41548eff953d..76109415eba6 100644
--- a/drivers/gpu/drm/xe/xe_device.c
+++ b/drivers/gpu/drm/xe/xe_device.c
@@ -854,6 +854,13 @@ u64 xe_device_uncanonicalize_addr(struct xe_device *xe, u64 address)
 	return address & GENMASK_ULL(xe->info.va_bits - 1, 0);
 }
 
+static void xe_device_wedged_fini(struct drm_device *drm, void *arg)
+{
+	struct xe_device *xe = arg;
+
+	xe_pm_runtime_put(xe);
+}
+
 /**
  * xe_device_declare_wedged - Declare device wedged
  * @xe: xe device instance
@@ -878,6 +885,13 @@ void xe_device_declare_wedged(struct xe_device *xe)
 		return;
 	}
 
+	if (drmm_add_action_or_reset(&xe->drm, xe_device_wedged_fini, xe)) {
+		drm_err(&xe->drm, "Failed to register xe_device_wedged_fini clean-up. Although device is wedged.\n");
+		return;
+	}
+
+	xe_pm_runtime_get_noresume(xe);
+
 	if (!atomic_xchg(&xe->wedged.flag, 1)) {
 		xe->needs_flr_on_fini = true;
 		drm_err(&xe->drm,