drm/amdkfd: Introduce KFD module parameter halt_if_hws_hang
Setting this parameter avoids triggering a GPU reset or otherwise changing the HW state. Instead, KFD will hang, which allows HW debugging tools to analyze the problem.

Signed-off-by: Yong Zhao <yong.zhao@amd.com>
Reviewed-by: Felix Kuehling <Felix.Kuehling@amd.com>
Signed-off-by: Felix Kuehling <Felix.Kuehling@amd.com>
Acked-by: Christian König <christian.koenig@amd.com>
Signed-off-by: Oded Gabbay <oded.gabbay@gmail.com>
This commit is contained in:
parent
a29ec470b1
commit
0e9a860c72
@ -1217,6 +1217,13 @@ int amdkfd_fence_wait_timeout(unsigned int *fence_addr,
|
||||
while (*fence_addr != fence_value) {
|
||||
if (time_after(jiffies, end_jiffies)) {
|
||||
pr_err("qcm fence wait loop timeout expired\n");
|
||||
/* In HWS case, this is used to halt the driver thread
|
||||
* in order not to mess up CP states before doing
|
||||
* scandumps for FW debugging.
|
||||
*/
|
||||
while (halt_if_hws_hang)
|
||||
schedule();
|
||||
|
||||
return -ETIME;
|
||||
}
|
||||
schedule();
|
||||
|
@ -92,6 +92,10 @@ MODULE_PARM_DESC(noretry,
|
||||
|
||||
static int amdkfd_init_completed;
|
||||
|
||||
int halt_if_hws_hang;
|
||||
module_param(halt_if_hws_hang, int, 0644);
|
||||
MODULE_PARM_DESC(halt_if_hws_hang, "Halt if HWS hang is detected (0 = off (default), 1 = on)");
|
||||
|
||||
int kgd2kfd_init(unsigned int interface_version,
|
||||
const struct kgd2kfd_calls **g2f)
|
||||
{
|
||||
|
@ -144,6 +144,11 @@ extern int ignore_crat;
|
||||
*/
|
||||
extern int vega10_noretry;
|
||||
|
||||
/*
|
||||
* Halt if HWS hang is detected
|
||||
*/
|
||||
extern int halt_if_hws_hang;
|
||||
|
||||
/**
|
||||
* enum kfd_sched_policy
|
||||
*
|
||||
|
Loading…
x
Reference in New Issue
Block a user