drm/amd/virt: For SRIOV send GPU reset directly to TDR queue.
No need to trigger another work queue inside the work queue. v3: Problem: Extra reset caused by host side FLR notification following guest side triggered reset. Fix: Prevent queuing flr_work from mailbox irq if the guest is already executing a reset. Suggested-by: Liu Shaoyun <Shaoyun.Liu@amd.com> Signed-off-by: Andrey Grodzovsky <andrey.grodzovsky@amd.com> Reviewed-by: Liu Shaoyun <Shaoyun.Liu@amd.com> Link: https://www.spinics.net/lists/amd-gfx/msg74114.html
This commit is contained in:
parent
54f329cc7a
commit
02599bc7f7
@ -282,7 +282,7 @@ flr_done:
|
||||
if (amdgpu_device_should_recover_gpu(adev)
|
||||
&& (!amdgpu_device_has_job_running(adev) ||
|
||||
adev->sdma_timeout == MAX_SCHEDULE_TIMEOUT))
|
||||
amdgpu_device_gpu_recover(adev, NULL);
|
||||
amdgpu_device_gpu_recover_imp(adev, NULL);
|
||||
}
|
||||
|
||||
static int xgpu_ai_set_mailbox_rcv_irq(struct amdgpu_device *adev,
|
||||
@ -307,8 +307,11 @@ static int xgpu_ai_mailbox_rcv_irq(struct amdgpu_device *adev,
|
||||
|
||||
switch (event) {
|
||||
case IDH_FLR_NOTIFICATION:
|
||||
if (amdgpu_sriov_runtime(adev))
|
||||
schedule_work(&adev->virt.flr_work);
|
||||
if (amdgpu_sriov_runtime(adev) && !amdgpu_in_reset(adev))
|
||||
WARN_ONCE(!queue_work(adev->reset_domain.wq,
|
||||
&adev->virt.flr_work),
|
||||
"Failed to queue work! at %s",
|
||||
__func__);
|
||||
break;
|
||||
case IDH_QUERY_ALIVE:
|
||||
xgpu_ai_mailbox_send_ack(adev);
|
||||
|
@ -309,7 +309,7 @@ flr_done:
|
||||
adev->gfx_timeout == MAX_SCHEDULE_TIMEOUT ||
|
||||
adev->compute_timeout == MAX_SCHEDULE_TIMEOUT ||
|
||||
adev->video_timeout == MAX_SCHEDULE_TIMEOUT))
|
||||
amdgpu_device_gpu_recover(adev, NULL);
|
||||
amdgpu_device_gpu_recover_imp(adev, NULL);
|
||||
}
|
||||
|
||||
static int xgpu_nv_set_mailbox_rcv_irq(struct amdgpu_device *adev,
|
||||
@ -337,8 +337,11 @@ static int xgpu_nv_mailbox_rcv_irq(struct amdgpu_device *adev,
|
||||
|
||||
switch (event) {
|
||||
case IDH_FLR_NOTIFICATION:
|
||||
if (amdgpu_sriov_runtime(adev))
|
||||
schedule_work(&adev->virt.flr_work);
|
||||
if (amdgpu_sriov_runtime(adev) && !amdgpu_in_reset(adev))
|
||||
WARN_ONCE(!queue_work(adev->reset_domain.wq,
|
||||
&adev->virt.flr_work),
|
||||
"Failed to queue work! at %s",
|
||||
__func__);
|
||||
break;
|
||||
/* READY_TO_ACCESS_GPU is fetched by kernel polling, IRQ can ignore
|
||||
* it byfar since that polling thread will handle it,
|
||||
|
@ -521,7 +521,7 @@ static void xgpu_vi_mailbox_flr_work(struct work_struct *work)
|
||||
|
||||
/* Trigger recovery due to world switch failure */
|
||||
if (amdgpu_device_should_recover_gpu(adev))
|
||||
amdgpu_device_gpu_recover(adev, NULL);
|
||||
amdgpu_device_gpu_recover_imp(adev, NULL);
|
||||
}
|
||||
|
||||
static int xgpu_vi_set_mailbox_rcv_irq(struct amdgpu_device *adev,
|
||||
@ -550,8 +550,11 @@ static int xgpu_vi_mailbox_rcv_irq(struct amdgpu_device *adev,
|
||||
r = xgpu_vi_mailbox_rcv_msg(adev, IDH_FLR_NOTIFICATION);
|
||||
|
||||
/* only handle FLR_NOTIFY now */
|
||||
if (!r)
|
||||
schedule_work(&adev->virt.flr_work);
|
||||
if (!r && !amdgpu_in_reset(adev))
|
||||
WARN_ONCE(!queue_work(adev->reset_domain.wq,
|
||||
&adev->virt.flr_work),
|
||||
"Failed to queue work! at %s",
|
||||
__func__);
|
||||
}
|
||||
|
||||
return 0;
|
||||
|
Loading…
x
Reference in New Issue
Block a user