drm/amdgpu: let mode2 reset fall back to default on failure
- introduce the AMDGPU_SKIP_MODE2_RESET flag
- let mode2 reset fall back to the default reset method if it fails

v2: move this part out of the ASIC-specific part

Signed-off-by: Victor Zhao <Victor.Zhao@amd.com>
Acked-by: Andrey Grodzovsky <andrey.grodzovsky@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
This commit is contained in:
parent
672c0218e3
commit
dac6b80818
@ -135,6 +135,7 @@ static void amdgpu_amdkfd_reset_work(struct work_struct *work)
|
|||||||
reset_context.method = AMD_RESET_METHOD_NONE;
|
reset_context.method = AMD_RESET_METHOD_NONE;
|
||||||
reset_context.reset_req_dev = adev;
|
reset_context.reset_req_dev = adev;
|
||||||
clear_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags);
|
clear_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags);
|
||||||
|
clear_bit(AMDGPU_SKIP_MODE2_RESET, &reset_context.flags);
|
||||||
|
|
||||||
amdgpu_device_gpu_recover(adev, NULL, &reset_context);
|
amdgpu_device_gpu_recover(adev, NULL, &reset_context);
|
||||||
}
|
}
|
||||||
|
@ -5148,6 +5148,7 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
|
|||||||
|
|
||||||
reset_context->job = job;
|
reset_context->job = job;
|
||||||
reset_context->hive = hive;
|
reset_context->hive = hive;
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Build list of devices to reset.
|
* Build list of devices to reset.
|
||||||
* In case we are in XGMI hive mode, resort the device list
|
* In case we are in XGMI hive mode, resort the device list
|
||||||
@ -5267,8 +5268,11 @@ retry: /* Rest of adevs pre asic reset from XGMI hive. */
|
|||||||
amdgpu_ras_resume(adev);
|
amdgpu_ras_resume(adev);
|
||||||
} else {
|
} else {
|
||||||
r = amdgpu_do_asic_reset(device_list_handle, reset_context);
|
r = amdgpu_do_asic_reset(device_list_handle, reset_context);
|
||||||
if (r && r == -EAGAIN)
|
if (r && r == -EAGAIN) {
|
||||||
|
set_bit(AMDGPU_SKIP_MODE2_RESET, &reset_context->flags);
|
||||||
|
adev->asic_reset_res = 0;
|
||||||
goto retry;
|
goto retry;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
skip_hw_reset:
|
skip_hw_reset:
|
||||||
@ -5699,6 +5703,7 @@ pci_ers_result_t amdgpu_pci_slot_reset(struct pci_dev *pdev)
|
|||||||
reset_context.reset_req_dev = adev;
|
reset_context.reset_req_dev = adev;
|
||||||
set_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags);
|
set_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags);
|
||||||
set_bit(AMDGPU_SKIP_HW_RESET, &reset_context.flags);
|
set_bit(AMDGPU_SKIP_HW_RESET, &reset_context.flags);
|
||||||
|
set_bit(AMDGPU_SKIP_MODE2_RESET, &reset_context.flags);
|
||||||
|
|
||||||
adev->no_hw_access = true;
|
adev->no_hw_access = true;
|
||||||
r = amdgpu_device_pre_asic_reset(adev, &reset_context);
|
r = amdgpu_device_pre_asic_reset(adev, &reset_context);
|
||||||
|
@ -71,6 +71,7 @@ static enum drm_gpu_sched_stat amdgpu_job_timedout(struct drm_sched_job *s_job)
|
|||||||
reset_context.method = AMD_RESET_METHOD_NONE;
|
reset_context.method = AMD_RESET_METHOD_NONE;
|
||||||
reset_context.reset_req_dev = adev;
|
reset_context.reset_req_dev = adev;
|
||||||
clear_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags);
|
clear_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags);
|
||||||
|
clear_bit(AMDGPU_SKIP_MODE2_RESET, &reset_context.flags);
|
||||||
|
|
||||||
r = amdgpu_device_gpu_recover(ring->adev, job, &reset_context);
|
r = amdgpu_device_gpu_recover(ring->adev, job, &reset_context);
|
||||||
if (r)
|
if (r)
|
||||||
|
@ -1949,6 +1949,7 @@ static void amdgpu_ras_do_recovery(struct work_struct *work)
|
|||||||
reset_context.method = AMD_RESET_METHOD_NONE;
|
reset_context.method = AMD_RESET_METHOD_NONE;
|
||||||
reset_context.reset_req_dev = adev;
|
reset_context.reset_req_dev = adev;
|
||||||
clear_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags);
|
clear_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags);
|
||||||
|
clear_bit(AMDGPU_SKIP_MODE2_RESET, &reset_context.flags);
|
||||||
|
|
||||||
amdgpu_device_gpu_recover(ras->adev, NULL, &reset_context);
|
amdgpu_device_gpu_recover(ras->adev, NULL, &reset_context);
|
||||||
}
|
}
|
||||||
|
@ -74,6 +74,9 @@ int amdgpu_reset_prepare_hwcontext(struct amdgpu_device *adev,
|
|||||||
{
|
{
|
||||||
struct amdgpu_reset_handler *reset_handler = NULL;
|
struct amdgpu_reset_handler *reset_handler = NULL;
|
||||||
|
|
||||||
|
if (test_bit(AMDGPU_SKIP_MODE2_RESET, &reset_context->flags))
|
||||||
|
return -ENOSYS;
|
||||||
|
|
||||||
if (adev->reset_cntl && adev->reset_cntl->get_reset_handler)
|
if (adev->reset_cntl && adev->reset_cntl->get_reset_handler)
|
||||||
reset_handler = adev->reset_cntl->get_reset_handler(
|
reset_handler = adev->reset_cntl->get_reset_handler(
|
||||||
adev->reset_cntl, reset_context);
|
adev->reset_cntl, reset_context);
|
||||||
@ -90,6 +93,9 @@ int amdgpu_reset_perform_reset(struct amdgpu_device *adev,
|
|||||||
int ret;
|
int ret;
|
||||||
struct amdgpu_reset_handler *reset_handler = NULL;
|
struct amdgpu_reset_handler *reset_handler = NULL;
|
||||||
|
|
||||||
|
if (test_bit(AMDGPU_SKIP_MODE2_RESET, &reset_context->flags))
|
||||||
|
return -ENOSYS;
|
||||||
|
|
||||||
if (adev->reset_cntl)
|
if (adev->reset_cntl)
|
||||||
reset_handler = adev->reset_cntl->get_reset_handler(
|
reset_handler = adev->reset_cntl->get_reset_handler(
|
||||||
adev->reset_cntl, reset_context);
|
adev->reset_cntl, reset_context);
|
||||||
|
@ -30,6 +30,7 @@ enum AMDGPU_RESET_FLAGS {
|
|||||||
|
|
||||||
AMDGPU_NEED_FULL_RESET = 0,
|
AMDGPU_NEED_FULL_RESET = 0,
|
||||||
AMDGPU_SKIP_HW_RESET = 1,
|
AMDGPU_SKIP_HW_RESET = 1,
|
||||||
|
AMDGPU_SKIP_MODE2_RESET = 2,
|
||||||
};
|
};
|
||||||
|
|
||||||
struct amdgpu_reset_context {
|
struct amdgpu_reset_context {
|
||||||
|
@ -290,6 +290,7 @@ flr_done:
|
|||||||
reset_context.method = AMD_RESET_METHOD_NONE;
|
reset_context.method = AMD_RESET_METHOD_NONE;
|
||||||
reset_context.reset_req_dev = adev;
|
reset_context.reset_req_dev = adev;
|
||||||
clear_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags);
|
clear_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags);
|
||||||
|
clear_bit(AMDGPU_SKIP_MODE2_RESET, &reset_context.flags);
|
||||||
|
|
||||||
amdgpu_device_gpu_recover(adev, NULL, &reset_context);
|
amdgpu_device_gpu_recover(adev, NULL, &reset_context);
|
||||||
}
|
}
|
||||||
|
@ -317,6 +317,7 @@ flr_done:
|
|||||||
reset_context.method = AMD_RESET_METHOD_NONE;
|
reset_context.method = AMD_RESET_METHOD_NONE;
|
||||||
reset_context.reset_req_dev = adev;
|
reset_context.reset_req_dev = adev;
|
||||||
clear_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags);
|
clear_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags);
|
||||||
|
clear_bit(AMDGPU_SKIP_MODE2_RESET, &reset_context.flags);
|
||||||
|
|
||||||
amdgpu_device_gpu_recover(adev, NULL, &reset_context);
|
amdgpu_device_gpu_recover(adev, NULL, &reset_context);
|
||||||
}
|
}
|
||||||
|
@ -529,6 +529,7 @@ static void xgpu_vi_mailbox_flr_work(struct work_struct *work)
|
|||||||
reset_context.method = AMD_RESET_METHOD_NONE;
|
reset_context.method = AMD_RESET_METHOD_NONE;
|
||||||
reset_context.reset_req_dev = adev;
|
reset_context.reset_req_dev = adev;
|
||||||
clear_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags);
|
clear_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags);
|
||||||
|
clear_bit(AMDGPU_SKIP_MODE2_RESET, &reset_context.flags);
|
||||||
|
|
||||||
amdgpu_device_gpu_recover(adev, NULL, &reset_context);
|
amdgpu_device_gpu_recover(adev, NULL, &reset_context);
|
||||||
}
|
}
|
||||||
|
Loading…
x
Reference in New Issue
Block a user