From 9ca14f94d294862d6f5ee30a6b73f295cfaa5d08 Mon Sep 17 00:00:00 2001 From: Niranjana Vishwanathapura Date: Mon, 8 May 2023 05:22:23 +0000 Subject: [PATCH] drm/xe: Handle -EDEADLK case in preempt worker MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit With multiple active VMs, under memory pressure, it is possible that ttm_bo_validate() runs into -EDEADLK in ttm_mem_evict_wait_busy() and returns -ENOMEM. Until ttm properly handles locking in such scenarios, the best thing the driver can do is unwind the lock and retry. Update the preempt worker to retry validating BOs with a timeout upon -ENOMEM. v2: revert retry timeout upon -EAGAIN (Thomas) Reviewed-by: Thomas Hellström Signed-off-by: Niranjana Vishwanathapura Signed-off-by: Rodrigo Vivi --- drivers/gpu/drm/xe/xe_vm.c | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/drivers/gpu/drm/xe/xe_vm.c b/drivers/gpu/drm/xe/xe_vm.c index 06b559ff80bf..d9579bf5002d 100644 --- a/drivers/gpu/drm/xe/xe_vm.c +++ b/drivers/gpu/drm/xe/xe_vm.c @@ -10,6 +10,7 @@ #include #include #include +#include #include #include #include @@ -508,6 +509,8 @@ void xe_vm_unlock_dma_resv(struct xe_vm *vm, kvfree(tv); } +#define XE_VM_REBIND_RETRY_TIMEOUT_MS 1000 + static void preempt_rebind_work_func(struct work_struct *w) { struct xe_vm *vm = container_of(w, struct xe_vm, preempt.rebind_work); @@ -519,6 +522,7 @@ static void preempt_rebind_work_func(struct work_struct *w) struct dma_fence *rebind_fence; unsigned int fence_count = 0; LIST_HEAD(preempt_fences); + ktime_t end = 0; int err; long wait; int __maybe_unused tries = 0; @@ -637,6 +641,24 @@ out_unlock_outer: trace_xe_vm_rebind_worker_retry(vm); goto retry; } + + /* + * With multiple active VMs, under memory pressure, it is possible that + * ttm_bo_validate() run into -EDEADLK and in such case returns -ENOMEM. + * Until ttm properly handles locking in such scenarios, best thing the + * driver can do is retry with a timeout.
Killing the VM or putting it + * in error state after timeout or other error scenarios is still TBD. + */ + if (err == -ENOMEM) { + ktime_t cur = ktime_get(); + + end = end ? : ktime_add_ms(cur, XE_VM_REBIND_RETRY_TIMEOUT_MS); + if (ktime_before(cur, end)) { + msleep(20); + trace_xe_vm_rebind_worker_retry(vm); + goto retry; + } + } up_write(&vm->lock); free_preempt_fences(&preempt_fences);