diff --git a/drivers/gpu/drm/ttm/ttm_bo.c b/drivers/gpu/drm/ttm/ttm_bo.c
index e58b7e249816..edf10618fe2b 100644
--- a/drivers/gpu/drm/ttm/ttm_bo.c
+++ b/drivers/gpu/drm/ttm/ttm_bo.c
@@ -370,7 +370,13 @@ static void ttm_bo_release(struct kref *kref)
 		spin_unlock(&bo->bdev->lru_lock);
 
 		INIT_WORK(&bo->delayed_delete, ttm_bo_delayed_delete);
-		queue_work(bdev->wq, &bo->delayed_delete);
+
+		/* Schedule the worker on the closest NUMA node. This
+		 * improves performance since system memory might be
+		 * cleared on free and that is best done on a CPU core
+		 * close to it.
+		 */
+		queue_work_node(bdev->pool.nid, bdev->wq, &bo->delayed_delete);
 		return;
 	}
 
diff --git a/drivers/gpu/drm/ttm/ttm_device.c b/drivers/gpu/drm/ttm/ttm_device.c
index d48b39132b32..f5187b384ae9 100644
--- a/drivers/gpu/drm/ttm/ttm_device.c
+++ b/drivers/gpu/drm/ttm/ttm_device.c
@@ -204,7 +204,8 @@ int ttm_device_init(struct ttm_device *bdev, const struct ttm_device_funcs *func
 	if (ret)
 		return ret;
 
-	bdev->wq = alloc_workqueue("ttm", WQ_MEM_RECLAIM | WQ_HIGHPRI, 16);
+	bdev->wq = alloc_workqueue("ttm",
+				   WQ_MEM_RECLAIM | WQ_HIGHPRI | WQ_UNBOUND, 16);
 	if (!bdev->wq) {
 		ttm_global_release();
 		return -ENOMEM;
@@ -213,7 +214,8 @@ int ttm_device_init(struct ttm_device *bdev, const struct ttm_device_funcs *func
 	bdev->funcs = funcs;
 
 	ttm_sys_man_init(bdev);
-	ttm_pool_init(&bdev->pool, dev, NUMA_NO_NODE, use_dma_alloc, use_dma32);
+
+	ttm_pool_init(&bdev->pool, dev, dev_to_node(dev), use_dma_alloc, use_dma32);
 
 	bdev->vma_manager = vma_manager;
 	spin_lock_init(&bdev->lru_lock);
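
For context (not part of the patch): a minimal sketch of the NUMA-aware deferred-work pattern the change adopts. The my_dev/my_release_work names are made up for illustration; only the workqueue and dev_to_node() calls are real kernel APIs. queue_work_node() expects an unbound workqueue (it warns otherwise), which is why the ttm workqueue also gains WQ_UNBOUND above, and dev_to_node() may return NUMA_NO_NODE, in which case the work simply runs on any available CPU.

#include <linux/device.h>
#include <linux/numa.h>
#include <linux/workqueue.h>

struct my_dev {
	struct workqueue_struct *wq;
	int nid;			/* NUMA node closest to the device's memory */
	struct work_struct release_work;
};

static void my_release_work(struct work_struct *work)
{
	/* Clearing/freeing system memory is cheapest near that memory. */
}

static int my_dev_init(struct my_dev *d, struct device *dev)
{
	/* WQ_UNBOUND lets the workqueue place workers on the requested node. */
	d->wq = alloc_workqueue("my_dev", WQ_MEM_RECLAIM | WQ_HIGHPRI | WQ_UNBOUND, 16);
	if (!d->wq)
		return -ENOMEM;

	/* May be NUMA_NO_NODE; queue_work_node() then falls back to any CPU. */
	d->nid = dev_to_node(dev);
	return 0;
}

static void my_dev_release(struct my_dev *d)
{
	INIT_WORK(&d->release_work, my_release_work);
	/* Run the deferred free on a CPU close to the backing memory. */
	queue_work_node(d->nid, d->wq, &d->release_work);
}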