linux/drivers/gpu/drm/etnaviv/etnaviv_sched.c
Matthew Brost a6149f0393 drm/sched: Convert drm scheduler to use a work queue rather than kthread
In Xe, the new Intel GPU driver, a choice has made to have a 1 to 1
mapping between a drm_gpu_scheduler and drm_sched_entity. At first this
seems a bit odd but let us explain the reasoning below.

1. In Xe the submission order from multiple drm_sched_entity is not
guaranteed to be the same completion even if targeting the same hardware
engine. This is because in Xe we have a firmware scheduler, the GuC,
which allowed to reorder, timeslice, and preempt submissions. If a using
shared drm_gpu_scheduler across multiple drm_sched_entity, the TDR falls
apart as the TDR expects submission order == completion order. Using a
dedicated drm_gpu_scheduler per drm_sched_entity solve this problem.

2. In Xe submissions are done via programming a ring buffer (circular
buffer), a drm_gpu_scheduler provides a limit on number of jobs, if the
limit of number jobs is set to RING_SIZE / MAX_SIZE_PER_JOB we get flow
control on the ring for free.

A problem with this design is currently a drm_gpu_scheduler uses a
kthread for submission / job cleanup. This doesn't scale if a large
number of drm_gpu_scheduler are used. To work around the scaling issue,
use a worker rather than kthread for submission / job cleanup.

v2:
  - (Rob Clark) Fix msm build
  - Pass in run work queue
v3:
  - (Boris) don't have loop in worker
v4:
  - (Tvrtko) break out submit ready, stop, start helpers into own patch
v5:
  - (Boris) default to ordered work queue
v6:
  - (Luben / checkpatch) fix alignment in msm_ringbuffer.c
  - (Luben) s/drm_sched_submit_queue/drm_sched_wqueue_enqueue
  - (Luben) Update comment for drm_sched_wqueue_enqueue
  - (Luben) Positive check for submit_wq in drm_sched_init
  - (Luben) s/alloc_submit_wq/own_submit_wq
v7:
  - (Luben) s/drm_sched_wqueue_enqueue/drm_sched_run_job_queue
v8:
  - (Luben) Adjust var names / comments

Signed-off-by: Matthew Brost <matthew.brost@intel.com>
Reviewed-by: Luben Tuikov <luben.tuikov@amd.com>
Link: https://lore.kernel.org/r/20231031032439.1558703-3-matthew.brost@intel.com
Signed-off-by: Luben Tuikov <ltuikov89@gmail.com>
2023-11-01 17:29:21 -04:00

152 lines
3.8 KiB
C

// SPDX-License-Identifier: GPL-2.0
/*
* Copyright (C) 2017 Etnaviv Project
*/
#include <linux/moduleparam.h>
#include "etnaviv_drv.h"
#include "etnaviv_dump.h"
#include "etnaviv_gem.h"
#include "etnaviv_gpu.h"
#include "etnaviv_sched.h"
#include "state.xml.h"
static int etnaviv_job_hang_limit = 0;
module_param_named(job_hang_limit, etnaviv_job_hang_limit, int , 0444);
static int etnaviv_hw_jobs_limit = 4;
module_param_named(hw_job_limit, etnaviv_hw_jobs_limit, int , 0444);
static struct dma_fence *etnaviv_sched_run_job(struct drm_sched_job *sched_job)
{
struct etnaviv_gem_submit *submit = to_etnaviv_submit(sched_job);
struct dma_fence *fence = NULL;
if (likely(!sched_job->s_fence->finished.error))
fence = etnaviv_gpu_submit(submit);
else
dev_dbg(submit->gpu->dev, "skipping bad job\n");
return fence;
}
static enum drm_gpu_sched_stat etnaviv_sched_timedout_job(struct drm_sched_job
*sched_job)
{
struct etnaviv_gem_submit *submit = to_etnaviv_submit(sched_job);
struct etnaviv_gpu *gpu = submit->gpu;
u32 dma_addr;
int change;
/* block scheduler */
drm_sched_stop(&gpu->sched, sched_job);
/*
* If the GPU managed to complete this jobs fence, the timout is
* spurious. Bail out.
*/
if (dma_fence_is_signaled(submit->out_fence))
goto out_no_timeout;
/*
* If the GPU is still making forward progress on the front-end (which
* should never loop) we shift out the timeout to give it a chance to
* finish the job.
*/
dma_addr = gpu_read(gpu, VIVS_FE_DMA_ADDRESS);
change = dma_addr - gpu->hangcheck_dma_addr;
if (gpu->state == ETNA_GPU_STATE_RUNNING &&
(gpu->completed_fence != gpu->hangcheck_fence ||
change < 0 || change > 16)) {
gpu->hangcheck_dma_addr = dma_addr;
gpu->hangcheck_fence = gpu->completed_fence;
goto out_no_timeout;
}
if(sched_job)
drm_sched_increase_karma(sched_job);
/* get the GPU back into the init state */
etnaviv_core_dump(submit);
etnaviv_gpu_recover_hang(submit);
drm_sched_resubmit_jobs(&gpu->sched);
drm_sched_start(&gpu->sched, true);
return DRM_GPU_SCHED_STAT_NOMINAL;
out_no_timeout:
/* restart scheduler after GPU is usable again */
drm_sched_start(&gpu->sched, true);
return DRM_GPU_SCHED_STAT_NOMINAL;
}
static void etnaviv_sched_free_job(struct drm_sched_job *sched_job)
{
struct etnaviv_gem_submit *submit = to_etnaviv_submit(sched_job);
drm_sched_job_cleanup(sched_job);
etnaviv_submit_put(submit);
}
static const struct drm_sched_backend_ops etnaviv_sched_ops = {
.run_job = etnaviv_sched_run_job,
.timedout_job = etnaviv_sched_timedout_job,
.free_job = etnaviv_sched_free_job,
};
int etnaviv_sched_push_job(struct etnaviv_gem_submit *submit)
{
struct etnaviv_gpu *gpu = submit->gpu;
int ret;
/*
* Hold the sched lock across the whole operation to avoid jobs being
* pushed out of order with regard to their sched fence seqnos as
* allocated in drm_sched_job_arm.
*/
mutex_lock(&gpu->sched_lock);
drm_sched_job_arm(&submit->sched_job);
submit->out_fence = dma_fence_get(&submit->sched_job.s_fence->finished);
ret = xa_alloc_cyclic(&gpu->user_fences, &submit->out_fence_id,
submit->out_fence, xa_limit_32b,
&gpu->next_user_fence, GFP_KERNEL);
if (ret < 0) {
drm_sched_job_cleanup(&submit->sched_job);
goto out_unlock;
}
/* the scheduler holds on to the job now */
kref_get(&submit->refcount);
drm_sched_entity_push_job(&submit->sched_job);
out_unlock:
mutex_unlock(&gpu->sched_lock);
return ret;
}
int etnaviv_sched_init(struct etnaviv_gpu *gpu)
{
int ret;
ret = drm_sched_init(&gpu->sched, &etnaviv_sched_ops, NULL,
DRM_SCHED_PRIORITY_COUNT,
etnaviv_hw_jobs_limit, etnaviv_job_hang_limit,
msecs_to_jiffies(500), NULL, NULL,
dev_name(gpu->dev), gpu->dev);
if (ret)
return ret;
return 0;
}
void etnaviv_sched_fini(struct etnaviv_gpu *gpu)
{
drm_sched_fini(&gpu->sched);
}