/*
 * Copyright 2015 Advanced Micro Devices, Inc.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
 * OTHER DEALINGS IN THE SOFTWARE.
 *
 */
#include <linux/kthread.h>
#include <linux/wait.h>
#include <linux/sched.h>

#include "amdgpu.h"
#include "amdgpu_trace.h"
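
/*
 * drm_sched timedout_job callback: called when a scheduled job runs past the
 * ring's timeout. Try a soft recovery of the ring first (when GPU recovery is
 * enabled); otherwise log the offending process and either kick off a full
 * GPU recovery or suspend the scheduler timeout.
 */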
static void amdgpu_job_timedout(struct drm_sched_job *s_job)
{
	struct amdgpu_ring *ring = to_amdgpu_ring(s_job->sched);
	struct amdgpu_job *job = to_amdgpu_job(s_job);
	struct amdgpu_task_info ti;
	struct amdgpu_device *adev = ring->adev;

	memset(&ti, 0, sizeof(struct amdgpu_task_info));

	if (amdgpu_gpu_recovery &&
	    amdgpu_ring_soft_recovery(ring, job->vmid, s_job->s_fence->parent)) {
		DRM_ERROR("ring %s timeout, but soft recovered\n",
			  s_job->sched->name);
		return;
	}

	amdgpu_vm_get_task_info(ring->adev, job->pasid, &ti);
	DRM_ERROR("ring %s timeout, signaled seq=%u, emitted seq=%u\n",
		  job->base.sched->name, atomic_read(&ring->fence_drv.last_seq),
		  ring->fence_drv.sync_seq);
	DRM_ERROR("Process information: process %s pid %d thread %s pid %d\n",
		  ti.process_name, ti.tgid, ti.task_name, ti.pid);

	if (amdgpu_device_should_recover_gpu(ring->adev)) {
		amdgpu_device_gpu_recover(ring->adev, job);
	} else {
		drm_sched_suspend_timeout(&ring->sched);
		if (amdgpu_sriov_vf(adev))
			adev->virt.tdr_debug = true;
	}
}
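
/*
 * Allocate a job with room for @num_ibs IBs placed directly behind the job
 * structure, and set up its sync containers. The scheduler pointer is only
 * initialized to the first ring so that a device pointer is always reachable;
 * the real ring is chosen at submission time.
 */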
int amdgpu_job_alloc(struct amdgpu_device *adev, unsigned num_ibs,
		     struct amdgpu_job **job, struct amdgpu_vm *vm)
{
	size_t size = sizeof(struct amdgpu_job);

	if (num_ibs == 0)
		return -EINVAL;

	size += sizeof(struct amdgpu_ib) * num_ibs;

	*job = kzalloc(size, GFP_KERNEL);
	if (!*job)
		return -ENOMEM;

	/*
	 * Initialize the scheduler to at least some ring so that we always
	 * have a pointer to adev.
	 */
	(*job)->base.sched = &adev->rings[0]->sched;
	(*job)->vm = vm;
	(*job)->ibs = (void *)&(*job)[1];
	(*job)->num_ibs = num_ibs;

	amdgpu_sync_create(&(*job)->sync);
	amdgpu_sync_create(&(*job)->sched_sync);
	(*job)->vram_lost_counter = atomic_read(&adev->vram_lost_counter);
	(*job)->vm_pd_addr = AMDGPU_BO_INVALID_OFFSET;

	return 0;
}
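
/*
 * Convenience wrapper around amdgpu_job_alloc() for the common case of a
 * single IB of @size bytes taken from the given IB pool.
 */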
int amdgpu_job_alloc_with_ib(struct amdgpu_device *adev, unsigned size,
			     enum amdgpu_ib_pool_type pool_type,
			     struct amdgpu_job **job)
{
	int r;

	r = amdgpu_job_alloc(adev, 1, job, NULL);
	if (r)
		return r;

	r = amdgpu_ib_get(adev, NULL, size, pool_type, &(*job)->ibs[0]);
	if (r)
		kfree(*job);

	return r;
}
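
/*
 * Free the IBs of a job; the underlying memory is kept alive until the
 * scheduler's finished fence (or the hardware fence when no scheduler fence
 * exists) has signaled.
 */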
void amdgpu_job_free_resources(struct amdgpu_job *job)
{
	struct amdgpu_ring *ring = to_amdgpu_ring(job->base.sched);
	struct dma_fence *f;
	unsigned i;

	/* use sched fence if available */
	f = job->base.s_fence ? &job->base.s_fence->finished : job->fence;

	for (i = 0; i < job->num_ibs; ++i)
		amdgpu_ib_free(ring->adev, &job->ibs[i], f);
}
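
/*
 * drm_sched free_job callback: release the scheduler bookkeeping, drop the
 * job's hardware fence and sync containers, and free the job itself.
 */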
static void amdgpu_job_free_cb(struct drm_sched_job *s_job)
{
	struct amdgpu_job *job = to_amdgpu_job(s_job);

	drm_sched_job_cleanup(s_job);

	dma_fence_put(job->fence);
	amdgpu_sync_free(&job->sync);
	amdgpu_sync_free(&job->sched_sync);
	kfree(job);
}
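
/*
 * Free a job that was never handed over to the scheduler, e.g. on an error
 * path or after direct submission.
 */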
void amdgpu_job_free(struct amdgpu_job *job)
{
	amdgpu_job_free_resources(job);

	dma_fence_put(job->fence);
	amdgpu_sync_free(&job->sync);
	amdgpu_sync_free(&job->sched_sync);
	kfree(job);
}
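
/*
 * Hand a job over to the drm scheduler: initialize the scheduler job, return
 * the finished fence to the caller through @f and push the job to the
 * entity's queue. The scheduler owns the job from this point on.
 */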
int amdgpu_job_submit(struct amdgpu_job *job, struct drm_sched_entity *entity,
		      void *owner, struct dma_fence **f)
{
	int r;

	if (!f)
		return -EINVAL;

	r = drm_sched_job_init(&job->base, entity, owner);
	if (r)
		return r;

	*f = dma_fence_get(&job->base.s_fence->finished);
	amdgpu_job_free_resources(job);
	drm_sched_entity_push_job(&job->base, entity);

	return 0;
}
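
/*
 * Submit a job directly to a ring, bypassing the scheduler. The job is freed
 * before returning; only the hardware fence is handed back through @fence.
 */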
int amdgpu_job_submit_direct(struct amdgpu_job *job, struct amdgpu_ring *ring,
			     struct dma_fence **fence)
{
	int r;

	job->base.sched = &ring->sched;
	r = amdgpu_ib_schedule(ring, job->num_ibs, job->ibs, NULL, fence);
	job->fence = dma_fence_get(*fence);
	if (r)
		return r;

	amdgpu_job_free(job);
	return 0;
}
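
/*
 * drm_sched dependency callback: return the next fence the job has to wait
 * on before it can run. Once all explicit dependencies are resolved, also
 * grab a VMID for the job, which may add further fences to wait for.
 */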
static struct dma_fence *amdgpu_job_dependency(struct drm_sched_job *sched_job,
					       struct drm_sched_entity *s_entity)
{
	struct amdgpu_ring *ring = to_amdgpu_ring(s_entity->rq->sched);
	struct amdgpu_job *job = to_amdgpu_job(sched_job);
	struct amdgpu_vm *vm = job->vm;
	struct dma_fence *fence;
	int r;

	fence = amdgpu_sync_get_fence(&job->sync);
	if (fence && drm_sched_dependency_optimized(fence, s_entity)) {
		r = amdgpu_sync_fence(&job->sched_sync, fence);
		if (r)
			DRM_ERROR("Error adding fence (%d)\n", r);
	}

	while (fence == NULL && vm && !job->vmid) {
		r = amdgpu_vmid_grab(vm, ring, &job->sync,
				     &job->base.s_fence->finished,
				     job);
		if (r)
			DRM_ERROR("Error getting VM ID (%d)\n", r);

		fence = amdgpu_sync_get_fence(&job->sync);
	}

	return fence;
}
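
/*
 * drm_sched run_job callback: submit the job's IBs to the ring and return the
 * hardware fence. Submission is skipped when VRAM was lost since the job was
 * created or when the finished fence already carries an error.
 */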
static struct dma_fence *amdgpu_job_run(struct drm_sched_job *sched_job)
{
	struct amdgpu_ring *ring = to_amdgpu_ring(sched_job->sched);
	struct dma_fence *fence = NULL, *finished;
	struct amdgpu_job *job;
	int r = 0;

	job = to_amdgpu_job(sched_job);
	finished = &job->base.s_fence->finished;

	BUG_ON(amdgpu_sync_peek_fence(&job->sync, NULL));

	trace_amdgpu_sched_run_job(job);

	if (job->vram_lost_counter != atomic_read(&ring->adev->vram_lost_counter))
		dma_fence_set_error(finished, -ECANCELED); /* skip IB as well if VRAM lost */

	if (finished->error < 0) {
		DRM_INFO("Skip scheduling IBs!\n");
	} else {
		r = amdgpu_ib_schedule(ring, job->num_ibs, job->ibs, job,
				       &fence);
		if (r)
			DRM_ERROR("Error scheduling IBs (%d)\n", r);
	}

	/* if gpu reset, hw fence will be replaced here */
	dma_fence_put(job->fence);
	job->fence = dma_fence_get(fence);

	amdgpu_job_free_resources(job);

	fence = r ? ERR_PTR(r) : fence;
	return fence;
}
#define to_drm_sched_job(sched_job)		\
		container_of((sched_job), struct drm_sched_job, queue_node)
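
/*
 * Fail every job on a scheduler: pop all jobs still queued on its entities
 * and signal their fences with -EHWPOISON, then do the same for the jobs
 * already pushed to the hardware.
 */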
void amdgpu_job_stop_all_jobs_on_sched(struct drm_gpu_scheduler *sched)
{
	struct drm_sched_job *s_job;
	struct drm_sched_entity *s_entity = NULL;
	int i;

	/* Signal all jobs not yet scheduled */
	for (i = DRM_SCHED_PRIORITY_COUNT - 1; i >= DRM_SCHED_PRIORITY_MIN; i--) {
		struct drm_sched_rq *rq = &sched->sched_rq[i];

		if (!rq)
			continue;

		spin_lock(&rq->lock);
		list_for_each_entry(s_entity, &rq->entities, list) {
			while ((s_job = to_drm_sched_job(spsc_queue_pop(&s_entity->job_queue)))) {
				struct drm_sched_fence *s_fence = s_job->s_fence;

				dma_fence_signal(&s_fence->scheduled);
				dma_fence_set_error(&s_fence->finished, -EHWPOISON);
				dma_fence_signal(&s_fence->finished);
			}
		}
		spin_unlock(&rq->lock);
	}

	/* Signal all jobs already scheduled to HW */
	list_for_each_entry(s_job, &sched->ring_mirror_list, node) {
		struct drm_sched_fence *s_fence = s_job->s_fence;

		dma_fence_set_error(&s_fence->finished, -EHWPOISON);
		dma_fence_signal(&s_fence->finished);
	}
}
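
/* Scheduler backend callbacks hooking amdgpu job handling into drm_sched. */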
const struct drm_sched_backend_ops amdgpu_sched_ops = {
	.dependency = amdgpu_job_dependency,
	.run_job = amdgpu_job_run,
	.timedout_job = amdgpu_job_timedout,
	.free_job = amdgpu_job_free_cb
};