2021-01-22 19:29:04 +00:00
// SPDX-License-Identifier: GPL-2.0
2019-08-08 21:27:58 +01:00
/*
* Copyright © 2018 Intel Corporation
*/
2020-12-23 12:23:58 +00:00
# include <linux/sort.h>
2020-12-16 13:54:52 +00:00
2019-08-08 21:27:58 +01:00
# include "i915_selftest.h"
2022-01-10 21:15:56 -08:00
# include "intel_engine_regs.h"
2020-12-23 12:23:58 +00:00
# include "intel_gpu_commands.h"
# include "intel_gt_clock_utils.h"
2019-08-08 21:27:58 +01:00
# include "selftest_engine.h"
2020-06-17 14:09:15 +01:00
# include "selftest_engine_heartbeat.h"
2019-08-08 21:27:58 +01:00
# include "selftests/igt_atomic.h"
2020-06-17 14:09:15 +01:00
# include "selftests/igt_flush_test.h"
# include "selftests/igt_spinner.h"
2020-12-23 12:23:58 +00:00
/* Number of timestamp samples collected per engine and sorted by trifilter(). */
# define COUNT 5
/*
 * Three-way comparison of two u64 values, suitable as a sort() callback.
 *
 * Returns a negative value, zero or a positive value when *A is less than,
 * equal to or greater than *B respectively.
 *
 * The values are compared rather than subtracted: a u64 difference that is
 * implicitly truncated to int loses its upper 32 bits and can change sign,
 * which reports distinct values as equal or in the wrong order.
 */
static int cmp_u64(const void *A, const void *B)
{
	const u64 *a = A, *b = B;

	if (*a < *b)
		return -1;
	if (*a > *b)
		return 1;
	return 0;
}
/*
 * Sort the COUNT samples in @a in place and return a weighted average of the
 * three middle entries, (a[1] + 2 * a[2] + a[3]) / 4. With COUNT == 5 the
 * smallest and the largest sample do not contribute to the result.
 */
static u64 trifilter(u64 *a)
{
	u64 sum;

	sort(a, COUNT, sizeof(*a), cmp_u64, NULL);

	sum = a[1];
	sum += 2 * a[2];
	sum += a[3];

	return sum >> 2;
}
/*
 * Write a four-dword MI_SEMAPHORE_WAIT command (global GTT, polling, with the
 * caller supplied compare operation @op) at @cs: the command dword, then
 * @value, then @offset, then a zero dword.
 *
 * Returns the command stream pointer advanced past the emitted dwords.
 */
static u32 *emit_wait(u32 *cs, u32 offset, int op, u32 value)
{
	cs[0] = MI_SEMAPHORE_WAIT |
		MI_SEMAPHORE_GLOBAL_GTT |
		MI_SEMAPHORE_POLL |
		op;
	cs[1] = value;
	cs[2] = offset;
	cs[3] = 0;

	return cs + 4;
}
/*
 * Write a four-dword MI_STORE_DWORD_IMM_GEN4 command (GGTT addressing) at
 * @cs: the command dword, then @offset, then a zero dword, then @value.
 *
 * Returns the command stream pointer advanced past the emitted dwords.
 */
static u32 *emit_store(u32 *cs, u32 offset, u32 value)
{
	cs[0] = MI_STORE_DWORD_IMM_GEN4 | MI_USE_GGTT;
	cs[1] = offset;
	cs[2] = 0;
	cs[3] = value;

	return cs + 4;
}
/*
 * Write a four-dword MI_STORE_REGISTER_MEM_GEN8 command (GGTT addressing) at
 * @cs: the command dword, then the mmio offset of @reg, then @offset, then a
 * zero dword.
 *
 * Returns the command stream pointer advanced past the emitted dwords.
 */
static u32 *emit_srm(u32 *cs, i915_reg_t reg, u32 offset)
{
	cs[0] = MI_STORE_REGISTER_MEM_GEN8 | MI_USE_GGTT;
	cs[1] = i915_mmio_reg_offset(reg);
	cs[2] = offset;
	cs[3] = 0;

	return cs + 4;
}
/* Store @value into the semaphore dword at @x, followed by a write barrier. */
static void write_semaphore(u32 *x, u32 value)
{
	WRITE_ONCE(*x, value);

	/* Issue the barrier only after the store above has been made. */
	wmb();
}
/*
 * Submit one request on @ce that records RING_TIMESTAMP and CTX_TIMESTAMP
 * before and after a delay timed by the CPU.
 *
 * Five zeroed dwords of the engine's status page (sema[0..4]) are used as
 * scratch space; the request addresses them through the GGTT at byte offsets
 * 4000..4016 (this matches sema[] assuming status_page.addr is a u32
 * pointer - TODO confirm):
 *
 *   sema[0] / sema[4]: RING_TIMESTAMP at start / end
 *   sema[1] / sema[3]: CTX_TIMESTAMP at start / end
 *   sema[2]:           handshake semaphore between the CPU and the request
 *
 * On success returns 0 and stores the local_clock() time elapsed over the
 * delay in *@dt and the differences of the sampled registers in *@d_ring and
 * *@d_ctx. Returns the error from request/ring setup, or -ETIME if the
 * request has not completed after HZ / 2 jiffies.
 */
static int __measure_timestamps(struct intel_context *ce,
				u64 *dt, u64 *d_ring, u64 *d_ctx)
{
	struct intel_engine_cs *engine = ce->engine;
	u32 *sema = memset32(engine->status_page.addr + 1000, 0, 5);
	u32 offset = i915_ggtt_offset(engine->status_page.vma);
	const u32 ring_start = offset + 4000;
	const u32 ctx_start = offset + 4004;
	const u32 handshake = offset + 4008;
	const u32 ctx_end = offset + 4012;
	const u32 ring_end = offset + 4016;
	struct i915_request *rq;
	u32 *cs;
	int err;

	rq = intel_context_create_request(ce);
	if (IS_ERR(rq))
		return PTR_ERR(rq);

	/* 7 commands of 4 dwords each */
	cs = intel_ring_begin(rq, 28);
	if (IS_ERR(cs)) {
		i915_request_add(rq);
		return PTR_ERR(cs);
	}

	/* Signal & wait for start */
	cs = emit_store(cs, handshake, 1);
	cs = emit_wait(cs, handshake, MI_SEMAPHORE_SAD_NEQ_SDD, 1);

	cs = emit_srm(cs, RING_TIMESTAMP(engine->mmio_base), ring_start);
	cs = emit_srm(cs, RING_CTX_TIMESTAMP(engine->mmio_base), ctx_start);

	/* Busy wait */
	cs = emit_wait(cs, handshake, MI_SEMAPHORE_SAD_EQ_SDD, 1);

	cs = emit_srm(cs, RING_TIMESTAMP(engine->mmio_base), ring_end);
	cs = emit_srm(cs, RING_CTX_TIMESTAMP(engine->mmio_base), ctx_end);

	intel_ring_advance(rq, cs);
	i915_request_get(rq);
	i915_request_add(rq);
	intel_engine_flush_submission(engine);

	/* Wait for the request to start executing, that then waits for us */
	while (!READ_ONCE(sema[2]))
		cpu_relax();

	/* Run the request for a 100us, sampling timestamps before/after */
	local_irq_disable();
	write_semaphore(&sema[2], 0);
	while (!READ_ONCE(sema[1])) /* wait for the gpu to catch up */
		cpu_relax();
	*dt = local_clock();
	udelay(100);
	*dt = local_clock() - *dt;
	write_semaphore(&sema[2], 1);
	local_irq_enable();

	err = i915_request_wait(rq, 0, HZ / 2) < 0 ? -ETIME : 0;
	i915_request_put(rq);
	if (err)
		return err;

	pr_debug(" %s CTX_TIMESTAMP: [%x, %x], RING_TIMESTAMP: [%x, %x] \n ",
		 engine->name, sema[1], sema[3], sema[0], sema[4]);

	*d_ctx = sema[3] - sema[1];
	*d_ring = sema[4] - sema[0];

	return 0;
}
static int __live_engine_timestamps ( struct intel_engine_cs * engine )
{
u64 s_ring [ COUNT ] , s_ctx [ COUNT ] , st [ COUNT ] , d_ring , d_ctx , dt ;
struct intel_context * ce ;
int i , err = 0 ;
ce = intel_context_create ( engine ) ;
if ( IS_ERR ( ce ) )
return PTR_ERR ( ce ) ;
for ( i = 0 ; i < COUNT ; i + + ) {
err = __measure_timestamps ( ce , & st [ i ] , & s_ring [ i ] , & s_ctx [ i ] ) ;
if ( err )
break ;
}
intel_context_put ( ce ) ;
if ( err )
return err ;
dt = trifilter ( st ) ;
d_ring = trifilter ( s_ring ) ;
d_ctx = trifilter ( s_ctx ) ;
2020-12-23 12:23:59 +00:00
pr_info ( " %s elapsed:%lldns, CTX_TIMESTAMP:%lldns, RING_TIMESTAMP:%lldns \n " ,
2020-12-23 12:23:58 +00:00
engine - > name , dt ,
intel_gt_clock_interval_to_ns ( engine - > gt , d_ctx ) ,
intel_gt_clock_interval_to_ns ( engine - > gt , d_ring ) ) ;
d_ring = intel_gt_clock_interval_to_ns ( engine - > gt , d_ring ) ;
if ( 3 * dt > 4 * d_ring | | 4 * dt < 3 * d_ring ) {
pr_err ( " %s Mismatch between ring timestamp and walltime! \n " ,
engine - > name ) ;
return - EINVAL ;
}
d_ring = trifilter ( s_ring ) ;
d_ctx = trifilter ( s_ctx ) ;
2020-12-23 12:23:59 +00:00
d_ctx * = engine - > gt - > clock_frequency ;
2021-06-24 16:52:50 +05:30
if ( GRAPHICS_VER ( engine - > i915 ) = = 11 )
d_ring * = 12500000 ; /* Fixed 80ns for GEN11 ctx timestamp? */
2020-12-23 12:23:58 +00:00
else
2020-12-23 12:23:59 +00:00
d_ring * = engine - > gt - > clock_frequency ;
2020-12-23 12:23:58 +00:00
if ( 3 * d_ctx > 4 * d_ring | | 4 * d_ctx < 3 * d_ring ) {
pr_err ( " %s Mismatch between ring and context timestamps! \n " ,
engine - > name ) ;
return - EINVAL ;
}
return 0 ;
}
static int live_engine_timestamps ( void * arg )
{
struct intel_gt * gt = arg ;
struct intel_engine_cs * engine ;
enum intel_engine_id id ;
/*
* Check that CS_TIMESTAMP / CTX_TIMESTAMP are in sync , i . e . share
* the same CS clock .
*/
2021-06-05 08:53:52 -07:00
if ( GRAPHICS_VER ( gt - > i915 ) < 8 )
2020-12-23 12:23:58 +00:00
return 0 ;
for_each_engine ( engine , gt , id ) {
int err ;
st_engine_heartbeat_disable ( engine ) ;
err = __live_engine_timestamps ( engine ) ;
st_engine_heartbeat_enable ( engine ) ;
if ( err )
return err ;
}
return 0 ;
}
drm/i915/pmu: Connect engine busyness stats from GuC to pmu
With GuC handling scheduling, i915 is not aware of the time that a
context is scheduled in and out of the engine. Since i915 pmu relies on
this info to provide engine busyness to the user, GuC shares this info
with i915 for all engines using shared memory. For each engine, this
info contains:
- total busyness: total time that the context was running (total)
- id: id of the running context (id)
- start timestamp: timestamp when the context started running (start)
At the time (now) of sampling the engine busyness, if the id is valid
(!= ~0), and start is non-zero, then the context is considered to be
active and the engine busyness is calculated using the below equation
engine busyness = total + (now - start)
All times are obtained from the gt clock base. For inactive contexts,
engine busyness is just equal to the total.
The start and total values provided by GuC are 32 bits and wrap around
in a few minutes. Since perf pmu provides busyness as 64 bit
monotonically increasing values, there is a need for this implementation
to account for overflows and extend the time to 64 bits before returning
busyness to the user. In order to do that, a worker runs periodically with
a period of 1/8th the time it takes for the timestamp to wrap. As an
example, that would be once every 27 seconds for a gt clock frequency of
19.2 MHz.
Note:
There might be an over-accounting of busyness due to the fact that GuC
may be updating the total and start values while kmd is reading them.
(i.e. kmd may read the updated total and the stale start). In such a
case, user may see higher busyness value followed by smaller ones which
would eventually catch up to the higher value.
v2: (Tvrtko)
- Include details in commit message
- Move intel engine busyness function into execlist code
- Use union inside engine->stats
- Use natural type for ping delay jiffies
- Drop active_work condition checks
- Use for_each_engine if iterating all engines
- Drop seq locking, use spinlock at GuC level to update engine stats
- Document worker specific details
v3: (Tvrtko/Umesh)
- Demarcate GuC and execlist stat objects with comments
- Document known over-accounting issue in commit
- Provide a consistent view of GuC state
- Add hooks to gt park/unpark for GuC busyness
- Stop/start worker in gt park/unpark path
- Drop inline
- Move spinlock and worker inits to GuC initialization
- Drop helpers that are called only once
v4: (Tvrtko/Matt/Umesh)
- Drop addressed opens from commit message
- Get runtime pm in ping, remove from the park path
- Use cancel_delayed_work_sync in disable_submission path
- Update stats during reset prepare
- Skip ping if reset in progress
- Explicitly name execlists and GuC stats objects
- Since disable_submission is called from many places, move resetting
stats to intel_guc_submission_reset_prepare
v5: (Tvrtko)
- Add a trylock helper that does not sleep and synchronize PMU event
callbacks and worker with gt reset
v6: (CI BAT failures)
- DUTs using execlist submission failed to boot since __gt_unpark is
called during i915 load. This ends up calling the GuC busyness unpark
hook and results in kick-starting an uninitialized worker. Let
park/unpark hooks check if GuC submission has been initialized.
- drop cant_sleep() from trylock helper since rcu_read_lock takes care
of that.
v7: (CI) Fix igt@i915_selftest@live@gt_engines
- For GuC mode of submission the engine busyness is derived from gt time
domain. Use gt time elapsed as reference in the selftest.
- Increase busyness calculation to 10ms duration to ensure batch runs
longer and falls within the busyness tolerances in selftest.
v8:
- Use ktime_get in selftest as before
- intel_reset_trylock_no_wait results in a lockdep splat that is not
trivial to fix since the PMU callback runs in irq context and the
reset paths are tightly knit into the driver. The test that uncovers
this is igt@perf_pmu@faulting-read. Drop intel_reset_trylock_no_wait,
instead use the reset_count to synchronize with gt reset during pmu
callback. For the ping, continue to use intel_reset_trylock since ping
is not run in irq context.
- GuC PM timestamp does not tick when GuC is idle. This can potentially
result in wrong busyness values when a context is active on the
engine, but GuC is idle. Use the RING TIMESTAMP as GPU timestamp to
process the GuC busyness stats. This works since both GuC timestamp and
RING timestamp are synced with the same clock.
- The busyness stats may get updated after the batch starts running.
This delay causes the busyness reported for 100us duration to fall
below 95% in the selftest. The only option at this time is to wait for
GuC busyness to change from idle to active before we sample busyness
over a 100us period.
Signed-off-by: John Harrison <John.C.Harrison@Intel.com>
Signed-off-by: Umesh Nerlige Ramappa <umesh.nerlige.ramappa@intel.com>
Acked-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
Reviewed-by: Matthew Brost <matthew.brost@intel.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20211027004821.66097-2-umesh.nerlige.ramappa@intel.com
2021-10-26 17:48:21 -07:00
/*
 * Poll until the busy time reported for @engine differs from @busyness.
 *
 * Only engines using GuC submission are polled; for any other engine the
 * function returns 0 immediately.
 *
 * Returns 0 once the reported busy time has changed, or -ETIME if it is still
 * equal to @busyness after 10ms.
 */
static int __spin_until_busier(struct intel_engine_cs *engine, ktime_t busyness)
{
	const ktime_t timeout = 10000000; /* 10ms, expressed in ns */
	ktime_t start, unused, dt;

	if (!intel_engine_uses_guc(engine))
		return 0;

	/*
	 * In GuC mode of submission, the busyness stats may get updated after
	 * the batch starts running. Poll for a change in busyness and timeout
	 * after 10 ms.
	 */
	start = ktime_get();
	while (intel_engine_get_busy_time(engine, &unused) == busyness) {
		dt = ktime_sub(ktime_get(), start);
		if (dt > timeout) {
			pr_err(" active wait timed out %lld \n ", dt);
			ENGINE_TRACE(engine, " active wait time out %lld \n ", dt);
			return -ETIME;
		}
	}

	return 0;
}
2020-06-17 14:09:15 +01:00
static int live_engine_busy_stats ( void * arg )
{
struct intel_gt * gt = arg ;
struct intel_engine_cs * engine ;
enum intel_engine_id id ;
struct igt_spinner spin ;
int err = 0 ;
/*
* Check that if an engine supports busy - stats , they tell the truth .
*/
if ( igt_spinner_init ( & spin , gt ) )
return - ENOMEM ;
GEM_BUG_ON ( intel_gt_pm_is_awake ( gt ) ) ;
for_each_engine ( engine , gt , id ) {
struct i915_request * rq ;
drm/i915/pmu: Connect engine busyness stats from GuC to pmu
With GuC handling scheduling, i915 is not aware of the time that a
context is scheduled in and out of the engine. Since i915 pmu relies on
this info to provide engine busyness to the user, GuC shares this info
with i915 for all engines using shared memory. For each engine, this
info contains:
- total busyness: total time that the context was running (total)
- id: id of the running context (id)
- start timestamp: timestamp when the context started running (start)
At the time (now) of sampling the engine busyness, if the id is valid
(!= ~0), and start is non-zero, then the context is considered to be
active and the engine busyness is calculated using the below equation
engine busyness = total + (now - start)
All times are obtained from the gt clock base. For inactive contexts,
engine busyness is just equal to the total.
The start and total values provided by GuC are 32 bits and wrap around
in a few minutes. Since perf pmu provides busyness as 64 bit
monotonically increasing values, there is a need for this implementation
to account for overflows and extend the time to 64 bits before returning
busyness to the user. In order to do that, a worker runs periodically at
frequency = 1/8th the time it takes for the timestamp to wrap. As an
example, that would be once in 27 seconds for a gt clock frequency of
19.2 MHz.
Note:
There might be an over-accounting of busyness due to the fact that GuC
may be updating the total and start values while kmd is reading them.
(i.e kmd may read the updated total and the stale start). In such a
case, user may see higher busyness value followed by smaller ones which
would eventually catch up to the higher value.
v2: (Tvrtko)
- Include details in commit message
- Move intel engine busyness function into execlist code
- Use union inside engine->stats
- Use natural type for ping delay jiffies
- Drop active_work condition checks
- Use for_each_engine if iterating all engines
- Drop seq locking, use spinlock at GuC level to update engine stats
- Document worker specific details
v3: (Tvrtko/Umesh)
- Demarcate GuC and execlist stat objects with comments
- Document known over-accounting issue in commit
- Provide a consistent view of GuC state
- Add hooks to gt park/unpark for GuC busyness
- Stop/start worker in gt park/unpark path
- Drop inline
- Move spinlock and worker inits to GuC initialization
- Drop helpers that are called only once
v4: (Tvrtko/Matt/Umesh)
- Drop addressed opens from commit message
- Get runtime pm in ping, remove from the park path
- Use cancel_delayed_work_sync in disable_submission path
- Update stats during reset prepare
- Skip ping if reset in progress
- Explicitly name execlists and GuC stats objects
- Since disable_submission is called from many places, move resetting
stats to intel_guc_submission_reset_prepare
v5: (Tvrtko)
- Add a trylock helper that does not sleep and synchronize PMU event
callbacks and worker with gt reset
v6: (CI BAT failures)
- DUTs using execlist submission failed to boot since __gt_unpark is
called during i915 load. This ends up calling the GuC busyness unpark
hook and results in kick-starting an uninitialized worker. Let
park/unpark hooks check if GuC submission has been initialized.
- drop cant_sleep() from trylock helper since rcu_read_lock takes care
of that.
v7: (CI) Fix igt@i915_selftest@live@gt_engines
- For GuC mode of submission the engine busyness is derived from gt time
domain. Use gt time elapsed as reference in the selftest.
- Increase busyness calculation to 10ms duration to ensure batch runs
longer and falls within the busyness tolerances in selftest.
v8:
- Use ktime_get in selftest as before
- intel_reset_trylock_no_wait results in a lockdep splat that is not
trivial to fix since the PMU callback runs in irq context and the
reset paths are tightly knit into the driver. The test that uncovers
this is igt@perf_pmu@faulting-read. Drop intel_reset_trylock_no_wait,
instead use the reset_count to synchronize with gt reset during pmu
callback. For the ping, continue to use intel_reset_trylock since ping
is not run in irq context.
- GuC PM timestamp does not tick when GuC is idle. This can potentially
result in wrong busyness values when a context is active on the
engine, but GuC is idle. Use the RING TIMESTAMP as GPU timestamp to
process the GuC busyness stats. This works since both GuC timestamp and
RING timestamp are synced with the same clock.
- The busyness stats may get updated after the batch starts running.
This delay causes the busyness reported for 100us duration to fall
below 95% in the selftest. The only option at this time is to wait for
GuC busyness to change from idle to active before we sample busyness
over a 100us period.
Signed-off-by: John Harrison <John.C.Harrison@Intel.com>
Signed-off-by: Umesh Nerlige Ramappa <umesh.nerlige.ramappa@intel.com>
Acked-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
Reviewed-by: Matthew Brost <matthew.brost@intel.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20211027004821.66097-2-umesh.nerlige.ramappa@intel.com
2021-10-26 17:48:21 -07:00
ktime_t busyness , dummy ;
2020-06-17 14:09:16 +01:00
ktime_t de , dt ;
ktime_t t [ 2 ] ;
2020-06-17 14:09:15 +01:00
if ( ! intel_engine_supports_stats ( engine ) )
continue ;
if ( ! intel_engine_can_store_dword ( engine ) )
continue ;
if ( intel_gt_pm_wait_for_idle ( gt ) ) {
err = - EBUSY ;
break ;
}
st_engine_heartbeat_disable ( engine ) ;
ENGINE_TRACE ( engine , " measuring idle time \n " ) ;
preempt_disable ( ) ;
2020-06-17 14:09:16 +01:00
de = intel_engine_get_busy_time ( engine , & t [ 0 ] ) ;
2020-06-17 14:09:15 +01:00
udelay ( 100 ) ;
2020-06-17 14:09:16 +01:00
de = ktime_sub ( intel_engine_get_busy_time ( engine , & t [ 1 ] ) , de ) ;
2020-06-17 14:09:15 +01:00
preempt_enable ( ) ;
2020-06-17 14:09:16 +01:00
dt = ktime_sub ( t [ 1 ] , t [ 0 ] ) ;
2020-06-17 14:09:15 +01:00
if ( de < 0 | | de > 10 ) {
pr_err ( " %s: reported %lldns [%d%%] busyness while sleeping [for %lldns] \n " ,
engine - > name ,
de , ( int ) div64_u64 ( 100 * de , dt ) , dt ) ;
GEM_TRACE_DUMP ( ) ;
err = - EINVAL ;
goto end ;
}
/* 100% busy */
rq = igt_spinner_create_request ( & spin ,
engine - > kernel_context ,
MI_NOOP ) ;
if ( IS_ERR ( rq ) ) {
err = PTR_ERR ( rq ) ;
goto end ;
}
i915_request_add ( rq ) ;
drm/i915/pmu: Connect engine busyness stats from GuC to pmu
With GuC handling scheduling, i915 is not aware of the time that a
context is scheduled in and out of the engine. Since i915 pmu relies on
this info to provide engine busyness to the user, GuC shares this info
with i915 for all engines using shared memory. For each engine, this
info contains:
- total busyness: total time that the context was running (total)
- id: id of the running context (id)
- start timestamp: timestamp when the context started running (start)
At the time (now) of sampling the engine busyness, if the id is valid
(!= ~0), and start is non-zero, then the context is considered to be
active and the engine busyness is calculated using the below equation
engine busyness = total + (now - start)
All times are obtained from the gt clock base. For inactive contexts,
engine busyness is just equal to the total.
The start and total values provided by GuC are 32 bits and wrap around
in a few minutes. Since perf pmu provides busyness as 64 bit
monotonically increasing values, there is a need for this implementation
to account for overflows and extend the time to 64 bits before returning
busyness to the user. In order to do that, a worker runs periodically at
frequency = 1/8th the time it takes for the timestamp to wrap. As an
example, that would be once in 27 seconds for a gt clock frequency of
19.2 MHz.
Note:
There might be an over-accounting of busyness due to the fact that GuC
may be updating the total and start values while kmd is reading them.
(i.e kmd may read the updated total and the stale start). In such a
case, user may see higher busyness value followed by smaller ones which
would eventually catch up to the higher value.
v2: (Tvrtko)
- Include details in commit message
- Move intel engine busyness function into execlist code
- Use union inside engine->stats
- Use natural type for ping delay jiffies
- Drop active_work condition checks
- Use for_each_engine if iterating all engines
- Drop seq locking, use spinlock at GuC level to update engine stats
- Document worker specific details
v3: (Tvrtko/Umesh)
- Demarcate GuC and execlist stat objects with comments
- Document known over-accounting issue in commit
- Provide a consistent view of GuC state
- Add hooks to gt park/unpark for GuC busyness
- Stop/start worker in gt park/unpark path
- Drop inline
- Move spinlock and worker inits to GuC initialization
- Drop helpers that are called only once
v4: (Tvrtko/Matt/Umesh)
- Drop addressed opens from commit message
- Get runtime pm in ping, remove from the park path
- Use cancel_delayed_work_sync in disable_submission path
- Update stats during reset prepare
- Skip ping if reset in progress
- Explicitly name execlists and GuC stats objects
- Since disable_submission is called from many places, move resetting
stats to intel_guc_submission_reset_prepare
v5: (Tvrtko)
- Add a trylock helper that does not sleep and synchronize PMU event
callbacks and worker with gt reset
v6: (CI BAT failures)
- DUTs using execlist submission failed to boot since __gt_unpark is
called during i915 load. This ends up calling the GuC busyness unpark
hook and results in kick-starting an uninitialized worker. Let
park/unpark hooks check if GuC submission has been initialized.
- drop cant_sleep() from trylock helper since rcu_read_lock takes care
of that.
v7: (CI) Fix igt@i915_selftest@live@gt_engines
- For GuC mode of submission the engine busyness is derived from gt time
domain. Use gt time elapsed as reference in the selftest.
- Increase busyness calculation to 10ms duration to ensure batch runs
longer and falls within the busyness tolerances in selftest.
v8:
- Use ktime_get in selftest as before
- intel_reset_trylock_no_wait results in a lockdep splat that is not
trivial to fix since the PMU callback runs in irq context and the
reset paths are tightly knit into the driver. The test that uncovers
this is igt@perf_pmu@faulting-read. Drop intel_reset_trylock_no_wait,
instead use the reset_count to synchronize with gt reset during pmu
callback. For the ping, continue to use intel_reset_trylock since ping
is not run in irq context.
- GuC PM timestamp does not tick when GuC is idle. This can potentially
result in wrong busyness values when a context is active on the
engine, but GuC is idle. Use the RING TIMESTAMP as GPU timestamp to
process the GuC busyness stats. This works since both GuC timestamp and
RING timestamp are synced with the same clock.
- The busyness stats may get updated after the batch starts running.
This delay causes the busyness reported for 100us duration to fall
below 95% in the selftest. The only option at this time is to wait for
GuC busyness to change from idle to active before we sample busyness
over a 100us period.
Signed-off-by: John Harrison <John.C.Harrison@Intel.com>
Signed-off-by: Umesh Nerlige Ramappa <umesh.nerlige.ramappa@intel.com>
Acked-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
Reviewed-by: Matthew Brost <matthew.brost@intel.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20211027004821.66097-2-umesh.nerlige.ramappa@intel.com
2021-10-26 17:48:21 -07:00
busyness = intel_engine_get_busy_time ( engine , & dummy ) ;
2020-06-17 14:09:15 +01:00
if ( ! igt_wait_for_spinner ( & spin , rq ) ) {
intel_gt_set_wedged ( engine - > gt ) ;
err = - ETIME ;
goto end ;
}
drm/i915/pmu: Connect engine busyness stats from GuC to pmu
With GuC handling scheduling, i915 is not aware of the time that a
context is scheduled in and out of the engine. Since i915 pmu relies on
this info to provide engine busyness to the user, GuC shares this info
with i915 for all engines using shared memory. For each engine, this
info contains:
- total busyness: total time that the context was running (total)
- id: id of the running context (id)
- start timestamp: timestamp when the context started running (start)
At the time (now) of sampling the engine busyness, if the id is valid
(!= ~0), and start is non-zero, then the context is considered to be
active and the engine busyness is calculated using the below equation
engine busyness = total + (now - start)
All times are obtained from the gt clock base. For inactive contexts,
engine busyness is just equal to the total.
The start and total values provided by GuC are 32 bits and wrap around
in a few minutes. Since perf pmu provides busyness as 64 bit
monotonically increasing values, there is a need for this implementation
to account for overflows and extend the time to 64 bits before returning
busyness to the user. In order to do that, a worker runs periodically at
frequency = 1/8th the time it takes for the timestamp to wrap. As an
example, that would be once in 27 seconds for a gt clock frequency of
19.2 MHz.
Note:
There might be an over-accounting of busyness due to the fact that GuC
may be updating the total and start values while kmd is reading them.
(i.e kmd may read the updated total and the stale start). In such a
case, user may see higher busyness value followed by smaller ones which
would eventually catch up to the higher value.
v2: (Tvrtko)
- Include details in commit message
- Move intel engine busyness function into execlist code
- Use union inside engine->stats
- Use natural type for ping delay jiffies
- Drop active_work condition checks
- Use for_each_engine if iterating all engines
- Drop seq locking, use spinlock at GuC level to update engine stats
- Document worker specific details
v3: (Tvrtko/Umesh)
- Demarcate GuC and execlist stat objects with comments
- Document known over-accounting issue in commit
- Provide a consistent view of GuC state
- Add hooks to gt park/unpark for GuC busyness
- Stop/start worker in gt park/unpark path
- Drop inline
- Move spinlock and worker inits to GuC initialization
- Drop helpers that are called only once
v4: (Tvrtko/Matt/Umesh)
- Drop addressed opens from commit message
- Get runtime pm in ping, remove from the park path
- Use cancel_delayed_work_sync in disable_submission path
- Update stats during reset prepare
- Skip ping if reset in progress
- Explicitly name execlists and GuC stats objects
- Since disable_submission is called from many places, move resetting
stats to intel_guc_submission_reset_prepare
v5: (Tvrtko)
- Add a trylock helper that does not sleep and synchronize PMU event
callbacks and worker with gt reset
v6: (CI BAT failures)
- DUTs using execlist submission failed to boot since __gt_unpark is
called during i915 load. This ends up calling the GuC busyness unpark
hook and results in kick-starting an uninitialized worker. Let
park/unpark hooks check if GuC submission has been initialized.
- drop cant_sleep() from trylock helper since rcu_read_lock takes care
of that.
v7: (CI) Fix igt@i915_selftest@live@gt_engines
- For GuC mode of submission the engine busyness is derived from gt time
domain. Use gt time elapsed as reference in the selftest.
- Increase busyness calculation to 10ms duration to ensure batch runs
longer and falls within the busyness tolerances in selftest.
v8:
- Use ktime_get in selftest as before
- intel_reset_trylock_no_wait results in a lockdep splat that is not
trivial to fix since the PMU callback runs in irq context and the
reset paths are tightly knit into the driver. The test that uncovers
this is igt@perf_pmu@faulting-read. Drop intel_reset_trylock_no_wait,
instead use the reset_count to synchronize with gt reset during pmu
callback. For the ping, continue to use intel_reset_trylock since ping
is not run in irq context.
- GuC PM timestamp does not tick when GuC is idle. This can potentially
result in wrong busyness values when a context is active on the
engine, but GuC is idle. Use the RING TIMESTAMP as GPU timestamp to
process the GuC busyness stats. This works since both GuC timestamp and
RING timestamp are synced with the same clock.
- The busyness stats may get updated after the batch starts running.
This delay causes the busyness reported for 100us duration to fall
below 95% in the selftest. The only option at this time is to wait for
GuC busyness to change from idle to active before we sample busyness
over a 100us period.
Signed-off-by: John Harrison <John.C.Harrison@Intel.com>
Signed-off-by: Umesh Nerlige Ramappa <umesh.nerlige.ramappa@intel.com>
Acked-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
Reviewed-by: Matthew Brost <matthew.brost@intel.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20211027004821.66097-2-umesh.nerlige.ramappa@intel.com
2021-10-26 17:48:21 -07:00
err = __spin_until_busier ( engine , busyness ) ;
if ( err ) {
GEM_TRACE_DUMP ( ) ;
goto end ;
}
2020-06-17 14:09:15 +01:00
ENGINE_TRACE ( engine , " measuring busy time \n " ) ;
preempt_disable ( ) ;
2020-06-17 14:09:16 +01:00
de = intel_engine_get_busy_time ( engine , & t [ 0 ] ) ;
2021-11-15 14:16:40 -08:00
mdelay ( 10 ) ;
2020-06-17 14:09:16 +01:00
de = ktime_sub ( intel_engine_get_busy_time ( engine , & t [ 1 ] ) , de ) ;
2020-06-17 14:09:15 +01:00
preempt_enable ( ) ;
2020-06-17 14:09:16 +01:00
dt = ktime_sub ( t [ 1 ] , t [ 0 ] ) ;
2020-06-17 14:09:15 +01:00
if ( 100 * de < 95 * dt | | 95 * de > 100 * dt ) {
pr_err ( " %s: reported %lldns [%d%%] busyness while spinning [for %lldns] \n " ,
engine - > name ,
de , ( int ) div64_u64 ( 100 * de , dt ) , dt ) ;
GEM_TRACE_DUMP ( ) ;
err = - EINVAL ;
goto end ;
}
end :
st_engine_heartbeat_enable ( engine ) ;
igt_spinner_end ( & spin ) ;
if ( igt_flush_test ( gt - > i915 ) )
err = - EIO ;
if ( err )
break ;
}
igt_spinner_fini ( & spin ) ;
if ( igt_flush_test ( gt - > i915 ) )
err = - EIO ;
return err ;
}
2019-08-08 21:27:58 +01:00
static int live_engine_pm ( void * arg )
{
struct intel_gt * gt = arg ;
struct intel_engine_cs * engine ;
enum intel_engine_id id ;
/*
* Check we can call intel_engine_pm_put from any context . No
* failures are reported directly , but if we mess up lockdep should
* tell us .
*/
if ( intel_gt_pm_wait_for_idle ( gt ) ) {
pr_err ( " Unable to flush GT pm before test \n " ) ;
return - EBUSY ;
}
GEM_BUG_ON ( intel_gt_pm_is_awake ( gt ) ) ;
2019-10-17 10:45:00 +01:00
for_each_engine ( engine , gt , id ) {
2019-08-08 21:27:58 +01:00
const typeof ( * igt_atomic_phases ) * p ;
for ( p = igt_atomic_phases ; p - > name ; p + + ) {
/*
* Acquisition is always synchronous , except if we
* know that the engine is already awake , in which
* case we should use intel_engine_pm_get_if_awake ( )
* to atomically grab the wakeref .
*
* In practice ,
* intel_engine_pm_get ( ) ;
* intel_engine_pm_put ( ) ;
* occurs in one thread , while simultaneously
* intel_engine_pm_get_if_awake ( ) ;
* intel_engine_pm_put ( ) ;
* occurs from atomic context in another .
*/
GEM_BUG_ON ( intel_engine_pm_is_awake ( engine ) ) ;
intel_engine_pm_get ( engine ) ;
p - > critical_section_begin ( ) ;
if ( ! intel_engine_pm_get_if_awake ( engine ) )
pr_err ( " intel_engine_pm_get_if_awake(%s) failed under %s \n " ,
engine - > name , p - > name ) ;
else
2019-11-20 12:54:33 +00:00
intel_engine_pm_put_async ( engine ) ;
intel_engine_pm_put_async ( engine ) ;
2019-08-08 21:27:58 +01:00
p - > critical_section_end ( ) ;
2019-11-20 12:54:33 +00:00
intel_engine_pm_flush ( engine ) ;
2019-08-08 21:27:58 +01:00
if ( intel_engine_pm_is_awake ( engine ) ) {
pr_err ( " %s is still awake after flushing pm \n " ,
engine - > name ) ;
return - EINVAL ;
}
/* gt wakeref is async (deferred to workqueue) */
if ( intel_gt_pm_wait_for_idle ( gt ) ) {
pr_err ( " GT failed to idle \n " ) ;
return - EINVAL ;
}
}
}
return 0 ;
}
int live_engine_pm_selftests ( struct intel_gt * gt )
{
static const struct i915_subtest tests [ ] = {
2020-12-23 12:23:58 +00:00
SUBTEST ( live_engine_timestamps ) ,
2020-06-17 14:09:15 +01:00
SUBTEST ( live_engine_busy_stats ) ,
2019-08-08 21:27:58 +01:00
SUBTEST ( live_engine_pm ) ,
} ;
return intel_gt_live_subtests ( tests , gt ) ;
}