/*
 * Copyright © 2016 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 *
 */

#include <drm/drm_print.h>

#include "gem/i915_gem_context.h"

#include "i915_drv.h"

#include "intel_breadcrumbs.h"
#include "intel_context.h"
#include "intel_engine.h"
#include "intel_engine_pm.h"
#include "intel_engine_user.h"
#include "intel_execlists_submission.h"
#include "intel_gt.h"
#include "intel_gt_requests.h"
#include "intel_gt_pm.h"
#include "intel_lrc_reg.h"
#include "intel_reset.h"
#include "intel_ring.h"
#include "uc/intel_guc_submission.h"

/* Haswell does have the CXT_SIZE register however it does not appear to be
 * valid. Now, docs explain in dwords what is in the context object. The full
 * size is 70720 bytes, however, the power context and execlist context will
 * never be saved (power context is stored elsewhere, and execlists don't work
 * on HSW) - so the final size, including the extra state required for the
 * Resource Streamer, is 66944 bytes, which rounds to 17 pages.
 */
#define HSW_CXT_TOTAL_SIZE		(17 * PAGE_SIZE)
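/* For illustration: 66944 / 4096 = 16.34..., hence the round up to 17 pages
 * (assuming the usual 4 KiB PAGE_SIZE).
 */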

#define DEFAULT_LR_CONTEXT_RENDER_SIZE	(22 * PAGE_SIZE)

#define GEN8_LR_CONTEXT_RENDER_SIZE	(20 * PAGE_SIZE)
#define GEN9_LR_CONTEXT_RENDER_SIZE	(22 * PAGE_SIZE)
#define GEN10_LR_CONTEXT_RENDER_SIZE	(18 * PAGE_SIZE)
#define GEN11_LR_CONTEXT_RENDER_SIZE	(14 * PAGE_SIZE)

#define GEN8_LR_CONTEXT_OTHER_SIZE	( 2 * PAGE_SIZE)

#define MAX_MMIO_BASES 3

struct engine_info {
	unsigned int hw_id;
	u8 class;
	u8 instance;
	/* mmio bases table *must* be sorted in reverse gen order */
	struct engine_mmio_base {
		u32 gen : 8;
		u32 base : 24;
	} mmio_bases[MAX_MMIO_BASES];
};

static const struct engine_info intel_engines[] = {
	[RCS0] = {
		.hw_id = RCS0_HW,
		.class = RENDER_CLASS,
		.instance = 0,
		.mmio_bases = {
			{ .gen = 1, .base = RENDER_RING_BASE }
		},
	},
	[BCS0] = {
		.hw_id = BCS0_HW,
		.class = COPY_ENGINE_CLASS,
		.instance = 0,
		.mmio_bases = {
			{ .gen = 6, .base = BLT_RING_BASE }
		},
	},
	[VCS0] = {
		.hw_id = VCS0_HW,
		.class = VIDEO_DECODE_CLASS,
		.instance = 0,
		.mmio_bases = {
			{ .gen = 11, .base = GEN11_BSD_RING_BASE },
			{ .gen = 6, .base = GEN6_BSD_RING_BASE },
			{ .gen = 4, .base = BSD_RING_BASE }
		},
	},
	[VCS1] = {
		.hw_id = VCS1_HW,
		.class = VIDEO_DECODE_CLASS,
		.instance = 1,
		.mmio_bases = {
			{ .gen = 11, .base = GEN11_BSD2_RING_BASE },
			{ .gen = 8, .base = GEN8_BSD2_RING_BASE }
		},
	},
	[VCS2] = {
		.hw_id = VCS2_HW,
		.class = VIDEO_DECODE_CLASS,
		.instance = 2,
		.mmio_bases = {
			{ .gen = 11, .base = GEN11_BSD3_RING_BASE }
		},
	},
	[VCS3] = {
		.hw_id = VCS3_HW,
		.class = VIDEO_DECODE_CLASS,
		.instance = 3,
		.mmio_bases = {
			{ .gen = 11, .base = GEN11_BSD4_RING_BASE }
		},
	},
	[VECS0] = {
		.hw_id = VECS0_HW,
		.class = VIDEO_ENHANCEMENT_CLASS,
		.instance = 0,
		.mmio_bases = {
			{ .gen = 11, .base = GEN11_VEBOX_RING_BASE },
			{ .gen = 7, .base = VEBOX_RING_BASE }
		},
	},
	[VECS1] = {
		.hw_id = VECS1_HW,
		.class = VIDEO_ENHANCEMENT_CLASS,
		.instance = 1,
		.mmio_bases = {
			{ .gen = 11, .base = GEN11_VEBOX2_RING_BASE }
		},
	},
};

/**
 * intel_engine_context_size() - return the size of the context for an engine
 * @gt: the gt
 * @class: engine class
 *
 * Each engine class may require a different amount of space for a context
 * image.
 *
 * Return: size (in bytes) of an engine class specific context image
 *
 * Note: this size includes the HWSP, which is part of the context image
 * in LRC mode, but does not include the "shared data page" used with
 * GuC submission. The caller should account for this if using the GuC.
 */
u32 intel_engine_context_size(struct intel_gt *gt, u8 class)
{
	struct intel_uncore *uncore = gt->uncore;
	u32 cxt_size;

	BUILD_BUG_ON(I915_GTT_PAGE_SIZE != PAGE_SIZE);

	switch (class) {
	case RENDER_CLASS:
		switch (INTEL_GEN(gt->i915)) {
		default:
			MISSING_CASE(INTEL_GEN(gt->i915));
			return DEFAULT_LR_CONTEXT_RENDER_SIZE;
		case 12:
		case 11:
			return GEN11_LR_CONTEXT_RENDER_SIZE;
		case 10:
			return GEN10_LR_CONTEXT_RENDER_SIZE;
		case 9:
			return GEN9_LR_CONTEXT_RENDER_SIZE;
		case 8:
			return GEN8_LR_CONTEXT_RENDER_SIZE;
		case 7:
			if (IS_HASWELL(gt->i915))
				return HSW_CXT_TOTAL_SIZE;

			cxt_size = intel_uncore_read(uncore, GEN7_CXT_SIZE);
			return round_up(GEN7_CXT_TOTAL_SIZE(cxt_size) * 64,
					PAGE_SIZE);
		case 6:
			cxt_size = intel_uncore_read(uncore, CXT_SIZE);
			return round_up(GEN6_CXT_TOTAL_SIZE(cxt_size) * 64,
					PAGE_SIZE);
		case 5:
		case 4:
			/*
			 * There is a discrepancy here between the size reported
			 * by the register and the size of the context layout
			 * in the docs. Both are described as authoritative!
			 *
			 * The discrepancy is on the order of a few cachelines,
			 * but the total is under one page (4k), which is our
			 * minimum allocation anyway so it should all come
			 * out in the wash.
			 */
			cxt_size = intel_uncore_read(uncore, CXT_SIZE) + 1;
			drm_dbg(&gt->i915->drm,
				"gen%d CXT_SIZE = %d bytes [0x%08x]\n",
				INTEL_GEN(gt->i915), cxt_size * 64,
				cxt_size - 1);
			return round_up(cxt_size * 64, PAGE_SIZE);
		case 3:
		case 2:
		/* For the special day when i810 gets merged. */
		case 1:
			return 0;
		}
		break;
	default:
		MISSING_CASE(class);
		fallthrough;
	case VIDEO_DECODE_CLASS:
	case VIDEO_ENHANCEMENT_CLASS:
	case COPY_ENGINE_CLASS:
		if (INTEL_GEN(gt->i915) < 8)
			return 0;
		return GEN8_LR_CONTEXT_OTHER_SIZE;
	}
}
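
/*
 * Pick an mmio base from the reverse-gen-sorted table: the first entry whose
 * minimum gen the running device satisfies wins. For example, with the VCS0
 * table above, a gen11 part resolves to GEN11_BSD_RING_BASE while a gen7 part
 * falls through to GEN6_BSD_RING_BASE.
 */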
static u32 __engine_mmio_base(struct drm_i915_private *i915,
			      const struct engine_mmio_base *bases)
{
	int i;

	for (i = 0; i < MAX_MMIO_BASES; i++)
		if (INTEL_GEN(i915) >= bases[i].gen)
			break;

	GEM_BUG_ON(i == MAX_MMIO_BASES);
	GEM_BUG_ON(!bases[i].base);

	return bases[i].base;
}

static void __sprint_engine_name(struct intel_engine_cs *engine)
{
	/*
	 * Before we know what the uABI name for this engine will be,
	 * we still would like to keep track of this engine in the debug logs.
	 * We throw in a ' here as a reminder that this isn't its final name.
	 */
	GEM_WARN_ON(snprintf(engine->name, sizeof(engine->name), "%s'%u",
			     intel_engine_class_repr(engine->class),
			     engine->instance) >= sizeof(engine->name));
}

void intel_engine_set_hwsp_writemask(struct intel_engine_cs *engine, u32 mask)
{
	/*
	 * Though they added more rings on g4x/ilk, they did not add
	 * per-engine HWSTAM until gen6.
	 */
	if (INTEL_GEN(engine->i915) < 6 && engine->class != RENDER_CLASS)
		return;

	if (INTEL_GEN(engine->i915) >= 3)
		ENGINE_WRITE(engine, RING_HWSTAM, mask);
	else
		ENGINE_WRITE16(engine, RING_HWSTAM, mask);
}

static void intel_engine_sanitize_mmio(struct intel_engine_cs *engine)
{
	/* Mask off all writes into the unknown HWSP */
	intel_engine_set_hwsp_writemask(engine, ~0u);
}
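
/*
 * Allocate the intel_engine_cs for one engine slot, fill in its static
 * description (class, instance, mmio base, default scheduling properties)
 * and publish it in gt->engine[] and gt->engine_class[][].
 */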
static int intel_engine_setup(struct intel_gt *gt, enum intel_engine_id id)
{
	const struct engine_info *info = &intel_engines[id];
	struct drm_i915_private *i915 = gt->i915;
	struct intel_engine_cs *engine;

	BUILD_BUG_ON(MAX_ENGINE_CLASS >= BIT(GEN11_ENGINE_CLASS_WIDTH));
	BUILD_BUG_ON(MAX_ENGINE_INSTANCE >= BIT(GEN11_ENGINE_INSTANCE_WIDTH));

	if (GEM_DEBUG_WARN_ON(id >= ARRAY_SIZE(gt->engine)))
		return -EINVAL;

	if (GEM_DEBUG_WARN_ON(info->class > MAX_ENGINE_CLASS))
		return -EINVAL;

	if (GEM_DEBUG_WARN_ON(info->instance > MAX_ENGINE_INSTANCE))
		return -EINVAL;

	if (GEM_DEBUG_WARN_ON(gt->engine_class[info->class][info->instance]))
		return -EINVAL;

	engine = kzalloc(sizeof(*engine), GFP_KERNEL);
	if (!engine)
		return -ENOMEM;

	BUILD_BUG_ON(BITS_PER_TYPE(engine->mask) < I915_NUM_ENGINES);

	engine->id = id;
	engine->legacy_idx = INVALID_ENGINE;
	engine->mask = BIT(id);
	engine->i915 = i915;
	engine->gt = gt;
	engine->uncore = gt->uncore;
	engine->mmio_base = __engine_mmio_base(i915, info->mmio_bases);
	engine->hw_id = info->hw_id;
	engine->guc_id = MAKE_GUC_ID(info->class, info->instance);

	engine->class = info->class;
	engine->instance = info->instance;
	__sprint_engine_name(engine);

	engine->props.heartbeat_interval_ms =
		CONFIG_DRM_I915_HEARTBEAT_INTERVAL;
	engine->props.max_busywait_duration_ns =
		CONFIG_DRM_I915_MAX_REQUEST_BUSYWAIT;
	engine->props.preempt_timeout_ms =
		CONFIG_DRM_I915_PREEMPT_TIMEOUT;
	engine->props.stop_timeout_ms =
		CONFIG_DRM_I915_STOP_TIMEOUT;
	engine->props.timeslice_duration_ms =
		CONFIG_DRM_I915_TIMESLICE_DURATION;

	/* Override to uninterruptible for OpenCL workloads. */
	if (INTEL_GEN(i915) == 12 && engine->class == RENDER_CLASS)
		engine->props.preempt_timeout_ms = 0;

	engine->defaults = engine->props; /* never to change again */

	engine->context_size = intel_engine_context_size(gt, engine->class);
	if (WARN_ON(engine->context_size > BIT(20)))
		engine->context_size = 0;
	if (engine->context_size)
		DRIVER_CAPS(i915)->has_logical_contexts = true;

	/* Nothing to do here, execute in order of dependencies */
	engine->schedule = NULL;

	ewma__engine_latency_init(&engine->latency);
	seqlock_init(&engine->stats.lock);

	ATOMIC_INIT_NOTIFIER_HEAD(&engine->context_status_notifier);

	/* Scrub mmio state on takeover */
	intel_engine_sanitize_mmio(engine);

	gt->engine_class[info->class][info->instance] = engine;
	gt->engine[id] = engine;

	return 0;
}
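
/*
 * Derive the uABI capability flags (HEVC, SFC) advertised for this engine
 * from its class, instance and gen.
 */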
static void __setup_engine_capabilities(struct intel_engine_cs *engine)
{
	struct drm_i915_private *i915 = engine->i915;

	if (engine->class == VIDEO_DECODE_CLASS) {
		/*
		 * HEVC support is present on first engine instance
		 * before Gen11 and on all instances afterwards.
		 */
		if (INTEL_GEN(i915) >= 11 ||
		    (INTEL_GEN(i915) >= 9 && engine->instance == 0))
			engine->uabi_capabilities |=
				I915_VIDEO_CLASS_CAPABILITY_HEVC;

		/*
		 * SFC block is present only on even logical engine
		 * instances.
		 */
		if ((INTEL_GEN(i915) >= 11 &&
		     (engine->gt->info.vdbox_sfc_access &
		      BIT(engine->instance))) ||
		    (INTEL_GEN(i915) >= 9 && engine->instance == 0))
			engine->uabi_capabilities |=
				I915_VIDEO_AND_ENHANCE_CLASS_CAPABILITY_SFC;
	} else if (engine->class == VIDEO_ENHANCEMENT_CLASS) {
		if (INTEL_GEN(i915) >= 9)
			engine->uabi_capabilities |=
				I915_VIDEO_AND_ENHANCE_CLASS_CAPABILITY_SFC;
	}
}

static void intel_setup_engine_capabilities(struct intel_gt *gt)
{
	struct intel_engine_cs *engine;
	enum intel_engine_id id;

	for_each_engine(engine, gt, id)
		__setup_engine_capabilities(engine);
}

/**
 * intel_engines_release() - free the resources allocated for Command Streamers
 * @gt: pointer to struct intel_gt
 */
void intel_engines_release(struct intel_gt *gt)
{
	struct intel_engine_cs *engine;
	enum intel_engine_id id;

	/*
	 * Before we release the resources held by engine, we must be certain
	 * that the HW is no longer accessing them -- having the GPU scribble
	 * to or read from a page being used for something else causes no end
	 * of fun.
	 *
	 * The GPU should be reset by this point, but assume the worst just
	 * in case we aborted before completely initialising the engines.
	 */
	GEM_BUG_ON(intel_gt_pm_is_awake(gt));
	if (!INTEL_INFO(gt->i915)->gpu_reset_clobbers_display)
		__intel_gt_reset(gt, ALL_ENGINES);

	/* Decouple the backend; but keep the layout for late GPU resets */
	for_each_engine(engine, gt, id) {
		if (!engine->release)
			continue;

		intel_wakeref_wait_for_idle(&engine->wakeref);
		GEM_BUG_ON(intel_engine_pm_is_awake(engine));

		engine->release(engine);
		engine->release = NULL;

		memset(&engine->reset, 0, sizeof(engine->reset));
	}
}
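
/* Return the spare request kept on this engine, if any, back to the request slab. */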
void intel_engine_free_request_pool(struct intel_engine_cs *engine)
{
	if (!engine->request_pool)
		return;

	kmem_cache_free(i915_request_slab_cache(), engine->request_pool);
}

void intel_engines_free(struct intel_gt *gt)
{
	struct intel_engine_cs *engine;
	enum intel_engine_id id;

	/* Free the requests! dma-resv keeps fences around for an eternity */
	rcu_barrier();

	for_each_engine(engine, gt, id) {
		intel_engine_free_request_pool(engine);
		kfree(engine);
		gt->engine[id] = NULL;
	}
}

/*
 * Determine which engines are fused off in our particular hardware.
 * Note that we have a catch-22 situation where we need to be able to access
 * the blitter forcewake domain to read the engine fuses, but at the same time
 * we need to know which engines are available on the system to know which
 * forcewake domains are present. We solve this by initializing the forcewake
 * domains based on the full engine mask in the platform capabilities before
 * calling this function and pruning the domains for fused-off engines
 * afterwards.
 */
static intel_engine_mask_t init_engine_mask(struct intel_gt *gt)
{
	struct drm_i915_private *i915 = gt->i915;
	struct intel_gt_info *info = &gt->info;
	struct intel_uncore *uncore = gt->uncore;
	unsigned int logical_vdbox = 0;
	unsigned int i;
	u32 media_fuse;
	u16 vdbox_mask;
	u16 vebox_mask;

	info->engine_mask = INTEL_INFO(i915)->platform_engine_mask;

	if (INTEL_GEN(i915) < 11)
		return info->engine_mask;

	media_fuse = ~intel_uncore_read(uncore, GEN11_GT_VEBOX_VDBOX_DISABLE);
	vdbox_mask = media_fuse & GEN11_GT_VDBOX_DISABLE_MASK;
	vebox_mask = (media_fuse & GEN11_GT_VEBOX_DISABLE_MASK) >>
		     GEN11_GT_VEBOX_DISABLE_SHIFT;

	for (i = 0; i < I915_MAX_VCS; i++) {
		if (!HAS_ENGINE(gt, _VCS(i))) {
			vdbox_mask &= ~BIT(i);
			continue;
		}

		if (!(BIT(i) & vdbox_mask)) {
			info->engine_mask &= ~BIT(_VCS(i));
			drm_dbg(&i915->drm, "vcs%u fused off\n", i);
			continue;
		}

		/*
		 * In Gen11, only even numbered logical VDBOXes are
		 * hooked up to an SFC (Scaler & Format Converter) unit.
		 * In TGL each VDBOX has access to an SFC.
		 */
		if (INTEL_GEN(i915) >= 12 || logical_vdbox++ % 2 == 0)
			gt->info.vdbox_sfc_access |= BIT(i);
	}
	drm_dbg(&i915->drm, "vdbox enable: %04x, instances: %04lx\n",
		vdbox_mask, VDBOX_MASK(gt));
	GEM_BUG_ON(vdbox_mask != VDBOX_MASK(gt));

	for (i = 0; i < I915_MAX_VECS; i++) {
		if (!HAS_ENGINE(gt, _VECS(i))) {
			vebox_mask &= ~BIT(i);
			continue;
		}

		if (!(BIT(i) & vebox_mask)) {
			info->engine_mask &= ~BIT(_VECS(i));
			drm_dbg(&i915->drm, "vecs%u fused off\n", i);
		}
	}
	drm_dbg(&i915->drm, "vebox enable: %04x, instances: %04lx\n",
		vebox_mask, VEBOX_MASK(gt));
	GEM_BUG_ON(vebox_mask != VEBOX_MASK(gt));

	return info->engine_mask;
}

/**
 * intel_engines_init_mmio() - allocate and prepare the Engine Command Streamers
 * @gt: pointer to struct intel_gt
 *
 * Return: non-zero if the initialization failed.
 */
int intel_engines_init_mmio(struct intel_gt *gt)
{
	struct drm_i915_private *i915 = gt->i915;
	const unsigned int engine_mask = init_engine_mask(gt);
	unsigned int mask = 0;
	unsigned int i;
	int err;

	drm_WARN_ON(&i915->drm, engine_mask == 0);
	drm_WARN_ON(&i915->drm, engine_mask &
		    GENMASK(BITS_PER_TYPE(mask) - 1, I915_NUM_ENGINES));

	if (i915_inject_probe_failure(i915))
		return -ENODEV;

	for (i = 0; i < ARRAY_SIZE(intel_engines); i++) {
		if (!HAS_ENGINE(gt, i))
			continue;

		err = intel_engine_setup(gt, i);
		if (err)
			goto cleanup;

		mask |= BIT(i);
	}

	/*
	 * Catch failures to update intel_engines table when the new engines
	 * are added to the driver by a warning and disabling the forgotten
	 * engines.
	 */
	if (drm_WARN_ON(&i915->drm, mask != engine_mask))
		gt->info.engine_mask = mask;

	gt->info.num_engines = hweight32(mask);

	intel_gt_check_and_clear_faults(gt);

	intel_setup_engine_capabilities(gt);

	intel_uncore_prune_engine_fw_domains(gt->uncore, gt);

	return 0;

cleanup:
	intel_engines_free(gt);
	return err;
}
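
/*
 * Reset the execlists submission state: clear the pending/inflight port
 * tracking and start with an empty priority queue.
 */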
void intel_engine_init_execlists(struct intel_engine_cs *engine)
{
	struct intel_engine_execlists * const execlists = &engine->execlists;

	execlists->port_mask = 1;
	GEM_BUG_ON(!is_power_of_2(execlists_num_ports(execlists)));
	GEM_BUG_ON(execlists_num_ports(execlists) > EXECLIST_MAX_PORTS);
drm/i915/execlists: Preempt-to-busy
When using a global seqno, we required a precise stop-the-workd event to
handle preemption and unwind the global seqno counter. To accomplish
this, we would preempt to a special out-of-band context and wait for the
machine to report that it was idle. Given an idle machine, we could very
precisely see which requests had completed and which we needed to feed
back into the run queue.
However, now that we have scrapped the global seqno, we no longer need
to precisely unwind the global counter and only track requests by their
per-context seqno. This allows us to loosely unwind inflight requests
while scheduling a preemption, with the enormous caveat that the
requests we put back on the run queue are still _inflight_ (until the
preemption request is complete). This makes request tracking much more
messy, as at any point we can see a completed request that we
believe is not currently scheduled for execution. We also have to be
careful not to rewind RING_TAIL past RING_HEAD on preempting to the
running context, and for this we use a semaphore to prevent completion
of the request before continuing.
To accomplish this feat, we change how we track requests scheduled to
the HW. Instead of appending our requests onto a single list as we
submit, we track each submission to ELSP as its own block. Then upon
receiving the CS preemption event, we promote the pending block to the
inflight block (discarding what was previously being tracked). As normal
CS completion events arrive, we then remove stale entries from the
inflight tracker.
v2: Be a tinge paranoid and ensure we flush the write into the HWS page
for the GPU semaphore to pick in a timely fashion.
Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Reviewed-by: Mika Kuoppala <mika.kuoppala@linux.intel.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20190620142052.19311-1-chris@chris-wilson.co.uk
2019-06-20 15:20:51 +01:00
	memset(execlists->pending, 0, sizeof(execlists->pending));
	execlists->active =
		memset(execlists->inflight, 0, sizeof(execlists->inflight));
2019-01-29 18:54:51 +00:00
	execlists->queue_priority_hint = INT_MIN;
2018-06-29 08:53:20 +01:00
	execlists->queue = RB_ROOT_CACHED;
2017-09-22 15:43:04 +03:00
}
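
To make the pending/inflight split from the preempt-to-busy notes above easier to picture, here is a rough, self-contained sketch in plain C. The names and types are invented for illustration; this is not the driver's tracking code, only the same promote-then-prune idea: submissions are staged in a small pending[] block, a CS promotion event moves that block wholesale into inflight[], and completed entries are pruned as completion events arrive.

#include <stddef.h>
#include <stdio.h>
#include <string.h>

#define MAX_PORTS 2

struct fake_request { int seqno; int completed; };

struct fake_execlists {
	struct fake_request *inflight[MAX_PORTS + 1]; /* NULL-terminated */
	struct fake_request *pending[MAX_PORTS + 1];
};

/* On the CS promotion event: what was pending becomes the inflight block. */
static void promote_pending(struct fake_execlists *el)
{
	memcpy(el->inflight, el->pending, sizeof(el->pending));
	memset(el->pending, 0, sizeof(el->pending));
}

/* On CS completion events: drop stale (completed) entries from inflight. */
static void prune_inflight(struct fake_execlists *el)
{
	size_t n = 0;

	for (size_t i = 0; el->inflight[i]; i++)
		if (!el->inflight[i]->completed)
			el->inflight[n++] = el->inflight[i];
	while (n <= MAX_PORTS)
		el->inflight[n++] = NULL;
}

int main(void)
{
	struct fake_request a = { 1, 0 }, b = { 2, 0 };
	struct fake_execlists el = { { NULL }, { NULL } };

	el.pending[0] = &a;
	el.pending[1] = &b;
	promote_pending(&el);	/* ELSP submission acknowledged by the CS */
	a.completed = 1;
	prune_inflight(&el);	/* request 1 retired, request 2 still inflight */
	printf("first inflight seqno: %d\n", el.inflight[0]->seqno);
	return 0;
}

The key property mirrored here is that nothing is unwound on promotion: requests only leave the tracker once a completion event marks them stale.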
2018-09-03 16:23:03 +01:00
static void cleanup_status_page(struct intel_engine_cs *engine)
2017-09-13 09:56:02 +01:00
{
2019-01-28 10:23:55 +00:00
	struct i915_vma *vma;
2018-12-18 10:27:12 +00:00
	/* Prevent writes into HWSP after returning the page to the system */
	intel_engine_set_hwsp_writemask(engine, ~0u);
2019-01-28 10:23:55 +00:00
	vma = fetch_and_zero(&engine->status_page.vma);
	if (!vma)
		return;
2017-09-13 09:56:02 +01:00
2019-01-28 10:23:55 +00:00
	if (!HWS_NEEDS_PHYSICAL(engine->i915))
		i915_vma_unpin(vma);
	i915_gem_object_unpin_map(vma->obj);
2019-05-28 10:29:56 +01:00
	i915_gem_object_put(vma->obj);
2019-01-28 10:23:55 +00:00
}
static int pin_ggtt_status_page(struct intel_engine_cs *engine,
				struct i915_vma *vma)
{
	unsigned int flags;
2019-10-29 09:58:54 +00:00
	if (!HAS_LLC(engine->i915) && i915_ggtt_has_aperture(engine->gt->ggtt))
2019-01-28 10:23:55 +00:00
		/*
		 * On g33, we cannot place HWS above 256MiB, so
		 * restrict its pinning to the low mappable arena.
		 * Though this restriction is not documented for
		 * gen4, gen5, or byt, they also behave similarly
		 * and hang if the HWS is placed at the top of the
		 * GTT. To generalise, it appears that all !llc
		 * platforms have issues with us placing the HWS
		 * above the mappable region (even though we never
		 * actually map it).
		 */
2020-01-30 18:17:10 +00:00
		flags = PIN_MAPPABLE;
2019-01-28 10:23:55 +00:00
	else
2020-01-30 18:17:10 +00:00
		flags = PIN_HIGH;
2017-09-13 09:56:02 +01:00
2020-08-19 16:08:54 +02:00
	return i915_ggtt_pin(vma, NULL, 0, flags);
2017-09-13 09:56:02 +01:00
}
static int init_status_page(struct intel_engine_cs *engine)
{
	struct drm_i915_gem_object *obj;
	struct i915_vma *vma;
	void *vaddr;
	int ret;
2020-12-22 10:42:42 +00:00
	INIT_LIST_HEAD(&engine->status_page.timelines);
2019-01-28 10:23:55 +00:00
	/*
	 * Though the HWS register does support 36bit addresses, historically
	 * we have had hangs and corruption reported due to wild writes if
	 * the HWS is placed above 4G. We only allow objects to be allocated
	 * in GFP_DMA32 for i965, and no earlier physical address users had
	 * access to more than 4G.
	 */
2017-09-13 09:56:02 +01:00
	obj = i915_gem_object_create_internal(engine->i915, PAGE_SIZE);
	if (IS_ERR(obj)) {
2020-01-28 10:14:33 +03:00
		drm_err(&engine->i915->drm,
			"Failed to allocate status page\n");
2017-09-13 09:56:02 +01:00
		return PTR_ERR(obj);
	}
drm/i915: Flush pages on acquisition
When we return pages to the system, we ensure that they are marked as
being in the CPU domain since any external access is uncontrolled and we
must assume the worst. This means that we need to always flush the pages
on acquisition if we need to use them on the GPU, and from the beginning
have used set-domain. Set-domain is overkill for the purpose as it is a
general synchronisation barrier, but our intent is to only flush the
pages being swapped in. If we move that flush into the pages acquisition
phase, we know then that when we have obj->mm.pages, they are coherent
with the GPU and need only maintain that status without resorting to
heavy handed use of set-domain.
The principal knock-on effect for userspace is through mmap-gtt
pagefaulting. Our uAPI has always implied that the GTT mmap was async
(especially as when any pagefault occurs is unpredictable to userspace)
and so userspace had to apply explicit domain control itself
(set-domain). However, swapping is transparent to the kernel, and so on
first fault we need to acquire the pages and make them coherent for
access through the GTT. Our use of set-domain here leaks into the uABI
that the first pagefault was synchronous. This is unintentional and
barring a few igt should go unnoticed; nevertheless we bump the uABI
version for mmap-gtt to reflect the change in behaviour.
Another implication of the change is that gem_create() is presumed to
create an object that is coherent with the CPU and is in the CPU write
domain, so a set-domain(CPU) following a gem_create() would be a minor
operation that merely checked whether we could allocate all pages for
the object. On applying this change, a set-domain(CPU) causes a clflush
as we acquire the pages. This will have a small impact on mesa as we move
the clflush here on !llc from execbuf time to create, but that should
have minimal performance impact as the same clflush exists but is now
done early and because of the clflush issue, userspace recycles bo and
so should resist allocating fresh objects.
Internally, the presumption that objects are created in the CPU
write-domain and remain so through writes to obj->mm.mapping is more
prevalent than I expected; but easy enough to catch and apply a manual
flush.
For the future, we should push the page flush from the central
set_pages() into the callers so that we can more finely control when it
is applied, but for now doing it in one location is easier to validate, at
the cost of sometimes flushing when there is no need.
Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Matthew Auld <matthew.william.auld@gmail.com>
Cc: Daniele Ceraolo Spurio <daniele.ceraolospurio@intel.com>
Cc: Antonio Argenziano <antonio.argenziano@intel.com>
Cc: Joonas Lahtinen <joonas.lahtinen@linux.intel.com>
Reviewed-by: Matthew Auld <matthew.william.auld@gmail.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20190321161908.8007-1-chris@chris-wilson.co.uk
2019-03-21 16:19:07 +00:00
	i915_gem_object_set_cache_coherency(obj, I915_CACHE_LLC);
2017-09-13 09:56:02 +01:00
2019-06-21 08:08:08 +01:00
	vma = i915_vma_instance(obj, &engine->gt->ggtt->vm, NULL);
2017-09-13 09:56:02 +01:00
	if (IS_ERR(vma)) {
		ret = PTR_ERR(vma);
		goto err;
	}
	vaddr = i915_gem_object_pin_map(obj, I915_MAP_WB);
	if (IS_ERR(vaddr)) {
		ret = PTR_ERR(vaddr);
2019-01-28 10:23:55 +00:00
		goto err;
2017-09-13 09:56:02 +01:00
	}
2019-01-28 10:23:55 +00:00
	engine->status_page.addr = memset(vaddr, 0, PAGE_SIZE);
2017-09-13 09:56:02 +01:00
	engine->status_page.vma = vma;
2019-01-28 10:23:55 +00:00
	if (!HWS_NEEDS_PHYSICAL(engine->i915)) {
		ret = pin_ggtt_status_page(engine, vma);
		if (ret)
			goto err_unpin;
	}
2017-09-13 09:56:02 +01:00
	return 0;
err_unpin:
2019-01-28 10:23:55 +00:00
	i915_gem_object_unpin_map(obj);
2017-09-13 09:56:02 +01:00
err:
	i915_gem_object_put(obj);
	return ret;
}
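
For readers unfamiliar with the HWSP, the following stand-alone sketch shows the role the status page plays, under heavy simplification: a plain heap page stands in for the pinned, write-back mapped engine->status_page.addr, a helper stands in for the GPU's breadcrumb write, and completion checks reduce to a wrap-safe seqno comparison. The offset and names are invented for illustration.

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

#define FAKE_PAGE_SIZE 4096
#define SEQNO_OFFSET   0x100 /* hypothetical slot within the page */

static uint32_t *status_page; /* CPU-visible view of the shared page */

/* Stand-in for the GPU emitting a breadcrumb write into the HWSP. */
static void fake_gpu_write_breadcrumb(uint32_t seqno)
{
	status_page[SEQNO_OFFSET / sizeof(uint32_t)] = seqno;
}

/* Wrap-safe "has the target seqno been reached" test. */
static int seqno_passed(uint32_t current, uint32_t target)
{
	return (int32_t)(current - target) >= 0;
}

int main(void)
{
	status_page = calloc(1, FAKE_PAGE_SIZE);
	if (!status_page)
		return 1;

	fake_gpu_write_breadcrumb(42);
	printf("request 41 done? %d\n",
	       seqno_passed(status_page[SEQNO_OFFSET / sizeof(uint32_t)], 41));
	free(status_page);
	return 0;
}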
2019-12-22 14:40:45 +00:00
static int engine_setup_common(struct intel_engine_cs *engine)
2019-01-28 18:18:09 +00:00
{
	int err;
drm/i915: Keep contexts pinned until after the next kernel context switch
We need to keep the context image pinned in memory until after the GPU
has finished writing into it. Since it continues to write as we signal
the final breadcrumb, we need to keep it pinned until the request after
it is complete. Currently we know the order in which requests execute on
each engine, and so to remove that presumption we need to identify a
request/context-switch we know must occur after our completion. Any
request queued after the signal must imply a context switch, for
simplicity we use a fresh request from the kernel context.
The sequence of operations for keeping the context pinned until saved is:
- On context activation, we preallocate a node for each physical engine
the context may operate on. This is to avoid allocations during
unpinning, which may be from inside FS_RECLAIM context (aka the
shrinker)
- On context deactivation on retirement of the last active request (which
is before we know the context has been saved), we add the
preallocated node onto a barrier list on each engine
- On engine idling, we emit a switch to kernel context. When this
switch completes, we know that all previous contexts must have been
saved, and so on retiring this request we can finally unpin all the
contexts that were marked as deactivated prior to the switch.
We can enhance this in future by flushing all the idle contexts on a
regular heartbeat pulse of a switch to kernel context, which will also
be used to check for hung engines.
v2: intel_context_active_acquire/_release
Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Mika Kuoppala <mika.kuoppala@linux.intel.com>
Reviewed-by: Mika Kuoppala <mika.kuoppala@linux.intel.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20190614164606.15633-1-chris@chris-wilson.co.uk
2019-06-14 17:46:04 +01:00
	init_llist_head(&engine->barrier_tasks);
2019-01-28 18:18:09 +00:00
	err = init_status_page(engine);
	if (err)
		return err;
2020-07-31 16:48:34 +01:00
	engine->breadcrumbs = intel_breadcrumbs_create(engine);
	if (!engine->breadcrumbs) {
		err = -ENOMEM;
		goto err_status;
	}
2019-06-14 17:46:06 +01:00
	intel_engine_init_active(engine, ENGINE_PHYSICAL);
2019-04-24 21:07:17 +01:00
	intel_engine_init_execlists(engine);
2019-01-28 18:18:09 +00:00
	intel_engine_init_cmd_parser(engine);
2019-04-24 21:07:17 +01:00
	intel_engine_init__pm(engine);
2019-11-25 10:58:58 +00:00
	intel_engine_init_retire(engine);
2019-01-28 18:18:09 +00:00
2019-04-24 10:51:34 +01:00
	/* Use the whole device by default */
	engine->sseu =
2020-07-07 17:39:50 -07:00
		intel_sseu_from_device_info(&engine->gt->info.sseu);
2019-04-24 10:51:34 +01:00
2019-07-03 14:58:05 +01:00
	intel_engine_init_workarounds(engine);
	intel_engine_init_whitelist(engine);
	intel_engine_init_ctx_wa(engine);
2021-01-12 18:12:36 -08:00
	if (INTEL_GEN(engine->i915) >= 12)
		engine->flags |= I915_ENGINE_HAS_RELATIVE_MMIO;
2019-01-28 18:18:09 +00:00
	return 0;
2020-07-31 16:48:34 +01:00
err_status:
	cleanup_status_page(engine);
	return err;
2019-01-28 18:18:09 +00:00
}
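
The barrier-list scheme in the "Keep contexts pinned" notes above can be sketched in a few lines of stand-alone C. The types are invented and the lock-free llist and request machinery are elided; the point is only the shape of the idea: a deactivated context pushes a preallocated node onto its engine, and the nodes are drained (contexts unpinned) only after a later switch to the kernel context has retired.

#include <stdio.h>

struct barrier_node {
	struct barrier_node *next;
	const char *context_name;
};

struct fake_engine {
	struct barrier_node *barrier_tasks; /* push-only list of idle contexts */
};

/* Last request retired: park the preallocated node, no allocation needed. */
static void context_deactivate(struct fake_engine *e, struct barrier_node *node)
{
	node->next = e->barrier_tasks;
	e->barrier_tasks = node;
}

/* Called once the switch-to-kernel-context request has retired. */
static void engine_drain_barriers(struct fake_engine *e)
{
	struct barrier_node *node = e->barrier_tasks;

	e->barrier_tasks = NULL;
	while (node) {
		printf("unpinning %s\n", node->context_name);
		node = node->next;
	}
}

int main(void)
{
	struct fake_engine engine = { NULL };
	struct barrier_node a = { NULL, "ctx-a" }, b = { NULL, "ctx-b" };

	context_deactivate(&engine, &a);
	context_deactivate(&engine, &b);
	engine_drain_barriers(&engine);
	return 0;
}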
2019-01-25 10:05:20 +00:00
struct measure_breadcrumb {
	struct i915_request rq;
	struct intel_ring ring;
drm/i915/gt: Incrementally check for rewinding
In commit 5ba32c7be81e ("drm/i915/execlists: Always force a context
reload when rewinding RING_TAIL"), we placed the check for rewinding a
context on actually submitting the next request in that context. This
was so that we only had to check once, and could do so with precision
avoiding as many forced restores as possible. For example, to ensure
that we can resubmit the same request a couple of times, we include a
small wa_tail such that on the next submission, the ring->tail will
appear to move forwards when resubmitting the same request. This is very
common as it will happen for every lite-restore to fill the second port
after a context switch.
However, intel_ring_direction() is limited in precision to movements of
up to half the ring size. The consequence is that if we tried to
unwind many requests, we could exceed half the ring and flip the sense
of the direction, so missing a force restore. As no request can be
greater than half the ring (i.e. 2048 bytes in the smallest case), we
can check for rollback incrementally. As we check against the tail that
would be submitted, we do not lose any sensitivity and allow lite
restores for the simple case. We still need to double check upon
submitting the context, to allow for multiple preemptions and
resubmissions.
Fixes: 5ba32c7be81e ("drm/i915/execlists: Always force a context reload when rewinding RING_TAIL")
Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Mika Kuoppala <mika.kuoppala@linux.intel.com>
Cc: <stable@vger.kernel.org> # v5.4+
Reviewed-by: Bruce Chang <yu.bruce.chang@intel.com>
Reviewed-by: Mika Kuoppala <mika.kuoppala@linux.intel.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20200609151723.12971-1-chris@chris-wilson.co.uk
2020-06-09 16:17:23 +01:00
	u32 cs[2048];
2019-01-25 10:05:20 +00:00
};
2020-02-07 12:58:27 +00:00
static int measure_breadcrumb_dw(struct intel_context *ce)
2019-01-25 10:05:20 +00:00
{
2020-02-07 12:58:27 +00:00
	struct intel_engine_cs *engine = ce->engine;
2019-01-25 10:05:20 +00:00
	struct measure_breadcrumb *frame;
2020-02-22 13:47:55 +00:00
	int dw;
2019-01-25 10:05:20 +00:00
2019-06-21 08:08:11 +01:00
	GEM_BUG_ON(!engine->gt->scratch);
2019-01-25 10:05:20 +00:00
	frame = kzalloc(sizeof(*frame), GFP_KERNEL);
	if (!frame)
		return -ENOMEM;
2020-02-07 12:58:27 +00:00
	frame->rq.engine = engine;
	frame->rq.context = ce;
	rcu_assign_pointer(frame->rq.timeline, ce->timeline);
drm/i915: Mark i915_request.timeline as a volatile, rcu pointer
The request->timeline is only valid until the request is retired (i.e.
before it is completed). Upon retiring the request, the context may be
unpinned and freed, and along with it the timeline may be freed. We
therefore need to be very careful when chasing rq->timeline that the
pointer does not disappear beneath us. The vast majority of users are in
a protected context, either during request construction or retirement,
where the timeline->mutex is held and the timeline cannot disappear. It
is those few off the beaten path (where we access a second timeline) that
need extra scrutiny -- to be added in the next patch after first adding
the warnings about dangerous access.
One complication, where we cannot use the timeline->mutex itself, is
during request submission onto hardware (under spinlocks). Here, we want
to check on the timeline to finalize the breadcrumb, and so we need to
impose a second rule to ensure that the request->timeline is indeed
valid. As we are submitting the request, its context and timeline must
be pinned, as it will be used by the hardware. Since it is pinned, we
know the request->timeline must still be valid, and we cannot submit the
idle barrier until after we release the engine->active.lock, ergo while
submitting and holding that spinlock, a second thread cannot release the
timeline.
v2: Don't be lazy inside selftests; hold the timeline->mutex for as long
as we need it, and tidy up acquiring the timeline with a bit of
refactoring (i915_active_add_request)
Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
Reviewed-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20190919111912.21631-1-chris@chris-wilson.co.uk
2019-09-19 12:19:10 +01:00
2019-01-25 10:05:20 +00:00
	frame->ring.vaddr = frame->cs;
	frame->ring.size = sizeof(frame->cs);
2020-06-09 16:17:23 +01:00
	frame->ring.wrap =
		BITS_PER_TYPE(frame->ring.size) - ilog2(frame->ring.size);
2019-01-25 10:05:20 +00:00
	frame->ring.effective_size = frame->ring.size;
	intel_ring_update_space(&frame->ring);
	frame->rq.ring = &frame->ring;
2019-01-28 18:18:11 +00:00
2020-02-07 12:58:27 +00:00
	mutex_lock(&ce->timeline->mutex);
2019-09-19 12:19:10 +01:00
	spin_lock_irq(&engine->active.lock);
2020-02-07 12:58:27 +00:00
2019-01-29 18:54:50 +00:00
	dw = engine->emit_fini_breadcrumb(&frame->rq, frame->cs) - frame->cs;
2020-02-07 12:58:27 +00:00
2019-09-19 12:19:10 +01:00
	spin_unlock_irq(&engine->active.lock);
2020-02-07 12:58:27 +00:00
	mutex_unlock(&ce->timeline->mutex);
2019-09-19 12:19:10 +01:00
2019-05-08 09:06:25 +01:00
	GEM_BUG_ON(dw & 1); /* RING_TAIL must be qword aligned */
2019-01-25 10:05:20 +00:00
2019-01-28 18:18:09 +00:00
	kfree(frame);
2019-01-25 10:05:20 +00:00
	return dw;
}
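
The frame->ring.wrap value above feeds the rewind detection discussed in the "Incrementally check for rewinding" notes: shifting the difference of two ring offsets up by wrap bits turns it into a signed quantity whose sign says whether the tail moved forwards or was rewound, even across a wrap. A stand-alone sketch of that arithmetic, assuming 32-bit offsets, a power-of-two ring and two's-complement integers (this is not the driver's intel_ring_direction(), just the same trick):

#include <stdint.h>
#include <stdio.h>

static unsigned int ilog2_u32(uint32_t v)
{
	unsigned int n = 0;

	while (v >>= 1)
		n++;
	return n;
}

/* > 0: next is ahead of prev; < 0: next rewinds behind prev; 0: no motion. */
static int ring_direction(uint32_t size, uint32_t next, uint32_t prev)
{
	unsigned int wrap = 32 - ilog2_u32(size); /* BITS_PER_TYPE - ilog2 */

	return (int32_t)((next - prev) << wrap);
}

int main(void)
{
	const uint32_t size = 4096; /* hypothetical ring size */

	printf("forwards: %d\n", ring_direction(size, 64, 32) > 0);
	printf("rewound:  %d\n", ring_direction(size, 32, 64) < 0);
	printf("wrapped:  %d\n", ring_direction(size, 16, size - 16) > 0);
	return 0;
}

Because no request may be larger than half the ring, a movement of less than half the ring in either direction is unambiguous, which is what allows rollback to be checked incrementally.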
2019-06-14 17:46:06 +01:00
void
intel_engine_init_active(struct intel_engine_cs *engine, unsigned int subclass)
{
	INIT_LIST_HEAD(&engine->active.requests);
2020-01-16 18:47:53 +00:00
	INIT_LIST_HEAD(&engine->active.hold);
2019-06-14 17:46:06 +01:00
	spin_lock_init(&engine->active.lock);
	lockdep_set_subclass(&engine->active.lock, subclass);

	/*
	 * Due to an interesting quirk in lockdep's internal debug tracking,
	 * after setting a subclass we must ensure the lock is used. Otherwise,
	 * nr_unused_locks is incremented once too often.
	 */
#ifdef CONFIG_DEBUG_LOCK_ALLOC
	local_irq_disable();
	lock_map_acquire(&engine->active.lock.dep_map);
	lock_map_release(&engine->active.lock.dep_map);
	local_irq_enable();
#endif
}
2019-08-08 12:06:11 +01:00
static struct intel_context *
2020-07-30 19:39:06 +01:00
create_pinned_context(struct intel_engine_cs *engine,
		      unsigned int hwsp,
		      struct lock_class_key *key,
		      const char *name)
2019-08-08 12:06:11 +01:00
{
	struct intel_context *ce;
	int err;
2019-12-21 16:03:24 +00:00
	ce = intel_context_create(engine);
2019-08-08 12:06:11 +01:00
	if (IS_ERR(ce))
		return ce;
2019-12-21 16:03:24 +00:00
	__set_bit(CONTEXT_BARRIER_BIT, &ce->flags);
2020-07-30 19:39:06 +01:00
	ce->timeline = page_pack_bits(NULL, hwsp);
2019-08-09 19:25:17 +01:00
2019-12-21 16:03:24 +00:00
	err = intel_context_pin(ce); /* perma-pin so it is always available */
2019-08-08 12:06:11 +01:00
	if (err) {
		intel_context_put(ce);
		return ERR_PTR(err);
	}
2019-10-08 19:59:41 +01:00
	/*
	 * Give our perma-pinned kernel timelines a separate lockdep class,
	 * so that we can use them from within the normal user timelines
	 * should we need to inject GPU operations during their request
	 * construction.
	 */
2020-07-30 19:39:06 +01:00
	lockdep_set_class_and_name(&ce->timeline->mutex, key, name);
2019-10-08 19:59:41 +01:00
2019-08-08 12:06:11 +01:00
	return ce;
}
2020-12-22 10:42:42 +00:00
static void destroy_pinned_context(struct intel_context *ce)
{
	struct intel_engine_cs *engine = ce->engine;
	struct i915_vma *hwsp = engine->status_page.vma;

	GEM_BUG_ON(ce->timeline->hwsp_ggtt != hwsp);

	mutex_lock(&hwsp->vm->mutex);
	list_del(&ce->timeline->engine_link);
	mutex_unlock(&hwsp->vm->mutex);

	intel_context_unpin(ce);
	intel_context_put(ce);
}
2020-07-30 19:39:06 +01:00
static struct intel_context *
create_kernel_context(struct intel_engine_cs *engine)
{
	static struct lock_class_key kernel;

	return create_pinned_context(engine, I915_GEM_HWS_SEQNO_ADDR,
				     &kernel, "kernel_context");
}
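
create_pinned_context() stashes the status-page offset in the low bits of an otherwise-NULL timeline pointer via page_pack_bits(), to be unpacked when pinning creates the real timeline. A generic, stand-alone sketch of that pointer-packing trick, with invented helpers and 4KiB alignment assumed (not the driver's own macros):

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

#define LOW_BITS 12 /* page-aligned pointers have their low 12 bits clear */

static void *pack_ptr_bits(void *ptr, unsigned long bits)
{
	assert(((uintptr_t)ptr & ((1ul << LOW_BITS) - 1)) == 0);
	assert(bits < (1ul << LOW_BITS));
	return (void *)((uintptr_t)ptr | bits);
}

static unsigned long unpack_bits(void *packed)
{
	return (uintptr_t)packed & ((1ul << LOW_BITS) - 1);
}

static void *unpack_ptr(void *packed)
{
	return (void *)((uintptr_t)packed & ~((1ul << LOW_BITS) - 1));
}

int main(void)
{
	void *timeline = pack_ptr_bits(NULL, 0x100); /* hypothetical hwsp offset */

	printf("stashed offset: %#lx, pointer: %p\n",
	       unpack_bits(timeline), unpack_ptr(timeline));
	return 0;
}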
2016-07-13 16:03:41 +01:00
/**
 * intel_engines_init_common - initialize engine state which might require hw access
 * @engine: Engine to initialize.
 *
 * Initializes @engine structure members shared between legacy and execlists
 * submission modes which do require hardware access.
 *
 * Typically done at later stages of submission mode specific engine setup.
 *
 * Returns zero on success or an error code on failure.
 */
2019-12-22 14:40:45 +00:00
static int engine_init_common(struct intel_engine_cs *engine)
2016-07-13 16:03:41 +01:00
{
2019-08-08 12:06:11 +01:00
	struct intel_context *ce;
2016-07-13 16:03:41 +01:00
	int ret;
2019-07-09 10:12:33 +01:00
	engine->set_default_submission(engine);
2019-08-08 12:06:11 +01:00
	/*
	 * We may need to do things with the shrinker which
drm/i915: Unify active context tracking between legacy/execlists/guc
The requests conversion introduced a nasty bug where we could generate a
new request in the middle of constructing a request if we needed to idle
the system in order to evict space for a context. The request to idle
would be executed (and waited upon) before the current one, creating a
minor havoc in the seqno accounting, as we will consider the current
request to already be completed (prior to deferred seqno assignment) but
ring->last_retired_head would have been updated and still could allow
us to overwrite the current request before execution.
We also employed two different mechanisms to track the active context
until it was switched out. The legacy method allowed for waiting upon an
active context (it could forcibly evict any vma, including context's),
but the execlists method took a step backwards by pinning the vma for
the entire active lifespan of the context (the only way to evict was to
idle the entire GPU, not individual contexts). However, to circumvent
the tricky issue of locking (i.e. we cannot take struct_mutex at the
time of i915_gem_request_submit(), where we would want to move the
previous context onto the active tracker and unpin it), we take the
execlists approach and keep the contexts pinned until retirement.
The benefit of the execlists approach, more important for execlists than
legacy, was the reduction in work in pinning the context for each
request - as the context was kept pinned until idle, it could short
circuit the pinning for all active contexts.
We introduce new engine vfuncs to pin and unpin the context
respectively. The context is pinned at the start of the request, and
only unpinned when the following request is retired (this ensures that
the context is idle and coherent in main memory before we unpin it). We
move the engine->last_context tracking into the retirement itself
(rather than during request submission) in order to allow the submission
to be reordered or unwound without undue difficulty.
And finally an ulterior motive for unifying context handling was to
prepare for mock requests.
v2: Rename to last_retired_context, split out legacy_context tracking
for MI_SET_CONTEXT.
Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Reviewed-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
Link: http://patchwork.freedesktop.org/patch/msgid/20161218153724.8439-3-chris@chris-wilson.co.uk
2016-12-18 15:37:20 +00:00
	 * require us to immediately switch back to the default
	 * context. This can cause a problem as pinning the
	 * default context also requires GTT space which may not
	 * be available. To avoid this we always pin the default
	 * context.
	 */
2019-08-08 12:06:11 +01:00
	ce = create_kernel_context(engine);
	if (IS_ERR(ce))
		return PTR_ERR(ce);
2020-02-07 12:58:27 +00:00
	ret = measure_breadcrumb_dw(ce);
	if (ret < 0)
		goto err_context;

	engine->emit_fini_breadcrumb_dw = ret;
2019-08-08 12:06:11 +01:00
	engine->kernel_context = ce;
2016-07-13 16:03:41 +01:00
2019-03-08 13:25:21 +00:00
	return 0;
2020-02-07 12:58:27 +00:00
err_context:
	intel_context_put(ce);
	return ret;
2016-07-13 16:03:41 +01:00
}
2016-08-03 13:19:16 +01:00
2019-12-22 14:40:45 +00:00
int intel_engines_init(struct intel_gt *gt)
{
	int (*setup)(struct intel_engine_cs *engine);
	struct intel_engine_cs *engine;
	enum intel_engine_id id;
	int err;
2021-01-12 18:12:35 -08:00
	if (intel_uc_uses_guc_submission(&gt->uc))
		setup = intel_guc_submission_setup;
	else if (HAS_EXECLISTS(gt->i915))
2019-12-22 14:40:45 +00:00
		setup = intel_execlists_submission_setup;
	else
		setup = intel_ring_submission_setup;

	for_each_engine(engine, gt, id) {
		err = engine_setup_common(engine);
		if (err)
			return err;

		err = setup(engine);
		if (err)
			return err;

		err = engine_init_common(engine);
		if (err)
			return err;

		intel_engine_add_user(engine);
	}

	return 0;
}
2016-08-03 13:19:16 +01:00
/**
 * intel_engine_cleanup_common - cleans up the engine state created by
 * the common initializers.
 * @engine: Engine to cleanup.
 *
 * This cleans up everything created by the common helpers.
 */
void intel_engine_cleanup_common(struct intel_engine_cs *engine)
{
2019-06-14 17:46:06 +01:00
	GEM_BUG_ON(!list_empty(&engine->active.requests));
2019-12-22 12:07:52 +00:00
	tasklet_kill(&engine->execlists.tasklet); /* flush the callback */
2019-06-14 17:46:06 +01:00
2020-07-31 16:48:34 +01:00
	intel_breadcrumbs_free(engine->breadcrumbs);
2017-09-13 09:56:02 +01:00
2019-11-25 10:58:58 +00:00
	intel_engine_fini_retire(engine);
2016-08-18 17:17:10 +01:00
	intel_engine_cleanup_cmd_parser(engine);
2016-12-18 15:37:20 +00:00
2017-11-10 14:26:33 +00:00
	if (engine->default_state)
drm/i915/gt: Keep a no-frills swappable copy of the default context state
We need to keep the default context state around to instantiate new
contexts (aka golden rendercontext), and we also keep it pinned while
the engine is active so that we can quickly reset a hanging context.
However, the default contexts are large enough to merit keeping in
swappable memory as opposed to kernel memory, so we store them inside
shmemfs. Currently, we use the normal GEM objects to create the default
context image, but we can throw away all but the shmemfs file.
This greatly simplifies the tricky power management code which wants to
run underneath the normal GT locking, and we definitely do not want to
use any high level objects that may appear to recurse back into the GT.
Though perhaps the primary advantage of the complex GEM object is that
we aggressively cache the mapping, but here we are recreating the
vm_area every time we unpark. At worst, we add a lightweight
cache, but first find a microbenchmark that is impacted.
Having started to create some utility functions to make working with
shmemfs objects easier, we can start putting them to wider use, where
GEM objects are overkill, such as storing persistent error state.
Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Matthew Auld <matthew.auld@intel.com>
Cc: Ramalingam C <ramalingam.c@intel.com>
Cc: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
Reviewed-by: Matthew Auld <matthew.auld@intel.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20200429172429.6054-1-chris@chris-wilson.co.uk
2020-04-29 18:24:29 +01:00
		fput(engine->default_state);
2017-11-10 14:26:33 +00:00
2020-12-22 10:42:42 +00:00
	if (engine->kernel_context)
		destroy_pinned_context(engine->kernel_context);
2019-06-14 17:46:04 +01:00
	GEM_BUG_ON(!llist_empty(&engine->barrier_tasks));
2020-12-22 10:42:42 +00:00
	cleanup_status_page(engine);
2018-05-02 17:38:39 +01:00
2018-12-03 13:33:57 +00:00
	intel_wa_list_free(&engine->ctx_wa_list);
2018-12-03 13:33:41 +00:00
	intel_wa_list_free(&engine->wa_list);
2018-12-03 12:50:12 +00:00
	intel_wa_list_free(&engine->whitelist);
2016-08-03 13:19:16 +01:00
}
2016-10-04 21:11:31 +01:00
2020-01-31 07:57:15 +00:00
/**
 * intel_engine_resume - re-initializes the HW state of the engine
 * @engine: Engine to resume.
 *
 * Returns zero on success or an error code on failure.
 */
int intel_engine_resume(struct intel_engine_cs *engine)
{
	intel_engine_apply_workarounds(engine);
	intel_engine_apply_whitelist(engine);

	return engine->resume(engine);
}
2018-02-12 10:24:15 +00:00
u64 intel_engine_get_active_head(const struct intel_engine_cs *engine)
2016-10-04 21:11:31 +01:00
{
2019-03-25 14:49:40 -07:00
	struct drm_i915_private *i915 = engine->i915;
2016-10-04 21:11:31 +01:00
	u64 acthd;
2019-03-25 14:49:40 -07:00
	if (INTEL_GEN(i915) >= 8)
		acthd = ENGINE_READ64(engine, RING_ACTHD, RING_ACTHD_UDW);
	else if (INTEL_GEN(i915) >= 4)
		acthd = ENGINE_READ(engine, RING_ACTHD);
2016-10-04 21:11:31 +01:00
	else
2019-03-25 14:49:40 -07:00
		acthd = ENGINE_READ(engine, ACTHD);
2016-10-04 21:11:31 +01:00
	return acthd;
}
2018-02-12 10:24:15 +00:00
u64 intel_engine_get_last_batch_head(const struct intel_engine_cs *engine)
2016-10-04 21:11:31 +01:00
{
	u64 bbaddr;
2019-03-25 14:49:40 -07:00
	if (INTEL_GEN(engine->i915) >= 8)
		bbaddr = ENGINE_READ64(engine, RING_BBADDR, RING_BBADDR_UDW);
2016-10-04 21:11:31 +01:00
	else
2019-03-25 14:49:40 -07:00
		bbaddr = ENGINE_READ(engine, RING_BBADDR);
2016-10-04 21:11:31 +01:00
	return bbaddr;
}
2016-10-12 10:05:17 +01:00
2019-10-23 14:31:04 +01:00
static unsigned long stop_timeout(const struct intel_engine_cs *engine)
{
	if (in_atomic() || irqs_disabled()) /* inside atomic preempt-reset? */
		return 0;

	/*
	 * If we are doing a normal GPU reset, we can take our time and allow
	 * the engine to quiesce. We've stopped submission to the engine, and
	 * if we wait long enough an innocent context should complete and
	 * leave the engine idle. So they should not be caught unaware by
	 * the forthcoming GPU reset (which usually follows the stop_cs)!
	 */
	return READ_ONCE(engine->props.stop_timeout_ms);
}
2020-12-24 13:55:36 +00:00
static int __intel_engine_stop_cs(struct intel_engine_cs *engine,
				  int fast_timeout_us,
				  int slow_timeout_ms)
2018-05-16 19:33:55 +01:00
{
2019-03-25 14:49:40 -07:00
	struct intel_uncore *uncore = engine->uncore;
2020-12-24 13:55:36 +00:00
	const i915_reg_t mode = RING_MI_MODE(engine->mmio_base);
2018-05-16 19:33:55 +01:00
	int err;
2020-12-24 13:55:36 +00:00
	intel_uncore_write_fw(uncore, mode, _MASKED_BIT_ENABLE(STOP_RING));
	err = __intel_wait_for_register_fw(engine->uncore, mode,
					   MODE_IDLE, MODE_IDLE,
					   fast_timeout_us,
					   slow_timeout_ms,
					   NULL);

	/* A final mmio read to let GPU writes be hopefully flushed to memory */
	intel_uncore_posting_read_fw(uncore, mode);

	return err;
}

int intel_engine_stop_cs(struct intel_engine_cs *engine)
{
	int err = 0;
2019-03-25 14:49:38 -07:00
	if (INTEL_GEN(engine->i915) < 3)
2018-05-16 19:33:55 +01:00
		return -ENODEV;
2019-12-13 07:51:52 -08:00
	ENGINE_TRACE(engine, "\n");
2020-12-24 13:55:36 +00:00
	if (__intel_engine_stop_cs(engine, 1000, stop_timeout(engine))) {
2019-12-13 07:51:52 -08:00
		ENGINE_TRACE(engine, "timed out on STOP_RING -> IDLE\n");
2018-05-16 19:33:55 +01:00
		err = -ETIMEDOUT;
	}

	return err;
}
2018-08-14 18:18:57 +01:00
void intel_engine_cancel_stop_cs(struct intel_engine_cs *engine)
{
2019-12-13 07:51:52 -08:00
	ENGINE_TRACE(engine, "\n");
2018-08-14 18:18:57 +01:00
2019-03-25 14:49:40 -07:00
	ENGINE_WRITE_FW(engine, RING_MI_MODE, _MASKED_BIT_DISABLE(STOP_RING));
2018-08-14 18:18:57 +01:00
}
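
Both routines above poke RING_MI_MODE using the masked-bit convention: the upper 16 bits of the written value select which of the lower 16 bits actually take effect, so a single posted write can flip STOP_RING without a read-modify-write cycle. A stand-alone sketch of that convention follows; the helper names and the bit position are illustrative, not the driver's definitions.

#include <stdint.h>
#include <stdio.h>

static uint32_t masked_bit_enable(uint32_t bit)  { return (bit << 16) | bit; }
static uint32_t masked_bit_disable(uint32_t bit) { return bit << 16; }

/* What the hardware conceptually does with a masked write. */
static uint32_t apply_masked_write(uint32_t reg, uint32_t val)
{
	uint32_t mask = val >> 16;

	return (reg & ~mask) | (val & mask);
}

int main(void)
{
	const uint32_t STOP_RING_BIT = 1u << 8; /* hypothetical bit position */
	uint32_t mi_mode = 0;

	mi_mode = apply_masked_write(mi_mode, masked_bit_enable(STOP_RING_BIT));
	printf("after enable:  %#x\n", mi_mode);
	mi_mode = apply_masked_write(mi_mode, masked_bit_disable(STOP_RING_BIT));
	printf("after disable: %#x\n", mi_mode);
	return 0;
}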
2016-10-12 10:05:17 +01:00
const char *i915_cache_level_str(struct drm_i915_private *i915, int type)
{
	switch (type) {
	case I915_CACHE_NONE: return " uncached";
	case I915_CACHE_LLC: return HAS_LLC(i915) ? " LLC" : " snooped";
	case I915_CACHE_L3_LLC: return " L3+LLC";
	case I915_CACHE_WT: return " WT";
	default: return "";
	}
}
2019-06-10 13:57:06 +01:00
static u32
2020-01-10 12:30:56 +00:00
read_subslice_reg(const struct intel_engine_cs *engine,
		  int slice, int subslice, i915_reg_t reg)
2016-10-12 10:05:17 +01:00
{
2019-06-10 13:57:06 +01:00
	struct drm_i915_private *i915 = engine->i915;
	struct intel_uncore *uncore = engine->uncore;
2019-07-17 19:06:20 +01:00
	u32 mcr_mask, mcr_ss, mcr, old_mcr, val;
2016-10-12 10:05:17 +01:00
	enum forcewake_domains fw_domains;
2019-06-10 13:57:06 +01:00
	if (INTEL_GEN(i915) >= 11) {
2019-07-17 19:06:20 +01:00
		mcr_mask = GEN11_MCR_SLICE_MASK | GEN11_MCR_SUBSLICE_MASK;
		mcr_ss = GEN11_MCR_SLICE(slice) | GEN11_MCR_SUBSLICE(subslice);
2018-03-16 14:14:51 +02:00
	} else {
2019-07-17 19:06:20 +01:00
		mcr_mask = GEN8_MCR_SLICE_MASK | GEN8_MCR_SUBSLICE_MASK;
		mcr_ss = GEN8_MCR_SLICE(slice) | GEN8_MCR_SUBSLICE(subslice);
2018-03-16 14:14:51 +02:00
	}
2019-03-25 14:49:37 -07:00
	fw_domains = intel_uncore_forcewake_for_reg(uncore, reg,
2016-10-12 10:05:17 +01:00
						    FW_REG_READ);
2019-03-25 14:49:37 -07:00
	fw_domains |= intel_uncore_forcewake_for_reg(uncore,
2016-10-12 10:05:17 +01:00
						     GEN8_MCR_SELECTOR,
						     FW_REG_READ | FW_REG_WRITE);
2019-03-25 14:49:37 -07:00
	spin_lock_irq(&uncore->lock);
	intel_uncore_forcewake_get__locked(uncore, fw_domains);
2016-10-12 10:05:17 +01:00
2019-07-17 19:06:20 +01:00
	old_mcr = mcr = intel_uncore_read_fw(uncore, GEN8_MCR_SELECTOR);
2018-05-18 15:39:57 -07:00
2019-07-17 19:06:20 +01:00
	mcr &= ~mcr_mask;
	mcr |= mcr_ss;
2019-03-25 14:49:37 -07:00
	intel_uncore_write_fw(uncore, GEN8_MCR_SELECTOR, mcr);
2016-10-12 10:05:17 +01:00
2019-07-17 19:06:20 +01:00
	val = intel_uncore_read_fw(uncore, reg);
2016-10-12 10:05:17 +01:00
2019-07-17 19:06:20 +01:00
	mcr &= ~mcr_mask;
	mcr |= old_mcr & mcr_mask;
2018-05-18 15:39:57 -07:00
2019-03-25 14:49:37 -07:00
	intel_uncore_write_fw(uncore, GEN8_MCR_SELECTOR, mcr);
2016-10-12 10:05:17 +01:00
2019-03-25 14:49:37 -07:00
	intel_uncore_forcewake_put__locked(uncore, fw_domains);
	spin_unlock_irq(&uncore->lock);
2016-10-12 10:05:17 +01:00
2019-07-17 19:06:20 +01:00
	return val;
2016-10-12 10:05:17 +01:00
}
/* NB: please notice the memset */
2020-01-10 12:30:56 +00:00
void intel_engine_get_instdone(const struct intel_engine_cs *engine,
2016-10-12 10:05:17 +01:00
			       struct intel_instdone *instdone)
{
2019-06-10 13:57:06 +01:00
	struct drm_i915_private *i915 = engine->i915;
2020-07-07 17:39:50 -07:00
	const struct sseu_dev_info *sseu = &engine->gt->info.sseu;
2019-03-25 14:49:40 -07:00
	struct intel_uncore *uncore = engine->uncore;
2016-10-12 10:05:17 +01:00
	u32 mmio_base = engine->mmio_base;
	int slice;
	int subslice;

	memset(instdone, 0, sizeof(*instdone));
2019-06-10 13:57:06 +01:00
	switch (INTEL_GEN(i915)) {
2016-10-12 10:05:17 +01:00
	default:
2019-03-25 14:49:40 -07:00
		instdone->instdone =
			intel_uncore_read(uncore, RING_INSTDONE(mmio_base));
2016-10-12 10:05:17 +01:00
2019-03-05 18:03:30 +00:00
		if (engine->id != RCS0)
2016-10-12 10:05:17 +01:00
			break;
2019-03-25 14:49:40 -07:00
		instdone->slice_common =
			intel_uncore_read(uncore, GEN7_SC_INSTDONE);
2020-01-29 20:16:38 +02:00
		if (INTEL_GEN(i915) >= 12) {
			instdone->slice_common_extra[0] =
				intel_uncore_read(uncore, GEN12_SC_INSTDONE_EXTRA);
			instdone->slice_common_extra[1] =
				intel_uncore_read(uncore, GEN12_SC_INSTDONE_EXTRA2);
		}
2019-08-23 09:03:05 -07:00
		for_each_instdone_slice_subslice(i915, sseu, slice, subslice) {
2016-10-12 10:05:17 +01:00
			instdone->sampler[slice][subslice] =
2019-06-10 13:57:06 +01:00
				read_subslice_reg(engine, slice, subslice,
2016-10-12 10:05:17 +01:00
						  GEN7_SAMPLER_INSTDONE);
			instdone->row[slice][subslice] =
2019-06-10 13:57:06 +01:00
				read_subslice_reg(engine, slice, subslice,
2016-10-12 10:05:17 +01:00
						  GEN7_ROW_INSTDONE);
		}
		break;
	case 7:
2019-03-25 14:49:40 -07:00
		instdone->instdone =
			intel_uncore_read(uncore, RING_INSTDONE(mmio_base));
2016-10-12 10:05:17 +01:00
2019-03-05 18:03:30 +00:00
		if (engine->id != RCS0)
2016-10-12 10:05:17 +01:00
			break;
2019-03-25 14:49:40 -07:00
		instdone->slice_common =
			intel_uncore_read(uncore, GEN7_SC_INSTDONE);
		instdone->sampler[0][0] =
			intel_uncore_read(uncore, GEN7_SAMPLER_INSTDONE);
		instdone->row[0][0] =
			intel_uncore_read(uncore, GEN7_ROW_INSTDONE);
2016-10-12 10:05:17 +01:00
		break;
	case 6:
	case 5:
	case 4:
2019-03-25 14:49:40 -07:00
		instdone->instdone =
			intel_uncore_read(uncore, RING_INSTDONE(mmio_base));
2019-03-05 18:03:30 +00:00
		if (engine->id == RCS0)
2016-10-12 10:05:17 +01:00
			/* HACK: Using the wrong struct member */
2019-03-25 14:49:40 -07:00
			instdone->slice_common =
				intel_uncore_read(uncore, GEN4_INSTDONE1);
2016-10-12 10:05:17 +01:00
		break;
	case 3:
	case 2:
2019-03-25 14:49:40 -07:00
		instdone->instdone = intel_uncore_read(uncore, GEN2_INSTDONE);
2016-10-12 10:05:17 +01:00
		break;
	}
}
2017-02-13 17:15:14 +00:00
2017-05-30 13:13:33 +01:00
static bool ring_is_idle(struct intel_engine_cs *engine)
{
	bool idle = true;
2019-01-18 11:22:25 +00:00
	if (I915_SELFTEST_ONLY(!engine->mmio_base))
		return true;
2019-08-12 10:10:44 +01:00
	if (!intel_engine_pm_get_if_awake(engine))
2018-02-12 09:39:28 +00:00
		return true;
2017-05-30 13:13:33 +01:00
2019-02-27 20:46:53 +00:00
	/* First check that no commands are left in the ring */
2019-03-25 14:49:40 -07:00
	if ((ENGINE_READ(engine, RING_HEAD) & HEAD_ADDR) !=
	    (ENGINE_READ(engine, RING_TAIL) & TAIL_ADDR))
2019-02-27 20:46:53 +00:00
		idle = false;
2017-05-30 13:13:34 +01:00
2019-02-27 20:46:53 +00:00
	/* No bit for gen2, so assume the CS parser is idle */
2019-08-12 10:10:44 +01:00
	if (INTEL_GEN(engine->i915) > 2 &&
2019-03-25 14:49:40 -07:00
	    !(ENGINE_READ(engine, RING_MI_MODE) & MODE_IDLE))
2017-05-30 13:13:33 +01:00
		idle = false;
2019-08-12 10:10:44 +01:00
	intel_engine_pm_put(engine);
2017-05-30 13:13:33 +01:00
	return idle;
}
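
As background for the HEAD/TAIL comparison in ring_is_idle(): the command ring is an ordinary circular buffer, empty when the consumer (HEAD) has caught up with the producer (TAIL). The sketch below uses the textbook one-slot-reserved free-space formula as a generic illustration; it is not the driver's exact space accounting.

#include <stdint.h>
#include <stdio.h>

static int ring_empty(uint32_t head, uint32_t tail)
{
	return head == tail;
}

/* size must be a power of two; one slot stays reserved in this convention. */
static uint32_t ring_space(uint32_t head, uint32_t tail, uint32_t size)
{
	return (head - tail - 1) & (size - 1);
}

int main(void)
{
	const uint32_t size = 4096;

	printf("empty: %d\n", ring_empty(128, 128));
	printf("space: %u bytes\n", ring_space(128, 256, size));
	return 0;
}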
2020-12-20 13:48:58 +00:00
void __intel_engine_flush_submission(struct intel_engine_cs *engine, bool sync)
2019-10-08 11:56:55 +01:00
{
	struct tasklet_struct *t = &engine->execlists.tasklet;
2020-06-15 19:39:35 +01:00
	if (!t->func)
		return;
2020-06-15 17:50:13 +01:00
	local_bh_disable();
	if (tasklet_trylock(t)) {
		/* Must wait for any GPU reset in progress. */
		if (__tasklet_is_enabled(t))
			t->func(t->data);
		tasklet_unlock(t);
2019-10-08 11:56:55 +01:00
	}
2020-06-15 17:50:13 +01:00
	local_bh_enable();
2020-12-20 13:48:58 +00:00
	/* Synchronise and wait for the tasklet on another CPU */
	if (sync)
		tasklet_unlock_wait(t);
2019-10-08 11:56:55 +01:00
}
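
A userspace analogue of the flush above, using a pthread mutex in place of the tasklet's run lock (simplified, with invented names): if the lock can be taken, the deferred callback is not running anywhere else and can be run inline; otherwise it is left to its current owner and, if synchronisation was requested, we merely wait for that run to finish.

#include <pthread.h>
#include <stdio.h>

struct deferred_work {
	pthread_mutex_t lock;
	void (*func)(void *data);
	void *data;
};

static void flush_deferred_work(struct deferred_work *w, int sync)
{
	if (!w->func)
		return;

	if (pthread_mutex_trylock(&w->lock) == 0) {
		w->func(w->data);		/* run the callback inline */
		pthread_mutex_unlock(&w->lock);
	} else if (sync) {
		pthread_mutex_lock(&w->lock);	/* wait for the owner to finish */
		pthread_mutex_unlock(&w->lock);
	}
}

static void say_hello(void *data)
{
	printf("flushed: %s\n", (const char *)data);
}

int main(void)
{
	static struct deferred_work w = {
		.lock = PTHREAD_MUTEX_INITIALIZER,
		.func = say_hello,
		.data = "submission tasklet",
	};

	flush_deferred_work(&w, 1);
	return 0;
}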
2017-03-03 12:19:46 +00:00
/**
 * intel_engine_is_idle() - Report if the engine has finished processing all work
 * @engine: the intel_engine_cs
 *
 * Return true if there are no requests pending, nothing left to be submitted
 * to hardware, and that the engine is idle.
 */
bool intel_engine_is_idle(struct intel_engine_cs *engine)
{
2017-04-11 20:00:42 +01:00
	/* More white lies, if wedged, hw state is inconsistent */
2019-07-12 20:29:53 +01:00
	if (intel_gt_is_wedged(engine->gt))
2017-04-11 20:00:42 +01:00
		return true;
2019-06-25 14:01:14 +01:00
	if (!intel_engine_pm_is_awake(engine))
2019-05-03 12:52:15 +01:00
		return true;
2017-10-23 22:32:36 +01:00
	/* Waiting to drain ELSP? */
2019-06-20 15:20:51 +01:00
	if (execlists_active(&engine->execlists)) {
2019-07-02 18:17:23 +03:00
		synchronize_hardirq(engine->i915->drm.pdev->irq);
2019-05-03 09:09:42 +01:00
2019-10-08 11:56:55 +01:00
		intel_engine_flush_submission(engine);
2018-09-14 09:00:16 +01:00
2019-06-20 15:20:51 +01:00
		if (execlists_active(&engine->execlists))
			return false;
	}

	/* ELSP is empty, but there are ready requests? E.g. after reset */
	if (!RB_EMPTY_ROOT(&engine->execlists.queue.rb_root))
		return false;

	/* Ring stopped? */
	return ring_is_idle(engine);
}

bool intel_engines_are_idle(struct intel_gt *gt)
{
	struct intel_engine_cs *engine;
	enum intel_engine_id id;

	/*
	 * If the driver is wedged, HW state may be very inconsistent and
	 * report that it is still busy, even though we have stopped using it.
	 */
	if (intel_gt_is_wedged(gt))
		return true;

	/* Already parked (and passed an idleness test); must still be idle */
	if (!READ_ONCE(gt->awake))
		return true;

	for_each_engine(engine, gt, id) {
		if (!intel_engine_is_idle(engine))
			return false;
	}

	return true;
}
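
/*
 * Usage sketch (illustrative only, not an existing call site): a caller that
 * expects the whole GT to be quiescent, e.g. before suspending, could assert
 * it with:
 *
 *	if (!intel_engines_are_idle(gt))
 *		drm_err(&gt->i915->drm, "GT is not idle\n");
 */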

void intel_engines_reset_default_submission(struct intel_gt *gt)
{
	struct intel_engine_cs *engine;
	enum intel_engine_id id;

	for_each_engine(engine, gt, id) {
		if (engine->sanitize)
			engine->sanitize(engine);

		engine->set_default_submission(engine);
	}
}

bool intel_engine_can_store_dword(struct intel_engine_cs *engine)
{
	switch (INTEL_GEN(engine->i915)) {
	case 2:
		return false; /* uses physical not virtual addresses */
	case 3:
		/* maybe only uses physical not virtual addresses */
		return !(IS_I915G(engine->i915) || IS_I915GM(engine->i915));
	case 4:
		return !IS_I965G(engine->i915); /* who knows! */
	case 6:
		return engine->class != VIDEO_DECODE_CLASS; /* b0rked */
	default:
		return true;
	}
}
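
/*
 * Illustrative sketch (selftest-style caller, hypothetical): tests that emit
 * MI_STORE_DWORD_IMM to a virtual address typically skip unsupported
 * platforms up front:
 *
 *	if (!intel_engine_can_store_dword(engine))
 *		return 0;
 */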

static struct intel_timeline *get_timeline(struct i915_request *rq)
{
	struct intel_timeline *tl;

	/*
	 * Even though we are holding the engine->active.lock here, there
	 * is no control over the submission queue per-se and we are
	 * inspecting the active state at a random point in time, with an
	 * unknown queue. Play safe and make sure the timeline remains valid.
	 * (Only being used for pretty printing, one extra kref shouldn't
	 * cause a camel stampede!)
	 */
	rcu_read_lock();
	tl = rcu_dereference(rq->timeline);
	if (!kref_get_unless_zero(&tl->kref))
		tl = NULL;
	rcu_read_unlock();

	return tl;
}
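
/*
 * A timeline successfully acquired above carries an extra reference and must
 * be released with intel_timeline_put(), as the callers in this file do, e.g.
 * (taken from intel_engine_dump() below):
 *
 *	struct intel_timeline *tl = get_timeline(rq);
 *
 *	if (tl) {
 *		drm_printf(m, "\t\tring->hwsp: 0x%08x\n", tl->hwsp_offset);
 *		intel_timeline_put(tl);
 *	}
 */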

static int print_ring(char *buf, int sz, struct i915_request *rq)
{
	int len = 0;

	if (!i915_request_signaled(rq)) {
		struct intel_timeline *tl = get_timeline(rq);

		len = scnprintf(buf, sz,
				"ring:{start:%08x, hwsp:%08x, seqno:%08x, runtime:%llums}, ",
				i915_ggtt_offset(rq->ring->vma),
				tl ? tl->hwsp_offset : 0,
				hwsp_seqno(rq),
				DIV_ROUND_CLOSEST_ULL(intel_context_get_total_runtime_ns(rq->context),
						      1000 * 1000));

		if (tl)
			intel_timeline_put(tl);
	}

	return len;
}

static void hexdump(struct drm_printer *m, const void *buf, size_t len)
{
	const size_t rowsize = 8 * sizeof(u32);
	const void *prev = NULL;
	bool skip = false;
	size_t pos;

	for (pos = 0; pos < len; pos += rowsize) {
		char line[128];

		if (prev && !memcmp(prev, buf + pos, rowsize)) {
			if (!skip) {
				drm_printf(m, "*\n");
				skip = true;
			}
			continue;
		}

		WARN_ON_ONCE(hex_dump_to_buffer(buf + pos, len - pos,
						rowsize, sizeof(u32),
						line, sizeof(line),
						false) >= sizeof(line));
		drm_printf(m, "[%04zx] %s\n", pos, line);

		prev = buf + pos;
		skip = false;
	}
}
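
/*
 * Example use, mirroring intel_engine_dump() below: dump a whole page such as
 * the hardware status page, with runs of identical rows collapsed to a '*':
 *
 *	hexdump(m, engine->status_page.addr, PAGE_SIZE);
 */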

static const char *repr_timer(const struct timer_list *t)
{
	if (!READ_ONCE(t->expires))
		return "inactive";

	if (timer_pending(t))
		return "active";

	return "expired";
}

static void intel_engine_print_registers(struct intel_engine_cs *engine,
					 struct drm_printer *m)
{
	struct drm_i915_private *dev_priv = engine->i915;
	struct intel_engine_execlists * const execlists = &engine->execlists;
	u64 addr;

	if (engine->id == RENDER_CLASS && IS_GEN_RANGE(dev_priv, 4, 7))
		drm_printf(m, "\tCCID: 0x%08x\n", ENGINE_READ(engine, CCID));
	if (HAS_EXECLISTS(dev_priv)) {
		drm_printf(m, "\tEL_STAT_HI: 0x%08x\n",
			   ENGINE_READ(engine, RING_EXECLIST_STATUS_HI));
		drm_printf(m, "\tEL_STAT_LO: 0x%08x\n",
			   ENGINE_READ(engine, RING_EXECLIST_STATUS_LO));
	}
	drm_printf(m, "\tRING_START: 0x%08x\n",
		   ENGINE_READ(engine, RING_START));
	drm_printf(m, "\tRING_HEAD: 0x%08x\n",
		   ENGINE_READ(engine, RING_HEAD) & HEAD_ADDR);
	drm_printf(m, "\tRING_TAIL: 0x%08x\n",
		   ENGINE_READ(engine, RING_TAIL) & TAIL_ADDR);
	drm_printf(m, "\tRING_CTL: 0x%08x%s\n",
		   ENGINE_READ(engine, RING_CTL),
		   ENGINE_READ(engine, RING_CTL) & (RING_WAIT | RING_WAIT_SEMAPHORE) ? " [waiting]" : "");
	if (INTEL_GEN(engine->i915) > 2) {
		drm_printf(m, "\tRING_MODE: 0x%08x%s\n",
			   ENGINE_READ(engine, RING_MI_MODE),
			   ENGINE_READ(engine, RING_MI_MODE) & (MODE_IDLE) ? " [idle]" : "");
	}

	if (INTEL_GEN(dev_priv) >= 6) {
		drm_printf(m, "\tRING_IMR: 0x%08x\n",
			   ENGINE_READ(engine, RING_IMR));
		drm_printf(m, "\tRING_ESR: 0x%08x\n",
			   ENGINE_READ(engine, RING_ESR));
		drm_printf(m, "\tRING_EMR: 0x%08x\n",
			   ENGINE_READ(engine, RING_EMR));
		drm_printf(m, "\tRING_EIR: 0x%08x\n",
			   ENGINE_READ(engine, RING_EIR));
	}

	addr = intel_engine_get_active_head(engine);
	drm_printf(m, "\tACTHD: 0x%08x_%08x\n",
		   upper_32_bits(addr), lower_32_bits(addr));
	addr = intel_engine_get_last_batch_head(engine);
	drm_printf(m, "\tBBADDR: 0x%08x_%08x\n",
		   upper_32_bits(addr), lower_32_bits(addr));
	if (INTEL_GEN(dev_priv) >= 8)
		addr = ENGINE_READ64(engine, RING_DMA_FADD, RING_DMA_FADD_UDW);
	else if (INTEL_GEN(dev_priv) >= 4)
		addr = ENGINE_READ(engine, RING_DMA_FADD);
	else
		addr = ENGINE_READ(engine, DMA_FADD_I8XX);
	drm_printf(m, "\tDMA_FADDR: 0x%08x_%08x\n",
		   upper_32_bits(addr), lower_32_bits(addr));
	if (INTEL_GEN(dev_priv) >= 4) {
		drm_printf(m, "\tIPEIR: 0x%08x\n",
			   ENGINE_READ(engine, RING_IPEIR));
		drm_printf(m, "\tIPEHR: 0x%08x\n",
			   ENGINE_READ(engine, RING_IPEHR));
	} else {
		drm_printf(m, "\tIPEIR: 0x%08x\n", ENGINE_READ(engine, IPEIR));
		drm_printf(m, "\tIPEHR: 0x%08x\n", ENGINE_READ(engine, IPEHR));
	}

	if (intel_engine_in_guc_submission_mode(engine)) {
		/* nothing to print yet */
	} else if (HAS_EXECLISTS(dev_priv)) {
		struct i915_request * const *port, *rq;
		const u32 *hws =
			&engine->status_page.addr[I915_HWS_CSB_BUF0_INDEX];
		const u8 num_entries = execlists->csb_size;
		unsigned int idx;
		u8 read, write;

		drm_printf(m, "\tExeclist tasklet queued? %s (%s), preempt? %s, timeslice? %s\n",
			   yesno(test_bit(TASKLET_STATE_SCHED,
					  &engine->execlists.tasklet.state)),
			   enableddisabled(!atomic_read(&engine->execlists.tasklet.count)),
			   repr_timer(&engine->execlists.preempt),
			   repr_timer(&engine->execlists.timer));

		read = execlists->csb_head;
		write = READ_ONCE(*execlists->csb_write);

		drm_printf(m, "\tExeclist status: 0x%08x %08x; CSB read:%d, write:%d, entries:%d\n",
			   ENGINE_READ(engine, RING_EXECLIST_STATUS_LO),
			   ENGINE_READ(engine, RING_EXECLIST_STATUS_HI),
			   read, write, num_entries);

		if (read >= num_entries)
			read = 0;
		if (write >= num_entries)
			write = 0;
		if (read > write)
			write += num_entries;
		while (read < write) {
			idx = ++read % num_entries;
			drm_printf(m, "\tExeclist CSB[%d]: 0x%08x, context: %d\n",
				   idx, hws[idx * 2], hws[idx * 2 + 1]);
		}

		execlists_active_lock_bh(execlists);
		rcu_read_lock();
		for (port = execlists->active; (rq = *port); port++) {
			char hdr[160];
			int len;

			len = scnprintf(hdr, sizeof(hdr),
					"\t\tActive[%d]: ccid:%08x%s%s, ",
					(int)(port - execlists->active),
					rq->context->lrc.ccid,
					intel_context_is_closed(rq->context) ? "!" : "",
					intel_context_is_banned(rq->context) ? "*" : "");
			len += print_ring(hdr + len, sizeof(hdr) - len, rq);
			scnprintf(hdr + len, sizeof(hdr) - len, "rq: ");
			i915_request_show(m, rq, hdr, 0);
		}

		for (port = execlists->pending; (rq = *port); port++) {
			char hdr[160];
			int len;

			len = scnprintf(hdr, sizeof(hdr),
					"\t\tPending[%d]: ccid:%08x%s%s, ",
					(int)(port - execlists->pending),
					rq->context->lrc.ccid,
					intel_context_is_closed(rq->context) ? "!" : "",
					intel_context_is_banned(rq->context) ? "*" : "");
			len += print_ring(hdr + len, sizeof(hdr) - len, rq);
			scnprintf(hdr + len, sizeof(hdr) - len, "rq: ");
			i915_request_show(m, rq, hdr, 0);
		}
		rcu_read_unlock();
		execlists_active_unlock_bh(execlists);
	} else if (INTEL_GEN(dev_priv) > 6) {
		drm_printf(m, "\tPP_DIR_BASE: 0x%08x\n",
			   ENGINE_READ(engine, RING_PP_DIR_BASE));
		drm_printf(m, "\tPP_DIR_BASE_READ: 0x%08x\n",
			   ENGINE_READ(engine, RING_PP_DIR_BASE_READ));
		drm_printf(m, "\tPP_DIR_DCLV: 0x%08x\n",
			   ENGINE_READ(engine, RING_PP_DIR_DCLV));
	}
}

static void print_request_ring(struct drm_printer *m, struct i915_request *rq)
{
	void *ring;
	int size;

	drm_printf(m,
		   "[head %04x, postfix %04x, tail %04x, batch 0x%08x_%08x]:\n",
		   rq->head, rq->postfix, rq->tail,
		   rq->batch ? upper_32_bits(rq->batch->node.start) : ~0u,
		   rq->batch ? lower_32_bits(rq->batch->node.start) : ~0u);

	size = rq->tail - rq->head;
	if (rq->tail < rq->head)
		size += rq->ring->size;

	ring = kmalloc(size, GFP_ATOMIC);
	if (ring) {
		const void *vaddr = rq->ring->vaddr;
		unsigned int head = rq->head;
		unsigned int len = 0;

		if (rq->tail < head) {
			len = rq->ring->size - head;
			memcpy(ring, vaddr + head, len);
			head = 0;
		}
		memcpy(ring + len, vaddr + head, size - len);

		hexdump(m, ring, size);
		kfree(ring);
	}
}

static unsigned long list_count(struct list_head *list)
{
	struct list_head *pos;
	unsigned long count = 0;

	list_for_each(pos, list)
		count++;

	return count;
}

static unsigned long read_ul(void *p, size_t x)
{
	return *(unsigned long *)(p + x);
}

static void print_properties(struct intel_engine_cs *engine,
			     struct drm_printer *m)
{
	static const struct pmap {
		size_t offset;
		const char *name;
	} props[] = {
#define P(x) { \
	.offset = offsetof(typeof(engine->props), x), \
	.name = #x \
}
		P(heartbeat_interval_ms),
		P(max_busywait_duration_ns),
		P(preempt_timeout_ms),
		P(stop_timeout_ms),
		P(timeslice_duration_ms),
		{},
#undef P
	};
	const struct pmap *p;

	drm_printf(m, "\tProperties:\n");
	for (p = props; p->name; p++)
		drm_printf(m, "\t\t%s: %lu [default %lu]\n",
			   p->name,
			   read_ul(&engine->props, p->offset),
			   read_ul(&engine->defaults, p->offset));
}
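
/*
 * Note: exposing a further per-engine tunable here only needs the field to
 * exist in both engine->props and engine->defaults plus a matching entry in
 * the table above, e.g. (hypothetical name):
 *
 *	P(new_timeout_ms),
 */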

void intel_engine_dump(struct intel_engine_cs *engine,
		       struct drm_printer *m,
		       const char *header, ...)
{
	struct i915_gpu_error * const error = &engine->i915->gpu_error;
	struct i915_request *rq;
	intel_wakeref_t wakeref;
	unsigned long flags;
	ktime_t dummy;

	if (header) {
		va_list ap;

		va_start(ap, header);
		drm_vprintf(m, header, &ap);
		va_end(ap);
	}

	if (intel_gt_is_wedged(engine->gt))
		drm_printf(m, "*** WEDGED ***\n");
	drm_printf(m, "\tAwake? %d\n", atomic_read(&engine->wakeref.count));
	drm_printf(m, "\tBarriers?: %s\n",
		   yesno(!llist_empty(&engine->barrier_tasks)));
	drm_printf(m, "\tLatency: %luus\n",
		   ewma__engine_latency_read(&engine->latency));
	if (intel_engine_supports_stats(engine))
		drm_printf(m, "\tRuntime: %llums\n",
			   ktime_to_ms(intel_engine_get_busy_time(engine,
								  &dummy)));
	drm_printf(m, "\tForcewake: %x domains, %d active\n",
		   engine->fw_domain, atomic_read(&engine->fw_active));

	rcu_read_lock();
	rq = READ_ONCE(engine->heartbeat.systole);
	if (rq)
		drm_printf(m, "\tHeartbeat: %d ms ago\n",
			   jiffies_to_msecs(jiffies - rq->emitted_jiffies));
	rcu_read_unlock();

	drm_printf(m, "\tReset count: %d (global %d)\n",
		   i915_reset_engine_count(error, engine),
		   i915_reset_count(error));
	print_properties(engine, m);

	drm_printf(m, "\tRequests:\n");

	spin_lock_irqsave(&engine->active.lock, flags);
	rq = intel_engine_find_active_request(engine);
	if (rq) {
		struct intel_timeline *tl = get_timeline(rq);

		i915_request_show(m, rq, "\t\tactive ", 0);

		drm_printf(m, "\t\tring->start: 0x%08x\n",
			   i915_ggtt_offset(rq->ring->vma));
		drm_printf(m, "\t\tring->head: 0x%08x\n",
			   rq->ring->head);
		drm_printf(m, "\t\tring->tail: 0x%08x\n",
			   rq->ring->tail);
		drm_printf(m, "\t\tring->emit: 0x%08x\n",
			   rq->ring->emit);
		drm_printf(m, "\t\tring->space: 0x%08x\n",
			   rq->ring->space);

		if (tl) {
			drm_printf(m, "\t\tring->hwsp: 0x%08x\n",
				   tl->hwsp_offset);
			intel_timeline_put(tl);
		}

		print_request_ring(m, rq);

		if (rq->context->lrc_reg_state) {
			drm_printf(m, "Logical Ring Context:\n");
			hexdump(m, rq->context->lrc_reg_state, PAGE_SIZE);
		}
	}
	drm_printf(m, "\tOn hold?: %lu\n", list_count(&engine->active.hold));
	spin_unlock_irqrestore(&engine->active.lock, flags);

	drm_printf(m, "\tMMIO base: 0x%08x\n", engine->mmio_base);
	wakeref = intel_runtime_pm_get_if_in_use(engine->uncore->rpm);
	if (wakeref) {
		intel_engine_print_registers(engine, m);
		intel_runtime_pm_put(engine->uncore->rpm, wakeref);
	} else {
		drm_printf(m, "\tDevice is asleep; skipping register dump\n");
	}

	intel_execlists_show_requests(engine, m, i915_request_show, 8);

	drm_printf(m, "HWSP:\n");
	hexdump(m, engine->status_page.addr, PAGE_SIZE);

	drm_printf(m, "Idle? %s\n", yesno(intel_engine_is_idle(engine)));
	intel_engine_print_breadcrumbs(engine, m);
}
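
/*
 * Illustrative sketch: a debug path that wants this dump in dmesg can wrap a
 * drm_printer around the device and pass it in, assuming it holds a valid
 * engine pointer:
 *
 *	struct drm_printer p = drm_info_printer(engine->i915->drm.dev);
 *
 *	intel_engine_dump(engine, &p, "%s\n", engine->name);
 */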

static ktime_t __intel_engine_get_busy_time(struct intel_engine_cs *engine,
					    ktime_t *now)
{
	ktime_t total = engine->stats.total;

	/*
	 * If the engine is executing something at the moment
	 * add it to the total.
	 */
	*now = ktime_get();
	if (atomic_read(&engine->stats.active))
		total = ktime_add(total, ktime_sub(*now, engine->stats.start));

	return total;
}

/**
 * intel_engine_get_busy_time() - Return current accumulated engine busyness
 * @engine: engine to report on
 * @now: monotonic timestamp of sampling
 *
 * Returns accumulated time @engine was busy since engine stats were enabled.
 */
ktime_t intel_engine_get_busy_time(struct intel_engine_cs *engine, ktime_t *now)
{
	unsigned int seq;
	ktime_t total;

	do {
		seq = read_seqbegin(&engine->stats.lock);
		total = __intel_engine_get_busy_time(engine, now);
	} while (read_seqretry(&engine->stats.lock, seq));

	return total;
}
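
/*
 * Sampling sketch (illustrative, not part of the driver): two snapshots of
 * the accumulated busy time can be turned into a utilisation percentage over
 * the sampling interval:
 *
 *	ktime_t t0, t1, busy0, busy1;
 *	u64 busy_pct;
 *
 *	busy0 = intel_engine_get_busy_time(engine, &t0);
 *	... let the workload run for a while ...
 *	busy1 = intel_engine_get_busy_time(engine, &t1);
 *	busy_pct = div64_u64(100 * ktime_to_ns(ktime_sub(busy1, busy0)),
 *			     ktime_to_ns(ktime_sub(t1, t0)));
 */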

static bool match_ring(struct i915_request *rq)
{
	u32 ring = ENGINE_READ(rq->engine, RING_START);

	return ring == i915_ggtt_offset(rq->ring->vma);
}

struct i915_request *
intel_engine_find_active_request(struct intel_engine_cs *engine)
{
	struct i915_request *request, *active = NULL;

	/*
	 * We are called by the error capture, reset and to dump engine
	 * state at random points in time. In particular, note that neither is
	 * crucially ordered with an interrupt. After a hang, the GPU is dead
	 * and we assume that no more writes can happen (we waited long enough
	 * for all writes that were in transaction to be flushed) - adding an
	 * extra delay for a recent interrupt is pointless. Hence, we do
	 * not need an engine->irq_seqno_barrier() before the seqno reads.
	 * At all other times, we must assume the GPU is still running, but
	 * we only care about the snapshot of this moment.
	 */
	lockdep_assert_held(&engine->active.lock);

	rcu_read_lock();
	request = execlists_active(&engine->execlists);
	if (request) {
		struct intel_timeline *tl = request->context->timeline;

		list_for_each_entry_from_reverse(request, &tl->requests, link) {
			if (i915_request_completed(request))
				break;

			active = request;
		}
	}
	rcu_read_unlock();
	if (active)
		return active;

	list_for_each_entry(request, &engine->active.requests, sched.link) {
		if (i915_request_completed(request))
			continue;

		if (!i915_request_started(request))
			continue;

		/* More than one preemptible request may match! */
		if (!match_ring(request))
			continue;

		active = request;
		break;
	}

	return active;
}
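
/*
 * Reminder for (hypothetical) new callers: the request returned above is only
 * stable while engine->active.lock is held, so follow the pattern used by
 * intel_engine_dump():
 *
 *	spin_lock_irqsave(&engine->active.lock, flags);
 *	rq = intel_engine_find_active_request(engine);
 *	... inspect rq ...
 *	spin_unlock_irqrestore(&engine->active.lock, flags);
 */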

#if IS_ENABLED(CONFIG_DRM_I915_SELFTEST)
#include "mock_engine.c"
#include "selftest_engine.c"
#include "selftest_engine_cs.c"
#endif