drm/i915: Update reset path to fix incomplete requests

Update reset path in preparation for engine reset which requires
identification of incomplete requests and associated context and fixing
their state so that engine can resume correctly after reset.

The request that caused the hang will be skipped and head is reset to the
start of breadcrumb. This allows us to resume from where we left-off.
Since this request didn't complete normally we also need to cleanup elsp
queue manually. This is vital if we employ nonblocking request
submission where we may have a web of dependencies upon the hung request
and so advancing the seqno manually is no longer trivial.

ABI: gem_reset_stats / DRM_IOCTL_I915_GET_RESET_STATS

We change the way we count pending batches. Only the active context
involved in the reset is marked as either innocent or guilty, and not
mark the entire world as pending. By inspection this only affects
igt/gem_reset_stats (which assumes implementation details) and not
piglit.

ARB_robustness gives this guide on how we expect the user of this
interface to behave:

 * Provide a mechanism for an OpenGL application to learn about
   graphics resets that affect the context.  When a graphics reset
   occurs, the OpenGL context becomes unusable and the application
   must create a new context to continue operation. Detecting a
   graphics reset happens through an inexpensive query.

And with regards to the actual meaning of the reset values:

   Certain events can result in a reset of the GL context. Such a reset
   causes all context state to be lost. Recovery from such events
   requires recreation of all objects in the affected context. The
   current status of the graphics reset state is returned by

	enum GetGraphicsResetStatusARB();

   The symbolic constant returned indicates if the GL context has been
   in a reset state at any point since the last call to
   GetGraphicsResetStatusARB. NO_ERROR indicates that the GL context
   has not been in a reset state since the last call.
   GUILTY_CONTEXT_RESET_ARB indicates that a reset has been detected
   that is attributable to the current GL context.
   INNOCENT_CONTEXT_RESET_ARB indicates a reset has been detected that
   is not attributable to the current GL context.
   UNKNOWN_CONTEXT_RESET_ARB indicates a detected graphics reset whose
   cause is unknown.

The language here is explicit in that we must mark up the guilty batch,
but is loose enough for us to relax the innocent (i.e. pending)
accounting as only the active batches are involved with the reset.

In the future, we are looking towards single engine resetting (with
minimal locking), where it seems inappropriate to mark the entire world
as innocent since the reset occurred on a different engine. Reducing the
information available means we only have to encounter the pain once, and
also reduces the information leaking from one context to another.

v2: Legacy ringbuffer submission required a reset following hibernation,
or else we restore stale values to the RING_HEAD and walked over
stolen garbage.

v3: GuC requires replaying the requests after a reset.

v4: Restore engine IRQ after reset (so waiters will be woken!)
    Rearm hangcheck if resetting with a waiter.

Cc: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
Cc: Mika Kuoppala <mika.kuoppala@intel.com>
Cc: Arun Siluvery <arun.siluvery@linux.intel.com>
Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Reviewed-by: Mika Kuoppala <mika.kuoppala@intel.com>
Link: http://patchwork.freedesktop.org/patch/msgid/20160909131201.16673-13-chris@chris-wilson.co.uk
This commit is contained in:
Chris Wilson 2016-09-09 14:11:53 +01:00
parent 780f262a70
commit 821ed7df6e
10 changed files with 190 additions and 104 deletions

View File

@ -559,7 +559,6 @@ static void i915_gem_fini(struct drm_device *dev)
}
mutex_lock(&dev->struct_mutex);
i915_gem_reset(dev);
i915_gem_cleanup_engines(dev);
i915_gem_context_fini(dev);
mutex_unlock(&dev->struct_mutex);
@ -1579,7 +1578,7 @@ static int i915_drm_resume(struct drm_device *dev)
mutex_lock(&dev->struct_mutex);
if (i915_gem_init_hw(dev)) {
DRM_ERROR("failed to re-initialize GPU, declaring wedged!\n");
set_bit(I915_WEDGED, &dev_priv->gpu_error.flags);
i915_gem_set_wedged(dev_priv);
}
mutex_unlock(&dev->struct_mutex);
@ -1755,9 +1754,6 @@ void i915_reset(struct drm_i915_private *dev_priv)
error->reset_count++;
pr_notice("drm/i915: Resetting chip after gpu hang\n");
i915_gem_reset(dev);
ret = intel_gpu_reset(dev_priv, ALL_ENGINES);
if (ret) {
if (ret != -ENODEV)
@ -1767,6 +1763,7 @@ void i915_reset(struct drm_i915_private *dev_priv)
goto error;
}
i915_gem_reset(dev_priv);
intel_overlay_reset(dev_priv);
/* Ok, now get things going again... */
@ -1803,7 +1800,7 @@ wakeup:
return;
error:
set_bit(I915_WEDGED, &error->flags);
i915_gem_set_wedged(dev_priv);
goto wakeup;
}

View File

@ -2042,6 +2042,7 @@ struct drm_i915_private {
/* Abstract the submission mechanism (legacy ringbuffer or execlists) away */
struct {
void (*resume)(struct drm_i915_private *);
void (*cleanup_engine)(struct intel_engine_cs *engine);
/**
@ -3263,7 +3264,8 @@ static inline u32 i915_reset_count(struct i915_gpu_error *error)
return READ_ONCE(error->reset_count);
}
void i915_gem_reset(struct drm_device *dev);
void i915_gem_reset(struct drm_i915_private *dev_priv);
void i915_gem_set_wedged(struct drm_i915_private *dev_priv);
bool i915_gem_clflush_object(struct drm_i915_gem_object *obj, bool force);
int __must_check i915_gem_init(struct drm_device *dev);
int __must_check i915_gem_init_hw(struct drm_device *dev);
@ -3392,7 +3394,6 @@ void i915_gem_object_save_bit_17_swizzle(struct drm_i915_gem_object *obj);
int __must_check i915_gem_context_init(struct drm_device *dev);
void i915_gem_context_lost(struct drm_i915_private *dev_priv);
void i915_gem_context_fini(struct drm_device *dev);
void i915_gem_context_reset(struct drm_device *dev);
int i915_gem_context_open(struct drm_device *dev, struct drm_file *file);
void i915_gem_context_close(struct drm_device *dev, struct drm_file *file);
int i915_switch_context(struct drm_i915_gem_request *req);

View File

@ -2555,29 +2555,85 @@ i915_gem_find_active_request(struct intel_engine_cs *engine)
return NULL;
}
static void i915_gem_reset_engine_status(struct intel_engine_cs *engine)
static void reset_request(struct drm_i915_gem_request *request)
{
struct drm_i915_gem_request *request;
bool ring_hung;
void *vaddr = request->ring->vaddr;
u32 head;
request = i915_gem_find_active_request(engine);
if (request == NULL)
return;
ring_hung = engine->hangcheck.score >= HANGCHECK_SCORE_RING_HUNG;
i915_set_reset_status(request->ctx, ring_hung);
list_for_each_entry_continue(request, &engine->request_list, link)
i915_set_reset_status(request->ctx, false);
/* As this request likely depends on state from the lost
* context, clear out all the user operations leaving the
* breadcrumb at the end (so we get the fence notifications).
*/
head = request->head;
if (request->postfix < head) {
memset(vaddr + head, 0, request->ring->size - head);
head = 0;
}
memset(vaddr + head, 0, request->postfix - head);
}
static void i915_gem_reset_engine_cleanup(struct intel_engine_cs *engine)
static void i915_gem_reset_engine(struct intel_engine_cs *engine)
{
struct drm_i915_gem_request *request;
struct intel_ring *ring;
struct i915_gem_context *incomplete_ctx;
bool ring_hung;
/* Ensure irq handler finishes, and not run again. */
tasklet_kill(&engine->irq_tasklet);
if (engine->irq_seqno_barrier)
engine->irq_seqno_barrier(engine);
request = i915_gem_find_active_request(engine);
if (!request)
return;
ring_hung = engine->hangcheck.score >= HANGCHECK_SCORE_RING_HUNG;
i915_set_reset_status(request->ctx, ring_hung);
if (!ring_hung)
return;
DRM_DEBUG_DRIVER("resetting %s to restart from tail of request 0x%x\n",
engine->name, request->fence.seqno);
/* Setup the CS to resume from the breadcrumb of the hung request */
engine->reset_hw(engine, request);
/* Users of the default context do not rely on logical state
* preserved between batches. They have to emit full state on
* every batch and so it is safe to execute queued requests following
* the hang.
*
* Other contexts preserve state, now corrupt. We want to skip all
* queued requests that reference the corrupt context.
*/
incomplete_ctx = request->ctx;
if (i915_gem_context_is_default(incomplete_ctx))
return;
list_for_each_entry_continue(request, &engine->request_list, link)
if (request->ctx == incomplete_ctx)
reset_request(request);
}
void i915_gem_reset(struct drm_i915_private *dev_priv)
{
struct intel_engine_cs *engine;
i915_gem_retire_requests(dev_priv);
for_each_engine(engine, dev_priv)
i915_gem_reset_engine(engine);
i915_gem_restore_fences(&dev_priv->drm);
}
static void nop_submit_request(struct drm_i915_gem_request *request)
{
}
static void i915_gem_cleanup_engine(struct intel_engine_cs *engine)
{
engine->submit_request = nop_submit_request;
/* Mark all pending requests as complete so that any concurrent
* (lockless) lookup doesn't try and wait upon the request as we
@ -2600,54 +2656,22 @@ static void i915_gem_reset_engine_cleanup(struct intel_engine_cs *engine)
spin_unlock(&engine->execlist_lock);
}
/*
* We must free the requests after all the corresponding objects have
* been moved off active lists. Which is the same order as the normal
* retire_requests function does. This is important if object hold
* implicit references on things like e.g. ppgtt address spaces through
* the request.
*/
request = i915_gem_active_raw(&engine->last_request,
&engine->i915->drm.struct_mutex);
if (request)
i915_gem_request_retire_upto(request);
GEM_BUG_ON(intel_engine_is_active(engine));
/* Having flushed all requests from all queues, we know that all
* ringbuffers must now be empty. However, since we do not reclaim
* all space when retiring the request (to prevent HEADs colliding
* with rapid ringbuffer wraparound) the amount of available space
* upon reset is less than when we start. Do one more pass over
* all the ringbuffers to reset last_retired_head.
*/
list_for_each_entry(ring, &engine->buffers, link) {
ring->last_retired_head = ring->tail;
intel_ring_update_space(ring);
}
engine->i915->gt.active_engines &= ~intel_engine_flag(engine);
}
void i915_gem_reset(struct drm_device *dev)
void i915_gem_set_wedged(struct drm_i915_private *dev_priv)
{
struct drm_i915_private *dev_priv = to_i915(dev);
struct intel_engine_cs *engine;
/*
* Before we free the objects from the requests, we need to inspect
* them for finding the guilty party. As the requests only borrow
* their reference to the objects, the inspection must be done first.
*/
for_each_engine(engine, dev_priv)
i915_gem_reset_engine_status(engine);
lockdep_assert_held(&dev_priv->drm.struct_mutex);
set_bit(I915_WEDGED, &dev_priv->gpu_error.flags);
i915_gem_context_lost(dev_priv);
for_each_engine(engine, dev_priv)
i915_gem_reset_engine_cleanup(engine);
i915_gem_cleanup_engine(engine);
mod_delayed_work(dev_priv->wq, &dev_priv->gt.idle_work, 0);
i915_gem_context_reset(dev);
i915_gem_restore_fences(dev);
i915_gem_retire_requests(dev_priv);
}
static void
@ -4343,8 +4367,7 @@ void i915_gem_resume(struct drm_device *dev)
* guarantee that the context image is complete. So let's just reset
* it and start again.
*/
if (i915.enable_execlists)
intel_lr_context_reset(dev_priv, dev_priv->kernel_context);
dev_priv->gt.resume(dev_priv);
mutex_unlock(&dev->struct_mutex);
}
@ -4496,8 +4519,10 @@ int i915_gem_init(struct drm_device *dev)
mutex_lock(&dev->struct_mutex);
if (!i915.enable_execlists) {
dev_priv->gt.resume = intel_legacy_submission_resume;
dev_priv->gt.cleanup_engine = intel_engine_cleanup;
} else {
dev_priv->gt.resume = intel_lr_context_resume;
dev_priv->gt.cleanup_engine = intel_logical_ring_cleanup;
}
@ -4530,7 +4555,7 @@ int i915_gem_init(struct drm_device *dev)
* for all other failure, such as an allocation failure, bail.
*/
DRM_ERROR("Failed to initialize GPU, declaring it wedged\n");
set_bit(I915_WEDGED, &dev_priv->gpu_error.flags);
i915_gem_set_wedged(dev_priv);
ret = 0;
}

View File

@ -420,22 +420,6 @@ static void i915_gem_context_unpin(struct i915_gem_context *ctx,
}
}
void i915_gem_context_reset(struct drm_device *dev)
{
struct drm_i915_private *dev_priv = to_i915(dev);
lockdep_assert_held(&dev->struct_mutex);
if (i915.enable_execlists) {
struct i915_gem_context *ctx;
list_for_each_entry(ctx, &dev_priv->context_list, link)
intel_lr_context_reset(dev_priv, ctx);
}
i915_gem_context_lost(dev_priv);
}
int i915_gem_context_init(struct drm_device *dev)
{
struct drm_i915_private *dev_priv = to_i915(dev);

View File

@ -994,6 +994,7 @@ int i915_guc_submission_enable(struct drm_i915_private *dev_priv)
struct intel_guc *guc = &dev_priv->guc;
struct i915_guc_client *client;
struct intel_engine_cs *engine;
struct drm_i915_gem_request *request;
/* client for execbuf submission */
client = guc_client_alloc(dev_priv,
@ -1010,9 +1011,14 @@ int i915_guc_submission_enable(struct drm_i915_private *dev_priv)
guc_init_doorbell_hw(guc);
/* Take over from manual control of ELSP (execlists) */
for_each_engine(engine, dev_priv)
for_each_engine(engine, dev_priv) {
engine->submit_request = i915_guc_submit;
/* Replay the current set of previously submitted requests */
list_for_each_entry(request, &engine->request_list, link)
i915_guc_submit(request);
}
return 0;
}

View File

@ -211,6 +211,8 @@ void intel_engine_init_hangcheck(struct intel_engine_cs *engine)
{
memset(&engine->hangcheck, 0, sizeof(engine->hangcheck));
clear_bit(engine->id, &engine->i915->gpu_error.missed_irq_rings);
if (intel_engine_has_waiter(engine))
i915_queue_hangcheck(engine->i915);
}
static void intel_engine_init_requests(struct intel_engine_cs *engine)
@ -230,7 +232,6 @@ static void intel_engine_init_requests(struct intel_engine_cs *engine)
*/
void intel_engine_setup_common(struct intel_engine_cs *engine)
{
INIT_LIST_HEAD(&engine->buffers);
INIT_LIST_HEAD(&engine->execlist_queue);
spin_lock_init(&engine->execlist_lock);
@ -306,6 +307,18 @@ int intel_engine_init_common(struct intel_engine_cs *engine)
return 0;
}
void intel_engine_reset_irq(struct intel_engine_cs *engine)
{
struct drm_i915_private *dev_priv = engine->i915;
spin_lock_irq(&dev_priv->irq_lock);
if (intel_engine_has_waiter(engine))
engine->irq_enable(engine);
else
engine->irq_disable(engine);
spin_unlock_irq(&dev_priv->irq_lock);
}
/**
* intel_engines_cleanup_common - cleans up the engine state created by
* the common initiailizers.

View File

@ -1222,11 +1222,16 @@ static void lrc_init_hws(struct intel_engine_cs *engine)
static int gen8_init_common_ring(struct intel_engine_cs *engine)
{
struct drm_i915_private *dev_priv = engine->i915;
int ret;
ret = intel_mocs_init_engine(engine);
if (ret)
return ret;
lrc_init_hws(engine);
I915_WRITE_IMR(engine,
~(engine->irq_enable_mask | engine->irq_keep_mask));
intel_engine_reset_irq(engine);
I915_WRITE(RING_HWSTAM(engine->mmio_base), 0xffffffff);
I915_WRITE(RING_MODE_GEN7(engine),
@ -1237,7 +1242,10 @@ static int gen8_init_common_ring(struct intel_engine_cs *engine)
intel_engine_init_hangcheck(engine);
return intel_mocs_init_engine(engine);
if (!execlists_elsp_idle(engine))
execlists_submit_ports(engine);
return 0;
}
static int gen8_init_render_ring(struct intel_engine_cs *engine)
@ -1273,6 +1281,36 @@ static int gen9_init_render_ring(struct intel_engine_cs *engine)
return init_workarounds_ring(engine);
}
static void reset_common_ring(struct intel_engine_cs *engine,
struct drm_i915_gem_request *request)
{
struct drm_i915_private *dev_priv = engine->i915;
struct execlist_port *port = engine->execlist_port;
struct intel_context *ce = &request->ctx->engine[engine->id];
/* Move the RING_HEAD onto the breadcrumb, past the hanging batch */
ce->lrc_reg_state[CTX_RING_HEAD+1] = request->postfix;
request->ring->head = request->postfix;
request->ring->last_retired_head = -1;
intel_ring_update_space(request->ring);
if (i915.enable_guc_submission)
return;
/* Catch up with any missed context-switch interrupts */
I915_WRITE(RING_CONTEXT_STATUS_PTR(engine), _MASKED_FIELD(0xffff, 0));
if (request->ctx != port[0].request->ctx) {
i915_gem_request_put(port[0].request);
port[0] = port[1];
memset(&port[1], 0, sizeof(port[1]));
}
/* CS is stopped, and we will resubmit both ports on resume */
GEM_BUG_ON(request->ctx != port[0].request->ctx);
port[0].count = 0;
port[1].count = 0;
}
static int intel_logical_ring_emit_pdps(struct drm_i915_gem_request *req)
{
struct i915_hw_ppgtt *ppgtt = req->ctx->ppgtt;
@ -1635,6 +1673,7 @@ logical_ring_default_vfuncs(struct intel_engine_cs *engine)
{
/* Default vfuncs which can be overriden by each engine. */
engine->init_hw = gen8_init_common_ring;
engine->reset_hw = reset_common_ring;
engine->emit_flush = gen8_emit_flush;
engine->emit_request = gen8_emit_request;
engine->submit_request = execlists_submit_request;
@ -2087,9 +2126,9 @@ error_deref_obj:
return ret;
}
void intel_lr_context_reset(struct drm_i915_private *dev_priv,
struct i915_gem_context *ctx)
void intel_lr_context_resume(struct drm_i915_private *dev_priv)
{
struct i915_gem_context *ctx = dev_priv->kernel_context;
struct intel_engine_cs *engine;
for_each_engine(engine, dev_priv) {

View File

@ -87,8 +87,7 @@ void intel_lr_context_unpin(struct i915_gem_context *ctx,
struct drm_i915_private;
void intel_lr_context_reset(struct drm_i915_private *dev_priv,
struct i915_gem_context *ctx);
void intel_lr_context_resume(struct drm_i915_private *dev_priv);
uint64_t intel_lr_context_descriptor(struct i915_gem_context *ctx,
struct intel_engine_cs *engine);

View File

@ -564,6 +564,8 @@ static int init_ring_common(struct intel_engine_cs *engine)
else
intel_ring_setup_status_page(engine);
intel_engine_reset_irq(engine);
/* Enforce ordering by reading HEAD register back */
I915_READ_HEAD(engine);
@ -577,34 +579,33 @@ static int init_ring_common(struct intel_engine_cs *engine)
if (I915_READ_HEAD(engine))
DRM_DEBUG("%s initialization failed [head=%08x], fudging\n",
engine->name, I915_READ_HEAD(engine));
I915_WRITE_HEAD(engine, 0);
(void)I915_READ_HEAD(engine);
intel_ring_update_space(ring);
I915_WRITE_HEAD(engine, ring->head);
I915_WRITE_TAIL(engine, ring->tail);
(void)I915_READ_TAIL(engine);
I915_WRITE_CTL(engine,
((ring->size - PAGE_SIZE) & RING_NR_PAGES)
| RING_VALID);
/* If the head is still not zero, the ring is dead */
if (wait_for((I915_READ_CTL(engine) & RING_VALID) != 0 &&
I915_READ_START(engine) == i915_ggtt_offset(ring->vma) &&
(I915_READ_HEAD(engine) & HEAD_ADDR) == 0, 50)) {
if (intel_wait_for_register_fw(dev_priv, RING_CTL(engine->mmio_base),
RING_VALID, RING_VALID,
50)) {
DRM_ERROR("%s initialization failed "
"ctl %08x (valid? %d) head %08x tail %08x start %08x [expected %08x]\n",
"ctl %08x (valid? %d) head %08x [%08x] tail %08x [%08x] start %08x [expected %08x]\n",
engine->name,
I915_READ_CTL(engine),
I915_READ_CTL(engine) & RING_VALID,
I915_READ_HEAD(engine), I915_READ_TAIL(engine),
I915_READ_HEAD(engine), ring->head,
I915_READ_TAIL(engine), ring->tail,
I915_READ_START(engine),
i915_ggtt_offset(ring->vma));
ret = -EIO;
goto out;
}
ring->last_retired_head = -1;
ring->head = I915_READ_HEAD(engine);
ring->tail = I915_READ_TAIL(engine) & TAIL_ADDR;
intel_ring_update_space(ring);
intel_engine_init_hangcheck(engine);
out:
@ -613,6 +614,15 @@ out:
return ret;
}
static void reset_ring_common(struct intel_engine_cs *engine,
struct drm_i915_gem_request *request)
{
struct intel_ring *ring = request->ring;
ring->head = request->postfix;
ring->last_retired_head = -1;
}
static int intel_ring_workarounds_emit(struct drm_i915_gem_request *req)
{
struct intel_ring *ring = req->ring;
@ -2007,7 +2017,6 @@ intel_engine_create_ring(struct intel_engine_cs *engine, int size)
}
ring->vma = vma;
list_add(&ring->link, &engine->buffers);
return ring;
}
@ -2015,7 +2024,6 @@ void
intel_ring_free(struct intel_ring *ring)
{
i915_vma_put(ring->vma);
list_del(&ring->link);
kfree(ring);
}
@ -2169,6 +2177,16 @@ void intel_engine_cleanup(struct intel_engine_cs *engine)
engine->i915 = NULL;
}
void intel_legacy_submission_resume(struct drm_i915_private *dev_priv)
{
struct intel_engine_cs *engine;
for_each_engine(engine, dev_priv) {
engine->buffer->head = engine->buffer->tail;
engine->buffer->last_retired_head = -1;
}
}
int intel_ring_alloc_request_extras(struct drm_i915_gem_request *request)
{
int ret;
@ -2654,6 +2672,7 @@ static void intel_ring_default_vfuncs(struct drm_i915_private *dev_priv,
intel_ring_init_semaphores(dev_priv, engine);
engine->init_hw = init_ring_common;
engine->reset_hw = reset_ring_common;
engine->emit_request = i9xx_emit_request;
if (i915.semaphores)

View File

@ -87,7 +87,6 @@ struct intel_ring {
void *vaddr;
struct intel_engine_cs *engine;
struct list_head link;
struct list_head request_list;
@ -157,7 +156,6 @@ struct intel_engine_cs {
u32 mmio_base;
unsigned int irq_shift;
struct intel_ring *buffer;
struct list_head buffers;
/* Rather than have every client wait upon all user interrupts,
* with the herd waking after every interrupt and each doing the
@ -211,6 +209,8 @@ struct intel_engine_cs {
void (*irq_disable)(struct intel_engine_cs *engine);
int (*init_hw)(struct intel_engine_cs *engine);
void (*reset_hw)(struct intel_engine_cs *engine,
struct drm_i915_gem_request *req);
int (*init_context)(struct drm_i915_gem_request *req);
@ -444,6 +444,8 @@ void intel_ring_free(struct intel_ring *ring);
void intel_engine_stop(struct intel_engine_cs *engine);
void intel_engine_cleanup(struct intel_engine_cs *engine);
void intel_legacy_submission_resume(struct drm_i915_private *dev_priv);
int intel_ring_alloc_request_extras(struct drm_i915_gem_request *request);
int __must_check intel_ring_begin(struct drm_i915_gem_request *req, int n);
@ -482,6 +484,7 @@ int __intel_ring_space(int head, int tail, int size);
void intel_ring_update_space(struct intel_ring *ring);
void intel_engine_init_seqno(struct intel_engine_cs *engine, u32 seqno);
void intel_engine_reset_irq(struct intel_engine_cs *engine);
void intel_engine_setup_common(struct intel_engine_cs *engine);
int intel_engine_init_common(struct intel_engine_cs *engine);