drm/i915: Beware temporary wedging when determining -EIO
At a few points in our uABI, we check to see if the driver is wedged and
report -EIO back to the user in that case. However, as we perform the
check and reset asynchronously (where once before they were both
serialised by the struct_mutex), we may instead see the temporary wedging
used to cancel inflight rendering to avoid a deadlock during reset
(caused either by us timing out in our reset handler,
i915_wedge_on_timeout, or with malice aforethought in intel_reset_prepare
for a stuck modeset). If we suspect this is the case, that is we see a
wedged driver *and* a reset in progress, then wait until the reset is
resolved before reporting upon the wedged status.

v2: might_sleep() (Mika)

Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=109580
Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Mika Kuoppala <mika.kuoppala@linux.intel.com>
Reviewed-by: Mika Kuoppala <mika.kuoppala@linux.intel.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20190220145637.23503-1-chris@chris-wilson.co.uk
commit c41166f9a1
parent 47ed55a9bb
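Before the diff, a minimal userspace sketch of the pattern this patch adopts. It is an assumption-laden illustration, not kernel code: struct gpu_error, terminally_wedged() and reset_worker() below are hypothetical stand-ins for the i915 error state, with a pthread condition variable standing in for the kernel's wait_event_interruptible() on error->reset_queue. The point it demonstrates: report -EIO for a wedged device only after any in-flight reset has resolved, since a temporary wedge may be lifted by a successful reset.

/*
 * Userspace sketch only; names are hypothetical stand-ins for i915's
 * error state (see the real i915_terminally_wedged() in the diff below).
 * Build with: cc -pthread sketch.c
 */
#include <errno.h>
#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>
#include <unistd.h>

struct gpu_error {                      /* stand-in for struct i915_gpu_error */
        bool wedged;                    /* stand-in for I915_WEDGED */
        bool reset_backoff;             /* stand-in for I915_RESET_BACKOFF */
        pthread_mutex_t lock;
        pthread_cond_t reset_queue;     /* signalled when a reset completes */
};

/* Return 0 if usable, -EIO only once any in-flight reset has finished. */
static int terminally_wedged(struct gpu_error *e)
{
        int ret;

        pthread_mutex_lock(&e->lock);
        if (!e->wedged) {
                ret = 0;
        } else {
                /* Reset still in progress? Maybe we will recover? */
                while (e->reset_backoff)
                        pthread_cond_wait(&e->reset_queue, &e->lock);
                ret = e->wedged ? -EIO : 0;
        }
        pthread_mutex_unlock(&e->lock);
        return ret;
}

static void *reset_worker(void *arg)
{
        struct gpu_error *e = arg;

        sleep(1);                       /* pretend the reset takes a while */
        pthread_mutex_lock(&e->lock);
        e->wedged = false;              /* reset succeeded: unwedge */
        e->reset_backoff = false;
        pthread_cond_broadcast(&e->reset_queue);
        pthread_mutex_unlock(&e->lock);
        return NULL;
}

int main(void)
{
        struct gpu_error e = {
                .wedged = true,         /* temporarily wedged ... */
                .reset_backoff = true,  /* ... while a reset is in flight */
                .lock = PTHREAD_MUTEX_INITIALIZER,
                .reset_queue = PTHREAD_COND_INITIALIZER,
        };
        pthread_t tid;

        pthread_create(&tid, NULL, reset_worker, &e);

        /* Checking the wedged bit alone would misreport -EIO here; instead
         * we block until the reset resolves and correctly report 0. */
        printf("terminally_wedged() = %d\n", terminally_wedged(&e));

        pthread_join(tid, NULL);
        return 0;
}

The kernel version in the diff below additionally bails out with -EAGAIN if struct_mutex is held (to avoid deadlocking against the reset path, which still takes that mutex) and with -EINTR if the wait is interrupted.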
@@ -3833,11 +3833,18 @@ static const struct file_operations i915_cur_wm_latency_fops = {
 static int
 i915_wedged_get(void *data, u64 *val)
 {
-        struct drm_i915_private *dev_priv = data;
+        int ret = i915_terminally_wedged(data);
 
-        *val = i915_terminally_wedged(&dev_priv->gpu_error);
-
-        return 0;
+        switch (ret) {
+        case -EIO:
+                *val = 1;
+                return 0;
+        case 0:
+                *val = 0;
+                return 0;
+        default:
+                return ret;
+        }
 }
 
 static int
@@ -3918,7 +3925,7 @@ i915_drop_caches_set(void *data, u64 val)
                 mutex_unlock(&i915->drm.struct_mutex);
         }
 
-        if (val & DROP_RESET_ACTIVE && i915_terminally_wedged(&i915->gpu_error))
+        if (val & DROP_RESET_ACTIVE && i915_terminally_wedged(i915))
                 i915_handle_error(i915, ALL_ENGINES, 0, NULL);
 
         fs_reclaim_acquire(GFP_KERNEL);
@@ -3020,11 +3020,16 @@ int __must_check i915_gem_set_global_seqno(struct drm_device *dev, u32 seqno);
 struct i915_request *
 i915_gem_find_active_request(struct intel_engine_cs *engine);
 
-static inline bool i915_terminally_wedged(struct i915_gpu_error *error)
+static inline bool __i915_wedged(struct i915_gpu_error *error)
 {
         return unlikely(test_bit(I915_WEDGED, &error->flags));
 }
 
+static inline bool i915_reset_failed(struct drm_i915_private *i915)
+{
+        return __i915_wedged(&i915->gpu_error);
+}
+
 static inline u32 i915_reset_count(struct i915_gpu_error *error)
 {
         return READ_ONCE(error->reset_count);
@@ -1928,7 +1928,7 @@ err:
                  * fail). But any other -EIO isn't ours (e.g. swap in failure)
                  * and so needs to be reported.
                  */
-                if (!i915_terminally_wedged(&dev_priv->gpu_error))
+                if (!i915_terminally_wedged(dev_priv))
                         return VM_FAULT_SIGBUS;
                 /* else: fall through */
         case -EAGAIN:
@@ -2958,7 +2958,7 @@ static void assert_kernel_context_is_current(struct drm_i915_private *i915)
         struct intel_engine_cs *engine;
         enum intel_engine_id id;
 
-        if (i915_terminally_wedged(&i915->gpu_error))
+        if (i915_reset_failed(i915))
                 return;
 
         GEM_BUG_ON(i915->gt.active_requests);
@@ -3806,8 +3806,9 @@ i915_gem_ring_throttle(struct drm_device *dev, struct drm_file *file)
         long ret;
 
         /* ABI: return -EIO if already wedged */
-        if (i915_terminally_wedged(&dev_priv->gpu_error))
-                return -EIO;
+        ret = i915_terminally_wedged(dev_priv);
+        if (ret)
+                return ret;
 
         spin_lock(&file_priv->mm.lock);
         list_for_each_entry(request, &file_priv->mm.request_list, client_link) {
@@ -4460,7 +4461,7 @@ void i915_gem_sanitize(struct drm_i915_private *i915)
          * back to defaults, recovering from whatever wedged state we left it
          * in and so worth trying to use the device once more.
          */
-        if (i915_terminally_wedged(&i915->gpu_error))
+        if (i915_terminally_wedged(i915))
                 i915_gem_unset_wedged(i915);
 
         /*
@@ -4504,7 +4505,7 @@ int i915_gem_suspend(struct drm_i915_private *i915)
          * state. Fortunately, the kernel_context is disposable and we do
          * not rely on its state.
          */
-        if (!i915_terminally_wedged(&i915->gpu_error)) {
+        if (!i915_reset_failed(i915)) {
                 ret = i915_gem_switch_to_kernel_context(i915);
                 if (ret)
                         goto err_unlock;
@@ -4625,8 +4626,9 @@ out_unlock:
         return;
 
 err_wedged:
-        if (!i915_terminally_wedged(&i915->gpu_error)) {
-                DRM_ERROR("failed to re-initialize GPU, declaring wedged!\n");
+        if (!i915_reset_failed(i915)) {
+                dev_err(i915->drm.dev,
+                        "Failed to re-initialize GPU, declaring it wedged!\n");
                 i915_gem_set_wedged(i915);
         }
         goto out_unlock;
@@ -4731,10 +4733,9 @@ int i915_gem_init_hw(struct drm_i915_private *dev_priv)
         init_unused_rings(dev_priv);
 
         BUG_ON(!dev_priv->kernel_context);
-        if (i915_terminally_wedged(&dev_priv->gpu_error)) {
-                ret = -EIO;
+        ret = i915_terminally_wedged(dev_priv);
+        if (ret)
                 goto out;
-        }
 
         ret = i915_ppgtt_init_hw(dev_priv);
         if (ret) {
@@ -5107,7 +5108,7 @@ err_uc_misc:
          * wedged. But we only want to do this where the GPU is angry,
          * for all other failure, such as an allocation failure, bail.
          */
-        if (!i915_terminally_wedged(&dev_priv->gpu_error)) {
+        if (!i915_reset_failed(dev_priv)) {
                 i915_load_error(dev_priv,
                                 "Failed to initialize GPU, declaring it wedged!\n");
                 i915_gem_set_wedged(dev_priv);
@@ -559,8 +559,9 @@ i915_request_alloc(struct intel_engine_cs *engine, struct i915_gem_context *ctx)
          * ABI: Before userspace accesses the GPU (e.g. execbuffer), report
          * EIO if the GPU is already wedged.
          */
-        if (i915_terminally_wedged(&i915->gpu_error))
-                return ERR_PTR(-EIO);
+        ret = i915_terminally_wedged(i915);
+        if (ret)
+                return ERR_PTR(ret);
 
         /*
          * Pinning the contexts may generate requests in order to acquire
@@ -1032,7 +1032,7 @@ void i915_reset(struct drm_i915_private *i915,
 
 finish:
         reset_finish(i915);
-        if (!i915_terminally_wedged(error))
+        if (!__i915_wedged(error))
                 reset_restart(i915);
         return;
 
@@ -1253,7 +1253,7 @@ void i915_handle_error(struct drm_i915_private *i915,
          * Try engine reset when available. We fall back to full reset if
          * single reset fails.
          */
-        if (intel_has_reset_engine(i915) && !i915_terminally_wedged(error)) {
+        if (intel_has_reset_engine(i915) && !__i915_wedged(error)) {
                 for_each_engine_masked(engine, i915, engine_mask, tmp) {
                         BUILD_BUG_ON(I915_RESET_MODESET >= I915_RESET_ENGINE);
                         if (test_and_set_bit(I915_RESET_ENGINE + engine->id,
@@ -1339,6 +1339,31 @@ __releases(&i915->gpu_error.reset_backoff_srcu)
         srcu_read_unlock(&error->reset_backoff_srcu, tag);
 }
 
+int i915_terminally_wedged(struct drm_i915_private *i915)
+{
+        struct i915_gpu_error *error = &i915->gpu_error;
+
+        might_sleep();
+
+        if (!__i915_wedged(error))
+                return 0;
+
+        /* Reset still in progress? Maybe we will recover? */
+        if (!test_bit(I915_RESET_BACKOFF, &error->flags))
+                return -EIO;
+
+        /* XXX intel_reset_finish() still takes struct_mutex!!! */
+        if (mutex_is_locked(&i915->drm.struct_mutex))
+                return -EAGAIN;
+
+        if (wait_event_interruptible(error->reset_queue,
+                                     !test_bit(I915_RESET_BACKOFF,
+                                               &error->flags)))
+                return -EINTR;
+
+        return __i915_wedged(error) ? -EIO : 0;
+}
+
 bool i915_reset_flush(struct drm_i915_private *i915)
 {
         int err;
@@ -36,6 +36,8 @@ bool i915_reset_flush(struct drm_i915_private *i915);
 int __must_check i915_reset_trylock(struct drm_i915_private *i915);
 void i915_reset_unlock(struct drm_i915_private *i915, int tag);
 
+int i915_terminally_wedged(struct drm_i915_private *i915);
+
 bool intel_has_gpu_reset(struct drm_i915_private *i915);
 bool intel_has_reset_engine(struct drm_i915_private *i915);
 
@@ -1007,10 +1007,8 @@ static bool ring_is_idle(struct intel_engine_cs *engine)
  */
 bool intel_engine_is_idle(struct intel_engine_cs *engine)
 {
-        struct drm_i915_private *dev_priv = engine->i915;
-
         /* More white lies, if wedged, hw state is inconsistent */
-        if (i915_terminally_wedged(&dev_priv->gpu_error))
+        if (i915_reset_failed(engine->i915))
                 return true;
 
         /* Any inflight/incomplete requests? */
@@ -1054,7 +1052,7 @@ bool intel_engines_are_idle(struct drm_i915_private *dev_priv)
          * If the driver is wedged, HW state may be very inconsistent and
          * report that it is still busy, even though we have stopped using it.
          */
-        if (i915_terminally_wedged(&dev_priv->gpu_error))
+        if (i915_reset_failed(dev_priv))
                 return true;
 
         for_each_engine(engine, dev_priv, id) {
@@ -1496,7 +1494,7 @@ void intel_engine_dump(struct intel_engine_cs *engine,
                 va_end(ap);
         }
 
-        if (i915_terminally_wedged(&engine->i915->gpu_error))
+        if (i915_reset_failed(engine->i915))
                 drm_printf(m, "*** WEDGED ***\n");
 
         drm_printf(m, "\tcurrent seqno %x, last %x, hangcheck %x [%d ms]\n",
@@ -263,7 +263,7 @@ static void i915_hangcheck_elapsed(struct work_struct *work)
         if (!READ_ONCE(dev_priv->gt.awake))
                 return;
 
-        if (i915_terminally_wedged(&dev_priv->gpu_error))
+        if (i915_terminally_wedged(dev_priv))
                 return;
 
         /* As enabling the GPU requires fairly extensive mmio access,
@@ -1764,7 +1764,7 @@ int i915_gem_huge_page_live_selftests(struct drm_i915_private *dev_priv)
                 return 0;
         }
 
-        if (i915_terminally_wedged(&dev_priv->gpu_error))
+        if (i915_terminally_wedged(dev_priv))
                 return 0;
 
         file = mock_file(dev_priv);
@@ -150,7 +150,7 @@ int i915_active_live_selftests(struct drm_i915_private *i915)
                 SUBTEST(live_active_retire),
         };
 
-        if (i915_terminally_wedged(&i915->gpu_error))
+        if (i915_terminally_wedged(i915))
                 return 0;
 
         return i915_subtests(tests, i915);
@@ -248,12 +248,12 @@ static bool always_valid(struct drm_i915_private *i915)
 
 static bool needs_fence_registers(struct drm_i915_private *i915)
 {
-        return !i915_terminally_wedged(&i915->gpu_error);
+        return !i915_terminally_wedged(i915);
 }
 
 static bool needs_mi_store_dword(struct drm_i915_private *i915)
 {
-        if (i915_terminally_wedged(&i915->gpu_error))
+        if (i915_terminally_wedged(i915))
                 return false;
 
         return intel_engine_can_store_dword(i915->engine[RCS]);
@@ -1632,7 +1632,7 @@ int i915_gem_context_live_selftests(struct drm_i915_private *dev_priv)
                 SUBTEST(igt_vm_isolation),
         };
 
-        if (i915_terminally_wedged(&dev_priv->gpu_error))
+        if (i915_terminally_wedged(dev_priv))
                 return 0;
 
         return i915_subtests(tests, dev_priv);
@@ -547,7 +547,7 @@ int i915_gem_evict_live_selftests(struct drm_i915_private *i915)
                 SUBTEST(igt_evict_contexts),
         };
 
-        if (i915_terminally_wedged(&i915->gpu_error))
+        if (i915_terminally_wedged(i915))
                 return 0;
 
         return i915_subtests(tests, i915);
@@ -583,7 +583,7 @@ static int igt_mmap_offset_exhaustion(void *arg)
         for (loop = 0; loop < 3; loop++) {
                 intel_wakeref_t wakeref;
 
-                if (i915_terminally_wedged(&i915->gpu_error))
+                if (i915_terminally_wedged(i915))
                         break;
 
                 obj = i915_gem_object_create_internal(i915, PAGE_SIZE);
@@ -1246,7 +1246,7 @@ int i915_request_live_selftests(struct drm_i915_private *i915)
                 SUBTEST(live_breadcrumbs_smoketest),
         };
 
-        if (i915_terminally_wedged(&i915->gpu_error))
+        if (i915_terminally_wedged(i915))
                 return 0;
 
         return i915_subtests(tests, i915);
@@ -29,5 +29,5 @@ int igt_flush_test(struct drm_i915_private *i915, unsigned int flags)
                 i915_gem_set_wedged(i915);
         }
 
-        return i915_terminally_wedged(&i915->gpu_error) ? -EIO : 0;
+        return i915_terminally_wedged(i915);
 }
@@ -342,7 +342,7 @@ static int igt_hang_sanitycheck(void *arg)
                 timeout = i915_request_wait(rq,
                                             I915_WAIT_LOCKED,
                                             MAX_SCHEDULE_TIMEOUT);
-                if (i915_terminally_wedged(&i915->gpu_error))
+                if (i915_reset_failed(i915))
                         timeout = -EIO;
 
                 i915_request_put(rq);
@@ -383,7 +383,7 @@ static int igt_global_reset(void *arg)
 
         igt_global_reset_unlock(i915);
 
-        if (i915_terminally_wedged(&i915->gpu_error))
+        if (i915_reset_failed(i915))
                 err = -EIO;
 
         return err;
@@ -401,13 +401,13 @@ static int igt_wedged_reset(void *arg)
 
         i915_gem_set_wedged(i915);
 
-        GEM_BUG_ON(!i915_terminally_wedged(&i915->gpu_error));
+        GEM_BUG_ON(!i915_reset_failed(i915));
         i915_reset(i915, ALL_ENGINES, NULL);
 
         intel_runtime_pm_put(i915, wakeref);
         igt_global_reset_unlock(i915);
 
-        return i915_terminally_wedged(&i915->gpu_error) ? -EIO : 0;
+        return i915_reset_failed(i915) ? -EIO : 0;
 }
 
 static bool wait_for_idle(struct intel_engine_cs *engine)
@@ -529,7 +529,7 @@ static int __igt_reset_engine(struct drm_i915_private *i915, bool active)
                         break;
         }
 
-        if (i915_terminally_wedged(&i915->gpu_error))
+        if (i915_reset_failed(i915))
                 err = -EIO;
 
         if (active) {
@@ -843,7 +843,7 @@ unwind:
                         break;
         }
 
-        if (i915_terminally_wedged(&i915->gpu_error))
+        if (i915_reset_failed(i915))
                 err = -EIO;
 
         if (flags & TEST_ACTIVE) {
@@ -969,7 +969,7 @@ unlock:
         mutex_unlock(&i915->drm.struct_mutex);
         igt_global_reset_unlock(i915);
 
-        if (i915_terminally_wedged(&i915->gpu_error))
+        if (i915_reset_failed(i915))
                 return -EIO;
 
         return err;
@@ -1166,7 +1166,7 @@ fini:
 unlock:
         mutex_unlock(&i915->drm.struct_mutex);
 
-        if (i915_terminally_wedged(&i915->gpu_error))
+        if (i915_reset_failed(i915))
                 return -EIO;
 
         return err;
@@ -1372,7 +1372,7 @@ unlock:
         mutex_unlock(&i915->drm.struct_mutex);
         igt_global_reset_unlock(i915);
 
-        if (i915_terminally_wedged(&i915->gpu_error))
+        if (i915_reset_failed(i915))
                 return -EIO;
 
         return err;
@@ -1552,7 +1552,7 @@ static int igt_atomic_reset_engine(struct intel_engine_cs *engine,
                 i915_request_wait(rq,
                                   I915_WAIT_LOCKED,
                                   MAX_SCHEDULE_TIMEOUT);
-                if (i915_terminally_wedged(&i915->gpu_error))
+                if (i915_reset_failed(i915))
                         err = -EIO;
         }
 
@@ -1591,7 +1591,7 @@ static int igt_atomic_reset(void *arg)
 
         /* Flush any requests before we get started and check basics */
         force_reset(i915);
-        if (i915_terminally_wedged(&i915->gpu_error))
+        if (i915_reset_failed(i915))
                 goto unlock;
 
         if (intel_has_gpu_reset(i915)) {
@@ -1665,7 +1665,7 @@ int intel_hangcheck_live_selftests(struct drm_i915_private *i915)
         if (!intel_has_gpu_reset(i915))
                 return 0;
 
-        if (i915_terminally_wedged(&i915->gpu_error))
+        if (i915_terminally_wedged(i915))
                 return -EIO; /* we're long past hope of a successful reset */
 
         wakeref = intel_runtime_pm_get(i915);
@@ -895,7 +895,7 @@ int intel_execlists_live_selftests(struct drm_i915_private *i915)
         if (!HAS_EXECLISTS(i915))
                 return 0;
 
-        if (i915_terminally_wedged(&i915->gpu_error))
+        if (i915_terminally_wedged(i915))
                 return 0;
 
         return i915_subtests(tests, i915);
@@ -181,7 +181,7 @@ static int check_whitelist(struct i915_gem_context *ctx,
         err = 0;
         igt_wedge_on_timeout(&wedge, ctx->i915, HZ / 5) /* a safety net! */
                 err = i915_gem_object_set_to_cpu_domain(results, false);
-        if (i915_terminally_wedged(&ctx->i915->gpu_error))
+        if (i915_terminally_wedged(ctx->i915))
                 err = -EIO;
         if (err)
                 goto out_put;
@@ -510,7 +510,7 @@ int intel_workarounds_live_selftests(struct drm_i915_private *i915)
         };
         int err;
 
-        if (i915_terminally_wedged(&i915->gpu_error))
+        if (i915_terminally_wedged(i915))
                 return 0;
 
         mutex_lock(&i915->drm.struct_mutex);