drm/radeon: rework gpu lockup detection and processing

Previously multiple rings could trigger multiple GPU
resets at the same time.

Signed-off-by: Christian König <deathsimple@vodafone.de>
Reviewed-by: Jerome Glisse <jglisse@redhat.com>
Signed-off-by: Dave Airlie <airlied@redhat.com>
This commit is contained in:
Christian König 2012-05-02 15:11:13 +02:00 committed by Dave Airlie
parent 7bd560e885
commit 36abacaed3
2 changed files with 75 additions and 74 deletions

View File

@ -255,8 +255,7 @@ struct radeon_fence_driver {
volatile uint32_t *cpu_addr;
atomic_t seq;
uint32_t last_seq;
unsigned long last_jiffies;
unsigned long last_timeout;
unsigned long last_activity;
wait_queue_head_t queue;
struct list_head created;
struct list_head emitted;

View File

@ -74,6 +74,10 @@ int radeon_fence_emit(struct radeon_device *rdev, struct radeon_fence *fence)
radeon_fence_ring_emit(rdev, fence->ring, fence);
trace_radeon_fence_emit(rdev->ddev, fence->seq);
fence->emitted = true;
/* are we the first fence on a previously idle ring? */
if (list_empty(&rdev->fence_drv[fence->ring].emitted)) {
rdev->fence_drv[fence->ring].last_activity = jiffies;
}
list_move_tail(&fence->list, &rdev->fence_drv[fence->ring].emitted);
write_unlock_irqrestore(&rdev->fence_lock, irq_flags);
return 0;
@ -85,34 +89,14 @@ static bool radeon_fence_poll_locked(struct radeon_device *rdev, int ring)
struct list_head *i, *n;
uint32_t seq;
bool wake = false;
unsigned long cjiffies;
seq = radeon_fence_read(rdev, ring);
if (seq != rdev->fence_drv[ring].last_seq) {
rdev->fence_drv[ring].last_seq = seq;
rdev->fence_drv[ring].last_jiffies = jiffies;
rdev->fence_drv[ring].last_timeout = RADEON_FENCE_JIFFIES_TIMEOUT;
} else {
cjiffies = jiffies;
if (time_after(cjiffies, rdev->fence_drv[ring].last_jiffies)) {
cjiffies -= rdev->fence_drv[ring].last_jiffies;
if (time_after(rdev->fence_drv[ring].last_timeout, cjiffies)) {
/* update the timeout */
rdev->fence_drv[ring].last_timeout -= cjiffies;
} else {
/* the 500ms timeout is elapsed we should test
* for GPU lockup
*/
rdev->fence_drv[ring].last_timeout = 1;
}
} else {
/* wrap around update last jiffies, we will just wait
* a little longer
*/
rdev->fence_drv[ring].last_jiffies = cjiffies;
}
if (seq == rdev->fence_drv[ring].last_seq)
return false;
}
rdev->fence_drv[ring].last_seq = seq;
rdev->fence_drv[ring].last_activity = jiffies;
n = NULL;
list_for_each(i, &rdev->fence_drv[ring].emitted) {
fence = list_entry(i, struct radeon_fence, list);
@ -207,49 +191,68 @@ int radeon_fence_wait(struct radeon_fence *fence, bool intr)
struct radeon_device *rdev;
unsigned long irq_flags, timeout;
u32 seq;
int r;
int i, r;
bool signaled;
if (fence == NULL) {
WARN(1, "Querying an invalid fence : %p !\n", fence);
return 0;
return -EINVAL;
}
rdev = fence->rdev;
if (radeon_fence_signaled(fence)) {
return 0;
signaled = radeon_fence_signaled(fence);
while (!signaled) {
read_lock_irqsave(&rdev->fence_lock, irq_flags);
timeout = jiffies - RADEON_FENCE_JIFFIES_TIMEOUT;
if (time_after(rdev->fence_drv[fence->ring].last_activity, timeout)) {
/* the normal case, timeout is somewhere before last_activity */
timeout = rdev->fence_drv[fence->ring].last_activity - timeout;
} else {
/* either jiffies wrapped around, or no fence was signaled in the last 500ms
* anyway we will just wait for the minimum amount and then check for a lockup */
timeout = 1;
}
timeout = rdev->fence_drv[fence->ring].last_timeout;
retry:
/* save current sequence used to check for GPU lockup */
/* save current sequence value used to check for GPU lockups */
seq = rdev->fence_drv[fence->ring].last_seq;
read_unlock_irqrestore(&rdev->fence_lock, irq_flags);
trace_radeon_fence_wait_begin(rdev->ddev, seq);
if (intr) {
radeon_irq_kms_sw_irq_get(rdev, fence->ring);
r = wait_event_interruptible_timeout(rdev->fence_drv[fence->ring].queue,
radeon_fence_signaled(fence), timeout);
if (intr) {
r = wait_event_interruptible_timeout(
rdev->fence_drv[fence->ring].queue,
(signaled = radeon_fence_signaled(fence)), timeout);
} else {
r = wait_event_timeout(
rdev->fence_drv[fence->ring].queue,
(signaled = radeon_fence_signaled(fence)), timeout);
}
radeon_irq_kms_sw_irq_put(rdev, fence->ring);
if (unlikely(r < 0)) {
return r;
}
} else {
radeon_irq_kms_sw_irq_get(rdev, fence->ring);
r = wait_event_timeout(rdev->fence_drv[fence->ring].queue,
radeon_fence_signaled(fence), timeout);
radeon_irq_kms_sw_irq_put(rdev, fence->ring);
}
trace_radeon_fence_wait_end(rdev->ddev, seq);
if (unlikely(!radeon_fence_signaled(fence))) {
/* we were interrupted for some reason and fence isn't
* isn't signaled yet, resume wait
*/
if (unlikely(!signaled)) {
/* we were interrupted for some reason and fence
* isn't signaled yet, resume waiting */
if (r) {
timeout = r;
goto retry;
continue;
}
/* don't protect read access to rdev->fence_drv[t].last_seq
* if we experiencing a lockup the value doesn't change
*/
if (seq == rdev->fence_drv[fence->ring].last_seq &&
radeon_ring_is_lockup(rdev, fence->ring, &rdev->ring[fence->ring])) {
write_lock_irqsave(&rdev->fence_lock, irq_flags);
/* check if sequence value has changed since last_activity */
if (seq != rdev->fence_drv[fence->ring].last_seq) {
write_unlock_irqrestore(&rdev->fence_lock, irq_flags);
continue;
}
/* change sequence value on all rings, so nobody else thinks there is a lockup */
for (i = 0; i < RADEON_NUM_RINGS; ++i)
rdev->fence_drv[i].last_seq -= 0x10000;
write_unlock_irqrestore(&rdev->fence_lock, irq_flags);
if (radeon_ring_is_lockup(rdev, fence->ring, &rdev->ring[fence->ring])) {
/* good news we believe it's a lockup */
printk(KERN_WARNING "GPU lockup (waiting for 0x%08X last fence id 0x%08X)\n",
@ -260,13 +263,12 @@ retry:
r = radeon_gpu_reset(rdev);
if (r)
return r;
}
timeout = RADEON_FENCE_JIFFIES_TIMEOUT;
write_lock_irqsave(&rdev->fence_lock, irq_flags);
rdev->fence_drv[fence->ring].last_timeout = RADEON_FENCE_JIFFIES_TIMEOUT;
rdev->fence_drv[fence->ring].last_jiffies = jiffies;
rdev->fence_drv[fence->ring].last_activity = jiffies;
write_unlock_irqrestore(&rdev->fence_lock, irq_flags);
goto retry;
}
}
}
return 0;
}