diff --git a/drivers/gpu/drm/xe/xe_gt_tlb_invalidation.c b/drivers/gpu/drm/xe/xe_gt_tlb_invalidation.c
index 29819dc820c5..e2b85559257c 100644
--- a/drivers/gpu/drm/xe/xe_gt_tlb_invalidation.c
+++ b/drivers/gpu/drm/xe/xe_gt_tlb_invalidation.c
@@ -25,7 +25,7 @@ static void xe_gt_tlb_fence_timeout(struct work_struct *work)
 					     tlb_invalidation.fence_tdr.work);
 	struct xe_gt_tlb_invalidation_fence *fence, *next;
 
-	mutex_lock(&gt->uc.guc.ct.lock);
+	spin_lock_irq(&gt->tlb_invalidation.pending_lock);
 	list_for_each_entry_safe(fence, next,
 				 &gt->tlb_invalidation.pending_fences, link) {
 		s64 since_inval_ms = ktime_ms_delta(ktime_get(),
@@ -47,7 +47,7 @@ static void xe_gt_tlb_fence_timeout(struct work_struct *work)
 		queue_delayed_work(system_wq,
 				   &gt->tlb_invalidation.fence_tdr,
 				   TLB_TIMEOUT);
-	mutex_unlock(&gt->uc.guc.ct.lock);
+	spin_unlock_irq(&gt->tlb_invalidation.pending_lock);
 }
 
 /**
@@ -63,6 +63,7 @@ int xe_gt_tlb_invalidation_init(struct xe_gt *gt)
 {
 	gt->tlb_invalidation.seqno = 1;
 	INIT_LIST_HEAD(&gt->tlb_invalidation.pending_fences);
+	spin_lock_init(&gt->tlb_invalidation.pending_lock);
 	spin_lock_init(&gt->tlb_invalidation.lock);
 	gt->tlb_invalidation.fence_context = dma_fence_context_alloc(1);
 	INIT_DELAYED_WORK(&gt->tlb_invalidation.fence_tdr,
@@ -72,14 +73,20 @@ int xe_gt_tlb_invalidation_init(struct xe_gt *gt)
 }
 
 static void
-invalidation_fence_signal(struct xe_gt_tlb_invalidation_fence *fence)
+__invalidation_fence_signal(struct xe_gt_tlb_invalidation_fence *fence)
 {
 	trace_xe_gt_tlb_invalidation_fence_signal(fence);
-	list_del(&fence->link);
 	dma_fence_signal(&fence->base);
 	dma_fence_put(&fence->base);
 }
 
+static void
+invalidation_fence_signal(struct xe_gt_tlb_invalidation_fence *fence)
+{
+	list_del(&fence->link);
+	__invalidation_fence_signal(fence);
+}
+
 /**
  * xe_gt_tlb_invalidation_reset - Initialize GT TLB invalidation reset
  * @gt: graphics tile
@@ -98,6 +105,7 @@ void xe_gt_tlb_invalidation_reset(struct xe_gt *gt)
 	 */
 
 	mutex_lock(&gt->uc.guc.ct.lock);
+	spin_lock_irq(&gt->tlb_invalidation.pending_lock);
 	cancel_delayed_work(&gt->tlb_invalidation.fence_tdr);
 	/*
 	 * We might have various kworkers waiting for TLB flushes to complete
@@ -116,9 +124,23 @@ void xe_gt_tlb_invalidation_reset(struct xe_gt *gt)
 	list_for_each_entry_safe(fence, next,
 				 &gt->tlb_invalidation.pending_fences, link)
 		invalidation_fence_signal(fence);
+	spin_unlock_irq(&gt->tlb_invalidation.pending_lock);
 	mutex_unlock(&gt->uc.guc.ct.lock);
 }
 
+static bool tlb_invalidation_seqno_past(struct xe_gt *gt, int seqno)
+{
+	int seqno_recv = READ_ONCE(gt->tlb_invalidation.seqno_recv);
+
+	if (seqno - seqno_recv < -(TLB_INVALIDATION_SEQNO_MAX / 2))
+		return false;
+
+	if (seqno - seqno_recv > (TLB_INVALIDATION_SEQNO_MAX / 2))
+		return true;
+
+	return seqno_recv >= seqno;
+}
+
 static int send_tlb_invalidation(struct xe_guc *guc,
 				 struct xe_gt_tlb_invalidation_fence *fence,
 				 u32 *action, int len)
@@ -126,7 +148,6 @@ static int send_tlb_invalidation(struct xe_guc *guc,
 	struct xe_gt *gt = guc_to_gt(guc);
 	int seqno;
 	int ret;
-	bool queue_work;
 
 	/*
 	 * XXX: The seqno algorithm relies on TLB invalidation being processed
@@ -137,21 +158,35 @@ static int send_tlb_invalidation(struct xe_guc *guc,
 	mutex_lock(&guc->ct.lock);
 	seqno = gt->tlb_invalidation.seqno;
 	if (fence) {
-		queue_work = list_empty(&gt->tlb_invalidation.pending_fences);
 		fence->seqno = seqno;
-		list_add_tail(&fence->link,
-			      &gt->tlb_invalidation.pending_fences);
 		trace_xe_gt_tlb_invalidation_fence_send(fence);
 	}
 	action[1] = seqno;
 	ret = xe_guc_ct_send_locked(&guc->ct, action, len,
 				    G2H_LEN_DW_TLB_INVALIDATE, 1);
 	if (!ret && fence) {
-		fence->invalidation_time = ktime_get();
-		if (queue_work)
-			queue_delayed_work(system_wq,
-					   &gt->tlb_invalidation.fence_tdr,
-					   TLB_TIMEOUT);
+		spin_lock_irq(&gt->tlb_invalidation.pending_lock);
+		/*
+		 * We haven't actually published the TLB fence as per
+		 * pending_fences, but in theory our seqno could have already
+		 * been written as we acquired the pending_lock. In such a case
+		 * we can just go ahead and signal the fence here.
+		 */
+		if (tlb_invalidation_seqno_past(gt, seqno)) {
+			__invalidation_fence_signal(fence);
+		} else {
+			fence->invalidation_time = ktime_get();
+			list_add_tail(&fence->link,
+				      &gt->tlb_invalidation.pending_fences);
+
+			if (list_is_singular(&gt->tlb_invalidation.pending_fences))
+				queue_delayed_work(system_wq,
+						   &gt->tlb_invalidation.fence_tdr,
+						   TLB_TIMEOUT);
+		}
+		spin_unlock_irq(&gt->tlb_invalidation.pending_lock);
+	} else if (ret < 0 && fence) {
+		__invalidation_fence_signal(fence);
 	}
 	if (!ret) {
 		gt->tlb_invalidation.seqno = (gt->tlb_invalidation.seqno + 1) %
@@ -160,8 +195,6 @@ static int send_tlb_invalidation(struct xe_guc *guc,
 			gt->tlb_invalidation.seqno = 1;
 		ret = seqno;
 	}
-	if (ret < 0 && fence)
-		invalidation_fence_signal(fence);
 	mutex_unlock(&guc->ct.lock);
 	return ret;
 }
@@ -276,19 +309,6 @@ int xe_gt_tlb_invalidation_vma(struct xe_gt *gt,
 	return ret;
 }
 
-static bool tlb_invalidation_seqno_past(struct xe_gt *gt, int seqno)
-{
-	int seqno_recv = READ_ONCE(gt->tlb_invalidation.seqno_recv);
-
-	if (seqno - seqno_recv < -(TLB_INVALIDATION_SEQNO_MAX / 2))
-		return false;
-
-	if (seqno - seqno_recv > (TLB_INVALIDATION_SEQNO_MAX / 2))
-		return true;
-
-	return seqno_recv >= seqno;
-}
-
 /**
  * xe_gt_tlb_invalidation_wait - Wait for TLB to complete
  * @gt: graphics tile
@@ -336,22 +356,31 @@ int xe_gt_tlb_invalidation_wait(struct xe_gt *gt, int seqno)
 int xe_guc_tlb_invalidation_done_handler(struct xe_guc *guc, u32 *msg, u32 len)
 {
 	struct xe_gt *gt = guc_to_gt(guc);
-	struct xe_gt_tlb_invalidation_fence *fence;
-	int expected_seqno;
-
-	lockdep_assert_held(&guc->ct.lock);
+	struct xe_gt_tlb_invalidation_fence *fence, *next;
+	unsigned long flags;
 
 	if (unlikely(len != 1))
 		return -EPROTO;
 
-	/* Sanity check on seqno */
-	expected_seqno = (gt->tlb_invalidation.seqno_recv + 1) %
-		TLB_INVALIDATION_SEQNO_MAX;
-	if (!expected_seqno)
-		expected_seqno = 1;
-	if (drm_WARN_ON(&gt_to_xe(gt)->drm, expected_seqno != msg[0])) {
-		drm_err(&gt_to_xe(gt)->drm, "TLB expected_seqno(%d) != msg(%u)\n",
-			expected_seqno, msg[0]);
+	/*
+	 * This can also be run both directly from the IRQ handler and also in
	 * process_g2h_msg(). Only one may process any individual CT message,
+	 * however the order they are processed here could result in skipping a
+	 * seqno. To handle that we just process all the seqnos from the last
+	 * seqno_recv up to and including the one in msg[0]. The delta should be
+	 * very small so there shouldn't be much of pending_fences we actually
+	 * need to iterate over here.
+	 *
+	 * From GuC POV we expect the seqnos to always appear in-order, so if we
+	 * see something later in the timeline we can be sure that anything
+	 * appearing earlier has already signalled, just that we have yet to
+	 * officially process the CT message like if racing against
+	 * process_g2h_msg().
+	 */
+	spin_lock_irqsave(&gt->tlb_invalidation.pending_lock, flags);
+	if (tlb_invalidation_seqno_past(gt, msg[0])) {
+		spin_unlock_irqrestore(&gt->tlb_invalidation.pending_lock, flags);
+		return 0;
 	}
 
 	/*
@@ -361,19 +390,24 @@ int xe_guc_tlb_invalidation_done_handler(struct xe_guc *guc, u32 *msg, u32 len)
 	WRITE_ONCE(gt->tlb_invalidation.seqno_recv, msg[0]);
 	wake_up_all(&guc->ct.wq);
 
-	fence = list_first_entry_or_null(&gt->tlb_invalidation.pending_fences,
-					 typeof(*fence), link);
-	if (fence)
+	list_for_each_entry_safe(fence, next,
+				 &gt->tlb_invalidation.pending_fences, link) {
 		trace_xe_gt_tlb_invalidation_fence_recv(fence);
-	if (fence && tlb_invalidation_seqno_past(gt, fence->seqno)) {
+
+		if (!tlb_invalidation_seqno_past(gt, fence->seqno))
+			break;
+
 		invalidation_fence_signal(fence);
-		if (!list_empty(&gt->tlb_invalidation.pending_fences))
-			mod_delayed_work(system_wq,
-					 &gt->tlb_invalidation.fence_tdr,
-					 TLB_TIMEOUT);
-		else
-			cancel_delayed_work(&gt->tlb_invalidation.fence_tdr);
 	}
 
+	if (!list_empty(&gt->tlb_invalidation.pending_fences))
+		mod_delayed_work(system_wq,
+				 &gt->tlb_invalidation.fence_tdr,
+				 TLB_TIMEOUT);
+	else
+		cancel_delayed_work(&gt->tlb_invalidation.fence_tdr);
+
+	spin_unlock_irqrestore(&gt->tlb_invalidation.pending_lock, flags);
+
 	return 0;
 }
diff --git a/drivers/gpu/drm/xe/xe_gt_types.h b/drivers/gpu/drm/xe/xe_gt_types.h
index 7d4de019f9a5..28b8e8a86fc9 100644
--- a/drivers/gpu/drm/xe/xe_gt_types.h
+++ b/drivers/gpu/drm/xe/xe_gt_types.h
@@ -163,6 +163,11 @@ struct xe_gt {
 		 * invaliations, protected by CT lock
 		 */
 		struct list_head pending_fences;
+		/**
+		 * @pending_lock: protects @pending_fences and updating
+		 * @seqno_recv.
+		 */
+		spinlock_t pending_lock;
 		/**
 		 * @fence_tdr: schedules a delayed call to
 		 * xe_gt_tlb_fence_timeout after the timeut interval is over.
diff --git a/drivers/gpu/drm/xe/xe_guc_ct.c b/drivers/gpu/drm/xe/xe_guc_ct.c
index 0b086d17c083..9fb5fd4391d2 100644
--- a/drivers/gpu/drm/xe/xe_guc_ct.c
+++ b/drivers/gpu/drm/xe/xe_guc_ct.c
@@ -994,15 +994,8 @@ static int g2h_read(struct xe_guc_ct *ct, u32 *msg, bool fast_path)
 			return 0;
 
 		switch (FIELD_GET(GUC_HXG_EVENT_MSG_0_ACTION, msg[1])) {
-		/*
-		 * FIXME: We really should process
-		 * XE_GUC_ACTION_TLB_INVALIDATION_DONE here in the fast-path as
-		 * these critical for page fault performance. We currently can't
-		 * due to TLB invalidation done algorithm expecting the seqno
-		 * returned in-order. With some small changes to the algorithm
-		 * and locking we should be able to support out-of-order seqno.
-		 */
 		case XE_GUC_ACTION_REPORT_PAGE_FAULT_REQ_DESC:
+		case XE_GUC_ACTION_TLB_INVALIDATION_DONE:
 			break;	/* Process these in fast-path */
 		default:
 			return 0;
@@ -1056,8 +1049,7 @@ void xe_guc_ct_fast_path(struct xe_guc_ct *ct)
 	struct xe_device *xe = ct_to_xe(ct);
 	int len;
 
-	if (!xe_device_in_fault_mode(xe) ||
-	    !xe_device_mem_access_get_if_ongoing(xe))
+	if (!xe_device_mem_access_get_if_ongoing(xe))
 		return;
 
 	spin_lock(&ct->fast_lock);
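
A note on the seqno handling above: tlb_invalidation_seqno_past() treats the seqno space as circular, so a difference of more than half of TLB_INVALIDATION_SEQNO_MAX is taken to mean one side has wrapped. Below is a minimal userspace sketch of that comparison, not driver code; seqno_past() and the SEQNO_MAX value are stand-ins for illustration only.

/*
 * Illustrative userspace sketch (not part of the patch): mirrors the
 * wraparound-aware comparison in tlb_invalidation_seqno_past(). SEQNO_MAX
 * is a placeholder wrap point, not the driver's TLB_INVALIDATION_SEQNO_MAX.
 */
#include <stdbool.h>
#include <stdio.h>

#define SEQNO_MAX 0x100	/* placeholder wrap point for the demo */

static bool seqno_past(int seqno_recv, int seqno)
{
	/* seqno is post-wrap but the receiver has not wrapped yet: not past */
	if (seqno - seqno_recv < -(SEQNO_MAX / 2))
		return false;

	/* seqno is far ahead numerically: the receiver wrapped, so it is past */
	if (seqno - seqno_recv > (SEQNO_MAX / 2))
		return true;

	/* both on the same side of the wrap point: plain comparison */
	return seqno_recv >= seqno;
}

int main(void)
{
	/* no wrap: seqno 5 is past once seqno_recv reaches 5 */
	printf("%d\n", seqno_past(5, 5));		/* 1 */
	/* receiver wrapped (recv=3, fence seqno=SEQNO_MAX-1): already past */
	printf("%d\n", seqno_past(3, SEQNO_MAX - 1));	/* 1 */
	/* fence seqno wrapped (seqno=2) but recv still pre-wrap: not past */
	printf("%d\n", seqno_past(SEQNO_MAX - 1, 2));	/* 0 */
	return 0;
}

The half-range threshold is what lets a plain signed subtraction distinguish "already received" from "not yet received" on either side of the wrap, provided fewer than SEQNO_MAX/2 invalidations are ever outstanding at once.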
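On the done-handler change: fences sit on pending_fences in submission (seqno) order, so handling the G2H for a later seqno can safely complete every earlier pending fence as well, which is what keeps things correct when the fast-path and process_g2h_msg() hand off messages out of order. The rough standalone sketch below illustrates that walk; demo_fence, pending[] and done_handler() are invented for the example, the real locking is omitted, and seqno wraparound is ignored (the driver uses tlb_invalidation_seqno_past() for that).

/*
 * Illustrative userspace sketch (not part of the patch): a G2H for seqno N
 * completes every pending fence with seqno <= N, so a later message being
 * processed first still leaves nothing stranded.
 */
#include <stdio.h>

struct demo_fence {
	int seqno;
	int signalled;
};

/* stands in for pending_fences, kept in submission order */
static struct demo_fence pending[] = {
	{ .seqno = 10 }, { .seqno = 11 }, { .seqno = 12 }, { .seqno = 13 },
};

static int seqno_recv;

static void done_handler(int msg_seqno)
{
	unsigned int i;

	/* stale message: the other path already moved seqno_recv past it */
	if (seqno_recv >= msg_seqno)
		return;

	seqno_recv = msg_seqno;

	/* signal everything up to and including the received seqno */
	for (i = 0; i < sizeof(pending) / sizeof(pending[0]); i++) {
		if (pending[i].seqno > seqno_recv)
			break;
		if (!pending[i].signalled) {
			pending[i].signalled = 1;
			printf("signalled seqno %d\n", pending[i].seqno);
		}
	}
}

int main(void)
{
	done_handler(12);	/* later message processed first: signals 10..12 */
	done_handler(11);	/* stale message: nothing left to do */
	done_handler(13);	/* signals the remaining fence */
	return 0;
}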