diff --git a/drivers/gpu/drm/xe/xe_gt_tlb_invalidation.c b/drivers/gpu/drm/xe/xe_gt_tlb_invalidation.c
index 29819dc820c5..e2b85559257c 100644
--- a/drivers/gpu/drm/xe/xe_gt_tlb_invalidation.c
+++ b/drivers/gpu/drm/xe/xe_gt_tlb_invalidation.c
@@ -25,7 +25,7 @@ static void xe_gt_tlb_fence_timeout(struct work_struct *work)
 					     tlb_invalidation.fence_tdr.work);
 	struct xe_gt_tlb_invalidation_fence *fence, *next;
 
-	mutex_lock(&gt->uc.guc.ct.lock);
+	spin_lock_irq(&gt->tlb_invalidation.pending_lock);
 	list_for_each_entry_safe(fence, next,
 				 &gt->tlb_invalidation.pending_fences, link) {
 		s64 since_inval_ms = ktime_ms_delta(ktime_get(),
@@ -47,7 +47,7 @@ static void xe_gt_tlb_fence_timeout(struct work_struct *work)
 		queue_delayed_work(system_wq,
 				   &gt->tlb_invalidation.fence_tdr,
 				   TLB_TIMEOUT);
-	mutex_unlock(&gt->uc.guc.ct.lock);
+	spin_unlock_irq(&gt->tlb_invalidation.pending_lock);
 }
 
 /**
@@ -63,6 +63,7 @@ int xe_gt_tlb_invalidation_init(struct xe_gt *gt)
 {
 	gt->tlb_invalidation.seqno = 1;
 	INIT_LIST_HEAD(&gt->tlb_invalidation.pending_fences);
+	spin_lock_init(&gt->tlb_invalidation.pending_lock);
 	spin_lock_init(&gt->tlb_invalidation.lock);
 	gt->tlb_invalidation.fence_context = dma_fence_context_alloc(1);
 	INIT_DELAYED_WORK(&gt->tlb_invalidation.fence_tdr,
@@ -72,14 +73,20 @@ int xe_gt_tlb_invalidation_init(struct xe_gt *gt)
 }
 
 static void
-invalidation_fence_signal(struct xe_gt_tlb_invalidation_fence *fence)
+__invalidation_fence_signal(struct xe_gt_tlb_invalidation_fence *fence)
 {
 	trace_xe_gt_tlb_invalidation_fence_signal(fence);
-	list_del(&fence->link);
 	dma_fence_signal(&fence->base);
 	dma_fence_put(&fence->base);
 }
 
+static void
+invalidation_fence_signal(struct xe_gt_tlb_invalidation_fence *fence)
+{
+	list_del(&fence->link);
+	__invalidation_fence_signal(fence);
+}
+
 /**
  * xe_gt_tlb_invalidation_reset - Initialize GT TLB invalidation reset
  * @gt: graphics tile
@@ -98,6 +105,7 @@ void xe_gt_tlb_invalidation_reset(struct xe_gt *gt)
 	 */
 
 	mutex_lock(&gt->uc.guc.ct.lock);
+	spin_lock_irq(&gt->tlb_invalidation.pending_lock);
 	cancel_delayed_work(&gt->tlb_invalidation.fence_tdr);
 	/*
 	 * We might have various kworkers waiting for TLB flushes to complete
@@ -116,9 +124,23 @@ void xe_gt_tlb_invalidation_reset(struct xe_gt *gt)
 	list_for_each_entry_safe(fence, next,
 				 &gt->tlb_invalidation.pending_fences, link)
 		invalidation_fence_signal(fence);
+	spin_unlock_irq(&gt->tlb_invalidation.pending_lock);
 	mutex_unlock(&gt->uc.guc.ct.lock);
 }
 
+static bool tlb_invalidation_seqno_past(struct xe_gt *gt, int seqno)
+{
+	int seqno_recv = READ_ONCE(gt->tlb_invalidation.seqno_recv);
+
+	if (seqno - seqno_recv < -(TLB_INVALIDATION_SEQNO_MAX / 2))
+		return false;
+
+	if (seqno - seqno_recv > (TLB_INVALIDATION_SEQNO_MAX / 2))
+		return true;
+
+	return seqno_recv >= seqno;
+}
+
 static int send_tlb_invalidation(struct xe_guc *guc,
 				 struct xe_gt_tlb_invalidation_fence *fence,
 				 u32 *action, int len)
@@ -126,7 +148,6 @@ static int send_tlb_invalidation(struct xe_guc *guc,
 	struct xe_gt *gt = guc_to_gt(guc);
 	int seqno;
 	int ret;
-	bool queue_work;
 
 	/*
 	 * XXX: The seqno algorithm relies on TLB invalidation being processed
@@ -137,21 +158,35 @@ static int send_tlb_invalidation(struct xe_guc *guc,
 	mutex_lock(&guc->ct.lock);
 	seqno = gt->tlb_invalidation.seqno;
 	if (fence) {
-		queue_work = list_empty(&gt->tlb_invalidation.pending_fences);
 		fence->seqno = seqno;
-		list_add_tail(&fence->link,
-			      &gt->tlb_invalidation.pending_fences);
 		trace_xe_gt_tlb_invalidation_fence_send(fence);
 	}
 	action[1] = seqno;
 	ret = xe_guc_ct_send_locked(&guc->ct, action, len,
 				    G2H_LEN_DW_TLB_INVALIDATE, 1);
 	if (!ret && fence) {
-		fence->invalidation_time = ktime_get();
-		if (queue_work)
-			queue_delayed_work(system_wq,
-					   &gt->tlb_invalidation.fence_tdr,
-					   TLB_TIMEOUT);
+		spin_lock_irq(&gt->tlb_invalidation.pending_lock);
+		/*
+		 * We haven't actually published the TLB fence as per
+		 * pending_fences, but in theory our seqno could have already
+		 * been written as we acquired the pending_lock. In such a case
+		 * we can just go ahead and signal the fence here.
+		 */
+		if (tlb_invalidation_seqno_past(gt, seqno)) {
+			__invalidation_fence_signal(fence);
+		} else {
+			fence->invalidation_time = ktime_get();
+			list_add_tail(&fence->link,
+				      &gt->tlb_invalidation.pending_fences);
+
+			if (list_is_singular(&gt->tlb_invalidation.pending_fences))
+				queue_delayed_work(system_wq,
+						   &gt->tlb_invalidation.fence_tdr,
+						   TLB_TIMEOUT);
+		}
+		spin_unlock_irq(&gt->tlb_invalidation.pending_lock);
+	} else if (ret < 0 && fence) {
+		__invalidation_fence_signal(fence);
 	}
 	if (!ret) {
 		gt->tlb_invalidation.seqno = (gt->tlb_invalidation.seqno + 1) %
@@ -160,8 +195,6 @@ static int send_tlb_invalidation(struct xe_guc *guc,
 			gt->tlb_invalidation.seqno = 1;
 		ret = seqno;
 	}
-	if (ret < 0 && fence)
-		invalidation_fence_signal(fence);
 	mutex_unlock(&guc->ct.lock);
 	return ret;
 }
@@ -276,19 +309,6 @@ int xe_gt_tlb_invalidation_vma(struct xe_gt *gt,
 	return ret;
 }
 
-static bool tlb_invalidation_seqno_past(struct xe_gt *gt, int seqno)
-{
-	int seqno_recv = READ_ONCE(gt->tlb_invalidation.seqno_recv);
-
-	if (seqno - seqno_recv < -(TLB_INVALIDATION_SEQNO_MAX / 2))
-		return false;
-
-	if (seqno - seqno_recv > (TLB_INVALIDATION_SEQNO_MAX / 2))
-		return true;
-
-	return seqno_recv >= seqno;
-}
-
 /**
  * xe_gt_tlb_invalidation_wait - Wait for TLB to complete
  * @gt: graphics tile
@@ -336,22 +356,31 @@ int xe_gt_tlb_invalidation_wait(struct xe_gt *gt, int seqno)
 int xe_guc_tlb_invalidation_done_handler(struct xe_guc *guc, u32 *msg, u32 len)
 {
 	struct xe_gt *gt = guc_to_gt(guc);
-	struct xe_gt_tlb_invalidation_fence *fence;
-	int expected_seqno;
-
-	lockdep_assert_held(&guc->ct.lock);
+	struct xe_gt_tlb_invalidation_fence *fence, *next;
+	unsigned long flags;
 
 	if (unlikely(len != 1))
 		return -EPROTO;
 
-	/* Sanity check on seqno */
-	expected_seqno = (gt->tlb_invalidation.seqno_recv + 1) %
-		TLB_INVALIDATION_SEQNO_MAX;
-	if (!expected_seqno)
-		expected_seqno = 1;
-	if (drm_WARN_ON(&gt_to_xe(gt)->drm, expected_seqno != msg[0])) {
-		drm_err(&gt_to_xe(gt)->drm, "TLB expected_seqno(%d) != msg(%u)\n",
-			expected_seqno, msg[0]);
+	/*
+	 * This can also be run both directly from the IRQ handler and also in
	 * process_g2h_msg(). Only one may process any individual CT message,
+	 * however the order they are processed here could result in skipping a
+	 * seqno. To handle that we just process all the seqnos from the last
+	 * seqno_recv up to and including the one in msg[0]. The delta should be
+	 * very small so there shouldn't be much of pending_fences we actually
+	 * need to iterate over here.
+	 *
+	 * From GuC POV we expect the seqnos to always appear in-order, so if we
+	 * see something later in the timeline we can be sure that anything
+	 * appearing earlier has already signalled, just that we have yet to
+	 * officially process the CT message like if racing against
+	 * process_g2h_msg().
+	 */
+	spin_lock_irqsave(&gt->tlb_invalidation.pending_lock, flags);
+	if (tlb_invalidation_seqno_past(gt, msg[0])) {
+		spin_unlock_irqrestore(&gt->tlb_invalidation.pending_lock, flags);
+		return 0;
 	}
 
 	/*
@@ -361,19 +390,24 @@ int xe_guc_tlb_invalidation_done_handler(struct xe_guc *guc, u32 *msg, u32 len)
 	WRITE_ONCE(gt->tlb_invalidation.seqno_recv, msg[0]);
 	wake_up_all(&guc->ct.wq);
 
-	fence = list_first_entry_or_null(&gt->tlb_invalidation.pending_fences,
-					 typeof(*fence), link);
-	if (fence)
+	list_for_each_entry_safe(fence, next,
+				 &gt->tlb_invalidation.pending_fences, link) {
 		trace_xe_gt_tlb_invalidation_fence_recv(fence);
-	if (fence && tlb_invalidation_seqno_past(gt, fence->seqno)) {
+
+		if (!tlb_invalidation_seqno_past(gt, fence->seqno))
+			break;
+
 		invalidation_fence_signal(fence);
-		if (!list_empty(&gt->tlb_invalidation.pending_fences))
-			mod_delayed_work(system_wq,
-					 &gt->tlb_invalidation.fence_tdr,
-					 TLB_TIMEOUT);
-		else
-			cancel_delayed_work(&gt->tlb_invalidation.fence_tdr);
 	}
 
+	if (!list_empty(&gt->tlb_invalidation.pending_fences))
+		mod_delayed_work(system_wq,
+				 &gt->tlb_invalidation.fence_tdr,
+				 TLB_TIMEOUT);
+	else
+		cancel_delayed_work(&gt->tlb_invalidation.fence_tdr);
+
+	spin_unlock_irqrestore(&gt->tlb_invalidation.pending_lock, flags);
+
 	return 0;
 }
diff --git a/drivers/gpu/drm/xe/xe_gt_types.h b/drivers/gpu/drm/xe/xe_gt_types.h
index 7d4de019f9a5..28b8e8a86fc9 100644
--- a/drivers/gpu/drm/xe/xe_gt_types.h
+++ b/drivers/gpu/drm/xe/xe_gt_types.h
@@ -163,6 +163,11 @@ struct xe_gt {
 		 * invaliations, protected by CT lock
 		 */
 		struct list_head pending_fences;
+		/**
+		 * @pending_lock: protects @pending_fences and updating
+		 * @seqno_recv.
+		 */
+		spinlock_t pending_lock;
 		/**
 		 * @fence_tdr: schedules a delayed call to
 		 * xe_gt_tlb_fence_timeout after the timeut interval is over.
diff --git a/drivers/gpu/drm/xe/xe_guc_ct.c b/drivers/gpu/drm/xe/xe_guc_ct.c
index 0b086d17c083..9fb5fd4391d2 100644
--- a/drivers/gpu/drm/xe/xe_guc_ct.c
+++ b/drivers/gpu/drm/xe/xe_guc_ct.c
@@ -994,15 +994,8 @@ static int g2h_read(struct xe_guc_ct *ct, u32 *msg, bool fast_path)
 			return 0;
 
 		switch (FIELD_GET(GUC_HXG_EVENT_MSG_0_ACTION, msg[1])) {
-		/*
-		 * FIXME: We really should process
-		 * XE_GUC_ACTION_TLB_INVALIDATION_DONE here in the fast-path as
-		 * these critical for page fault performance. We currently can't
-		 * due to TLB invalidation done algorithm expecting the seqno
-		 * returned in-order. With some small changes to the algorithm
-		 * and locking we should be able to support out-of-order seqno.
-		 */
 		case XE_GUC_ACTION_REPORT_PAGE_FAULT_REQ_DESC:
+		case XE_GUC_ACTION_TLB_INVALIDATION_DONE:
 			break;	/* Process these in fast-path */
 		default:
 			return 0;
@@ -1056,8 +1049,7 @@ void xe_guc_ct_fast_path(struct xe_guc_ct *ct)
 	struct xe_device *xe = ct_to_xe(ct);
 	int len;
 
-	if (!xe_device_in_fault_mode(xe) ||
-	    !xe_device_mem_access_get_if_ongoing(xe))
+	if (!xe_device_mem_access_get_if_ongoing(xe))
 		return;
 
 	spin_lock(&ct->fast_lock);
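
A note on the seqno handling above: tlb_invalidation_seqno_past() treats the seqno space as circular, so a difference of more than half of TLB_INVALIDATION_SEQNO_MAX is taken to mean one side has wrapped. Below is a minimal userspace sketch of that comparison, not driver code; seqno_past() and the SEQNO_MAX value are stand-ins for illustration only.

/*
 * Illustrative userspace sketch (not part of the patch): mirrors the
 * wraparound-aware comparison in tlb_invalidation_seqno_past(). SEQNO_MAX
 * is a placeholder wrap point, not the driver's TLB_INVALIDATION_SEQNO_MAX.
 */
#include <stdbool.h>
#include <stdio.h>

#define SEQNO_MAX 0x100	/* placeholder wrap point for the demo */

static bool seqno_past(int seqno_recv, int seqno)
{
	/* seqno is post-wrap but the receiver has not wrapped yet: not past */
	if (seqno - seqno_recv < -(SEQNO_MAX / 2))
		return false;

	/* seqno is far ahead numerically: the receiver wrapped, so it is past */
	if (seqno - seqno_recv > (SEQNO_MAX / 2))
		return true;

	/* both on the same side of the wrap point: plain comparison */
	return seqno_recv >= seqno;
}

int main(void)
{
	/* no wrap: seqno 5 is past once seqno_recv reaches 5 */
	printf("%d\n", seqno_past(5, 5));		/* 1 */
	/* receiver wrapped (recv=3, fence seqno=SEQNO_MAX-1): already past */
	printf("%d\n", seqno_past(3, SEQNO_MAX - 1));	/* 1 */
	/* fence seqno wrapped (seqno=2) but recv still pre-wrap: not past */
	printf("%d\n", seqno_past(SEQNO_MAX - 1, 2));	/* 0 */
	return 0;
}

The half-range threshold is what lets a plain signed subtraction distinguish "already received" from "not yet received" on either side of the wrap, provided fewer than SEQNO_MAX/2 invalidations are ever outstanding at once.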
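On the done-handler change: fences sit on pending_fences in submission (seqno) order, so handling the G2H for a later seqno can safely complete every earlier pending fence as well, which is what keeps things correct when the fast-path and process_g2h_msg() hand off messages out of order. The rough standalone sketch below illustrates that walk; demo_fence, pending[] and done_handler() are invented for the example, the real locking is omitted, and seqno wraparound is ignored (the driver uses tlb_invalidation_seqno_past() for that).

/*
 * Illustrative userspace sketch (not part of the patch): a G2H for seqno N
 * completes every pending fence with seqno <= N, so a later message being
 * processed first still leaves nothing stranded.
 */
#include <stdio.h>

struct demo_fence {
	int seqno;
	int signalled;
};

/* stands in for pending_fences, kept in submission order */
static struct demo_fence pending[] = {
	{ .seqno = 10 }, { .seqno = 11 }, { .seqno = 12 }, { .seqno = 13 },
};

static int seqno_recv;

static void done_handler(int msg_seqno)
{
	unsigned int i;

	/* stale message: the other path already moved seqno_recv past it */
	if (seqno_recv >= msg_seqno)
		return;

	seqno_recv = msg_seqno;

	/* signal everything up to and including the received seqno */
	for (i = 0; i < sizeof(pending) / sizeof(pending[0]); i++) {
		if (pending[i].seqno > seqno_recv)
			break;
		if (!pending[i].signalled) {
			pending[i].signalled = 1;
			printf("signalled seqno %d\n", pending[i].seqno);
		}
	}
}

int main(void)
{
	done_handler(12);	/* later message processed first: signals 10..12 */
	done_handler(11);	/* stale message: nothing left to do */
	done_handler(13);	/* signals the remaining fence */
	return 0;
}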