drm/i915/execlists: Preempt-to-busy

When using a global seqno, we required a precise stop-the-world event to handle preemption and unwind the global seqno counter. To accomplish this, we would preempt to a special out-of-band context and wait for the machine to report that it was idle. Given an idle machine, we could very precisely see which requests had completed and which we needed to feed back into the run queue. However, now that we have scrapped the global seqno, we no longer need to precisely unwind the global counter and only track requests by their per-context seqno. This allows us to loosely unwind inflight requests while scheduling a preemption, with the enormous caveat that the requests we put back on the run queue are still _inflight_ (until the preemption request is complete). This makes request tracking much more messy, as at any point we can now see a completed request that we believe is not currently scheduled for execution. We also have to be careful not to rewind RING_TAIL past RING_HEAD on preempting to the running context, and for this we use a semaphore to prevent completion of the request before continuing. To accomplish this feat, we change how we track requests scheduled to the HW. Instead of appending our requests onto a single list as we submit, we track each submission to ELSP as its own block. Then upon receiving the CS preemption event, we promote the pending block to the inflight block (discarding what was previously being tracked). As normal CS completion events arrive, we then remove stale entries from the inflight tracker. v2: Be a tinge paranoid and ensure we flush the write into the HWS page for the GPU semaphore to pick up in a timely fashion. Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk> Reviewed-by: Mika Kuoppala <mika.kuoppala@linux.intel.com> Link: https://patchwork.freedesktop.org/patch/msgid/20190620142052.19311-1-chris@chris-wilson.co.uk
2019-06-20 15:20:51 +01:00
parent 9e138ea1bd
commit 22b7a426bb
13 changed files with 521 additions and 623 deletions
--- a/drivers/gpu/drm/i915/gem/i915_gem_context.c
+++ b/drivers/gpu/drm/i915/gem/i915_gem_context.c
@@ -646,7 +646,7 @@ static void init_contexts(struct drm_i915_private *i915)

 static bool needs_preempt_context(struct drm_i915_private *i915)
 {
-	return HAS_EXECLISTS(i915);
+	return USES_GUC_SUBMISSION(i915);
 }

 int i915_gem_contexts_init(struct drm_i915_private *dev_priv)
--- a/drivers/gpu/drm/i915/gt/intel_context_types.h
+++ b/drivers/gpu/drm/i915/gt/intel_context_types.h
@@ -13,6 +13,7 @@
 #include <linux/types.h>

 #include "i915_active_types.h"
+#include "i915_utils.h"
 #include "intel_engine_types.h"
 #include "intel_sseu.h"

@@ -38,6 +39,10 @@ struct intel_context {
 	struct i915_gem_context *gem_context;
 	struct intel_engine_cs *engine;
 	struct intel_engine_cs *inflight;
+#define intel_context_inflight(ce) ptr_mask_bits((ce)->inflight, 2)
+#define intel_context_inflight_count(ce)  ptr_unmask_bits((ce)->inflight, 2)
+#define intel_context_inflight_inc(ce) ptr_count_inc(&(ce)->inflight)
+#define intel_context_inflight_dec(ce) ptr_count_dec(&(ce)->inflight)

 	struct list_head signal_link;
 	struct list_head signals;
--- a/drivers/gpu/drm/i915/gt/intel_engine.h
+++ b/drivers/gpu/drm/i915/gt/intel_engine.h
@@ -125,71 +125,26 @@ hangcheck_action_to_str(const enum intel_engine_hangcheck_action a)

 void intel_engines_set_scheduler_caps(struct drm_i915_private *i915);

-static inline void
-execlists_set_active(struct intel_engine_execlists *execlists,
-		     unsigned int bit)
-{
-	__set_bit(bit, (unsigned long *)&execlists->active);
-}
-
-static inline bool
-execlists_set_active_once(struct intel_engine_execlists *execlists,
-			  unsigned int bit)
-{
-	return !__test_and_set_bit(bit, (unsigned long *)&execlists->active);
-}
-
-static inline void
-execlists_clear_active(struct intel_engine_execlists *execlists,
-		       unsigned int bit)
-{
-	__clear_bit(bit, (unsigned long *)&execlists->active);
-}
-
-static inline void
-execlists_clear_all_active(struct intel_engine_execlists *execlists)
-{
-	execlists->active = 0;
-}
-
-static inline bool
-execlists_is_active(const struct intel_engine_execlists *execlists,
-		    unsigned int bit)
-{
-	return test_bit(bit, (unsigned long *)&execlists->active);
-}
-
-void execlists_user_begin(struct intel_engine_execlists *execlists,
-			  const struct execlist_port *port);
-void execlists_user_end(struct intel_engine_execlists *execlists);
-
-void
-execlists_cancel_port_requests(struct intel_engine_execlists * const execlists);
-
-struct i915_request *
-execlists_unwind_incomplete_requests(struct intel_engine_execlists *execlists);
-
 static inline unsigned int
 execlists_num_ports(const struct intel_engine_execlists * const execlists)
 {
 	return execlists->port_mask + 1;
 }

-static inline struct execlist_port *
-execlists_port_complete(struct intel_engine_execlists * const execlists,
-			struct execlist_port * const port)
+static inline struct i915_request *
+execlists_active(const struct intel_engine_execlists *execlists)
 {
-	const unsigned int m = execlists->port_mask;
-
-	GEM_BUG_ON(port_index(port, execlists) != 0);
-	GEM_BUG_ON(!execlists_is_active(execlists, EXECLISTS_ACTIVE_USER));
-
-	memmove(port, port + 1, m * sizeof(struct execlist_port));
-	memset(port + m, 0, sizeof(struct execlist_port));
-
-	return port;
+	GEM_BUG_ON(execlists->active - execlists->inflight >
+		   execlists_num_ports(execlists));
+	return READ_ONCE(*execlists->active);
 }

+void
+execlists_cancel_port_requests(struct intel_engine_execlists * const execlists);
+
+struct i915_request *
+execlists_unwind_incomplete_requests(struct intel_engine_execlists *execlists);
+
 static inline u32
 intel_read_status_page(const struct intel_engine_cs *engine, int reg)
 {
--- a/drivers/gpu/drm/i915/gt/intel_engine_cs.c
+++ b/drivers/gpu/drm/i915/gt/intel_engine_cs.c
@@ -508,6 +508,10 @@ void intel_engine_init_execlists(struct intel_engine_cs *engine)
 	GEM_BUG_ON(!is_power_of_2(execlists_num_ports(execlists)));
 	GEM_BUG_ON(execlists_num_ports(execlists) > EXECLIST_MAX_PORTS);

+	memset(execlists->pending, 0, sizeof(execlists->pending));
+	execlists->active =
+		memset(execlists->inflight, 0, sizeof(execlists->inflight));
+
 	execlists->queue_priority_hint = INT_MIN;
 	execlists->queue = RB_ROOT_CACHED;
 }
@@ -1152,7 +1156,7 @@ bool intel_engine_is_idle(struct intel_engine_cs *engine)
 		return true;

 	/* Waiting to drain ELSP? */
-	if (READ_ONCE(engine->execlists.active)) {
+	if (execlists_active(&engine->execlists)) {
 		struct tasklet_struct *t = &engine->execlists.tasklet;

 		synchronize_hardirq(engine->i915->drm.irq);
@@ -1169,7 +1173,7 @@ bool intel_engine_is_idle(struct intel_engine_cs *engine)
 		/* Otherwise flush the tasklet if it was on another cpu */
 		tasklet_unlock_wait(t);

-		if (READ_ONCE(engine->execlists.active))
+		if (execlists_active(&engine->execlists))
 			return false;
 	}

@@ -1367,6 +1371,7 @@ static void intel_engine_print_registers(struct intel_engine_cs *engine,
 	}

 	if (HAS_EXECLISTS(dev_priv)) {
+		struct i915_request * const *port, *rq;
 		const u32 *hws =
 			&engine->status_page.addr[I915_HWS_CSB_BUF0_INDEX];
 		const u8 num_entries = execlists->csb_size;
@@ -1399,27 +1404,33 @@ static void intel_engine_print_registers(struct intel_engine_cs *engine,
 		}

 		spin_lock_irqsave(&engine->active.lock, flags);
-		for (idx = 0; idx < execlists_num_ports(execlists); idx++) {
-			struct i915_request *rq;
-			unsigned int count;
+		for (port = execlists->active; (rq = *port); port++) {
+			char hdr[80];
+			int len;
+
+			len = snprintf(hdr, sizeof(hdr),
+				       "\t\tActive[%d: ",
+				       (int)(port - execlists->active));
+			if (!i915_request_signaled(rq))
+				len += snprintf(hdr + len, sizeof(hdr) - len,
+						"ring:{start:%08x, hwsp:%08x, seqno:%08x}, ",
+						i915_ggtt_offset(rq->ring->vma),
+						rq->timeline->hwsp_offset,
+						hwsp_seqno(rq));
+			snprintf(hdr + len, sizeof(hdr) - len, "rq: ");
+			print_request(m, rq, hdr);
+		}
+		for (port = execlists->pending; (rq = *port); port++) {
 			char hdr[80];

-			rq = port_unpack(&execlists->port[idx], &count);
-			if (!rq) {
-				drm_printf(m, "\t\tELSP[%d] idle\n", idx);
-			} else if (!i915_request_signaled(rq)) {
-				snprintf(hdr, sizeof(hdr),
-					 "\t\tELSP[%d] count=%d, ring:{start:%08x, hwsp:%08x, seqno:%08x}, rq: ",
-					 idx, count,
-					 i915_ggtt_offset(rq->ring->vma),
-					 rq->timeline->hwsp_offset,
-					 hwsp_seqno(rq));
-				print_request(m, rq, hdr);
-			} else {
-				print_request(m, rq, "\t\tELSP[%d] rq: ");
-			}
+			snprintf(hdr, sizeof(hdr),
+				 "\t\tPending[%d] ring:{start:%08x, hwsp:%08x, seqno:%08x}, rq: ",
+				 (int)(port - execlists->pending),
+				 i915_ggtt_offset(rq->ring->vma),
+				 rq->timeline->hwsp_offset,
+				 hwsp_seqno(rq));
+			print_request(m, rq, hdr);
 		}
-		drm_printf(m, "\t\tHW active? 0x%x\n", execlists->active);
 		spin_unlock_irqrestore(&engine->active.lock, flags);
 	} else if (INTEL_GEN(dev_priv) > 6) {
 		drm_printf(m, "\tPP_DIR_BASE: 0x%08x\n",
@@ -1583,15 +1594,19 @@ int intel_enable_engine_stats(struct intel_engine_cs *engine)
 	}

 	if (engine->stats.enabled++ == 0) {
-		const struct execlist_port *port = execlists->port;
-		unsigned int num_ports = execlists_num_ports(execlists);
+		struct i915_request * const *port;
+		struct i915_request *rq;

 		engine->stats.enabled_at = ktime_get();

 		/* XXX submission method oblivious? */
-		while (num_ports-- && port_isset(port)) {
+		for (port = execlists->active; (rq = *port); port++)
 			engine->stats.active++;
-			port++;
+
+		for (port = execlists->pending; (rq = *port); port++) {
+			/* Exclude any contexts already counted in active */
+			if (intel_context_inflight_count(rq->hw_context) == 1)
+				engine->stats.active++;
 		}

 		if (engine->stats.active)
--- a/drivers/gpu/drm/i915/gt/intel_engine_types.h
+++ b/drivers/gpu/drm/i915/gt/intel_engine_types.h
@@ -172,51 +172,28 @@ struct intel_engine_execlists {
 	 */
 	u32 __iomem *ctrl_reg;

-	/**
-	 * @port: execlist port states
-	 *
-	 * For each hardware ELSP (ExecList Submission Port) we keep
-	 * track of the last request and the number of times we submitted
-	 * that port to hw. We then count the number of times the hw reports
-	 * a context completion or preemption. As only one context can
-	 * be active on hw, we limit resubmission of context to port[0]. This
-	 * is called Lite Restore, of the context.
-	 */
-	struct execlist_port {
-		/**
-		 * @request_count: combined request and submission count
-		 */
-		struct i915_request *request_count;
-#define EXECLIST_COUNT_BITS 2
-#define port_request(p) ptr_mask_bits((p)->request_count, EXECLIST_COUNT_BITS)
-#define port_count(p) ptr_unmask_bits((p)->request_count, EXECLIST_COUNT_BITS)
-#define port_pack(rq, count) ptr_pack_bits(rq, count, EXECLIST_COUNT_BITS)
-#define port_unpack(p, count) ptr_unpack_bits((p)->request_count, count, EXECLIST_COUNT_BITS)
-#define port_set(p, packed) ((p)->request_count = (packed))
-#define port_isset(p) ((p)->request_count)
-#define port_index(p, execlists) ((p) - (execlists)->port)
-
-		/**
-		 * @context_id: context ID for port
-		 */
-		GEM_DEBUG_DECL(u32 context_id);
-
 #define EXECLIST_MAX_PORTS 2
-	} port[EXECLIST_MAX_PORTS];
-
 	/**
-	 * @active: is the HW active? We consider the HW as active after
-	 * submitting any context for execution and until we have seen the
-	 * last context completion event. After that, we do not expect any
-	 * more events until we submit, and so can park the HW.
-	 *
-	 * As we have a small number of different sources from which we feed
-	 * the HW, we track the state of each inside a single bitfield.
+	 * @active: the currently known context executing on HW
 	 */
-	unsigned int active;
-#define EXECLISTS_ACTIVE_USER 0
-#define EXECLISTS_ACTIVE_PREEMPT 1
-#define EXECLISTS_ACTIVE_HWACK 2
+	struct i915_request * const *active;
+	/**
+	 * @inflight: the set of contexts submitted and acknowledged by HW
+	 *
+	 * The set of inflight contexts is managed by reading CS events
+	 * from the HW. On a context-switch event (not preemption), we
+	 * know the HW has transitioned from port0 to port1, and we
+	 * advance our inflight/active tracking accordingly.
+	 */
+	struct i915_request *inflight[EXECLIST_MAX_PORTS + 1 /* sentinel */];
+	/**
+	 * @pending: the next set of contexts submitted to ELSP
+	 *
+	 * We store the array of contexts that we submit to HW (via ELSP) and
+	 * promote them to the inflight array once HW has signaled the
+	 * preemption or idle-to-active event.
+	 */
+	struct i915_request *pending[EXECLIST_MAX_PORTS + 1];

 	/**
 	 * @port_mask: number of execlist ports - 1
@@ -257,11 +234,6 @@ struct intel_engine_execlists {
 	 */
 	u32 *csb_status;

-	/**
-	 * @preempt_complete_status: expected CSB upon completing preemption
-	 */
-	u32 preempt_complete_status;
-
 	/**
 	 * @csb_size: context status buffer FIFO size
 	 */
--- a/drivers/gpu/drm/i915/gt/intel_lrc.c
+++ b/drivers/gpu/drm/i915/gt/intel_lrc.c
--- a/drivers/gpu/drm/i915/i915_gpu_error.c
+++ b/drivers/gpu/drm/i915/i915_gpu_error.c
@@ -1248,10 +1248,10 @@ static void error_record_engine_registers(struct i915_gpu_state *error,
 	}
 }

-static void record_request(struct i915_request *request,
+static void record_request(const struct i915_request *request,
 			   struct drm_i915_error_request *erq)
 {
-	struct i915_gem_context *ctx = request->gem_context;
+	const struct i915_gem_context *ctx = request->gem_context;

 	erq->flags = request->fence.flags;
 	erq->context = request->fence.context;
@@ -1315,20 +1315,15 @@ static void engine_record_requests(struct intel_engine_cs *engine,
 	ee->num_requests = count;
 }

-static void error_record_engine_execlists(struct intel_engine_cs *engine,
+static void error_record_engine_execlists(const struct intel_engine_cs *engine,
 					  struct drm_i915_error_engine *ee)
 {
 	const struct intel_engine_execlists * const execlists = &engine->execlists;
-	unsigned int n;
+	struct i915_request * const *port = execlists->active;
+	unsigned int n = 0;

-	for (n = 0; n < execlists_num_ports(execlists); n++) {
-		struct i915_request *rq = port_request(&execlists->port[n]);
-
-		if (!rq)
-			break;
-
-		record_request(rq, &ee->execlist[n]);
-	}
+	while (*port)
+		record_request(*port++, &ee->execlist[n++]);

 	ee->num_ports = n;
 }
--- a/drivers/gpu/drm/i915/i915_request.c
+++ b/drivers/gpu/drm/i915/i915_request.c
@@ -276,6 +276,12 @@ static bool i915_request_retire(struct i915_request *rq)

 	local_irq_disable();

+	/*
+	 * We only loosely track inflight requests across preemption,
+	 * and so we may find ourselves attempting to retire a _completed_
+	 * request that we have removed from the HW and put back on a run
+	 * queue.
+	 */
 	spin_lock(&rq->engine->active.lock);
 	list_del(&rq->sched.link);
 	spin_unlock(&rq->engine->active.lock);
--- a/drivers/gpu/drm/i915/i915_request.h
+++ b/drivers/gpu/drm/i915/i915_request.h
@@ -28,6 +28,7 @@
 #include <linux/dma-fence.h>
 #include <linux/lockdep.h>

+#include "gt/intel_context_types.h"
 #include "gt/intel_engine_types.h"

 #include "i915_gem.h"
--- a/drivers/gpu/drm/i915/i915_scheduler.c
+++ b/drivers/gpu/drm/i915/i915_scheduler.c
@@ -179,8 +179,7 @@ static inline int rq_prio(const struct i915_request *rq)

 static void kick_submission(struct intel_engine_cs *engine, int prio)
 {
-	const struct i915_request *inflight =
-		port_request(engine->execlists.port);
+	const struct i915_request *inflight = *engine->execlists.active;

 	/*
 	 * If we are already the currently executing context, don't
--- a/drivers/gpu/drm/i915/i915_utils.h
+++ b/drivers/gpu/drm/i915/i915_utils.h
@@ -131,6 +131,18 @@ __check_struct_size(size_t base, size_t arr, size_t count, size_t *size)
 	((typeof(ptr))((unsigned long)(ptr) | __bits));			\
 })

+#define ptr_count_dec(p_ptr) do {					\
+	typeof(p_ptr) __p = (p_ptr);					\
+	unsigned long __v = (unsigned long)(*__p);			\
+	*__p = (typeof(*p_ptr))(--__v);					\
+} while (0)
+
+#define ptr_count_inc(p_ptr) do {					\
+	typeof(p_ptr) __p = (p_ptr);					\
+	unsigned long __v = (unsigned long)(*__p);			\
+	*__p = (typeof(*p_ptr))(++__v);					\
+} while (0)
+
 #define page_mask_bits(ptr) ptr_mask_bits(ptr, PAGE_SHIFT)
 #define page_unmask_bits(ptr) ptr_unmask_bits(ptr, PAGE_SHIFT)
 #define page_pack_bits(ptr, bits) ptr_pack_bits(ptr, bits, PAGE_SHIFT)
--- a/drivers/gpu/drm/i915/intel_guc_submission.c
+++ b/drivers/gpu/drm/i915/intel_guc_submission.c
@@ -32,7 +32,11 @@
 #include "intel_guc_submission.h"
 #include "i915_drv.h"

-#define GUC_PREEMPT_FINISHED		0x1
+enum {
+	GUC_PREEMPT_NONE = 0,
+	GUC_PREEMPT_INPROGRESS,
+	GUC_PREEMPT_FINISHED,
+};
 #define GUC_PREEMPT_BREADCRUMB_DWORDS	0x8
 #define GUC_PREEMPT_BREADCRUMB_BYTES	\
 	(sizeof(u32) * GUC_PREEMPT_BREADCRUMB_DWORDS)
@@ -537,15 +541,11 @@ static void guc_add_request(struct intel_guc *guc, struct i915_request *rq)
 	u32 ctx_desc = lower_32_bits(rq->hw_context->lrc_desc);
 	u32 ring_tail = intel_ring_set_tail(rq->ring, rq->tail) / sizeof(u64);

-	spin_lock(&client->wq_lock);
-
 	guc_wq_item_append(client, engine->guc_id, ctx_desc,
 			   ring_tail, rq->fence.seqno);
 	guc_ring_doorbell(client);

 	client->submissions[engine->id] += 1;
-
-	spin_unlock(&client->wq_lock);
 }

 /*
@@ -631,8 +631,9 @@ static void inject_preempt_context(struct work_struct *work)
 	data[6] = intel_guc_ggtt_offset(guc, guc->shared_data);

 	if (WARN_ON(intel_guc_send(guc, data, ARRAY_SIZE(data)))) {
-		execlists_clear_active(&engine->execlists,
-				       EXECLISTS_ACTIVE_PREEMPT);
+		intel_write_status_page(engine,
+					I915_GEM_HWS_PREEMPT,
+					GUC_PREEMPT_NONE);
 		tasklet_schedule(&engine->execlists.tasklet);
 	}

@@ -672,8 +673,6 @@ static void complete_preempt_context(struct intel_engine_cs *engine)
 {
 	struct intel_engine_execlists *execlists = &engine->execlists;

-	GEM_BUG_ON(!execlists_is_active(execlists, EXECLISTS_ACTIVE_PREEMPT));
-
 	if (inject_preempt_hang(execlists))
 		return;

@@ -681,89 +680,90 @@ static void complete_preempt_context(struct intel_engine_cs *engine)
 	execlists_unwind_incomplete_requests(execlists);

 	wait_for_guc_preempt_report(engine);
-	intel_write_status_page(engine, I915_GEM_HWS_PREEMPT, 0);
+	intel_write_status_page(engine, I915_GEM_HWS_PREEMPT, GUC_PREEMPT_NONE);
 }

-/**
- * guc_submit() - Submit commands through GuC
- * @engine: engine associated with the commands
- *
- * The only error here arises if the doorbell hardware isn't functioning
- * as expected, which really shouln't happen.
- */
-static void guc_submit(struct intel_engine_cs *engine)
+static void guc_submit(struct intel_engine_cs *engine,
+		       struct i915_request **out,
+		       struct i915_request **end)
 {
 	struct intel_guc *guc = &engine->i915->guc;
-	struct intel_engine_execlists * const execlists = &engine->execlists;
-	struct execlist_port *port = execlists->port;
-	unsigned int n;
+	struct intel_guc_client *client = guc->execbuf_client;

-	for (n = 0; n < execlists_num_ports(execlists); n++) {
-		struct i915_request *rq;
-		unsigned int count;
+	spin_lock(&client->wq_lock);

-		rq = port_unpack(&port[n], &count);
-		if (rq && count == 0) {
-			port_set(&port[n], port_pack(rq, ++count));
+	do {
+		struct i915_request *rq = *out++;

-			flush_ggtt_writes(rq->ring->vma);
+		flush_ggtt_writes(rq->ring->vma);
+		guc_add_request(guc, rq);
+	} while (out != end);

-			guc_add_request(guc, rq);
-		}
-	}
-}
-
-static void port_assign(struct execlist_port *port, struct i915_request *rq)
-{
-	GEM_BUG_ON(port_isset(port));
-
-	port_set(port, i915_request_get(rq));
+	spin_unlock(&client->wq_lock);
 }

 static inline int rq_prio(const struct i915_request *rq)
 {
-	return rq->sched.attr.priority;
+	return rq->sched.attr.priority | __NO_PREEMPTION;
 }

-static inline int port_prio(const struct execlist_port *port)
+static struct i915_request *schedule_in(struct i915_request *rq, int idx)
 {
-	return rq_prio(port_request(port)) | __NO_PREEMPTION;
+	trace_i915_request_in(rq, idx);
+
+	if (!rq->hw_context->inflight)
+		rq->hw_context->inflight = rq->engine;
+	intel_context_inflight_inc(rq->hw_context);
+
+	return i915_request_get(rq);
 }

-static bool __guc_dequeue(struct intel_engine_cs *engine)
+static void schedule_out(struct i915_request *rq)
+{
+	trace_i915_request_out(rq);
+
+	intel_context_inflight_dec(rq->hw_context);
+	if (!intel_context_inflight_count(rq->hw_context))
+		rq->hw_context->inflight = NULL;
+
+	i915_request_put(rq);
+}
+
+static void __guc_dequeue(struct intel_engine_cs *engine)
 {
 	struct intel_engine_execlists * const execlists = &engine->execlists;
-	struct execlist_port *port = execlists->port;
-	struct i915_request *last = NULL;
-	const struct execlist_port * const last_port =
-		&execlists->port[execlists->port_mask];
+	struct i915_request **first = execlists->inflight;
+	struct i915_request ** const last_port = first + execlists->port_mask;
+	struct i915_request *last = first[0];
+	struct i915_request **port;
 	bool submit = false;
 	struct rb_node *rb;

 	lockdep_assert_held(&engine->active.lock);

-	if (port_isset(port)) {
+	if (last) {
 		if (intel_engine_has_preemption(engine)) {
 			struct guc_preempt_work *preempt_work =
 				&engine->i915->guc.preempt_work[engine->id];
 			int prio = execlists->queue_priority_hint;

-			if (i915_scheduler_need_preempt(prio,
-							port_prio(port))) {
-				execlists_set_active(execlists,
-						     EXECLISTS_ACTIVE_PREEMPT);
+			if (i915_scheduler_need_preempt(prio, rq_prio(last))) {
+				intel_write_status_page(engine,
+							I915_GEM_HWS_PREEMPT,
+							GUC_PREEMPT_INPROGRESS);
 				queue_work(engine->i915->guc.preempt_wq,
 					   &preempt_work->work);
-				return false;
+				return;
 			}
 		}

-		port++;
-		if (port_isset(port))
-			return false;
-	}
-	GEM_BUG_ON(port_isset(port));
+		if (*++first)
+			return;

+		last = NULL;
+	}
+
+	port = first;
 	while ((rb = rb_first_cached(&execlists->queue))) {
 		struct i915_priolist *p = to_priolist(rb);
 		struct i915_request *rq, *rn;
@@ -774,18 +774,15 @@ static bool __guc_dequeue(struct intel_engine_cs *engine)
 				if (port == last_port)
 					goto done;

-				if (submit)
-					port_assign(port, last);
+				*port = schedule_in(last,
+						    port - execlists->inflight);
 				port++;
 			}

 			list_del_init(&rq->sched.link);
-
 			__i915_request_submit(rq);
-			trace_i915_request_in(rq, port_index(port, execlists));
-
-			last = rq;
 			submit = true;
+			last = rq;
 		}

 		rb_erase_cached(&p->node, &execlists->queue);
@@ -794,58 +791,41 @@ static bool __guc_dequeue(struct intel_engine_cs *engine)
 done:
 	execlists->queue_priority_hint =
 		rb ? to_priolist(rb)->priority : INT_MIN;
-	if (submit)
-		port_assign(port, last);
-	if (last)
-		execlists_user_begin(execlists, execlists->port);
-
-	/* We must always keep the beast fed if we have work piled up */
-	GEM_BUG_ON(port_isset(execlists->port) &&
-		   !execlists_is_active(execlists, EXECLISTS_ACTIVE_USER));
-	GEM_BUG_ON(rb_first_cached(&execlists->queue) &&
-		   !port_isset(execlists->port));
-
-	return submit;
-}
-
-static void guc_dequeue(struct intel_engine_cs *engine)
-{
-	if (__guc_dequeue(engine))
-		guc_submit(engine);
+	if (submit) {
+		*port = schedule_in(last, port - execlists->inflight);
+		*++port = NULL;
+		guc_submit(engine, first, port);
+	}
+	execlists->active = execlists->inflight;
 }

 static void guc_submission_tasklet(unsigned long data)
 {
 	struct intel_engine_cs * const engine = (struct intel_engine_cs *)data;
 	struct intel_engine_execlists * const execlists = &engine->execlists;
-	struct execlist_port *port = execlists->port;
-	struct i915_request *rq;
+	struct i915_request **port, *rq;
 	unsigned long flags;

 	spin_lock_irqsave(&engine->active.lock, flags);

-	rq = port_request(port);
-	while (rq && i915_request_completed(rq)) {
-		trace_i915_request_out(rq);
-		i915_request_put(rq);
+	for (port = execlists->inflight; (rq = *port); port++) {
+		if (!i915_request_completed(rq))
+			break;

-		port = execlists_port_complete(execlists, port);
-		if (port_isset(port)) {
-			execlists_user_begin(execlists, port);
-			rq = port_request(port);
-		} else {
-			execlists_user_end(execlists);
-			rq = NULL;
-		}
+		schedule_out(rq);
+	}
+	if (port != execlists->inflight) {
+		int idx = port - execlists->inflight;
+		int rem = ARRAY_SIZE(execlists->inflight) - idx;
+		memmove(execlists->inflight, port, rem * sizeof(*port));
 	}

-	if (execlists_is_active(execlists, EXECLISTS_ACTIVE_PREEMPT) &&
-	    intel_read_status_page(engine, I915_GEM_HWS_PREEMPT) ==
+	if (intel_read_status_page(engine, I915_GEM_HWS_PREEMPT) ==
 	    GUC_PREEMPT_FINISHED)
 		complete_preempt_context(engine);

-	if (!execlists_is_active(execlists, EXECLISTS_ACTIVE_PREEMPT))
-		guc_dequeue(engine);
+	if (!intel_read_status_page(engine, I915_GEM_HWS_PREEMPT))
+		__guc_dequeue(engine);

 	spin_unlock_irqrestore(&engine->active.lock, flags);
 }
@@ -959,7 +939,6 @@ static void guc_cancel_requests(struct intel_engine_cs *engine)

 	execlists->queue_priority_hint = INT_MIN;
 	execlists->queue = RB_ROOT_CACHED;
-	GEM_BUG_ON(port_isset(execlists->port));

 	spin_unlock_irqrestore(&engine->active.lock, flags);
 }
@@ -1422,7 +1401,7 @@ int intel_guc_submission_enable(struct intel_guc *guc)
 	 * and it is guaranteed that it will remove the work item from the
 	 * queue before our request is completed.
 	 */
-	BUILD_BUG_ON(ARRAY_SIZE(engine->execlists.port) *
+	BUILD_BUG_ON(ARRAY_SIZE(engine->execlists.inflight) *
 		     sizeof(struct guc_wq_item) *
 		     I915_NUM_ENGINES > GUC_WQ_SIZE);

--- a/drivers/gpu/drm/i915/selftests/i915_request.c
+++ b/drivers/gpu/drm/i915/selftests/i915_request.c
@@ -366,13 +366,15 @@ static int __igt_breadcrumbs_smoketest(void *arg)

 		if (!wait_event_timeout(wait->wait,
 					i915_sw_fence_done(wait),
-					HZ / 2)) {
+					5 * HZ)) {
 			struct i915_request *rq = requests[count - 1];

-			pr_err("waiting for %d fences (last %llx:%lld) on %s timed out!\n",
-			       count,
+			pr_err("waiting for %d/%d fences (last %llx:%lld) on %s timed out!\n",
+			       atomic_read(&wait->pending), count,
 			       rq->fence.context, rq->fence.seqno,
 			       t->engine->name);
+			GEM_TRACE_DUMP();
+
 			i915_gem_set_wedged(t->engine->i915);
 			GEM_BUG_ON(!i915_request_completed(rq));
 			i915_sw_fence_wait(wait);