From 97f9ca383dca6f4b425fb3c4709405fb8272a15f Mon Sep 17 00:00:00 2001 From: Colin Xu Date: Fri, 11 Sep 2020 14:52:39 +0800 Subject: [PATCH 01/10] drm/i915/gvt: Allow zero out HWSP addr on hws_pga_write Guest driver may reset HWSP to 0 as init value during D3->D0: The full sequence is: - Boot ->D0 - Update HWSP - D0->D3 - ...In D3 state... - D3->D0 - DMLR reset. - Set engine HWSP to 0. - Set engine ring mode to 0. - Set engine HWSP to correct value. - Set engine ring mode to correct value. Ring mode is a masked register, so setting it to 0 won't take effect. However, HWSP addr 0 is considered an invalid GGTT address, which reports errors like: gvt: vgpu 1: write invalid HWSP address, reg:0x2080, value:0x0 gvt: vgpu 1: fail to emulate MMIO write 00002080 len 4 Detected your guest driver doesn't support GVT-g. Now vgpu 2 will enter failsafe mode. Zeroing out the HWSP addr is a valid setting from the device driver, so don't treat it as an invalid HWSP addr. V2: Treat HWSP addr 0 as valid. (zhenyu) V3: Change patch title. 
Reviewed-by: Zhenyu Wang Signed-off-by: Colin Xu Signed-off-by: Zhenyu Wang Link: http://patchwork.freedesktop.org/patch/msgid/20200911065239.147789-1-colin.xu@intel.com --- drivers/gpu/drm/i915/gvt/handlers.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/i915/gvt/handlers.c b/drivers/gpu/drm/i915/gvt/handlers.c index 05f3bc98d242..388982fe3e02 100644 --- a/drivers/gpu/drm/i915/gvt/handlers.c +++ b/drivers/gpu/drm/i915/gvt/handlers.c @@ -1489,7 +1489,8 @@ static int hws_pga_write(struct intel_vgpu *vgpu, unsigned int offset, const struct intel_engine_cs *engine = intel_gvt_render_mmio_to_engine(vgpu->gvt, offset); - if (!intel_gvt_ggtt_validate_range(vgpu, value, I915_GTT_PAGE_SIZE)) { + if (value != 0 && + !intel_gvt_ggtt_validate_range(vgpu, value, I915_GTT_PAGE_SIZE)) { gvt_vgpu_err("write invalid HWSP address, reg:0x%x, value:0x%x\n", offset, value); return -EINVAL; From 8fe105679765700378eb328495fcfe1566cdbbd0 Mon Sep 17 00:00:00 2001 From: Colin Xu Date: Mon, 12 Oct 2020 12:52:31 +0800 Subject: [PATCH 02/10] drm/i915/gvt: Set SNOOP for PAT3 on BXT/APL to workaround GPU BB hang If guest fills non-priv bb on ApolloLake/Broxton as Mesa i965 does in: 717e7539124d (i965: Use a WC map and memcpy for the batch instead of pwrite.) Due to the missing flush of bb filled by VM vCPU, host GPU hangs on executing these MI_BATCH_BUFFER. Temporarily workaround this by setting SNOOP bit for PAT3 used by PPGTT PML4 PTE: PAT(0) PCD(1) PWT(1). The performance is still expected to be low, will need further improvement. 
Acked-by: Zhenyu Wang Signed-off-by: Colin Xu Signed-off-by: Zhenyu Wang Link: http://patchwork.freedesktop.org/patch/msgid/20201012045231.226748-1-colin.xu@intel.com --- drivers/gpu/drm/i915/gvt/handlers.c | 32 ++++++++++++++++++++++++++++- 1 file changed, 31 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/i915/gvt/handlers.c b/drivers/gpu/drm/i915/gvt/handlers.c index 388982fe3e02..beafc5e435b4 100644 --- a/drivers/gpu/drm/i915/gvt/handlers.c +++ b/drivers/gpu/drm/i915/gvt/handlers.c @@ -1651,6 +1651,34 @@ static int edp_psr_imr_iir_write(struct intel_vgpu *vgpu, return 0; } +/** + * FixMe: + * If guest fills non-priv batch buffer on ApolloLake/Broxton as Mesa i965 did: + * 717e7539124d (i965: Use a WC map and memcpy for the batch instead of pwrite.) + * Due to the missing flush of bb filled by VM vCPU, host GPU hangs on executing + * these MI_BATCH_BUFFER. + * Temporarily workaround this by setting SNOOP bit for PAT3 used by PPGTT + * PML4 PTE: PAT(0) PCD(1) PWT(1). + * The performance is still expected to be low, will need further improvement. 
+ */ +static int bxt_ppat_low_write(struct intel_vgpu *vgpu, unsigned int offset, + void *p_data, unsigned int bytes) +{ + u64 pat = + GEN8_PPAT(0, CHV_PPAT_SNOOP) | + GEN8_PPAT(1, 0) | + GEN8_PPAT(2, 0) | + GEN8_PPAT(3, CHV_PPAT_SNOOP) | + GEN8_PPAT(4, CHV_PPAT_SNOOP) | + GEN8_PPAT(5, CHV_PPAT_SNOOP) | + GEN8_PPAT(6, CHV_PPAT_SNOOP) | + GEN8_PPAT(7, CHV_PPAT_SNOOP); + + vgpu_vreg(vgpu, offset) = lower_32_bits(pat); + + return 0; +} + static int guc_status_read(struct intel_vgpu *vgpu, unsigned int offset, void *p_data, unsigned int bytes) @@ -2812,7 +2840,7 @@ static int init_bdw_mmio_info(struct intel_gvt *gvt) MMIO_DH(GEN6_PCODE_MAILBOX, D_BDW_PLUS, NULL, mailbox_write); - MMIO_D(GEN8_PRIVATE_PAT_LO, D_BDW_PLUS); + MMIO_D(GEN8_PRIVATE_PAT_LO, D_BDW_PLUS & ~D_BXT); MMIO_D(GEN8_PRIVATE_PAT_HI, D_BDW_PLUS); MMIO_D(GAMTARBMODE, D_BDW_PLUS); @@ -3316,6 +3344,8 @@ static int init_bxt_mmio_info(struct intel_gvt *gvt) MMIO_DFH(GEN9_CTX_PREEMPT_REG, D_BXT, F_CMD_ACCESS, NULL, NULL); + MMIO_DH(GEN8_PRIVATE_PAT_LO, D_BXT, NULL, bxt_ppat_low_write); + return 0; } From baec997285e63ad3e03d8b8d45e14776cd737f62 Mon Sep 17 00:00:00 2001 From: Colin Xu Date: Fri, 16 Oct 2020 13:40:59 +0800 Subject: [PATCH 03/10] drm/i915/gvt: Only pin/unpin intel_context along with workload An issue is exposed by the commit below: the system freezes at suspend after a vGPU is created (no need to activate the vGPU). commit e6ba76480299 ("drm/i915: Remove i915->kernel_context") The old implementation pins the intel_context at setup_submission and unpins it at clean_submission. So after a vGPU is created, the intel_context stays pinned even though no workload is using it, which then blocks i915 from entering the suspend state. There is no need to keep it pinned all the time; pinning/unpinning it around the workload lifecycle is more reasonable. 
After GVT enabled suspend/resume, the pinned intel_context will also get unpinned when userspace puts the VM process into suspend state: since all workloads are retired by then, it's safe to unpin the intel_context of all created workloads. So move the pin/unpin to create_workload and destroy_workload, while still keeping the create/destroy in the old place. V2: Rebase. Fixes: e6ba76480299 ("drm/i915: Remove i915->kernel_context") Reviewed-by: Zhenyu Wang Signed-off-by: Colin Xu Signed-off-by: Zhenyu Wang Link: http://patchwork.freedesktop.org/patch/msgid/20201016054059.238371-1-colin.xu@intel.com --- drivers/gpu/drm/i915/gvt/scheduler.c | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/drivers/gpu/drm/i915/gvt/scheduler.c b/drivers/gpu/drm/i915/gvt/scheduler.c index 3c3b9842bbbd..68b2d10108fd 100644 --- a/drivers/gpu/drm/i915/gvt/scheduler.c +++ b/drivers/gpu/drm/i915/gvt/scheduler.c @@ -1268,7 +1268,7 @@ void intel_vgpu_clean_submission(struct intel_vgpu *vgpu) i915_context_ppgtt_root_restore(s, i915_vm_to_ppgtt(s->shadow[0]->vm)); for_each_engine(engine, vgpu->gvt->gt, id) - intel_context_unpin(s->shadow[id]); + intel_context_put(s->shadow[id]); kmem_cache_destroy(s->workloads); } @@ -1360,11 +1360,6 @@ int intel_vgpu_setup_submission(struct intel_vgpu *vgpu) ce->ring = __intel_context_ring_size(ring_size); } - ret = intel_context_pin(ce); - intel_context_put(ce); - if (ret) - goto out_shadow_ctx; - s->shadow[i] = ce; } @@ -1396,7 +1391,6 @@ out_shadow_ctx: if (IS_ERR(s->shadow[i])) break; - intel_context_unpin(s->shadow[i]); intel_context_put(s->shadow[i]); } i915_vm_put(&ppgtt->vm); @@ -1470,6 +1464,7 @@ void intel_vgpu_destroy_workload(struct intel_vgpu_workload *workload) { struct intel_vgpu_submission *s = &workload->vgpu->submission; + intel_context_unpin(s->shadow[workload->engine->id]); release_shadow_batch_buffer(workload); release_shadow_wa_ctx(&workload->wa_ctx); @@ -1715,6 +1710,12 @@ intel_vgpu_create_workload(struct intel_vgpu *vgpu, 
return ERR_PTR(ret); } + ret = intel_context_pin(s->shadow[engine->id]); + if (ret) { + intel_vgpu_destroy_workload(workload); + return ERR_PTR(ret); + } + return workload; } From 92010a97098c4c9fd777408cc98064d26b32695b Mon Sep 17 00:00:00 2001 From: Colin Xu Date: Fri, 16 Oct 2020 13:29:13 +0800 Subject: [PATCH 04/10] drm/i915/gvt: Fix mmio handler break on BXT/APL. - Remove dup mmio handler for BXT/APL. Otherwise mmio handler will fail to init. - Add engine GPR with F_CMD_ACCESS since BXT/APL will load them via LRI. Otherwise, guest will enter failsafe mode. V2: Use RCS/BCS GPR macros instead of offset. Revise commit message. V3: Use GEN8_RING_CS_GPR macros on ring base. Reviewed-by: Zhenyu Wang Signed-off-by: Colin Xu Signed-off-by: Zhenyu Wang Link: http://patchwork.freedesktop.org/patch/msgid/20201016052913.209248-1-colin.xu@intel.com --- drivers/gpu/drm/i915/gvt/handlers.c | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/i915/gvt/handlers.c b/drivers/gpu/drm/i915/gvt/handlers.c index 6a026539c873..eb342a759943 100644 --- a/drivers/gpu/drm/i915/gvt/handlers.c +++ b/drivers/gpu/drm/i915/gvt/handlers.c @@ -3168,7 +3168,7 @@ static int init_skl_mmio_info(struct intel_gvt *gvt) NULL, NULL); MMIO_DFH(GAMT_CHKN_BIT_REG, D_KBL | D_CFL, F_CMD_ACCESS, NULL, NULL); - MMIO_D(GEN9_CTX_PREEMPT_REG, D_SKL_PLUS); + MMIO_D(GEN9_CTX_PREEMPT_REG, D_SKL_PLUS & ~D_BXT); return 0; } @@ -3342,6 +3342,16 @@ static int init_bxt_mmio_info(struct intel_gvt *gvt) MMIO_D(GEN8_PUSHBUS_SHIFT, D_BXT); MMIO_D(GEN6_GFXPAUSE, D_BXT); MMIO_DFH(GEN8_L3SQCREG1, D_BXT, F_CMD_ACCESS, NULL, NULL); + MMIO_DFH(GEN8_L3CNTLREG, D_BXT, F_CMD_ACCESS, NULL, NULL); + MMIO_DFH(_MMIO(0x20D8), D_BXT, F_CMD_ACCESS, NULL, NULL); + MMIO_F(GEN8_RING_CS_GPR(RENDER_RING_BASE, 0), 0x40, F_CMD_ACCESS, + 0, 0, D_BXT, NULL, NULL); + MMIO_F(GEN8_RING_CS_GPR(GEN6_BSD_RING_BASE, 0), 0x40, F_CMD_ACCESS, + 0, 0, D_BXT, NULL, NULL); + MMIO_F(GEN8_RING_CS_GPR(BLT_RING_BASE, 0), 
0x40, F_CMD_ACCESS, + 0, 0, D_BXT, NULL, NULL); + MMIO_F(GEN8_RING_CS_GPR(VEBOX_RING_BASE, 0), 0x40, F_CMD_ACCESS, + 0, 0, D_BXT, NULL, NULL); MMIO_DFH(GEN9_CTX_PREEMPT_REG, D_BXT, F_CMD_ACCESS, NULL, NULL); From 59dd13ad310793757e34afa489dd6fc8544fc3da Mon Sep 17 00:00:00 2001 From: Chris Wilson Date: Mon, 19 Oct 2020 21:38:25 +0100 Subject: [PATCH 05/10] drm/i915/gem: Flush coherency domains on first set-domain-ioctl MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Avoid skipping what appears to be a no-op set-domain-ioctl if the cache coherency state is inconsistent with our target domain. This also has the utility of using the population of the pages to validate the backing store. The danger in skipping the first set-domain is leaving the cache inconsistent and submitting stale data, or worse leaving the clean data in the cache and not flushing it to the GPU. The impact should be small as it requires a no-op set-domain as the very first ioctl in a particular sequence not found in typical userspace. 
Reported-by: Zbigniew Kempczyński Fixes: 754a25442705 ("drm/i915: Skip object locking around a no-op set-domain ioctl") Testcase: igt/gem_mmap_offset/blt-coherency Signed-off-by: Chris Wilson Cc: Joonas Lahtinen Cc: Matthew Auld Cc: Zbigniew Kempczyński Cc: # v5.2+ Reviewed-by: Matthew Auld Link: https://patchwork.freedesktop.org/patch/msgid/20201019203825.10966-1-chris@chris-wilson.co.uk (cherry picked from commit 44c2200afcd59f441b43f27829b4003397cc495d) Signed-off-by: Rodrigo Vivi --- drivers/gpu/drm/i915/gem/i915_gem_domain.c | 28 ++++++++++------------ 1 file changed, 13 insertions(+), 15 deletions(-) diff --git a/drivers/gpu/drm/i915/gem/i915_gem_domain.c b/drivers/gpu/drm/i915/gem/i915_gem_domain.c index 7c90a63c273d..fcce6909f201 100644 --- a/drivers/gpu/drm/i915/gem/i915_gem_domain.c +++ b/drivers/gpu/drm/i915/gem/i915_gem_domain.c @@ -508,21 +508,6 @@ i915_gem_set_domain_ioctl(struct drm_device *dev, void *data, if (!obj) return -ENOENT; - /* - * Already in the desired write domain? Nothing for us to do! - * - * We apply a little bit of cunning here to catch a broader set of - * no-ops. If obj->write_domain is set, we must be in the same - * obj->read_domains, and only that domain. Therefore, if that - * obj->write_domain matches the request read_domains, we are - * already in the same read/write domain and can skip the operation, - * without having to further check the requested write_domain. - */ - if (READ_ONCE(obj->write_domain) == read_domains) { - err = 0; - goto out; - } - /* * Try to flush the object off the GPU without holding the lock. * We will repeat the flush holding the lock in the normal manner @@ -560,6 +545,19 @@ i915_gem_set_domain_ioctl(struct drm_device *dev, void *data, if (err) goto out; + /* + * Already in the desired write domain? Nothing for us to do! + * + * We apply a little bit of cunning here to catch a broader set of + * no-ops. If obj->write_domain is set, we must be in the same + * obj->read_domains, and only that domain. 
Therefore, if that + * obj->write_domain matches the request read_domains, we are + * already in the same read/write domain and can skip the operation, + * without having to further check the requested write_domain. + */ + if (READ_ONCE(obj->write_domain) == read_domains) + goto out_unpin; + err = i915_gem_object_lock_interruptible(obj, NULL); if (err) goto out_unpin; From 8ce70996f759a37bac92e69ae0addd715227bfd1 Mon Sep 17 00:00:00 2001 From: Chris Wilson Date: Thu, 22 Oct 2020 07:41:27 +0100 Subject: [PATCH 06/10] drm/i915/gt: Use the local HWSP offset during submission We wrap the timeline on construction of the next request, but there may still be requests in flight that have not yet finalized the breadcrumb. (The breadcrumb is delayed as we need engine-local offsets, and for the virtual engine that is not known until execution.) As such, by the time we write to the timeline's HWSP offset it may have changed, and we should use the value we preserved in the request instead. Though the window is small and infrequent (at full flow we can expect a timeline's seqno to wrap once every 30 minutes), the impact of writing the old seqno into the new HWSP is severe: the old requests are never completed, and the new requests are completed before they are even submitted. 
Fixes: ebece7539242 ("drm/i915: Keep timeline HWSP allocated until idle across the system") Signed-off-by: Chris Wilson Cc: Tvrtko Ursulin Cc: Joonas Lahtinen Cc: # v5.2+ Reviewed-by: Mika Kuoppala Link: https://patchwork.freedesktop.org/patch/msgid/20201022064127.10159-1-chris@chris-wilson.co.uk (cherry picked from commit c10f6019d0b2dc8a6a62b55459f3ada5bc4e5e1a) Signed-off-by: Rodrigo Vivi --- drivers/gpu/drm/i915/gt/intel_lrc.c | 27 +++++++++++++------ drivers/gpu/drm/i915/gt/intel_timeline.c | 18 +++++++------ .../gpu/drm/i915/gt/intel_timeline_types.h | 2 ++ 3 files changed, 31 insertions(+), 16 deletions(-) diff --git a/drivers/gpu/drm/i915/gt/intel_lrc.c b/drivers/gpu/drm/i915/gt/intel_lrc.c index a32aabce7901..d12861cf0753 100644 --- a/drivers/gpu/drm/i915/gt/intel_lrc.c +++ b/drivers/gpu/drm/i915/gt/intel_lrc.c @@ -3547,6 +3547,19 @@ static const struct intel_context_ops execlists_context_ops = { .destroy = execlists_context_destroy, }; +static u32 hwsp_offset(const struct i915_request *rq) +{ + const struct intel_timeline_cacheline *cl; + + /* Before the request is executed, the timeline/cachline is fixed */ + + cl = rcu_dereference_protected(rq->hwsp_cacheline, 1); + if (cl) + return cl->ggtt_offset; + + return rcu_dereference_protected(rq->timeline, 1)->hwsp_offset; +} + static int gen8_emit_init_breadcrumb(struct i915_request *rq) { u32 *cs; @@ -3569,7 +3582,7 @@ static int gen8_emit_init_breadcrumb(struct i915_request *rq) *cs++ = MI_NOOP; *cs++ = MI_STORE_DWORD_IMM_GEN4 | MI_USE_GGTT; - *cs++ = i915_request_timeline(rq)->hwsp_offset; + *cs++ = hwsp_offset(rq); *cs++ = 0; *cs++ = rq->fence.seqno - 1; @@ -4886,11 +4899,9 @@ gen8_emit_fini_breadcrumb_tail(struct i915_request *request, u32 *cs) return gen8_emit_wa_tail(request, cs); } -static u32 *emit_xcs_breadcrumb(struct i915_request *request, u32 *cs) +static u32 *emit_xcs_breadcrumb(struct i915_request *rq, u32 *cs) { - u32 addr = i915_request_active_timeline(request)->hwsp_offset; - - return 
gen8_emit_ggtt_write(cs, request->fence.seqno, addr, 0); + return gen8_emit_ggtt_write(cs, rq->fence.seqno, hwsp_offset(rq), 0); } static u32 *gen8_emit_fini_breadcrumb(struct i915_request *rq, u32 *cs) @@ -4909,7 +4920,7 @@ static u32 *gen8_emit_fini_breadcrumb_rcs(struct i915_request *request, u32 *cs) /* XXX flush+write+CS_STALL all in one upsets gem_concurrent_blt:kbl */ cs = gen8_emit_ggtt_write_rcs(cs, request->fence.seqno, - i915_request_active_timeline(request)->hwsp_offset, + hwsp_offset(request), PIPE_CONTROL_FLUSH_ENABLE | PIPE_CONTROL_CS_STALL); @@ -4921,7 +4932,7 @@ gen11_emit_fini_breadcrumb_rcs(struct i915_request *request, u32 *cs) { cs = gen8_emit_ggtt_write_rcs(cs, request->fence.seqno, - i915_request_active_timeline(request)->hwsp_offset, + hwsp_offset(request), PIPE_CONTROL_CS_STALL | PIPE_CONTROL_TILE_CACHE_FLUSH | PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH | @@ -4991,7 +5002,7 @@ gen12_emit_fini_breadcrumb_rcs(struct i915_request *request, u32 *cs) { cs = gen12_emit_ggtt_write_rcs(cs, request->fence.seqno, - i915_request_active_timeline(request)->hwsp_offset, + hwsp_offset(request), PIPE_CONTROL0_HDC_PIPELINE_FLUSH, PIPE_CONTROL_CS_STALL | PIPE_CONTROL_TILE_CACHE_FLUSH | diff --git a/drivers/gpu/drm/i915/gt/intel_timeline.c b/drivers/gpu/drm/i915/gt/intel_timeline.c index a2f74cefe4c3..7ea94d201fe6 100644 --- a/drivers/gpu/drm/i915/gt/intel_timeline.c +++ b/drivers/gpu/drm/i915/gt/intel_timeline.c @@ -188,10 +188,14 @@ cacheline_alloc(struct intel_timeline_hwsp *hwsp, unsigned int cacheline) return cl; } -static void cacheline_acquire(struct intel_timeline_cacheline *cl) +static void cacheline_acquire(struct intel_timeline_cacheline *cl, + u32 ggtt_offset) { - if (cl) - i915_active_acquire(&cl->active); + if (!cl) + return; + + cl->ggtt_offset = ggtt_offset; + i915_active_acquire(&cl->active); } static void cacheline_release(struct intel_timeline_cacheline *cl) @@ -340,7 +344,7 @@ int intel_timeline_pin(struct intel_timeline *tl, struct 
i915_gem_ww_ctx *ww) GT_TRACE(tl->gt, "timeline:%llx using HWSP offset:%x\n", tl->fence_context, tl->hwsp_offset); - cacheline_acquire(tl->hwsp_cacheline); + cacheline_acquire(tl->hwsp_cacheline, tl->hwsp_offset); if (atomic_fetch_inc(&tl->pin_count)) { cacheline_release(tl->hwsp_cacheline); __i915_vma_unpin(tl->hwsp_ggtt); @@ -515,7 +519,7 @@ __intel_timeline_get_seqno(struct intel_timeline *tl, GT_TRACE(tl->gt, "timeline:%llx using HWSP offset:%x\n", tl->fence_context, tl->hwsp_offset); - cacheline_acquire(cl); + cacheline_acquire(cl, tl->hwsp_offset); tl->hwsp_cacheline = cl; *seqno = timeline_advance(tl); @@ -573,9 +577,7 @@ int intel_timeline_read_hwsp(struct i915_request *from, if (err) goto out; - *hwsp = i915_ggtt_offset(cl->hwsp->vma) + - ptr_unmask_bits(cl->vaddr, CACHELINE_BITS) * CACHELINE_BYTES; - + *hwsp = cl->ggtt_offset; out: i915_active_release(&cl->active); return err; diff --git a/drivers/gpu/drm/i915/gt/intel_timeline_types.h b/drivers/gpu/drm/i915/gt/intel_timeline_types.h index 02181c5020db..4474f487f589 100644 --- a/drivers/gpu/drm/i915/gt/intel_timeline_types.h +++ b/drivers/gpu/drm/i915/gt/intel_timeline_types.h @@ -94,6 +94,8 @@ struct intel_timeline_cacheline { struct intel_timeline_hwsp *hwsp; void *vaddr; + u32 ggtt_offset; + struct rcu_head rcu; }; From d9a57c853975742c8281f703b9e536d8aa016ec2 Mon Sep 17 00:00:00 2001 From: Imre Deak Date: Tue, 27 Oct 2020 18:09:28 +0200 Subject: [PATCH 07/10] drm/i915: Fix encoder lookup during PSR atomic check The atomic check hooks must look up the encoder to be used with a connector from the connector's atomic state, and not assume that it's the connector's current attached encoder. The latter one can change under the atomic check func, or can be unset yet as in the case of MST connectors. This fixes [ 7.940719] Oops: 0000 [#1] SMP NOPTI [ 7.944407] CPU: 2 PID: 143 Comm: kworker/2:2 Not tainted 5.6.0-1023-oem #23-Ubuntu [ 7.952102] Hardware name: Dell Inc. 
Latitude 7320/, BIOS 88.87.11 09/07/2020 [ 7.959278] Workqueue: events output_poll_execute [drm_kms_helper] [ 7.965511] RIP: 0010:intel_psr_atomic_check+0x37/0xa0 [i915] [ 7.971327] Code: 80 2d 06 00 00 20 74 42 80 b8 34 71 00 00 00 74 39 48 8b 72 08 48 85 f6 74 30 80 b8 f8 71 00 00 00 74 27 4c 8b 87 80 04 00 00 <41> 8b 78 78 83 ff 08 77 19 31 c9 83 ff 05 77 19 48 81 c1 20 01 00 [ 7.977541] input: PS/2 Generic Mouse as /devices/platform/i8042/serio1/input/input5 [ 7.990154] RSP: 0018:ffffb864c073fac8 EFLAGS: 00010202 [ 7.990155] RAX: ffff8c5d55ce0000 RBX: ffff8c5d54519000 RCX: 0000000000000000 [ 7.990155] RDX: ffff8c5d55cb30c0 RSI: ffff8c5d89a0c800 RDI: ffff8c5d55fcf800 [ 7.990156] RBP: ffffb864c073fac8 R08: 0000000000000000 R09: ffff8c5d55d9f3a0 [ 7.990156] R10: ffff8c5d55cb30c0 R11: 0000000000000009 R12: ffff8c5d55fcf800 [ 7.990156] R13: ffff8c5d55cb30c0 R14: ffff8c5d56989cc0 R15: ffff8c5d56989cc0 [ 7.990158] FS: 0000000000000000(0000) GS:ffff8c5d8e480000(0000) knlGS:0000000000000000 [ 8.047193] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 8.052970] CR2: 0000000000000078 CR3: 0000000856500005 CR4: 0000000000760ee0 [ 8.060137] PKRU: 55555554 [ 8.062867] Call Trace: [ 8.065361] intel_digital_connector_atomic_check+0x53/0x130 [i915] [ 8.071703] intel_dp_mst_atomic_check+0x5b/0x200 [i915] [ 8.077074] drm_atomic_helper_check_modeset+0x1db/0x790 [drm_kms_helper] [ 8.083942] intel_atomic_check+0x92/0xc50 [i915] [ 8.088705] ? drm_plane_check_pixel_format+0x4f/0xb0 [drm] [ 8.094345] ? 
drm_atomic_plane_check+0x7a/0x3a0 [drm] [ 8.099548] drm_atomic_check_only+0x2b1/0x450 [drm] [ 8.104573] drm_atomic_commit+0x18/0x50 [drm] [ 8.109070] drm_client_modeset_commit_atomic+0x1c9/0x200 [drm] [ 8.115056] drm_client_modeset_commit_force+0x55/0x160 [drm] [ 8.120866] drm_fb_helper_restore_fbdev_mode_unlocked+0x54/0xb0 [drm_kms_helper] [ 8.128415] drm_fb_helper_set_par+0x34/0x50 [drm_kms_helper] [ 8.134225] drm_fb_helper_hotplug_event.part.0+0xb4/0xe0 [drm_kms_helper] [ 8.141150] drm_fb_helper_hotplug_event+0x1c/0x30 [drm_kms_helper] [ 8.147481] intel_fbdev_output_poll_changed+0x6f/0xa0 [i915] [ 8.153287] drm_kms_helper_hotplug_event+0x2c/0x40 [drm_kms_helper] [ 8.159709] output_poll_execute+0x1aa/0x1c0 [drm_kms_helper] [ 8.165506] process_one_work+0x1e8/0x3b0 [ 8.169561] worker_thread+0x4d/0x400 [ 8.173249] kthread+0x104/0x140 [ 8.176515] ? process_one_work+0x3b0/0x3b0 [ 8.180726] ? kthread_park+0x90/0x90 [ 8.184416] ret_from_fork+0x1f/0x40 Closes: https://gitlab.freedesktop.org/drm/intel/-/issues/2361 References: https://gitlab.freedesktop.org/drm/intel/-/issues/2486 Reported-by: William Tseng Reported-by: Cooper Chiou Cc: Signed-off-by: Imre Deak Reviewed-by: Anshuman Gupta Link: https://patchwork.freedesktop.org/patch/msgid/20201027160928.3665377-1-imre.deak@intel.com (cherry picked from commit 00e5deb5c4f5fe367311465e720e65cfa1178792) Signed-off-by: Rodrigo Vivi --- drivers/gpu/drm/i915/display/intel_psr.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/i915/display/intel_psr.c b/drivers/gpu/drm/i915/display/intel_psr.c index 8a9d0bdde1bf..40e9cb29233d 100644 --- a/drivers/gpu/drm/i915/display/intel_psr.c +++ b/drivers/gpu/drm/i915/display/intel_psr.c @@ -1754,7 +1754,7 @@ void intel_psr_atomic_check(struct drm_connector *connector, return; intel_connector = to_intel_connector(connector); - dig_port = enc_to_dig_port(intel_attached_encoder(intel_connector)); + dig_port = 
enc_to_dig_port(to_intel_encoder(new_state->best_encoder)); if (dev_priv->psr.dp != &dig_port->dp) return; From 306bb61d6bb3531b0d05429a771ac13a75aa30c8 Mon Sep 17 00:00:00 2001 From: Chris Wilson Date: Mon, 2 Nov 2020 22:10:56 +0000 Subject: [PATCH 08/10] drm/i915/gt: Expose more parameters for emitting writes into the ring Add another lower level to emit_ggtt_write so that the GGTT nature of the write is not hardcoded into the emitter. Signed-off-by: Chris Wilson Reviewed-by: Mika Kuoppala Link: https://patchwork.freedesktop.org/patch/msgid/20201102221057.29626-1-chris@chris-wilson.co.uk (cherry picked from commit 2739d8cfc50aafff49d599cc0a5bc855445e99a7) Signed-off-by: Rodrigo Vivi --- drivers/gpu/drm/i915/gt/intel_engine.h | 55 ++++++++++++++++---------- 1 file changed, 35 insertions(+), 20 deletions(-) diff --git a/drivers/gpu/drm/i915/gt/intel_engine.h b/drivers/gpu/drm/i915/gt/intel_engine.h index 7c3a1012e702..760fefdfe392 100644 --- a/drivers/gpu/drm/i915/gt/intel_engine.h +++ b/drivers/gpu/drm/i915/gt/intel_engine.h @@ -245,22 +245,14 @@ static inline u32 *gen12_emit_pipe_control(u32 *batch, u32 flags0, u32 flags1, u } static inline u32 * -__gen8_emit_ggtt_write_rcs(u32 *cs, u32 value, u32 gtt_offset, u32 flags0, u32 flags1) +__gen8_emit_write_rcs(u32 *cs, u32 value, u32 offset, u32 flags0, u32 flags1) { - /* We're using qword write, offset should be aligned to 8 bytes. */ - GEM_BUG_ON(!IS_ALIGNED(gtt_offset, 8)); - - /* w/a for post sync ops following a GPGPU operation we - * need a prior CS_STALL, which is emitted by the flush - * following the batch. - */ *cs++ = GFX_OP_PIPE_CONTROL(6) | flags0; - *cs++ = flags1 | PIPE_CONTROL_QW_WRITE | PIPE_CONTROL_GLOBAL_GTT_IVB; - *cs++ = gtt_offset; + *cs++ = flags1 | PIPE_CONTROL_QW_WRITE; + *cs++ = offset; *cs++ = 0; *cs++ = value; - /* We're thrashing one dword of HWS. */ - *cs++ = 0; + *cs++ = 0; /* We're thrashing one extra dword. 
*/ return cs; } @@ -268,13 +260,38 @@ __gen8_emit_ggtt_write_rcs(u32 *cs, u32 value, u32 gtt_offset, u32 flags0, u32 f static inline u32* gen8_emit_ggtt_write_rcs(u32 *cs, u32 value, u32 gtt_offset, u32 flags) { - return __gen8_emit_ggtt_write_rcs(cs, value, gtt_offset, 0, flags); + /* We're using qword write, offset should be aligned to 8 bytes. */ + GEM_BUG_ON(!IS_ALIGNED(gtt_offset, 8)); + + return __gen8_emit_write_rcs(cs, + value, + gtt_offset, + 0, + flags | PIPE_CONTROL_GLOBAL_GTT_IVB); } static inline u32* gen12_emit_ggtt_write_rcs(u32 *cs, u32 value, u32 gtt_offset, u32 flags0, u32 flags1) { - return __gen8_emit_ggtt_write_rcs(cs, value, gtt_offset, flags0, flags1); + /* We're using qword write, offset should be aligned to 8 bytes. */ + GEM_BUG_ON(!IS_ALIGNED(gtt_offset, 8)); + + return __gen8_emit_write_rcs(cs, + value, + gtt_offset, + flags0, + flags1 | PIPE_CONTROL_GLOBAL_GTT_IVB); +} + +static inline u32 * +__gen8_emit_flush_dw(u32 *cs, u32 value, u32 gtt_offset, u32 flags) +{ + *cs++ = (MI_FLUSH_DW + 1) | flags; + *cs++ = gtt_offset; + *cs++ = 0; + *cs++ = value; + + return cs; } static inline u32 * @@ -285,12 +302,10 @@ gen8_emit_ggtt_write(u32 *cs, u32 value, u32 gtt_offset, u32 flags) /* Offset should be aligned to 8 bytes for both (QW/DW) write types */ GEM_BUG_ON(!IS_ALIGNED(gtt_offset, 8)); - *cs++ = (MI_FLUSH_DW + 1) | MI_FLUSH_DW_OP_STOREDW | flags; - *cs++ = gtt_offset | MI_FLUSH_DW_USE_GTT; - *cs++ = 0; - *cs++ = value; - - return cs; + return __gen8_emit_flush_dw(cs, + value, + gtt_offset | MI_FLUSH_DW_USE_GTT, + flags | MI_FLUSH_DW_OP_STOREDW); } static inline void __intel_engine_reset(struct intel_engine_cs *engine, From e67d01d8494640018b08cd767aeb2824a8e11983 Mon Sep 17 00:00:00 2001 From: Chris Wilson Date: Mon, 2 Nov 2020 22:10:57 +0000 Subject: [PATCH 09/10] drm/i915/gt: Flush xcs before tgl breadcrumbs In a simple test case that writes to scratch and then busy-waits for the batch to be signaled, we observe that the signal is before 
the write is posted. That is bad news. Splitting the flush + write_dword into two separate flush_dw prevents the issue from being reproduced, we can presume the post-sync op is not so post-sync. Closes: https://gitlab.freedesktop.org/drm/intel/-/issues/216 Testcase: igt/gem_exec_fence/parallel Testcase: igt/i915_selftest/live/gt_timelines Signed-off-by: Chris Wilson Cc: Mika Kuoppala Cc: stable@vger.kernel.org Acked-by: Mika Kuoppala Link: https://patchwork.freedesktop.org/patch/msgid/20201102221057.29626-2-chris@chris-wilson.co.uk (cherry picked from commit 09212e81e5450743e5b06b27c4e344e4c45b630d) Signed-off-by: Rodrigo Vivi --- drivers/gpu/drm/i915/gt/intel_lrc.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/i915/gt/intel_lrc.c b/drivers/gpu/drm/i915/gt/intel_lrc.c index d12861cf0753..f82c6dd1de18 100644 --- a/drivers/gpu/drm/i915/gt/intel_lrc.c +++ b/drivers/gpu/drm/i915/gt/intel_lrc.c @@ -4994,7 +4994,9 @@ gen12_emit_fini_breadcrumb_tail(struct i915_request *request, u32 *cs) static u32 *gen12_emit_fini_breadcrumb(struct i915_request *rq, u32 *cs) { - return gen12_emit_fini_breadcrumb_tail(rq, emit_xcs_breadcrumb(rq, cs)); + /* XXX Stalling flush before seqno write; post-sync not */ + cs = emit_xcs_breadcrumb(rq, __gen8_emit_flush_dw(cs, 0, 0, 0)); + return gen12_emit_fini_breadcrumb_tail(rq, cs); } static u32 * From 537457a979a02a410b555fab289dcb28b588f33b Mon Sep 17 00:00:00 2001 From: Chris Wilson Date: Mon, 2 Nov 2020 16:19:31 +0000 Subject: [PATCH 10/10] drm/i915: Hold onto an explicit ref to i915_vma_work.pinned Since __vma_release is run by a kworker after the fence has been signaled, it is no longer protected by the active reference on the vma, and so the alias of vw->pinned to vma->obj is also not protected by a reference on the object. Add an explicit reference for vw->pinned so it will always be safe. Found by inspection. 
Fixes: 54d7195f8c64 ("drm/i915: Unpin vma->obj on early error") Reported-by: Tvrtko Ursulin Signed-off-by: Chris Wilson Cc: Tvrtko Ursulin Cc: # v5.6+ Reviewed-by: Tvrtko Ursulin Link: https://patchwork.freedesktop.org/patch/msgid/20201102161931.30031-1-chris@chris-wilson.co.uk (cherry picked from commit bc73e5d33048b7ab5f12b11b5d923700467a8e1d) Signed-off-by: Rodrigo Vivi --- drivers/gpu/drm/i915/i915_vma.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/i915/i915_vma.c b/drivers/gpu/drm/i915/i915_vma.c index ffb5287e055a..caa9b041616b 100644 --- a/drivers/gpu/drm/i915/i915_vma.c +++ b/drivers/gpu/drm/i915/i915_vma.c @@ -314,8 +314,10 @@ static void __vma_release(struct dma_fence_work *work) { struct i915_vma_work *vw = container_of(work, typeof(*vw), base); - if (vw->pinned) + if (vw->pinned) { __i915_gem_object_unpin_pages(vw->pinned); + i915_gem_object_put(vw->pinned); + } i915_vm_free_pt_stash(vw->vm, &vw->stash); i915_vm_put(vw->vm); @@ -431,7 +433,7 @@ int i915_vma_bind(struct i915_vma *vma, if (vma->obj) { __i915_gem_object_pin_pages(vma->obj); - work->pinned = vma->obj; + work->pinned = i915_gem_object_get(vma->obj); } } else { vma->ops->bind_vma(vma->vm, NULL, vma, cache_level, bind_flags);