diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c
index 6530f5d2b35b..8614ed249bb4 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c
@@ -32,6 +32,7 @@
 #include "amdgpu.h"
 #include "amdgpu_gmc.h"
 #include "amdgpu_ras.h"
+#include "amdgpu_reset.h"
 #include "amdgpu_xgmi.h"
 
 #include <drm/drm_drv.h>
@@ -630,6 +631,65 @@ error_alloc:
 	dev_err(adev->dev, "Error flushing GPU TLB using the SDMA (%d)!\n", r);
 }
 
+int amdgpu_gmc_flush_gpu_tlb_pasid(struct amdgpu_device *adev, uint16_t pasid,
+				   uint32_t flush_type, bool all_hub,
+				   uint32_t inst)
+{
+	u32 usec_timeout = amdgpu_sriov_vf(adev) ? SRIOV_USEC_TIMEOUT :
+		adev->usec_timeout;
+	struct amdgpu_ring *ring = &adev->gfx.kiq[inst].ring;
+	struct amdgpu_kiq *kiq = &adev->gfx.kiq[inst];
+	unsigned int ndw;
+	signed long r;
+	uint32_t seq;
+
+	if (!adev->gmc.flush_pasid_uses_kiq || !ring->sched.ready ||
+	    !down_read_trylock(&adev->reset_domain->sem)) {
+		return adev->gmc.gmc_funcs->flush_gpu_tlb_pasid(adev, pasid,
+								flush_type,
+								all_hub, inst);
+	}
+
+	/* 2 dwords flush + 8 dwords fence */
+	ndw = kiq->pmf->invalidate_tlbs_size + 8;
+
+	if (adev->gmc.flush_tlb_needs_extra_type_2)
+		ndw += kiq->pmf->invalidate_tlbs_size;
+
+	if (adev->gmc.flush_tlb_needs_extra_type_0)
+		ndw += kiq->pmf->invalidate_tlbs_size;
+
+	spin_lock(&adev->gfx.kiq[inst].ring_lock);
+	amdgpu_ring_alloc(ring, ndw);
+	if (adev->gmc.flush_tlb_needs_extra_type_2)
+		kiq->pmf->kiq_invalidate_tlbs(ring, pasid, 2, all_hub);
+
+	if (flush_type == 2 && adev->gmc.flush_tlb_needs_extra_type_0)
+		kiq->pmf->kiq_invalidate_tlbs(ring, pasid, 0, all_hub);
+
+	kiq->pmf->kiq_invalidate_tlbs(ring, pasid, flush_type, all_hub);
+	r = amdgpu_fence_emit_polling(ring, &seq, MAX_KIQ_REG_WAIT);
+	if (r) {
+		amdgpu_ring_undo(ring);
+		spin_unlock(&adev->gfx.kiq[inst].ring_lock);
+		goto error_unlock_reset;
+	}
+
+	amdgpu_ring_commit(ring);
+	spin_unlock(&adev->gfx.kiq[inst].ring_lock);
+	r = amdgpu_fence_wait_polling(ring, seq, usec_timeout);
+	if (r < 1) {
+		dev_err(adev->dev, "wait for kiq fence error: %ld.\n", r);
+		r = -ETIME;
+		goto error_unlock_reset;
+	}
+	r = 0;
+
+error_unlock_reset:
+	up_read(&adev->reset_domain->sem);
+	return r;
+}
+
 /**
  * amdgpu_gmc_tmz_set -- check and set if a device supports TMZ
  * @adev: amdgpu_device pointer
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h
index 9e7df2f69123..7732d4ef845e 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h
@@ -335,11 +335,12 @@ struct amdgpu_gmc {
 	u64 MC_VM_MX_L1_TLB_CNTL;
 
 	u64 noretry_flags;
+
+	bool flush_tlb_needs_extra_type_0;
+	bool flush_tlb_needs_extra_type_2;
+	bool flush_pasid_uses_kiq;
 };
 
-#define amdgpu_gmc_flush_gpu_tlb_pasid(adev, pasid, type, allhub, inst) \
-	((adev)->gmc.gmc_funcs->flush_gpu_tlb_pasid \
-	((adev), (pasid), (type), (allhub), (inst)))
 #define amdgpu_gmc_emit_flush_gpu_tlb(r, vmid, addr) (r)->adev->gmc.gmc_funcs->emit_flush_gpu_tlb((r), (vmid), (addr))
 #define amdgpu_gmc_emit_pasid_mapping(r, vmid, pasid) (r)->adev->gmc.gmc_funcs->emit_pasid_mapping((r), (vmid), (pasid))
 #define amdgpu_gmc_map_mtype(adev, flags) (adev)->gmc.gmc_funcs->map_mtype((adev),(flags))
@@ -404,6 +405,9 @@ void amdgpu_gmc_ras_fini(struct amdgpu_device *adev);
 int amdgpu_gmc_allocate_vm_inv_eng(struct amdgpu_device *adev);
 void amdgpu_gmc_flush_gpu_tlb(struct amdgpu_device *adev, uint32_t vmid,
 			      uint32_t vmhub, uint32_t flush_type);
+int amdgpu_gmc_flush_gpu_tlb_pasid(struct amdgpu_device *adev, uint16_t pasid,
+				   uint32_t flush_type, bool all_hub,
+				   uint32_t inst);
 extern void amdgpu_gmc_tmz_set(struct amdgpu_device *adev);
 extern void amdgpu_gmc_noretry_set(struct amdgpu_device *adev);
 
diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
index f7d0545598bd..95a60f6cd359 100644
--- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
@@ -956,89 +956,30 @@ static int gmc_v9_0_flush_gpu_tlb_pasid(struct amdgpu_device *adev,
 					uint16_t pasid, uint32_t flush_type,
 					bool all_hub, uint32_t inst)
 {
-	int vmid, i;
-	signed long r;
-	uint32_t seq;
-	uint16_t queried_pasid;
-	bool ret;
-	u32 usec_timeout = amdgpu_sriov_vf(adev) ? SRIOV_USEC_TIMEOUT : adev->usec_timeout;
-	struct amdgpu_ring *ring = &adev->gfx.kiq[inst].ring;
-	struct amdgpu_kiq *kiq = &adev->gfx.kiq[inst];
-
-	if (amdgpu_in_reset(adev))
-		return -EIO;
-
-	if (ring->sched.ready && down_read_trylock(&adev->reset_domain->sem)) {
-		/* Vega20+XGMI caches PTEs in TC and TLB. Add a
-		 * heavy-weight TLB flush (type 2), which flushes
-		 * both. Due to a race condition with concurrent
-		 * memory accesses using the same TLB cache line, we
-		 * still need a second TLB flush after this.
-		 */
-		bool vega20_xgmi_wa = (adev->gmc.xgmi.num_physical_nodes &&
-				       amdgpu_ip_version(adev, GC_HWIP, 0) ==
-					IP_VERSION(9, 4, 0));
-		/* 2 dwords flush + 8 dwords fence */
-		unsigned int ndw = kiq->pmf->invalidate_tlbs_size + 8;
-
-		if (vega20_xgmi_wa)
-			ndw += kiq->pmf->invalidate_tlbs_size;
-
-		spin_lock(&adev->gfx.kiq[inst].ring_lock);
-		/* 2 dwords flush + 8 dwords fence */
-		amdgpu_ring_alloc(ring, ndw);
-		if (vega20_xgmi_wa)
-			kiq->pmf->kiq_invalidate_tlbs(ring,
-						      pasid, 2, all_hub);
-
-		if (flush_type == 2 &&
-		    amdgpu_ip_version(adev, GC_HWIP, 0) ==
-			IP_VERSION(9, 4, 3) &&
-		    adev->rev_id == 0)
-			kiq->pmf->kiq_invalidate_tlbs(ring,
-						      pasid, 0, all_hub);
-
-		kiq->pmf->kiq_invalidate_tlbs(ring,
-					      pasid, flush_type, all_hub);
-		r = amdgpu_fence_emit_polling(ring, &seq, MAX_KIQ_REG_WAIT);
-		if (r) {
-			amdgpu_ring_undo(ring);
-			spin_unlock(&adev->gfx.kiq[inst].ring_lock);
-			up_read(&adev->reset_domain->sem);
-			return -ETIME;
-		}
-
-		amdgpu_ring_commit(ring);
-		spin_unlock(&adev->gfx.kiq[inst].ring_lock);
-		r = amdgpu_fence_wait_polling(ring, seq, usec_timeout);
-		if (r < 1) {
-			dev_err(adev->dev, "wait for kiq fence error: %ld.\n", r);
-			up_read(&adev->reset_domain->sem);
-			return -ETIME;
-		}
-		up_read(&adev->reset_domain->sem);
-		return 0;
-	}
+	uint16_t queried;
+	int i, vmid;
 
 	for (vmid = 1; vmid < 16; vmid++) {
+		bool valid;
 
-		ret = gmc_v9_0_get_atc_vmid_pasid_mapping_info(adev, vmid,
-				&queried_pasid);
-		if (ret && queried_pasid == pasid) {
-			if (all_hub) {
-				for_each_set_bit(i, adev->vmhubs_mask, AMDGPU_MAX_VMHUBS)
-					gmc_v9_0_flush_gpu_tlb(adev, vmid,
-							i, flush_type);
-			} else {
-				gmc_v9_0_flush_gpu_tlb(adev, vmid,
-						AMDGPU_GFXHUB(0), flush_type);
-			}
-			break;
+		valid = gmc_v9_0_get_atc_vmid_pasid_mapping_info(adev, vmid,
+								 &queried);
+		if (!valid || queried != pasid)
+			continue;
+
+		if (all_hub) {
+			for_each_set_bit(i, adev->vmhubs_mask,
+					 AMDGPU_MAX_VMHUBS)
+				gmc_v9_0_flush_gpu_tlb(adev, vmid, i,
+						       flush_type);
+		} else {
+			gmc_v9_0_flush_gpu_tlb(adev, vmid,
+					       AMDGPU_GFXHUB(0),
+					       flush_type);
 		}
 	}
 
 	return 0;
-
 }
 
 static uint64_t gmc_v9_0_emit_flush_gpu_tlb(struct amdgpu_ring *ring,
@@ -2362,6 +2303,24 @@ static int gmc_v9_0_hw_init(void *handle)
 	bool value;
 	int i, r;
 
+	adev->gmc.flush_pasid_uses_kiq = true;
+
+	/* Vega20+XGMI caches PTEs in TC and TLB. Add a heavy-weight TLB flush
+	 * (type 2), which flushes both. Due to a race condition with
+	 * concurrent memory accesses using the same TLB cache line, we still
+	 * need a second TLB flush after this.
+	 */
+	adev->gmc.flush_tlb_needs_extra_type_2 =
+		amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 0) &&
+		adev->gmc.xgmi.num_physical_nodes;
+	/*
+	 * TODO: This workaround is badly documented and had a buggy
+	 * implementation. We should probably verify what we do here.
+	 */
+	adev->gmc.flush_tlb_needs_extra_type_0 =
+		amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 3) &&
+		adev->rev_id == 0;
+
 	/* The sequence of these two function calls matters.*/
 	gmc_v9_0_init_golden_registers(adev);
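
Illustrative note, not part of the patch: with the KIQ handling hoisted into the common helper, the per-ASIC flush_gpu_tlb_pasid callback only keeps the register-based path, and callers go through amdgpu_gmc_flush_gpu_tlb_pasid(), which picks the KIQ path when gmc.flush_pasid_uses_kiq is set and the ring is ready. A minimal caller sketch under those assumptions (valid adev and pasid, KIQ instance 0; the variable names are hypothetical):

	/* Ask for a legacy (type 0) TLB flush of one PASID on all hubs.
	 * The helper emits the invalidation through the KIQ when
	 * adev->gmc.flush_pasid_uses_kiq is set and the ring is ready,
	 * otherwise it falls back to the ASIC's flush_gpu_tlb_pasid.
	 */
	int r = amdgpu_gmc_flush_gpu_tlb_pasid(adev, pasid, 0, true, 0);

	if (r)
		dev_err(adev->dev, "PASID %u TLB flush failed (%d)\n",
			pasid, r);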